Diffstat (limited to 'tools')
353 files changed, 22597 insertions, 3292 deletions
diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index 53adc1762ec0..ec31f5b60323 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -366,7 +366,7 @@ AVXcode: 1 1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv 1c: Grp20 (1A),(1C) 1d: -1e: +1e: Grp21 (1A) 1f: NOP Ev # 0x0f 0x20-0x2f 20: MOV Rd,Cd @@ -803,8 +803,8 @@ f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) f2: ANDN Gy,By,Ey (v) f3: Grp17 (1A) -f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) -f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) +f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSSD/Q My,Gy (66) +f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSSD/Q My,Gy f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) f9: MOVDIRI My,Gy @@ -970,7 +970,7 @@ GrpTable: Grp7 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms 4: SMSW Mw/Rv -5: rdpkru (110),(11B) | wrpkru (111),(11B) +5: rdpkru (110),(11B) | wrpkru (111),(11B) | SAVEPREVSSP (F3),(010),(11B) | RSTORSSP Mq (F3) | SETSSBSY (F3),(000),(11B) 6: LMSW Ew 7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) EndTable @@ -1041,8 +1041,8 @@ GrpTable: Grp15 2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B) 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE | ptwrite Ey (F3),(11B) -5: XRSTOR | lfence (11B) -6: XSAVEOPT | clwb (66) | mfence (11B) | TPAUSE Rd (66),(11B) | UMONITOR Rv (F3),(11B) | UMWAIT Rd (F2),(11B) +5: XRSTOR | lfence (11B) | INCSSPD/Q Ry (F3),(11B) +6: XSAVEOPT | clwb (66) | mfence (11B) | TPAUSE Rd (66),(11B) | UMONITOR Rv (F3),(11B) | UMWAIT Rd (F2),(11B) | CLRSSBSY Mq (F3) 7: clflush | clflushopt (66) | sfence (11B) EndTable @@ -1077,6 +1077,11 @@ GrpTable: Grp20 0: cldemote Mb EndTable +GrpTable: Grp21 +1: RDSSPD/Q Ry (F3),(11B) +7: ENDBR64 (F3),(010),(11B) | ENDBR32 (F3),(011),(11B) +EndTable + # AMD's Prefetch Group GrpTable: GrpP 0: PREFETCH diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore index b13926432b84..8d6e8901ed2b 100644 --- a/tools/bpf/bpftool/.gitignore +++ b/tools/bpf/bpftool/.gitignore @@ -1,7 +1,9 @@ *.d +/_bpftool /bpftool bpftool*.8 bpf-helpers.* FEATURE-DUMP.bpftool feature libbpf +profiler.skel.h diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 4d08f35034a2..b04156cfd7a3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -19,19 +19,24 @@ SYNOPSIS FEATURE COMMANDS ================ -| **bpftool** **feature probe** [*COMPONENT*] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**macros** [**prefix** *PREFIX*]] | **bpftool** **feature help** | | *COMPONENT* := { **kernel** | **dev** *NAME* } DESCRIPTION =========== - **bpftool feature probe** [**kernel**] [**macros** [**prefix** *PREFIX*]] + **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] Probe the running kernel and dump a number of eBPF-related parameters, such as availability of the **bpf()** system call, JIT status, eBPF program 
types availability, eBPF helper functions availability, and more. + By default, bpftool **does not run probes** for + **bpf_probe_write_user**\ () and **bpf_trace_printk**\() + helpers which print warnings to kernel logs. To enable them + and run all probes, the **full** keyword should be used. + If the **macros** keyword (but not the **-j** option) is passed, a subset of the output is dumped as a list of **#define** macros that are ready to be included in a C @@ -44,16 +49,12 @@ DESCRIPTION Keyword **kernel** can be omitted. If no probe target is specified, probing the kernel is the default behaviour. - Note that when probed, some eBPF helpers (e.g. - **bpf_trace_printk**\ () or **bpf_probe_write_user**\ ()) may - print warnings to kernel logs. - - **bpftool feature probe dev** *NAME* [**macros** [**prefix** *PREFIX*]] + **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] Probe network device for supported eBPF features and dump results to the console. - The two keywords **macros** and **prefix** have the same - role as when probing the kernel. + The keywords **full**, **macros** and **prefix** have the + same role as when probing the kernel. **bpftool feature help** Print short help message. diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 64ddf8a4c518..9f19404f470e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -30,6 +30,7 @@ PROG COMMANDS | **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] | **bpftool** **prog tracelog** | **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] +| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* | **bpftool** **prog help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } @@ -42,11 +43,15 @@ PROG COMMANDS | **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | | **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | | **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** | -| **cgroup/getsockopt** | **cgroup/setsockopt** +| **cgroup/getsockopt** | **cgroup/setsockopt** | +| **struct_ops** | **fentry** | **fexit** | **freplace** | } | *ATTACH_TYPE* := { | **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** | } +| *METRIC* := { +| **cycles** | **instructions** | **l1d_loads** | **llc_misses** +| } DESCRIPTION @@ -188,6 +193,12 @@ DESCRIPTION not all of them can take the **ctx_in**/**ctx_out** arguments. bpftool does not perform checks on program types. + **bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* + Profile *METRICs* for bpf program *PROG* for *DURATION* + seconds or until user hits Ctrl-C. *DURATION* is optional. + If *DURATION* is not specified, the profiling will run up to + UINT_MAX seconds. + **bpftool prog help** Print short help message. 
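The derived figures in the sample "bpftool prog profile" output added in the EXAMPLES hunk below come from the ratio_metric/ratio_mul fields of the metrics[] table introduced in prog.c later in this patch: instructions are divided by cycles (ratio_mul 1.0), LLC misses are scaled per million instructions (ratio_mul 1e6), and the trailing percentages are the usual perf-style running/enabled multiplexing factors. A small standalone sketch of that arithmetic, reusing the counter values from the sample run:

    #include <stdio.h>

    int main(void)
    {
            /* counters taken from the sample run shown below */
            double cycles = 40176203, instructions = 42518139, llc_misses = 123;

            /* ratio_mul = 1.0: instructions per cycle */
            printf("%.2f insns per cycle\n", instructions * 1.0 / cycles);

            /* ratio_mul = 1e6: LLC misses per million instructions */
            printf("%.2f LLC misses per million insns\n",
                   llc_misses * 1e6 / instructions);
            return 0;
    }

This prints 1.06 and 2.89, matching the example output.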
@@ -310,6 +321,15 @@ EXAMPLES **# rm /sys/fs/bpf/xdp1** +| +| **# bpftool prog profile id 337 duration 10 cycles instructions llc_misses** + +:: + 51397 run_cnt + 40176203 cycles (83.05%) + 42518139 instructions # 1.06 insns per cycle (83.39%) + 123 llc_misses # 2.89 LLC misses per million insns (83.15%) + SEE ALSO ======== **bpf**\ (2), diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst new file mode 100644 index 000000000000..f045cc89dd6d --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -0,0 +1,116 @@ +================== +bpftool-struct_ops +================== +------------------------------------------------------------------------------- +tool to register/unregister/introspect BPF struct_ops +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **struct_ops** *COMMAND* + + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + + *COMMANDS* := + { **show** | **list** | **dump** | **register** | **unregister** | **help** } + +STRUCT_OPS COMMANDS +=================== + +| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops register** *OBJ* +| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* +| **bpftool** **struct_ops help** +| +| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } +| *OBJ* := /a/file/of/bpf_struct_ops.o + + +DESCRIPTION +=========== + **bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*] + Show brief information about the struct_ops in the system. + If *STRUCT_OPS_MAP* is specified, it shows information only + for the given struct_ops. Otherwise, it lists all struct_ops + currently existing in the system. + + Output will start with struct_ops map ID, followed by its map + name and its struct_ops's kernel type. + + **bpftool struct_ops dump** [*STRUCT_OPS_MAP*] + Dump details information about the struct_ops in the system. + If *STRUCT_OPS_MAP* is specified, it dumps information only + for the given struct_ops. Otherwise, it dumps all struct_ops + currently existing in the system. + + **bpftool struct_ops register** *OBJ* + Register bpf struct_ops from *OBJ*. All struct_ops under + the ELF section ".struct_ops" will be registered to + its kernel subsystem. + + **bpftool struct_ops unregister** *STRUCT_OPS_MAP* + Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. + + **bpftool struct_ops help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -V, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + + -d, --debug + Print all logs available, even debug-level information. This + includes logs from libbpf as well as from the verifier, when + attempting to load programs. 
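For readers unfamiliar with struct_ops, the register subcommand documented above is, conceptually, a thin wrapper around libbpf's struct_ops support: every map placed in the object's ".struct_ops" ELF section is loaded as a BPF_MAP_TYPE_STRUCT_OPS map, and attaching such a map registers the ops (for example a tcp_congestion_ops) with its kernel subsystem. A minimal, purely illustrative libbpf sketch of that flow follows; the bpf_cubic.o file name simply matches the example below and error handling is trimmed:

    #include <stdio.h>
    #include <bpf/bpf.h>
    #include <bpf/libbpf.h>

    int main(void)
    {
            struct bpf_object *obj;
            struct bpf_map *map;

            obj = bpf_object__open_file("bpf_cubic.o", NULL);
            if (libbpf_get_error(obj))
                    return 1;
            if (bpf_object__load(obj))
                    return 1;

            bpf_object__for_each_map(map, obj) {
                    if (bpf_map__def(map)->type != BPF_MAP_TYPE_STRUCT_OPS)
                            continue;
                    /* attaching a struct_ops map is what registers it
                     * with the kernel subsystem */
                    if (libbpf_get_error(bpf_map__attach_struct_ops(map)))
                            return 1;
                    printf("registered %s\n", bpf_map__name(map));
            }
            return 0;
    }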
+ +EXAMPLES +======== +**# bpftool struct_ops show** + +:: + + 100: dctcp tcp_congestion_ops + 105: cubic tcp_congestion_ops + +**# bpftool struct_ops unregister id 105** + +:: + + Unregistered tcp_congestion_ops cubic id 105 + +**# bpftool struct_ops register bpf_cubic.o** + +:: + + Registered tcp_congestion_ops cubic id 110 + + +SEE ALSO +======== + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-prog**\ (8), + **bpftool-map**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) + **bpftool-gen**\ (8) + diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index c4e810335810..f584d1fdfc64 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -59,10 +59,12 @@ LIBS = $(LIBBPF) -lelf -lz INSTALL ?= install RM ?= rm -f +CLANG ?= clang FEATURE_USER = .bpftool -FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib -FEATURE_DISPLAY = libbfd disassembler-four-args zlib +FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib \ + clang-bpf-global-var +FEATURE_DISPLAY = libbfd disassembler-four-args zlib clang-bpf-global-var check_feat := 1 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall @@ -110,14 +112,39 @@ SRCS += $(BFD_SRCS) endif OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o +_OBJS = $(filter-out $(OUTPUT)prog.o,$(OBJS)) $(OUTPUT)_prog.o + +ifeq ($(feature-clang-bpf-global-var),1) + __OBJS = $(OBJS) +else + __OBJS = $(_OBJS) +endif + +$(OUTPUT)_prog.o: prog.c + $(QUIET_CC)$(COMPILE.c) -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $< + +$(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(_OBJS) $(LIBS) + +skeleton/profiler.bpf.o: skeleton/profiler.bpf.c $(LIBBPF) + $(QUIET_CLANG)$(CLANG) \ + -I$(srctree)/tools/include/uapi/ \ + -I$(LIBBPF_PATH) -I$(srctree)/tools/lib \ + -g -O2 -target bpf -c $< -o $@ + +profiler.skel.h: $(OUTPUT)_bpftool skeleton/profiler.bpf.o + $(QUIET_GEN)$(OUTPUT)./_bpftool gen skeleton skeleton/profiler.bpf.o > $@ + +$(OUTPUT)prog.o: prog.c profiler.skel.h + $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< $(OUTPUT)feature.o: | zdep -$(OUTPUT)bpftool: $(OBJS) $(LIBBPF) - $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJS) $(LIBS) +$(OUTPUT)bpftool: $(__OBJS) $(LIBBPF) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(__OBJS) $(LIBS) $(OUTPUT)%.o: %.c $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< @@ -125,6 +152,7 @@ $(OUTPUT)%.o: %.c clean: $(LIBBPF)-clean $(call QUIET_CLEAN, bpftool) $(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d + $(Q)$(RM) -- $(OUTPUT)_bpftool profiler.skel.h skeleton/profiler.bpf.o $(Q)$(RM) -r -- $(OUTPUT)libbpf/ $(call QUIET_CLEAN, core-gen) $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 754d8395e451..45ee99b159e2 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -337,6 +337,7 @@ _bpftool() local PROG_TYPE='id pinned tag name' local MAP_TYPE='id pinned name' + local METRIC_TYPE='cycles instructions l1d_loads llc_misses' case $command in show|list) [[ $prev != "$command" ]] && return 0 @@ -388,7 +389,7 @@ _bpftool() _bpftool_get_prog_ids ;; name) - _bpftool_get_map_names + _bpftool_get_prog_names ;; pinned) _filedir @@ -469,7 +470,8 @@ _bpftool() cgroup/recvmsg4 cgroup/recvmsg6 \ 
cgroup/post_bind4 cgroup/post_bind6 \ cgroup/sysctl cgroup/getsockopt \ - cgroup/setsockopt" -- \ + cgroup/setsockopt struct_ops \ + fentry fexit freplace" -- \ "$cur" ) ) return 0 ;; @@ -497,9 +499,51 @@ _bpftool() tracelog) return 0 ;; + profile) + case $cword in + 3) + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) + return 0 + ;; + 4) + case $prev in + id) + _bpftool_get_prog_ids + ;; + name) + _bpftool_get_prog_names + ;; + pinned) + _filedir + ;; + esac + return 0 + ;; + 5) + COMPREPLY=( $( compgen -W "$METRIC_TYPE duration" -- "$cur" ) ) + return 0 + ;; + 6) + case $prev in + duration) + return 0 + ;; + *) + COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) + return 0 + ;; + esac + return 0 + ;; + *) + COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) + return 0 + ;; + esac + ;; run) - if [[ ${#words[@]} -lt 5 ]]; then - _filedir + if [[ ${#words[@]} -eq 4 ]]; then + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) return 0 fi case $prev in @@ -507,6 +551,10 @@ _bpftool() _bpftool_get_prog_ids return 0 ;; + name) + _bpftool_get_prog_names + return 0 + ;; data_in|data_out|ctx_in|ctx_out) _filedir return 0 @@ -524,7 +572,35 @@ _bpftool() *) [[ $prev == $object ]] && \ COMPREPLY=( $( compgen -W 'dump help pin attach detach \ - load loadall show list tracelog run' -- "$cur" ) ) + load loadall show list tracelog run profile' -- "$cur" ) ) + ;; + esac + ;; + struct_ops) + local STRUCT_OPS_TYPE='id name' + case $command in + show|list|dump|unregister) + case $prev in + $command) + COMPREPLY=( $( compgen -W "$STRUCT_OPS_TYPE" -- "$cur" ) ) + ;; + id) + _bpftool_get_map_ids_for_type struct_ops + ;; + name) + _bpftool_get_map_names_for_type struct_ops + ;; + esac + return 0 + ;; + register) + _filedir + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'register unregister show list dump help' \ + -- "$cur" ) ) ;; esac ;; @@ -712,11 +788,17 @@ _bpftool() esac ;; pin) - if [[ $prev == "$command" ]]; then - COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) - else - _filedir - fi + case $prev in + $command) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + ;; + id) + _bpftool_get_map_ids + ;; + name) + _bpftool_get_map_names + ;; + esac return 0 ;; event_pipe) @@ -843,7 +925,7 @@ _bpftool() case $command in skeleton) _filedir - ;; + ;; *) [[ $prev == $object ]] && \ COMPREPLY=( $( compgen -W 'skeleton help' -- "$cur" ) ) @@ -943,6 +1025,9 @@ _bpftool() id) _bpftool_get_prog_ids ;; + name) + _bpftool_get_prog_names + ;; pinned) _filedir ;; @@ -983,11 +1068,12 @@ _bpftool() probe) [[ $prev == "prefix" ]] && return 0 if _bpftool_search_list 'macros'; then - COMPREPLY+=( $( compgen -W 'prefix' -- "$cur" ) ) + _bpftool_once_attr 'prefix' else COMPREPLY+=( $( compgen -W 'macros' -- "$cur" ) ) fi _bpftool_one_of_list 'kernel dev' + _bpftool_once_attr 'full' return 0 ;; *) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index b3745ed711ba..bcaf55b59498 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -389,6 +389,9 @@ static int dump_btf_c(const struct btf *btf, if (IS_ERR(d)) return PTR_ERR(d); + printf("#ifndef __VMLINUX_H__\n"); + printf("#define __VMLINUX_H__\n"); + printf("\n"); printf("#ifndef BPF_NO_PRESERVE_ACCESS_INDEX\n"); printf("#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)\n"); printf("#endif\n\n"); @@ -412,6 +415,8 @@ static int dump_btf_c(const struct btf *btf, printf("#ifndef BPF_NO_PRESERVE_ACCESS_INDEX\n"); printf("#pragma clang attribute pop\n"); 
printf("#endif\n"); + printf("\n"); + printf("#endif /* __VMLINUX_H__ */\n"); done: btf_dump__free(d); diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 01cc52b834fa..497807bec675 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -4,11 +4,13 @@ #include <ctype.h> #include <stdio.h> /* for (FILE *) used by json_writer */ #include <string.h> +#include <unistd.h> #include <asm/byteorder.h> #include <linux/bitops.h> #include <linux/btf.h> #include <linux/err.h> #include <bpf/btf.h> +#include <bpf/bpf.h> #include "json_writer.h" #include "main.h" @@ -22,13 +24,102 @@ static int btf_dumper_do_type(const struct btf_dumper *d, __u32 type_id, __u8 bit_offset, const void *data); -static void btf_dumper_ptr(const void *data, json_writer_t *jw, - bool is_plain_text) +static int btf_dump_func(const struct btf *btf, char *func_sig, + const struct btf_type *func_proto, + const struct btf_type *func, int pos, int size); + +static int dump_prog_id_as_func_ptr(const struct btf_dumper *d, + const struct btf_type *func_proto, + __u32 prog_id) { - if (is_plain_text) - jsonw_printf(jw, "%p", *(void **)data); + struct bpf_prog_info_linear *prog_info = NULL; + const struct btf_type *func_type; + const char *prog_name = NULL; + struct bpf_func_info *finfo; + struct btf *prog_btf = NULL; + struct bpf_prog_info *info; + int prog_fd, func_sig_len; + char prog_str[1024]; + + /* Get the ptr's func_proto */ + func_sig_len = btf_dump_func(d->btf, prog_str, func_proto, NULL, 0, + sizeof(prog_str)); + if (func_sig_len == -1) + return -1; + + if (!prog_id) + goto print; + + /* Get the bpf_prog's name. Obtain from func_info. */ + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd == -1) + goto print; + + prog_info = bpf_program__get_prog_info_linear(prog_fd, + 1UL << BPF_PROG_INFO_FUNC_INFO); + close(prog_fd); + if (IS_ERR(prog_info)) { + prog_info = NULL; + goto print; + } + info = &prog_info->info; + + if (!info->btf_id || !info->nr_func_info || + btf__get_from_id(info->btf_id, &prog_btf)) + goto print; + finfo = (struct bpf_func_info *)info->func_info; + func_type = btf__type_by_id(prog_btf, finfo->type_id); + if (!func_type || !btf_is_func(func_type)) + goto print; + + prog_name = btf__name_by_offset(prog_btf, func_type->name_off); + +print: + if (!prog_id) + snprintf(&prog_str[func_sig_len], + sizeof(prog_str) - func_sig_len, " 0"); + else if (prog_name) + snprintf(&prog_str[func_sig_len], + sizeof(prog_str) - func_sig_len, + " %s/prog_id:%u", prog_name, prog_id); else - jsonw_printf(jw, "%lu", *(unsigned long *)data); + snprintf(&prog_str[func_sig_len], + sizeof(prog_str) - func_sig_len, + " <unknown_prog_name>/prog_id:%u", prog_id); + + prog_str[sizeof(prog_str) - 1] = '\0'; + jsonw_string(d->jw, prog_str); + btf__free(prog_btf); + free(prog_info); + return 0; +} + +static void btf_dumper_ptr(const struct btf_dumper *d, + const struct btf_type *t, + const void *data) +{ + unsigned long value = *(unsigned long *)data; + const struct btf_type *ptr_type; + __s32 ptr_type_id; + + if (!d->prog_id_as_func_ptr || value > UINT32_MAX) + goto print_ptr_value; + + ptr_type_id = btf__resolve_type(d->btf, t->type); + if (ptr_type_id < 0) + goto print_ptr_value; + ptr_type = btf__type_by_id(d->btf, ptr_type_id); + if (!ptr_type || !btf_is_func_proto(ptr_type)) + goto print_ptr_value; + + if (!dump_prog_id_as_func_ptr(d, ptr_type, value)) + return; + +print_ptr_value: + if (d->is_plain_text) + jsonw_printf(d->jw, "%p", (void *)value); + else + 
jsonw_printf(d->jw, "%lu", value); } static int btf_dumper_modifier(const struct btf_dumper *d, __u32 type_id, @@ -43,9 +134,78 @@ static int btf_dumper_modifier(const struct btf_dumper *d, __u32 type_id, return btf_dumper_do_type(d, actual_type_id, bit_offset, data); } -static void btf_dumper_enum(const void *data, json_writer_t *jw) +static int btf_dumper_enum(const struct btf_dumper *d, + const struct btf_type *t, + const void *data) +{ + const struct btf_enum *enums = btf_enum(t); + __s64 value; + __u16 i; + + switch (t->size) { + case 8: + value = *(__s64 *)data; + break; + case 4: + value = *(__s32 *)data; + break; + case 2: + value = *(__s16 *)data; + break; + case 1: + value = *(__s8 *)data; + break; + default: + return -EINVAL; + } + + for (i = 0; i < btf_vlen(t); i++) { + if (value == enums[i].val) { + jsonw_string(d->jw, + btf__name_by_offset(d->btf, + enums[i].name_off)); + return 0; + } + } + + jsonw_int(d->jw, value); + return 0; +} + +static bool is_str_array(const struct btf *btf, const struct btf_array *arr, + const char *s) { - jsonw_printf(jw, "%d", *(int *)data); + const struct btf_type *elem_type; + const char *end_s; + + if (!arr->nelems) + return false; + + elem_type = btf__type_by_id(btf, arr->type); + /* Not skipping typedef. typedef to char does not count as + * a string now. + */ + while (elem_type && btf_is_mod(elem_type)) + elem_type = btf__type_by_id(btf, elem_type->type); + + if (!elem_type || !btf_is_int(elem_type) || elem_type->size != 1) + return false; + + if (btf_int_encoding(elem_type) != BTF_INT_CHAR && + strcmp("char", btf__name_by_offset(btf, elem_type->name_off))) + return false; + + end_s = s + arr->nelems; + while (s < end_s) { + if (!*s) + return true; + if (*s <= 0x1f || *s >= 0x7f) + return false; + s++; + } + + /* '\0' is not found */ + return false; } static int btf_dumper_array(const struct btf_dumper *d, __u32 type_id, @@ -57,6 +217,11 @@ static int btf_dumper_array(const struct btf_dumper *d, __u32 type_id, int ret = 0; __u32 i; + if (is_str_array(d->btf, arr, data)) { + jsonw_string(d->jw, data); + return 0; + } + elem_size = btf__resolve_size(d->btf, arr->type); if (elem_size < 0) return elem_size; @@ -366,10 +531,9 @@ static int btf_dumper_do_type(const struct btf_dumper *d, __u32 type_id, case BTF_KIND_ARRAY: return btf_dumper_array(d, type_id, data); case BTF_KIND_ENUM: - btf_dumper_enum(data, d->jw); - return 0; + return btf_dumper_enum(d, t, data); case BTF_KIND_PTR: - btf_dumper_ptr(data, d->jw, d->is_plain_text); + btf_dumper_ptr(d, t, data); return 0; case BTF_KIND_UNKN: jsonw_printf(d->jw, "(unknown)"); @@ -414,10 +578,6 @@ int btf_dumper_type(const struct btf_dumper *d, __u32 type_id, return -1; \ } while (0) -static int btf_dump_func(const struct btf *btf, char *func_sig, - const struct btf_type *func_proto, - const struct btf_type *func, int pos, int size); - static int __btf_dumper_type_only(const struct btf *btf, __u32 type_id, char *func_sig, int pos, int size) { @@ -526,8 +686,15 @@ static int btf_dump_func(const struct btf *btf, char *func_sig, BTF_PRINT_ARG(", "); if (arg->type) { BTF_PRINT_TYPE(arg->type); - BTF_PRINT_ARG("%s", - btf__name_by_offset(btf, arg->name_off)); + if (arg->name_off) + BTF_PRINT_ARG("%s", + btf__name_by_offset(btf, arg->name_off)); + else if (pos && func_sig[pos - 1] == ' ') + /* Remove unnecessary space for + * FUNC_PROTO that does not have + * arg->name_off + */ + func_sig[--pos] = '\0'; } else { BTF_PRINT_ARG("..."); } diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c 
index b75b8ec5469c..f2223dbdfb0a 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -211,39 +211,14 @@ int do_pin_fd(int fd, const char *name) return err; } -int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32)) +int do_pin_any(int argc, char **argv, int (*get_fd)(int *, char ***)) { - unsigned int id; - char *endptr; int err; int fd; - if (argc < 3) { - p_err("too few arguments, id ID and FILE path is required"); - return -1; - } else if (argc > 3) { - p_err("too many arguments"); - return -1; - } - - if (!is_prefix(*argv, "id")) { - p_err("expected 'id' got %s", *argv); - return -1; - } - NEXT_ARG(); - - id = strtoul(*argv, &endptr, 0); - if (*endptr) { - p_err("can't parse %s as ID", *argv); - return -1; - } - NEXT_ARG(); - - fd = get_fd_by_id(id); - if (fd < 0) { - p_err("can't open object by id (%u): %s", id, strerror(errno)); - return -1; - } + fd = get_fd(&argc, &argv); + if (fd < 0) + return fd; err = do_pin_fd(fd, *argv); @@ -597,3 +572,10 @@ int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what) return 0; } + +int __printf(2, 0) +print_all_levels(__maybe_unused enum libbpf_print_level level, + const char *format, va_list args) +{ + return vfprintf(stderr, format, args); +} diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 941873d778d8..88718ee6a438 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -112,18 +112,12 @@ print_start_section(const char *json_title, const char *plain_title, } } -static void -print_end_then_start_section(const char *json_title, const char *plain_title, - const char *define_comment, - const char *define_prefix) +static void print_end_section(void) { if (json_output) jsonw_end_object(json_wtr); else printf("\n"); - - print_start_section(json_title, plain_title, define_comment, - define_prefix); } /* Probing functions */ @@ -520,13 +514,38 @@ probe_map_type(enum bpf_map_type map_type, const char *define_prefix, } static void +probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type, + const char *define_prefix, unsigned int id, + const char *ptype_name, __u32 ifindex) +{ + bool res; + + if (!supported_type) + res = false; + else + res = bpf_probe_helper(id, prog_type, ifindex); + + if (json_output) { + if (res) + jsonw_string(json_wtr, helper_name[id]); + } else if (define_prefix) { + printf("#define %sBPF__PROG_TYPE_%s__HELPER_%s %s\n", + define_prefix, ptype_name, helper_name[id], + res ? "1" : "0"); + } else { + if (res) + printf("\n\t- %s", helper_name[id]); + } +} + +static void probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, - const char *define_prefix, __u32 ifindex) + const char *define_prefix, bool full_mode, + __u32 ifindex) { const char *ptype_name = prog_type_name[prog_type]; char feat_name[128]; unsigned int id; - bool res; if (ifindex) /* Only test helpers for offload-able program types */ @@ -548,21 +567,19 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, } for (id = 1; id < ARRAY_SIZE(helper_name); id++) { - if (!supported_type) - res = false; - else - res = bpf_probe_helper(id, prog_type, ifindex); - - if (json_output) { - if (res) - jsonw_string(json_wtr, helper_name[id]); - } else if (define_prefix) { - printf("#define %sBPF__PROG_TYPE_%s__HELPER_%s %s\n", - define_prefix, ptype_name, helper_name[id], - res ? 
"1" : "0"); - } else { - if (res) - printf("\n\t- %s", helper_name[id]); + /* Skip helper functions which emit dmesg messages when not in + * the full mode. + */ + switch (id) { + case BPF_FUNC_trace_printk: + case BPF_FUNC_probe_write_user: + if (!full_mode) + continue; + /* fallthrough */ + default: + probe_helper_for_progtype(prog_type, supported_type, + define_prefix, id, ptype_name, + ifindex); } } @@ -584,13 +601,132 @@ probe_large_insn_limit(const char *define_prefix, __u32 ifindex) res, define_prefix); } +static void +section_system_config(enum probe_component target, const char *define_prefix) +{ + switch (target) { + case COMPONENT_KERNEL: + case COMPONENT_UNSPEC: + if (define_prefix) + break; + + print_start_section("system_config", + "Scanning system configuration...", + NULL, /* define_comment never used here */ + NULL); /* define_prefix always NULL here */ + if (check_procfs()) { + probe_unprivileged_disabled(); + probe_jit_enable(); + probe_jit_harden(); + probe_jit_kallsyms(); + probe_jit_limit(); + } else { + p_info("/* procfs not mounted, skipping related probes */"); + } + probe_kernel_image_config(); + print_end_section(); + break; + default: + break; + } +} + +static bool section_syscall_config(const char *define_prefix) +{ + bool res; + + print_start_section("syscall_config", + "Scanning system call availability...", + "/*** System call availability ***/", + define_prefix); + res = probe_bpf_syscall(define_prefix); + print_end_section(); + + return res; +} + +static void +section_program_types(bool *supported_types, const char *define_prefix, + __u32 ifindex) +{ + unsigned int i; + + print_start_section("program_types", + "Scanning eBPF program types...", + "/*** eBPF program types ***/", + define_prefix); + + for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) + probe_prog_type(i, supported_types, define_prefix, ifindex); + + print_end_section(); +} + +static void section_map_types(const char *define_prefix, __u32 ifindex) +{ + unsigned int i; + + print_start_section("map_types", + "Scanning eBPF map types...", + "/*** eBPF map types ***/", + define_prefix); + + for (i = BPF_MAP_TYPE_UNSPEC + 1; i < map_type_name_size; i++) + probe_map_type(i, define_prefix, ifindex); + + print_end_section(); +} + +static void +section_helpers(bool *supported_types, const char *define_prefix, + bool full_mode, __u32 ifindex) +{ + unsigned int i; + + print_start_section("helpers", + "Scanning eBPF helper functions...", + "/*** eBPF helper functions ***/", + define_prefix); + + if (define_prefix) + printf("/*\n" + " * Use %sHAVE_PROG_TYPE_HELPER(prog_type_name, helper_name)\n" + " * to determine if <helper_name> is available for <prog_type_name>,\n" + " * e.g.\n" + " * #if %sHAVE_PROG_TYPE_HELPER(xdp, bpf_redirect)\n" + " * // do stuff with this helper\n" + " * #elif\n" + " * // use a workaround\n" + " * #endif\n" + " */\n" + "#define %sHAVE_PROG_TYPE_HELPER(prog_type, helper) \\\n" + " %sBPF__PROG_TYPE_ ## prog_type ## __HELPER_ ## helper\n", + define_prefix, define_prefix, define_prefix, + define_prefix); + for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) + probe_helpers_for_progtype(i, supported_types[i], + define_prefix, full_mode, ifindex); + + print_end_section(); +} + +static void section_misc(const char *define_prefix, __u32 ifindex) +{ + print_start_section("misc", + "Scanning miscellaneous eBPF features...", + "/*** eBPF misc features ***/", + define_prefix); + probe_large_insn_limit(define_prefix, ifindex); + print_end_section(); +} 
+ static int do_probe(int argc, char **argv) { enum probe_component target = COMPONENT_UNSPEC; const char *define_prefix = NULL; bool supported_types[128] = {}; + bool full_mode = false; __u32 ifindex = 0; - unsigned int i; char *ifname; /* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN). @@ -629,6 +765,9 @@ static int do_probe(int argc, char **argv) strerror(errno)); return -1; } + } else if (is_prefix(*argv, "full")) { + full_mode = true; + NEXT_ARG(); } else if (is_prefix(*argv, "macros") && !define_prefix) { define_prefix = ""; NEXT_ARG(); @@ -658,97 +797,19 @@ static int do_probe(int argc, char **argv) jsonw_start_object(json_wtr); } - switch (target) { - case COMPONENT_KERNEL: - case COMPONENT_UNSPEC: - if (define_prefix) - break; - - print_start_section("system_config", - "Scanning system configuration...", - NULL, /* define_comment never used here */ - NULL); /* define_prefix always NULL here */ - if (check_procfs()) { - probe_unprivileged_disabled(); - probe_jit_enable(); - probe_jit_harden(); - probe_jit_kallsyms(); - probe_jit_limit(); - } else { - p_info("/* procfs not mounted, skipping related probes */"); - } - probe_kernel_image_config(); - if (json_output) - jsonw_end_object(json_wtr); - else - printf("\n"); - break; - default: - break; - } - - print_start_section("syscall_config", - "Scanning system call availability...", - "/*** System call availability ***/", - define_prefix); - - if (!probe_bpf_syscall(define_prefix)) + section_system_config(target, define_prefix); + if (!section_syscall_config(define_prefix)) /* bpf() syscall unavailable, don't probe other BPF features */ goto exit_close_json; - - print_end_then_start_section("program_types", - "Scanning eBPF program types...", - "/*** eBPF program types ***/", - define_prefix); - - for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) - probe_prog_type(i, supported_types, define_prefix, ifindex); - - print_end_then_start_section("map_types", - "Scanning eBPF map types...", - "/*** eBPF map types ***/", - define_prefix); - - for (i = BPF_MAP_TYPE_UNSPEC + 1; i < map_type_name_size; i++) - probe_map_type(i, define_prefix, ifindex); - - print_end_then_start_section("helpers", - "Scanning eBPF helper functions...", - "/*** eBPF helper functions ***/", - define_prefix); - - if (define_prefix) - printf("/*\n" - " * Use %sHAVE_PROG_TYPE_HELPER(prog_type_name, helper_name)\n" - " * to determine if <helper_name> is available for <prog_type_name>,\n" - " * e.g.\n" - " * #if %sHAVE_PROG_TYPE_HELPER(xdp, bpf_redirect)\n" - " * // do stuff with this helper\n" - " * #elif\n" - " * // use a workaround\n" - " * #endif\n" - " */\n" - "#define %sHAVE_PROG_TYPE_HELPER(prog_type, helper) \\\n" - " %sBPF__PROG_TYPE_ ## prog_type ## __HELPER_ ## helper\n", - define_prefix, define_prefix, define_prefix, - define_prefix); - for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) - probe_helpers_for_progtype(i, supported_types[i], - define_prefix, ifindex); - - print_end_then_start_section("misc", - "Scanning miscellaneous eBPF features...", - "/*** eBPF misc features ***/", - define_prefix); - probe_large_insn_limit(define_prefix, ifindex); + section_program_types(supported_types, define_prefix, ifindex); + section_map_types(define_prefix, ifindex); + section_helpers(supported_types, define_prefix, full_mode, ifindex); + section_misc(define_prefix, ifindex); exit_close_json: - if (json_output) { - /* End current "section" of probes */ - jsonw_end_object(json_wtr); + if (json_output) /* End 
root object */ jsonw_end_object(json_wtr); - } return 0; } @@ -761,7 +822,7 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s probe [COMPONENT] [macros [prefix PREFIX]]\n" + "Usage: %s %s probe [COMPONENT] [full] [macros [prefix PREFIX]]\n" " %s %s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 6d41bbfc6459..466c269eabdd 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -58,7 +58,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup | perf | net | feature | btf | gen }\n" + " OBJECT := { prog | map | cgroup | perf | net | feature | btf | gen | struct_ops }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -79,13 +79,6 @@ static int do_version(int argc, char **argv) return 0; } -static int __printf(2, 0) -print_all_levels(__maybe_unused enum libbpf_print_level level, - const char *format, va_list args) -{ - return vfprintf(stderr, format, args); -} - int cmd_select(const struct cmd *cmds, int argc, char **argv, int (*help)(int argc, char **argv)) { @@ -228,6 +221,7 @@ static const struct cmd cmds[] = { { "feature", do_feature }, { "btf", do_btf }, { "gen", do_gen }, + { "struct_ops", do_struct_ops }, { "version", do_version }, { 0 } }; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 4e75b58d3989..86f14ce26fd7 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -14,6 +14,8 @@ #include <linux/hashtable.h> #include <tools/libc_compat.h> +#include <bpf/libbpf.h> + #include "json_writer.h" #define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) @@ -76,6 +78,9 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt", + [BPF_PROG_TYPE_TRACING] = "tracing", + [BPF_PROG_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_PROG_TYPE_EXT] = "ext", }; extern const char * const map_type_name[]; @@ -143,7 +148,7 @@ char *get_fdinfo(int fd, const char *key); int open_obj_pinned(char *path, bool quiet); int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type); int mount_bpffs_for_pin(const char *name); -int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32)); +int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(int *, char ***)); int do_pin_fd(int fd, const char *name); int do_prog(int argc, char **arg); @@ -156,6 +161,7 @@ int do_tracelog(int argc, char **arg); int do_feature(int argc, char **argv); int do_btf(int argc, char **argv); int do_gen(int argc, char **argv); +int do_struct_ops(int argc, char **argv); int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); @@ -200,6 +206,7 @@ struct btf_dumper { const struct btf *btf; json_writer_t *jw; bool is_plain_text; + bool prog_id_as_func_ptr; }; /* btf_dumper_type - print data along with type information @@ -226,4 +233,7 @@ struct tcmsg; int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb); int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind, const char *devname, int ifindex); + +int print_all_levels(__maybe_unused enum libbpf_print_level level, + const char *format, va_list args); #endif diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index e6c85680b34d..693a632f6813 100644 --- a/tools/bpf/bpftool/map.c +++ 
b/tools/bpf/bpftool/map.c @@ -1384,7 +1384,7 @@ static int do_pin(int argc, char **argv) { int err; - err = do_pin_any(argc, argv, bpf_map_get_fd_by_id); + err = do_pin_any(argc, argv, map_parse_fd); if (!err && json_output) jsonw_null(json_wtr); return err; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index b352ab041160..f6a5974a7b0a 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -4,6 +4,7 @@ #define _GNU_SOURCE #include <errno.h> #include <fcntl.h> +#include <signal.h> #include <stdarg.h> #include <stdio.h> #include <stdlib.h> @@ -11,10 +12,13 @@ #include <time.h> #include <unistd.h> #include <net/if.h> +#include <sys/ioctl.h> #include <sys/types.h> #include <sys/stat.h> +#include <sys/syscall.h> #include <linux/err.h> +#include <linux/perf_event.h> #include <linux/sizes.h> #include <bpf/bpf.h> @@ -809,7 +813,7 @@ static int do_pin(int argc, char **argv) { int err; - err = do_pin_any(argc, argv, bpf_prog_get_fd_by_id); + err = do_pin_any(argc, argv, prog_parse_fd); if (!err && json_output) jsonw_null(json_wtr); return err; @@ -1243,6 +1247,25 @@ free_data_in: return err; } +static int +get_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, + enum bpf_attach_type *expected_attach_type) +{ + libbpf_print_fn_t print_backup; + int ret; + + ret = libbpf_prog_type_by_name(name, prog_type, expected_attach_type); + if (!ret) + return ret; + + /* libbpf_prog_type_by_name() failed, let's re-run with debug level */ + print_backup = libbpf_set_print(print_all_levels); + ret = libbpf_prog_type_by_name(name, prog_type, expected_attach_type); + libbpf_set_print(print_backup); + + return ret; +} + static int load_with_options(int argc, char **argv, bool first_prog_only) { enum bpf_prog_type common_prog_type = BPF_PROG_TYPE_UNSPEC; @@ -1292,8 +1315,8 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) strcat(type, *argv); strcat(type, "/"); - err = libbpf_prog_type_by_name(type, &common_prog_type, - &expected_attach_type); + err = get_prog_type_by_name(type, &common_prog_type, + &expected_attach_type); free(type); if (err < 0) goto err_free_reuse_maps; @@ -1392,8 +1415,8 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) if (prog_type == BPF_PROG_TYPE_UNSPEC) { const char *sec_name = bpf_program__title(pos, false); - err = libbpf_prog_type_by_name(sec_name, &prog_type, - &expected_attach_type); + err = get_prog_type_by_name(sec_name, &prog_type, + &expected_attach_type); if (err < 0) goto err_close_obj; } @@ -1537,6 +1560,422 @@ static int do_loadall(int argc, char **argv) return load_with_options(argc, argv, false); } +#ifdef BPFTOOL_WITHOUT_SKELETONS + +static int do_profile(int argc, char **argv) +{ + p_err("bpftool prog profile command is not supported. 
Please build bpftool with clang >= 10.0.0"); + return 0; +} + +#else /* BPFTOOL_WITHOUT_SKELETONS */ + +#include "profiler.skel.h" + +struct profile_metric { + const char *name; + struct bpf_perf_event_value val; + struct perf_event_attr attr; + bool selected; + + /* calculate ratios like instructions per cycle */ + const int ratio_metric; /* 0 for N/A, 1 for index 0 (cycles) */ + const char *ratio_desc; + const float ratio_mul; +} metrics[] = { + { + .name = "cycles", + .attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .exclude_user = 1, + }, + }, + { + .name = "instructions", + .attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_INSTRUCTIONS, + .exclude_user = 1, + }, + .ratio_metric = 1, + .ratio_desc = "insns per cycle", + .ratio_mul = 1.0, + }, + { + .name = "l1d_loads", + .attr = { + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + .exclude_user = 1, + }, + }, + { + .name = "llc_misses", + .attr = { + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_LL | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + .exclude_user = 1 + }, + .ratio_metric = 2, + .ratio_desc = "LLC misses per million insns", + .ratio_mul = 1e6, + }, +}; + +static __u64 profile_total_count; + +#define MAX_NUM_PROFILE_METRICS 4 + +static int profile_parse_metrics(int argc, char **argv) +{ + unsigned int metric_cnt; + int selected_cnt = 0; + unsigned int i; + + metric_cnt = sizeof(metrics) / sizeof(struct profile_metric); + + while (argc > 0) { + for (i = 0; i < metric_cnt; i++) { + if (is_prefix(argv[0], metrics[i].name)) { + if (!metrics[i].selected) + selected_cnt++; + metrics[i].selected = true; + break; + } + } + if (i == metric_cnt) { + p_err("unknown metric %s", argv[0]); + return -1; + } + NEXT_ARG(); + } + if (selected_cnt > MAX_NUM_PROFILE_METRICS) { + p_err("too many (%d) metrics, please specify no more than %d metrics at at time", + selected_cnt, MAX_NUM_PROFILE_METRICS); + return -1; + } + return selected_cnt; +} + +static void profile_read_values(struct profiler_bpf *obj) +{ + __u32 m, cpu, num_cpu = obj->rodata->num_cpu; + int reading_map_fd, count_map_fd; + __u64 counts[num_cpu]; + __u32 key = 0; + int err; + + reading_map_fd = bpf_map__fd(obj->maps.accum_readings); + count_map_fd = bpf_map__fd(obj->maps.counts); + if (reading_map_fd < 0 || count_map_fd < 0) { + p_err("failed to get fd for map"); + return; + } + + err = bpf_map_lookup_elem(count_map_fd, &key, counts); + if (err) { + p_err("failed to read count_map: %s", strerror(errno)); + return; + } + + profile_total_count = 0; + for (cpu = 0; cpu < num_cpu; cpu++) + profile_total_count += counts[cpu]; + + for (m = 0; m < ARRAY_SIZE(metrics); m++) { + struct bpf_perf_event_value values[num_cpu]; + + if (!metrics[m].selected) + continue; + + err = bpf_map_lookup_elem(reading_map_fd, &key, values); + if (err) { + p_err("failed to read reading_map: %s", + strerror(errno)); + return; + } + for (cpu = 0; cpu < num_cpu; cpu++) { + metrics[m].val.counter += values[cpu].counter; + metrics[m].val.enabled += values[cpu].enabled; + metrics[m].val.running += values[cpu].running; + } + key++; + } +} + +static void profile_print_readings_json(void) +{ + __u32 m; + + jsonw_start_array(json_wtr); + for (m = 0; m < ARRAY_SIZE(metrics); m++) { + if (!metrics[m].selected) + continue; + jsonw_start_object(json_wtr); + jsonw_string_field(json_wtr, "metric", metrics[m].name); + 
jsonw_lluint_field(json_wtr, "run_cnt", profile_total_count); + jsonw_lluint_field(json_wtr, "value", metrics[m].val.counter); + jsonw_lluint_field(json_wtr, "enabled", metrics[m].val.enabled); + jsonw_lluint_field(json_wtr, "running", metrics[m].val.running); + + jsonw_end_object(json_wtr); + } + jsonw_end_array(json_wtr); +} + +static void profile_print_readings_plain(void) +{ + __u32 m; + + printf("\n%18llu %-20s\n", profile_total_count, "run_cnt"); + for (m = 0; m < ARRAY_SIZE(metrics); m++) { + struct bpf_perf_event_value *val = &metrics[m].val; + int r; + + if (!metrics[m].selected) + continue; + printf("%18llu %-20s", val->counter, metrics[m].name); + + r = metrics[m].ratio_metric - 1; + if (r >= 0 && metrics[r].selected && + metrics[r].val.counter > 0) { + printf("# %8.2f %-30s", + val->counter * metrics[m].ratio_mul / + metrics[r].val.counter, + metrics[m].ratio_desc); + } else { + printf("%-41s", ""); + } + + if (val->enabled > val->running) + printf("(%4.2f%%)", + val->running * 100.0 / val->enabled); + printf("\n"); + } +} + +static void profile_print_readings(void) +{ + if (json_output) + profile_print_readings_json(); + else + profile_print_readings_plain(); +} + +static char *profile_target_name(int tgt_fd) +{ + struct bpf_prog_info_linear *info_linear; + struct bpf_func_info *func_info; + const struct btf_type *t; + char *name = NULL; + struct btf *btf; + + info_linear = bpf_program__get_prog_info_linear( + tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO); + if (IS_ERR_OR_NULL(info_linear)) { + p_err("failed to get info_linear for prog FD %d", tgt_fd); + return NULL; + } + + if (info_linear->info.btf_id == 0 || + btf__get_from_id(info_linear->info.btf_id, &btf)) { + p_err("prog FD %d doesn't have valid btf", tgt_fd); + goto out; + } + + func_info = (struct bpf_func_info *)(info_linear->info.func_info); + t = btf__type_by_id(btf, func_info[0].type_id); + if (!t) { + p_err("btf %d doesn't have type %d", + info_linear->info.btf_id, func_info[0].type_id); + goto out; + } + name = strdup(btf__name_by_offset(btf, t->name_off)); +out: + free(info_linear); + return name; +} + +static struct profiler_bpf *profile_obj; +static int profile_tgt_fd = -1; +static char *profile_tgt_name; +static int *profile_perf_events; +static int profile_perf_event_cnt; + +static void profile_close_perf_events(struct profiler_bpf *obj) +{ + int i; + + for (i = profile_perf_event_cnt - 1; i >= 0; i--) + close(profile_perf_events[i]); + + free(profile_perf_events); + profile_perf_event_cnt = 0; +} + +static int profile_open_perf_events(struct profiler_bpf *obj) +{ + unsigned int cpu, m; + int map_fd, pmu_fd; + + profile_perf_events = calloc( + sizeof(int), obj->rodata->num_cpu * obj->rodata->num_metric); + if (!profile_perf_events) { + p_err("failed to allocate memory for perf_event array: %s", + strerror(errno)); + return -1; + } + map_fd = bpf_map__fd(obj->maps.events); + if (map_fd < 0) { + p_err("failed to get fd for events map"); + return -1; + } + + for (m = 0; m < ARRAY_SIZE(metrics); m++) { + if (!metrics[m].selected) + continue; + for (cpu = 0; cpu < obj->rodata->num_cpu; cpu++) { + pmu_fd = syscall(__NR_perf_event_open, &metrics[m].attr, + -1/*pid*/, cpu, -1/*group_fd*/, 0); + if (pmu_fd < 0 || + bpf_map_update_elem(map_fd, &profile_perf_event_cnt, + &pmu_fd, BPF_ANY) || + ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) { + p_err("failed to create event %s on cpu %d", + metrics[m].name, cpu); + return -1; + } + profile_perf_events[profile_perf_event_cnt++] = pmu_fd; + } + } + return 0; +} + +static void 
profile_print_and_cleanup(void) +{ + profile_close_perf_events(profile_obj); + profile_read_values(profile_obj); + profile_print_readings(); + profiler_bpf__destroy(profile_obj); + + close(profile_tgt_fd); + free(profile_tgt_name); +} + +static void int_exit(int signo) +{ + profile_print_and_cleanup(); + exit(0); +} + +static int do_profile(int argc, char **argv) +{ + int num_metric, num_cpu, err = -1; + struct bpf_program *prog; + unsigned long duration; + char *endptr; + + /* we at least need two args for the prog and one metric */ + if (!REQ_ARGS(3)) + return -EINVAL; + + /* parse target fd */ + profile_tgt_fd = prog_parse_fd(&argc, &argv); + if (profile_tgt_fd < 0) { + p_err("failed to parse fd"); + return -1; + } + + /* parse profiling optional duration */ + if (argc > 2 && is_prefix(argv[0], "duration")) { + NEXT_ARG(); + duration = strtoul(*argv, &endptr, 0); + if (*endptr) + usage(); + NEXT_ARG(); + } else { + duration = UINT_MAX; + } + + num_metric = profile_parse_metrics(argc, argv); + if (num_metric <= 0) + goto out; + + num_cpu = libbpf_num_possible_cpus(); + if (num_cpu <= 0) { + p_err("failed to identify number of CPUs"); + goto out; + } + + profile_obj = profiler_bpf__open(); + if (!profile_obj) { + p_err("failed to open and/or load BPF object"); + goto out; + } + + profile_obj->rodata->num_cpu = num_cpu; + profile_obj->rodata->num_metric = num_metric; + + /* adjust map sizes */ + bpf_map__resize(profile_obj->maps.events, num_metric * num_cpu); + bpf_map__resize(profile_obj->maps.fentry_readings, num_metric); + bpf_map__resize(profile_obj->maps.accum_readings, num_metric); + bpf_map__resize(profile_obj->maps.counts, 1); + + /* change target name */ + profile_tgt_name = profile_target_name(profile_tgt_fd); + if (!profile_tgt_name) + goto out; + + bpf_object__for_each_program(prog, profile_obj->obj) { + err = bpf_program__set_attach_target(prog, profile_tgt_fd, + profile_tgt_name); + if (err) { + p_err("failed to set attach target\n"); + goto out; + } + } + + set_max_rlimit(); + err = profiler_bpf__load(profile_obj); + if (err) { + p_err("failed to load profile_obj"); + goto out; + } + + err = profile_open_perf_events(profile_obj); + if (err) + goto out; + + err = profiler_bpf__attach(profile_obj); + if (err) { + p_err("failed to attach profile_obj"); + goto out; + } + signal(SIGINT, int_exit); + + sleep(duration); + profile_print_and_cleanup(); + return 0; + +out: + profile_close_perf_events(profile_obj); + if (profile_obj) + profiler_bpf__destroy(profile_obj); + close(profile_tgt_fd); + free(profile_tgt_name); + return err; +} + +#endif /* BPFTOOL_WITHOUT_SKELETONS */ + static int do_help(int argc, char **argv) { if (json_output) { @@ -1560,6 +1999,7 @@ static int do_help(int argc, char **argv) " [data_out FILE [data_size_out L]] \\\n" " [ctx_in FILE [ctx_out FILE [ctx_size_out M]]] \\\n" " [repeat N]\n" + " %s %s profile PROG [duration DURATION] METRICs\n" " %s %s tracelog\n" " %s %s help\n" "\n" @@ -1573,16 +2013,17 @@ static int do_help(int argc, char **argv) " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" " cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n" - " cgroup/recvmsg6 | cgroup/getsockopt |\n" - " cgroup/setsockopt }\n" + " cgroup/recvmsg6 | cgroup/getsockopt | cgroup/setsockopt |\n" + " struct_ops | fentry | fexit | freplace }\n" " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " flow_dissector }\n" + " METRIC := { cycles | instructions | l1d_loads | llc_misses }\n" " " 
HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2]); return 0; } @@ -1599,6 +2040,7 @@ static const struct cmd cmds[] = { { "detach", do_detach }, { "tracelog", do_tracelog }, { "run", do_run }, + { "profile", do_profile }, { 0 } }; diff --git a/tools/bpf/bpftool/skeleton/profiler.bpf.c b/tools/bpf/bpftool/skeleton/profiler.bpf.c new file mode 100644 index 000000000000..20034c12f7c5 --- /dev/null +++ b/tools/bpf/bpftool/skeleton/profiler.bpf.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook +#include "profiler.h" +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* map of perf event fds, num_cpu * num_metric entries */ +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(int)); +} events SEC(".maps"); + +/* readings at fentry */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); +} fentry_readings SEC(".maps"); + +/* accumulated readings */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); +} accum_readings SEC(".maps"); + +/* sample counts, one per cpu */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); +} counts SEC(".maps"); + +const volatile __u32 num_cpu = 1; +const volatile __u32 num_metric = 1; +#define MAX_NUM_MATRICS 4 + +SEC("fentry/XXX") +int BPF_PROG(fentry_XXX) +{ + struct bpf_perf_event_value *ptrs[MAX_NUM_MATRICS]; + u32 key = bpf_get_smp_processor_id(); + u32 i; + + /* look up before reading, to reduce error */ + for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) { + u32 flag = i; + + ptrs[i] = bpf_map_lookup_elem(&fentry_readings, &flag); + if (!ptrs[i]) + return 0; + } + + for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) { + struct bpf_perf_event_value reading; + int err; + + err = bpf_perf_event_read_value(&events, key, &reading, + sizeof(reading)); + if (err) + return 0; + *(ptrs[i]) = reading; + key += num_cpu; + } + + return 0; +} + +static inline void +fexit_update_maps(u32 id, struct bpf_perf_event_value *after) +{ + struct bpf_perf_event_value *before, diff, *accum; + + before = bpf_map_lookup_elem(&fentry_readings, &id); + /* only account samples with a valid fentry_reading */ + if (before && before->counter) { + struct bpf_perf_event_value *accum; + + diff.counter = after->counter - before->counter; + diff.enabled = after->enabled - before->enabled; + diff.running = after->running - before->running; + + accum = bpf_map_lookup_elem(&accum_readings, &id); + if (accum) { + accum->counter += diff.counter; + accum->enabled += diff.enabled; + accum->running += diff.running; + } + } +} + +SEC("fexit/XXX") +int BPF_PROG(fexit_XXX) +{ + struct bpf_perf_event_value readings[MAX_NUM_MATRICS]; + u32 cpu = bpf_get_smp_processor_id(); + u32 i, one = 1, zero = 0; + int err; + u64 *count; + + /* read all events before updating the maps, to reduce error */ + for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) { + err = bpf_perf_event_read_value(&events, cpu + i * num_cpu, + readings + i, sizeof(*readings)); + if (err) + return 0; + } + count = 
bpf_map_lookup_elem(&counts, &zero); + if (count) { + *count += 1; + for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) + fexit_update_maps(i, &readings[i]); + } + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/bpf/bpftool/skeleton/profiler.h b/tools/bpf/bpftool/skeleton/profiler.h new file mode 100644 index 000000000000..1f767e9510f7 --- /dev/null +++ b/tools/bpf/bpftool/skeleton/profiler.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __PROFILER_H +#define __PROFILER_H + +/* useful typedefs from vmlinux.h */ + +typedef signed char __s8; +typedef unsigned char __u8; +typedef short int __s16; +typedef short unsigned int __u16; +typedef int __s32; +typedef unsigned int __u32; +typedef long long int __s64; +typedef long long unsigned int __u64; + +typedef __s8 s8; +typedef __u8 u8; +typedef __s16 s16; +typedef __u16 u16; +typedef __s32 s32; +typedef __u32 u32; +typedef __s64 s64; +typedef __u64 u64; + +enum { + false = 0, + true = 1, +}; + +#ifdef __CHECKER__ +#define __bitwise__ __attribute__((bitwise)) +#else +#define __bitwise__ +#endif + +typedef __u16 __bitwise__ __le16; +typedef __u16 __bitwise__ __be16; +typedef __u32 __bitwise__ __le32; +typedef __u32 __bitwise__ __be32; +typedef __u64 __bitwise__ __le64; +typedef __u64 __bitwise__ __be64; + +typedef __u16 __bitwise__ __sum16; +typedef __u32 __bitwise__ __wsum; + +#endif /* __PROFILER_H */ diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c new file mode 100644 index 000000000000..2a7befbd11ad --- /dev/null +++ b/tools/bpf/bpftool/struct_ops.c @@ -0,0 +1,596 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2020 Facebook */ + +#include <errno.h> +#include <stdio.h> +#include <unistd.h> + +#include <linux/err.h> + +#include <bpf/bpf.h> +#include <bpf/btf.h> +#include <bpf/libbpf.h> + +#include "json_writer.h" +#include "main.h" + +#define STRUCT_OPS_VALUE_PREFIX "bpf_struct_ops_" + +static const struct btf_type *map_info_type; +static __u32 map_info_alloc_len; +static struct btf *btf_vmlinux; +static __s32 map_info_type_id; + +struct res { + unsigned int nr_maps; + unsigned int nr_errs; +}; + +static const struct btf *get_btf_vmlinux(void) +{ + if (btf_vmlinux) + return btf_vmlinux; + + btf_vmlinux = libbpf_find_kernel_btf(); + if (IS_ERR(btf_vmlinux)) + p_err("struct_ops requires kernel CONFIG_DEBUG_INFO_BTF=y"); + + return btf_vmlinux; +} + +static const char *get_kern_struct_ops_name(const struct bpf_map_info *info) +{ + const struct btf *kern_btf; + const struct btf_type *t; + const char *st_ops_name; + + kern_btf = get_btf_vmlinux(); + if (IS_ERR(kern_btf)) + return "<btf_vmlinux_not_found>"; + + t = btf__type_by_id(kern_btf, info->btf_vmlinux_value_type_id); + st_ops_name = btf__name_by_offset(kern_btf, t->name_off); + st_ops_name += strlen(STRUCT_OPS_VALUE_PREFIX); + + return st_ops_name; +} + +static __s32 get_map_info_type_id(void) +{ + const struct btf *kern_btf; + + if (map_info_type_id) + return map_info_type_id; + + kern_btf = get_btf_vmlinux(); + if (IS_ERR(kern_btf)) { + map_info_type_id = PTR_ERR(kern_btf); + return map_info_type_id; + } + + map_info_type_id = btf__find_by_name_kind(kern_btf, "bpf_map_info", + BTF_KIND_STRUCT); + if (map_info_type_id < 0) { + p_err("can't find bpf_map_info from btf_vmlinux"); + return map_info_type_id; + } + map_info_type = btf__type_by_id(kern_btf, map_info_type_id); + + /* Ensure map_info_alloc() has at least what the bpftool needs */ + map_info_alloc_len = 
map_info_type->size; + if (map_info_alloc_len < sizeof(struct bpf_map_info)) + map_info_alloc_len = sizeof(struct bpf_map_info); + + return map_info_type_id; +} + +/* If the subcmd needs to print out the bpf_map_info, + * it should always call map_info_alloc to allocate + * a bpf_map_info object instead of allocating it + * on the stack. + * + * map_info_alloc() will take the running kernel's btf + * into account. i.e. it will consider the + * sizeof(struct bpf_map_info) of the running kernel. + * + * It will enable the "struct_ops" cmd to print the latest + * "struct bpf_map_info". + * + * [ Recall that "struct_ops" requires the kernel's btf to + * be available ] + */ +static struct bpf_map_info *map_info_alloc(__u32 *alloc_len) +{ + struct bpf_map_info *info; + + if (get_map_info_type_id() < 0) + return NULL; + + info = calloc(1, map_info_alloc_len); + if (!info) + p_err("mem alloc failed"); + else + *alloc_len = map_info_alloc_len; + + return info; +} + +/* It iterates all struct_ops maps of the system. + * It returns the fd in "*res_fd" and map_info in "*info". + * In the very first iteration, info->id should be 0. + * An optional map "*name" filter can be specified. + * The filter can be made more flexible in the future. + * e.g. filter by kernel-struct-ops-name, regex-name, glob-name, ...etc. + * + * Return value: + * 1: A struct_ops map found. It is returned in "*res_fd" and "*info". + * The caller can continue to call get_next in the future. + * 0: No struct_ops map is returned. + * All struct_ops map has been found. + * -1: Error and the caller should abort the iteration. + */ +static int get_next_struct_ops_map(const char *name, int *res_fd, + struct bpf_map_info *info, __u32 info_len) +{ + __u32 id = info->id; + int err, fd; + + while (true) { + err = bpf_map_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + return 0; + p_err("can't get next map: %s", strerror(errno)); + return -1; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + p_err("can't get map by id (%u): %s", + id, strerror(errno)); + return -1; + } + + err = bpf_obj_get_info_by_fd(fd, info, &info_len); + if (err) { + p_err("can't get map info: %s", strerror(errno)); + close(fd); + return -1; + } + + if (info->type == BPF_MAP_TYPE_STRUCT_OPS && + (!name || !strcmp(name, info->name))) { + *res_fd = fd; + return 1; + } + close(fd); + } +} + +static int cmd_retval(const struct res *res, bool must_have_one_map) +{ + if (res->nr_errs || (!res->nr_maps && must_have_one_map)) + return -1; + + return 0; +} + +/* "data" is the work_func private storage */ +typedef int (*work_func)(int fd, const struct bpf_map_info *info, void *data, + struct json_writer *wtr); + +/* Find all struct_ops map in the system. + * Filter out by "name" (if specified). + * Then call "func(fd, info, data, wtr)" on each struct_ops map found. + */ +static struct res do_search(const char *name, work_func func, void *data, + struct json_writer *wtr) +{ + struct bpf_map_info *info; + struct res res = {}; + __u32 info_len; + int fd, err; + + info = map_info_alloc(&info_len); + if (!info) { + res.nr_errs++; + return res; + } + + if (wtr) + jsonw_start_array(wtr); + while ((err = get_next_struct_ops_map(name, &fd, info, info_len)) == 1) { + res.nr_maps++; + err = func(fd, info, data, wtr); + if (err) + res.nr_errs++; + close(fd); + } + if (wtr) + jsonw_end_array(wtr); + + if (err) + res.nr_errs++; + + if (!wtr && name && !res.nr_errs && !res.nr_maps) + /* It is not printing empty []. 
+ * Thus, needs to specifically say nothing found + * for "name" here. + */ + p_err("no struct_ops found for %s", name); + else if (!wtr && json_output && !res.nr_errs) + /* The "func()" above is not writing any json (i.e. !wtr + * test here). + * + * However, "-j" is enabled and there is no errs here, + * so call json_null() as the current convention of + * other cmds. + */ + jsonw_null(json_wtr); + + free(info); + return res; +} + +static struct res do_one_id(const char *id_str, work_func func, void *data, + struct json_writer *wtr) +{ + struct bpf_map_info *info; + struct res res = {}; + unsigned long id; + __u32 info_len; + char *endptr; + int fd; + + id = strtoul(id_str, &endptr, 0); + if (*endptr || !id || id > UINT32_MAX) { + p_err("invalid id %s", id_str); + res.nr_errs++; + return res; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd == -1) { + p_err("can't get map by id (%lu): %s", id, strerror(errno)); + res.nr_errs++; + return res; + } + + info = map_info_alloc(&info_len); + if (!info) { + res.nr_errs++; + goto done; + } + + if (bpf_obj_get_info_by_fd(fd, info, &info_len)) { + p_err("can't get map info: %s", strerror(errno)); + res.nr_errs++; + goto done; + } + + if (info->type != BPF_MAP_TYPE_STRUCT_OPS) { + p_err("%s id %u is not a struct_ops map", info->name, info->id); + res.nr_errs++; + goto done; + } + + res.nr_maps++; + + if (func(fd, info, data, wtr)) + res.nr_errs++; + else if (!wtr && json_output) + /* The "func()" above is not writing any json (i.e. !wtr + * test here). + * + * However, "-j" is enabled and there is no errs here, + * so call json_null() as the current convention of + * other cmds. + */ + jsonw_null(json_wtr); + +done: + free(info); + close(fd); + + return res; +} + +static struct res do_work_on_struct_ops(const char *search_type, + const char *search_term, + work_func func, void *data, + struct json_writer *wtr) +{ + if (search_type) { + if (is_prefix(search_type, "id")) + return do_one_id(search_term, func, data, wtr); + else if (!is_prefix(search_type, "name")) + usage(); + } + + return do_search(search_term, func, data, wtr); +} + +static int __do_show(int fd, const struct bpf_map_info *info, void *data, + struct json_writer *wtr) +{ + if (wtr) { + jsonw_start_object(wtr); + jsonw_uint_field(wtr, "id", info->id); + jsonw_string_field(wtr, "name", info->name); + jsonw_string_field(wtr, "kernel_struct_ops", + get_kern_struct_ops_name(info)); + jsonw_end_object(wtr); + } else { + printf("%u: %-15s %-32s\n", info->id, info->name, + get_kern_struct_ops_name(info)); + } + + return 0; +} + +static int do_show(int argc, char **argv) +{ + const char *search_type = NULL, *search_term = NULL; + struct res res; + + if (argc && argc != 2) + usage(); + + if (argc == 2) { + search_type = GET_ARG(); + search_term = GET_ARG(); + } + + res = do_work_on_struct_ops(search_type, search_term, __do_show, + NULL, json_wtr); + + return cmd_retval(&res, !!search_term); +} + +static int __do_dump(int fd, const struct bpf_map_info *info, void *data, + struct json_writer *wtr) +{ + struct btf_dumper *d = (struct btf_dumper *)data; + const struct btf_type *struct_ops_type; + const struct btf *kern_btf = d->btf; + const char *struct_ops_name; + int zero = 0; + void *value; + + /* note: d->jw == wtr */ + + kern_btf = d->btf; + + /* The kernel supporting BPF_MAP_TYPE_STRUCT_OPS must have + * btf_vmlinux_value_type_id. 
+ */ + struct_ops_type = btf__type_by_id(kern_btf, + info->btf_vmlinux_value_type_id); + struct_ops_name = btf__name_by_offset(kern_btf, + struct_ops_type->name_off); + value = calloc(1, info->value_size); + if (!value) { + p_err("mem alloc failed"); + return -1; + } + + if (bpf_map_lookup_elem(fd, &zero, value)) { + p_err("can't lookup struct_ops map %s id %u", + info->name, info->id); + free(value); + return -1; + } + + jsonw_start_object(wtr); + jsonw_name(wtr, "bpf_map_info"); + btf_dumper_type(d, map_info_type_id, (void *)info); + jsonw_end_object(wtr); + + jsonw_start_object(wtr); + jsonw_name(wtr, struct_ops_name); + btf_dumper_type(d, info->btf_vmlinux_value_type_id, value); + jsonw_end_object(wtr); + + free(value); + + return 0; +} + +static int do_dump(int argc, char **argv) +{ + const char *search_type = NULL, *search_term = NULL; + json_writer_t *wtr = json_wtr; + const struct btf *kern_btf; + struct btf_dumper d = {}; + struct res res; + + if (argc && argc != 2) + usage(); + + if (argc == 2) { + search_type = GET_ARG(); + search_term = GET_ARG(); + } + + kern_btf = get_btf_vmlinux(); + if (IS_ERR(kern_btf)) + return -1; + + if (!json_output) { + wtr = jsonw_new(stdout); + if (!wtr) { + p_err("can't create json writer"); + return -1; + } + jsonw_pretty(wtr, true); + } + + d.btf = kern_btf; + d.jw = wtr; + d.is_plain_text = !json_output; + d.prog_id_as_func_ptr = true; + + res = do_work_on_struct_ops(search_type, search_term, __do_dump, &d, + wtr); + + if (!json_output) + jsonw_destroy(&wtr); + + return cmd_retval(&res, !!search_term); +} + +static int __do_unregister(int fd, const struct bpf_map_info *info, void *data, + struct json_writer *wtr) +{ + int zero = 0; + + if (bpf_map_delete_elem(fd, &zero)) { + p_err("can't unload %s %s id %u: %s", + get_kern_struct_ops_name(info), info->name, + info->id, strerror(errno)); + return -1; + } + + p_info("Unregistered %s %s id %u", + get_kern_struct_ops_name(info), info->name, + info->id); + + return 0; +} + +static int do_unregister(int argc, char **argv) +{ + const char *search_type, *search_term; + struct res res; + + if (argc != 2) + usage(); + + search_type = GET_ARG(); + search_term = GET_ARG(); + + res = do_work_on_struct_ops(search_type, search_term, + __do_unregister, NULL, NULL); + + return cmd_retval(&res, true); +} + +static int do_register(int argc, char **argv) +{ + const struct bpf_map_def *def; + struct bpf_map_info info = {}; + __u32 info_len = sizeof(info); + int nr_errs = 0, nr_maps = 0; + struct bpf_object *obj; + struct bpf_link *link; + struct bpf_map *map; + const char *file; + + if (argc != 1) + usage(); + + file = GET_ARG(); + + obj = bpf_object__open(file); + if (IS_ERR_OR_NULL(obj)) + return -1; + + set_max_rlimit(); + + if (bpf_object__load(obj)) { + bpf_object__close(obj); + return -1; + } + + bpf_object__for_each_map(map, obj) { + def = bpf_map__def(map); + if (def->type != BPF_MAP_TYPE_STRUCT_OPS) + continue; + + link = bpf_map__attach_struct_ops(map); + if (IS_ERR(link)) { + p_err("can't register struct_ops %s: %s", + bpf_map__name(map), + strerror(-PTR_ERR(link))); + nr_errs++; + continue; + } + nr_maps++; + + bpf_link__disconnect(link); + bpf_link__destroy(link); + + if (!bpf_obj_get_info_by_fd(bpf_map__fd(map), &info, + &info_len)) + p_info("Registered %s %s id %u", + get_kern_struct_ops_name(&info), + bpf_map__name(map), + info.id); + else + /* Not p_err. The struct_ops was attached + * successfully. 
+ */ + p_info("Registered %s but can't find id: %s", + bpf_map__name(map), strerror(errno)); + } + + bpf_object__close(obj); + + if (nr_errs) + return -1; + + if (!nr_maps) { + p_err("no struct_ops found in %s", file); + return -1; + } + + if (json_output) + jsonw_null(json_wtr); + + return 0; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %s %s { show | list } [STRUCT_OPS_MAP]\n" + " %s %s dump [STRUCT_OPS_MAP]\n" + " %s %s register OBJ\n" + " %s %s unregister STRUCT_OPS_MAP\n" + " %s %s help\n" + "\n" + " OPTIONS := { {-j|--json} [{-p|--pretty}] }\n" + " STRUCT_OPS_MAP := [ id STRUCT_OPS_MAP_ID | name STRUCT_OPS_MAP_NAME ]\n", + bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "register", do_register }, + { "unregister", do_unregister }, + { "dump", do_dump }, + { "help", do_help }, + { 0 } +}; + +int do_struct_ops(int argc, char **argv) +{ + int err; + + err = cmd_select(cmds, argc, argv, do_help); + + btf__free(btf_vmlinux); + return err; +} diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index 48a39f72fadf..1f18a409f044 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -5,9 +5,7 @@ #include "runqslower.h" #define TASK_RUNNING 0 - -#define BPF_F_INDEX_MASK 0xffffffffULL -#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +#define BPF_F_CURRENT_CPU 0xffffffffULL const volatile __u64 min_us = 0; const volatile pid_t targ_pid = 0; diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 7ac0d8088565..ab8e89a7009c 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -67,7 +67,8 @@ FILES= \ test-llvm.bin \ test-llvm-version.bin \ test-libaio.bin \ - test-libzstd.bin + test-libzstd.bin \ + test-clang-bpf-global-var.bin FILES := $(addprefix $(OUTPUT),$(FILES)) @@ -75,6 +76,7 @@ CC ?= $(CROSS_COMPILE)gcc CXX ?= $(CROSS_COMPILE)g++ PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config LLVM_CONFIG ?= llvm-config +CLANG ?= clang all: $(FILES) @@ -321,6 +323,11 @@ $(OUTPUT)test-libaio.bin: $(OUTPUT)test-libzstd.bin: $(BUILD) -lzstd +$(OUTPUT)test-clang-bpf-global-var.bin: + $(CLANG) -S -g -target bpf -o - $(patsubst %.bin,%.c,$(@F)) | \ + grep BTF_KIND_VAR + + ############################### clean: diff --git a/tools/build/feature/test-clang-bpf-global-var.c b/tools/build/feature/test-clang-bpf-global-var.c new file mode 100644 index 000000000000..221f1481d52e --- /dev/null +++ b/tools/build/feature/test-clang-bpf-global-var.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +volatile int global_value_for_test = 1; diff --git a/tools/edid/1024x768.S b/tools/edid/1024x768.S new file mode 100644 index 000000000000..4aed3f9ab88a --- /dev/null +++ b/tools/edid/1024x768.S @@ -0,0 +1,43 @@ +/* + 1024x768.S: EDID data set for standard 1024x768 60 Hz monitor + + Copyright (C) 2011 Carsten Emde <C.Emde@osadl.org> + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +/* EDID */ +#define VERSION 1 +#define REVISION 3 + +/* Display */ +#define CLOCK 65000 /* kHz */ +#define XPIX 1024 +#define YPIX 768 +#define XY_RATIO XY_RATIO_4_3 +#define XBLANK 320 +#define YBLANK 38 +#define XOFFSET 8 +#define XPULSE 144 +#define YOFFSET 3 +#define YPULSE 6 +#define DPI 72 +#define VFREQ 60 /* Hz */ +#define TIMING_NAME "Linux XGA" +#define ESTABLISHED_TIMING2_BITS 0x08 /* Bit 3 -> 1024x768 @60 Hz */ +#define HSYNC_POL 0 +#define VSYNC_POL 0 + +#include "edid.S" diff --git a/tools/edid/1280x1024.S b/tools/edid/1280x1024.S new file mode 100644 index 000000000000..b26dd424cad7 --- /dev/null +++ b/tools/edid/1280x1024.S @@ -0,0 +1,43 @@ +/* + 1280x1024.S: EDID data set for standard 1280x1024 60 Hz monitor + + Copyright (C) 2011 Carsten Emde <C.Emde@osadl.org> + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +/* EDID */ +#define VERSION 1 +#define REVISION 3 + +/* Display */ +#define CLOCK 108000 /* kHz */ +#define XPIX 1280 +#define YPIX 1024 +#define XY_RATIO XY_RATIO_5_4 +#define XBLANK 408 +#define YBLANK 42 +#define XOFFSET 48 +#define XPULSE 112 +#define YOFFSET 1 +#define YPULSE 3 +#define DPI 72 +#define VFREQ 60 /* Hz */ +#define TIMING_NAME "Linux SXGA" +/* No ESTABLISHED_TIMINGx_BITS */ +#define HSYNC_POL 1 +#define VSYNC_POL 1 + +#include "edid.S" diff --git a/tools/edid/1600x1200.S b/tools/edid/1600x1200.S new file mode 100644 index 000000000000..0d091b282768 --- /dev/null +++ b/tools/edid/1600x1200.S @@ -0,0 +1,43 @@ +/* + 1600x1200.S: EDID data set for standard 1600x1200 60 Hz monitor + + Copyright (C) 2013 Carsten Emde <C.Emde@osadl.org> + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+*/ + +/* EDID */ +#define VERSION 1 +#define REVISION 3 + +/* Display */ +#define CLOCK 162000 /* kHz */ +#define XPIX 1600 +#define YPIX 1200 +#define XY_RATIO XY_RATIO_4_3 +#define XBLANK 560 +#define YBLANK 50 +#define XOFFSET 64 +#define XPULSE 192 +#define YOFFSET 1 +#define YPULSE 3 +#define DPI 72 +#define VFREQ 60 /* Hz */ +#define TIMING_NAME "Linux UXGA" +/* No ESTABLISHED_TIMINGx_BITS */ +#define HSYNC_POL 1 +#define VSYNC_POL 1 + +#include "edid.S" diff --git a/tools/edid/1680x1050.S b/tools/edid/1680x1050.S new file mode 100644 index 000000000000..7dfed9a33eab --- /dev/null +++ b/tools/edid/1680x1050.S @@ -0,0 +1,43 @@ +/* + 1680x1050.S: EDID data set for standard 1680x1050 60 Hz monitor + + Copyright (C) 2012 Carsten Emde <C.Emde@osadl.org> + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +/* EDID */ +#define VERSION 1 +#define REVISION 3 + +/* Display */ +#define CLOCK 146250 /* kHz */ +#define XPIX 1680 +#define YPIX 1050 +#define XY_RATIO XY_RATIO_16_10 +#define XBLANK 560 +#define YBLANK 39 +#define XOFFSET 104 +#define XPULSE 176 +#define YOFFSET 3 +#define YPULSE 6 +#define DPI 96 +#define VFREQ 60 /* Hz */ +#define TIMING_NAME "Linux WSXGA" +/* No ESTABLISHED_TIMINGx_BITS */ +#define HSYNC_POL 1 +#define VSYNC_POL 1 + +#include "edid.S" diff --git a/tools/edid/1920x1080.S b/tools/edid/1920x1080.S new file mode 100644 index 000000000000..d6ffbba28e95 --- /dev/null +++ b/tools/edid/1920x1080.S @@ -0,0 +1,43 @@ +/* + 1920x1080.S: EDID data set for standard 1920x1080 60 Hz monitor + + Copyright (C) 2012 Carsten Emde <C.Emde@osadl.org> + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+*/ + +/* EDID */ +#define VERSION 1 +#define REVISION 3 + +/* Display */ +#define CLOCK 148500 /* kHz */ +#define XPIX 1920 +#define YPIX 1080 +#define XY_RATIO XY_RATIO_16_9 +#define XBLANK 280 +#define YBLANK 45 +#define XOFFSET 88 +#define XPULSE 44 +#define YOFFSET 4 +#define YPULSE 5 +#define DPI 96 +#define VFREQ 60 /* Hz */ +#define TIMING_NAME "Linux FHD" +/* No ESTABLISHED_TIMINGx_BITS */ +#define HSYNC_POL 1 +#define VSYNC_POL 1 + +#include "edid.S" diff --git a/tools/edid/800x600.S b/tools/edid/800x600.S new file mode 100644 index 000000000000..a5616588de08 --- /dev/null +++ b/tools/edid/800x600.S @@ -0,0 +1,40 @@ +/* + 800x600.S: EDID data set for standard 800x600 60 Hz monitor + + Copyright (C) 2011 Carsten Emde <C.Emde@osadl.org> + Copyright (C) 2014 Linaro Limited + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. +*/ + +/* EDID */ +#define VERSION 1 +#define REVISION 3 + +/* Display */ +#define CLOCK 40000 /* kHz */ +#define XPIX 800 +#define YPIX 600 +#define XY_RATIO XY_RATIO_4_3 +#define XBLANK 256 +#define YBLANK 28 +#define XOFFSET 40 +#define XPULSE 128 +#define YOFFSET 1 +#define YPULSE 4 +#define DPI 72 +#define VFREQ 60 /* Hz */ +#define TIMING_NAME "Linux SVGA" +#define ESTABLISHED_TIMING1_BITS 0x01 /* Bit 0: 800x600 @ 60Hz */ +#define HSYNC_POL 1 +#define VSYNC_POL 1 + +#include "edid.S" diff --git a/tools/edid/Makefile b/tools/edid/Makefile new file mode 100644 index 000000000000..85a927dfab02 --- /dev/null +++ b/tools/edid/Makefile @@ -0,0 +1,37 @@ + +SOURCES := $(wildcard [0-9]*x[0-9]*.S) + +BIN := $(patsubst %.S, %.bin, $(SOURCES)) + +IHEX := $(patsubst %.S, %.bin.ihex, $(SOURCES)) + +CODE := $(patsubst %.S, %.c, $(SOURCES)) + +all: $(BIN) $(IHEX) $(CODE) + +clean: + @rm -f *.o *.bin.ihex *.bin *.c + +%.o: %.S + @cc -c $^ + +%.bin.nocrc: %.o + @objcopy -Obinary $^ $@ + +%.crc: %.bin.nocrc + @list=$$(for i in `seq 1 127`; do head -c$$i $^ | tail -c1 \ + | hexdump -v -e '/1 "%02X+"'; done); \ + echo "ibase=16;100-($${list%?})%100" | bc >$@ + +%.p: %.crc %.S + @cc -c -DCRC="$$(cat $*.crc)" -o $@ $*.S + +%.bin: %.p + @objcopy -Obinary $^ $@ + +%.bin.ihex: %.p + @objcopy -Oihex $^ $@ + @dos2unix $@ 2>/dev/null + +%.c: %.bin + @echo "{" >$@; hexdump -f hex $^ >>$@; echo "};" >>$@ diff --git a/tools/edid/edid.S b/tools/edid/edid.S new file mode 100644 index 000000000000..c3d13815526d --- /dev/null +++ b/tools/edid/edid.S @@ -0,0 +1,274 @@ +/* + edid.S: EDID data template + + Copyright (C) 2012 Carsten Emde <C.Emde@osadl.org> + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + + +/* Manufacturer */ +#define MFG_LNX1 'L' +#define MFG_LNX2 'N' +#define MFG_LNX3 'X' +#define SERIAL 0 +#define YEAR 2012 +#define WEEK 5 + +/* EDID 1.3 standard definitions */ +#define XY_RATIO_16_10 0b00 +#define XY_RATIO_4_3 0b01 +#define XY_RATIO_5_4 0b10 +#define XY_RATIO_16_9 0b11 + +/* Provide defaults for the timing bits */ +#ifndef ESTABLISHED_TIMING1_BITS +#define ESTABLISHED_TIMING1_BITS 0x00 +#endif +#ifndef ESTABLISHED_TIMING2_BITS +#define ESTABLISHED_TIMING2_BITS 0x00 +#endif +#ifndef ESTABLISHED_TIMING3_BITS +#define ESTABLISHED_TIMING3_BITS 0x00 +#endif + +#define mfgname2id(v1,v2,v3) \ + ((((v1-'@')&0x1f)<<10)+(((v2-'@')&0x1f)<<5)+((v3-'@')&0x1f)) +#define swap16(v1) ((v1>>8)+((v1&0xff)<<8)) +#define lsbs2(v1,v2) (((v1&0x0f)<<4)+(v2&0x0f)) +#define msbs2(v1,v2) ((((v1>>8)&0x0f)<<4)+((v2>>8)&0x0f)) +#define msbs4(v1,v2,v3,v4) \ + ((((v1>>8)&0x03)<<6)+(((v2>>8)&0x03)<<4)+\ + (((v3>>4)&0x03)<<2)+((v4>>4)&0x03)) +#define pixdpi2mm(pix,dpi) ((pix*25)/dpi) +#define xsize pixdpi2mm(XPIX,DPI) +#define ysize pixdpi2mm(YPIX,DPI) + + .data + +/* Fixed header pattern */ +header: .byte 0x00,0xff,0xff,0xff,0xff,0xff,0xff,0x00 + +mfg_id: .hword swap16(mfgname2id(MFG_LNX1, MFG_LNX2, MFG_LNX3)) + +prod_code: .hword 0 + +/* Serial number. 32 bits, little endian. */ +serial_number: .long SERIAL + +/* Week of manufacture */ +week: .byte WEEK + +/* Year of manufacture, less 1990. (1990-2245) + If week=255, it is the model year instead */ +year: .byte YEAR-1990 + +version: .byte VERSION /* EDID version, usually 1 (for 1.3) */ +revision: .byte REVISION /* EDID revision, usually 3 (for 1.3) */ + +/* If Bit 7=1 Digital input. If set, the following bit definitions apply: + Bits 6-1 Reserved, must be 0 + Bit 0 Signal is compatible with VESA DFP 1.x TMDS CRGB, + 1 pixel per clock, up to 8 bits per color, MSB aligned, + If Bit 7=0 Analog input. If clear, the following bit definitions apply: + Bits 6-5 Video white and sync levels, relative to blank + 00=+0.7/-0.3 V; 01=+0.714/-0.286 V; + 10=+1.0/-0.4 V; 11=+0.7/0 V + Bit 4 Blank-to-black setup (pedestal) expected + Bit 3 Separate sync supported + Bit 2 Composite sync (on HSync) supported + Bit 1 Sync on green supported + Bit 0 VSync pulse must be serrated when somposite or + sync-on-green is used. */ +video_parms: .byte 0x6d + +/* Maximum horizontal image size, in centimetres + (max 292 cm/115 in at 16:9 aspect ratio) */ +max_hor_size: .byte xsize/10 + +/* Maximum vertical image size, in centimetres. + If either byte is 0, undefined (e.g. projector) */ +max_vert_size: .byte ysize/10 + +/* Display gamma, minus 1, times 100 (range 1.00-3.5 */ +gamma: .byte 120 + +/* Bit 7 DPMS standby supported + Bit 6 DPMS suspend supported + Bit 5 DPMS active-off supported + Bits 4-3 Display type: 00=monochrome; 01=RGB colour; + 10=non-RGB multicolour; 11=undefined + Bit 2 Standard sRGB colour space. Bytes 25-34 must contain + sRGB standard values. + Bit 1 Preferred timing mode specified in descriptor block 1. + Bit 0 GTF supported with default parameter values. */ +dsp_features: .byte 0xea + +/* Chromaticity coordinates. 
*/ +/* Red and green least-significant bits + Bits 7-6 Red x value least-significant 2 bits + Bits 5-4 Red y value least-significant 2 bits + Bits 3-2 Green x value lst-significant 2 bits + Bits 1-0 Green y value least-significant 2 bits */ +red_green_lsb: .byte 0x5e + +/* Blue and white least-significant 2 bits */ +blue_white_lsb: .byte 0xc0 + +/* Red x value most significant 8 bits. + 0-255 encodes 0-0.996 (255/256); 0-0.999 (1023/1024) with lsbits */ +red_x_msb: .byte 0xa4 + +/* Red y value most significant 8 bits */ +red_y_msb: .byte 0x59 + +/* Green x and y value most significant 8 bits */ +green_x_y_msb: .byte 0x4a,0x98 + +/* Blue x and y value most significant 8 bits */ +blue_x_y_msb: .byte 0x25,0x20 + +/* Default white point x and y value most significant 8 bits */ +white_x_y_msb: .byte 0x50,0x54 + +/* Established timings */ +/* Bit 7 720x400 @ 70 Hz + Bit 6 720x400 @ 88 Hz + Bit 5 640x480 @ 60 Hz + Bit 4 640x480 @ 67 Hz + Bit 3 640x480 @ 72 Hz + Bit 2 640x480 @ 75 Hz + Bit 1 800x600 @ 56 Hz + Bit 0 800x600 @ 60 Hz */ +estbl_timing1: .byte ESTABLISHED_TIMING1_BITS + +/* Bit 7 800x600 @ 72 Hz + Bit 6 800x600 @ 75 Hz + Bit 5 832x624 @ 75 Hz + Bit 4 1024x768 @ 87 Hz, interlaced (1024x768) + Bit 3 1024x768 @ 60 Hz + Bit 2 1024x768 @ 72 Hz + Bit 1 1024x768 @ 75 Hz + Bit 0 1280x1024 @ 75 Hz */ +estbl_timing2: .byte ESTABLISHED_TIMING2_BITS + +/* Bit 7 1152x870 @ 75 Hz (Apple Macintosh II) + Bits 6-0 Other manufacturer-specific display mod */ +estbl_timing3: .byte ESTABLISHED_TIMING3_BITS + +/* Standard timing */ +/* X resolution, less 31, divided by 8 (256-2288 pixels) */ +std_xres: .byte (XPIX/8)-31 +/* Y resolution, X:Y pixel ratio + Bits 7-6 X:Y pixel ratio: 00=16:10; 01=4:3; 10=5:4; 11=16:9. + Bits 5-0 Vertical frequency, less 60 (60-123 Hz) */ +std_vres: .byte (XY_RATIO<<6)+VFREQ-60 + .fill 7,2,0x0101 /* Unused */ + +descriptor1: +/* Pixel clock in 10 kHz units. (0.-655.35 MHz, little-endian) */ +clock: .hword CLOCK/10 + +/* Horizontal active pixels 8 lsbits (0-4095) */ +x_act_lsb: .byte XPIX&0xff +/* Horizontal blanking pixels 8 lsbits (0-4095) + End of active to start of next active. 
*/ +x_blk_lsb: .byte XBLANK&0xff +/* Bits 7-4 Horizontal active pixels 4 msbits + Bits 3-0 Horizontal blanking pixels 4 msbits */ +x_msbs: .byte msbs2(XPIX,XBLANK) + +/* Vertical active lines 8 lsbits (0-4095) */ +y_act_lsb: .byte YPIX&0xff +/* Vertical blanking lines 8 lsbits (0-4095) */ +y_blk_lsb: .byte YBLANK&0xff +/* Bits 7-4 Vertical active lines 4 msbits + Bits 3-0 Vertical blanking lines 4 msbits */ +y_msbs: .byte msbs2(YPIX,YBLANK) + +/* Horizontal sync offset pixels 8 lsbits (0-1023) From blanking start */ +x_snc_off_lsb: .byte XOFFSET&0xff +/* Horizontal sync pulse width pixels 8 lsbits (0-1023) */ +x_snc_pls_lsb: .byte XPULSE&0xff +/* Bits 7-4 Vertical sync offset lines 4 lsbits (0-63) + Bits 3-0 Vertical sync pulse width lines 4 lsbits (0-63) */ +y_snc_lsb: .byte lsbs2(YOFFSET, YPULSE) +/* Bits 7-6 Horizontal sync offset pixels 2 msbits + Bits 5-4 Horizontal sync pulse width pixels 2 msbits + Bits 3-2 Vertical sync offset lines 2 msbits + Bits 1-0 Vertical sync pulse width lines 2 msbits */ +xy_snc_msbs: .byte msbs4(XOFFSET,XPULSE,YOFFSET,YPULSE) + +/* Horizontal display size, mm, 8 lsbits (0-4095 mm, 161 in) */ +x_dsp_size: .byte xsize&0xff + +/* Vertical display size, mm, 8 lsbits (0-4095 mm, 161 in) */ +y_dsp_size: .byte ysize&0xff + +/* Bits 7-4 Horizontal display size, mm, 4 msbits + Bits 3-0 Vertical display size, mm, 4 msbits */ +dsp_size_mbsb: .byte msbs2(xsize,ysize) + +/* Horizontal border pixels (each side; total is twice this) */ +x_border: .byte 0 +/* Vertical border lines (each side; total is twice this) */ +y_border: .byte 0 + +/* Bit 7 Interlaced + Bits 6-5 Stereo mode: 00=No stereo; other values depend on bit 0: + Bit 0=0: 01=Field sequential, sync=1 during right; 10=similar, + sync=1 during left; 11=4-way interleaved stereo + Bit 0=1 2-way interleaved stereo: 01=Right image on even lines; + 10=Left image on even lines; 11=side-by-side + Bits 4-3 Sync type: 00=Analog composite; 01=Bipolar analog composite; + 10=Digital composite (on HSync); 11=Digital separate + Bit 2 If digital separate: Vertical sync polarity (1=positive) + Other types: VSync serrated (HSync during VSync) + Bit 1 If analog sync: Sync on all 3 RGB lines (else green only) + Digital: HSync polarity (1=positive) + Bit 0 2-way line-interleaved stereo, if bits 4-3 are not 00. 
*/ +features: .byte 0x18+(VSYNC_POL<<2)+(HSYNC_POL<<1) + +descriptor2: .byte 0,0 /* Not a detailed timing descriptor */ + .byte 0 /* Must be zero */ + .byte 0xff /* Descriptor is monitor serial number (text) */ + .byte 0 /* Must be zero */ +start1: .ascii "Linux #0" +end1: .byte 0x0a /* End marker */ + .fill 12-(end1-start1), 1, 0x20 /* Padded spaces */ +descriptor3: .byte 0,0 /* Not a detailed timing descriptor */ + .byte 0 /* Must be zero */ + .byte 0xfd /* Descriptor is monitor range limits */ + .byte 0 /* Must be zero */ +start2: .byte VFREQ-1 /* Minimum vertical field rate (1-255 Hz) */ + .byte VFREQ+1 /* Maximum vertical field rate (1-255 Hz) */ + .byte (CLOCK/(XPIX+XBLANK))-1 /* Minimum horizontal line rate + (1-255 kHz) */ + .byte (CLOCK/(XPIX+XBLANK))+1 /* Maximum horizontal line rate + (1-255 kHz) */ + .byte (CLOCK/10000)+1 /* Maximum pixel clock rate, rounded up + to 10 MHz multiple (10-2550 MHz) */ + .byte 0 /* No extended timing information type */ +end2: .byte 0x0a /* End marker */ + .fill 12-(end2-start2), 1, 0x20 /* Padded spaces */ +descriptor4: .byte 0,0 /* Not a detailed timing descriptor */ + .byte 0 /* Must be zero */ + .byte 0xfc /* Descriptor is text */ + .byte 0 /* Must be zero */ +start3: .ascii TIMING_NAME +end3: .byte 0x0a /* End marker */ + .fill 12-(end3-start3), 1, 0x20 /* Padded spaces */ +extensions: .byte 0 /* Number of extensions to follow */ +checksum: .byte CRC /* Sum of all bytes must be 0 */ diff --git a/tools/edid/hex b/tools/edid/hex new file mode 100644 index 000000000000..8873ebb618af --- /dev/null +++ b/tools/edid/hex @@ -0,0 +1 @@ +"\t" 8/1 "0x%02x, " "\n" diff --git a/tools/include/linux/irqflags.h b/tools/include/linux/irqflags.h index e734da3e5b33..67e01bbadbfe 100644 --- a/tools/include/linux/irqflags.h +++ b/tools/include/linux/irqflags.h @@ -2,12 +2,12 @@ #ifndef _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_ #define _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_ -# define trace_hardirq_context(p) 0 -# define trace_softirq_context(p) 0 -# define trace_hardirqs_enabled(p) 0 -# define trace_softirqs_enabled(p) 0 -# define trace_hardirq_enter() do { } while (0) -# define trace_hardirq_exit() do { } while (0) +# define lockdep_hardirq_context(p) 0 +# define lockdep_softirq_context(p) 0 +# define lockdep_hardirqs_enabled(p) 0 +# define lockdep_softirqs_enabled(p) 0 +# define lockdep_hardirq_enter() do { } while (0) +# define lockdep_hardirq_exit() do { } while (0) # define lockdep_softirq_enter() do { } while (0) # define lockdep_softirq_exit() do { } while (0) # define INIT_TRACE_IRQFLAGS diff --git a/tools/include/uapi/asm/errno.h b/tools/include/uapi/asm/errno.h index ce3c5945a1c4..637189ec1ab9 100644 --- a/tools/include/uapi/asm/errno.h +++ b/tools/include/uapi/asm/errno.h @@ -1,18 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ #if defined(__i386__) || defined(__x86_64__) -#include "../../arch/x86/include/uapi/asm/errno.h" +#include "../../../arch/x86/include/uapi/asm/errno.h" #elif defined(__powerpc__) -#include "../../arch/powerpc/include/uapi/asm/errno.h" +#include "../../../arch/powerpc/include/uapi/asm/errno.h" #elif defined(__sparc__) -#include "../../arch/sparc/include/uapi/asm/errno.h" +#include "../../../arch/sparc/include/uapi/asm/errno.h" #elif defined(__alpha__) -#include "../../arch/alpha/include/uapi/asm/errno.h" +#include "../../../arch/alpha/include/uapi/asm/errno.h" #elif defined(__mips__) -#include "../../arch/mips/include/uapi/asm/errno.h" +#include "../../../arch/mips/include/uapi/asm/errno.h" #elif defined(__ia64__) -#include 
"../../arch/ia64/include/uapi/asm/errno.h" +#include "../../../arch/ia64/include/uapi/asm/errno.h" #elif defined(__xtensa__) -#include "../../arch/xtensa/include/uapi/asm/errno.h" +#include "../../../arch/xtensa/include/uapi/asm/errno.h" #else #include <asm-generic/errno.h> #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 22f235260a3a..2e29a671d67e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[0]; /* Arbitrary size */ + __u8 data[]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { @@ -111,6 +111,8 @@ enum bpf_cmd { BPF_MAP_LOOKUP_AND_DELETE_BATCH, BPF_MAP_UPDATE_BATCH, BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, + BPF_LINK_UPDATE, }; enum bpf_map_type { @@ -181,6 +183,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_TRACING, BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, }; enum bpf_attach_type { @@ -210,6 +213,8 @@ enum bpf_attach_type { BPF_TRACE_RAW_TP, BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, + BPF_LSM_MAC, __MAX_BPF_ATTACH_TYPE }; @@ -325,44 +330,46 @@ enum bpf_attach_type { #define BPF_PSEUDO_CALL 1 /* flags for BPF_MAP_UPDATE_ELEM command */ -#define BPF_ANY 0 /* create new element or update existing */ -#define BPF_NOEXIST 1 /* create new element if it didn't exist */ -#define BPF_EXIST 2 /* update existing element */ -#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; /* flags for BPF_MAP_CREATE command */ -#define BPF_F_NO_PREALLOC (1U << 0) +enum { + BPF_F_NO_PREALLOC = (1U << 0), /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list * which can scale and perform better. * Note, the LRU nodes (including free nodes) cannot be moved * across different LRU lists. */ -#define BPF_F_NO_COMMON_LRU (1U << 1) + BPF_F_NO_COMMON_LRU = (1U << 1), /* Specify numa node during map creation */ -#define BPF_F_NUMA_NODE (1U << 2) - -#define BPF_OBJ_NAME_LEN 16U + BPF_F_NUMA_NODE = (1U << 2), /* Flags for accessing BPF object from syscall side. */ -#define BPF_F_RDONLY (1U << 3) -#define BPF_F_WRONLY (1U << 4) + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), /* Flag for stack_map, store build_id+offset instead of pointer */ -#define BPF_F_STACK_BUILD_ID (1U << 5) + BPF_F_STACK_BUILD_ID = (1U << 5), /* Zero-initialize hash function seed. This should only be used for testing. */ -#define BPF_F_ZERO_SEED (1U << 6) + BPF_F_ZERO_SEED = (1U << 6), /* Flags for accessing BPF object from program side. */ -#define BPF_F_RDONLY_PROG (1U << 7) -#define BPF_F_WRONLY_PROG (1U << 8) + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), /* Clone map from listener for newly accepted socket */ -#define BPF_F_CLONE (1U << 9) + BPF_F_CLONE = (1U << 9), /* Enable memory-mapping BPF map */ -#define BPF_F_MMAPABLE (1U << 10) + BPF_F_MMAPABLE = (1U << 10), +}; /* Flags for BPF_PROG_QUERY. 
*/ @@ -391,6 +398,8 @@ struct bpf_stack_build_id { }; }; +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -534,7 +543,7 @@ union bpf_attr { __u32 prog_cnt; } query; - struct { + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ __u64 name; __u32 prog_fd; } raw_tracepoint; @@ -562,6 +571,24 @@ union bpf_attr { __u64 probe_offset; /* output: probe_offset */ __u64 probe_addr; /* output: probe_addr */ } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + __u32 prog_fd; /* eBPF program to attach */ + __u32 target_fd; /* object to attach to */ + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + } link_create; + + struct { /* struct used by BPF_LINK_UPDATE command */ + __u32 link_fd; /* link fd */ + /* new program fd to update link with */ + __u32 new_prog_fd; + __u32 flags; /* extra flags */ + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags */ + __u32 old_prog_fd; + } link_update; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF @@ -2890,6 +2917,114 @@ union bpf_attr { * Obtain the 64bit jiffies * Return * The 64 bit jiffies + * + * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * Description + * For an eBPF program attached to a perf event, retrieve the + * branch records (struct perf_branch_entry) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of sizeof(struct perf_branch_entry). + * + * **-ENOENT** if architecture does not support branch records. + * + * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * + * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. 
+ * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of bpf_get_socket_cookie() helper, but for network namespaces + * instead of sockets. + * Return + * A 8-byte long opaque number. + * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * Description + * Assign the *sk* to the *skb*. When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EINVAL** Unsupported flags specified. + * * **-ENOENT** Socket is unavailable for assignment. + * * **-ENETUNREACH** Socket is unreachable (wrong netns). + * * **-EOPNOTSUPP** Unsupported operation, for example a + * call from outside of TC ingress. + * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3010,7 +3145,13 @@ union bpf_attr { FN(probe_read_kernel_str), \ FN(tcp_send_ack), \ FN(send_signal_thread), \ - FN(jiffies64), + FN(jiffies64), \ + FN(read_branch_records), \ + FN(get_ns_current_pid_tgid), \ + FN(xdp_output), \ + FN(get_netns_cookie), \ + FN(get_current_ancestor_cgroup_id), \ + FN(sk_assign), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3025,69 +3166,100 @@ enum bpf_func_id { /* All flags used by eBPF helper functions, placed here. */ /* BPF_FUNC_skb_store_bytes flags. */ -#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) -#define BPF_F_INVALIDATE_HASH (1ULL << 1) +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. 
*/ -#define BPF_F_HDR_FIELD_MASK 0xfULL +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; /* BPF_FUNC_l4_csum_replace flags. */ -#define BPF_F_PSEUDO_HDR (1ULL << 4) -#define BPF_F_MARK_MANGLED_0 (1ULL << 5) -#define BPF_F_MARK_ENFORCE (1ULL << 6) +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -#define BPF_F_INGRESS (1ULL << 0) +enum { + BPF_F_INGRESS = (1ULL << 0), +}; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ -#define BPF_F_TUNINFO_IPV6 (1ULL << 0) +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; /* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ -#define BPF_F_SKIP_FIELD_MASK 0xffULL -#define BPF_F_USER_STACK (1ULL << 8) +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), /* flags used by BPF_FUNC_get_stackid only. */ -#define BPF_F_FAST_STACK_CMP (1ULL << 9) -#define BPF_F_REUSE_STACKID (1ULL << 10) + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), /* flags used by BPF_FUNC_get_stack only. */ -#define BPF_F_USER_BUILD_ID (1ULL << 11) + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; /* BPF_FUNC_skb_set_tunnel_key flags. */ -#define BPF_F_ZERO_CSUM_TX (1ULL << 1) -#define BPF_F_DONT_FRAGMENT (1ULL << 2) -#define BPF_F_SEQ_NUMBER (1ULL << 3) +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), +}; /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ -#define BPF_F_INDEX_MASK 0xffffffffULL -#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, /* BPF_FUNC_perf_event_output for sk_buff input context. */ -#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; /* Current network namespace */ -#define BPF_F_CURRENT_NETNS (-1L) +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; /* BPF_FUNC_skb_adjust_room flags. */ -#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), +}; -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) -#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) -#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) #define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* BPF_FUNC_sysctl_get_name flags. */ -#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; /* BPF_FUNC_sk_storage_get flags */ -#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +enum { + BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), +}; + +/* BPF_FUNC_read_branch_records flags. */ +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; /* Mode for BPF_FUNC_skb_adjust_room helper. 
*/ enum bpf_adj_room_mode { @@ -3153,6 +3325,7 @@ struct __sk_buff { __u32 wire_len; __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); + __u32 gso_size; }; struct bpf_tunnel_key { @@ -3505,13 +3678,14 @@ struct bpf_sock_ops { }; /* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) -#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) -#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently - * supported cb flags - */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, +}; /* List of known BPF sock_ops operators. * New entries can only be added at the end @@ -3590,8 +3764,10 @@ enum { BPF_TCP_MAX_STATES /* Leave at the end! */ }; -#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ +}; struct bpf_perf_event_value { __u64 counter; @@ -3599,12 +3775,16 @@ struct bpf_perf_event_value { __u64 running; }; -#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) -#define BPF_DEVCG_ACC_READ (1ULL << 1) -#define BPF_DEVCG_ACC_WRITE (1ULL << 2) +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; -#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) -#define BPF_DEVCG_DEV_CHAR (1ULL << 1) +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; struct bpf_cgroup_dev_ctx { /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ @@ -3620,8 +3800,10 @@ struct bpf_raw_tracepoint_args { /* DIRECT: Skip the FIB rules and go to FIB table associated with device * OUTPUT: Do lookup from egress perspective; default is ingress */ -#define BPF_FIB_LOOKUP_DIRECT (1U << 0) -#define BPF_FIB_LOOKUP_OUTPUT (1U << 1) +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), +}; enum { BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ @@ -3693,9 +3875,11 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; -#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; struct bpf_flow_keys { __u16 nhoff; @@ -3761,4 +3945,8 @@ struct bpf_sockopt { __s32 retval; }; +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 024af2d1d0af..ca6665ea758a 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -463,6 +463,7 @@ enum { IFLA_MACSEC_REPLAY_PROTECT, IFLA_MACSEC_VALIDATION, IFLA_MACSEC_PAD, + IFLA_MACSEC_OFFLOAD, __IFLA_MACSEC_MAX, }; @@ -489,6 +490,7 @@ enum macsec_validation_type { enum macsec_offload { MACSEC_OFFLOAD_OFF = 0, MACSEC_OFFLOAD_PHY = 1, + MACSEC_OFFLOAD_MAC = 2, __MACSEC_OFFLOAD_END, MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, }; @@ -960,11 +962,12 @@ enum { #define XDP_FLAGS_SKB_MODE (1U << 1) #define XDP_FLAGS_DRV_MODE (1U << 2) #define 
XDP_FLAGS_HW_MODE (1U << 3) +#define XDP_FLAGS_REPLACE (1U << 4) #define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ XDP_FLAGS_DRV_MODE | \ XDP_FLAGS_HW_MODE) #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ - XDP_FLAGS_MODES) + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) /* These are stored into IFLA_XDP_ATTACHED on dump. */ enum { @@ -984,6 +987,7 @@ enum { IFLA_XDP_DRV_PROG_ID, IFLA_XDP_SKB_PROG_ID, IFLA_XDP_HW_PROG_ID, + IFLA_XDP_EXPECTED_FD, __IFLA_XDP_MAX, }; diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h index 1521073b6348..8533bf07450f 100644 --- a/tools/include/uapi/linux/in.h +++ b/tools/include/uapi/linux/in.h @@ -74,6 +74,8 @@ enum { #define IPPROTO_UDPLITE IPPROTO_UDPLITE IPPROTO_MPLS = 137, /* MPLS in IP (RFC 4023) */ #define IPPROTO_MPLS IPPROTO_MPLS + IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */ +#define IPPROTO_ETHERNET IPPROTO_ETHERNET IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW IPPROTO_MPTCP = 262, /* Multipath TCP connection */ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 377d794d3105..397cfd65b3fe 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -181,6 +181,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -208,6 +210,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -853,7 +857,9 @@ enum perf_event_type { * char data[size];}&& PERF_SAMPLE_RAW * * { u64 nr; - * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER diff --git a/tools/testing/selftests/bpf/include/uapi/linux/types.h b/tools/include/uapi/linux/types.h index 91fa51a9c31d..91fa51a9c31d 100644 --- a/tools/testing/selftests/bpf/include/uapi/linux/types.h +++ b/tools/include/uapi/linux/types.h diff --git a/tools/lib/api/fs/Build b/tools/lib/api/fs/Build index f4ed9629ae85..0f75b28654de 100644 --- a/tools/lib/api/fs/Build +++ b/tools/lib/api/fs/Build @@ -1,2 +1,3 @@ libapi-y += fs.o libapi-y += tracing_path.o +libapi-y += cgroup.o diff --git a/tools/lib/api/fs/cgroup.c b/tools/lib/api/fs/cgroup.c new file mode 100644 index 000000000000..889a6eb4aaca --- /dev/null +++ b/tools/lib/api/fs/cgroup.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/stringify.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "fs.h" + +int cgroupfs_find_mountpoint(char *buf, size_t maxlen, const char *subsys) +{ + FILE *fp; + char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1]; + char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path; + char *token, *saved_ptr = NULL; + + fp = fopen("/proc/mounts", "r"); + if (!fp) + return -1; + + /* + * in order to handle split hierarchy, we need to scan /proc/mounts + * and inspect every cgroupfs mount point to find one that has + * perf_event subsystem + */ + path_v1[0] = '\0'; + 
path_v2[0] = '\0'; + + while (fscanf(fp, "%*s %"__stringify(PATH_MAX)"s %"__stringify(PATH_MAX)"s %" + __stringify(PATH_MAX)"s %*d %*d\n", + mountpoint, type, tokens) == 3) { + + if (!path_v1[0] && !strcmp(type, "cgroup")) { + + token = strtok_r(tokens, ",", &saved_ptr); + + while (token != NULL) { + if (subsys && !strcmp(token, subsys)) { + strcpy(path_v1, mountpoint); + break; + } + token = strtok_r(NULL, ",", &saved_ptr); + } + } + + if (!path_v2[0] && !strcmp(type, "cgroup2")) + strcpy(path_v2, mountpoint); + + if (path_v1[0] && path_v2[0]) + break; + } + fclose(fp); + + if (path_v1[0]) + path = path_v1; + else if (path_v2[0]) + path = path_v2; + else + return -1; + + if (strlen(path) < maxlen) { + strcpy(buf, path); + return 0; + } + return -1; +} diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h index 92d03b8396b1..936edb95e1f3 100644 --- a/tools/lib/api/fs/fs.h +++ b/tools/lib/api/fs/fs.h @@ -28,6 +28,8 @@ FS(bpf_fs) #undef FS +int cgroupfs_find_mountpoint(char *buf, size_t maxlen, const char *subsys); + int filename__read_int(const char *filename, int *value); int filename__read_ull(const char *filename, unsigned long long *value); int filename__read_xll(const char *filename, unsigned long long *value); diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index c6dafe563176..5cc1b0785d18 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -235,7 +235,8 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, memset(&attr, 0, sizeof(attr)); attr.prog_type = load_attr->prog_type; attr.expected_attach_type = load_attr->expected_attach_type; - if (attr.prog_type == BPF_PROG_TYPE_STRUCT_OPS) { + if (attr.prog_type == BPF_PROG_TYPE_STRUCT_OPS || + attr.prog_type == BPF_PROG_TYPE_LSM) { attr.attach_btf_id = load_attr->attach_btf_id; } else if (attr.prog_type == BPF_PROG_TYPE_TRACING || attr.prog_type == BPF_PROG_TYPE_EXT) { @@ -584,6 +585,40 @@ int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); } +int bpf_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type, + const struct bpf_link_create_opts *opts) +{ + union bpf_attr attr; + + if (!OPTS_VALID(opts, bpf_link_create_opts)) + return -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.link_create.prog_fd = prog_fd; + attr.link_create.target_fd = target_fd; + attr.link_create.attach_type = attach_type; + + return sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); +} + +int bpf_link_update(int link_fd, int new_prog_fd, + const struct bpf_link_update_opts *opts) +{ + union bpf_attr attr; + + if (!OPTS_VALID(opts, bpf_link_update_opts)) + return -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.link_update.link_fd = link_fd; + attr.link_update.new_prog_fd = new_prog_fd; + attr.link_update.flags = OPTS_GET(opts, flags, 0); + attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + + return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr)); +} + int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) { diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index b976e77316cc..46d47afdd887 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -168,6 +168,25 @@ LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); +struct bpf_link_create_opts { + size_t sz; /* size of this struct for forward/backward 
compatibility */ +}; +#define bpf_link_create_opts__last_field sz + +LIBBPF_API int bpf_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type, + const struct bpf_link_create_opts *opts); + +struct bpf_link_update_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; /* extra flags */ + __u32 old_prog_fd; /* expected old program FD */ +}; +#define bpf_link_update_opts__last_field old_prog_fd + +LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd, + const struct bpf_link_update_opts *opts); + struct bpf_prog_test_run_attr { int prog_fd; int repeat; diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index b0dafe8b4ebc..f3f3c3fb98cb 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -49,7 +49,8 @@ #if defined(bpf_target_x86) -#ifdef __KERNEL__ +#if defined(__KERNEL__) || defined(__VMLINUX_H__) + #define PT_REGS_PARM1(x) ((x)->di) #define PT_REGS_PARM2(x) ((x)->si) #define PT_REGS_PARM3(x) ((x)->dx) @@ -60,7 +61,20 @@ #define PT_REGS_RC(x) ((x)->ax) #define PT_REGS_SP(x) ((x)->sp) #define PT_REGS_IP(x) ((x)->ip) + +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), di) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), si) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), dx) +#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), cx) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), r8) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), sp) +#define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), bp) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), ax) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), sp) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), ip) + #else + #ifdef __i386__ /* i386 kernel is built with -mregparm=3 */ #define PT_REGS_PARM1(x) ((x)->eax) @@ -73,7 +87,20 @@ #define PT_REGS_RC(x) ((x)->eax) #define PT_REGS_SP(x) ((x)->esp) #define PT_REGS_IP(x) ((x)->eip) + +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), eax) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), edx) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), ecx) +#define PT_REGS_PARM4_CORE(x) 0 +#define PT_REGS_PARM5_CORE(x) 0 +#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), esp) +#define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), ebp) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), eax) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), esp) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), eip) + #else + #define PT_REGS_PARM1(x) ((x)->rdi) #define PT_REGS_PARM2(x) ((x)->rsi) #define PT_REGS_PARM3(x) ((x)->rdx) @@ -84,6 +111,18 @@ #define PT_REGS_RC(x) ((x)->rax) #define PT_REGS_SP(x) ((x)->rsp) #define PT_REGS_IP(x) ((x)->rip) + +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), rdi) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), rsi) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), rdx) +#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), rcx) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), r8) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), rsp) +#define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), rbp) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), rax) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), rsp) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), rip) + #endif #endif @@ -104,6 +143,17 @@ struct pt_regs; #define PT_REGS_SP(x) (((PT_REGS_S390 *)(x))->gprs[15]) #define PT_REGS_IP(x) (((PT_REGS_S390 *)(x))->psw.addr) +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[2]) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[3]) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[4]) 
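
A usage sketch for the new bpf_link_create()/bpf_link_update() wrappers (hedged: the attach type, fds and BPF_F_REPLACE-based swap are illustrative; DECLARE_LIBBPF_OPTS comes from libbpf_common.h):

    #include <bpf/bpf.h>

    /* Attach prog_fd to cgroup_fd through a link, then atomically swap in
     * new_prog_fd; the update fails if prog_fd is no longer the attached one.
     */
    static int attach_and_swap(int cgroup_fd, int prog_fd, int new_prog_fd)
    {
        DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts,
            .flags = BPF_F_REPLACE,
            .old_prog_fd = prog_fd,
        );
        int link_fd;

        link_fd = bpf_link_create(prog_fd, cgroup_fd, BPF_CGROUP_INET_INGRESS, NULL);
        if (link_fd < 0)
            return link_fd;

        return bpf_link_update(link_fd, new_prog_fd, &opts);
    }
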
+#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[5]) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[6]) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), grps[14]) +#define PT_REGS_FP_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[11]) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[2]) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[15]) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), pdw.addr) + #elif defined(bpf_target_arm) #define PT_REGS_PARM1(x) ((x)->uregs[0]) @@ -117,6 +167,17 @@ struct pt_regs; #define PT_REGS_SP(x) ((x)->uregs[13]) #define PT_REGS_IP(x) ((x)->uregs[12]) +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), uregs[0]) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), uregs[1]) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), uregs[2]) +#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), uregs[3]) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), uregs[4]) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), uregs[14]) +#define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), uregs[11]) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), uregs[0]) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), uregs[13]) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), uregs[12]) + #elif defined(bpf_target_arm64) /* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */ @@ -134,6 +195,17 @@ struct pt_regs; #define PT_REGS_SP(x) (((PT_REGS_ARM64 *)(x))->sp) #define PT_REGS_IP(x) (((PT_REGS_ARM64 *)(x))->pc) +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[0]) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[1]) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[2]) +#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[3]) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[4]) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[30]) +#define PT_REGS_FP_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[29]) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[0]) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), sp) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), pc) + #elif defined(bpf_target_mips) #define PT_REGS_PARM1(x) ((x)->regs[4]) @@ -147,6 +219,17 @@ struct pt_regs; #define PT_REGS_SP(x) ((x)->regs[29]) #define PT_REGS_IP(x) ((x)->cp0_epc) +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), regs[4]) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), regs[5]) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), regs[6]) +#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), regs[7]) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), regs[8]) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), regs[31]) +#define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), regs[30]) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), regs[1]) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), regs[29]) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), cp0_epc) + #elif defined(bpf_target_powerpc) #define PT_REGS_PARM1(x) ((x)->gpr[3]) @@ -158,6 +241,15 @@ struct pt_regs; #define PT_REGS_SP(x) ((x)->sp) #define PT_REGS_IP(x) ((x)->nip) +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), gpr[3]) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), gpr[4]) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), gpr[5]) +#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), gpr[6]) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), gpr[7]) +#define PT_REGS_RC_CORE(x) 
BPF_CORE_READ((x), gpr[3]) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), sp) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), nip) + #elif defined(bpf_target_sparc) #define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0]) @@ -169,11 +261,22 @@ struct pt_regs; #define PT_REGS_RC(x) ((x)->u_regs[UREG_I0]) #define PT_REGS_SP(x) ((x)->u_regs[UREG_FP]) +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I0]) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I1]) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I2]) +#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I3]) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I4]) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I7]) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I0]) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), u_regs[UREG_FP]) + /* Should this also be a bpf_target check for the sparc case? */ #if defined(__arch64__) #define PT_REGS_IP(x) ((x)->tpc) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), tpc) #else #define PT_REGS_IP(x) ((x)->pc) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), pc) #endif #endif @@ -192,4 +295,122 @@ struct pt_regs; (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) #endif +#define ___bpf_concat(a, b) a ## b +#define ___bpf_apply(fn, n) ___bpf_concat(fn, n) +#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N +#define ___bpf_narg(...) \ + ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#define ___bpf_empty(...) \ + ___bpf_nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0) + +#define ___bpf_ctx_cast0() ctx +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] +#define ___bpf_ctx_cast(args...) \ + ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) + +/* + * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and + * similar kinds of BPF programs, that accept input arguments as a single + * pointer to untyped u64 array, where each u64 can actually be a typed + * pointer or integer of different size. Instead of requring user to write + * manual casts and work with array elements by index, BPF_PROG macro + * allows user to declare a list of named and typed input arguments in the + * same syntax as for normal C function. All the casting is hidden and + * performed transparently, while user code can just assume working with + * function arguments of specified type and name. + * + * Original raw context argument is preserved as well as 'ctx' argument. + * This is useful when using BPF helpers that expect original context + * as one of the parameters (e.g., for bpf_perf_event_output()). + */ +#define BPF_PROG(name, args...) 
\ +name(unsigned long long *ctx); \ +static __attribute__((always_inline)) typeof(name(0)) \ +____##name(unsigned long long *ctx, ##args); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_ctx_cast(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __attribute__((always_inline)) typeof(name(0)) \ +____##name(unsigned long long *ctx, ##args) + +struct pt_regs; + +#define ___bpf_kprobe_args0() ctx +#define ___bpf_kprobe_args1(x) \ + ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) +#define ___bpf_kprobe_args2(x, args...) \ + ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) +#define ___bpf_kprobe_args3(x, args...) \ + ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) +#define ___bpf_kprobe_args4(x, args...) \ + ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) +#define ___bpf_kprobe_args5(x, args...) \ + ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args(args...) \ + ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) + +/* + * BPF_KPROBE serves the same purpose for kprobes as BPF_PROG for + * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific + * low-level way of getting kprobe input arguments from struct pt_regs, and + * provides a familiar typed and named function arguments syntax and + * semantics of accessing kprobe input paremeters. + * + * Original struct pt_regs* context is preserved as 'ctx' argument. This might + * be necessary when using BPF helpers like bpf_perf_event_output(). + */ +#define BPF_KPROBE(name, args...) \ +name(struct pt_regs *ctx); \ +static __attribute__((always_inline)) typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_kprobe_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __attribute__((always_inline)) typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#define ___bpf_kretprobe_args0() ctx +#define ___bpf_kretprobe_args1(x) \ + ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx) +#define ___bpf_kretprobe_args(args...) \ + ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args) + +/* + * BPF_KRETPROBE is similar to BPF_KPROBE, except, it only provides optional + * return value (in addition to `struct pt_regs *ctx`), but no input + * arguments, because they will be clobbered by the time probed function + * returns. + */ +#define BPF_KRETPROBE(name, args...) 
\ +name(struct pt_regs *ctx); \ +static __attribute__((always_inline)) typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_kretprobe_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) + #endif diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 3d1c25fc97ae..bfef3d606b54 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -657,22 +657,32 @@ int btf__finalize_data(struct bpf_object *obj, struct btf *btf) int btf__load(struct btf *btf) { - __u32 log_buf_size = BPF_LOG_BUF_SIZE; + __u32 log_buf_size = 0; char *log_buf = NULL; int err = 0; if (btf->fd >= 0) return -EEXIST; - log_buf = malloc(log_buf_size); - if (!log_buf) - return -ENOMEM; +retry_load: + if (log_buf_size) { + log_buf = malloc(log_buf_size); + if (!log_buf) + return -ENOMEM; - *log_buf = 0; + *log_buf = 0; + } btf->fd = bpf_load_btf(btf->data, btf->data_size, log_buf, log_buf_size, false); if (btf->fd < 0) { + if (!log_buf || errno == ENOSPC) { + log_buf_size = max((__u32)BPF_LOG_BUF_SIZE, + log_buf_size << 1); + free(log_buf); + goto retry_load; + } + err = -errno; pr_warn("Error loading BTF: %s(%d)\n", strerror(errno), errno); if (*log_buf) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index bd09ed1710f1..0c28ee82834b 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -916,13 +916,13 @@ static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, /* enumerators share namespace with typedef idents */ dup_cnt = btf_dump_name_dups(d, d->ident_names, name); if (dup_cnt > 1) { - btf_dump_printf(d, "\n%s%s___%zu = %d,", + btf_dump_printf(d, "\n%s%s___%zu = %u,", pfx(lvl + 1), name, dup_cnt, - (__s32)v->val); + (__u32)v->val); } else { - btf_dump_printf(d, "\n%s%s = %d,", + btf_dump_printf(d, "\n%s%s = %u,", pfx(lvl + 1), name, - (__s32)v->val); + (__u32)v->val); } } btf_dump_printf(d, "\n%s}", pfx(lvl)); @@ -1030,7 +1030,7 @@ int btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, if (!OPTS_VALID(opts, btf_dump_emit_type_decl_opts)) return -EINVAL; - fname = OPTS_GET(opts, field_name, NULL); + fname = OPTS_GET(opts, field_name, ""); lvl = OPTS_GET(opts, indent_level, 0); btf_dump_emit_type_decl(d, id, fname, lvl); return 0; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7469c7dcc15e..ff9174282a8c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1845,7 +1845,6 @@ resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id) * type definition, while using only sizeof(void *) space in ELF data section. 
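
A short BPF-side sketch of the new wrapper macros (the probe target do_unlinkat and the prints are illustrative; vmlinux.h stands in for the kernel type definitions and also defines __VMLINUX_H__, which the new guard in bpf_tracing.h keys off):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char LICENSE[] SEC("license") = "GPL";

    /* typed, named arguments instead of manual PT_REGS_PARMn(ctx) casts */
    SEC("kprobe/do_unlinkat")
    int BPF_KPROBE(handle_unlink_entry, int dfd, struct filename *name)
    {
        bpf_printk("unlinkat entry: dfd=%d", dfd);
        return 0;
    }

    /* only the return value is usable in a kretprobe */
    SEC("kretprobe/do_unlinkat")
    int BPF_KRETPROBE(handle_unlink_exit, long ret)
    {
        bpf_printk("unlinkat exit: ret=%ld", ret);
        return 0;
    }

BPF_PROG follows the same pattern for tp_btf/fentry/fexit programs, with the raw u64-array context still reachable as ctx.
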
*/ static bool get_map_field_int(const char *map_name, const struct btf *btf, - const struct btf_type *def, const struct btf_member *m, __u32 *res) { const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL); @@ -1972,19 +1971,19 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, return -EINVAL; } if (strcmp(name, "type") == 0) { - if (!get_map_field_int(map_name, obj->btf, def, m, + if (!get_map_field_int(map_name, obj->btf, m, &map->def.type)) return -EINVAL; pr_debug("map '%s': found type = %u.\n", map_name, map->def.type); } else if (strcmp(name, "max_entries") == 0) { - if (!get_map_field_int(map_name, obj->btf, def, m, + if (!get_map_field_int(map_name, obj->btf, m, &map->def.max_entries)) return -EINVAL; pr_debug("map '%s': found max_entries = %u.\n", map_name, map->def.max_entries); } else if (strcmp(name, "map_flags") == 0) { - if (!get_map_field_int(map_name, obj->btf, def, m, + if (!get_map_field_int(map_name, obj->btf, m, &map->def.map_flags)) return -EINVAL; pr_debug("map '%s': found map_flags = %u.\n", @@ -1992,8 +1991,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, } else if (strcmp(name, "key_size") == 0) { __u32 sz; - if (!get_map_field_int(map_name, obj->btf, def, m, - &sz)) + if (!get_map_field_int(map_name, obj->btf, m, &sz)) return -EINVAL; pr_debug("map '%s': found key_size = %u.\n", map_name, sz); @@ -2035,8 +2033,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, } else if (strcmp(name, "value_size") == 0) { __u32 sz; - if (!get_map_field_int(map_name, obj->btf, def, m, - &sz)) + if (!get_map_field_int(map_name, obj->btf, m, &sz)) return -EINVAL; pr_debug("map '%s': found value_size = %u.\n", map_name, sz); @@ -2079,8 +2076,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, __u32 val; int err; - if (!get_map_field_int(map_name, obj->btf, def, m, - &val)) + if (!get_map_field_int(map_name, obj->btf, m, &val)) return -EINVAL; pr_debug("map '%s': found pinning = %u.\n", map_name, val); @@ -2284,11 +2280,16 @@ static void bpf_object__sanitize_btf_ext(struct bpf_object *obj) } } -static bool bpf_object__is_btf_mandatory(const struct bpf_object *obj) +static bool libbpf_needs_btf(const struct bpf_object *obj) { return obj->efile.btf_maps_shndx >= 0 || - obj->efile.st_ops_shndx >= 0 || - obj->nr_extern > 0; + obj->efile.st_ops_shndx >= 0 || + obj->nr_extern > 0; +} + +static bool kernel_needs_btf(const struct bpf_object *obj) +{ + return obj->efile.st_ops_shndx >= 0; } static int bpf_object__init_btf(struct bpf_object *obj, @@ -2324,7 +2325,7 @@ static int bpf_object__init_btf(struct bpf_object *obj, } } out: - if (err && bpf_object__is_btf_mandatory(obj)) { + if (err && libbpf_needs_btf(obj)) { pr_warn("BTF is required, but is missing or corrupted.\n"); return err; } @@ -2348,7 +2349,7 @@ static int bpf_object__finalize_btf(struct bpf_object *obj) btf_ext__free(obj->btf_ext); obj->btf_ext = NULL; - if (bpf_object__is_btf_mandatory(obj)) { + if (libbpf_needs_btf(obj)) { pr_warn("BTF is required, but is missing or corrupted.\n"); return -ENOENT; } @@ -2357,7 +2358,8 @@ static int bpf_object__finalize_btf(struct bpf_object *obj) static inline bool libbpf_prog_needs_vmlinux_btf(struct bpf_program *prog) { - if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || + prog->type == BPF_PROG_TYPE_LSM) return true; /* BPF_PROG_TYPE_TRACING programs which do not attach to other programs @@ -2412,7 +2414,7 @@ static int bpf_object__sanitize_and_load_btf(struct 
bpf_object *obj) obj->btf_ext = NULL; } - if (bpf_object__is_btf_mandatory(obj)) + if (kernel_needs_btf(obj)) return err; } return 0; @@ -3868,6 +3870,10 @@ static struct ids_vec *bpf_core_find_cands(const struct btf *local_btf, if (str_is_empty(targ_name)) continue; + t = skip_mods_and_typedefs(targ_btf, i, NULL); + if (!btf_is_composite(t) && !btf_is_array(t)) + continue; + targ_essent_len = bpf_core_essential_name_len(targ_name); if (targ_essent_len != local_essent_len) continue; @@ -4846,8 +4852,8 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, { struct bpf_load_program_attr load_attr; char *cp, errmsg[STRERR_BUFSIZE]; - int log_buf_size = BPF_LOG_BUF_SIZE; - char *log_buf; + size_t log_buf_size = 0; + char *log_buf = NULL; int btf_fd, ret; if (!insns || !insns_cnt) @@ -4861,7 +4867,8 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, load_attr.insns = insns; load_attr.insns_cnt = insns_cnt; load_attr.license = license; - if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || + prog->type == BPF_PROG_TYPE_LSM) { load_attr.attach_btf_id = prog->attach_btf_id; } else if (prog->type == BPF_PROG_TYPE_TRACING || prog->type == BPF_PROG_TYPE_EXT) { @@ -4887,22 +4894,28 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, load_attr.prog_flags = prog->prog_flags; retry_load: - log_buf = malloc(log_buf_size); - if (!log_buf) - pr_warn("Alloc log buffer for bpf loader error, continue without log\n"); + if (log_buf_size) { + log_buf = malloc(log_buf_size); + if (!log_buf) + return -ENOMEM; + + *log_buf = 0; + } ret = bpf_load_program_xattr(&load_attr, log_buf, log_buf_size); if (ret >= 0) { - if (load_attr.log_level) + if (log_buf && load_attr.log_level) pr_debug("verifier log:\n%s", log_buf); *pfd = ret; ret = 0; goto out; } - if (errno == ENOSPC) { - log_buf_size <<= 1; + if (!log_buf || errno == ENOSPC) { + log_buf_size = max((size_t)BPF_LOG_BUF_SIZE, + log_buf_size << 1); + free(log_buf); goto retry_load; } @@ -4945,8 +4958,9 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) { int err = 0, fd, i, btf_id; - if (prog->type == BPF_PROG_TYPE_TRACING || - prog->type == BPF_PROG_TYPE_EXT) { + if ((prog->type == BPF_PROG_TYPE_TRACING || + prog->type == BPF_PROG_TYPE_LSM || + prog->type == BPF_PROG_TYPE_EXT) && !prog->attach_btf_id) { btf_id = libbpf_find_attach_btf_id(prog); if (btf_id <= 0) return btf_id; @@ -6185,6 +6199,7 @@ bool bpf_program__is_##NAME(const struct bpf_program *prog) \ } \ BPF_PROG_TYPE_FNS(socket_filter, BPF_PROG_TYPE_SOCKET_FILTER); +BPF_PROG_TYPE_FNS(lsm, BPF_PROG_TYPE_LSM); BPF_PROG_TYPE_FNS(kprobe, BPF_PROG_TYPE_KPROBE); BPF_PROG_TYPE_FNS(sched_cls, BPF_PROG_TYPE_SCHED_CLS); BPF_PROG_TYPE_FNS(sched_act, BPF_PROG_TYPE_SCHED_ACT); @@ -6251,6 +6266,8 @@ static struct bpf_link *attach_raw_tp(const struct bpf_sec_def *sec, struct bpf_program *prog); static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, struct bpf_program *prog); +static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, + struct bpf_program *prog); struct bpf_sec_def { const char *sec; @@ -6290,6 +6307,10 @@ static const struct bpf_sec_def section_defs[] = { .expected_attach_type = BPF_TRACE_FENTRY, .is_attach_btf = true, .attach_fn = attach_trace), + SEC_DEF("fmod_ret/", TRACING, + .expected_attach_type = BPF_MODIFY_RETURN, + .is_attach_btf = true, + .attach_fn = attach_trace), SEC_DEF("fexit/", TRACING, .expected_attach_type = 
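
The verifier log handling above boils down to: start with no buffer so successful loads pay nothing, then on failure allocate BPF_LOG_BUF_SIZE and keep doubling while the kernel reports ENOSPC. A simplified sketch of the idiom (not the exact libbpf code):

    #include <errno.h>
    #include <stdlib.h>
    #include <bpf/bpf.h>

    static int load_with_growing_log(struct bpf_load_program_attr *attr)
    {
        size_t log_sz = 0;
        char *log = NULL;
        int fd;

    retry:
        fd = bpf_load_program_xattr(attr, log, log_sz);
        if (fd < 0 && (!log || errno == ENOSPC)) {
            log_sz = log_sz ? log_sz << 1 : BPF_LOG_BUF_SIZE;
            free(log);
            log = malloc(log_sz);
            if (log) {
                *log = 0;
                goto retry;
            }
        }
        free(log);
        return fd;
    }
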
BPF_TRACE_FEXIT, .is_attach_btf = true, @@ -6297,6 +6318,10 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("freplace/", EXT, .is_attach_btf = true, .attach_fn = attach_trace), + SEC_DEF("lsm/", LSM, + .is_attach_btf = true, + .expected_attach_type = BPF_LSM_MAC, + .attach_fn = attach_lsm), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), @@ -6559,6 +6584,7 @@ invalid_prog: } #define BTF_TRACE_PREFIX "btf_trace_" +#define BTF_LSM_PREFIX "bpf_lsm_" #define BTF_MAX_NAME_SIZE 128 static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, @@ -6586,9 +6612,15 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name, if (attach_type == BPF_TRACE_RAW_TP) err = find_btf_by_prefix_kind(btf, BTF_TRACE_PREFIX, name, BTF_KIND_TYPEDEF); + else if (attach_type == BPF_LSM_MAC) + err = find_btf_by_prefix_kind(btf, BTF_LSM_PREFIX, name, + BTF_KIND_FUNC); else err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); + if (err <= 0) + pr_warn("%s is not found in vmlinux BTF\n", name); + return err; } @@ -6661,8 +6693,6 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog) err = __find_vmlinux_btf_id(prog->obj->btf_vmlinux, name + section_defs[i].len, attach_type); - if (err <= 0) - pr_warn("%s is not found in vmlinux BTF\n", name); return err; } pr_warn("failed to identify btf_id based on ELF section name '%s'\n", name); @@ -6742,6 +6772,17 @@ void *bpf_map__priv(const struct bpf_map *map) return map ? map->priv : ERR_PTR(-EINVAL); } +int bpf_map__set_initial_value(struct bpf_map *map, + const void *data, size_t size) +{ + if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG || + size != map->def.value_size || map->fd >= 0) + return -EINVAL; + + memcpy(map->mmaped, data, size); + return 0; +} + bool bpf_map__is_offload_neutral(const struct bpf_map *map) { return map->def.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; @@ -6932,9 +6973,17 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, struct bpf_link { int (*detach)(struct bpf_link *link); int (*destroy)(struct bpf_link *link); + char *pin_path; /* NULL, if not pinned */ + int fd; /* hook FD, -1 if not applicable */ bool disconnected; }; +/* Replace link's underlying BPF program with the new one */ +int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) +{ + return bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); +} + /* Release "ownership" of underlying BPF resource (typically, BPF program * attached to some BPF hook, e.g., tracepoint, kprobe, etc). 
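
With BPF_PROG_TYPE_LSM wired up end to end, an LSM program can be written with BPF_PROG and the new "lsm/" section prefix. A sketch modelled on the file_mprotect hook (the policy itself is illustrative; CONFIG_BPF_LSM must be enabled on the running kernel):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char LICENSE[] SEC("license") = "GPL";

    SEC("lsm/file_mprotect")
    int BPF_PROG(mprotect_audit, struct vm_area_struct *vma,
                 unsigned long reqprot, unsigned long prot, int ret)
    {
        /* ret carries the verdict of previously evaluated hooks */
        if (ret)
            return ret;

        /* a negative errno here would deny the mprotect() call */
        return 0;
    }

User space attaches it with bpf_program__attach_lsm(), or simply bpf_program__attach(), which now resolves "lsm/" sections through the new SEC_DEF entry.
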
Disconnected * link, when destructed through bpf_link__destroy() call won't attempt to @@ -6961,26 +7010,109 @@ int bpf_link__destroy(struct bpf_link *link) err = link->detach(link); if (link->destroy) link->destroy(link); + if (link->pin_path) + free(link->pin_path); free(link); return err; } -struct bpf_link_fd { - struct bpf_link link; /* has to be at the top of struct */ - int fd; /* hook FD */ -}; +int bpf_link__fd(const struct bpf_link *link) +{ + return link->fd; +} + +const char *bpf_link__pin_path(const struct bpf_link *link) +{ + return link->pin_path; +} + +static int bpf_link__detach_fd(struct bpf_link *link) +{ + return close(link->fd); +} + +struct bpf_link *bpf_link__open(const char *path) +{ + struct bpf_link *link; + int fd; + + fd = bpf_obj_get(path); + if (fd < 0) { + fd = -errno; + pr_warn("failed to open link at %s: %d\n", path, fd); + return ERR_PTR(fd); + } + + link = calloc(1, sizeof(*link)); + if (!link) { + close(fd); + return ERR_PTR(-ENOMEM); + } + link->detach = &bpf_link__detach_fd; + link->fd = fd; + + link->pin_path = strdup(path); + if (!link->pin_path) { + bpf_link__destroy(link); + return ERR_PTR(-ENOMEM); + } + + return link; +} + +int bpf_link__pin(struct bpf_link *link, const char *path) +{ + int err; + + if (link->pin_path) + return -EBUSY; + err = make_parent_dir(path); + if (err) + return err; + err = check_path(path); + if (err) + return err; + + link->pin_path = strdup(path); + if (!link->pin_path) + return -ENOMEM; + + if (bpf_obj_pin(link->fd, link->pin_path)) { + err = -errno; + zfree(&link->pin_path); + return err; + } + + pr_debug("link fd=%d: pinned at %s\n", link->fd, link->pin_path); + return 0; +} + +int bpf_link__unpin(struct bpf_link *link) +{ + int err; + + if (!link->pin_path) + return -EINVAL; + + err = unlink(link->pin_path); + if (err != 0) + return -errno; + + pr_debug("link fd=%d: unpinned from %s\n", link->fd, link->pin_path); + zfree(&link->pin_path); + return 0; +} static int bpf_link__detach_perf_event(struct bpf_link *link) { - struct bpf_link_fd *l = (void *)link; int err; - err = ioctl(l->fd, PERF_EVENT_IOC_DISABLE, 0); + err = ioctl(link->fd, PERF_EVENT_IOC_DISABLE, 0); if (err) err = -errno; - close(l->fd); + close(link->fd); return err; } @@ -6988,7 +7120,7 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pfd) { char errmsg[STRERR_BUFSIZE]; - struct bpf_link_fd *link; + struct bpf_link *link; int prog_fd, err; if (pfd < 0) { @@ -7006,7 +7138,7 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, link = calloc(1, sizeof(*link)); if (!link) return ERR_PTR(-ENOMEM); - link->link.detach = &bpf_link__detach_perf_event; + link->detach = &bpf_link__detach_perf_event; link->fd = pfd; if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) { @@ -7025,7 +7157,7 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); return ERR_PTR(err); } - return (struct bpf_link *)link; + return link; } /* @@ -7313,18 +7445,11 @@ out: return link; } -static int bpf_link__detach_fd(struct bpf_link *link) -{ - struct bpf_link_fd *l = (void *)link; - - return close(l->fd); -} - struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, const char *tp_name) { char errmsg[STRERR_BUFSIZE]; - struct bpf_link_fd *link; + struct bpf_link *link; int prog_fd, pfd; prog_fd = bpf_program__fd(prog); @@ -7337,7 +7462,7 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, link = calloc(1, 
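
The pin/open pair gives links a lifetime independent of the loading process. A sketch (the bpffs path is illustrative and /sys/fs/bpf must be mounted):

    #include <bpf/libbpf.h>

    static int pin_and_reopen(struct bpf_program *prog)
    {
        struct bpf_link *link, *restored;

        link = bpf_program__attach(prog);
        if (libbpf_get_error(link))
            return -1;

        if (bpf_link__pin(link, "/sys/fs/bpf/my_link"))
            return -1;
        bpf_link__destroy(link);    /* the bpffs pin keeps the kernel link alive */

        /* ... later, possibly from another process ... */
        restored = bpf_link__open("/sys/fs/bpf/my_link");
        if (libbpf_get_error(restored))
            return -1;

        bpf_link__unpin(restored);  /* drop the pin; detach once the last fd closes */
        return bpf_link__destroy(restored);
    }
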
sizeof(*link)); if (!link) return ERR_PTR(-ENOMEM); - link->link.detach = &bpf_link__detach_fd; + link->detach = &bpf_link__detach_fd; pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); if (pfd < 0) { @@ -7349,7 +7474,7 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, return ERR_PTR(pfd); } link->fd = pfd; - return (struct bpf_link *)link; + return link; } static struct bpf_link *attach_raw_tp(const struct bpf_sec_def *sec, @@ -7360,10 +7485,11 @@ static struct bpf_link *attach_raw_tp(const struct bpf_sec_def *sec, return bpf_program__attach_raw_tracepoint(prog, tp_name); } -struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) +/* Common logic for all BPF program types that attach to a btf_id */ +static struct bpf_link *bpf_program__attach_btf_id(struct bpf_program *prog) { char errmsg[STRERR_BUFSIZE]; - struct bpf_link_fd *link; + struct bpf_link *link; int prog_fd, pfd; prog_fd = bpf_program__fd(prog); @@ -7376,13 +7502,13 @@ struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) link = calloc(1, sizeof(*link)); if (!link) return ERR_PTR(-ENOMEM); - link->link.detach = &bpf_link__detach_fd; + link->detach = &bpf_link__detach_fd; pfd = bpf_raw_tracepoint_open(NULL, prog_fd); if (pfd < 0) { pfd = -errno; free(link); - pr_warn("program '%s': failed to attach to trace: %s\n", + pr_warn("program '%s': failed to attach: %s\n", bpf_program__title(prog, false), libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); @@ -7391,12 +7517,68 @@ struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) return (struct bpf_link *)link; } +struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog); +} + +struct bpf_link *bpf_program__attach_lsm(struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog); +} + static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, struct bpf_program *prog) { return bpf_program__attach_trace(prog); } +static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + return bpf_program__attach_lsm(prog); +} + +struct bpf_link * +bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) +{ + const struct bpf_sec_def *sec_def; + enum bpf_attach_type attach_type; + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, link_fd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("program '%s': can't attach before loaded\n", + bpf_program__title(prog, false)); + return ERR_PTR(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return ERR_PTR(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + attach_type = bpf_program__get_expected_attach_type(prog); + if (!attach_type) { + sec_def = find_sec_def(bpf_program__title(prog, false)); + if (sec_def) + attach_type = sec_def->attach_type; + } + link_fd = bpf_link_create(prog_fd, cgroup_fd, attach_type, NULL); + if (link_fd < 0) { + link_fd = -errno; + free(link); + pr_warn("program '%s': failed to attach to cgroup: %s\n", + bpf_program__title(prog, false), + libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return ERR_PTR(link_fd); + } + link->fd = link_fd; + return link; +} + struct bpf_link *bpf_program__attach(struct bpf_program *prog) { const struct bpf_sec_def *sec_def; @@ -7410,10 +7592,9 @@ struct bpf_link *bpf_program__attach(struct bpf_program *prog) static int bpf_link__detach_struct_ops(struct bpf_link *link) { - struct bpf_link_fd *l = (void *)link; __u32 zero = 0; - if 
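
bpf_program__attach_cgroup() covers programs whose expected attach type is cgroup-scoped. A sketch (cgroup path and section name are assumptions):

    #include <fcntl.h>
    #include <unistd.h>
    #include <bpf/libbpf.h>

    /* prog is assumed to come from a SEC("cgroup_skb/ingress") section */
    static struct bpf_link *attach_to_cgroup(struct bpf_program *prog,
                                             const char *cg_path)
    {
        struct bpf_link *link;
        int cg_fd;

        cg_fd = open(cg_path, O_RDONLY | O_DIRECTORY);
        if (cg_fd < 0)
            return NULL;

        link = bpf_program__attach_cgroup(prog, cg_fd);
        close(cg_fd);   /* the kernel link holds its own cgroup reference */
        return libbpf_get_error(link) ? NULL : link;
    }
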
(bpf_map_delete_elem(l->fd, &zero)) + if (bpf_map_delete_elem(link->fd, &zero)) return -errno; return 0; @@ -7422,7 +7603,7 @@ static int bpf_link__detach_struct_ops(struct bpf_link *link) struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map) { struct bpf_struct_ops *st_ops; - struct bpf_link_fd *link; + struct bpf_link *link; __u32 i, zero = 0; int err; @@ -7454,10 +7635,10 @@ struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map) return ERR_PTR(err); } - link->link.detach = bpf_link__detach_struct_ops; + link->detach = bpf_link__detach_struct_ops; link->fd = map->fd; - return (struct bpf_link *)link; + return link; } enum bpf_perf_event_ret @@ -8138,6 +8319,31 @@ void bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear) } } +int bpf_program__set_attach_target(struct bpf_program *prog, + int attach_prog_fd, + const char *attach_func_name) +{ + int btf_id; + + if (!prog || attach_prog_fd < 0 || !attach_func_name) + return -EINVAL; + + if (attach_prog_fd) + btf_id = libbpf_find_prog_btf_id(attach_func_name, + attach_prog_fd); + else + btf_id = __find_vmlinux_btf_id(prog->obj->btf_vmlinux, + attach_func_name, + prog->expected_attach_type); + + if (btf_id < 0) + return btf_id; + + prog->attach_btf_id = btf_id; + prog->attach_prog_fd = attach_prog_fd; + return 0; +} + int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz) { int err = 0, n, len, start, end = -1; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 3fe12c9d1f92..44df1d3e7287 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -219,6 +219,13 @@ LIBBPF_API void bpf_program__unload(struct bpf_program *prog); struct bpf_link; +LIBBPF_API struct bpf_link *bpf_link__open(const char *path); +LIBBPF_API int bpf_link__fd(const struct bpf_link *link); +LIBBPF_API const char *bpf_link__pin_path(const struct bpf_link *link); +LIBBPF_API int bpf_link__pin(struct bpf_link *link, const char *path); +LIBBPF_API int bpf_link__unpin(struct bpf_link *link); +LIBBPF_API int bpf_link__update_program(struct bpf_link *link, + struct bpf_program *prog); LIBBPF_API void bpf_link__disconnect(struct bpf_link *link); LIBBPF_API int bpf_link__destroy(struct bpf_link *link); @@ -240,11 +247,17 @@ bpf_program__attach_tracepoint(struct bpf_program *prog, LIBBPF_API struct bpf_link * bpf_program__attach_raw_tracepoint(struct bpf_program *prog, const char *tp_name); - LIBBPF_API struct bpf_link * bpf_program__attach_trace(struct bpf_program *prog); +LIBBPF_API struct bpf_link * +bpf_program__attach_lsm(struct bpf_program *prog); +LIBBPF_API struct bpf_link * +bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd); + struct bpf_map; + LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map); + struct bpf_insn; /* @@ -316,6 +329,7 @@ LIBBPF_API int bpf_program__set_socket_filter(struct bpf_program *prog); LIBBPF_API int bpf_program__set_tracepoint(struct bpf_program *prog); LIBBPF_API int bpf_program__set_raw_tracepoint(struct bpf_program *prog); LIBBPF_API int bpf_program__set_kprobe(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_lsm(struct bpf_program *prog); LIBBPF_API int bpf_program__set_sched_cls(struct bpf_program *prog); LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog); LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog); @@ -334,10 +348,15 @@ LIBBPF_API void bpf_program__set_expected_attach_type(struct bpf_program *prog, enum bpf_attach_type type); +LIBBPF_API int +bpf_program__set_attach_target(struct 
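
bpf_program__set_attach_target() makes the fentry/fexit/freplace target a runtime decision taken between open and load, since load now only auto-resolves the BTF id when none was set. A sketch (object path, section name and target function are illustrative):

    #include <bpf/libbpf.h>

    static int load_with_runtime_target(const char *obj_path, const char *func)
    {
        struct bpf_object *obj;
        struct bpf_program *prog;

        obj = bpf_object__open_file(obj_path, NULL);
        if (libbpf_get_error(obj))
            return -1;

        prog = bpf_object__find_program_by_title(obj, "fentry/placeholder");
        if (!prog)
            return -1;

        /* attach_prog_fd == 0: resolve func against vmlinux BTF */
        if (bpf_program__set_attach_target(prog, 0, func))
            return -1;

        return bpf_object__load(obj);
    }
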
bpf_program *prog, int attach_prog_fd, + const char *attach_func_name); + LIBBPF_API bool bpf_program__is_socket_filter(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_tracepoint(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_raw_tracepoint(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_kprobe(const struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_lsm(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_sched_cls(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_sched_act(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_xdp(const struct bpf_program *prog); @@ -398,6 +417,8 @@ typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv, bpf_map_clear_priv_t clear_priv); LIBBPF_API void *bpf_map__priv(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, + const void *data, size_t size); LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); LIBBPF_API int bpf_map__resize(struct bpf_map *map, __u32 max_entries); LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map); @@ -435,7 +456,15 @@ struct xdp_link_info { __u8 attach_mode; }; +struct bpf_xdp_set_link_opts { + size_t sz; + __u32 old_fd; +}; +#define bpf_xdp_set_link_opts__last_field old_fd + LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); +LIBBPF_API int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags, + const struct bpf_xdp_set_link_opts *opts); LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags); LIBBPF_API int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, size_t info_size, __u32 flags); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index b035122142bb..bb8831605b25 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -235,3 +235,22 @@ LIBBPF_0.0.7 { btf__align_of; libbpf_find_kernel_btf; } LIBBPF_0.0.6; + +LIBBPF_0.0.8 { + global: + bpf_link__fd; + bpf_link__open; + bpf_link__pin; + bpf_link__pin_path; + bpf_link__unpin; + bpf_link__update_program; + bpf_link_create; + bpf_link_update; + bpf_map__set_initial_value; + bpf_program__attach_cgroup; + bpf_program__attach_lsm; + bpf_program__is_lsm; + bpf_program__set_attach_target; + bpf_program__set_lsm; + bpf_set_link_xdp_fd_opts; +} LIBBPF_0.0.7; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index b782ebef6ac9..2c92059c0c90 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -108,6 +108,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_LSM: default: break; } diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 431bd25c6cdb..18b5319025e1 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -132,7 +132,8 @@ done: return ret; } -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) +static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, + __u32 flags) { int sock, seq = 0, ret; struct nlattr *nla, *nla_xdp; @@ -178,6 +179,14 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) nla->nla_len += nla_xdp->nla_len; } + if (flags & XDP_FLAGS_REPLACE) { + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_EXPECTED_FD; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(old_fd); + 
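
On the XDP side, passing an old_fd through the new opts turns the update into a compare-and-swap: the kernel applies the new program only if the one identified by IFLA_XDP_EXPECTED_FD is still attached. A sketch (mode flag and fds are illustrative):

    #include <linux/if_link.h>
    #include <bpf/libbpf.h>

    static int replace_xdp_prog(int ifindex, int old_fd, int new_fd)
    {
        DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts,
            .old_fd = old_fd,   /* implies XDP_FLAGS_REPLACE */
        );

        return bpf_set_link_xdp_fd_opts(ifindex, new_fd, XDP_FLAGS_DRV_MODE, &opts);
    }
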
memcpy((char *)nla_xdp + NLA_HDRLEN, &old_fd, sizeof(old_fd)); + nla->nla_len += nla_xdp->nla_len; + } + req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { @@ -191,6 +200,29 @@ cleanup: return ret; } +int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags, + const struct bpf_xdp_set_link_opts *opts) +{ + int old_fd = -1; + + if (!OPTS_VALID(opts, bpf_xdp_set_link_opts)) + return -EINVAL; + + if (OPTS_HAS(opts, old_fd)) { + old_fd = OPTS_GET(opts, old_fd, -1); + flags |= XDP_FLAGS_REPLACE; + } + + return __bpf_set_link_xdp_fd_replace(ifindex, fd, + old_fd, + flags); +} + +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) +{ + return __bpf_set_link_xdp_fd_replace(ifindex, fd, 0, flags); +} + static int __dump_link_nlmsg(struct nlmsghdr *nlh, libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) { diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 9807903f121e..f7f4efb70a4c 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -280,7 +280,11 @@ int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area, fill->consumer = map + off.fr.consumer; fill->flags = map + off.fr.flags; fill->ring = map + off.fr.desc; - fill->cached_cons = umem->config.fill_size; + fill->cached_prod = *fill->producer; + /* cached_cons is "size" bigger than the real consumer pointer + * See xsk_prod_nb_free + */ + fill->cached_cons = *fill->consumer + umem->config.fill_size; map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, umem->fd, @@ -297,6 +301,8 @@ int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area, comp->consumer = map + off.cr.consumer; comp->flags = map + off.cr.flags; comp->ring = map + off.cr.desc; + comp->cached_prod = *comp->producer; + comp->cached_cons = *comp->consumer; *umem_ptr = umem; return 0; @@ -672,6 +678,8 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, rx->consumer = rx_map + off.rx.consumer; rx->flags = rx_map + off.rx.flags; rx->ring = rx_map + off.rx.desc; + rx->cached_prod = *rx->producer; + rx->cached_cons = *rx->consumer; } xsk->rx = rx; @@ -691,7 +699,11 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, tx->consumer = tx_map + off.tx.consumer; tx->flags = tx_map + off.tx.flags; tx->ring = tx_map + off.tx.desc; - tx->cached_cons = xsk->config.tx_size; + tx->cached_prod = *tx->producer; + /* cached_cons is r->size bigger than the real consumer pointer + * See xsk_prod_nb_free + */ + tx->cached_cons = *tx->consumer + xsk->config.tx_size; } xsk->tx = tx; diff --git a/tools/lib/perf/Documentation/examples/counting.c b/tools/lib/perf/Documentation/examples/counting.c new file mode 100644 index 000000000000..6085693571ef --- /dev/null +++ b/tools/lib/perf/Documentation/examples/counting.c @@ -0,0 +1,83 @@ +#include <linux/perf_event.h> +#include <perf/evlist.h> +#include <perf/evsel.h> +#include <perf/cpumap.h> +#include <perf/threadmap.h> +#include <perf/mmap.h> +#include <perf/core.h> +#include <perf/event.h> +#include <stdio.h> +#include <unistd.h> + +static int libperf_print(enum libperf_print_level level, + const char *fmt, va_list ap) +{ + return vfprintf(stderr, fmt, ap); +} + +int main(int argc, char **argv) +{ + int count = 100000, err = 0; + struct perf_evlist *evlist; + struct perf_evsel *evsel; + struct perf_thread_map *threads; + struct perf_counts_values counts; + + struct perf_event_attr attr1 = { + .type = PERF_TYPE_SOFTWARE, + .config = 
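
The cached_prod/cached_cons initialization matches what the producer-side free-space check assumes: the cached consumer index is deliberately kept one ring size ahead of the real consumer pointer, so the subtraction yields the number of free slots. For reference, the helper in question looks roughly like this (paraphrased from xsk.h, not part of this diff):

    static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
    {
        __u32 free_entries = r->cached_cons - r->cached_prod;

        if (free_entries >= nb)
            return free_entries;

        /* refresh the local view; keeping cached_cons "size" ahead makes
         * the difference come out as the remaining capacity
         */
        r->cached_cons = *r->consumer + r->size;

        return r->cached_cons - r->cached_prod;
    }
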
PERF_COUNT_SW_CPU_CLOCK, + .read_format = PERF_FORMAT_TOTAL_TIME_ENABLED|PERF_FORMAT_TOTAL_TIME_RUNNING, + .disabled = 1, + }; + struct perf_event_attr attr2 = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_TASK_CLOCK, + .read_format = PERF_FORMAT_TOTAL_TIME_ENABLED|PERF_FORMAT_TOTAL_TIME_RUNNING, + .disabled = 1, + }; + + libperf_init(libperf_print); + threads = perf_thread_map__new_dummy(); + if (!threads) { + fprintf(stderr, "failed to create threads\n"); + return -1; + } + perf_thread_map__set_pid(threads, 0, 0); + evlist = perf_evlist__new(); + if (!evlist) { + fprintf(stderr, "failed to create evlist\n"); + goto out_threads; + } + evsel = perf_evsel__new(&attr1); + if (!evsel) { + fprintf(stderr, "failed to create evsel1\n"); + goto out_evlist; + } + perf_evlist__add(evlist, evsel); + evsel = perf_evsel__new(&attr2); + if (!evsel) { + fprintf(stderr, "failed to create evsel2\n"); + goto out_evlist; + } + perf_evlist__add(evlist, evsel); + perf_evlist__set_maps(evlist, NULL, threads); + err = perf_evlist__open(evlist); + if (err) { + fprintf(stderr, "failed to open evsel\n"); + goto out_evlist; + } + perf_evlist__enable(evlist); + while (count--); + perf_evlist__disable(evlist); + perf_evlist__for_each_evsel(evlist, evsel) { + perf_evsel__read(evsel, 0, 0, &counts); + fprintf(stdout, "count %llu, enabled %llu, run %llu\n", + counts.val, counts.ena, counts.run); + } + perf_evlist__close(evlist); +out_evlist: + perf_evlist__delete(evlist); +out_threads: + perf_thread_map__put(threads); + return err; +} diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index beaa8b8c08ff..e1bd2a93c6db 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -5541,7 +5541,7 @@ static void print_event_time(struct tep_handle *tep, struct trace_seq *s, if (p10 > 1 && p10 < time) trace_seq_printf(s, "%5llu.%0*llu", time / p10, prec, time % p10); else - trace_seq_printf(s, "%12llu\n", time); + trace_seq_printf(s, "%12llu", time); } struct print_event_type { diff --git a/tools/objtool/Build b/tools/objtool/Build index 8dc4f0848362..66f44f5cd2a6 100644 --- a/tools/objtool/Build +++ b/tools/objtool/Build @@ -11,6 +11,7 @@ objtool-y += objtool.o objtool-y += libstring.o objtool-y += libctype.o objtool-y += str_error_r.o +objtool-y += librbtree.o CFLAGS += -I$(srctree)/tools/lib @@ -25,3 +26,7 @@ $(OUTPUT)libctype.o: ../lib/ctype.c FORCE $(OUTPUT)str_error_r.o: ../lib/str_error_r.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) + +$(OUTPUT)librbtree.o: ../lib/rbtree.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index c807984a03c1..10fbe75ab43d 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -17,7 +17,7 @@ #include "builtin.h" #include "check.h" -bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess; +bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats; static const char * const check_usage[] = { "objtool check [<options>] file.o", @@ -31,6 +31,7 @@ const struct option check_options[] = { OPT_BOOLEAN('m', "module", &module, "Indicates the object will be part of a kernel module"), OPT_BOOLEAN('b', "backtrace", &backtrace, "unwind on error"), OPT_BOOLEAN('a', "uaccess", &uaccess, "enable uaccess checking"), + OPT_BOOLEAN('s', "stats", &stats, "print statistics"), OPT_END(), }; diff --git a/tools/objtool/builtin.h b/tools/objtool/builtin.h index a32736f8d2a4..0b907902ee79 
100644 --- a/tools/objtool/builtin.h +++ b/tools/objtool/builtin.h @@ -8,7 +8,7 @@ #include <subcmd/parse-options.h> extern const struct option check_options[]; -extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess; +extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats; extern int cmd_check(int argc, const char **argv); extern int cmd_orc(int argc, const char **argv); diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4768d91c6d68..8dd01f986fbb 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -72,22 +72,22 @@ static struct instruction *next_insn_same_func(struct objtool_file *file, return find_insn(file, func->cfunc->sec, func->cfunc->offset); } -#define func_for_each_insn_all(file, func, insn) \ +#define func_for_each_insn(file, func, insn) \ for (insn = find_insn(file, func->sec, func->offset); \ insn; \ insn = next_insn_same_func(file, insn)) -#define func_for_each_insn(file, func, insn) \ - for (insn = find_insn(file, func->sec, func->offset); \ +#define sym_for_each_insn(file, sym, insn) \ + for (insn = find_insn(file, sym->sec, sym->offset); \ insn && &insn->list != &file->insn_list && \ - insn->sec == func->sec && \ - insn->offset < func->offset + func->len; \ + insn->sec == sym->sec && \ + insn->offset < sym->offset + sym->len; \ insn = list_next_entry(insn, list)) -#define func_for_each_insn_continue_reverse(file, func, insn) \ +#define sym_for_each_insn_continue_reverse(file, sym, insn) \ for (insn = list_prev_entry(insn, list); \ &insn->list != &file->insn_list && \ - insn->sec == func->sec && insn->offset >= func->offset; \ + insn->sec == sym->sec && insn->offset >= sym->offset; \ insn = list_prev_entry(insn, list)) #define sec_for_each_insn_from(file, insn) \ @@ -97,14 +97,19 @@ static struct instruction *next_insn_same_func(struct objtool_file *file, for (insn = next_insn_same_sec(file, insn); insn; \ insn = next_insn_same_sec(file, insn)) +static bool is_static_jump(struct instruction *insn) +{ + return insn->type == INSN_JUMP_CONDITIONAL || + insn->type == INSN_JUMP_UNCONDITIONAL; +} + static bool is_sibling_call(struct instruction *insn) { /* An indirect jump is either a sibling call or a jump to a table. */ if (insn->type == INSN_JUMP_DYNAMIC) return list_empty(&insn->alts); - if (insn->type != INSN_JUMP_CONDITIONAL && - insn->type != INSN_JUMP_UNCONDITIONAL) + if (!is_static_jump(insn)) return false; /* add_jump_destinations() sets insn->call_dest for sibling calls. */ @@ -165,7 +170,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, if (!insn->func) return false; - func_for_each_insn_all(file, func, insn) { + func_for_each_insn(file, func, insn) { empty = false; if (insn->type == INSN_RETURN) @@ -180,7 +185,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, * case, the function's dead-end status depends on whether the target * of the sibling call returns. 
*/ - func_for_each_insn_all(file, func, insn) { + func_for_each_insn(file, func, insn) { if (is_sibling_call(insn)) { struct instruction *dest = insn->jump_dest; @@ -234,6 +239,7 @@ static int decode_instructions(struct objtool_file *file) struct symbol *func; unsigned long offset; struct instruction *insn; + unsigned long nr_insns = 0; int ret; for_each_sec(file, sec) { @@ -269,6 +275,7 @@ static int decode_instructions(struct objtool_file *file) hash_add(file->insn_hash, &insn->hash, insn->offset); list_add_tail(&insn->list, &file->insn_list); + nr_insns++; } list_for_each_entry(func, &sec->symbol_list, list) { @@ -281,11 +288,14 @@ static int decode_instructions(struct objtool_file *file) return -1; } - func_for_each_insn(file, func, insn) + sym_for_each_insn(file, func, insn) insn->func = func; } } + if (stats) + printf("nr_insns: %lu\n", nr_insns); + return 0; err: @@ -415,8 +425,8 @@ static void add_ignores(struct objtool_file *file) break; case STT_SECTION: - func = find_symbol_by_offset(rela->sym->sec, rela->addend); - if (!func || func->type != STT_FUNC) + func = find_func_by_offset(rela->sym->sec, rela->addend); + if (!func) continue; break; @@ -425,7 +435,7 @@ static void add_ignores(struct objtool_file *file) continue; } - func_for_each_insn_all(file, func, insn) + func_for_each_insn(file, func, insn) insn->ignore = true; } } @@ -478,6 +488,7 @@ static const char *uaccess_safe_builtin[] = { "__sanitizer_cov_trace_cmp2", "__sanitizer_cov_trace_cmp4", "__sanitizer_cov_trace_cmp8", + "__sanitizer_cov_trace_switch", /* UBSAN */ "ubsan_type_mismatch_common", "__ubsan_handle_type_mismatch", @@ -553,15 +564,14 @@ static int add_jump_destinations(struct objtool_file *file) unsigned long dest_off; for_each_insn(file, insn) { - if (insn->type != INSN_JUMP_CONDITIONAL && - insn->type != INSN_JUMP_UNCONDITIONAL) + if (!is_static_jump(insn)) continue; if (insn->ignore || insn->offset == FAKE_JUMP_OFFSET) continue; - rela = find_rela_by_dest_range(insn->sec, insn->offset, - insn->len); + rela = find_rela_by_dest_range(file->elf, insn->sec, + insn->offset, insn->len); if (!rela) { dest_sec = insn->sec; dest_off = insn->offset + insn->len + insn->immediate; @@ -657,14 +667,18 @@ static int add_call_destinations(struct objtool_file *file) if (insn->type != INSN_CALL) continue; - rela = find_rela_by_dest_range(insn->sec, insn->offset, - insn->len); + rela = find_rela_by_dest_range(file->elf, insn->sec, + insn->offset, insn->len); if (!rela) { dest_off = insn->offset + insn->len + insn->immediate; - insn->call_dest = find_symbol_by_offset(insn->sec, - dest_off); + insn->call_dest = find_func_by_offset(insn->sec, dest_off); + if (!insn->call_dest) + insn->call_dest = find_symbol_by_offset(insn->sec, dest_off); - if (!insn->call_dest && !insn->ignore) { + if (insn->ignore) + continue; + + if (!insn->call_dest) { WARN_FUNC("unsupported intra-function call", insn->sec, insn->offset); if (retpoline) @@ -672,11 +686,16 @@ static int add_call_destinations(struct objtool_file *file) return -1; } + if (insn->func && insn->call_dest->type != STT_FUNC) { + WARN_FUNC("unsupported call to non-function", + insn->sec, insn->offset); + return -1; + } + } else if (rela->sym->type == STT_SECTION) { - insn->call_dest = find_symbol_by_offset(rela->sym->sec, - rela->addend+4); - if (!insn->call_dest || - insn->call_dest->type != STT_FUNC) { + insn->call_dest = find_func_by_offset(rela->sym->sec, + rela->addend+4); + if (!insn->call_dest) { WARN_FUNC("can't find call dest symbol at %s+0x%x", insn->sec, insn->offset, 
rela->sym->sec->name, @@ -764,8 +783,28 @@ static int handle_group_alt(struct objtool_file *file, insn->ignore = orig_insn->ignore_alts; insn->func = orig_insn->func; - if (insn->type != INSN_JUMP_CONDITIONAL && - insn->type != INSN_JUMP_UNCONDITIONAL) + /* + * Since alternative replacement code is copy/pasted by the + * kernel after applying relocations, generally such code can't + * have relative-address relocation references to outside the + * .altinstr_replacement section, unless the arch's + * alternatives code can adjust the relative offsets + * accordingly. + * + * The x86 alternatives code adjusts the offsets only when it + * encounters a branch instruction at the very beginning of the + * replacement group. + */ + if ((insn->offset != special_alt->new_off || + (insn->type != INSN_CALL && !is_static_jump(insn))) && + find_rela_by_dest_range(file->elf, insn->sec, insn->offset, insn->len)) { + + WARN_FUNC("unsupported relocation in alternatives section", + insn->sec, insn->offset); + return -1; + } + + if (!is_static_jump(insn)) continue; if (!insn->immediate) @@ -1001,7 +1040,7 @@ static struct rela *find_jump_table(struct objtool_file *file, struct instruction *insn) { struct rela *text_rela, *table_rela; - struct instruction *orig_insn = insn; + struct instruction *dest_insn, *orig_insn = insn; struct section *table_sec; unsigned long table_offset; @@ -1028,8 +1067,8 @@ static struct rela *find_jump_table(struct objtool_file *file, break; /* look for a relocation which references .rodata */ - text_rela = find_rela_by_dest_range(insn->sec, insn->offset, - insn->len); + text_rela = find_rela_by_dest_range(file->elf, insn->sec, + insn->offset, insn->len); if (!text_rela || text_rela->sym->type != STT_SECTION || !text_rela->sym->sec->rodata) continue; @@ -1053,10 +1092,17 @@ static struct rela *find_jump_table(struct objtool_file *file, strcmp(table_sec->name, C_JUMP_TABLE_SECTION)) continue; - /* Each table entry has a rela associated with it. */ - table_rela = find_rela_by_dest(table_sec, table_offset); + /* + * Each table entry has a rela associated with it. The rela + * should reference text in the same function as the original + * instruction. 
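
To make the new alternatives check concrete (a hypothetical illustration, not taken from the patch): the replacement bytes are memcpy()d over the original site at patch time, and x86 only re-encodes a branch sitting at offset 0 of the replacement, so any other PC-relative reference inside .altinstr_replacement would point at the wrong address after the copy. The feature bit and symbols below are made up; ALTERNATIVE comes from <asm/alternative.h> in kernel context:

    /* accepted: the branch is the first replacement instruction,
     * so its displacement gets re-adjusted when the bytes are copied
     */
    asm volatile(ALTERNATIVE("nop", "jmp .Lextra_work", X86_FEATURE_HYPOTHETICAL));

    /* now rejected by objtool: the call is not at offset 0 and its
     * relative displacement stays encoded for .altinstr_replacement
     */
    asm volatile(ALTERNATIVE("nop; nop; nop; nop; nop; nop; nop",
                             "xor %%eax, %%eax; call hypothetical_helper",
                             X86_FEATURE_HYPOTHETICAL) ::: "eax", "memory");
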
+ */ + table_rela = find_rela_by_dest(file->elf, table_sec, table_offset); if (!table_rela) continue; + dest_insn = find_insn(file, table_rela->sym->sec, table_rela->addend); + if (!dest_insn || !dest_insn->func || dest_insn->func->pfunc != func) + continue; /* * Use of RIP-relative switch jumps is quite rare, and @@ -1082,7 +1128,7 @@ static void mark_func_jump_tables(struct objtool_file *file, struct instruction *insn, *last = NULL; struct rela *rela; - func_for_each_insn_all(file, func, insn) { + func_for_each_insn(file, func, insn) { if (!last) last = insn; @@ -1117,7 +1163,7 @@ static int add_func_jump_tables(struct objtool_file *file, struct instruction *insn; int ret; - func_for_each_insn_all(file, func, insn) { + func_for_each_insn(file, func, insn) { if (!insn->jump_table) continue; @@ -1187,7 +1233,7 @@ static int read_unwind_hints(struct objtool_file *file) for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) { hint = (struct unwind_hint *)sec->data->d_buf + i; - rela = find_rela_by_dest(sec, i * sizeof(*hint)); + rela = find_rela_by_dest(file->elf, sec, i * sizeof(*hint)); if (!rela) { WARN("can't find rela for unwind_hints[%d]", i); return -1; @@ -1935,6 +1981,41 @@ static int validate_sibling_call(struct instruction *insn, struct insn_state *st return validate_call(insn, state); } +static int validate_return(struct symbol *func, struct instruction *insn, struct insn_state *state) +{ + if (state->uaccess && !func_uaccess_safe(func)) { + WARN_FUNC("return with UACCESS enabled", + insn->sec, insn->offset); + return 1; + } + + if (!state->uaccess && func_uaccess_safe(func)) { + WARN_FUNC("return with UACCESS disabled from a UACCESS-safe function", + insn->sec, insn->offset); + return 1; + } + + if (state->df) { + WARN_FUNC("return with DF set", + insn->sec, insn->offset); + return 1; + } + + if (func && has_modified_stack_frame(state)) { + WARN_FUNC("return with modified stack frame", + insn->sec, insn->offset); + return 1; + } + + if (state->bp_scratch) { + WARN("%s uses BP as a scratch register", + func->name); + return 1; + } + + return 0; +} + /* * Follow the branch starting at the given instruction, and recursively follow * any other branches (jumps). 
Meanwhile, track the frame pointer state at @@ -1989,7 +2070,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, i = insn; save_insn = NULL; - func_for_each_insn_continue_reverse(file, func, i) { + sym_for_each_insn_continue_reverse(file, func, i) { if (i->save) { save_insn = i; break; @@ -2050,34 +2131,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, switch (insn->type) { case INSN_RETURN: - if (state.uaccess && !func_uaccess_safe(func)) { - WARN_FUNC("return with UACCESS enabled", sec, insn->offset); - return 1; - } - - if (!state.uaccess && func_uaccess_safe(func)) { - WARN_FUNC("return with UACCESS disabled from a UACCESS-safe function", sec, insn->offset); - return 1; - } - - if (state.df) { - WARN_FUNC("return with DF set", sec, insn->offset); - return 1; - } - - if (func && has_modified_stack_frame(&state)) { - WARN_FUNC("return with modified stack frame", - sec, insn->offset); - return 1; - } - - if (state.bp_scratch) { - WARN("%s uses BP as a scratch register", - func->name); - return 1; - } - - return 0; + return validate_return(func, insn, &state); case INSN_CALL: case INSN_CALL_DYNAMIC: @@ -2342,9 +2396,8 @@ static bool ignore_unreachable_insn(struct instruction *insn) return false; } -static int validate_functions(struct objtool_file *file) +static int validate_section(struct objtool_file *file, struct section *sec) { - struct section *sec; struct symbol *func; struct instruction *insn; struct insn_state state; @@ -2357,36 +2410,45 @@ static int validate_functions(struct objtool_file *file) CFI_NUM_REGS * sizeof(struct cfi_reg)); state.stack_size = initial_func_cfi.cfa.offset; - for_each_sec(file, sec) { - list_for_each_entry(func, &sec->symbol_list, list) { - if (func->type != STT_FUNC) - continue; + list_for_each_entry(func, &sec->symbol_list, list) { + if (func->type != STT_FUNC) + continue; - if (!func->len) { - WARN("%s() is missing an ELF size annotation", - func->name); - warnings++; - } + if (!func->len) { + WARN("%s() is missing an ELF size annotation", + func->name); + warnings++; + } - if (func->pfunc != func || func->alias != func) - continue; + if (func->pfunc != func || func->alias != func) + continue; - insn = find_insn(file, sec, func->offset); - if (!insn || insn->ignore || insn->visited) - continue; + insn = find_insn(file, sec, func->offset); + if (!insn || insn->ignore || insn->visited) + continue; - state.uaccess = func->uaccess_safe; + state.uaccess = func->uaccess_safe; - ret = validate_branch(file, func, insn, state); - if (ret && backtrace) - BT_FUNC("<=== (func)", insn); - warnings += ret; - } + ret = validate_branch(file, func, insn, state); + if (ret && backtrace) + BT_FUNC("<=== (func)", insn); + warnings += ret; } return warnings; } +static int validate_functions(struct objtool_file *file) +{ + struct section *sec; + int warnings = 0; + + for_each_sec(file, sec) + warnings += validate_section(file, sec); + + return warnings; +} + static int validate_reachable_instructions(struct objtool_file *file) { struct instruction *insn; @@ -2405,23 +2467,6 @@ static int validate_reachable_instructions(struct objtool_file *file) return 0; } -static void cleanup(struct objtool_file *file) -{ - struct instruction *insn, *tmpinsn; - struct alternative *alt, *tmpalt; - - list_for_each_entry_safe(insn, tmpinsn, &file->insn_list, list) { - list_for_each_entry_safe(alt, tmpalt, &insn->alts, list) { - list_del(&alt->list); - free(alt); - } - list_del(&insn->list); - hash_del(&insn->hash); - free(insn); - } - 
elf_close(file->elf); -} - static struct objtool_file file; int check(const char *_objname, bool orc) @@ -2489,10 +2534,14 @@ int check(const char *_objname, bool orc) } out: - cleanup(&file); + if (ret < 0) { + /* + * Fatal error. The binary is corrupt or otherwise broken in + * some way, or objtool itself is broken. Fail the kernel + * build. + */ + return ret; + } - /* ignore warnings for now until we get all the code cleaned up */ - if (ret || warnings) - return 0; return 0; } diff --git a/tools/objtool/check.h b/tools/objtool/check.h index 6d875ca6fce0..f0ce8ffe7135 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -50,7 +50,7 @@ struct instruction { struct objtool_file { struct elf *elf; struct list_head insn_list; - DECLARE_HASHTABLE(insn_hash, 16); + DECLARE_HASHTABLE(insn_hash, 20); bool ignore_unreachables, c_file, hints, rodata; }; diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index edba4745f25a..09ddc8f1def3 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -15,17 +15,107 @@ #include <string.h> #include <unistd.h> #include <errno.h> +#include "builtin.h" #include "elf.h" #include "warn.h" #define MAX_NAME_LEN 128 +static inline u32 str_hash(const char *str) +{ + return jhash(str, strlen(str), 0); +} + +static void rb_add(struct rb_root *tree, struct rb_node *node, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + + while (*link) { + parent = *link; + if (cmp(node, parent) < 0) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + rb_link_node(node, parent, link); + rb_insert_color(node, tree); +} + +static struct rb_node *rb_find_first(struct rb_root *tree, const void *key, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + struct rb_node *match = NULL; + + while (node) { + int c = cmp(key, node); + if (c <= 0) { + if (!c) + match = node; + node = node->rb_left; + } else if (c > 0) { + node = node->rb_right; + } + } + + return match; +} + +static struct rb_node *rb_next_match(struct rb_node *node, const void *key, + int (*cmp)(const void *key, const struct rb_node *)) +{ + node = rb_next(node); + if (node && cmp(key, node)) + node = NULL; + return node; +} + +#define rb_for_each(tree, node, key, cmp) \ + for ((node) = rb_find_first((tree), (key), (cmp)); \ + (node); (node) = rb_next_match((node), (key), (cmp))) + +static int symbol_to_offset(struct rb_node *a, const struct rb_node *b) +{ + struct symbol *sa = rb_entry(a, struct symbol, node); + struct symbol *sb = rb_entry(b, struct symbol, node); + + if (sa->offset < sb->offset) + return -1; + if (sa->offset > sb->offset) + return 1; + + if (sa->len < sb->len) + return -1; + if (sa->len > sb->len) + return 1; + + sa->alias = sb; + + return 0; +} + +static int symbol_by_offset(const void *key, const struct rb_node *node) +{ + const struct symbol *s = rb_entry(node, struct symbol, node); + const unsigned long *o = key; + + if (*o < s->offset) + return -1; + if (*o > s->offset + s->len) + return 1; + + return 0; +} + struct section *find_section_by_name(struct elf *elf, const char *name) { struct section *sec; - list_for_each_entry(sec, &elf->sections, list) + hash_for_each_possible(elf->section_name_hash, sec, name_hash, str_hash(name)) if (!strcmp(sec->name, name)) return sec; @@ -37,7 +127,7 @@ static struct section *find_section_by_index(struct elf *elf, { struct section *sec; - list_for_each_entry(sec, &elf->sections, list) + 
hash_for_each_possible(elf->section_hash, sec, hash, idx) if (sec->idx == idx) return sec; @@ -46,88 +136,116 @@ static struct section *find_section_by_index(struct elf *elf, static struct symbol *find_symbol_by_index(struct elf *elf, unsigned int idx) { - struct section *sec; struct symbol *sym; - list_for_each_entry(sec, &elf->sections, list) - hash_for_each_possible(sec->symbol_hash, sym, hash, idx) - if (sym->idx == idx) - return sym; + hash_for_each_possible(elf->symbol_hash, sym, hash, idx) + if (sym->idx == idx) + return sym; return NULL; } struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) { - struct symbol *sym; + struct rb_node *node; - list_for_each_entry(sym, &sec->symbol_list, list) - if (sym->type != STT_SECTION && - sym->offset == offset) - return sym; + rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + struct symbol *s = rb_entry(node, struct symbol, node); + + if (s->offset == offset && s->type != STT_SECTION) + return s; + } return NULL; } -struct symbol *find_symbol_by_name(struct elf *elf, const char *name) +struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) { - struct section *sec; - struct symbol *sym; + struct rb_node *node; - list_for_each_entry(sec, &elf->sections, list) - list_for_each_entry(sym, &sec->symbol_list, list) - if (!strcmp(sym->name, name)) - return sym; + rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + struct symbol *s = rb_entry(node, struct symbol, node); + + if (s->offset == offset && s->type == STT_FUNC) + return s; + } return NULL; } struct symbol *find_symbol_containing(struct section *sec, unsigned long offset) { - struct symbol *sym; + struct rb_node *node; - list_for_each_entry(sym, &sec->symbol_list, list) - if (sym->type != STT_SECTION && - offset >= sym->offset && offset < sym->offset + sym->len) - return sym; + rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + struct symbol *s = rb_entry(node, struct symbol, node); + + if (s->type != STT_SECTION) + return s; + } return NULL; } -struct rela *find_rela_by_dest_range(struct section *sec, unsigned long offset, - unsigned int len) +struct symbol *find_func_containing(struct section *sec, unsigned long offset) { - struct rela *rela; - unsigned long o; + struct rb_node *node; - if (!sec->rela) - return NULL; + rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) { + struct symbol *s = rb_entry(node, struct symbol, node); - for (o = offset; o < offset + len; o++) - hash_for_each_possible(sec->rela->rela_hash, rela, hash, o) - if (rela->offset == o) - return rela; + if (s->type == STT_FUNC) + return s; + } return NULL; } -struct rela *find_rela_by_dest(struct section *sec, unsigned long offset) +struct symbol *find_symbol_by_name(struct elf *elf, const char *name) { - return find_rela_by_dest_range(sec, offset, 1); + struct symbol *sym; + + hash_for_each_possible(elf->symbol_name_hash, sym, name_hash, str_hash(name)) + if (!strcmp(sym->name, name)) + return sym; + + return NULL; } -struct symbol *find_containing_func(struct section *sec, unsigned long offset) +struct rela *find_rela_by_dest_range(struct elf *elf, struct section *sec, + unsigned long offset, unsigned int len) { - struct symbol *func; + struct rela *rela, *r = NULL; + unsigned long o; + + if (!sec->rela) + return NULL; - list_for_each_entry(func, &sec->symbol_list, list) - if (func->type == STT_FUNC && offset >= func->offset && - offset < func->offset + func->len) - return func; + sec = sec->rela; + + 
for_offset_range(o, offset, offset + len) { + hash_for_each_possible(elf->rela_hash, rela, hash, + sec_offset_hash(sec, o)) { + if (rela->sec != sec) + continue; + + if (rela->offset >= offset && rela->offset < offset + len) { + if (!r || rela->offset < r->offset) + r = rela; + } + } + if (r) + return r; + } return NULL; } +struct rela *find_rela_by_dest(struct elf *elf, struct section *sec, unsigned long offset) +{ + return find_rela_by_dest_range(elf, sec, offset, 1); +} + static int read_sections(struct elf *elf) { Elf_Scn *s = NULL; @@ -155,10 +273,6 @@ static int read_sections(struct elf *elf) INIT_LIST_HEAD(&sec->symbol_list); INIT_LIST_HEAD(&sec->rela_list); - hash_init(sec->rela_hash); - hash_init(sec->symbol_hash); - - list_add_tail(&sec->list, &elf->sections); s = elf_getscn(elf->elf, i); if (!s) { @@ -193,8 +307,15 @@ static int read_sections(struct elf *elf) } } sec->len = sec->sh.sh_size; + + list_add_tail(&sec->list, &elf->sections); + hash_add(elf->section_hash, &sec->hash, sec->idx); + hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name)); } + if (stats) + printf("nr_sections: %lu\n", (unsigned long)sections_nr); + /* sanity check, one more call to elf_nextscn() should return NULL */ if (elf_nextscn(elf->elf, s)) { WARN("section entry mismatch"); @@ -207,8 +328,9 @@ static int read_sections(struct elf *elf) static int read_symbols(struct elf *elf) { struct section *symtab, *sec; - struct symbol *sym, *pfunc, *alias; - struct list_head *entry, *tmp; + struct symbol *sym, *pfunc; + struct list_head *entry; + struct rb_node *pnode; int symbols_nr, i; char *coldstr; @@ -227,7 +349,7 @@ static int read_symbols(struct elf *elf) return -1; } memset(sym, 0, sizeof(*sym)); - alias = sym; + sym->alias = sym; sym->idx = i; @@ -265,33 +387,20 @@ static int read_symbols(struct elf *elf) sym->offset = sym->sym.st_value; sym->len = sym->sym.st_size; - /* sorted insert into a per-section list */ - entry = &sym->sec->symbol_list; - list_for_each_prev(tmp, &sym->sec->symbol_list) { - struct symbol *s; - - s = list_entry(tmp, struct symbol, list); - - if (sym->offset > s->offset) { - entry = tmp; - break; - } - - if (sym->offset == s->offset) { - if (sym->len && sym->len == s->len && alias == sym) - alias = s; - - if (sym->len >= s->len) { - entry = tmp; - break; - } - } - } - sym->alias = alias; + rb_add(&sym->sec->symbol_tree, &sym->node, symbol_to_offset); + pnode = rb_prev(&sym->node); + if (pnode) + entry = &rb_entry(pnode, struct symbol, node)->list; + else + entry = &sym->sec->symbol_list; list_add(&sym->list, entry); - hash_add(sym->sec->symbol_hash, &sym->hash, sym->idx); + hash_add(elf->symbol_hash, &sym->hash, sym->idx); + hash_add(elf->symbol_name_hash, &sym->name_hash, str_hash(sym->name)); } + if (stats) + printf("nr_symbols: %lu\n", (unsigned long)symbols_nr); + /* Create parent/child links for any cold subfunctions */ list_for_each_entry(sec, &elf->sections, list) { list_for_each_entry(sym, &sec->symbol_list, list) { @@ -353,6 +462,7 @@ static int read_relas(struct elf *elf) struct rela *rela; int i; unsigned int symndx; + unsigned long nr_rela, max_rela = 0, tot_rela = 0; list_for_each_entry(sec, &elf->sections, list) { if (sec->sh.sh_type != SHT_RELA) @@ -367,6 +477,7 @@ static int read_relas(struct elf *elf) sec->base->rela = sec; + nr_rela = 0; for (i = 0; i < sec->sh.sh_size / sec->sh.sh_entsize; i++) { rela = malloc(sizeof(*rela)); if (!rela) { @@ -393,9 +504,16 @@ static int read_relas(struct elf *elf) } list_add_tail(&rela->list, &sec->rela_list); - 
hash_add(sec->rela_hash, &rela->hash, rela->offset); - + hash_add(elf->rela_hash, &rela->hash, rela_hash(rela)); + nr_rela++; } + max_rela = max(max_rela, nr_rela); + tot_rela += nr_rela; + } + + if (stats) { + printf("max_rela: %lu\n", max_rela); + printf("tot_rela: %lu\n", tot_rela); } return 0; @@ -415,6 +533,11 @@ struct elf *elf_read(const char *name, int flags) } memset(elf, 0, sizeof(*elf)); + hash_init(elf->symbol_hash); + hash_init(elf->symbol_name_hash); + hash_init(elf->section_hash); + hash_init(elf->section_name_hash); + hash_init(elf->rela_hash); INIT_LIST_HEAD(&elf->sections); elf->fd = open(name, flags); @@ -475,10 +598,6 @@ struct section *elf_create_section(struct elf *elf, const char *name, INIT_LIST_HEAD(&sec->symbol_list); INIT_LIST_HEAD(&sec->rela_list); - hash_init(sec->rela_hash); - hash_init(sec->symbol_hash); - - list_add_tail(&sec->list, &elf->sections); s = elf_newscn(elf->elf); if (!s) { @@ -556,6 +675,10 @@ struct section *elf_create_section(struct elf *elf, const char *name, shstrtab->len += strlen(name) + 1; shstrtab->changed = true; + list_add_tail(&sec->list, &elf->sections); + hash_add(elf->section_hash, &sec->hash, sec->idx); + hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name)); + return sec; } diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h index 44150204db4d..ebbb10c61e24 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -10,6 +10,8 @@ #include <gelf.h> #include <linux/list.h> #include <linux/hashtable.h> +#include <linux/rbtree.h> +#include <linux/jhash.h> #ifdef LIBELF_USE_DEPRECATED # define elf_getshdrnum elf_getshnum @@ -25,11 +27,12 @@ struct section { struct list_head list; + struct hlist_node hash; + struct hlist_node name_hash; GElf_Shdr sh; + struct rb_root symbol_tree; struct list_head symbol_list; - DECLARE_HASHTABLE(symbol_hash, 8); struct list_head rela_list; - DECLARE_HASHTABLE(rela_hash, 16); struct section *base, *rela; struct symbol *sym; Elf_Data *data; @@ -41,7 +44,9 @@ struct section { struct symbol { struct list_head list; + struct rb_node node; struct hlist_node hash; + struct hlist_node name_hash; GElf_Sym sym; struct section *sec; char *name; @@ -71,19 +76,51 @@ struct elf { int fd; char *name; struct list_head sections; - DECLARE_HASHTABLE(rela_hash, 16); + DECLARE_HASHTABLE(symbol_hash, 20); + DECLARE_HASHTABLE(symbol_name_hash, 20); + DECLARE_HASHTABLE(section_hash, 16); + DECLARE_HASHTABLE(section_name_hash, 16); + DECLARE_HASHTABLE(rela_hash, 20); }; +#define OFFSET_STRIDE_BITS 4 +#define OFFSET_STRIDE (1UL << OFFSET_STRIDE_BITS) +#define OFFSET_STRIDE_MASK (~(OFFSET_STRIDE - 1)) + +#define for_offset_range(_offset, _start, _end) \ + for (_offset = ((_start) & OFFSET_STRIDE_MASK); \ + _offset <= ((_end) & OFFSET_STRIDE_MASK); \ + _offset += OFFSET_STRIDE) + +static inline u32 sec_offset_hash(struct section *sec, unsigned long offset) +{ + u32 ol, oh, idx = sec->idx; + + offset &= OFFSET_STRIDE_MASK; + + ol = offset; + oh = offset >> 32; + + __jhash_mix(ol, oh, idx); + + return ol; +} + +static inline u32 rela_hash(struct rela *rela) +{ + return sec_offset_hash(rela->sec, rela->offset); +} struct elf *elf_read(const char *name, int flags); struct section *find_section_by_name(struct elf *elf, const char *name); +struct symbol *find_func_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_name(struct elf *elf, const char *name); struct symbol *find_symbol_containing(struct 
section *sec, unsigned long offset); -struct rela *find_rela_by_dest(struct section *sec, unsigned long offset); -struct rela *find_rela_by_dest_range(struct section *sec, unsigned long offset, - unsigned int len); -struct symbol *find_containing_func(struct section *sec, unsigned long offset); +struct rela *find_rela_by_dest(struct elf *elf, struct section *sec, unsigned long offset); +struct rela *find_rela_by_dest_range(struct elf *elf, struct section *sec, + unsigned long offset, unsigned int len); +struct symbol *find_func_containing(struct section *sec, unsigned long offset); struct section *elf_create_section(struct elf *elf, const char *name, size_t entsize, int nr); struct section *elf_create_rela_section(struct elf *elf, struct section *base); diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 27a4112848c2..41e4a2754da4 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -81,7 +81,7 @@ int create_orc(struct objtool_file *file) return 0; } -static int create_orc_entry(struct section *u_sec, struct section *ip_relasec, +static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relasec, unsigned int idx, struct section *insn_sec, unsigned long insn_off, struct orc_entry *o) { @@ -109,9 +109,10 @@ static int create_orc_entry(struct section *u_sec, struct section *ip_relasec, rela->addend = insn_off; rela->type = R_X86_64_PC32; rela->offset = idx * sizeof(int); + rela->sec = ip_relasec; list_add_tail(&rela->list, &ip_relasec->rela_list); - hash_add(ip_relasec->rela_hash, &rela->hash, rela->offset); + hash_add(elf->rela_hash, &rela->hash, rela_hash(rela)); return 0; } @@ -182,7 +183,7 @@ int create_orc_sections(struct objtool_file *file) if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc, sizeof(struct orc_entry))) { - if (create_orc_entry(u_sec, ip_relasec, idx, + if (create_orc_entry(file->elf, u_sec, ip_relasec, idx, insn->sec, insn->offset, &insn->orc)) return -1; @@ -194,7 +195,7 @@ int create_orc_sections(struct objtool_file *file) /* section terminator */ if (prev_insn) { - if (create_orc_entry(u_sec, ip_relasec, idx, + if (create_orc_entry(file->elf, u_sec, ip_relasec, idx, prev_insn->sec, prev_insn->offset + prev_insn->len, &empty)) diff --git a/tools/objtool/special.c b/tools/objtool/special.c index fdbaa611146d..e74e0189de22 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -118,7 +118,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, } } - orig_rela = find_rela_by_dest(sec, offset + entry->orig); + orig_rela = find_rela_by_dest(elf, sec, offset + entry->orig); if (!orig_rela) { WARN_FUNC("can't find orig rela", sec, offset + entry->orig); return -1; @@ -133,7 +133,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, alt->orig_off = orig_rela->addend; if (!entry->group || alt->new_len) { - new_rela = find_rela_by_dest(sec, offset + entry->new); + new_rela = find_rela_by_dest(elf, sec, offset + entry->new); if (!new_rela) { WARN_FUNC("can't find new rela", sec, offset + entry->new); diff --git a/tools/objtool/warn.h b/tools/objtool/warn.h index cbb0a02b7480..7799f60de80a 100644 --- a/tools/objtool/warn.h +++ b/tools/objtool/warn.h @@ -21,7 +21,7 @@ static inline char *offstr(struct section *sec, unsigned long offset) char *name, *str; unsigned long name_off; - func = find_containing_func(sec, offset); + func = find_func_containing(sec, offset); if (func) { name = func->name; name_off = offset - func->offset; diff --git 
a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index adc5a7e44b98..31824d5269cc 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -295,7 +295,10 @@ $(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.xml $(OUTPUT)%.xml : %.txt $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ $(ASCIIDOC) -b docbook -d manpage \ - $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ + $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) \ + -aperf_date=$(shell git log -1 --pretty="format:%cd" \ + --date=short $<) \ + -o $@+ $< && \ mv $@+ $@ XSLT = docbook.xsl diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index 2cf2d9e9d0da..fd9241a1b987 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -1,991 +1 @@ -Intel Processor Trace -===================== - -Overview -======== - -Intel Processor Trace (Intel PT) is an extension of Intel Architecture that -collects information about software execution such as control flow, execution -modes and timings and formats it into highly compressed binary packets. -Technical details are documented in the Intel 64 and IA-32 Architectures -Software Developer Manuals, Chapter 36 Intel Processor Trace. - -Intel PT is first supported in Intel Core M and 5th generation Intel Core -processors that are based on the Intel micro-architecture code name Broadwell. - -Trace data is collected by 'perf record' and stored within the perf.data file. -See below for options to 'perf record'. - -Trace data must be 'decoded' which involves walking the object code and matching -the trace data packets. For example a TNT packet only tells whether a -conditional branch was taken or not taken, so to make use of that packet the -decoder must know precisely which instruction was being executed. - -Decoding is done on-the-fly. The decoder outputs samples in the same format as -samples output by perf hardware events, for example as though the "instructions" -or "branches" events had been recorded. Presently 3 tools support this: -'perf script', 'perf report' and 'perf inject'. See below for more information -on using those tools. - -The main distinguishing feature of Intel PT is that the decoder can determine -the exact flow of software execution. Intel PT can be used to understand why -and how did software get to a certain point, or behave a certain way. The -software does not have to be recompiled, so Intel PT works with debug or release -builds, however the executed images are needed - which makes use in JIT-compiled -environments, or with self-modified code, a challenge. Also symbols need to be -provided to make sense of addresses. - -A limitation of Intel PT is that it produces huge amounts of trace data -(hundreds of megabytes per second per core) which takes a long time to decode, -for example two or three orders of magnitude longer than it took to collect. -Another limitation is the performance impact of tracing, something that will -vary depending on the use-case and architecture. - - -Quickstart -========== - -It is important to start small. That is because it is easy to capture vastly -more data than can possibly be processed. - -The simplest thing to do with Intel PT is userspace profiling of small programs. -Data is captured with 'perf record' e.g. to trace 'ls' userspace-only: - - perf record -e intel_pt//u ls - -And profiled with 'perf report' e.g. 
- - perf report - -To also trace kernel space presents a problem, namely kernel self-modifying -code. A fairly good kernel image is available in /proc/kcore but to get an -accurate image a copy of /proc/kcore needs to be made under the same conditions -as the data capture. A script perf-with-kcore can do that, but beware that the -script makes use of 'sudo' to copy /proc/kcore. If you have perf installed -locally from the source tree you can do: - - ~/libexec/perf-core/perf-with-kcore record pt_ls -e intel_pt// -- ls - -which will create a directory named 'pt_ls' and put the perf.data file and -copies of /proc/kcore, /proc/kallsyms and /proc/modules into it. Then to use -'perf report' becomes: - - ~/libexec/perf-core/perf-with-kcore report pt_ls - -Because samples are synthesized after-the-fact, the sampling period can be -selected for reporting. e.g. sample every microsecond - - ~/libexec/perf-core/perf-with-kcore report pt_ls --itrace=i1usge - -See the sections below for more information about the --itrace option. - -Beware the smaller the period, the more samples that are produced, and the -longer it takes to process them. - -Also note that the coarseness of Intel PT timing information will start to -distort the statistical value of the sampling as the sampling period becomes -smaller. - -To represent software control flow, "branches" samples are produced. By default -a branch sample is synthesized for every single branch. To get an idea what -data is available you can use the 'perf script' tool with all itrace sampling -options, which will list all the samples. - - perf record -e intel_pt//u ls - perf script --itrace=ibxwpe - -An interesting field that is not printed by default is 'flags' which can be -displayed as follows: - - perf script --itrace=ibxwpe -F+flags - -The flags are "bcrosyiABEx" which stand for branch, call, return, conditional, -system, asynchronous, interrupt, transaction abort, trace begin, trace end, and -in transaction, respectively. - -Another interesting field that is not printed by default is 'ipc' which can be -displayed as follows: - - perf script --itrace=be -F+ipc - -There are two ways that instructions-per-cycle (IPC) can be calculated depending -on the recording. - -If the 'cyc' config term (see config terms section below) was used, then IPC is -calculated using the cycle count from CYC packets, otherwise MTC packets are -used - refer to the 'mtc' config term. When MTC is used, however, the values -are less accurate because the timing is less accurate. - -Because Intel PT does not update the cycle count on every branch or instruction, -the values will often be zero. When there are values, they will be the number -of instructions and number of cycles since the last update, and thus represent -the average IPC since the last IPC for that event type. Note IPC for "branches" -events is calculated separately from IPC for "instructions" events. - -Also note that the IPC instruction count may or may not include the current -instruction. If the cycle count is associated with an asynchronous branch -(e.g. page fault or interrupt), then the instruction count does not include the -current instruction, otherwise it does. That is consistent with whether or not -that instruction has retired when the cycle count is updated. - -Another note, in the case of "branches" events, non-taken branches are not -presently sampled, so IPC values for them do not appear e.g. a CYC packet with a -TNT packet that starts with a non-taken branch. 
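As an aside (not part of the patch), a minimal C sketch of the IPC figure described above: it is simply the instruction count divided by the cycle count accumulated since the last update. The names are illustrative, not perf internals:

    /* IPC as described above: counts accumulated since the last update. */
    static double sample_ipc(unsigned long insn_cnt, unsigned long cyc_cnt)
    {
        if (!cyc_cnt)
            return 0.0;    /* no new timing information yet, so IPC shows 0 */
        return (double)insn_cnt / (double)cyc_cnt;
    }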
To see every possible IPC -value, "instructions" events can be used e.g. --itrace=i0ns - -While it is possible to create scripts to analyze the data, an alternative -approach is available to export the data to a sqlite or postgresql database. -Refer to script export-to-sqlite.py or export-to-postgresql.py for more details, -and to script exported-sql-viewer.py for an example of using the database. - -There is also script intel-pt-events.py which provides an example of how to -unpack the raw data for power events and PTWRITE. - -As mentioned above, it is easy to capture too much data. One way to limit the -data captured is to use 'snapshot' mode which is explained further below. -Refer to 'new snapshot option' and 'Intel PT modes of operation' further below. - -Another problem that will be experienced is decoder errors. They can be caused -by inability to access the executed image, self-modified or JIT-ed code, or the -inability to match side-band information (such as context switches and mmaps) -which results in the decoder not knowing what code was executed. - -There is also the problem of perf not being able to copy the data fast enough, -resulting in data lost because the buffer was full. See 'Buffer handling' below -for more details. - - -perf record -=========== - -new event ---------- - -The Intel PT kernel driver creates a new PMU for Intel PT. PMU events are -selected by providing the PMU name followed by the "config" separated by slashes. -An enhancement has been made to allow default "config" e.g. the option - - -e intel_pt// - -will use a default config value. Currently that is the same as - - -e intel_pt/tsc,noretcomp=0/ - -which is the same as - - -e intel_pt/tsc=1,noretcomp=0/ - -Note there are now new config terms - see section 'config terms' further below. - -The config terms are listed in /sys/devices/intel_pt/format. They are bit -fields within the config member of the struct perf_event_attr which is -passed to the kernel by the perf_event_open system call. They correspond to bit -fields in the IA32_RTIT_CTL MSR. Here is a list of them and their definitions: - - $ grep -H . /sys/bus/event_source/devices/intel_pt/format/* - /sys/bus/event_source/devices/intel_pt/format/cyc:config:1 - /sys/bus/event_source/devices/intel_pt/format/cyc_thresh:config:19-22 - /sys/bus/event_source/devices/intel_pt/format/mtc:config:9 - /sys/bus/event_source/devices/intel_pt/format/mtc_period:config:14-17 - /sys/bus/event_source/devices/intel_pt/format/noretcomp:config:11 - /sys/bus/event_source/devices/intel_pt/format/psb_period:config:24-27 - /sys/bus/event_source/devices/intel_pt/format/tsc:config:10 - -Note that the default config must be overridden for each term i.e. - - -e intel_pt/noretcomp=0/ - -is the same as: - - -e intel_pt/tsc=1,noretcomp=0/ - -So, to disable TSC packets use: - - -e intel_pt/tsc=0/ - -It is also possible to specify the config value explicitly: - - -e intel_pt/config=0x400/ - -Note that, as with all events, the event is suffixed with event modifiers: - - u userspace - k kernel - h hypervisor - G guest - H host - p precise ip - -'h', 'G' and 'H' are for virtualization which is not supported by Intel PT. -'p' is also not relevant to Intel PT. So only options 'u' and 'k' are -meaningful for Intel PT. - -perf_event_attr is displayed if the -vv option is used e.g. 
- - ------------------------------------------------------------ - perf_event_attr: - type 6 - size 112 - config 0x400 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|CPU|IDENTIFIER - read_format ID - disabled 1 - inherit 1 - exclude_kernel 1 - exclude_hv 1 - enable_on_exec 1 - sample_id_all 1 - ------------------------------------------------------------ - sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 - ------------------------------------------------------------ - - -config terms ------------- - -The June 2015 version of Intel 64 and IA-32 Architectures Software Developer -Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features. -Some of the features are reflect in new config terms. All the config terms are -described below. - -tsc Always supported. Produces TSC timestamp packets to provide - timing information. In some cases it is possible to decode - without timing information, for example a per-thread context - that does not overlap executable memory maps. - - The default config selects tsc (i.e. tsc=1). - -noretcomp Always supported. Disables "return compression" so a TIP packet - is produced when a function returns. Causes more packets to be - produced but might make decoding more reliable. - - The default config does not select noretcomp (i.e. noretcomp=0). - -psb_period Allows the frequency of PSB packets to be specified. - - The PSB packet is a synchronization packet that provides a - starting point for decoding or recovery from errors. - - Support for psb_period is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_cyc - - which contains "1" if the feature is supported and "0" - otherwise. - - Valid values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_periods - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The psb_period value is converted to the approximate number of - trace bytes between PSB packets as: - - 2 ^ (value + 11) - - e.g. value 3 means 16KiB bytes between PSBs - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/psb_period=15/u uname - Invalid psb_period for intel_pt. Valid values are: 0-5 - - If MTC packets are selected, the default config selects a value - of 3 (i.e. psb_period=3) or the nearest lower value that is - supported (0 is always supported). Otherwise the default is 0. - - If decoding is expected to be reliable and the buffer is large - then a large PSB period can be used. - - Because a TSC packet is produced with PSB, the PSB period can - also affect the granularity to timing information in the absence - of MTC or CYC. - -mtc Produces MTC timing packets. - - MTC packets provide finer grain timestamp information than TSC - packets. MTC packets record time using the hardware crystal - clock (CTC) which is related to TSC packets using a TMA packet. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/mtc - - which contains "1" if the feature is supported and - "0" otherwise. - - The frequency of MTC packets can also be specified - see - mtc_period below. - -mtc_period Specifies how frequently MTC packets are produced - see mtc - above for how to determine if MTC packets are supported. 
- - Valid values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/mtc_periods - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The mtc_period value is converted to the MTC frequency as: - - CTC-frequency / (2 ^ value) - - e.g. value 3 means one eighth of CTC-frequency - - Where CTC is the hardware crystal clock, the frequency of which - can be related to TSC via values provided in cpuid leaf 0x15. - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/mtc_period=15/u uname - Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9 - - The default value is 3 or the nearest lower value - that is supported (0 is always supported). - -cyc Produces CYC timing packets. - - CYC packets provide even finer grain timestamp information than - MTC and TSC packets. A CYC packet contains the number of CPU - cycles since the last CYC packet. Unlike MTC and TSC packets, - CYC packets are only sent when another packet is also sent. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_cyc - - which contains "1" if the feature is supported and - "0" otherwise. - - The number of CYC packets produced can be reduced by specifying - a threshold - see cyc_thresh below. - -cyc_thresh Specifies how frequently CYC packets are produced - see cyc - above for how to determine if CYC packets are supported. - - Valid cyc_thresh values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The cyc_thresh value represents the minimum number of CPU cycles - that must have passed before a CYC packet can be sent. The - number of CPU cycles is: - - 2 ^ (value - 1) - - e.g. value 4 means 8 CPU cycles must pass before a CYC packet - can be sent. Note a CYC packet is still only sent when another - packet is sent, not at, e.g. every 8 CPU cycles. - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/cyc,cyc_thresh=15/u uname - Invalid cyc_thresh for intel_pt. Valid values are: 0-12 - - CYC packets are not requested by default. - -pt Specifies pass-through which enables the 'branch' config term. - - The default config selects 'pt' if it is available, so a user will - never need to specify this term. - -branch Enable branch tracing. Branch tracing is enabled by default so to - disable branch tracing use 'branch=0'. - - The default config selects 'branch' if it is available. - -ptw Enable PTWRITE packets which are produced when a ptwrite instruction - is executed. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/ptwrite - - which contains "1" if the feature is supported and - "0" otherwise. - -fup_on_ptw Enable a FUP packet to follow the PTWRITE packet. The FUP packet - provides the address of the ptwrite instruction. In the absence of - fup_on_ptw, the decoder will use the address of the previous branch - if branch tracing is enabled, otherwise the address will be zero. - Note that fup_on_ptw will work even when branch tracing is disabled. - -pwr_evt Enable power events. The power events provide information about - changes to the CPU C-state. 
- - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/power_event_trace - - which contains "1" if the feature is supported and - "0" otherwise. - - -AUX area sampling option ------------------------- - -To select Intel PT "sampling" the AUX area sampling option can be used: - - --aux-sample - -Optionally it can be followed by the sample size in bytes e.g. - - --aux-sample=8192 - -In addition, the Intel PT event to sample must be defined e.g. - - -e intel_pt//u - -Samples on other events will be created containing Intel PT data e.g. the -following will create Intel PT samples on the branch-misses event, note the -events must be grouped using {}: - - perf record --aux-sample -e '{intel_pt//u,branch-misses:u}' - -An alternative to '--aux-sample' is to add the config term 'aux-sample-size' to -events. In this case, the grouping is implied e.g. - - perf record -e intel_pt//u -e branch-misses/aux-sample-size=8192/u - -is the same as: - - perf record -e '{intel_pt//u,branch-misses/aux-sample-size=8192/u}' - -but allows for also using an address filter e.g.: - - perf record -e intel_pt//u --filter 'filter * @/bin/ls' -e branch-misses/aux-sample-size=8192/u -- ls - -It is important to select a sample size that is big enough to contain at least -one PSB packet. If not a warning will be displayed: - - Intel PT sample size (%zu) may be too small for PSB period (%zu) - -The calculation used for that is: if sample_size <= psb_period + 256 display the -warning. When sampling is used, psb_period defaults to 0 (2KiB). - -The default sample size is 4KiB. - -The sample size is passed in aux_sample_size in struct perf_event_attr. The -sample size is limited by the maximum event size which is 64KiB. It is -difficult to know how big the event might be without the trace sample attached, -but the tool validates that the sample size is not greater than 60KiB. - - -new snapshot option -------------------- - -The difference between full trace and snapshot from the kernel's perspective is -that in full trace we don't overwrite trace data that the user hasn't collected -yet (and indicated that by advancing aux_tail), whereas in snapshot mode we let -the trace run and overwrite older data in the buffer so that whenever something -interesting happens, we can stop it and grab a snapshot of what was going on -around that interesting moment. - -To select snapshot mode a new option has been added: - - -S - -Optionally it can be followed by the snapshot size e.g. - - -S0x100000 - -The default snapshot size is the auxtrace mmap size. If neither auxtrace mmap size -nor snapshot size is specified, then the default is 4MiB for privileged users -(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. -If an unprivileged user does not specify mmap pages, the mmap pages will be -reduced as described in the 'new auxtrace mmap size option' section below. - -The snapshot size is displayed if the option -vv is used e.g. - - Intel PT snapshot size: %zu - - -new auxtrace mmap size option ---------------------------- - -Intel PT buffer size is specified by an addition to the -m option e.g. - - -m,16 - -selects a buffer size of 16 pages i.e. 64KiB. - -Note that the existing functionality of -m is unchanged. The auxtrace mmap size -is specified by the optional addition of a comma and the value. - -The default auxtrace mmap size for Intel PT is 4MiB/page_size for privileged users -(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. 
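As an aside (not part of the patch), a minimal C sketch of the config-term encoding and the period conversions described in the config terms section above. The bit positions are the ones listed in the intel_pt format files quoted earlier; a real tool would read them from sysfs rather than hard-coding them, and the helper names are illustrative:

    /* Bit positions from /sys/bus/event_source/devices/intel_pt/format/ */
    #define PT_CYC            (1ULL << 1)          /* cyc:config:1        */
    #define PT_MTC            (1ULL << 9)          /* mtc:config:9        */
    #define PT_TSC            (1ULL << 10)         /* tsc:config:10       */
    #define PT_NORETCOMP      (1ULL << 11)         /* noretcomp:config:11 */
    #define PT_MTC_PERIOD(v)  ((unsigned long long)(v) << 14)   /* 14-17 */
    #define PT_CYC_THRESH(v)  ((unsigned long long)(v) << 19)   /* 19-22 */
    #define PT_PSB_PERIOD(v)  ((unsigned long long)(v) << 24)   /* 24-27 */

    /* -e intel_pt/tsc=1,noretcomp=0/ is just config = PT_TSC, i.e. the
     * 0x400 value shown in the perf_event_attr dump above. */

    /* psb_period value -> approximate trace bytes between PSB packets */
    static unsigned long psb_period_bytes(unsigned int v)
    {
        return 1UL << (v + 11);                  /* 3 -> 16KiB  */
    }

    /* mtc_period value -> MTC frequency, given the CTC frequency */
    static double mtc_frequency(double ctc_freq, unsigned int v)
    {
        return ctc_freq / (double)(1UL << v);    /* 3 -> CTC/8  */
    }

    /* cyc_thresh value -> minimum CPU cycles before another CYC packet */
    static unsigned long cyc_thresh_cycles(unsigned int v)
    {
        return v ? 1UL << (v - 1) : 0;           /* 4 -> 8 cycles */
    }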
-If an unprivileged user does not specify mmap pages, the mmap pages will be -reduced from the default 512KiB/page_size to 256KiB/page_size, otherwise the -user is likely to get an error as they exceed their mlock limit (Max locked -memory as shown in /proc/self/limits). Note that perf does not count the first -512KiB (actually /proc/sys/kernel/perf_event_mlock_kb minus 1 page) per cpu -against the mlock limit so an unprivileged user is allowed 512KiB per cpu plus -their mlock limit (which defaults to 64KiB but is not multiplied by the number -of cpus). - -In full-trace mode, powers of two are allowed for buffer size, with a minimum -size of 2 pages. In snapshot mode or sampling mode, it is the same but the -minimum size is 1 page. - -The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g. - - mmap length 528384 - auxtrace mmap length 4198400 - - -Intel PT modes of operation ---------------------------- - -Intel PT can be used in 2 modes: - full-trace mode - sample mode - snapshot mode - -Full-trace mode traces continuously e.g. - - perf record -e intel_pt//u uname - -Sample mode attaches a Intel PT sample to other events e.g. - - perf record --aux-sample -e intel_pt//u -e branch-misses:u - -Snapshot mode captures the available data when a signal is sent e.g. - - perf record -v -e intel_pt//u -S ./loopy 1000000000 & - [1] 11435 - kill -USR2 11435 - Recording AUX area tracing snapshot - -Note that the signal sent is SIGUSR2. -Note that "Recording AUX area tracing snapshot" is displayed because the -v -option is used. - -The 2 modes cannot be used together. - - -Buffer handling ---------------- - -There may be buffer limitations (i.e. single ToPa entry) which means that actual -buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER). In order to -provide other sizes, and in particular an arbitrarily large size, multiple -buffers are logically concatenated. However an interrupt must be used to switch -between buffers. That has two potential problems: - a) the interrupt may not be handled in time so that the current buffer - becomes full and some trace data is lost. - b) the interrupts may slow the system and affect the performance - results. - -If trace data is lost, the driver sets 'truncated' in the PERF_RECORD_AUX event -which the tools report as an error. - -In full-trace mode, the driver waits for data to be copied out before allowing -the (logical) buffer to wrap-around. If data is not copied out quickly enough, -again 'truncated' is set in the PERF_RECORD_AUX event. If the driver has to -wait, the intel_pt event gets disabled. Because it is difficult to know when -that happens, perf tools always re-enable the intel_pt event after copying out -data. - - -Intel PT and build ids ----------------------- - -By default "perf record" post-processes the event stream to find all build ids -for executables for all addresses sampled. Deliberately, Intel PT is not -decoded for that purpose (it would take too long). Instead the build ids for -all executables encountered (due to mmap, comm or task events) are included -in the perf.data file. - -To see buildids included in the perf.data file use the command: - - perf buildid-list - -If the perf.data file contains Intel PT data, that is the same as: - - perf buildid-list --with-hits - - -Snapshot mode and event disabling ---------------------------------- - -In order to make a snapshot, the intel_pt event is disabled using an IOCTL, -namely PERF_EVENT_IOC_DISABLE. 
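For reference, disabling the intel_pt event from user space with the ioctl named above looks roughly like this; fd would be a descriptor returned by perf_event_open(), and error handling is omitted:

    #include <sys/ioctl.h>
    #include <linux/perf_event.h>

    /* Stop tracing so a consistent snapshot of the AUX buffer can be taken. */
    static int snapshot_disable(int fd)
    {
        return ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
    }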
However doing that can also disable the -collection of side-band information. In order to prevent that, a dummy -software event has been introduced that permits tracking events (like mmaps) to -continue to be recorded while intel_pt is disabled. That is important to ensure -there is complete side-band information to allow the decoding of subsequent -snapshots. - -A test has been created for that. To find the test: - - perf test list - ... - 23: Test using a dummy software event to keep tracking - -To run the test: - - perf test 23 - 23: Test using a dummy software event to keep tracking : Ok - - -perf record modes (nothing new here) ------------------------------------- - -perf record essentially operates in one of three modes: - per thread - per cpu - workload only - -"per thread" mode is selected by -t or by --per-thread (with -p or -u or just a -workload). -"per cpu" is selected by -C or -a. -"workload only" mode is selected by not using the other options but providing a -command to run (i.e. the workload). - -In per-thread mode an exact list of threads is traced. There is no inheritance. -Each thread has its own event buffer. - -In per-cpu mode all processes (or processes from the selected cgroup i.e. -G -option, or processes selected with -p or -u) are traced. Each cpu has its own -buffer. Inheritance is allowed. - -In workload-only mode, the workload is traced but with per-cpu buffers. -Inheritance is allowed. Note that you can now trace a workload in per-thread -mode by using the --per-thread option. - - -Privileged vs non-privileged users ----------------------------------- - -Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users -have memory limits imposed upon them. That affects what buffer sizes they can -have as outlined above. - -The v4.2 kernel introduced support for a context switch metadata event, -PERF_RECORD_SWITCH, which allows unprivileged users to see when their processes -are scheduled out and in, just not by whom, which is left for the -PERF_RECORD_SWITCH_CPU_WIDE, that is only accessible in system wide context, -which in turn requires CAP_SYS_ADMIN. - -Please see the 45ac1403f564 ("perf: Add PERF_RECORD_SWITCH to indicate context -switches") commit, that introduces these metadata events for further info. - -When working with kernels < v4.2, the following considerations must be taken, -as the sched:sched_switch tracepoints will be used to receive such information: - -Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users are -not permitted to use tracepoints which means there is insufficient side-band -information to decode Intel PT in per-cpu mode, and potentially workload-only -mode too if the workload creates new processes. - -Note also, that to use tracepoints, read-access to debugfs is required. So if -debugfs is not mounted or the user does not have read-access, it will again not -be possible to decode Intel PT in per-cpu mode. - - -sched_switch tracepoint ------------------------ - -The sched_switch tracepoint is used to provide side-band data for Intel PT -decoding in kernels where the PERF_RECORD_SWITCH metadata event isn't -available. - -The sched_switch events are automatically added. e.g. 
the second event shown -below: - - $ perf record -vv -e intel_pt//u uname - ------------------------------------------------------------ - perf_event_attr: - type 6 - size 112 - config 0x400 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|CPU|IDENTIFIER - read_format ID - disabled 1 - inherit 1 - exclude_kernel 1 - exclude_hv 1 - enable_on_exec 1 - sample_id_all 1 - ------------------------------------------------------------ - sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 - ------------------------------------------------------------ - perf_event_attr: - type 2 - size 112 - config 0x108 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER - read_format ID - inherit 1 - sample_id_all 1 - exclude_guest 1 - ------------------------------------------------------------ - sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 - ------------------------------------------------------------ - perf_event_attr: - type 1 - size 112 - config 0x9 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|IDENTIFIER - read_format ID - disabled 1 - inherit 1 - exclude_kernel 1 - exclude_hv 1 - mmap 1 - comm 1 - enable_on_exec 1 - task 1 - sample_id_all 1 - mmap2 1 - comm_exec 1 - ------------------------------------------------------------ - sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 - mmap size 528384B - AUX area mmap length 4194304 - perf event ring buffer mmapped per cpu - Synthesizing auxtrace information - Linux - [ perf record: Woken up 1 times to write data ] - [ perf record: Captured and wrote 0.042 MB perf.data ] - -Note, the sched_switch event is only added if the user is permitted to use it -and only in per-cpu mode. - -Note also, the sched_switch event is only added if TSC packets are requested. -That is because, in the absence of timing information, the sched_switch events -cannot be matched against the Intel PT trace. - - -perf script -=========== - -By default, perf script will decode trace data found in the perf.data file. -This can be further controlled by new option --itrace. - - -New --itrace option -------------------- - -Having no option is the same as - - --itrace - -which, in turn, is the same as - - --itrace=cepwx - -The letters are: - - i synthesize "instructions" events - b synthesize "branches" events - x synthesize "transactions" events - w synthesize "ptwrite" events - p synthesize "power" events - c synthesize branches events (calls only) - r synthesize branches events (returns only) - e synthesize tracing error events - d create a debug log - g synthesize a call chain (use with i or x) - l synthesize last branch entries (use with i or x) - s skip initial number of events - -"Instructions" events look like they were recorded by "perf record -e -instructions". - -"Branches" events look like they were recorded by "perf record -e branches". "c" -and "r" can be combined to get calls and returns. - -"Transactions" events correspond to the start or end of transactions. 
The -'flags' field can be used in perf script to determine whether the event is a -tranasaction start, commit or abort. - -Note that "instructions", "branches" and "transactions" events depend on code -flow packets which can be disabled by using the config term "branch=0". Refer -to the config terms section above. - -"ptwrite" events record the payload of the ptwrite instruction and whether -"fup_on_ptw" was used. "ptwrite" events depend on PTWRITE packets which are -recorded only if the "ptw" config term was used. Refer to the config terms -section above. perf script "synth" field displays "ptwrite" information like -this: "ip: 0 payload: 0x123456789abcdef0" where "ip" is 1 if "fup_on_ptw" was -used. - -"Power" events correspond to power event packets and CBR (core-to-bus ratio) -packets. While CBR packets are always recorded when tracing is enabled, power -event packets are recorded only if the "pwr_evt" config term was used. Refer to -the config terms section above. The power events record information about -C-state changes, whereas CBR is indicative of CPU frequency. perf script -"event,synth" fields display information like this: - cbr: cbr: 22 freq: 2189 MHz (200%) - mwait: hints: 0x60 extensions: 0x1 - pwre: hw: 0 cstate: 2 sub-cstate: 0 - exstop: ip: 1 - pwrx: deepest cstate: 2 last cstate: 2 wake reason: 0x4 -Where: - "cbr" includes the frequency and the percentage of maximum non-turbo - "mwait" shows mwait hints and extensions - "pwre" shows C-state transitions (to a C-state deeper than C0) and - whether initiated by hardware - "exstop" indicates execution stopped and whether the IP was recorded - exactly, - "pwrx" indicates return to C0 -For more details refer to the Intel 64 and IA-32 Architectures Software -Developer Manuals. - -Error events show where the decoder lost the trace. Error events -are quite important. Users must know if what they are seeing is a complete -picture or not. - -The "d" option will cause the creation of a file "intel_pt.log" containing all -decoded packets and instructions. Note that this option slows down the decoder -and that the resulting file may be very large. - -In addition, the period of the "instructions" event can be specified. e.g. - - --itrace=i10us - -sets the period to 10us i.e. one instruction sample is synthesized for each 10 -microseconds of trace. Alternatives to "us" are "ms" (milliseconds), -"ns" (nanoseconds), "t" (TSC ticks) or "i" (instructions). - -"ms", "us" and "ns" are converted to TSC ticks. - -The timing information included with Intel PT does not give the time of every -instruction. Consequently, for the purpose of sampling, the decoder estimates -the time since the last timing packet based on 1 tick per instruction. The time -on the sample is *not* adjusted and reflects the last known value of TSC. - -For Intel PT, the default period is 100us. - -Setting it to a zero period means "as often as possible". - -In the case of Intel PT that is the same as a period of 1 and a unit of -'instructions' (i.e. --itrace=i1i). - -Also the call chain size (default 16, max. 1024) for instructions or -transactions events can be specified. e.g. - - --itrace=ig32 - --itrace=xg32 - -Also the number of last branch entries (default 64, max. 1024) for instructions or -transactions events can be specified. e.g. - - --itrace=il10 - --itrace=xl10 - -Note that last branch entries are cleared for each sample, so there is no overlap -from one sample to the next. - -To disable trace decoding entirely, use the option --no-itrace. 
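As an aside (not part of the patch), a schematic C sketch of the estimation rule stated above: between timing packets the decoder assumes one tick per instruction when checking whether the sampling period (already converted to TSC ticks) has elapsed, while sample timestamps still carry the last known TSC value. The names are illustrative, not decoder internals:

    /* Has the sampling period elapsed, estimating 1 tick per instruction? */
    static int period_elapsed(unsigned long insns_since_timing_packet,
                              unsigned long period_ticks)
    {
        return insns_since_timing_packet >= period_ticks;
    }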
- -It is also possible to skip events generated (instructions, branches, transactions) -at the beginning. This is useful to ignore initialization code. - - --itrace=i0nss1000000 - -skips the first million instructions. - -dump option ------------ - -perf script has an option (-D) to "dump" the events i.e. display the binary -data. - -When -D is used, Intel PT packets are displayed. The packet decoder does not -pay attention to PSB packets, but just decodes the bytes - so the packets seen -by the actual decoder may not be identical in places where the data is corrupt. -One example of that would be when the buffer-switching interrupt has been too -slow, and the buffer has been filled completely. In that case, the last packet -in the buffer might be truncated and immediately followed by a PSB as the trace -continues in the next buffer. - -To disable the display of Intel PT packets, combine the -D option with ---no-itrace. - - -perf report -=========== - -By default, perf report will decode trace data found in the perf.data file. -This can be further controlled by new option --itrace exactly the same as -perf script, with the exception that the default is --itrace=igxe. - - -perf inject -=========== - -perf inject also accepts the --itrace option in which case tracing data is -removed and replaced with the synthesized events. e.g. - - perf inject --itrace -i perf.data -o perf.data.new - -Below is an example of using Intel PT with autofdo. It requires autofdo -(https://github.com/google/autofdo) and gcc version 5. The bubble -sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial) -amended to take the number of elements as a parameter. - - $ gcc-5 -O3 sort.c -o sort_optimized - $ ./sort_optimized 30000 - Bubble sorting array of 30000 elements - 2254 ms - - $ cat ~/.perfconfig - [intel-pt] - mispred-all = on - - $ perf record -e intel_pt//u ./sort 3000 - Bubble sorting array of 3000 elements - 58 ms - [ perf record: Woken up 2 times to write data ] - [ perf record: Captured and wrote 3.939 MB perf.data ] - $ perf inject -i perf.data -o inj --itrace=i100usle --strip - $ ./create_gcov --binary=./sort --profile=inj --gcov=sort.gcov -gcov_version=1 - $ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo - $ ./sort_autofdo 30000 - Bubble sorting array of 30000 elements - 2155 ms - -Note there is currently no advantage to using Intel PT instead of LBR, but -that may change in the future if greater use is made of the data. - - -PEBS via Intel PT -================= - -Some hardware has the feature to redirect PEBS records to the Intel PT trace. -Recording is selected by using the aux-output config term e.g. - - perf record -c 10000 -e '{intel_pt/branch=0/,cycles/aux-output/ppp}' uname - -Note that currently, software only supports redirecting at most one PEBS event. - -To display PEBS events from the Intel PT trace, use the itrace 'o' option e.g. 
- - perf script --itrace=oe +Documentation for support for Intel Processor Trace within perf tools' has moved to file perf-intel-pt.txt diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index a64d6588470e..70969ea73e01 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -66,4 +66,5 @@ include::itrace.txt[] SEE ALSO -------- -linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1] +linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1], +linkperf:perf-intel-pt[1] diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt new file mode 100644 index 000000000000..456fdcbf26ac --- /dev/null +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -0,0 +1,1007 @@ +perf-intel-pt(1) +================ + +NAME +---- +perf-intel-pt - Support for Intel Processor Trace within perf tools + +SYNOPSIS +-------- +[verse] +'perf record' -e intel_pt// + +DESCRIPTION +----------- + +Intel Processor Trace (Intel PT) is an extension of Intel Architecture that +collects information about software execution such as control flow, execution +modes and timings and formats it into highly compressed binary packets. +Technical details are documented in the Intel 64 and IA-32 Architectures +Software Developer Manuals, Chapter 36 Intel Processor Trace. + +Intel PT is first supported in Intel Core M and 5th generation Intel Core +processors that are based on the Intel micro-architecture code name Broadwell. + +Trace data is collected by 'perf record' and stored within the perf.data file. +See below for options to 'perf record'. + +Trace data must be 'decoded' which involves walking the object code and matching +the trace data packets. For example a TNT packet only tells whether a +conditional branch was taken or not taken, so to make use of that packet the +decoder must know precisely which instruction was being executed. + +Decoding is done on-the-fly. The decoder outputs samples in the same format as +samples output by perf hardware events, for example as though the "instructions" +or "branches" events had been recorded. Presently 3 tools support this: +'perf script', 'perf report' and 'perf inject'. See below for more information +on using those tools. + +The main distinguishing feature of Intel PT is that the decoder can determine +the exact flow of software execution. Intel PT can be used to understand why +and how did software get to a certain point, or behave a certain way. The +software does not have to be recompiled, so Intel PT works with debug or release +builds, however the executed images are needed - which makes use in JIT-compiled +environments, or with self-modified code, a challenge. Also symbols need to be +provided to make sense of addresses. + +A limitation of Intel PT is that it produces huge amounts of trace data +(hundreds of megabytes per second per core) which takes a long time to decode, +for example two or three orders of magnitude longer than it took to collect. +Another limitation is the performance impact of tracing, something that will +vary depending on the use-case and architecture. + + +Quickstart +---------- + +It is important to start small. That is because it is easy to capture vastly +more data than can possibly be processed. + +The simplest thing to do with Intel PT is userspace profiling of small programs. +Data is captured with 'perf record' e.g. 
to trace 'ls' userspace-only: + + perf record -e intel_pt//u ls + +And profiled with 'perf report' e.g. + + perf report + +To also trace kernel space presents a problem, namely kernel self-modifying +code. A fairly good kernel image is available in /proc/kcore but to get an +accurate image a copy of /proc/kcore needs to be made under the same conditions +as the data capture. A script perf-with-kcore can do that, but beware that the +script makes use of 'sudo' to copy /proc/kcore. If you have perf installed +locally from the source tree you can do: + + ~/libexec/perf-core/perf-with-kcore record pt_ls -e intel_pt// -- ls + +which will create a directory named 'pt_ls' and put the perf.data file and +copies of /proc/kcore, /proc/kallsyms and /proc/modules into it. Then to use +'perf report' becomes: + + ~/libexec/perf-core/perf-with-kcore report pt_ls + +Because samples are synthesized after-the-fact, the sampling period can be +selected for reporting. e.g. sample every microsecond + + ~/libexec/perf-core/perf-with-kcore report pt_ls --itrace=i1usge + +See the sections below for more information about the --itrace option. + +Beware the smaller the period, the more samples that are produced, and the +longer it takes to process them. + +Also note that the coarseness of Intel PT timing information will start to +distort the statistical value of the sampling as the sampling period becomes +smaller. + +To represent software control flow, "branches" samples are produced. By default +a branch sample is synthesized for every single branch. To get an idea what +data is available you can use the 'perf script' tool with all itrace sampling +options, which will list all the samples. + + perf record -e intel_pt//u ls + perf script --itrace=ibxwpe + +An interesting field that is not printed by default is 'flags' which can be +displayed as follows: + + perf script --itrace=ibxwpe -F+flags + +The flags are "bcrosyiABEx" which stand for branch, call, return, conditional, +system, asynchronous, interrupt, transaction abort, trace begin, trace end, and +in transaction, respectively. + +Another interesting field that is not printed by default is 'ipc' which can be +displayed as follows: + + perf script --itrace=be -F+ipc + +There are two ways that instructions-per-cycle (IPC) can be calculated depending +on the recording. + +If the 'cyc' config term (see config terms section below) was used, then IPC is +calculated using the cycle count from CYC packets, otherwise MTC packets are +used - refer to the 'mtc' config term. When MTC is used, however, the values +are less accurate because the timing is less accurate. + +Because Intel PT does not update the cycle count on every branch or instruction, +the values will often be zero. When there are values, they will be the number +of instructions and number of cycles since the last update, and thus represent +the average IPC since the last IPC for that event type. Note IPC for "branches" +events is calculated separately from IPC for "instructions" events. + +Also note that the IPC instruction count may or may not include the current +instruction. If the cycle count is associated with an asynchronous branch +(e.g. page fault or interrupt), then the instruction count does not include the +current instruction, otherwise it does. That is consistent with whether or not +that instruction has retired when the cycle count is updated. + +Another note, in the case of "branches" events, non-taken branches are not +presently sampled, so IPC values for them do not appear e.g. 
a CYC packet with a +TNT packet that starts with a non-taken branch. To see every possible IPC +value, "instructions" events can be used e.g. --itrace=i0ns + +While it is possible to create scripts to analyze the data, an alternative +approach is available to export the data to a sqlite or postgresql database. +Refer to script export-to-sqlite.py or export-to-postgresql.py for more details, +and to script exported-sql-viewer.py for an example of using the database. + +There is also script intel-pt-events.py which provides an example of how to +unpack the raw data for power events and PTWRITE. + +As mentioned above, it is easy to capture too much data. One way to limit the +data captured is to use 'snapshot' mode which is explained further below. +Refer to 'new snapshot option' and 'Intel PT modes of operation' further below. + +Another problem that will be experienced is decoder errors. They can be caused +by inability to access the executed image, self-modified or JIT-ed code, or the +inability to match side-band information (such as context switches and mmaps) +which results in the decoder not knowing what code was executed. + +There is also the problem of perf not being able to copy the data fast enough, +resulting in data lost because the buffer was full. See 'Buffer handling' below +for more details. + + +perf record +----------- + +new event +~~~~~~~~~ + +The Intel PT kernel driver creates a new PMU for Intel PT. PMU events are +selected by providing the PMU name followed by the "config" separated by slashes. +An enhancement has been made to allow default "config" e.g. the option + + -e intel_pt// + +will use a default config value. Currently that is the same as + + -e intel_pt/tsc,noretcomp=0/ + +which is the same as + + -e intel_pt/tsc=1,noretcomp=0/ + +Note there are now new config terms - see section 'config terms' further below. + +The config terms are listed in /sys/devices/intel_pt/format. They are bit +fields within the config member of the struct perf_event_attr which is +passed to the kernel by the perf_event_open system call. They correspond to bit +fields in the IA32_RTIT_CTL MSR. Here is a list of them and their definitions: + + $ grep -H . /sys/bus/event_source/devices/intel_pt/format/* + /sys/bus/event_source/devices/intel_pt/format/cyc:config:1 + /sys/bus/event_source/devices/intel_pt/format/cyc_thresh:config:19-22 + /sys/bus/event_source/devices/intel_pt/format/mtc:config:9 + /sys/bus/event_source/devices/intel_pt/format/mtc_period:config:14-17 + /sys/bus/event_source/devices/intel_pt/format/noretcomp:config:11 + /sys/bus/event_source/devices/intel_pt/format/psb_period:config:24-27 + /sys/bus/event_source/devices/intel_pt/format/tsc:config:10 + +Note that the default config must be overridden for each term i.e. + + -e intel_pt/noretcomp=0/ + +is the same as: + + -e intel_pt/tsc=1,noretcomp=0/ + +So, to disable TSC packets use: + + -e intel_pt/tsc=0/ + +It is also possible to specify the config value explicitly: + + -e intel_pt/config=0x400/ + +Note that, as with all events, the event is suffixed with event modifiers: + + u userspace + k kernel + h hypervisor + G guest + H host + p precise ip + +'h', 'G' and 'H' are for virtualization which is not supported by Intel PT. +'p' is also not relevant to Intel PT. So only options 'u' and 'k' are +meaningful for Intel PT. + +perf_event_attr is displayed if the -vv option is used e.g. 
+ + ------------------------------------------------------------ + perf_event_attr: + type 6 + size 112 + config 0x400 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + enable_on_exec 1 + sample_id_all 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + + +config terms +~~~~~~~~~~~~ + +The June 2015 version of Intel 64 and IA-32 Architectures Software Developer +Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features. +Some of the features are reflect in new config terms. All the config terms are +described below. + +tsc Always supported. Produces TSC timestamp packets to provide + timing information. In some cases it is possible to decode + without timing information, for example a per-thread context + that does not overlap executable memory maps. + + The default config selects tsc (i.e. tsc=1). + +noretcomp Always supported. Disables "return compression" so a TIP packet + is produced when a function returns. Causes more packets to be + produced but might make decoding more reliable. + + The default config does not select noretcomp (i.e. noretcomp=0). + +psb_period Allows the frequency of PSB packets to be specified. + + The PSB packet is a synchronization packet that provides a + starting point for decoding or recovery from errors. + + Support for psb_period is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/psb_cyc + + which contains "1" if the feature is supported and "0" + otherwise. + + Valid values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/psb_periods + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The psb_period value is converted to the approximate number of + trace bytes between PSB packets as: + + 2 ^ (value + 11) + + e.g. value 3 means 16KiB bytes between PSBs + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/psb_period=15/u uname + Invalid psb_period for intel_pt. Valid values are: 0-5 + + If MTC packets are selected, the default config selects a value + of 3 (i.e. psb_period=3) or the nearest lower value that is + supported (0 is always supported). Otherwise the default is 0. + + If decoding is expected to be reliable and the buffer is large + then a large PSB period can be used. + + Because a TSC packet is produced with PSB, the PSB period can + also affect the granularity to timing information in the absence + of MTC or CYC. + +mtc Produces MTC timing packets. + + MTC packets provide finer grain timestamp information than TSC + packets. MTC packets record time using the hardware crystal + clock (CTC) which is related to TSC packets using a TMA packet. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/mtc + + which contains "1" if the feature is supported and + "0" otherwise. + + The frequency of MTC packets can also be specified - see + mtc_period below. + +mtc_period Specifies how frequently MTC packets are produced - see mtc + above for how to determine if MTC packets are supported. 
+ + Valid values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/mtc_periods + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The mtc_period value is converted to the MTC frequency as: + + CTC-frequency / (2 ^ value) + + e.g. value 3 means one eighth of CTC-frequency + + Where CTC is the hardware crystal clock, the frequency of which + can be related to TSC via values provided in cpuid leaf 0x15. + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/mtc_period=15/u uname + Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9 + + The default value is 3 or the nearest lower value + that is supported (0 is always supported). + +cyc Produces CYC timing packets. + + CYC packets provide even finer grain timestamp information than + MTC and TSC packets. A CYC packet contains the number of CPU + cycles since the last CYC packet. Unlike MTC and TSC packets, + CYC packets are only sent when another packet is also sent. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/psb_cyc + + which contains "1" if the feature is supported and + "0" otherwise. + + The number of CYC packets produced can be reduced by specifying + a threshold - see cyc_thresh below. + +cyc_thresh Specifies how frequently CYC packets are produced - see cyc + above for how to determine if CYC packets are supported. + + Valid cyc_thresh values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The cyc_thresh value represents the minimum number of CPU cycles + that must have passed before a CYC packet can be sent. The + number of CPU cycles is: + + 2 ^ (value - 1) + + e.g. value 4 means 8 CPU cycles must pass before a CYC packet + can be sent. Note a CYC packet is still only sent when another + packet is sent, not at, e.g. every 8 CPU cycles. + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/cyc,cyc_thresh=15/u uname + Invalid cyc_thresh for intel_pt. Valid values are: 0-12 + + CYC packets are not requested by default. + +pt Specifies pass-through which enables the 'branch' config term. + + The default config selects 'pt' if it is available, so a user will + never need to specify this term. + +branch Enable branch tracing. Branch tracing is enabled by default so to + disable branch tracing use 'branch=0'. + + The default config selects 'branch' if it is available. + +ptw Enable PTWRITE packets which are produced when a ptwrite instruction + is executed. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/ptwrite + + which contains "1" if the feature is supported and + "0" otherwise. + +fup_on_ptw Enable a FUP packet to follow the PTWRITE packet. The FUP packet + provides the address of the ptwrite instruction. In the absence of + fup_on_ptw, the decoder will use the address of the previous branch + if branch tracing is enabled, otherwise the address will be zero. + Note that fup_on_ptw will work even when branch tracing is disabled. + +pwr_evt Enable power events. The power events provide information about + changes to the CPU C-state. 
+ + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/power_event_trace + + which contains "1" if the feature is supported and + "0" otherwise. + + +AUX area sampling option +~~~~~~~~~~~~~~~~~~~~~~~~ + +To select Intel PT "sampling" the AUX area sampling option can be used: + + --aux-sample + +Optionally it can be followed by the sample size in bytes e.g. + + --aux-sample=8192 + +In addition, the Intel PT event to sample must be defined e.g. + + -e intel_pt//u + +Samples on other events will be created containing Intel PT data e.g. the +following will create Intel PT samples on the branch-misses event, note the +events must be grouped using {}: + + perf record --aux-sample -e '{intel_pt//u,branch-misses:u}' + +An alternative to '--aux-sample' is to add the config term 'aux-sample-size' to +events. In this case, the grouping is implied e.g. + + perf record -e intel_pt//u -e branch-misses/aux-sample-size=8192/u + +is the same as: + + perf record -e '{intel_pt//u,branch-misses/aux-sample-size=8192/u}' + +but allows for also using an address filter e.g.: + + perf record -e intel_pt//u --filter 'filter * @/bin/ls' -e branch-misses/aux-sample-size=8192/u -- ls + +It is important to select a sample size that is big enough to contain at least +one PSB packet. If not a warning will be displayed: + + Intel PT sample size (%zu) may be too small for PSB period (%zu) + +The calculation used for that is: if sample_size <= psb_period + 256 display the +warning. When sampling is used, psb_period defaults to 0 (2KiB). + +The default sample size is 4KiB. + +The sample size is passed in aux_sample_size in struct perf_event_attr. The +sample size is limited by the maximum event size which is 64KiB. It is +difficult to know how big the event might be without the trace sample attached, +but the tool validates that the sample size is not greater than 60KiB. + + +new snapshot option +~~~~~~~~~~~~~~~~~~~ + +The difference between full trace and snapshot from the kernel's perspective is +that in full trace we don't overwrite trace data that the user hasn't collected +yet (and indicated that by advancing aux_tail), whereas in snapshot mode we let +the trace run and overwrite older data in the buffer so that whenever something +interesting happens, we can stop it and grab a snapshot of what was going on +around that interesting moment. + +To select snapshot mode a new option has been added: + + -S + +Optionally it can be followed by the snapshot size e.g. + + -S0x100000 + +The default snapshot size is the auxtrace mmap size. If neither auxtrace mmap size +nor snapshot size is specified, then the default is 4MiB for privileged users +(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. +If an unprivileged user does not specify mmap pages, the mmap pages will be +reduced as described in the 'new auxtrace mmap size option' section below. + +The snapshot size is displayed if the option -vv is used e.g. + + Intel PT snapshot size: %zu + + +new auxtrace mmap size option +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Intel PT buffer size is specified by an addition to the -m option e.g. + + -m,16 + +selects a buffer size of 16 pages i.e. 64KiB. + +Note that the existing functionality of -m is unchanged. The auxtrace mmap size +is specified by the optional addition of a comma and the value. + +The default auxtrace mmap size for Intel PT is 4MiB/page_size for privileged users +(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. 
+If an unprivileged user does not specify mmap pages, the mmap pages will be +reduced from the default 512KiB/page_size to 256KiB/page_size, otherwise the +user is likely to get an error as they exceed their mlock limit (Max locked +memory as shown in /proc/self/limits). Note that perf does not count the first +512KiB (actually /proc/sys/kernel/perf_event_mlock_kb minus 1 page) per cpu +against the mlock limit so an unprivileged user is allowed 512KiB per cpu plus +their mlock limit (which defaults to 64KiB but is not multiplied by the number +of cpus). + +In full-trace mode, powers of two are allowed for buffer size, with a minimum +size of 2 pages. In snapshot mode or sampling mode, it is the same but the +minimum size is 1 page. + +The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g. + + mmap length 528384 + auxtrace mmap length 4198400 + + +Intel PT modes of operation +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Intel PT can be used in 2 modes: + full-trace mode + sample mode + snapshot mode + +Full-trace mode traces continuously e.g. + + perf record -e intel_pt//u uname + +Sample mode attaches a Intel PT sample to other events e.g. + + perf record --aux-sample -e intel_pt//u -e branch-misses:u + +Snapshot mode captures the available data when a signal is sent e.g. + + perf record -v -e intel_pt//u -S ./loopy 1000000000 & + [1] 11435 + kill -USR2 11435 + Recording AUX area tracing snapshot + +Note that the signal sent is SIGUSR2. +Note that "Recording AUX area tracing snapshot" is displayed because the -v +option is used. + +The 2 modes cannot be used together. + + +Buffer handling +~~~~~~~~~~~~~~~ + +There may be buffer limitations (i.e. single ToPa entry) which means that actual +buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER). In order to +provide other sizes, and in particular an arbitrarily large size, multiple +buffers are logically concatenated. However an interrupt must be used to switch +between buffers. That has two potential problems: + a) the interrupt may not be handled in time so that the current buffer + becomes full and some trace data is lost. + b) the interrupts may slow the system and affect the performance + results. + +If trace data is lost, the driver sets 'truncated' in the PERF_RECORD_AUX event +which the tools report as an error. + +In full-trace mode, the driver waits for data to be copied out before allowing +the (logical) buffer to wrap-around. If data is not copied out quickly enough, +again 'truncated' is set in the PERF_RECORD_AUX event. If the driver has to +wait, the intel_pt event gets disabled. Because it is difficult to know when +that happens, perf tools always re-enable the intel_pt event after copying out +data. + + +Intel PT and build ids +~~~~~~~~~~~~~~~~~~~~~~ + +By default "perf record" post-processes the event stream to find all build ids +for executables for all addresses sampled. Deliberately, Intel PT is not +decoded for that purpose (it would take too long). Instead the build ids for +all executables encountered (due to mmap, comm or task events) are included +in the perf.data file. + +To see buildids included in the perf.data file use the command: + + perf buildid-list + +If the perf.data file contains Intel PT data, that is the same as: + + perf buildid-list --with-hits + + +Snapshot mode and event disabling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to make a snapshot, the intel_pt event is disabled using an IOCTL, +namely PERF_EVENT_IOC_DISABLE. 
However doing that can also disable the +collection of side-band information. In order to prevent that, a dummy +software event has been introduced that permits tracking events (like mmaps) to +continue to be recorded while intel_pt is disabled. That is important to ensure +there is complete side-band information to allow the decoding of subsequent +snapshots. + +A test has been created for that. To find the test: + + perf test list + ... + 23: Test using a dummy software event to keep tracking + +To run the test: + + perf test 23 + 23: Test using a dummy software event to keep tracking : Ok + + +perf record modes (nothing new here) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +perf record essentially operates in one of three modes: + per thread + per cpu + workload only + +"per thread" mode is selected by -t or by --per-thread (with -p or -u or just a +workload). +"per cpu" is selected by -C or -a. +"workload only" mode is selected by not using the other options but providing a +command to run (i.e. the workload). + +In per-thread mode an exact list of threads is traced. There is no inheritance. +Each thread has its own event buffer. + +In per-cpu mode all processes (or processes from the selected cgroup i.e. -G +option, or processes selected with -p or -u) are traced. Each cpu has its own +buffer. Inheritance is allowed. + +In workload-only mode, the workload is traced but with per-cpu buffers. +Inheritance is allowed. Note that you can now trace a workload in per-thread +mode by using the --per-thread option. + + +Privileged vs non-privileged users +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users +have memory limits imposed upon them. That affects what buffer sizes they can +have as outlined above. + +The v4.2 kernel introduced support for a context switch metadata event, +PERF_RECORD_SWITCH, which allows unprivileged users to see when their processes +are scheduled out and in, just not by whom, which is left for the +PERF_RECORD_SWITCH_CPU_WIDE, that is only accessible in system wide context, +which in turn requires CAP_SYS_ADMIN. + +Please see the 45ac1403f564 ("perf: Add PERF_RECORD_SWITCH to indicate context +switches") commit, that introduces these metadata events for further info. + +When working with kernels < v4.2, the following considerations must be taken, +as the sched:sched_switch tracepoints will be used to receive such information: + +Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users are +not permitted to use tracepoints which means there is insufficient side-band +information to decode Intel PT in per-cpu mode, and potentially workload-only +mode too if the workload creates new processes. + +Note also, that to use tracepoints, read-access to debugfs is required. So if +debugfs is not mounted or the user does not have read-access, it will again not +be possible to decode Intel PT in per-cpu mode. + + +sched_switch tracepoint +~~~~~~~~~~~~~~~~~~~~~~~ + +The sched_switch tracepoint is used to provide side-band data for Intel PT +decoding in kernels where the PERF_RECORD_SWITCH metadata event isn't +available. + +The sched_switch events are automatically added. e.g. 
the second event shown +below: + + $ perf record -vv -e intel_pt//u uname + ------------------------------------------------------------ + perf_event_attr: + type 6 + size 112 + config 0x400 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + enable_on_exec 1 + sample_id_all 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + perf_event_attr: + type 2 + size 112 + config 0x108 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER + read_format ID + inherit 1 + sample_id_all 1 + exclude_guest 1 + ------------------------------------------------------------ + sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + perf_event_attr: + type 1 + size 112 + config 0x9 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + mmap 1 + comm 1 + enable_on_exec 1 + task 1 + sample_id_all 1 + mmap2 1 + comm_exec 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + mmap size 528384B + AUX area mmap length 4194304 + perf event ring buffer mmapped per cpu + Synthesizing auxtrace information + Linux + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 0.042 MB perf.data ] + +Note, the sched_switch event is only added if the user is permitted to use it +and only in per-cpu mode. + +Note also, the sched_switch event is only added if TSC packets are requested. +That is because, in the absence of timing information, the sched_switch events +cannot be matched against the Intel PT trace. + + +perf script +----------- + +By default, perf script will decode trace data found in the perf.data file. +This can be further controlled by new option --itrace. + + +New --itrace option +~~~~~~~~~~~~~~~~~~~ + +Having no option is the same as + + --itrace + +which, in turn, is the same as + + --itrace=cepwx + +The letters are: + + i synthesize "instructions" events + b synthesize "branches" events + x synthesize "transactions" events + w synthesize "ptwrite" events + p synthesize "power" events + c synthesize branches events (calls only) + r synthesize branches events (returns only) + e synthesize tracing error events + d create a debug log + g synthesize a call chain (use with i or x) + l synthesize last branch entries (use with i or x) + s skip initial number of events + +"Instructions" events look like they were recorded by "perf record -e +instructions". + +"Branches" events look like they were recorded by "perf record -e branches". "c" +and "r" can be combined to get calls and returns. + +"Transactions" events correspond to the start or end of transactions. 
The
+'flags' field can be used in perf script to determine whether the event is a
+transaction start, commit or abort.
+
+Note that "instructions", "branches" and "transactions" events depend on code
+flow packets which can be disabled by using the config term "branch=0". Refer
+to the config terms section above.
+
+"ptwrite" events record the payload of the ptwrite instruction and whether
+"fup_on_ptw" was used. "ptwrite" events depend on PTWRITE packets which are
+recorded only if the "ptw" config term was used. Refer to the config terms
+section above. perf script "synth" field displays "ptwrite" information like
+this: "ip: 0 payload: 0x123456789abcdef0" where "ip" is 1 if "fup_on_ptw" was
+used.
+
+"Power" events correspond to power event packets and CBR (core-to-bus ratio)
+packets. While CBR packets are always recorded when tracing is enabled, power
+event packets are recorded only if the "pwr_evt" config term was used. Refer to
+the config terms section above. The power events record information about
+C-state changes, whereas CBR is indicative of CPU frequency. perf script
+"event,synth" fields display information like this:
+    cbr: cbr: 22 freq: 2189 MHz (200%)
+    mwait: hints: 0x60 extensions: 0x1
+    pwre: hw: 0 cstate: 2 sub-cstate: 0
+    exstop: ip: 1
+    pwrx: deepest cstate: 2 last cstate: 2 wake reason: 0x4
+Where:
+    "cbr" includes the frequency and the percentage of maximum non-turbo
+    "mwait" shows mwait hints and extensions
+    "pwre" shows C-state transitions (to a C-state deeper than C0) and
+        whether initiated by hardware
+    "exstop" indicates execution stopped and whether the IP was recorded
+        exactly,
+    "pwrx" indicates return to C0
+For more details refer to the Intel 64 and IA-32 Architectures Software
+Developer Manuals.
+
+Error events show where the decoder lost the trace. Error events
+are quite important. Users must know if what they are seeing is a complete
+picture or not.
+
+The "d" option will cause the creation of a file "intel_pt.log" containing all
+decoded packets and instructions. Note that this option slows down the decoder
+and that the resulting file may be very large.
+
+In addition, the period of the "instructions" event can be specified. e.g.
+
+    --itrace=i10us
+
+sets the period to 10us i.e. one instruction sample is synthesized for each 10
+microseconds of trace. Alternatives to "us" are "ms" (milliseconds),
+"ns" (nanoseconds), "t" (TSC ticks) or "i" (instructions).
+
+"ms", "us" and "ns" are converted to TSC ticks.
+
+The timing information included with Intel PT does not give the time of every
+instruction. Consequently, for the purpose of sampling, the decoder estimates
+the time since the last timing packet based on 1 tick per instruction. The time
+on the sample is *not* adjusted and reflects the last known value of TSC.
+
+For Intel PT, the default period is 100us.
+
+Setting it to a zero period means "as often as possible".
+
+In the case of Intel PT that is the same as a period of 1 and a unit of
+'instructions' (i.e. --itrace=i1i).
+
+Also the call chain size (default 16, max. 1024) for instructions or
+transactions events can be specified. e.g.
+
+    --itrace=ig32
+    --itrace=xg32
+
+Also the number of last branch entries (default 64, max. 1024) for instructions or
+transactions events can be specified. e.g.
+
+    --itrace=il10
+    --itrace=xl10
+
+Note that last branch entries are cleared for each sample, so there is no overlap
+from one sample to the next.
+
+To disable trace decoding entirely, use the option --no-itrace.
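+
+Putting some of the above together, the period, call chain size and number of
+last branch entries can be combined in a single --itrace specification. The
+sizes below are only illustrative:
+
+    --itrace=i100usg16l32
+
+synthesizes one "instructions" sample per 100 microseconds of trace, each with
+a 16 entry call chain and 32 last branch entries.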
+ +It is also possible to skip events generated (instructions, branches, transactions) +at the beginning. This is useful to ignore initialization code. + + --itrace=i0nss1000000 + +skips the first million instructions. + +dump option +~~~~~~~~~~~ + +perf script has an option (-D) to "dump" the events i.e. display the binary +data. + +When -D is used, Intel PT packets are displayed. The packet decoder does not +pay attention to PSB packets, but just decodes the bytes - so the packets seen +by the actual decoder may not be identical in places where the data is corrupt. +One example of that would be when the buffer-switching interrupt has been too +slow, and the buffer has been filled completely. In that case, the last packet +in the buffer might be truncated and immediately followed by a PSB as the trace +continues in the next buffer. + +To disable the display of Intel PT packets, combine the -D option with +--no-itrace. + + +perf report +----------- + +By default, perf report will decode trace data found in the perf.data file. +This can be further controlled by new option --itrace exactly the same as +perf script, with the exception that the default is --itrace=igxe. + + +perf inject +----------- + +perf inject also accepts the --itrace option in which case tracing data is +removed and replaced with the synthesized events. e.g. + + perf inject --itrace -i perf.data -o perf.data.new + +Below is an example of using Intel PT with autofdo. It requires autofdo +(https://github.com/google/autofdo) and gcc version 5. The bubble +sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial) +amended to take the number of elements as a parameter. + + $ gcc-5 -O3 sort.c -o sort_optimized + $ ./sort_optimized 30000 + Bubble sorting array of 30000 elements + 2254 ms + + $ cat ~/.perfconfig + [intel-pt] + mispred-all = on + + $ perf record -e intel_pt//u ./sort 3000 + Bubble sorting array of 3000 elements + 58 ms + [ perf record: Woken up 2 times to write data ] + [ perf record: Captured and wrote 3.939 MB perf.data ] + $ perf inject -i perf.data -o inj --itrace=i100usle --strip + $ ./create_gcov --binary=./sort --profile=inj --gcov=sort.gcov -gcov_version=1 + $ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo + $ ./sort_autofdo 30000 + Bubble sorting array of 30000 elements + 2155 ms + +Note there is currently no advantage to using Intel PT instead of LBR, but +that may change in the future if greater use is made of the data. + + +PEBS via Intel PT +----------------- + +Some hardware has the feature to redirect PEBS records to the Intel PT trace. +Recording is selected by using the aux-output config term e.g. + + perf record -c 10000 -e '{intel_pt/branch=0/,cycles/aux-output/ppp}' uname + +Note that currently, software only supports redirecting at most one PEBS event. + +To display PEBS events from the Intel PT trace, use the itrace 'o' option e.g. 
+ + perf script --itrace=oe + + +SEE ALSO +-------- + +linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1], +linkperf:perf-inject[1] diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index b23a4012a606..7f4db7592467 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -589,4 +589,4 @@ appended unit character - B/K/M/G SEE ALSO -------- -linkperf:perf-stat[1], linkperf:perf-list[1] +linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1] diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index db61f16ffa56..bd0a029d4c08 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -546,4 +546,5 @@ include::callchain-overhead-calculation.txt[] SEE ALSO -------- -linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1] +linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1], +linkperf:perf-intel-pt[1] diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 2599b057e47b..db6a36aac47e 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -429,4 +429,4 @@ include::itrace.txt[] SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], -linkperf:perf-script-python[1] +linkperf:perf-script-python[1], linkperf:perf-intel-pt[1] diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 9431b8066fb4..4d56586b2fb9 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -334,6 +334,15 @@ Configure all used events to run in kernel space. --all-user:: Configure all used events to run in user space. +--percore-show-thread:: +The event modifier "percore" has supported to sum up the event counts +for all hardware threads in a core and show the counts per core. + +This option with event modifier "percore" enabled also sums up the event +counts for all hardware threads in a core but show the sum counts per +hardware thread. This is essentially a replacement for the any bit and +convenient for post processing. 
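+
+For example, assuming a core PMU event opened with the "percore" qualifier
+(the event and workload below are only illustrative):
+
+  perf stat -e cpu/cpu-cycles,percore/ -a -A --percore-show-thread -- sleep 1
+
+reports the summed core count once per hardware thread rather than once per
+core.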
+ EXAMPLES -------- diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 7902a5681fc8..b8fc7d972be9 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -35,7 +35,7 @@ endif # Only pass canonical directory names as the output directory: # ifneq ($(O),) - FULL_O := $(shell readlink -f $(O) || echo $(O)) + FULL_O := $(shell cd $(PWD); readlink -f $(O) || echo $(O)) endif # diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 8d6821d9c3f6..27653be24447 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -11,17 +11,17 @@ #include <linux/zalloc.h> #include <time.h> -#include "../../util/cpumap.h" -#include "../../util/event.h" -#include "../../util/evsel.h" -#include "../../util/evlist.h" -#include "../../util/session.h" +#include "../../../util/cpumap.h" +#include "../../../util/event.h" +#include "../../../util/evsel.h" +#include "../../../util/evlist.h" +#include "../../../util/session.h" #include <internal/lib.h> // page_size -#include "../../util/pmu.h" -#include "../../util/debug.h" -#include "../../util/auxtrace.h" -#include "../../util/record.h" -#include "../../util/arm-spe.h" +#include "../../../util/pmu.h" +#include "../../../util/debug.h" +#include "../../../util/auxtrace.h" +#include "../../../util/record.h" +#include "../../../util/arm-spe.h" #define KiB(x) ((x) * 1024) #define MiB(x) ((x) * 1024 * 1024) diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c index 2864e2e3776d..2833e101a7c6 100644 --- a/tools/perf/arch/arm64/util/perf_regs.c +++ b/tools/perf/arch/arm64/util/perf_regs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include "../../util/perf_regs.h" +#include "../../../util/perf_regs.h" const struct sample_reg sample_reg_masks[] = { SMPL_REG_END diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c index e9c436eeffc9..0a5242900248 100644 --- a/tools/perf/arch/powerpc/util/perf_regs.c +++ b/tools/perf/arch/powerpc/util/perf_regs.c @@ -4,8 +4,8 @@ #include <regex.h> #include <linux/zalloc.h> -#include "../../util/perf_regs.h" -#include "../../util/debug.h" +#include "../../../util/perf_regs.h" +#include "../../../util/debug.h" #include <linux/kernel.h> diff --git a/tools/perf/arch/x86/tests/insn-x86-dat-32.c b/tools/perf/arch/x86/tests/insn-x86-dat-32.c index e6461abc9e7b..9708ae892061 100644 --- a/tools/perf/arch/x86/tests/insn-x86-dat-32.c +++ b/tools/perf/arch/x86/tests/insn-x86-dat-32.c @@ -2085,6 +2085,118 @@ "67 f3 0f 38 f8 1c \tenqcmds (%si),%bx",}, {{0x67, 0xf3, 0x0f, 0x38, 0xf8, 0x8c, 0x34, 0x12, }, 8, 0, "", "", "67 f3 0f 38 f8 8c 34 12 \tenqcmds 0x1234(%si),%cx",}, +{{0xf3, 0x0f, 0xae, 0xe8, }, 4, 0, "", "", +"f3 0f ae e8 \tincsspd %eax",}, +{{0x0f, 0xae, 0x28, }, 3, 0, "", "", +"0f ae 28 \txrstor (%eax)",}, +{{0x0f, 0xae, 0x2d, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "", "", +"0f ae 2d 78 56 34 12 \txrstor 0x12345678",}, +{{0x0f, 0xae, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f ae ac c8 78 56 34 12 \txrstor 0x12345678(%eax,%ecx,8)",}, +{{0x0f, 0xae, 0xe8, }, 3, 0, "", "", +"0f ae e8 \tlfence ",}, +{{0xf3, 0x0f, 0x1e, 0xc8, }, 4, 0, "", "", +"f3 0f 1e c8 \trdsspd %eax",}, +{{0xf3, 0x0f, 0x01, 0xea, }, 4, 0, "", "", +"f3 0f 01 ea \tsaveprevssp ",}, +{{0xf3, 0x0f, 0x01, 0x28, }, 4, 0, "", "", +"f3 0f 01 28 \trstorssp (%eax)",}, +{{0xf3, 0x0f, 0x01, 0x2d, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"f3 0f 01 2d 78 56 34 12 \trstorssp 0x12345678",}, +{{0xf3, 0x0f, 
0x01, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"f3 0f 01 ac c8 78 56 34 12 \trstorssp 0x12345678(%eax,%ecx,8)",}, +{{0x0f, 0x38, 0xf6, 0x08, }, 4, 0, "", "", +"0f 38 f6 08 \twrssd %ecx,(%eax)",}, +{{0x0f, 0x38, 0xf6, 0x15, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 f6 15 78 56 34 12 \twrssd %edx,0x12345678",}, +{{0x0f, 0x38, 0xf6, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 f6 94 c8 78 56 34 12 \twrssd %edx,0x12345678(%eax,%ecx,8)",}, +{{0x66, 0x0f, 0x38, 0xf5, 0x08, }, 5, 0, "", "", +"66 0f 38 f5 08 \twrussd %ecx,(%eax)",}, +{{0x66, 0x0f, 0x38, 0xf5, 0x15, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f 38 f5 15 78 56 34 12 \twrussd %edx,0x12345678",}, +{{0x66, 0x0f, 0x38, 0xf5, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"66 0f 38 f5 94 c8 78 56 34 12 \twrussd %edx,0x12345678(%eax,%ecx,8)",}, +{{0xf3, 0x0f, 0x01, 0xe8, }, 4, 0, "", "", +"f3 0f 01 e8 \tsetssbsy ",}, +{{0x0f, 0x01, 0xee, }, 3, 0, "", "", +"0f 01 ee \trdpkru ",}, +{{0x0f, 0x01, 0xef, }, 3, 0, "", "", +"0f 01 ef \twrpkru ",}, +{{0xf3, 0x0f, 0xae, 0x30, }, 4, 0, "", "", +"f3 0f ae 30 \tclrssbsy (%eax)",}, +{{0xf3, 0x0f, 0xae, 0x35, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"f3 0f ae 35 78 56 34 12 \tclrssbsy 0x12345678",}, +{{0xf3, 0x0f, 0xae, 0xb4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"f3 0f ae b4 c8 78 56 34 12 \tclrssbsy 0x12345678(%eax,%ecx,8)",}, +{{0xf3, 0x0f, 0x1e, 0xfb, }, 4, 0, "", "", +"f3 0f 1e fb \tendbr32 ",}, +{{0xf3, 0x0f, 0x1e, 0xfa, }, 4, 0, "", "", +"f3 0f 1e fa \tendbr64 ",}, +{{0xff, 0xd0, }, 2, 0, "call", "indirect", +"ff d0 \tcall *%eax",}, +{{0xff, 0x10, }, 2, 0, "call", "indirect", +"ff 10 \tcall *(%eax)",}, +{{0xff, 0x15, 0x78, 0x56, 0x34, 0x12, }, 6, 0, "call", "indirect", +"ff 15 78 56 34 12 \tcall *0x12345678",}, +{{0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "call", "indirect", +"ff 94 c8 78 56 34 12 \tcall *0x12345678(%eax,%ecx,8)",}, +{{0xf2, 0xff, 0xd0, }, 3, 0, "call", "indirect", +"f2 ff d0 \tbnd call *%eax",}, +{{0xf2, 0xff, 0x10, }, 3, 0, "call", "indirect", +"f2 ff 10 \tbnd call *(%eax)",}, +{{0xf2, 0xff, 0x15, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "call", "indirect", +"f2 ff 15 78 56 34 12 \tbnd call *0x12345678",}, +{{0xf2, 0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "call", "indirect", +"f2 ff 94 c8 78 56 34 12 \tbnd call *0x12345678(%eax,%ecx,8)",}, +{{0x3e, 0xff, 0xd0, }, 3, 0, "call", "indirect", +"3e ff d0 \tnotrack call *%eax",}, +{{0x3e, 0xff, 0x10, }, 3, 0, "call", "indirect", +"3e ff 10 \tnotrack call *(%eax)",}, +{{0x3e, 0xff, 0x15, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "call", "indirect", +"3e ff 15 78 56 34 12 \tnotrack call *0x12345678",}, +{{0x3e, 0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "call", "indirect", +"3e ff 94 c8 78 56 34 12 \tnotrack call *0x12345678(%eax,%ecx,8)",}, +{{0x3e, 0xf2, 0xff, 0xd0, }, 4, 0, "call", "indirect", +"3e f2 ff d0 \tnotrack bnd call *%eax",}, +{{0x3e, 0xf2, 0xff, 0x10, }, 4, 0, "call", "indirect", +"3e f2 ff 10 \tnotrack bnd call *(%eax)",}, +{{0x3e, 0xf2, 0xff, 0x15, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "call", "indirect", +"3e f2 ff 15 78 56 34 12 \tnotrack bnd call *0x12345678",}, +{{0x3e, 0xf2, 0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "call", "indirect", +"3e f2 ff 94 c8 78 56 34 12 \tnotrack bnd call *0x12345678(%eax,%ecx,8)",}, +{{0xff, 0xe0, }, 2, 0, "jmp", "indirect", +"ff e0 \tjmp *%eax",}, +{{0xff, 0x20, }, 2, 0, "jmp", "indirect", +"ff 20 \tjmp *(%eax)",}, +{{0xff, 0x25, 0x78, 0x56, 0x34, 0x12, }, 6, 0, "jmp", "indirect", +"ff 25 78 56 34 12 \tjmp 
*0x12345678",}, +{{0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "jmp", "indirect", +"ff a4 c8 78 56 34 12 \tjmp *0x12345678(%eax,%ecx,8)",}, +{{0xf2, 0xff, 0xe0, }, 3, 0, "jmp", "indirect", +"f2 ff e0 \tbnd jmp *%eax",}, +{{0xf2, 0xff, 0x20, }, 3, 0, "jmp", "indirect", +"f2 ff 20 \tbnd jmp *(%eax)",}, +{{0xf2, 0xff, 0x25, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "jmp", "indirect", +"f2 ff 25 78 56 34 12 \tbnd jmp *0x12345678",}, +{{0xf2, 0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "jmp", "indirect", +"f2 ff a4 c8 78 56 34 12 \tbnd jmp *0x12345678(%eax,%ecx,8)",}, +{{0x3e, 0xff, 0xe0, }, 3, 0, "jmp", "indirect", +"3e ff e0 \tnotrack jmp *%eax",}, +{{0x3e, 0xff, 0x20, }, 3, 0, "jmp", "indirect", +"3e ff 20 \tnotrack jmp *(%eax)",}, +{{0x3e, 0xff, 0x25, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "jmp", "indirect", +"3e ff 25 78 56 34 12 \tnotrack jmp *0x12345678",}, +{{0x3e, 0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "jmp", "indirect", +"3e ff a4 c8 78 56 34 12 \tnotrack jmp *0x12345678(%eax,%ecx,8)",}, +{{0x3e, 0xf2, 0xff, 0xe0, }, 4, 0, "jmp", "indirect", +"3e f2 ff e0 \tnotrack bnd jmp *%eax",}, +{{0x3e, 0xf2, 0xff, 0x20, }, 4, 0, "jmp", "indirect", +"3e f2 ff 20 \tnotrack bnd jmp *(%eax)",}, +{{0x3e, 0xf2, 0xff, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "jmp", "indirect", +"3e f2 ff 25 78 56 34 12 \tnotrack bnd jmp *0x12345678",}, +{{0x3e, 0xf2, 0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "jmp", "indirect", +"3e f2 ff a4 c8 78 56 34 12 \tnotrack bnd jmp *0x12345678(%eax,%ecx,8)",}, {{0x0f, 0x01, 0xcf, }, 3, 0, "", "", "0f 01 cf \tencls ",}, {{0x0f, 0x01, 0xd7, }, 3, 0, "", "", diff --git a/tools/perf/arch/x86/tests/insn-x86-dat-64.c b/tools/perf/arch/x86/tests/insn-x86-dat-64.c index 567ecccfad7c..5da17d41d302 100644 --- a/tools/perf/arch/x86/tests/insn-x86-dat-64.c +++ b/tools/perf/arch/x86/tests/insn-x86-dat-64.c @@ -2263,6 +2263,202 @@ "67 f3 0f 38 f8 18 \tenqcmds (%eax),%ebx",}, {{0x67, 0xf3, 0x0f, 0x38, 0xf8, 0x88, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", "67 f3 0f 38 f8 88 78 56 34 12 \tenqcmds 0x12345678(%eax),%ecx",}, +{{0xf3, 0x0f, 0xae, 0xe8, }, 4, 0, "", "", +"f3 0f ae e8 \tincsspd %eax",}, +{{0xf3, 0x41, 0x0f, 0xae, 0xe8, }, 5, 0, "", "", +"f3 41 0f ae e8 \tincsspd %r8d",}, +{{0xf3, 0x48, 0x0f, 0xae, 0xe8, }, 5, 0, "", "", +"f3 48 0f ae e8 \tincsspq %rax",}, +{{0xf3, 0x49, 0x0f, 0xae, 0xe8, }, 5, 0, "", "", +"f3 49 0f ae e8 \tincsspq %r8",}, +{{0x0f, 0xae, 0x28, }, 3, 0, "", "", +"0f ae 28 \txrstor (%rax)",}, +{{0x41, 0x0f, 0xae, 0x28, }, 4, 0, "", "", +"41 0f ae 28 \txrstor (%r8)",}, +{{0x0f, 0xae, 0x2c, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f ae 2c 25 78 56 34 12 \txrstor 0x12345678",}, +{{0x0f, 0xae, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f ae ac c8 78 56 34 12 \txrstor 0x12345678(%rax,%rcx,8)",}, +{{0x41, 0x0f, 0xae, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"41 0f ae ac c8 78 56 34 12 \txrstor 0x12345678(%r8,%rcx,8)",}, +{{0x0f, 0xae, 0xe8, }, 3, 0, "", "", +"0f ae e8 \tlfence ",}, +{{0xf3, 0x0f, 0x1e, 0xc8, }, 4, 0, "", "", +"f3 0f 1e c8 \trdsspd %eax",}, +{{0xf3, 0x41, 0x0f, 0x1e, 0xc8, }, 5, 0, "", "", +"f3 41 0f 1e c8 \trdsspd %r8d",}, +{{0xf3, 0x48, 0x0f, 0x1e, 0xc8, }, 5, 0, "", "", +"f3 48 0f 1e c8 \trdsspq %rax",}, +{{0xf3, 0x49, 0x0f, 0x1e, 0xc8, }, 5, 0, "", "", +"f3 49 0f 1e c8 \trdsspq %r8",}, +{{0xf3, 0x0f, 0x01, 0xea, }, 4, 0, "", "", +"f3 0f 01 ea \tsaveprevssp ",}, +{{0xf3, 0x0f, 0x01, 0x28, }, 4, 0, "", "", +"f3 0f 01 28 \trstorssp (%rax)",}, +{{0xf3, 0x41, 0x0f, 0x01, 0x28, }, 5, 0, "", "", +"f3 41 0f 01 
28 \trstorssp (%r8)",}, +{{0xf3, 0x0f, 0x01, 0x2c, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"f3 0f 01 2c 25 78 56 34 12 \trstorssp 0x12345678",}, +{{0xf3, 0x0f, 0x01, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"f3 0f 01 ac c8 78 56 34 12 \trstorssp 0x12345678(%rax,%rcx,8)",}, +{{0xf3, 0x41, 0x0f, 0x01, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"f3 41 0f 01 ac c8 78 56 34 12 \trstorssp 0x12345678(%r8,%rcx,8)",}, +{{0x0f, 0x38, 0xf6, 0x08, }, 4, 0, "", "", +"0f 38 f6 08 \twrssd %ecx,(%rax)",}, +{{0x41, 0x0f, 0x38, 0xf6, 0x10, }, 5, 0, "", "", +"41 0f 38 f6 10 \twrssd %edx,(%r8)",}, +{{0x0f, 0x38, 0xf6, 0x14, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 f6 14 25 78 56 34 12 \twrssd %edx,0x12345678",}, +{{0x0f, 0x38, 0xf6, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 f6 94 c8 78 56 34 12 \twrssd %edx,0x12345678(%rax,%rcx,8)",}, +{{0x41, 0x0f, 0x38, 0xf6, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"41 0f 38 f6 94 c8 78 56 34 12 \twrssd %edx,0x12345678(%r8,%rcx,8)",}, +{{0x48, 0x0f, 0x38, 0xf6, 0x08, }, 5, 0, "", "", +"48 0f 38 f6 08 \twrssq %rcx,(%rax)",}, +{{0x49, 0x0f, 0x38, 0xf6, 0x10, }, 5, 0, "", "", +"49 0f 38 f6 10 \twrssq %rdx,(%r8)",}, +{{0x48, 0x0f, 0x38, 0xf6, 0x14, 0x25, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"48 0f 38 f6 14 25 78 56 34 12 \twrssq %rdx,0x12345678",}, +{{0x48, 0x0f, 0x38, 0xf6, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"48 0f 38 f6 94 c8 78 56 34 12 \twrssq %rdx,0x12345678(%rax,%rcx,8)",}, +{{0x49, 0x0f, 0x38, 0xf6, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"49 0f 38 f6 94 c8 78 56 34 12 \twrssq %rdx,0x12345678(%r8,%rcx,8)",}, +{{0x66, 0x0f, 0x38, 0xf5, 0x08, }, 5, 0, "", "", +"66 0f 38 f5 08 \twrussd %ecx,(%rax)",}, +{{0x66, 0x41, 0x0f, 0x38, 0xf5, 0x10, }, 6, 0, "", "", +"66 41 0f 38 f5 10 \twrussd %edx,(%r8)",}, +{{0x66, 0x0f, 0x38, 0xf5, 0x14, 0x25, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"66 0f 38 f5 14 25 78 56 34 12 \twrussd %edx,0x12345678",}, +{{0x66, 0x0f, 0x38, 0xf5, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"66 0f 38 f5 94 c8 78 56 34 12 \twrussd %edx,0x12345678(%rax,%rcx,8)",}, +{{0x66, 0x41, 0x0f, 0x38, 0xf5, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 11, 0, "", "", +"66 41 0f 38 f5 94 c8 78 56 34 12 \twrussd %edx,0x12345678(%r8,%rcx,8)",}, +{{0x66, 0x48, 0x0f, 0x38, 0xf5, 0x08, }, 6, 0, "", "", +"66 48 0f 38 f5 08 \twrussq %rcx,(%rax)",}, +{{0x66, 0x49, 0x0f, 0x38, 0xf5, 0x10, }, 6, 0, "", "", +"66 49 0f 38 f5 10 \twrussq %rdx,(%r8)",}, +{{0x66, 0x48, 0x0f, 0x38, 0xf5, 0x14, 0x25, 0x78, 0x56, 0x34, 0x12, }, 11, 0, "", "", +"66 48 0f 38 f5 14 25 78 56 34 12 \twrussq %rdx,0x12345678",}, +{{0x66, 0x48, 0x0f, 0x38, 0xf5, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 11, 0, "", "", +"66 48 0f 38 f5 94 c8 78 56 34 12 \twrussq %rdx,0x12345678(%rax,%rcx,8)",}, +{{0x66, 0x49, 0x0f, 0x38, 0xf5, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 11, 0, "", "", +"66 49 0f 38 f5 94 c8 78 56 34 12 \twrussq %rdx,0x12345678(%r8,%rcx,8)",}, +{{0xf3, 0x0f, 0x01, 0xe8, }, 4, 0, "", "", +"f3 0f 01 e8 \tsetssbsy ",}, +{{0x0f, 0x01, 0xee, }, 3, 0, "", "", +"0f 01 ee \trdpkru ",}, +{{0x0f, 0x01, 0xef, }, 3, 0, "", "", +"0f 01 ef \twrpkru ",}, +{{0xf3, 0x0f, 0xae, 0x30, }, 4, 0, "", "", +"f3 0f ae 30 \tclrssbsy (%rax)",}, +{{0xf3, 0x41, 0x0f, 0xae, 0x30, }, 5, 0, "", "", +"f3 41 0f ae 30 \tclrssbsy (%r8)",}, +{{0xf3, 0x0f, 0xae, 0x34, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"f3 0f ae 34 25 78 56 34 12 \tclrssbsy 0x12345678",}, +{{0xf3, 0x0f, 0xae, 0xb4, 0xc8, 0x78, 0x56, 0x34, 
0x12, }, 9, 0, "", "", +"f3 0f ae b4 c8 78 56 34 12 \tclrssbsy 0x12345678(%rax,%rcx,8)",}, +{{0xf3, 0x41, 0x0f, 0xae, 0xb4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"f3 41 0f ae b4 c8 78 56 34 12 \tclrssbsy 0x12345678(%r8,%rcx,8)",}, +{{0xf3, 0x0f, 0x1e, 0xfb, }, 4, 0, "", "", +"f3 0f 1e fb \tendbr32 ",}, +{{0xf3, 0x0f, 0x1e, 0xfa, }, 4, 0, "", "", +"f3 0f 1e fa \tendbr64 ",}, +{{0xff, 0xd0, }, 2, 0, "call", "indirect", +"ff d0 \tcallq *%rax",}, +{{0xff, 0x10, }, 2, 0, "call", "indirect", +"ff 10 \tcallq *(%rax)",}, +{{0x41, 0xff, 0x10, }, 3, 0, "call", "indirect", +"41 ff 10 \tcallq *(%r8)",}, +{{0xff, 0x14, 0x25, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "call", "indirect", +"ff 14 25 78 56 34 12 \tcallq *0x12345678",}, +{{0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "call", "indirect", +"ff 94 c8 78 56 34 12 \tcallq *0x12345678(%rax,%rcx,8)",}, +{{0x41, 0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "call", "indirect", +"41 ff 94 c8 78 56 34 12 \tcallq *0x12345678(%r8,%rcx,8)",}, +{{0xf2, 0xff, 0xd0, }, 3, 0, "call", "indirect", +"f2 ff d0 \tbnd callq *%rax",}, +{{0xf2, 0xff, 0x10, }, 3, 0, "call", "indirect", +"f2 ff 10 \tbnd callq *(%rax)",}, +{{0xf2, 0x41, 0xff, 0x10, }, 4, 0, "call", "indirect", +"f2 41 ff 10 \tbnd callq *(%r8)",}, +{{0xf2, 0xff, 0x14, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "call", "indirect", +"f2 ff 14 25 78 56 34 12 \tbnd callq *0x12345678",}, +{{0xf2, 0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "call", "indirect", +"f2 ff 94 c8 78 56 34 12 \tbnd callq *0x12345678(%rax,%rcx,8)",}, +{{0xf2, 0x41, 0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "call", "indirect", +"f2 41 ff 94 c8 78 56 34 12 \tbnd callq *0x12345678(%r8,%rcx,8)",}, +{{0x3e, 0xff, 0xd0, }, 3, 0, "call", "indirect", +"3e ff d0 \tnotrack callq *%rax",}, +{{0x3e, 0xff, 0x10, }, 3, 0, "call", "indirect", +"3e ff 10 \tnotrack callq *(%rax)",}, +{{0x3e, 0x41, 0xff, 0x10, }, 4, 0, "call", "indirect", +"3e 41 ff 10 \tnotrack callq *(%r8)",}, +{{0x3e, 0xff, 0x14, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "call", "indirect", +"3e ff 14 25 78 56 34 12 \tnotrack callq *0x12345678",}, +{{0x3e, 0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "call", "indirect", +"3e ff 94 c8 78 56 34 12 \tnotrack callq *0x12345678(%rax,%rcx,8)",}, +{{0x3e, 0x41, 0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "call", "indirect", +"3e 41 ff 94 c8 78 56 34 12 \tnotrack callq *0x12345678(%r8,%rcx,8)",}, +{{0x3e, 0xf2, 0xff, 0xd0, }, 4, 0, "call", "indirect", +"3e f2 ff d0 \tnotrack bnd callq *%rax",}, +{{0x3e, 0xf2, 0xff, 0x10, }, 4, 0, "call", "indirect", +"3e f2 ff 10 \tnotrack bnd callq *(%rax)",}, +{{0x3e, 0xf2, 0x41, 0xff, 0x10, }, 5, 0, "call", "indirect", +"3e f2 41 ff 10 \tnotrack bnd callq *(%r8)",}, +{{0x3e, 0xf2, 0xff, 0x14, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "call", "indirect", +"3e f2 ff 14 25 78 56 34 12 \tnotrack bnd callq *0x12345678",}, +{{0x3e, 0xf2, 0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "call", "indirect", +"3e f2 ff 94 c8 78 56 34 12 \tnotrack bnd callq *0x12345678(%rax,%rcx,8)",}, +{{0x3e, 0xf2, 0x41, 0xff, 0x94, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "call", "indirect", +"3e f2 41 ff 94 c8 78 56 34 12 \tnotrack bnd callq *0x12345678(%r8,%rcx,8)",}, +{{0xff, 0xe0, }, 2, 0, "jmp", "indirect", +"ff e0 \tjmpq *%rax",}, +{{0xff, 0x20, }, 2, 0, "jmp", "indirect", +"ff 20 \tjmpq *(%rax)",}, +{{0x41, 0xff, 0x20, }, 3, 0, "jmp", "indirect", +"41 ff 20 \tjmpq *(%r8)",}, +{{0xff, 0x24, 0x25, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "jmp", "indirect", +"ff 24 25 78 56 34 12 \tjmpq 
*0x12345678",}, +{{0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "jmp", "indirect", +"ff a4 c8 78 56 34 12 \tjmpq *0x12345678(%rax,%rcx,8)",}, +{{0x41, 0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "jmp", "indirect", +"41 ff a4 c8 78 56 34 12 \tjmpq *0x12345678(%r8,%rcx,8)",}, +{{0xf2, 0xff, 0xe0, }, 3, 0, "jmp", "indirect", +"f2 ff e0 \tbnd jmpq *%rax",}, +{{0xf2, 0xff, 0x20, }, 3, 0, "jmp", "indirect", +"f2 ff 20 \tbnd jmpq *(%rax)",}, +{{0xf2, 0x41, 0xff, 0x20, }, 4, 0, "jmp", "indirect", +"f2 41 ff 20 \tbnd jmpq *(%r8)",}, +{{0xf2, 0xff, 0x24, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "jmp", "indirect", +"f2 ff 24 25 78 56 34 12 \tbnd jmpq *0x12345678",}, +{{0xf2, 0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "jmp", "indirect", +"f2 ff a4 c8 78 56 34 12 \tbnd jmpq *0x12345678(%rax,%rcx,8)",}, +{{0xf2, 0x41, 0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "jmp", "indirect", +"f2 41 ff a4 c8 78 56 34 12 \tbnd jmpq *0x12345678(%r8,%rcx,8)",}, +{{0x3e, 0xff, 0xe0, }, 3, 0, "jmp", "indirect", +"3e ff e0 \tnotrack jmpq *%rax",}, +{{0x3e, 0xff, 0x20, }, 3, 0, "jmp", "indirect", +"3e ff 20 \tnotrack jmpq *(%rax)",}, +{{0x3e, 0x41, 0xff, 0x20, }, 4, 0, "jmp", "indirect", +"3e 41 ff 20 \tnotrack jmpq *(%r8)",}, +{{0x3e, 0xff, 0x24, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "jmp", "indirect", +"3e ff 24 25 78 56 34 12 \tnotrack jmpq *0x12345678",}, +{{0x3e, 0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "jmp", "indirect", +"3e ff a4 c8 78 56 34 12 \tnotrack jmpq *0x12345678(%rax,%rcx,8)",}, +{{0x3e, 0x41, 0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "jmp", "indirect", +"3e 41 ff a4 c8 78 56 34 12 \tnotrack jmpq *0x12345678(%r8,%rcx,8)",}, +{{0x3e, 0xf2, 0xff, 0xe0, }, 4, 0, "jmp", "indirect", +"3e f2 ff e0 \tnotrack bnd jmpq *%rax",}, +{{0x3e, 0xf2, 0xff, 0x20, }, 4, 0, "jmp", "indirect", +"3e f2 ff 20 \tnotrack bnd jmpq *(%rax)",}, +{{0x3e, 0xf2, 0x41, 0xff, 0x20, }, 5, 0, "jmp", "indirect", +"3e f2 41 ff 20 \tnotrack bnd jmpq *(%r8)",}, +{{0x3e, 0xf2, 0xff, 0x24, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "jmp", "indirect", +"3e f2 ff 24 25 78 56 34 12 \tnotrack bnd jmpq *0x12345678",}, +{{0x3e, 0xf2, 0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "jmp", "indirect", +"3e f2 ff a4 c8 78 56 34 12 \tnotrack bnd jmpq *0x12345678(%rax,%rcx,8)",}, +{{0x3e, 0xf2, 0x41, 0xff, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "jmp", "indirect", +"3e f2 41 ff a4 c8 78 56 34 12 \tnotrack bnd jmpq *0x12345678(%r8,%rcx,8)",}, {{0x0f, 0x01, 0xcf, }, 3, 0, "", "", "0f 01 cf \tencls ",}, {{0x0f, 0x01, 0xd7, }, 3, 0, "", "", diff --git a/tools/perf/arch/x86/tests/insn-x86-dat-src.c b/tools/perf/arch/x86/tests/insn-x86-dat-src.c index ddbf07c50bb8..c3808e94c46e 100644 --- a/tools/perf/arch/x86/tests/insn-x86-dat-src.c +++ b/tools/perf/arch/x86/tests/insn-x86-dat-src.c @@ -1771,6 +1771,145 @@ int main(void) asm volatile("enqcmds (%eax),%ebx"); asm volatile("enqcmds 0x12345678(%eax),%ecx"); + /* incsspd/q */ + + asm volatile("incsspd %eax"); + asm volatile("incsspd %r8d"); + asm volatile("incsspq %rax"); + asm volatile("incsspq %r8"); + /* Also check instructions in the same group encoding as incsspd/q */ + asm volatile("xrstor (%rax)"); + asm volatile("xrstor (%r8)"); + asm volatile("xrstor (0x12345678)"); + asm volatile("xrstor 0x12345678(%rax,%rcx,8)"); + asm volatile("xrstor 0x12345678(%r8,%rcx,8)"); + asm volatile("lfence"); + + /* rdsspd/q */ + + asm volatile("rdsspd %eax"); + asm volatile("rdsspd %r8d"); + asm volatile("rdsspq %rax"); + asm volatile("rdsspq %r8"); + + /* saveprevssp */ + + 
asm volatile("saveprevssp"); + + /* rstorssp */ + + asm volatile("rstorssp (%rax)"); + asm volatile("rstorssp (%r8)"); + asm volatile("rstorssp (0x12345678)"); + asm volatile("rstorssp 0x12345678(%rax,%rcx,8)"); + asm volatile("rstorssp 0x12345678(%r8,%rcx,8)"); + + /* wrssd/q */ + + asm volatile("wrssd %ecx,(%rax)"); + asm volatile("wrssd %edx,(%r8)"); + asm volatile("wrssd %edx,(0x12345678)"); + asm volatile("wrssd %edx,0x12345678(%rax,%rcx,8)"); + asm volatile("wrssd %edx,0x12345678(%r8,%rcx,8)"); + asm volatile("wrssq %rcx,(%rax)"); + asm volatile("wrssq %rdx,(%r8)"); + asm volatile("wrssq %rdx,(0x12345678)"); + asm volatile("wrssq %rdx,0x12345678(%rax,%rcx,8)"); + asm volatile("wrssq %rdx,0x12345678(%r8,%rcx,8)"); + + /* wrussd/q */ + + asm volatile("wrussd %ecx,(%rax)"); + asm volatile("wrussd %edx,(%r8)"); + asm volatile("wrussd %edx,(0x12345678)"); + asm volatile("wrussd %edx,0x12345678(%rax,%rcx,8)"); + asm volatile("wrussd %edx,0x12345678(%r8,%rcx,8)"); + asm volatile("wrussq %rcx,(%rax)"); + asm volatile("wrussq %rdx,(%r8)"); + asm volatile("wrussq %rdx,(0x12345678)"); + asm volatile("wrussq %rdx,0x12345678(%rax,%rcx,8)"); + asm volatile("wrussq %rdx,0x12345678(%r8,%rcx,8)"); + + /* setssbsy */ + + asm volatile("setssbsy"); + /* Also check instructions in the same group encoding as setssbsy */ + asm volatile("rdpkru"); + asm volatile("wrpkru"); + + /* clrssbsy */ + + asm volatile("clrssbsy (%rax)"); + asm volatile("clrssbsy (%r8)"); + asm volatile("clrssbsy (0x12345678)"); + asm volatile("clrssbsy 0x12345678(%rax,%rcx,8)"); + asm volatile("clrssbsy 0x12345678(%r8,%rcx,8)"); + + /* endbr32/64 */ + + asm volatile("endbr32"); + asm volatile("endbr64"); + + /* call with/without notrack prefix */ + + asm volatile("callq *%rax"); /* Expecting: call indirect 0 */ + asm volatile("callq *(%rax)"); /* Expecting: call indirect 0 */ + asm volatile("callq *(%r8)"); /* Expecting: call indirect 0 */ + asm volatile("callq *(0x12345678)"); /* Expecting: call indirect 0 */ + asm volatile("callq *0x12345678(%rax,%rcx,8)"); /* Expecting: call indirect 0 */ + asm volatile("callq *0x12345678(%r8,%rcx,8)"); /* Expecting: call indirect 0 */ + + asm volatile("bnd callq *%rax"); /* Expecting: call indirect 0 */ + asm volatile("bnd callq *(%rax)"); /* Expecting: call indirect 0 */ + asm volatile("bnd callq *(%r8)"); /* Expecting: call indirect 0 */ + asm volatile("bnd callq *(0x12345678)"); /* Expecting: call indirect 0 */ + asm volatile("bnd callq *0x12345678(%rax,%rcx,8)"); /* Expecting: call indirect 0 */ + asm volatile("bnd callq *0x12345678(%r8,%rcx,8)"); /* Expecting: call indirect 0 */ + + asm volatile("notrack callq *%rax"); /* Expecting: call indirect 0 */ + asm volatile("notrack callq *(%rax)"); /* Expecting: call indirect 0 */ + asm volatile("notrack callq *(%r8)"); /* Expecting: call indirect 0 */ + asm volatile("notrack callq *(0x12345678)"); /* Expecting: call indirect 0 */ + asm volatile("notrack callq *0x12345678(%rax,%rcx,8)"); /* Expecting: call indirect 0 */ + asm volatile("notrack callq *0x12345678(%r8,%rcx,8)"); /* Expecting: call indirect 0 */ + + asm volatile("notrack bnd callq *%rax"); /* Expecting: call indirect 0 */ + asm volatile("notrack bnd callq *(%rax)"); /* Expecting: call indirect 0 */ + asm volatile("notrack bnd callq *(%r8)"); /* Expecting: call indirect 0 */ + asm volatile("notrack bnd callq *(0x12345678)"); /* Expecting: call indirect 0 */ + asm volatile("notrack bnd callq *0x12345678(%rax,%rcx,8)"); /* Expecting: call indirect 0 */ + asm volatile("notrack bnd callq 
*0x12345678(%r8,%rcx,8)"); /* Expecting: call indirect 0 */ + + /* jmp with/without notrack prefix */ + + asm volatile("jmpq *%rax"); /* Expecting: jmp indirect 0 */ + asm volatile("jmpq *(%rax)"); /* Expecting: jmp indirect 0 */ + asm volatile("jmpq *(%r8)"); /* Expecting: jmp indirect 0 */ + asm volatile("jmpq *(0x12345678)"); /* Expecting: jmp indirect 0 */ + asm volatile("jmpq *0x12345678(%rax,%rcx,8)"); /* Expecting: jmp indirect 0 */ + asm volatile("jmpq *0x12345678(%r8,%rcx,8)"); /* Expecting: jmp indirect 0 */ + + asm volatile("bnd jmpq *%rax"); /* Expecting: jmp indirect 0 */ + asm volatile("bnd jmpq *(%rax)"); /* Expecting: jmp indirect 0 */ + asm volatile("bnd jmpq *(%r8)"); /* Expecting: jmp indirect 0 */ + asm volatile("bnd jmpq *(0x12345678)"); /* Expecting: jmp indirect 0 */ + asm volatile("bnd jmpq *0x12345678(%rax,%rcx,8)"); /* Expecting: jmp indirect 0 */ + asm volatile("bnd jmpq *0x12345678(%r8,%rcx,8)"); /* Expecting: jmp indirect 0 */ + + asm volatile("notrack jmpq *%rax"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack jmpq *(%rax)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack jmpq *(%r8)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack jmpq *(0x12345678)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack jmpq *0x12345678(%rax,%rcx,8)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack jmpq *0x12345678(%r8,%rcx,8)"); /* Expecting: jmp indirect 0 */ + + asm volatile("notrack bnd jmpq *%rax"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack bnd jmpq *(%rax)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack bnd jmpq *(%r8)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack bnd jmpq *(0x12345678)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack bnd jmpq *0x12345678(%rax,%rcx,8)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack bnd jmpq *0x12345678(%r8,%rcx,8)"); /* Expecting: jmp indirect 0 */ + #else /* #ifdef __x86_64__ */ /* bound r32, mem (same op code as EVEX prefix) */ @@ -3434,6 +3573,103 @@ int main(void) asm volatile("enqcmds (%si),%bx"); asm volatile("enqcmds 0x1234(%si),%cx"); + /* incsspd */ + + asm volatile("incsspd %eax"); + /* Also check instructions in the same group encoding as incsspd */ + asm volatile("xrstor (%eax)"); + asm volatile("xrstor (0x12345678)"); + asm volatile("xrstor 0x12345678(%eax,%ecx,8)"); + asm volatile("lfence"); + + /* rdsspd */ + + asm volatile("rdsspd %eax"); + + /* saveprevssp */ + + asm volatile("saveprevssp"); + + /* rstorssp */ + + asm volatile("rstorssp (%eax)"); + asm volatile("rstorssp (0x12345678)"); + asm volatile("rstorssp 0x12345678(%eax,%ecx,8)"); + + /* wrssd */ + + asm volatile("wrssd %ecx,(%eax)"); + asm volatile("wrssd %edx,(0x12345678)"); + asm volatile("wrssd %edx,0x12345678(%eax,%ecx,8)"); + + /* wrussd */ + + asm volatile("wrussd %ecx,(%eax)"); + asm volatile("wrussd %edx,(0x12345678)"); + asm volatile("wrussd %edx,0x12345678(%eax,%ecx,8)"); + + /* setssbsy */ + + asm volatile("setssbsy"); + /* Also check instructions in the same group encoding as setssbsy */ + asm volatile("rdpkru"); + asm volatile("wrpkru"); + + /* clrssbsy */ + + asm volatile("clrssbsy (%eax)"); + asm volatile("clrssbsy (0x12345678)"); + asm volatile("clrssbsy 0x12345678(%eax,%ecx,8)"); + + /* endbr32/64 */ + + asm volatile("endbr32"); + asm volatile("endbr64"); + + /* call with/without notrack prefix */ + + asm volatile("call *%eax"); /* Expecting: call indirect 0 */ + asm volatile("call *(%eax)"); /* Expecting: call indirect 0 */ + asm volatile("call 
*(0x12345678)"); /* Expecting: call indirect 0 */ + asm volatile("call *0x12345678(%eax,%ecx,8)"); /* Expecting: call indirect 0 */ + + asm volatile("bnd call *%eax"); /* Expecting: call indirect 0 */ + asm volatile("bnd call *(%eax)"); /* Expecting: call indirect 0 */ + asm volatile("bnd call *(0x12345678)"); /* Expecting: call indirect 0 */ + asm volatile("bnd call *0x12345678(%eax,%ecx,8)"); /* Expecting: call indirect 0 */ + + asm volatile("notrack call *%eax"); /* Expecting: call indirect 0 */ + asm volatile("notrack call *(%eax)"); /* Expecting: call indirect 0 */ + asm volatile("notrack call *(0x12345678)"); /* Expecting: call indirect 0 */ + asm volatile("notrack call *0x12345678(%eax,%ecx,8)"); /* Expecting: call indirect 0 */ + + asm volatile("notrack bnd call *%eax"); /* Expecting: call indirect 0 */ + asm volatile("notrack bnd call *(%eax)"); /* Expecting: call indirect 0 */ + asm volatile("notrack bnd call *(0x12345678)"); /* Expecting: call indirect 0 */ + asm volatile("notrack bnd call *0x12345678(%eax,%ecx,8)"); /* Expecting: call indirect 0 */ + + /* jmp with/without notrack prefix */ + + asm volatile("jmp *%eax"); /* Expecting: jmp indirect 0 */ + asm volatile("jmp *(%eax)"); /* Expecting: jmp indirect 0 */ + asm volatile("jmp *(0x12345678)"); /* Expecting: jmp indirect 0 */ + asm volatile("jmp *0x12345678(%eax,%ecx,8)"); /* Expecting: jmp indirect 0 */ + + asm volatile("bnd jmp *%eax"); /* Expecting: jmp indirect 0 */ + asm volatile("bnd jmp *(%eax)"); /* Expecting: jmp indirect 0 */ + asm volatile("bnd jmp *(0x12345678)"); /* Expecting: jmp indirect 0 */ + asm volatile("bnd jmp *0x12345678(%eax,%ecx,8)"); /* Expecting: jmp indirect 0 */ + + asm volatile("notrack jmp *%eax"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack jmp *(%eax)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack jmp *(0x12345678)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack jmp *0x12345678(%eax,%ecx,8)"); /* Expecting: jmp indirect 0 */ + + asm volatile("notrack bnd jmp *%eax"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack bnd jmp *(%eax)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack bnd jmp *(0x12345678)"); /* Expecting: jmp indirect 0 */ + asm volatile("notrack bnd jmp *0x12345678(%eax,%ecx,8)"); /* Expecting: jmp indirect 0 */ + #endif /* #ifndef __x86_64__ */ /* SGX */ diff --git a/tools/perf/arch/x86/util/auxtrace.c b/tools/perf/arch/x86/util/auxtrace.c index 7abc9fd4cbec..3da506e13f49 100644 --- a/tools/perf/arch/x86/util/auxtrace.c +++ b/tools/perf/arch/x86/util/auxtrace.c @@ -7,13 +7,13 @@ #include <errno.h> #include <stdbool.h> -#include "../../util/header.h" -#include "../../util/debug.h" -#include "../../util/pmu.h" -#include "../../util/auxtrace.h" -#include "../../util/intel-pt.h" -#include "../../util/intel-bts.h" -#include "../../util/evlist.h" +#include "../../../util/header.h" +#include "../../../util/debug.h" +#include "../../../util/pmu.h" +#include "../../../util/auxtrace.h" +#include "../../../util/intel-pt.h" +#include "../../../util/intel-bts.h" +#include "../../../util/evlist.h" static struct auxtrace_record *auxtrace_record__init_intel(struct evlist *evlist, diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c index ac45015cc6ba..047dc00eafa6 100644 --- a/tools/perf/arch/x86/util/event.c +++ b/tools/perf/arch/x86/util/event.c @@ -3,12 +3,12 @@ #include <linux/string.h> #include <linux/zalloc.h> -#include "../../util/event.h" -#include "../../util/synthetic-events.h" -#include 
"../../util/machine.h" -#include "../../util/tool.h" -#include "../../util/map.h" -#include "../../util/debug.h" +#include "../../../util/event.h" +#include "../../../util/synthetic-events.h" +#include "../../../util/machine.h" +#include "../../../util/tool.h" +#include "../../../util/map.h" +#include "../../../util/debug.h" #if defined(__x86_64__) diff --git a/tools/perf/arch/x86/util/header.c b/tools/perf/arch/x86/util/header.c index aa6deb463bf3..578c8c568ffd 100644 --- a/tools/perf/arch/x86/util/header.c +++ b/tools/perf/arch/x86/util/header.c @@ -7,8 +7,8 @@ #include <string.h> #include <regex.h> -#include "../../util/debug.h" -#include "../../util/header.h" +#include "../../../util/debug.h" +#include "../../../util/header.h" static inline void cpuid(unsigned int op, unsigned int *a, unsigned int *b, unsigned int *c, diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 26cee1052179..09f93800bffd 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -11,18 +11,18 @@ #include <linux/log2.h> #include <linux/zalloc.h> -#include "../../util/cpumap.h" -#include "../../util/event.h" -#include "../../util/evsel.h" -#include "../../util/evlist.h" -#include "../../util/mmap.h" -#include "../../util/session.h" -#include "../../util/pmu.h" -#include "../../util/debug.h" -#include "../../util/record.h" -#include "../../util/tsc.h" -#include "../../util/auxtrace.h" -#include "../../util/intel-bts.h" +#include "../../../util/cpumap.h" +#include "../../../util/event.h" +#include "../../../util/evsel.h" +#include "../../../util/evlist.h" +#include "../../../util/mmap.h" +#include "../../../util/session.h" +#include "../../../util/pmu.h" +#include "../../../util/debug.h" +#include "../../../util/record.h" +#include "../../../util/tsc.h" +#include "../../../util/auxtrace.h" +#include "../../../util/intel-bts.h" #include <internal/lib.h> // page_size #define KiB(x) ((x) * 1024) diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 7eea4fd7ce58..1643aed8c4c8 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -13,23 +13,23 @@ #include <linux/zalloc.h> #include <cpuid.h> -#include "../../util/session.h" -#include "../../util/event.h" -#include "../../util/evlist.h" -#include "../../util/evsel.h" -#include "../../util/evsel_config.h" -#include "../../util/cpumap.h" -#include "../../util/mmap.h" +#include "../../../util/session.h" +#include "../../../util/event.h" +#include "../../../util/evlist.h" +#include "../../../util/evsel.h" +#include "../../../util/evsel_config.h" +#include "../../../util/cpumap.h" +#include "../../../util/mmap.h" #include <subcmd/parse-options.h> -#include "../../util/parse-events.h" -#include "../../util/pmu.h" -#include "../../util/debug.h" -#include "../../util/auxtrace.h" -#include "../../util/record.h" -#include "../../util/target.h" -#include "../../util/tsc.h" +#include "../../../util/parse-events.h" +#include "../../../util/pmu.h" +#include "../../../util/debug.h" +#include "../../../util/auxtrace.h" +#include "../../../util/record.h" +#include "../../../util/target.h" +#include "../../../util/tsc.h" #include <internal/lib.h> // page_size -#include "../../util/intel-pt.h" +#include "../../../util/intel-pt.h" #define KiB(x) ((x) * 1024) #define MiB(x) ((x) * 1024 * 1024) diff --git a/tools/perf/arch/x86/util/machine.c b/tools/perf/arch/x86/util/machine.c index e17e080e76f4..31679c35d493 100644 --- 
a/tools/perf/arch/x86/util/machine.c +++ b/tools/perf/arch/x86/util/machine.c @@ -5,9 +5,9 @@ #include <stdlib.h> #include <internal/lib.h> // page_size -#include "../../util/machine.h" -#include "../../util/map.h" -#include "../../util/symbol.h" +#include "../../../util/machine.h" +#include "../../../util/map.h" +#include "../../../util/symbol.h" #include <linux/ctype.h> #include <symbol/kallsyms.h> diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c index c218b83e063b..fca81b39b09f 100644 --- a/tools/perf/arch/x86/util/perf_regs.c +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -5,10 +5,10 @@ #include <linux/kernel.h> #include <linux/zalloc.h> -#include "../../perf-sys.h" -#include "../../util/perf_regs.h" -#include "../../util/debug.h" -#include "../../util/event.h" +#include "../../../perf-sys.h" +#include "../../../util/perf_regs.h" +#include "../../../util/debug.h" +#include "../../../util/event.h" const struct sample_reg sample_reg_masks[] = { SMPL_REG(AX, PERF_REG_X86_AX), diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c index e33ef5bc31c5..d48d608517fd 100644 --- a/tools/perf/arch/x86/util/pmu.c +++ b/tools/perf/arch/x86/util/pmu.c @@ -4,9 +4,9 @@ #include <linux/stddef.h> #include <linux/perf_event.h> -#include "../../util/intel-pt.h" -#include "../../util/intel-bts.h" -#include "../../util/pmu.h" +#include "../../../util/intel-pt.h" +#include "../../../util/intel-bts.h" +#include "../../../util/pmu.h" struct perf_event_attr *perf_pmu__get_default_config(struct perf_pmu *pmu __maybe_unused) { diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index fddb3ced9db6..4aa6de1aa67d 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -2,6 +2,10 @@ #ifndef BENCH_H #define BENCH_H +#include <sys/time.h> + +extern struct timeval bench__start, bench__end, bench__runtime; + /* * The madvise transparent hugepage constants were added in glibc * 2.13. 
For compatibility with older versions of glibc, define these diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index bb617e568841..cadc18d42aa4 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -35,7 +35,6 @@ static unsigned int nthreads = 0; static unsigned int nsecs = 8; -struct timeval start, end, runtime; static bool done, __verbose, randomize; /* @@ -94,8 +93,8 @@ static void toggle_done(int sig __maybe_unused, { /* inform all threads that we're done for the day */ done = true; - gettimeofday(&end, NULL); - timersub(&end, &start, &runtime); + gettimeofday(&bench__end, NULL); + timersub(&bench__end, &bench__start, &bench__runtime); } static void nest_epollfd(void) @@ -313,6 +312,7 @@ int bench_epoll_ctl(int argc, const char **argv) exit(EXIT_FAILURE); } + memset(&act, 0, sizeof(act)); sigfillset(&act.sa_mask); act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); @@ -361,7 +361,7 @@ int bench_epoll_ctl(int argc, const char **argv) threads_starting = nthreads; - gettimeofday(&start, NULL); + gettimeofday(&bench__start, NULL); do_threads(worker, cpu); diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index 7af694437f4e..f938c585d512 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -90,7 +90,6 @@ static unsigned int nthreads = 0; static unsigned int nsecs = 8; -struct timeval start, end, runtime; static bool wdone, done, __verbose, randomize, nonblocking; /* @@ -276,8 +275,8 @@ static void toggle_done(int sig __maybe_unused, { /* inform all threads that we're done for the day */ done = true; - gettimeofday(&end, NULL); - timersub(&end, &start, &runtime); + gettimeofday(&bench__end, NULL); + timersub(&bench__end, &bench__start, &bench__runtime); } static void print_summary(void) @@ -287,7 +286,7 @@ static void print_summary(void) printf("\nAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", avg, rel_stddev_stats(stddev, avg), - (int) runtime.tv_sec); + (int)bench__runtime.tv_sec); } static int do_threads(struct worker *worker, struct perf_cpu_map *cpu) @@ -427,6 +426,7 @@ int bench_epoll_wait(int argc, const char **argv) exit(EXIT_FAILURE); } + memset(&act, 0, sizeof(act)); sigfillset(&act.sa_mask); act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); @@ -479,7 +479,7 @@ int bench_epoll_wait(int argc, const char **argv) threads_starting = nthreads; - gettimeofday(&start, NULL); + gettimeofday(&bench__start, NULL); do_threads(worker, cpu); @@ -519,7 +519,7 @@ int bench_epoll_wait(int argc, const char **argv) qsort(worker, nthreads, sizeof(struct worker), cmpworker); for (i = 0; i < nthreads; i++) { - unsigned long t = worker[i].ops/runtime.tv_sec; + unsigned long t = worker[i].ops / bench__runtime.tv_sec; update_stats(&throughput_stats, t); diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index 8ba0c3330a9a..65eebe06c04d 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -37,7 +37,7 @@ static unsigned int nfutexes = 1024; static bool fshared = false, done = false, silent = false; static int futex_flag = 0; -struct timeval start, end, runtime; +struct timeval bench__start, bench__end, bench__runtime; static pthread_mutex_t thread_lock; static unsigned int threads_starting; static struct stats throughput_stats; @@ -103,8 +103,8 @@ static void toggle_done(int sig __maybe_unused, { /* inform all threads that we're done for the day */ done = true; - gettimeofday(&end, NULL); - timersub(&end, &start, 
&runtime); + gettimeofday(&bench__end, NULL); + timersub(&bench__end, &bench__start, &bench__runtime); } static void print_summary(void) @@ -114,7 +114,7 @@ static void print_summary(void) printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", !silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), - (int) runtime.tv_sec); + (int)bench__runtime.tv_sec); } int bench_futex_hash(int argc, const char **argv) @@ -137,6 +137,7 @@ int bench_futex_hash(int argc, const char **argv) if (!cpu) goto errmem; + memset(&act, 0, sizeof(act)); sigfillset(&act.sa_mask); act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); @@ -161,7 +162,7 @@ int bench_futex_hash(int argc, const char **argv) threads_starting = nthreads; pthread_attr_init(&thread_attr); - gettimeofday(&start, NULL); + gettimeofday(&bench__start, NULL); for (i = 0; i < nthreads; i++) { worker[i].tid = i; worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex)); @@ -204,7 +205,7 @@ int bench_futex_hash(int argc, const char **argv) pthread_mutex_destroy(&thread_lock); for (i = 0; i < nthreads; i++) { - unsigned long t = worker[i].ops/runtime.tv_sec; + unsigned long t = worker[i].ops / bench__runtime.tv_sec; update_stats(&throughput_stats, t); if (!silent) { if (nfutexes == 1) diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index d0cae8125423..89fd8f325f38 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -37,7 +37,6 @@ static bool silent = false, multi = false; static bool done = false, fshared = false; static unsigned int nthreads = 0; static int futex_flag = 0; -struct timeval start, end, runtime; static pthread_mutex_t thread_lock; static unsigned int threads_starting; static struct stats throughput_stats; @@ -64,7 +63,7 @@ static void print_summary(void) printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", !silent ? 
"\n" : "", avg, rel_stddev_stats(stddev, avg), - (int) runtime.tv_sec); + (int)bench__runtime.tv_sec); } static void toggle_done(int sig __maybe_unused, @@ -73,8 +72,8 @@ static void toggle_done(int sig __maybe_unused, { /* inform all threads that we're done for the day */ done = true; - gettimeofday(&end, NULL); - timersub(&end, &start, &runtime); + gettimeofday(&bench__end, NULL); + timersub(&bench__end, &bench__start, &bench__runtime); } static void *workerfn(void *arg) @@ -161,6 +160,7 @@ int bench_futex_lock_pi(int argc, const char **argv) if (!cpu) err(EXIT_FAILURE, "calloc"); + memset(&act, 0, sizeof(act)); sigfillset(&act.sa_mask); act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); @@ -185,7 +185,7 @@ int bench_futex_lock_pi(int argc, const char **argv) threads_starting = nthreads; pthread_attr_init(&thread_attr); - gettimeofday(&start, NULL); + gettimeofday(&bench__start, NULL); create_threads(worker, thread_attr, cpu); pthread_attr_destroy(&thread_attr); @@ -211,7 +211,7 @@ int bench_futex_lock_pi(int argc, const char **argv) pthread_mutex_destroy(&thread_lock); for (i = 0; i < nthreads; i++) { - unsigned long t = worker[i].ops/runtime.tv_sec; + unsigned long t = worker[i].ops / bench__runtime.tv_sec; update_stats(&throughput_stats, t); if (!silent) diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c index a00a6891447a..7a15c2e61022 100644 --- a/tools/perf/bench/futex-requeue.c +++ b/tools/perf/bench/futex-requeue.c @@ -128,6 +128,7 @@ int bench_futex_requeue(int argc, const char **argv) if (!cpu) err(EXIT_FAILURE, "cpu_map__new"); + memset(&act, 0, sizeof(act)); sigfillset(&act.sa_mask); act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index a053cf2b7039..cd2b81a845ac 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -234,6 +234,7 @@ int bench_futex_wake_parallel(int argc, const char **argv) exit(EXIT_FAILURE); } + memset(&act, 0, sizeof(act)); sigfillset(&act.sa_mask); act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c index df810096abfe..2dfcef3e371e 100644 --- a/tools/perf/bench/futex-wake.c +++ b/tools/perf/bench/futex-wake.c @@ -43,7 +43,7 @@ static bool done = false, silent = false, fshared = false; static pthread_mutex_t thread_lock; static pthread_cond_t thread_parent, thread_worker; static struct stats waketime_stats, wakeup_stats; -static unsigned int ncpus, threads_starting, nthreads = 0; +static unsigned int threads_starting, nthreads = 0; static int futex_flag = 0; static const struct option options[] = { @@ -136,12 +136,13 @@ int bench_futex_wake(int argc, const char **argv) if (!cpu) err(EXIT_FAILURE, "calloc"); + memset(&act, 0, sizeof(act)); sigfillset(&act.sa_mask); act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); if (!nthreads) - nthreads = ncpus; + nthreads = cpu->nr; worker = calloc(nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index f8b6ae557d8b..5e697cd2224a 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -572,29 +572,12 @@ static void init_block_hist(struct block_hist *bh) bh->valid = true; } -static int block_pair_cmp(struct hist_entry *a, struct hist_entry *b) -{ - struct block_info *bi_a = a->block_info; - struct block_info *bi_b = b->block_info; - int cmp; - - if 
(!bi_a->sym || !bi_b->sym) - return -1; - - cmp = strcmp(bi_a->sym->name, bi_b->sym->name); - - if ((!cmp) && (bi_a->start == bi_b->start) && (bi_a->end == bi_b->end)) - return 0; - - return -1; -} - static struct hist_entry *get_block_pair(struct hist_entry *he, struct hists *hists_pair) { struct rb_root_cached *root = hists_pair->entries_in; struct rb_node *next = rb_first_cached(root); - int cmp; + int64_t cmp; while (next != NULL) { struct hist_entry *he_pair = rb_entry(next, struct hist_entry, @@ -602,7 +585,7 @@ static struct hist_entry *get_block_pair(struct hist_entry *he, next = rb_next(&he_pair->rb_node_in); - cmp = block_pair_cmp(he_pair, he); + cmp = __block_info__cmp(he_pair, he); if (!cmp) return he_pair; } @@ -1312,7 +1295,8 @@ static int cycles_printf(struct hist_entry *he, struct hist_entry *pair, end_line = map__srcline(he->ms.map, bi->sym->start + bi->end, he->ms.sym); - if ((start_line != SRCLINE_UNKNOWN) && (end_line != SRCLINE_UNKNOWN)) { + if ((strncmp(start_line, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0) && + (strncmp(end_line, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0)) { scnprintf(buf, sizeof(buf), "[%s -> %s] %4ld", start_line, end_line, block_he->diff.cycles); } else { diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 72a12b69f120..5f4045df76f4 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -104,6 +104,7 @@ struct report { bool symbol_ipc; bool total_cycles_mode; struct block_report *block_reports; + int nr_block_reports; }; static int report__config(const char *var, const char *value, void *cb) @@ -185,24 +186,23 @@ static int hist_iter__branch_callback(struct hist_entry_iter *iter, { struct hist_entry *he = iter->he; struct report *rep = arg; - struct branch_info *bi; + struct branch_info *bi = he->branch_info; struct perf_sample *sample = iter->sample; struct evsel *evsel = iter->evsel; int err; + branch_type_count(&rep->brtype_stat, &bi->flags, + bi->from.addr, bi->to.addr); + if (!ui__has_annotation() && !rep->symbol_ipc) return 0; - bi = he->branch_info; err = addr_map_symbol__inc_samples(&bi->from, sample, evsel); if (err) goto out; err = addr_map_symbol__inc_samples(&bi->to, sample, evsel); - branch_type_count(&rep->brtype_stat, &bi->flags, - bi->from.addr, bi->to.addr); - out: return err; } @@ -966,8 +966,19 @@ static int __cmd_report(struct report *rep) report__output_resort(rep); if (rep->total_cycles_mode) { + int block_hpps[6] = { + PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT, + PERF_HPP_REPORT__BLOCK_LBR_CYCLES, + PERF_HPP_REPORT__BLOCK_CYCLES_PCT, + PERF_HPP_REPORT__BLOCK_AVG_CYCLES, + PERF_HPP_REPORT__BLOCK_RANGE, + PERF_HPP_REPORT__BLOCK_DSO, + }; + rep->block_reports = block_info__create_report(session->evlist, - rep->total_cycles); + rep->total_cycles, + block_hpps, 6, + &rep->nr_block_reports); if (!rep->block_reports) return -1; } @@ -1551,8 +1562,11 @@ error: zfree(&report.ptime_range); } - if (report.block_reports) - zfree(&report.block_reports); + if (report.block_reports) { + block_info__free_report(report.block_reports, + report.nr_block_reports); + report.block_reports = NULL; + } zstd_fini(&(session->zstd_data)); perf_session__delete(session); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index e2406b291c1c..656b347f6dd8 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -735,6 +735,7 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br 
= sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -743,8 +744,8 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, return 0; for (i = 0; i < br->nr; i++) { - from = br->entries[i].from; - to = br->entries[i].to; + from = entries[i].from; + to = entries[i].to; if (PRINT_FIELD(DSO)) { memset(&alf, 0, sizeof(alf)); @@ -768,10 +769,10 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, } printed += fprintf(fp, "/%c/%c/%c/%d ", - mispred_str( br->entries + i), - br->entries[i].flags.in_tx? 'X' : '-', - br->entries[i].flags.abort? 'A' : '-', - br->entries[i].flags.cycles); + mispred_str(entries + i), + entries[i].flags.in_tx ? 'X' : '-', + entries[i].flags.abort ? 'A' : '-', + entries[i].flags.cycles); } return printed; @@ -782,6 +783,7 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -793,8 +795,8 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, memset(&alf, 0, sizeof(alf)); memset(&alt, 0, sizeof(alt)); - from = br->entries[i].from; - to = br->entries[i].to; + from = entries[i].from; + to = entries[i].to; thread__find_symbol_fb(thread, sample->cpumode, from, &alf); thread__find_symbol_fb(thread, sample->cpumode, to, &alt); @@ -813,10 +815,10 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, printed += fprintf(fp, ")"); } printed += fprintf(fp, "/%c/%c/%c/%d ", - mispred_str( br->entries + i), - br->entries[i].flags.in_tx? 'X' : '-', - br->entries[i].flags.abort? 'A' : '-', - br->entries[i].flags.cycles); + mispred_str(entries + i), + entries[i].flags.in_tx ? 'X' : '-', + entries[i].flags.abort ? 'A' : '-', + entries[i].flags.cycles); } return printed; @@ -827,6 +829,7 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -838,8 +841,8 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, memset(&alf, 0, sizeof(alf)); memset(&alt, 0, sizeof(alt)); - from = br->entries[i].from; - to = br->entries[i].to; + from = entries[i].from; + to = entries[i].to; if (thread__find_map_fb(thread, sample->cpumode, from, &alf) && !alf.map->dso->adjust_symbols) @@ -862,10 +865,10 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, printed += fprintf(fp, ")"); } printed += fprintf(fp, "/%c/%c/%c/%d ", - mispred_str(br->entries + i), - br->entries[i].flags.in_tx ? 'X' : '-', - br->entries[i].flags.abort ? 'A' : '-', - br->entries[i].flags.cycles); + mispred_str(entries + i), + entries[i].flags.in_tx ? 'X' : '-', + entries[i].flags.abort ? 
'A' : '-', + entries[i].flags.cycles); } return printed; @@ -1053,6 +1056,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, struct machine *machine, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); u64 start, end; int i, insn, len, nr, ilen, printed = 0; struct perf_insn x; @@ -1073,31 +1077,31 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += fprintf(fp, "%c", '\n'); /* Handle first from jump, of which we don't know the entry. */ - len = grab_bb(buffer, br->entries[nr-1].from, - br->entries[nr-1].from, + len = grab_bb(buffer, entries[nr-1].from, + entries[nr-1].from, machine, thread, &x.is64bit, &x.cpumode, false); if (len > 0) { - printed += ip__fprintf_sym(br->entries[nr - 1].from, thread, + printed += ip__fprintf_sym(entries[nr - 1].from, thread, x.cpumode, x.cpu, &lastsym, attr, fp); - printed += ip__fprintf_jump(br->entries[nr - 1].from, &br->entries[nr - 1], + printed += ip__fprintf_jump(entries[nr - 1].from, &entries[nr - 1], &x, buffer, len, 0, fp, &total_cycles); if (PRINT_FIELD(SRCCODE)) - printed += print_srccode(thread, x.cpumode, br->entries[nr - 1].from); + printed += print_srccode(thread, x.cpumode, entries[nr - 1].from); } /* Print all blocks */ for (i = nr - 2; i >= 0; i--) { - if (br->entries[i].from || br->entries[i].to) + if (entries[i].from || entries[i].to) pr_debug("%d: %" PRIx64 "-%" PRIx64 "\n", i, - br->entries[i].from, - br->entries[i].to); - start = br->entries[i + 1].to; - end = br->entries[i].from; + entries[i].from, + entries[i].to); + start = entries[i + 1].to; + end = entries[i].from; len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false); /* Patch up missing kernel transfers due to ring filters */ if (len == -ENXIO && i > 0) { - end = br->entries[--i].from; + end = entries[--i].from; pr_debug("\tpatching up to %" PRIx64 "-%" PRIx64 "\n", start, end); len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false); } @@ -1110,7 +1114,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp); if (ip == end) { - printed += ip__fprintf_jump(ip, &br->entries[i], &x, buffer + off, len - off, ++insn, fp, + printed += ip__fprintf_jump(ip, &entries[i], &x, buffer + off, len - off, ++insn, fp, &total_cycles); if (PRINT_FIELD(SRCCODE)) printed += print_srccode(thread, x.cpumode, ip); @@ -1134,9 +1138,9 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, * Hit the branch? In this case we are already done, and the target * has not been executed yet. */ - if (br->entries[0].from == sample->ip) + if (entries[0].from == sample->ip) goto out; - if (br->entries[0].flags.abort) + if (entries[0].flags.abort) goto out; /* @@ -1147,7 +1151,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, * between final branch and sample. When this happens just * continue walking after the last TO until we hit a branch. */ - start = br->entries[0].to; + start = entries[0].to; end = sample->ip; if (end < start) { /* Missing jump. 
Scan 128 bytes for the next branch */ diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a098c2ebf4ea..ec053dc1e35c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -929,6 +929,10 @@ static struct option stat_options[] = { OPT_BOOLEAN_FLAG(0, "all-user", &stat_config.all_user, "Configure all used events to run in user space.", PARSE_OPT_EXCLUSIVE), + OPT_BOOLEAN(0, "percore-show-thread", &stat_config.percore_show_thread, + "Use with 'percore' event qualifier to show the event " + "counts of one hardware thread by sum up total hardware " + "threads of same physical core"), OPT_END() }; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index f6dd1a63f159..d2539b793f9d 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -684,7 +684,9 @@ repeat: delay_msecs = top->delay_secs * MSEC_PER_SEC; set_term_quiet_input(&save); /* trash return*/ - getc(stdin); + clearerr(stdin); + if (poll(&stdin_poll, 1, 0) > 0) + getc(stdin); while (!done) { perf_top__print_sym_table(top); diff --git a/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json b/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json index 5e36bc2468d0..c998e4f1d1d2 100644 --- a/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json +++ b/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json @@ -4,27 +4,27 @@ "EventCode": "80", "EventName": "ECC_FUNCTION_COUNT", "BriefDescription": "ECC Function Count", - "PublicDescription": "Long ECC function Count" + "PublicDescription": "This counter counts the total number of the elliptic-curve cryptography (ECC) functions issued by the CPU." }, { "Unit": "CPU-M-CF", "EventCode": "81", "EventName": "ECC_CYCLES_COUNT", "BriefDescription": "ECC Cycles Count", - "PublicDescription": "Long ECC Function cycles count" + "PublicDescription": "This counter counts the total number of CPU cycles when the ECC coprocessor is busy performing the elliptic-curve cryptography (ECC) functions issued by the CPU." }, { "Unit": "CPU-M-CF", "EventCode": "82", "EventName": "ECC_BLOCKED_FUNCTION_COUNT", "BriefDescription": "Ecc Blocked Function Count", - "PublicDescription": "Long ECC blocked function count" + "PublicDescription": "This counter counts the total number of the elliptic-curve cryptography (ECC) functions that are issued by the CPU and are blocked because the ECC coprocessor is busy performing a function issued by another CPU." }, { "Unit": "CPU-M-CF", "EventCode": "83", "EventName": "ECC_BLOCKED_CYCLES_COUNT", "BriefDescription": "ECC Blocked Cycles Count", - "PublicDescription": "Long ECC blocked cycles count" + "PublicDescription": "This counter counts the total number of CPU cycles blocked for the elliptic-curve cryptography (ECC) functions issued by the CPU because the ECC coprocessor is busy performing a function issued by another CPU." 
}, ] diff --git a/tools/perf/pmu-events/arch/s390/cf_z15/extended.json b/tools/perf/pmu-events/arch/s390/cf_z15/extended.json index 89e070727e1b..2df2e231e9ee 100644 --- a/tools/perf/pmu-events/arch/s390/cf_z15/extended.json +++ b/tools/perf/pmu-events/arch/s390/cf_z15/extended.json @@ -25,7 +25,7 @@ "EventCode": "131", "EventName": "DTLB2_HPAGE_WRITES", "BriefDescription": "DTLB2 One-Megabyte Page Writes", - "PublicDescription": "A translation entry was written into the Combined Region and Segment Table Entry array in the Level-2 TLB for a one-megabyte page or a Last Host Translation was done" + "PublicDescription": "A translation entry was written into the Combined Region and Segment Table Entry array in the Level-2 TLB for a one-megabyte page" }, { "Unit": "CPU-M-CF", @@ -358,6 +358,34 @@ }, { "Unit": "CPU-M-CF", + "EventCode": "247", + "EventName": "DFLT_ACCESS", + "BriefDescription": "Cycles CPU spent obtaining access to Deflate unit", + "PublicDescription": "Cycles CPU spent obtaining access to Deflate unit" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "252", + "EventName": "DFLT_CYCLES", + "BriefDescription": "Cycles CPU is using Deflate unit", + "PublicDescription": "Cycles CPU is using Deflate unit" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "264", + "EventName": "DFLT_CC", + "BriefDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed", + "PublicDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "265", + "EventName": "DFLT_CCERROR", + "BriefDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed that ended in Condition Codes 0, 1 or 2", + "PublicDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed that ended in Condition Codes 0, 1 or 2" + }, + { + "Unit": "CPU-M-CF", "EventCode": "448", "EventName": "MT_DIAG_CYCLES_ONE_THR_ACTIVE", "BriefDescription": "Cycle count with one thread active", diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json index f94653229dd4..a728c6e5119b 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json @@ -215,7 +215,8 @@ "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )", "MetricGroup": "TLB", - "MetricName": "Page_Walks_Utilization" + "MetricName": "Page_Walks_Utilization", + "MetricConstraint": "NO_NMI_WATCHDOG" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json index e7feb60f9fa9..f97e8316ad2f 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json @@ -215,7 +215,8 @@ "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )", "MetricGroup": "TLB", - "MetricName": "Page_Walks_Utilization" 
+ "MetricName": "Page_Walks_Utilization", + "MetricConstraint": "NO_NMI_WATCHDOG" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index 21d7a0c2c2e8..35f5db1786f7 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -215,7 +215,8 @@ "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )", "MetricGroup": "TLB", - "MetricName": "Page_Walks_Utilization" + "MetricName": "Page_Walks_Utilization", + "MetricConstraint": "NO_NMI_WATCHDOG" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index 079c77b6a2fd..3c4236a5bad8 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -323,7 +323,7 @@ static int print_events_table_entry(void *data, char *name, char *event, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated) + char *deprecated, char *metric_constraint) { struct perf_entry_data *pd = data; FILE *outfp = pd->outfp; @@ -357,6 +357,8 @@ static int print_events_table_entry(void *data, char *name, char *event, fprintf(outfp, "\t.metric_group = \"%s\",\n", metric_group); if (deprecated) fprintf(outfp, "\t.deprecated = \"%s\",\n", deprecated); + if (metric_constraint) + fprintf(outfp, "\t.metric_constraint = \"%s\",\n", metric_constraint); fprintf(outfp, "},\n"); return 0; @@ -375,6 +377,7 @@ struct event_struct { char *metric_name; char *metric_group; char *deprecated; + char *metric_constraint; }; #define ADD_EVENT_FIELD(field) do { if (field) { \ @@ -422,7 +425,7 @@ static int save_arch_std_events(void *data, char *name, char *event, char *desc, char *long_desc, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated) + char *deprecated, char *metric_constraint) { struct event_struct *es; @@ -486,7 +489,7 @@ try_fixup(const char *fn, char *arch_std, char **event, char **desc, char **name, char **long_desc, char **pmu, char **filter, char **perpkg, char **unit, char **metric_expr, char **metric_name, char **metric_group, unsigned long long eventcode, - char **deprecated) + char **deprecated, char **metric_constraint) { /* try to find matching event from arch standard values */ struct event_struct *es; @@ -515,7 +518,7 @@ int json_events(const char *fn, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated), + char *deprecated, char *metric_constraint), void *data) { int err; @@ -545,6 +548,7 @@ int json_events(const char *fn, char *metric_name = NULL; char *metric_group = NULL; char *deprecated = NULL; + char *metric_constraint = NULL; char *arch_std = NULL; unsigned long long eventcode = 0; struct msrmap *msr = NULL; @@ -629,6 +633,8 @@ int json_events(const char *fn, addfield(map, &metric_name, "", "", val); } else if (json_streq(map, field, "MetricGroup")) { addfield(map, &metric_group, "", "", val); + } else if 
(json_streq(map, field, "MetricConstraint")) { + addfield(map, &metric_constraint, "", "", val); } else if (json_streq(map, field, "MetricExpr")) { addfield(map, &metric_expr, "", "", val); for (s = metric_expr; *s; s++) @@ -670,13 +676,13 @@ int json_events(const char *fn, &long_desc, &pmu, &filter, &perpkg, &unit, &metric_expr, &metric_name, &metric_group, eventcode, - &deprecated); + &deprecated, &metric_constraint); if (err) goto free_strings; } err = func(data, name, real_event(name, event), desc, long_desc, pmu, unit, perpkg, metric_expr, metric_name, - metric_group, deprecated); + metric_group, deprecated, metric_constraint); free_strings: free(event); free(desc); @@ -691,6 +697,7 @@ free_strings: free(metric_expr); free(metric_name); free(metric_group); + free(metric_constraint); free(arch_std); if (err) @@ -1082,10 +1089,9 @@ static int process_one_file(const char *fpath, const struct stat *sb, */ int main(int argc, char *argv[]) { - int rc; + int rc, ret = 0; int maxfds; char ldirname[PATH_MAX]; - const char *arch; const char *output_file; const char *start_dirname; @@ -1156,7 +1162,8 @@ int main(int argc, char *argv[]) /* Make build fail */ fclose(eventsfp); free_arch_std_events(); - return 1; + ret = 1; + goto out_free_mapfile; } else if (rc) { goto empty_map; } @@ -1174,14 +1181,17 @@ int main(int argc, char *argv[]) /* Make build fail */ fclose(eventsfp); free_arch_std_events(); - return 1; + ret = 1; } - return 0; + + goto out_free_mapfile; empty_map: fclose(eventsfp); create_empty_mapping(output_file); free_arch_std_events(); - return 0; +out_free_mapfile: + free(mapfile); + return ret; } diff --git a/tools/perf/pmu-events/jevents.h b/tools/perf/pmu-events/jevents.h index 5cda49a42143..2afc8304529e 100644 --- a/tools/perf/pmu-events/jevents.h +++ b/tools/perf/pmu-events/jevents.h @@ -8,7 +8,7 @@ int json_events(const char *fn, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated), + char *deprecated, char *metric_constraint), void *data); char *get_cpu_str(void); diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h index caeb577d36c9..53e76d5d5b37 100644 --- a/tools/perf/pmu-events/pmu-events.h +++ b/tools/perf/pmu-events/pmu-events.h @@ -18,6 +18,7 @@ struct pmu_event { const char *metric_name; const char *metric_group; const char *deprecated; + const char *metric_constraint; }; /* diff --git a/tools/perf/scripts/perl/check-perf-trace.pl b/tools/perf/scripts/perl/check-perf-trace.pl index 4e7076c20616..d307ce8fd6ed 100644 --- a/tools/perf/scripts/perl/check-perf-trace.pl +++ b/tools/perf/scripts/perl/check-perf-trace.pl @@ -28,7 +28,7 @@ sub trace_end sub irq::softirq_entry { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $vec) = @_; print_header($event_name, $common_cpu, $common_secs, $common_nsecs, @@ -43,7 +43,7 @@ sub irq::softirq_entry sub kmem::kmalloc { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $call_site, $ptr, $bytes_req, $bytes_alloc, $gfp_flags) = @_; @@ -92,7 +92,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/failed-syscalls.pl 
b/tools/perf/scripts/perl/failed-syscalls.pl index 55e7ae4c5c88..05954a8f363a 100644 --- a/tools/perf/scripts/perl/failed-syscalls.pl +++ b/tools/perf/scripts/perl/failed-syscalls.pl @@ -18,7 +18,7 @@ my %failed_syscalls; sub raw_syscalls::sys_exit { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $id, $ret) = @_; if ($ret < 0) { diff --git a/tools/perf/scripts/perl/rw-by-file.pl b/tools/perf/scripts/perl/rw-by-file.pl index 168fa5e94b44..92a750b8552b 100644 --- a/tools/perf/scripts/perl/rw-by-file.pl +++ b/tools/perf/scripts/perl/rw-by-file.pl @@ -28,7 +28,7 @@ my %writes; sub syscalls::sys_enter_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, $nr, $fd, $buf, $count) = @_; + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; if ($common_comm eq $for_comm) { $reads{$fd}{bytes_requested} += $count; @@ -39,7 +39,7 @@ sub syscalls::sys_enter_read sub syscalls::sys_enter_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, $nr, $fd, $buf, $count) = @_; + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; if ($common_comm eq $for_comm) { $writes{$fd}{bytes_written} += $count; @@ -98,7 +98,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/rw-by-pid.pl b/tools/perf/scripts/perl/rw-by-pid.pl index 495698250b2f..d789fe39caab 100644 --- a/tools/perf/scripts/perl/rw-by-pid.pl +++ b/tools/perf/scripts/perl/rw-by-pid.pl @@ -24,7 +24,7 @@ my %writes; sub syscalls::sys_exit_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; if ($ret > 0) { @@ -40,7 +40,7 @@ sub syscalls::sys_exit_read sub syscalls::sys_enter_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; $reads{$common_pid}{bytes_requested} += $count; @@ -51,7 +51,7 @@ sub syscalls::sys_enter_read sub syscalls::sys_exit_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; if ($ret <= 0) { @@ -62,7 +62,7 @@ sub syscalls::sys_exit_write sub syscalls::sys_enter_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; $writes{$common_pid}{bytes_written} += $count; @@ -178,7 +178,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/rwtop.pl b/tools/perf/scripts/perl/rwtop.pl index 6473442568a2..eba4df67af6b 100644 --- a/tools/perf/scripts/perl/rwtop.pl +++ b/tools/perf/scripts/perl/rwtop.pl @@ -35,7 +35,7 @@ if (!$interval) { sub syscalls::sys_exit_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, 
$common_callchain, $nr, $ret) = @_; print_check(); @@ -53,7 +53,7 @@ sub syscalls::sys_exit_read sub syscalls::sys_enter_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; print_check(); @@ -66,7 +66,7 @@ sub syscalls::sys_enter_read sub syscalls::sys_exit_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; print_check(); @@ -79,7 +79,7 @@ sub syscalls::sys_exit_write sub syscalls::sys_enter_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; print_check(); @@ -197,7 +197,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/wakeup-latency.pl b/tools/perf/scripts/perl/wakeup-latency.pl index efcfec5e347a..53444ff4ec7f 100644 --- a/tools/perf/scripts/perl/wakeup-latency.pl +++ b/tools/perf/scripts/perl/wakeup-latency.pl @@ -28,7 +28,7 @@ my $total_wakeups = 0; sub sched::sched_switch { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $prev_comm, $prev_pid, $prev_prio, $prev_state, $next_comm, $next_pid, $next_prio) = @_; @@ -51,7 +51,7 @@ sub sched::sched_switch sub sched::sched_wakeup { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $comm, $pid, $prio, $success, $target_cpu) = @_; $last_wakeup{$target_cpu}{ts} = nsecs($common_secs, $common_nsecs); @@ -101,7 +101,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/tests/bp_account.c b/tools/perf/tests/bp_account.c index d0b935356274..489b50604cf2 100644 --- a/tools/perf/tests/bp_account.c +++ b/tools/perf/tests/bp_account.c @@ -19,7 +19,7 @@ #include "../perf-sys.h" #include "cloexec.h" -volatile long the_var; +static volatile long the_var; static noinline int test_function(void) { diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 5f05db75cdd8..54d9516c9839 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -543,8 +543,11 @@ static int run_shell_tests(int argc, const char *argv[], int i, int width) return -1; dir = opendir(st.dir); - if (!dir) + if (!dir) { + pr_err("failed to open shell test directory: %s\n", + st.dir); return -1; + } for_each_shell_test(dir, st.dir, ent) { int curr = i++; diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 87843af4c118..28313e59d6f6 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -10,7 +10,7 @@ static int test(struct parse_ctx *ctx, const char *e, double val2) { double val; - if (expr__parse(&val, ctx, &e)) + if (expr__parse(&val, ctx, e)) TEST_ASSERT_VAL("parse test failed", 0); TEST_ASSERT_VAL("unexpected value", val == val2); return 0; @@ -44,12 +44,12 @@ int test__expr(struct test *t __maybe_unused, int subtest 
__maybe_unused) return ret; p = "FOO/0"; - ret = expr__parse(&val, &ctx, &p); - TEST_ASSERT_VAL("division by zero", ret == 1); + ret = expr__parse(&val, &ctx, p); + TEST_ASSERT_VAL("division by zero", ret == -1); p = "BAR/"; - ret = expr__parse(&val, &ctx, &p); - TEST_ASSERT_VAL("missing operand", ret == 1); + ret = expr__parse(&val, &ctx, p); + TEST_ASSERT_VAL("missing operand", ret == -1); TEST_ASSERT_VAL("find other", expr__find_other("FOO + BAR + BAZ + BOZO", "FOO", &other, &num_other) == 0); diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 2762e1155238..14239e472187 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -99,6 +99,7 @@ static bool samples_same(const struct perf_sample *s1, if (type & PERF_SAMPLE_BRANCH_STACK) { COMP(branch_stack->nr); + COMP(branch_stack->hw_idx); for (i = 0; i < s1->branch_stack->nr; i++) MCOMP(branch_stack->entries[i]); } @@ -186,7 +187,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) u64 data[64]; } branch_stack = { /* 1 branch_entry */ - .data = {1, 211, 212, 213}, + .data = {1, -1ULL, 211, 212, 213}, }; u64 regs[64]; const u64 raw_data[] = {0x123456780a0b0c0dULL, 0x1102030405060708ULL}; @@ -208,6 +209,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) .transaction = 112, .raw_data = (void *)raw_data, .callchain = &callchain.callchain, + .no_hw_idx = false, .branch_stack = &branch_stack.branch_stack, .user_regs = { .abi = PERF_SAMPLE_REGS_ABI_64, @@ -244,6 +246,9 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) if (sample_type & PERF_SAMPLE_REGS_INTR) evsel.core.attr.sample_regs_intr = sample_regs; + if (sample_type & PERF_SAMPLE_BRANCH_STACK) + evsel.core.attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX; + for (i = 0; i < sizeof(regs); i++) *(i + (u8 *)regs) = i & 0xfe; diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 07da6c790b63..c0cf8dff694e 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -121,7 +121,9 @@ perf-y += mem-events.o perf-y += vsprintf.o perf-y += units.o perf-y += time-utils.o +perf-y += expr-flex.o perf-y += expr-bison.o +perf-y += expr.o perf-y += branch.o perf-y += mem2node.o @@ -189,9 +191,13 @@ $(OUTPUT)util/parse-events-bison.c: util/parse-events.y $(call rule_mkdir) $(Q)$(call echo-cmd,bison)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $@ -p parse_events_ +$(OUTPUT)util/expr-flex.c: util/expr.l $(OUTPUT)util/expr-bison.c + $(call rule_mkdir) + $(Q)$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/expr-flex.h $(PARSER_DEBUG_FLEX) util/expr.l + $(OUTPUT)util/expr-bison.c: util/expr.y $(call rule_mkdir) - $(Q)$(call echo-cmd,bison)$(BISON) -v util/expr.y -d $(PARSER_DEBUG_BISON) -o $@ -p expr__ + $(Q)$(call echo-cmd,bison)$(BISON) -v util/expr.y -d $(PARSER_DEBUG_BISON) -o $@ -p expr_ $(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c $(call rule_mkdir) @@ -203,12 +209,14 @@ $(OUTPUT)util/pmu-bison.c: util/pmu.y CFLAGS_parse-events-flex.o += -w CFLAGS_pmu-flex.o += -w +CFLAGS_expr-flex.o += -w CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -w CFLAGS_pmu-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w CFLAGS_expr-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w $(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c $(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c +$(OUTPUT)util/expr.o: $(OUTPUT)util/expr-flex.c 
$(OUTPUT)util/expr-bison.c CFLAGS_bitmap.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_find_bit.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" @@ -216,6 +224,7 @@ CFLAGS_rbtree.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ET CFLAGS_libstring.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_hweight.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_parse-events.o += -Wno-redundant-decls +CFLAGS_expr.o += -Wno-redundant-decls CFLAGS_header.o += -include $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 0ea95be84b3b..f1ea0d61eb5b 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2611,8 +2611,6 @@ void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) if (++al->jump_sources > notes->max_jump_sources) notes->max_jump_sources = al->jump_sources; - - ++notes->nr_jumps; } } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 001258601a37..07c775938d46 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -279,7 +279,6 @@ struct annotation { struct annotation_options *options; struct annotation_line **offsets; int nr_events; - int nr_jumps; int max_jump_sources; int nr_entries; int nr_asm_entries; diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index c4b030bf6ec2..423ec69bda6c 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -65,8 +65,7 @@ struct block_info *block_info__new(void) return bi; } -int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, - struct hist_entry *left, struct hist_entry *right) +int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right) { struct block_info *bi_l = left->block_info; struct block_info *bi_r = right->block_info; @@ -74,30 +73,27 @@ int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, if (!bi_l->sym || !bi_r->sym) { if (!bi_l->sym && !bi_r->sym) - return 0; + return -1; else if (!bi_l->sym) return -1; else return 1; } - if (bi_l->sym == bi_r->sym) { - if (bi_l->start == bi_r->start) { - if (bi_l->end == bi_r->end) - return 0; - else - return (int64_t)(bi_r->end - bi_l->end); - } else - return (int64_t)(bi_r->start - bi_l->start); - } else { - cmp = strcmp(bi_l->sym->name, bi_r->sym->name); + cmp = strcmp(bi_l->sym->name, bi_r->sym->name); + if (cmp) return cmp; - } - if (bi_l->sym->start != bi_r->sym->start) - return (int64_t)(bi_r->sym->start - bi_l->sym->start); + if (bi_l->start != bi_r->start) + return (int64_t)(bi_r->start - bi_l->start); - return (int64_t)(bi_r->sym->end - bi_l->sym->end); + return (int64_t)(bi_r->end - bi_l->end); +} + +int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, + struct hist_entry *left, struct hist_entry *right) +{ + return __block_info__cmp(left, right); } static void init_block_info(struct block_info *bi, struct symbol *sym, @@ -185,6 +181,17 @@ static int block_column_width(struct perf_hpp_fmt *fmt, return block_fmt->width; } +static int color_pct(struct perf_hpp *hpp, int width, double pct) +{ +#ifdef HAVE_SLANG_SUPPORT + if (use_browser) { + return __hpp__slsmg_color_printf(hpp, "%*.2f%%", + width - 1, pct); + } +#endif + return hpp_color_scnprintf(hpp, "%*.2f%%", width - 1, pct); +} + static int block_total_cycles_pct_entry(struct perf_hpp_fmt *fmt, struct perf_hpp 
*hpp, struct hist_entry *he) @@ -192,14 +199,11 @@ static int block_total_cycles_pct_entry(struct perf_hpp_fmt *fmt, struct block_fmt *block_fmt = container_of(fmt, struct block_fmt, fmt); struct block_info *bi = he->block_info; double ratio = 0.0; - char buf[16]; if (block_fmt->total_cycles) ratio = (double)bi->cycles / (double)block_fmt->total_cycles; - sprintf(buf, "%.2f%%", 100.0 * ratio); - - return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, buf); + return color_pct(hpp, block_fmt->width, 100.0 * ratio); } static int64_t block_total_cycles_pct_sort(struct perf_hpp_fmt *fmt, @@ -252,16 +256,13 @@ static int block_cycles_pct_entry(struct perf_hpp_fmt *fmt, struct block_info *bi = he->block_info; double ratio = 0.0; u64 avg; - char buf[16]; if (block_fmt->block_cycles && bi->num_aggr) { avg = bi->cycles_aggr / bi->num_aggr; ratio = (double)avg / (double)block_fmt->block_cycles; } - sprintf(buf, "%.2f%%", 100.0 * ratio); - - return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, buf); + return color_pct(hpp, block_fmt->width, 100.0 * ratio); } static int block_avg_cycles_entry(struct perf_hpp_fmt *fmt, @@ -295,7 +296,8 @@ static int block_range_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, end_line = map__srcline(he->ms.map, bi->sym->start + bi->end, he->ms.sym); - if ((start_line != SRCLINE_UNKNOWN) && (end_line != SRCLINE_UNKNOWN)) { + if ((strncmp(start_line, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0) && + (strncmp(end_line, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0)) { scnprintf(buf, sizeof(buf), "[%s -> %s]", start_line, end_line); } else { @@ -348,7 +350,7 @@ static void hpp_register(struct block_fmt *block_fmt, int idx, switch (idx) { case PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT: - fmt->entry = block_total_cycles_pct_entry; + fmt->color = block_total_cycles_pct_entry; fmt->cmp = block_info__cmp; fmt->sort = block_total_cycles_pct_sort; break; @@ -356,7 +358,7 @@ static void hpp_register(struct block_fmt *block_fmt, int idx, fmt->entry = block_cycles_lbr_entry; break; case PERF_HPP_REPORT__BLOCK_CYCLES_PCT: - fmt->entry = block_cycles_pct_entry; + fmt->color = block_cycles_pct_entry; break; case PERF_HPP_REPORT__BLOCK_AVG_CYCLES: fmt->entry = block_avg_cycles_entry; @@ -376,33 +378,41 @@ static void hpp_register(struct block_fmt *block_fmt, int idx, } static void register_block_columns(struct perf_hpp_list *hpp_list, - struct block_fmt *block_fmts) + struct block_fmt *block_fmts, + int *block_hpps, int nr_hpps) { - for (int i = 0; i < PERF_HPP_REPORT__BLOCK_MAX_INDEX; i++) - hpp_register(&block_fmts[i], i, hpp_list); + for (int i = 0; i < nr_hpps; i++) + hpp_register(&block_fmts[i], block_hpps[i], hpp_list); } -static void init_block_hist(struct block_hist *bh, struct block_fmt *block_fmts) +static void init_block_hist(struct block_hist *bh, struct block_fmt *block_fmts, + int *block_hpps, int nr_hpps) { __hists__init(&bh->block_hists, &bh->block_list); perf_hpp_list__init(&bh->block_list); bh->block_list.nr_header_lines = 1; - register_block_columns(&bh->block_list, block_fmts); + register_block_columns(&bh->block_list, block_fmts, + block_hpps, nr_hpps); - perf_hpp_list__register_sort_field(&bh->block_list, - &block_fmts[PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT].fmt); + /* Sort by the first fmt */ + perf_hpp_list__register_sort_field(&bh->block_list, &block_fmts[0].fmt); } -static void process_block_report(struct hists *hists, - struct block_report *block_report, - u64 total_cycles) +static int process_block_report(struct hists *hists, + 
struct block_report *block_report, + u64 total_cycles, int *block_hpps, + int nr_hpps) { struct rb_node *next = rb_first_cached(&hists->entries); struct block_hist *bh = &block_report->hist; struct hist_entry *he; - init_block_hist(bh, block_report->fmts); + if (nr_hpps > PERF_HPP_REPORT__BLOCK_MAX_INDEX) + return -1; + + block_report->nr_fmts = nr_hpps; + init_block_hist(bh, block_report->fmts, block_hpps, nr_hpps); while (next) { he = rb_entry(next, struct hist_entry, rb_node); @@ -411,16 +421,19 @@ static void process_block_report(struct hists *hists, next = rb_next(&he->rb_node); } - for (int i = 0; i < PERF_HPP_REPORT__BLOCK_MAX_INDEX; i++) { + for (int i = 0; i < nr_hpps; i++) { block_report->fmts[i].total_cycles = total_cycles; block_report->fmts[i].block_cycles = block_report->cycles; } hists__output_resort(&bh->block_hists, NULL); + return 0; } struct block_report *block_info__create_report(struct evlist *evlist, - u64 total_cycles) + u64 total_cycles, + int *block_hpps, int nr_hpps, + int *nr_reps) { struct block_report *block_reports; int nr_hists = evlist->core.nr_entries, i = 0; @@ -433,13 +446,23 @@ struct block_report *block_info__create_report(struct evlist *evlist, evlist__for_each_entry(evlist, pos) { struct hists *hists = evsel__hists(pos); - process_block_report(hists, &block_reports[i], total_cycles); + process_block_report(hists, &block_reports[i], total_cycles, + block_hpps, nr_hpps); i++; } + *nr_reps = nr_hists; return block_reports; } +void block_info__free_report(struct block_report *reps, int nr_reps) +{ + for (int i = 0; i < nr_reps; i++) + hists__delete_entries(&reps[i].hist.block_hists); + + free(reps); +} + int report__browse_block_hists(struct block_hist *bh, float min_percent, struct evsel *evsel, struct perf_env *env, struct annotation_options *annotation_opts) @@ -451,13 +474,11 @@ int report__browse_block_hists(struct block_hist *bh, float min_percent, symbol_conf.report_individual_block = true; hists__fprintf(&bh->block_hists, true, 0, 0, min_percent, stdout, true); - hists__delete_entries(&bh->block_hists); return 0; case 1: symbol_conf.report_individual_block = true; ret = block_hists_tui_browse(bh, evsel, min_percent, env, annotation_opts); - hists__delete_entries(&bh->block_hists); return ret; default: return -1; diff --git a/tools/perf/util/block-info.h b/tools/perf/util/block-info.h index bef0d75e9819..42e9dcc4cf0a 100644 --- a/tools/perf/util/block-info.h +++ b/tools/perf/util/block-info.h @@ -45,6 +45,7 @@ struct block_report { struct block_hist hist; u64 cycles; struct block_fmt fmts[PERF_HPP_REPORT__BLOCK_MAX_INDEX]; + int nr_fmts; }; struct block_hist; @@ -61,6 +62,8 @@ static inline void __block_info__zput(struct block_info **bi) #define block_info__zput(bi) __block_info__zput(&bi) +int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right); + int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, struct hist_entry *left, struct hist_entry *right); @@ -68,7 +71,11 @@ int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, u64 *block_cycles_aggr, u64 total_cycles); struct block_report *block_info__create_report(struct evlist *evlist, - u64 total_cycles); + u64 total_cycles, + int *block_hpps, int nr_hpps, + int *nr_reps); + +void block_info__free_report(struct block_report *reps, int nr_reps); int report__browse_block_hists(struct block_hist *bh, float min_percent, struct evsel *evsel, struct perf_env *env, diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h index 
88e00d268f6f..154a05cd03af 100644 --- a/tools/perf/util/branch.h +++ b/tools/perf/util/branch.h @@ -12,6 +12,7 @@ #include <linux/stddef.h> #include <linux/perf_event.h> #include <linux/types.h> +#include "event.h" struct branch_flags { u64 mispred:1; @@ -39,9 +40,30 @@ struct branch_entry { struct branch_stack { u64 nr; + u64 hw_idx; struct branch_entry entries[0]; }; +/* + * The hw_idx is only available when PERF_SAMPLE_BRANCH_HW_INDEX is applied. + * Otherwise, the output format of a sample with branch stack is + * struct branch_stack { + * u64 nr; + * struct branch_entry entries[0]; + * } + * Check whether the hw_idx is available, + * and return the corresponding pointer of entries[0]. + */ +static inline struct branch_entry *perf_sample__branch_entries(struct perf_sample *sample) +{ + u64 *entry = (u64 *)sample->branch_stack; + + entry++; + if (sample->no_hw_idx) + return (struct branch_entry *)entry; + return (struct branch_entry *)(++entry); +} + struct branch_type_stat { bool branch_to; u64 counts[PERF_BR_MAX]; diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index 4881d4af3381..5bc9d3b01bd9 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -3,75 +3,16 @@ #include "evsel.h" #include "cgroup.h" #include "evlist.h" -#include <linux/stringify.h> #include <linux/zalloc.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <stdlib.h> #include <string.h> +#include <api/fs/fs.h> int nr_cgroups; -static int -cgroupfs_find_mountpoint(char *buf, size_t maxlen) -{ - FILE *fp; - char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1]; - char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path; - char *token, *saved_ptr = NULL; - - fp = fopen("/proc/mounts", "r"); - if (!fp) - return -1; - - /* - * in order to handle split hierarchy, we need to scan /proc/mounts - * and inspect every cgroupfs mount point to find one that has - * perf_event subsystem - */ - path_v1[0] = '\0'; - path_v2[0] = '\0'; - - while (fscanf(fp, "%*s %"__stringify(PATH_MAX)"s %"__stringify(PATH_MAX)"s %" - __stringify(PATH_MAX)"s %*d %*d\n", - mountpoint, type, tokens) == 3) { - - if (!path_v1[0] && !strcmp(type, "cgroup")) { - - token = strtok_r(tokens, ",", &saved_ptr); - - while (token != NULL) { - if (!strcmp(token, "perf_event")) { - strcpy(path_v1, mountpoint); - break; - } - token = strtok_r(NULL, ",", &saved_ptr); - } - } - - if (!path_v2[0] && !strcmp(type, "cgroup2")) - strcpy(path_v2, mountpoint); - - if (path_v1[0] && path_v2[0]) - break; - } - fclose(fp); - - if (path_v1[0]) - path = path_v1; - else if (path_v2[0]) - path = path_v2; - else - return -1; - - if (strlen(path) < maxlen) { - strcpy(buf, path); - return 0; - } - return -1; -} - static int open_cgroup(const char *name) { char path[PATH_MAX + 1]; @@ -79,7 +20,7 @@ static int open_cgroup(const char *name) int fd; - if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1)) + if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1, "perf_event")) return -1; scnprintf(path, PATH_MAX, "%s/%s", mnt, name); diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 5471045ebf5c..62d2f9b9ce1b 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -363,6 +363,23 @@ struct cs_etm_packet_queue return NULL; } +static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm, + struct cs_etm_traceid_queue *tidq) +{ + struct cs_etm_packet *tmp; + + if (etm->sample_branches || etm->synth_opts.last_branch || + etm->sample_instructions) { + /* + * Swap PACKET with PREV_PACKET: PACKET 
becomes PREV_PACKET for + the next incoming packet. + */ + tmp = tidq->packet; + tidq->packet = tidq->prev_packet; + tidq->prev_packet = tmp; + } +} + static void cs_etm__packet_dump(const char *pkt_string) { const char *color = PERF_COLOR_BLUE; @@ -945,7 +962,7 @@ static inline u64 cs_etm__instr_addr(struct cs_etm_queue *etmq, if (packet->isa == CS_ETM_ISA_T32) { u64 addr = packet->start_addr; - while (offset > 0) { + while (offset) { addr += cs_etm__t32_instr_size(etmq, trace_chan_id, addr); offset--; @@ -1134,10 +1151,8 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, cs_etm__copy_insn(etmq, tidq->trace_chan_id, tidq->packet, &sample); - if (etm->synth_opts.last_branch) { - cs_etm__copy_last_branch_rb(etmq, tidq); + if (etm->synth_opts.last_branch) sample.branch_stack = tidq->last_branch; - } if (etm->synth_opts.inject) { ret = cs_etm__inject_event(event, &sample, @@ -1153,9 +1168,6 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, "CS ETM Trace: failed to deliver instruction event, error %d\n", ret); - if (etm->synth_opts.last_branch) - cs_etm__reset_last_branch_rb(tidq); - return ret; } @@ -1172,6 +1184,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, union perf_event *event = tidq->event_buf; struct dummy_branch_stack { u64 nr; + u64 hw_idx; struct branch_entry entries; } dummy_bs; u64 ip; @@ -1202,6 +1215,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, if (etm->synth_opts.last_branch) { dummy_bs = (struct dummy_branch_stack){ .nr = 1, + .hw_idx = -1ULL, .entries = { .from = sample.ip, .to = sample.addr, @@ -1340,12 +1354,14 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, struct cs_etm_traceid_queue *tidq) { struct cs_etm_auxtrace *etm = etmq->etm; - struct cs_etm_packet *tmp; int ret; u8 trace_chan_id = tidq->trace_chan_id; - u64 instrs_executed = tidq->packet->instr_count; + u64 instrs_prev; + + /* Get instructions remainder from previous packet */ + instrs_prev = tidq->period_instructions; - tidq->period_instructions += instrs_executed; + tidq->period_instructions += tidq->packet->instr_count; /* * Record a branch when the last instruction in @@ -1363,26 +1379,80 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, * TODO: allow period to be defined in cycles and clock time */ - /* Get number of instructions executed after the sample point */ - u64 instrs_over = tidq->period_instructions - - etm->instructions_sample_period; + /* + * The diagram below demonstrates the instruction sample + * generation flow: + * + * Instrs Instrs Instrs Instrs + * Sample(n) Sample(n+1) Sample(n+2) Sample(n+3) + * | | | | + * V V V V + * -------------------------------------------------- + * ^ ^ + * | | + * Period Period + * instructions(Pi) instructions(Pi') + * + * | | + * \---------------- -----------------/ + * V + * tidq->packet->instr_count + * + * Instrs Sample(n...) are the synthesised samples occurring + * every etm->instructions_sample_period instructions - as + * defined on the perf command line. Sample(n) is the + * last sample before the current etm packet; n+1 to n+3 + * samples are generated from the current etm packet. + * + * tidq->packet->instr_count represents the number of + * instructions in the current etm packet. + * + * Period instructions (Pi) contain the number of + * instructions executed after the sample point (n) from the + * previous etm packet. This will always be less than + * etm->instructions_sample_period. 
+ * + * When generating new samples, instructions from two parts + * are combined: the tail of the old packet and the head of + * the newly arrived packet together produce + * sample(n+1); sample(n+2) and sample(n+3) each consume a full + * sample period of instructions. After sample(n+3), the remaining + * instructions are carried over to the next packet and assigned + * to tidq->period_instructions for the next round of calculation. + */ /* - * Calculate the address of the sampled instruction (-1 as - * sample is reported as though instruction has just been - * executed, but PC has not advanced to next instruction) + * Get the initial offset into the current packet instructions; + * entry conditions ensure that instrs_prev is less than + * etm->instructions_sample_period. */ - u64 offset = (instrs_executed - instrs_over - 1); - u64 addr = cs_etm__instr_addr(etmq, trace_chan_id, - tidq->packet, offset); + u64 offset = etm->instructions_sample_period - instrs_prev; + u64 addr; - ret = cs_etm__synth_instruction_sample( - etmq, tidq, addr, etm->instructions_sample_period); - if (ret) - return ret; + /* Prepare last branches for instruction sample */ + if (etm->synth_opts.last_branch) + cs_etm__copy_last_branch_rb(etmq, tidq); - /* Carry remaining instructions into next sample period */ - tidq->period_instructions = instrs_over; + while (tidq->period_instructions >= + etm->instructions_sample_period) { + /* + * Calculate the address of the sampled instruction (-1 + * as sample is reported as though instruction has just + * been executed, but PC has not advanced to next + * instruction) + */ + addr = cs_etm__instr_addr(etmq, trace_chan_id, + tidq->packet, offset - 1); + ret = cs_etm__synth_instruction_sample( + etmq, tidq, addr, + etm->instructions_sample_period); + if (ret) + return ret; + + offset += etm->instructions_sample_period; + tidq->period_instructions -= + etm->instructions_sample_period; + } } if (etm->sample_branches) { @@ -1404,15 +1474,7 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, } } - if (etm->sample_branches || etm->synth_opts.last_branch) { - /* - * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for - * the next incoming packet. - */ - tmp = tidq->packet; - tidq->packet = tidq->prev_packet; - tidq->prev_packet = tmp; - } + cs_etm__packet_swap(etm, tidq); return 0; } @@ -1441,7 +1503,6 @@ static int cs_etm__flush(struct cs_etm_queue *etmq, { int err = 0; struct cs_etm_auxtrace *etm = etmq->etm; - struct cs_etm_packet *tmp; /* Handle start tracing packet */ if (tidq->prev_packet->sample_type == CS_ETM_EMPTY) @@ -1449,6 +1510,11 @@ static int cs_etm__flush(struct cs_etm_queue *etmq, if (etmq->etm->synth_opts.last_branch && tidq->prev_packet->sample_type == CS_ETM_RANGE) { + u64 addr; + + /* Prepare last branches for instruction sample */ + cs_etm__copy_last_branch_rb(etmq, tidq); + /* * Generate a last branch event for the branches left in the * circular buffer at the end of the trace. @@ -1456,7 +1522,7 @@ static int cs_etm__flush(struct cs_etm_queue *etmq, * Use the address of the end of the last reported execution * range */ - u64 addr = cs_etm__last_executed_instr(tidq->prev_packet); + addr = cs_etm__last_executed_instr(tidq->prev_packet); err = cs_etm__synth_instruction_sample( etmq, tidq, addr, @@ -1476,15 +1542,11 @@ static int cs_etm__flush(struct cs_etm_queue *etmq, } swap_packet: - if (etm->sample_branches || etm->synth_opts.last_branch) { - /* - * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for - * the next incoming packet. 
- */ - tmp = tidq->packet; - tidq->packet = tidq->prev_packet; - tidq->prev_packet = tmp; - } + cs_etm__packet_swap(etm, tidq); + + /* Reset last branches after flush the trace */ + if (etm->synth_opts.last_branch) + cs_etm__reset_last_branch_rb(tidq); return err; } @@ -1505,11 +1567,16 @@ static int cs_etm__end_block(struct cs_etm_queue *etmq, */ if (etmq->etm->synth_opts.last_branch && tidq->prev_packet->sample_type == CS_ETM_RANGE) { + u64 addr; + + /* Prepare last branches for instruction sample */ + cs_etm__copy_last_branch_rb(etmq, tidq); + /* * Use the address of the end of the last reported execution * range. */ - u64 addr = cs_etm__last_executed_instr(tidq->prev_packet); + addr = cs_etm__last_executed_instr(tidq->prev_packet); err = cs_etm__synth_instruction_sample( etmq, tidq, addr, diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 6242a9215df7..4154f944f474 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -343,11 +343,11 @@ static const char *normalize_arch(char *arch) const char *perf_env__arch(struct perf_env *env) { - struct utsname uts; char *arch_name; if (!env || !env->arch) { /* Assume local operation */ - if (uname(&uts) < 0) + static struct utsname uts = { .machine[0] = '\0', }; + if (uts.machine[0] == '\0' && uname(&uts) < 0) return NULL; arch_name = uts.machine; } else diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 85223159737c..3cda40a2fafc 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -139,6 +139,7 @@ struct perf_sample { u16 insn_len; u8 cpumode; u16 misc; + bool no_hw_idx; /* No hw_idx collected in branch_stack */ char insn[MAX_INSN]; void *raw_data; struct ip_callchain *callchain; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index c8dc4450884c..816d930d774e 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -712,7 +712,8 @@ static void __perf_evsel__config_callchain(struct evsel *evsel, attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_CALL_STACK | PERF_SAMPLE_BRANCH_NO_CYCLES | - PERF_SAMPLE_BRANCH_NO_FLAGS; + PERF_SAMPLE_BRANCH_NO_FLAGS | + PERF_SAMPLE_BRANCH_HW_INDEX; } } else pr_warning("Cannot use LBR callstack with branch stack. " @@ -763,7 +764,8 @@ perf_evsel__reset_callgraph(struct evsel *evsel, if (param->record_mode == CALLCHAIN_LBR) { perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type &= ~(PERF_SAMPLE_BRANCH_USER | - PERF_SAMPLE_BRANCH_CALL_STACK); + PERF_SAMPLE_BRANCH_CALL_STACK | + PERF_SAMPLE_BRANCH_HW_INDEX); } if (param->record_mode == CALLCHAIN_DWARF) { perf_evsel__reset_sample_bit(evsel, REGS_USER); @@ -1673,6 +1675,8 @@ fallback_missing_features: evsel->core.attr.ksymbol = 0; if (perf_missing_features.bpf) evsel->core.attr.bpf_event = 0; + if (perf_missing_features.branch_hw_idx) + evsel->core.attr.branch_sample_type &= ~PERF_SAMPLE_BRANCH_HW_INDEX; retry_sample_id: if (perf_missing_features.sample_id_all) evsel->core.attr.sample_id_all = 0; @@ -1784,7 +1788,12 @@ try_fallback: * Must probe features in the order they were added to the * perf_event_attr interface. 
*/ - if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) { + if (!perf_missing_features.branch_hw_idx && + (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX)) { + perf_missing_features.branch_hw_idx = true; + pr_debug2("switching off branch HW index support\n"); + goto fallback_missing_features; + } else if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) { perf_missing_features.aux_output = true; pr_debug2_peo("Kernel has no attr.aux_output support, bailing out\n"); goto out_close; @@ -2169,7 +2178,12 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, if (data->branch_stack->nr > max_branch_nr) return -EFAULT; + sz = data->branch_stack->nr * sizeof(struct branch_entry); + if (perf_evsel__has_branch_hw_idx(evsel)) + sz += sizeof(u64); + else + data->no_hw_idx = true; OVERFLOW_CHECK(array, sz, max_size); array = (void *)array + sz; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index dc14f4a823cd..33804740e2ca 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -119,6 +119,7 @@ struct perf_missing_features { bool ksymbol; bool bpf; bool aux_output; + bool branch_hw_idx; }; extern struct perf_missing_features perf_missing_features; @@ -389,6 +390,11 @@ static inline bool perf_evsel__has_branch_callstack(const struct evsel *evsel) return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } +static inline bool perf_evsel__has_branch_hw_idx(const struct evsel *evsel) +{ + return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + static inline bool evsel__has_callchain(const struct evsel *evsel) { return (evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0; diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c new file mode 100644 index 000000000000..fd192ddf93c1 --- /dev/null +++ b/tools/perf/util/expr.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdbool.h> +#include <assert.h> +#include "expr.h" +#include "expr-bison.h" +#define YY_EXTRA_TYPE int +#include "expr-flex.h" + +#ifdef PARSER_DEBUG +extern int expr_debug; +#endif + +/* Caller must make sure id is allocated */ +void expr__add_id(struct parse_ctx *ctx, const char *name, double val) +{ + int idx; + + assert(ctx->num_ids < MAX_PARSE_ID); + idx = ctx->num_ids++; + ctx->ids[idx].name = name; + ctx->ids[idx].val = val; +} + +void expr__ctx_init(struct parse_ctx *ctx) +{ + ctx->num_ids = 0; +} + +static int +__expr__parse(double *val, struct parse_ctx *ctx, const char *expr, + int start) +{ + YY_BUFFER_STATE buffer; + void *scanner; + int ret; + + ret = expr_lex_init_extra(start, &scanner); + if (ret) + return ret; + + buffer = expr__scan_string(expr, scanner); + +#ifdef PARSER_DEBUG + expr_debug = 1; +#endif + + ret = expr_parse(val, ctx, scanner); + + expr__flush_buffer(buffer, scanner); + expr__delete_buffer(buffer, scanner); + expr_lex_destroy(scanner); + return ret; +} + +int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr) +{ + return __expr__parse(final_val, ctx, expr, EXPR_PARSE) ? 
-1 : 0; +} + +static bool +already_seen(const char *val, const char *one, const char **other, + int num_other) +{ + int i; + + if (one && !strcasecmp(one, val)) + return true; + for (i = 0; i < num_other; i++) + if (!strcasecmp(other[i], val)) + return true; + return false; +} + +int expr__find_other(const char *expr, const char *one, const char ***other, + int *num_other) +{ + int err, i = 0, j = 0; + struct parse_ctx ctx; + + expr__ctx_init(&ctx); + err = __expr__parse(NULL, &ctx, expr, EXPR_OTHER); + if (err) + return -1; + + *other = malloc((ctx.num_ids + 1) * sizeof(char *)); + if (!*other) + return -ENOMEM; + + for (i = 0, j = 0; i < ctx.num_ids; i++) { + const char *str = ctx.ids[i].name; + + if (already_seen(str, one, *other, j)) + continue; + + str = strdup(str); + if (!str) + goto out; + (*other)[j++] = str; + } + (*other)[j] = NULL; + +out: + if (i != ctx.num_ids) { + while (--j) + free((char *) (*other)[i]); + free(*other); + err = -1; + } + + *num_other = j; + return err; +} diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h index 046160831f90..9377538f4097 100644 --- a/tools/perf/util/expr.h +++ b/tools/perf/util/expr.h @@ -2,7 +2,7 @@ #ifndef PARSE_CTX_H #define PARSE_CTX_H 1 -#define EXPR_MAX_OTHER 15 +#define EXPR_MAX_OTHER 20 #define MAX_PARSE_ID EXPR_MAX_OTHER struct parse_id { @@ -17,10 +17,8 @@ struct parse_ctx { void expr__ctx_init(struct parse_ctx *ctx); void expr__add_id(struct parse_ctx *ctx, const char *id, double val); -#ifndef IN_EXPR_Y -int expr__parse(double *final_val, struct parse_ctx *ctx, const char **pp); -#endif -int expr__find_other(const char *p, const char *one, const char ***other, +int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr); +int expr__find_other(const char *expr, const char *one, const char ***other, int *num_other); #endif diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l new file mode 100644 index 000000000000..eaad29243c23 --- /dev/null +++ b/tools/perf/util/expr.l @@ -0,0 +1,114 @@ +%option prefix="expr_" +%option reentrant +%option bison-bridge + +%{ +#include <linux/compiler.h> +#include "expr.h" +#include "expr-bison.h" + +char *expr_get_text(yyscan_t yyscanner); +YYSTYPE *expr_get_lval(yyscan_t yyscanner); + +static int __value(YYSTYPE *yylval, char *str, int base, int token) +{ + u64 num; + + errno = 0; + num = strtoull(str, NULL, base); + if (errno) + return EXPR_ERROR; + + yylval->num = num; + return token; +} + +static int value(yyscan_t scanner, int base) +{ + YYSTYPE *yylval = expr_get_lval(scanner); + char *text = expr_get_text(scanner); + + return __value(yylval, text, base, NUMBER); +} + +/* + * Allow @ instead of / to be able to specify pmu/event/ without + * conflicts with normal division. 
+ */ +static char *normalize(char *str) +{ + char *ret = str; + char *dst = str; + + while (*str) { + if (*str == '@') + *dst++ = '/'; + else if (*str == '\\') + *dst++ = *++str; + else + *dst++ = *str; + str++; + } + + *dst = 0x0; + return ret; +} + +static int str(yyscan_t scanner, int token) +{ + YYSTYPE *yylval = expr_get_lval(scanner); + char *text = expr_get_text(scanner); + + yylval->str = normalize(strdup(text)); + if (!yylval->str) + return EXPR_ERROR; + + yylval->str = normalize(yylval->str); + return token; +} +%} + +number [0-9]+ + +sch [-,=] +spec \\{sch} +sym [0-9a-zA-Z_\.:@]+ +symbol {spec}*{sym}*{spec}*{sym}* + +%% + { + int start_token; + + start_token = expr_get_extra(yyscanner); + + if (start_token) { + expr_set_extra(NULL, yyscanner); + return start_token; + } + } + +max { return MAX; } +min { return MIN; } +if { return IF; } +else { return ELSE; } +#smt_on { return SMT_ON; } +{number} { return value(yyscanner, 10); } +{symbol} { return str(yyscanner, ID); } +"|" { return '|'; } +"^" { return '^'; } +"&" { return '&'; } +"-" { return '-'; } +"+" { return '+'; } +"*" { return '*'; } +"/" { return '/'; } +"%" { return '%'; } +"(" { return '('; } +")" { return ')'; } +"," { return ','; } +. { } +%% + +int expr_wrap(void *scanner __maybe_unused) +{ + return 1; +} diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y index 7d226241f1d7..4720cbe79357 100644 --- a/tools/perf/util/expr.y +++ b/tools/perf/util/expr.y @@ -1,31 +1,32 @@ /* Simple expression parser */ %{ +#define YYDEBUG 1 +#include <stdio.h> #include "util.h" #include "util/debug.h" #include <stdlib.h> // strtod() #define IN_EXPR_Y 1 #include "expr.h" #include "smt.h" -#include <assert.h> #include <string.h> -#define MAXIDLEN 256 %} %define api.pure full %parse-param { double *final_val } %parse-param { struct parse_ctx *ctx } -%parse-param { const char **pp } -%lex-param { const char **pp } +%parse-param {void *scanner} +%lex-param {void* scanner} %union { - double num; - char id[MAXIDLEN+1]; + double num; + char *str; } +%token EXPR_PARSE EXPR_OTHER EXPR_ERROR %token <num> NUMBER -%token <id> ID +%token <str> ID %token MIN MAX IF ELSE SMT_ON %left MIN MAX IF %left '|' @@ -37,11 +38,9 @@ %type <num> expr if_expr %{ -static int expr__lex(YYSTYPE *res, const char **pp); - -static void expr__error(double *final_val __maybe_unused, +static void expr_error(double *final_val __maybe_unused, struct parse_ctx *ctx __maybe_unused, - const char **pp __maybe_unused, + void *scanner, const char *s) { pr_debug("%s\n", s); @@ -63,6 +62,27 @@ static int lookup_id(struct parse_ctx *ctx, char *id, double *val) %} %% +start: +EXPR_PARSE all_expr +| +EXPR_OTHER all_other + +all_other: all_other other +| + +other: ID +{ + if (ctx->num_ids + 1 >= EXPR_MAX_OTHER) { + pr_err("failed: way too many variables"); + YYABORT; + } + + ctx->ids[ctx->num_ids++].name = $1; +} +| +MIN | MAX | IF | ELSE | SMT_ON | NUMBER | '|' | '^' | '&' | '-' | '+' | '*' | '/' | '%' | '(' | ')' + + all_expr: if_expr { *final_val = $1; } ; @@ -93,146 +113,3 @@ expr: NUMBER ; %% - -static int expr__symbol(YYSTYPE *res, const char *p, const char **pp) -{ - char *dst = res->id; - const char *s = p; - - if (*p == '#') - *dst++ = *p++; - - while (isalnum(*p) || *p == '_' || *p == '.' || *p == ':' || *p == '@' || *p == '\\') { - if (p - s >= MAXIDLEN) - return -1; - /* - * Allow @ instead of / to be able to specify pmu/event/ without - * conflicts with normal division. 
- */ - if (*p == '@') - *dst++ = '/'; - else if (*p == '\\') - *dst++ = *++p; - else - *dst++ = *p; - p++; - } - *dst = 0; - *pp = p; - dst = res->id; - switch (dst[0]) { - case 'm': - if (!strcmp(dst, "min")) - return MIN; - if (!strcmp(dst, "max")) - return MAX; - break; - case 'i': - if (!strcmp(dst, "if")) - return IF; - break; - case 'e': - if (!strcmp(dst, "else")) - return ELSE; - break; - case '#': - if (!strcasecmp(dst, "#smt_on")) - return SMT_ON; - break; - } - return ID; -} - -static int expr__lex(YYSTYPE *res, const char **pp) -{ - int tok; - const char *s; - const char *p = *pp; - - while (isspace(*p)) - p++; - s = p; - switch (*p++) { - case '#': - case 'a' ... 'z': - case 'A' ... 'Z': - return expr__symbol(res, p - 1, pp); - case '0' ... '9': case '.': - res->num = strtod(s, (char **)&p); - tok = NUMBER; - break; - default: - tok = *s; - break; - } - *pp = p; - return tok; -} - -/* Caller must make sure id is allocated */ -void expr__add_id(struct parse_ctx *ctx, const char *name, double val) -{ - int idx; - assert(ctx->num_ids < MAX_PARSE_ID); - idx = ctx->num_ids++; - ctx->ids[idx].name = name; - ctx->ids[idx].val = val; -} - -void expr__ctx_init(struct parse_ctx *ctx) -{ - ctx->num_ids = 0; -} - -static bool already_seen(const char *val, const char *one, const char **other, - int num_other) -{ - int i; - - if (one && !strcasecmp(one, val)) - return true; - for (i = 0; i < num_other; i++) - if (!strcasecmp(other[i], val)) - return true; - return false; -} - -int expr__find_other(const char *p, const char *one, const char ***other, - int *num_otherp) -{ - const char *orig = p; - int err = -1; - int num_other; - - *other = malloc((EXPR_MAX_OTHER + 1) * sizeof(char *)); - if (!*other) - return -1; - - num_other = 0; - for (;;) { - YYSTYPE val; - int tok = expr__lex(&val, &p); - if (tok == 0) { - err = 0; - break; - } - if (tok == ID && !already_seen(val.id, one, *other, num_other)) { - if (num_other >= EXPR_MAX_OTHER - 1) { - pr_debug("Too many extra events in %s\n", orig); - break; - } - (*other)[num_other] = strdup(val.id); - if (!(*other)[num_other]) - return -1; - num_other++; - } - } - (*other)[num_other] = NULL; - *num_otherp = num_other; - if (err) { - *num_otherp = 0; - free(*other); - *other = NULL; - } - return err; -} diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4246e7447e54..acbd046bf95c 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1590,6 +1590,40 @@ static void free_event_desc(struct evsel *events) free(events); } +static bool perf_attr_check(struct perf_event_attr *attr) +{ + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) { + pr_warning("Reserved bits are set unexpectedly. " + "Please update perf tool.\n"); + return false; + } + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) { + pr_warning("Unknown sample type (0x%llx) is detected. " + "Please update perf tool.\n", + attr->sample_type); + return false; + } + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) { + pr_warning("Unknown read format (0x%llx) is detected. " + "Please update perf tool.\n", + attr->read_format); + return false; + } + + if ((attr->sample_type & PERF_SAMPLE_BRANCH_STACK) && + (attr->branch_sample_type & ~(PERF_SAMPLE_BRANCH_MAX-1))) { + pr_warning("Unknown branch sample type (0x%llx) is detected. 
" + "Please update perf tool.\n", + attr->branch_sample_type); + + return false; + } + + return true; +} + static struct evsel *read_event_desc(struct feat_fd *ff) { struct evsel *evsel, *events = NULL; @@ -1634,6 +1668,9 @@ static struct evsel *read_event_desc(struct feat_fd *ff) memcpy(&evsel->core.attr, buf, msz); + if (!perf_attr_check(&evsel->core.attr)) + goto error; + if (do_read_u32(ff, &nr)) goto error; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index ca5a8f4d007e..e74a5acf66d9 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2584,9 +2584,10 @@ void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, u64 *total_cycles) { struct branch_info *bi; + struct branch_entry *entries = perf_sample__branch_entries(sample); /* If we have branch cycles always annotate them. */ - if (bs && bs->nr && bs->entries[0].flags.cycles) { + if (bs && bs->nr && entries[0].flags.cycles) { int i; bi = sample__resolve_bstack(sample, al); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 33cf8928cf05..23c8289c2472 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1295,6 +1295,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq) struct perf_sample sample = { .ip = 0, }; struct dummy_branch_stack { u64 nr; + u64 hw_idx; struct branch_entry entries; } dummy_bs; @@ -1316,6 +1317,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq) if (pt->synth_opts.last_branch && sort__mode == SORT_MODE__BRANCH) { dummy_bs = (struct dummy_branch_stack){ .nr = 1, + .hw_idx = -1ULL, .entries = { .from = sample.ip, .to = sample.addr, diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index b5af680fc667..dbdffb6673fe 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -265,6 +265,8 @@ static int detect_kbuild_dir(char **kbuild_dir) return -ENOMEM; return 0; } + pr_debug("%s: Couldn't find \"%s\", missing kernel-devel package?.\n", + __func__, autoconf_path); free(autoconf_path); return -ENOENT; } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index fb5c2cd44d30..fd14f1489802 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2081,15 +2081,16 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, { unsigned int i; const struct branch_stack *bs = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct branch_info *bi = calloc(bs->nr, sizeof(struct branch_info)); if (!bi) return NULL; for (i = 0; i < bs->nr; i++) { - ip__resolve_ams(al->thread, &bi[i].to, bs->entries[i].to); - ip__resolve_ams(al->thread, &bi[i].from, bs->entries[i].from); - bi[i].flags = bs->entries[i].flags; + ip__resolve_ams(al->thread, &bi[i].to, entries[i].to); + ip__resolve_ams(al->thread, &bi[i].from, entries[i].from); + bi[i].flags = entries[i].flags; } return bi; } @@ -2185,6 +2186,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, /* LBR only affects the user callchain */ if (i != chain_nr) { struct branch_stack *lbr_stack = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); int lbr_nr = lbr_stack->nr, j, k; bool branch; struct branch_flags *flags; @@ -2210,31 +2212,29 @@ static int resolve_lbr_callchain_sample(struct thread *thread, ip = chain->ips[j]; else if (j > i + 1) { k = j - i - 2; - ip = lbr_stack->entries[k].from; + ip = entries[k].from; branch = true; - flags = &lbr_stack->entries[k].flags; 
+ flags = &entries[k].flags; } else { - ip = lbr_stack->entries[0].to; + ip = entries[0].to; branch = true; - flags = &lbr_stack->entries[0].flags; - branch_from = - lbr_stack->entries[0].from; + flags = &entries[0].flags; + branch_from = entries[0].from; } } else { if (j < lbr_nr) { k = lbr_nr - j - 1; - ip = lbr_stack->entries[k].from; + ip = entries[k].from; branch = true; - flags = &lbr_stack->entries[k].flags; + flags = &entries[k].flags; } else if (j > lbr_nr) ip = chain->ips[i + 1 - (j - lbr_nr)]; else { - ip = lbr_stack->entries[0].to; + ip = entries[0].to; branch = true; - flags = &lbr_stack->entries[0].flags; - branch_from = - lbr_stack->entries[0].from; + flags = &entries[0].flags; + branch_from = entries[0].from; } } @@ -2281,6 +2281,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, int max_stack) { struct branch_stack *branch = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct ip_callchain *chain = sample->callchain; int chain_nr = 0; u8 cpumode = PERF_RECORD_MISC_USER; @@ -2328,7 +2329,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, for (i = 0; i < nr; i++) { if (callchain_param.order == ORDER_CALLEE) { - be[i] = branch->entries[i]; + be[i] = entries[i]; if (chain == NULL) continue; @@ -2347,7 +2348,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, be[i].from >= chain->ips[first_call] - 8) first_call++; } else - be[i] = branch->entries[branch->nr - i - 1]; + be[i] = entries[branch->nr - i - 1]; } memset(iter, 0, sizeof(struct iterations) * nr); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index a08ca276098e..53d96611e6a6 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -44,8 +44,8 @@ static inline int is_no_dso_memory(const char *filename) static inline int is_android_lib(const char *filename) { - return !strncmp(filename, "/data/app-lib", 13) || - !strncmp(filename, "/system/lib", 11); + return strstarts(filename, "/data/app-lib/") || + strstarts(filename, "/system/lib/"); } static inline bool replace_android_lib(const char *filename, char *newfilename) @@ -65,7 +65,7 @@ static inline bool replace_android_lib(const char *filename, char *newfilename) app_abi_length = strlen(app_abi); - if (!strncmp(filename, "/data/app-lib", 13)) { + if (strstarts(filename, "/data/app-lib/")) { char *apk_path; if (!app_abi_length) @@ -89,7 +89,7 @@ static inline bool replace_android_lib(const char *filename, char *newfilename) return true; } - if (!strncmp(filename, "/system/lib/", 11)) { + if (strstarts(filename, "/system/lib/")) { char *ndk, *app; const char *arch; size_t ndk_length; @@ -431,7 +431,7 @@ int map__fprintf_srcline(struct map *map, u64 addr, const char *prefix, if (map && map->dso) { char *srcline = map__srcline(map, addr, NULL); - if (srcline != SRCLINE_UNKNOWN) + if (strncmp(srcline, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0) ret = fprintf(fp, "%s%s", prefix, srcline); free_srcline(srcline); } diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 02aee946b6c1..c3a8c701609a 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -22,6 +22,8 @@ #include <linux/string.h> #include <linux/zalloc.h> #include <subcmd/parse-options.h> +#include <api/fs/fs.h> +#include "util.h" struct metric_event *metricgroup__lookup(struct rblist *metric_events, struct evsel *evsel, @@ -399,13 +401,85 @@ void metricgroup__print(bool metrics, bool metricgroups, char *filter, 
strlist__delete(metriclist); } +static void metricgroup__add_metric_weak_group(struct strbuf *events, + const char **ids, + int idnum) +{ + bool no_group = false; + int i; + + for (i = 0; i < idnum; i++) { + pr_debug("found event %s\n", ids[i]); + /* + * Duration time maps to a software event and can make + * groups not count. Always use it outside a + * group. + */ + if (!strcmp(ids[i], "duration_time")) { + if (i > 0) + strbuf_addf(events, "}:W,"); + strbuf_addf(events, "duration_time"); + no_group = true; + continue; + } + strbuf_addf(events, "%s%s", + i == 0 || no_group ? "{" : ",", + ids[i]); + no_group = false; + } + if (!no_group) + strbuf_addf(events, "}:W"); +} + +static void metricgroup__add_metric_non_group(struct strbuf *events, + const char **ids, + int idnum) +{ + int i; + + for (i = 0; i < idnum; i++) + strbuf_addf(events, ",%s", ids[i]); +} + +static void metricgroup___watchdog_constraint_hint(const char *name, bool foot) +{ + static bool violate_nmi_constraint; + + if (!foot) { + pr_warning("Splitting metric group %s into standalone metrics.\n", name); + violate_nmi_constraint = true; + return; + } + + if (!violate_nmi_constraint) + return; + + pr_warning("Try disabling the NMI watchdog to comply NO_NMI_WATCHDOG metric constraint:\n" + " echo 0 > /proc/sys/kernel/nmi_watchdog\n" + " perf stat ...\n" + " echo 1 > /proc/sys/kernel/nmi_watchdog\n"); +} + +static bool metricgroup__has_constraint(struct pmu_event *pe) +{ + if (!pe->metric_constraint) + return false; + + if (!strcmp(pe->metric_constraint, "NO_NMI_WATCHDOG") && + sysctl__nmi_watchdog_enabled()) { + metricgroup___watchdog_constraint_hint(pe->metric_name, false); + return true; + } + + return false; +} + static int metricgroup__add_metric(const char *metric, struct strbuf *events, struct list_head *group_list) { struct pmu_events_map *map = perf_pmu__find_map(NULL); struct pmu_event *pe; - int ret = -EINVAL; - int i, j; + int i, ret = -EINVAL; if (!map) return 0; @@ -422,7 +496,6 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events, const char **ids; int idnum; struct egroup *eg; - bool no_group = false; pr_debug("metric expr %s for %s\n", pe->metric_expr, pe->metric_name); @@ -431,27 +504,11 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events, continue; if (events->len > 0) strbuf_addf(events, ","); - for (j = 0; j < idnum; j++) { - pr_debug("found event %s\n", ids[j]); - /* - * Duration time maps to a software event and can make - * groups not count. Always use it outside a - * group. - */ - if (!strcmp(ids[j], "duration_time")) { - if (j > 0) - strbuf_addf(events, "}:W,"); - strbuf_addf(events, "duration_time"); - no_group = true; - continue; - } - strbuf_addf(events, "%s%s", - j == 0 || no_group ? 
"{" : ",", - ids[j]); - no_group = false; - } - if (!no_group) - strbuf_addf(events, "}:W"); + + if (metricgroup__has_constraint(pe)) + metricgroup__add_metric_non_group(events, ids, idnum); + else + metricgroup__add_metric_weak_group(events, ids, idnum); eg = malloc(sizeof(struct egroup)); if (!eg) { @@ -493,6 +550,10 @@ static int metricgroup__add_metric_list(const char *list, struct strbuf *events, } } free(nlist); + + if (!ret) + metricgroup___watchdog_constraint_hint(NULL, true); + return ret; } diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 3b664fa673a6..ab7108d22428 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -98,20 +98,29 @@ static int perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity) { void *data; size_t mmap_len; - unsigned long node_mask; + unsigned long *node_mask; + unsigned long node_index; + int err = 0; if (affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) { data = map->aio.data[idx]; mmap_len = mmap__mmap_len(map); - node_mask = 1UL << cpu__get_node(cpu); - if (mbind(data, mmap_len, MPOL_BIND, &node_mask, 1, 0)) { - pr_err("Failed to bind [%p-%p] AIO buffer to node %d: error %m\n", - data, data + mmap_len, cpu__get_node(cpu)); + node_index = cpu__get_node(cpu); + node_mask = bitmap_alloc(node_index + 1); + if (!node_mask) { + pr_err("Failed to allocate node mask for mbind: error %m\n"); return -1; } + set_bit(node_index, node_mask); + if (mbind(data, mmap_len, MPOL_BIND, node_mask, node_index + 1 + 1, 0)) { + pr_err("Failed to bind [%p-%p] AIO buffer to node %lu: error %m\n", + data, data + mmap_len, node_index); + err = -1; + } + bitmap_free(node_mask); } - return 0; + return err; } #else /* !HAVE_LIBNUMA_SUPPORT */ static int perf_mmap__aio_alloc(struct mmap *map, int idx) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index c01ba6f8fdad..a7dc0b096974 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -257,21 +257,15 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) path = zalloc(sizeof(*path)); if (!path) return NULL; - path->system = malloc(MAX_EVENT_LENGTH); - if (!path->system) { + if (asprintf(&path->system, "%.*s", MAX_EVENT_LENGTH, sys_dirent->d_name) < 0) { free(path); return NULL; } - path->name = malloc(MAX_EVENT_LENGTH); - if (!path->name) { + if (asprintf(&path->name, "%.*s", MAX_EVENT_LENGTH, evt_dirent->d_name) < 0) { zfree(&path->system); free(path); return NULL; } - strncpy(path->system, sys_dirent->d_name, - MAX_EVENT_LENGTH); - strncpy(path->name, evt_dirent->d_name, - MAX_EVENT_LENGTH); return path; } } @@ -1219,7 +1213,7 @@ static int config_attr(struct perf_event_attr *attr, static int get_config_terms(struct list_head *head_config, struct list_head *head_terms __maybe_unused) { -#define ADD_CONFIG_TERM(__type) \ +#define ADD_CONFIG_TERM(__type, __weak) \ struct perf_evsel_config_term *__t; \ \ __t = zalloc(sizeof(*__t)); \ @@ -1228,18 +1222,18 @@ static int get_config_terms(struct list_head *head_config, \ INIT_LIST_HEAD(&__t->list); \ __t->type = PERF_EVSEL__CONFIG_TERM_ ## __type; \ - __t->weak = term->weak; \ + __t->weak = __weak; \ list_add_tail(&__t->list, head_terms) -#define ADD_CONFIG_TERM_VAL(__type, __name, __val) \ +#define ADD_CONFIG_TERM_VAL(__type, __name, __val, __weak) \ do { \ - ADD_CONFIG_TERM(__type); \ + ADD_CONFIG_TERM(__type, __weak); \ __t->val.__name = __val; \ } while (0) -#define ADD_CONFIG_TERM_STR(__type, __val) \ +#define ADD_CONFIG_TERM_STR(__type, __val, __weak) \ do { \ - 
ADD_CONFIG_TERM(__type); \ + ADD_CONFIG_TERM(__type, __weak); \ __t->val.str = strdup(__val); \ if (!__t->val.str) { \ zfree(&__t); \ @@ -1253,62 +1247,62 @@ do { \ list_for_each_entry(term, head_config, list) { switch (term->type_term) { case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD: - ADD_CONFIG_TERM_VAL(PERIOD, period, term->val.num); + ADD_CONFIG_TERM_VAL(PERIOD, period, term->val.num, term->weak); break; case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ: - ADD_CONFIG_TERM_VAL(FREQ, freq, term->val.num); + ADD_CONFIG_TERM_VAL(FREQ, freq, term->val.num, term->weak); break; case PARSE_EVENTS__TERM_TYPE_TIME: - ADD_CONFIG_TERM_VAL(TIME, time, term->val.num); + ADD_CONFIG_TERM_VAL(TIME, time, term->val.num, term->weak); break; case PARSE_EVENTS__TERM_TYPE_CALLGRAPH: - ADD_CONFIG_TERM_STR(CALLGRAPH, term->val.str); + ADD_CONFIG_TERM_STR(CALLGRAPH, term->val.str, term->weak); break; case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE: - ADD_CONFIG_TERM_STR(BRANCH, term->val.str); + ADD_CONFIG_TERM_STR(BRANCH, term->val.str, term->weak); break; case PARSE_EVENTS__TERM_TYPE_STACKSIZE: ADD_CONFIG_TERM_VAL(STACK_USER, stack_user, - term->val.num); + term->val.num, term->weak); break; case PARSE_EVENTS__TERM_TYPE_INHERIT: ADD_CONFIG_TERM_VAL(INHERIT, inherit, - term->val.num ? 1 : 0); + term->val.num ? 1 : 0, term->weak); break; case PARSE_EVENTS__TERM_TYPE_NOINHERIT: ADD_CONFIG_TERM_VAL(INHERIT, inherit, - term->val.num ? 0 : 1); + term->val.num ? 0 : 1, term->weak); break; case PARSE_EVENTS__TERM_TYPE_MAX_STACK: ADD_CONFIG_TERM_VAL(MAX_STACK, max_stack, - term->val.num); + term->val.num, term->weak); break; case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: ADD_CONFIG_TERM_VAL(MAX_EVENTS, max_events, - term->val.num); + term->val.num, term->weak); break; case PARSE_EVENTS__TERM_TYPE_OVERWRITE: ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite, - term->val.num ? 1 : 0); + term->val.num ? 1 : 0, term->weak); break; case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE: ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite, - term->val.num ? 0 : 1); + term->val.num ? 0 : 1, term->weak); break; case PARSE_EVENTS__TERM_TYPE_DRV_CFG: - ADD_CONFIG_TERM_STR(DRV_CFG, term->val.str); + ADD_CONFIG_TERM_STR(DRV_CFG, term->val.str, term->weak); break; case PARSE_EVENTS__TERM_TYPE_PERCORE: ADD_CONFIG_TERM_VAL(PERCORE, percore, - term->val.num ? true : false); + term->val.num ? true : false, term->weak); break; case PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT: ADD_CONFIG_TERM_VAL(AUX_OUTPUT, aux_output, - term->val.num ? 1 : 0); + term->val.num ? 
1 : 0, term->weak); break; case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE: ADD_CONFIG_TERM_VAL(AUX_SAMPLE_SIZE, aux_sample_size, - term->val.num); + term->val.num, term->weak); break; default: break; @@ -1345,7 +1339,7 @@ static int get_config_chgs(struct perf_pmu *pmu, struct list_head *head_config, } if (bits) - ADD_CONFIG_TERM_VAL(CFG_CHG, cfg_chg, bits); + ADD_CONFIG_TERM_VAL(CFG_CHG, cfg_chg, bits, false); #undef ADD_CONFIG_TERM return 0; diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 651203126c71..355d3458d4e6 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -50,6 +50,7 @@ static void __p_branch_sample_type(char *buf, size_t size, u64 value) bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), + bit_name(HW_INDEX), { .name = NULL, } }; #undef bit_name diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index 0f5fda11675f..8c852948513e 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -206,6 +206,9 @@ static struct strlist *__probe_file__get_namelist(int fd, bool include_group) } else ret = strlist__add(sl, tev.event); clear_probe_trace_event(&tev); + /* Skip if there is same name multi-probe event in the list */ + if (ret == -EEXIST) + ret = 0; if (ret < 0) break; } diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 1c817add6ca4..e4cff49384f4 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -637,14 +637,19 @@ static int convert_to_trace_point(Dwarf_Die *sp_die, Dwfl_Module *mod, return -EINVAL; } - /* Try to get actual symbol name from symtab */ - symbol = dwfl_module_addrsym(mod, paddr, &sym, NULL); + if (dwarf_entrypc(sp_die, &eaddr) == 0) { + /* If the DIE has entrypc, use it. 
*/ + symbol = dwarf_diename(sp_die); + } else { + /* Try to get actual symbol name and address from symtab */ + symbol = dwfl_module_addrsym(mod, paddr, &sym, NULL); + eaddr = sym.st_value; + } if (!symbol) { pr_warning("Failed to find symbol at 0x%lx\n", (unsigned long)paddr); return -ENOENT; } - eaddr = sym.st_value; tp->offset = (unsigned long)(paddr - eaddr); tp->address = (unsigned long)paddr; diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 80ca5d0ab7fe..8c1b27cd8b99 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -464,6 +464,7 @@ static PyObject *python_process_brstack(struct perf_sample *sample, struct thread *thread) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); PyObject *pylist; u64 i; @@ -484,28 +485,28 @@ static PyObject *python_process_brstack(struct perf_sample *sample, Py_FatalError("couldn't create Python dictionary"); pydict_set_item_string_decref(pyelem, "from", - PyLong_FromUnsignedLongLong(br->entries[i].from)); + PyLong_FromUnsignedLongLong(entries[i].from)); pydict_set_item_string_decref(pyelem, "to", - PyLong_FromUnsignedLongLong(br->entries[i].to)); + PyLong_FromUnsignedLongLong(entries[i].to)); pydict_set_item_string_decref(pyelem, "mispred", - PyBool_FromLong(br->entries[i].flags.mispred)); + PyBool_FromLong(entries[i].flags.mispred)); pydict_set_item_string_decref(pyelem, "predicted", - PyBool_FromLong(br->entries[i].flags.predicted)); + PyBool_FromLong(entries[i].flags.predicted)); pydict_set_item_string_decref(pyelem, "in_tx", - PyBool_FromLong(br->entries[i].flags.in_tx)); + PyBool_FromLong(entries[i].flags.in_tx)); pydict_set_item_string_decref(pyelem, "abort", - PyBool_FromLong(br->entries[i].flags.abort)); + PyBool_FromLong(entries[i].flags.abort)); pydict_set_item_string_decref(pyelem, "cycles", - PyLong_FromUnsignedLongLong(br->entries[i].flags.cycles)); + PyLong_FromUnsignedLongLong(entries[i].flags.cycles)); thread__find_map_fb(thread, sample->cpumode, - br->entries[i].from, &al); + entries[i].from, &al); dsoname = get_dsoname(al.map); pydict_set_item_string_decref(pyelem, "from_dsoname", _PyUnicode_FromString(dsoname)); thread__find_map_fb(thread, sample->cpumode, - br->entries[i].to, &al); + entries[i].to, &al); dsoname = get_dsoname(al.map); pydict_set_item_string_decref(pyelem, "to_dsoname", _PyUnicode_FromString(dsoname)); @@ -561,6 +562,7 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, struct thread *thread) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); PyObject *pylist; u64 i; char bf[512]; @@ -581,22 +583,22 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, Py_FatalError("couldn't create Python dictionary"); thread__find_symbol_fb(thread, sample->cpumode, - br->entries[i].from, &al); + entries[i].from, &al); get_symoff(al.sym, &al, true, bf, sizeof(bf)); pydict_set_item_string_decref(pyelem, "from", _PyUnicode_FromString(bf)); thread__find_symbol_fb(thread, sample->cpumode, - br->entries[i].to, &al); + entries[i].to, &al); get_symoff(al.sym, &al, true, bf, sizeof(bf)); pydict_set_item_string_decref(pyelem, "to", _PyUnicode_FromString(bf)); - get_br_mspred(&br->entries[i].flags, bf, sizeof(bf)); + get_br_mspred(&entries[i].flags, bf, sizeof(bf)); pydict_set_item_string_decref(pyelem, 
"pred", _PyUnicode_FromString(bf)); - if (br->entries[i].flags.in_tx) { + if (entries[i].flags.in_tx) { pydict_set_item_string_decref(pyelem, "in_tx", _PyUnicode_FromString("X")); } else { @@ -604,7 +606,7 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, _PyUnicode_FromString("-")); } - if (br->entries[i].flags.abort) { + if (entries[i].flags.abort) { pydict_set_item_string_decref(pyelem, "abort", _PyUnicode_FromString("A")); } else { diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index d0d7d25b23e3..055b00abd56d 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1007,6 +1007,7 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample) { struct ip_callchain *callchain = sample->callchain; struct branch_stack *lbr_stack = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); u64 kernel_callchain_nr = callchain->nr; unsigned int i; @@ -1043,10 +1044,10 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample) i, callchain->ips[i]); printf("..... %2d: %016" PRIx64 "\n", - (int)(kernel_callchain_nr), lbr_stack->entries[0].to); + (int)(kernel_callchain_nr), entries[0].to); for (i = 0; i < lbr_stack->nr; i++) printf("..... %2d: %016" PRIx64 "\n", - (int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from); + (int)(i + kernel_callchain_nr + 1), entries[i].from); } } @@ -1068,6 +1069,7 @@ static void callchain__printf(struct evsel *evsel, static void branch_stack__printf(struct perf_sample *sample, bool callstack) { + struct branch_entry *entries = perf_sample__branch_entries(sample); uint64_t i; printf("%s: nr:%" PRIu64 "\n", @@ -1075,7 +1077,7 @@ static void branch_stack__printf(struct perf_sample *sample, bool callstack) sample->branch_stack->nr); for (i = 0; i < sample->branch_stack->nr; i++) { - struct branch_entry *e = &sample->branch_stack->entries[i]; + struct branch_entry *e = &entries[i]; if (!callstack) { printf("..... 
%2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n", diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index aa344a163eaf..8a065a6f9713 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -2,11 +2,13 @@ from os import getenv from subprocess import Popen, PIPE from re import sub +cc = getenv("CC") +cc_is_clang = b"clang version" in Popen([cc, "-v"], stderr=PIPE).stderr.readline() + def clang_has_option(option): - return [o for o in Popen(['clang', option], stderr=PIPE).stderr.readlines() if b"unknown argument" in o] == [ ] + return [o for o in Popen([cc, option], stderr=PIPE).stderr.readlines() if b"unknown argument" in o] == [ ] -cc = getenv("CC") -if cc == "clang": +if cc_is_clang: from distutils.sysconfig import get_config_vars vars = get_config_vars() for var in ('CFLAGS', 'OPT'): @@ -40,7 +42,7 @@ class install_lib(_install_lib): cflags = getenv('CFLAGS', '').split() # switch off several checks (need to be at the end of cflags list) cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter', '-Wno-redundant-decls' ] -if cc != "clang": +if not cc_is_clang: cflags += ['-Wno-cast-function-type' ] src_perf = getenv('srctree') + '/tools/perf' diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index bc31fccc0057..76c6052b12e2 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -16,6 +16,7 @@ #include <linux/ctype.h> #include "cgroup.h" #include <api/fs/fs.h> +#include "util.h" #define CNTR_NOT_SUPPORTED "<not supported>" #define CNTR_NOT_COUNTED "<not counted>" @@ -110,7 +111,7 @@ static void aggr_printout(struct perf_stat_config *config, config->csv_sep); break; case AGGR_NONE: - if (evsel->percore) { + if (evsel->percore && !config->percore_show_thread) { fprintf(config->output, "S%d-D%d-C%*d%s", cpu_map__id_to_socket(id), cpu_map__id_to_die(id), @@ -628,7 +629,7 @@ static void aggr_cb(struct perf_stat_config *config, static void print_counter_aggrdata(struct perf_stat_config *config, struct evsel *counter, int s, char *prefix, bool metric_only, - bool *first) + bool *first, int cpu) { struct aggr_data ad; FILE *output = config->output; @@ -654,7 +655,7 @@ static void print_counter_aggrdata(struct perf_stat_config *config, fprintf(output, "%s", prefix); uval = val * counter->scale; - printout(config, id, nr, counter, uval, prefix, + printout(config, cpu != -1 ? cpu : id, nr, counter, uval, prefix, run, ena, 1.0, &rt_stat); if (!metric_only) fputc('\n', output); @@ -687,7 +688,7 @@ static void print_aggr(struct perf_stat_config *config, evlist__for_each_entry(evlist, counter) { print_counter_aggrdata(config, counter, s, prefix, metric_only, - &first); + &first, -1); } if (metric_only) fputc('\n', output); @@ -1097,7 +1098,6 @@ static void print_footer(struct perf_stat_config *config) { double avg = avg_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC; FILE *output = config->output; - int n; if (!config->null_run) fprintf(output, "\n"); @@ -1131,9 +1131,7 @@ static void print_footer(struct perf_stat_config *config) } fprintf(output, "\n\n"); - if (config->print_free_counters_hint && - sysctl__read_int("kernel/nmi_watchdog", &n) >= 0 && - n > 0) + if (config->print_free_counters_hint && sysctl__nmi_watchdog_enabled()) fprintf(output, "Some events weren't counted. Try disabling the NMI watchdog:\n" " echo 0 > /proc/sys/kernel/nmi_watchdog\n" @@ -1146,6 +1144,26 @@ static void print_footer(struct perf_stat_config *config) "the same PMU. 
Try reorganizing the group.\n"); } +static void print_percore_thread(struct perf_stat_config *config, + struct evsel *counter, char *prefix) +{ + int s, s2, id; + bool first = true; + + for (int i = 0; i < perf_evsel__nr_cpus(counter); i++) { + s2 = config->aggr_get_id(config, evsel__cpus(counter), i); + for (s = 0; s < config->aggr_map->nr; s++) { + id = config->aggr_map->map[s]; + if (s2 == id) + break; + } + + print_counter_aggrdata(config, counter, s, + prefix, false, + &first, i); + } +} + static void print_percore(struct perf_stat_config *config, struct evsel *counter, char *prefix) { @@ -1157,13 +1175,16 @@ static void print_percore(struct perf_stat_config *config, if (!(config->aggr_map || config->aggr_get_id)) return; + if (config->percore_show_thread) + return print_percore_thread(config, counter, prefix); + for (s = 0; s < config->aggr_map->nr; s++) { if (prefix && metric_only) fprintf(output, "%s", prefix); print_counter_aggrdata(config, counter, s, prefix, metric_only, - &first); + &first, -1); } if (metric_only) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 90d23cc3c8d4..0fd713d3674f 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -777,9 +777,7 @@ static void generic_metric(struct perf_stat_config *config, } if (!metric_events[i]) { - const char *p = metric_expr; - - if (expr__parse(&ratio, &pctx, &p) == 0) { + if (expr__parse(&ratio, &pctx, metric_expr) == 0) { char *unit; char metric_bf[64]; diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index fb990efa54a8..b4fdfaa7f2c0 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -109,6 +109,7 @@ struct perf_stat_config { bool walltime_run_table; bool all_kernel; bool all_user; + bool percore_show_thread; FILE *output; unsigned int interval; unsigned int timeout; diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 1077013d8ce2..26bc6a0096ce 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1622,7 +1622,12 @@ int dso__load(struct dso *dso, struct map *map) goto out; } - if (dso->kernel) { + kmod = dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE || + dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP || + dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE || + dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE_COMP; + + if (dso->kernel && !kmod) { if (dso->kernel == DSO_TYPE_KERNEL) ret = dso__load_kernel_sym(dso, map); else if (dso->kernel == DSO_TYPE_GUEST_KERNEL) @@ -1650,12 +1655,6 @@ int dso__load(struct dso *dso, struct map *map) if (!name) goto out; - kmod = dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE || - dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP || - dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE || - dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE_COMP; - - /* * Read the build id if possible. 
This is required for * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index c423298fe62d..3f28af39f9c6 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -345,6 +345,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, continue; event->mmap2.ino = (u64)ino; + event->mmap2.ino_generation = 0; /* * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c @@ -1183,7 +1184,8 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, if (type & PERF_SAMPLE_BRANCH_STACK) { sz = sample->branch_stack->nr * sizeof(struct branch_entry); - sz += sizeof(u64); + /* nr, hw_idx */ + sz += 2 * sizeof(u64); result += sz; } @@ -1344,7 +1346,8 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo if (type & PERF_SAMPLE_BRANCH_STACK) { sz = sample->branch_stack->nr * sizeof(struct branch_entry); - sz += sizeof(u64); + /* nr, hw_idx */ + sz += 2 * sizeof(u64); memcpy(array, sample->branch_stack, sz); array = (void *)array + sz; } diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 969ae560dad9..d707c9624dd9 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -55,6 +55,24 @@ int sysctl__max_stack(void) return sysctl_perf_event_max_stack; } +bool sysctl__nmi_watchdog_enabled(void) +{ + static bool cached; + static bool nmi_watchdog; + int value; + + if (cached) + return nmi_watchdog; + + if (sysctl__read_int("kernel/nmi_watchdog", &value) < 0) + return false; + + nmi_watchdog = (value > 0) ? true : false; + cached = true; + + return nmi_watchdog; +} + bool test_attr__enabled; bool perf_host = true; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 9969b8b46f7c..f486fdd3a538 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -29,6 +29,8 @@ size_t hex_width(u64 v); int sysctl__max_stack(void); +bool sysctl__nmi_watchdog_enabled(void); + int fetch_kernel_version(unsigned int *puint, char *str, size_t str_sz); #define KVER_VERSION(x) (((x) >> 16) & 0xff) diff --git a/tools/power/cpupower/utils/idle_monitor/amd_fam14h_idle.c b/tools/power/cpupower/utils/idle_monitor/amd_fam14h_idle.c index 33dc34db4f3c..20f46348271b 100644 --- a/tools/power/cpupower/utils/idle_monitor/amd_fam14h_idle.c +++ b/tools/power/cpupower/utils/idle_monitor/amd_fam14h_idle.c @@ -82,7 +82,7 @@ static struct pci_access *pci_acc; static struct pci_dev *amd_fam14h_pci_dev; static int nbp1_entered; -struct timespec start_time; +static struct timespec start_time; static unsigned long long timediff; #ifdef DEBUG diff --git a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c index 3c4cee160b0e..a65f7d011513 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c +++ b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c @@ -19,7 +19,7 @@ struct cpuidle_monitor cpuidle_sysfs_monitor; static unsigned long long **previous_count; static unsigned long long **current_count; -struct timespec start_time; +static struct timespec start_time; static unsigned long long timediff; static int cpuidle_get_count_percent(unsigned int id, double *percent, diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c index 6d44fec55ad5..7c77045fef52 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c +++ 
b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c @@ -27,6 +27,8 @@ struct cpuidle_monitor *all_monitors[] = { 0 }; +int cpu_count; + static struct cpuidle_monitor *monitors[MONITORS_MAX]; static unsigned int avail_monitors; diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h index 5b5eb1da0cce..c559d3115330 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h +++ b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h @@ -25,7 +25,7 @@ #endif #define CSTATE_DESC_LEN 60 -int cpu_count; +extern int cpu_count; /* Hard to define the right names ...: */ enum power_range_e { diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 2b2b8167c65b..b73763489410 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -15,7 +15,7 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.2"; +static const char *version_str = "v1.3"; static const int supported_api_ver = 1; static struct isst_if_platform_info isst_platform_info; static char *progname; @@ -42,6 +42,7 @@ static int out_format_json; static int cmd_help; static int force_online_offline; static int auto_mode; +static int fact_enable_fail; /* clos related */ static int current_clos = -1; @@ -61,6 +62,18 @@ struct _cpu_map { }; struct _cpu_map *cpu_map; +struct cpu_topology { + short cpu; + short core_id; + short pkg_id; + short die_id; +}; + +FILE *get_output_file(void) +{ + return outf; +} + void debug_printf(const char *format, ...) { va_list args; @@ -82,6 +95,14 @@ int is_clx_n_platform(void) return 0; } +int is_skx_based_platform(void) +{ + if (cpu_model == 0x55) + return 1; + + return 0; +} + static int update_cpu_model(void) { unsigned int ebx, ecx, edx; @@ -175,25 +196,137 @@ int out_format_is_json(void) return out_format_json; } +static int get_stored_topology_info(int cpu, int *core_id, int *pkg_id, int *die_id) +{ + const char *pathname = "/tmp/isst_cpu_topology.dat"; + struct cpu_topology cpu_top; + FILE *fp; + int ret; + + fp = fopen(pathname, "rb"); + if (!fp) + return -1; + + ret = fseek(fp, cpu * sizeof(cpu_top), SEEK_SET); + if (ret) + goto err_ret; + + ret = fread(&cpu_top, sizeof(cpu_top), 1, fp); + if (ret != 1) { + ret = -1; + goto err_ret; + } + + *pkg_id = cpu_top.pkg_id; + *core_id = cpu_top.core_id; + *die_id = cpu_top.die_id; + ret = 0; + +err_ret: + fclose(fp); + + return ret; +} + +static void store_cpu_topology(void) +{ + const char *pathname = "/tmp/isst_cpu_topology.dat"; + FILE *fp; + int i; + + fp = fopen(pathname, "rb"); + if (fp) { + /* Mapping already exists */ + fclose(fp); + return; + } + + fp = fopen(pathname, "wb"); + if (!fp) { + fprintf(stderr, "Can't create file:%s\n", pathname); + return; + } + + for (i = 0; i < topo_max_cpus; ++i) { + struct cpu_topology cpu_top; + + cpu_top.core_id = parse_int_file(0, + "/sys/devices/system/cpu/cpu%d/topology/core_id", i); + if (cpu_top.core_id < 0) + cpu_top.core_id = -1; + + cpu_top.pkg_id = parse_int_file(0, + "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", i); + if (cpu_top.pkg_id < 0) + cpu_top.pkg_id = -1; + + cpu_top.die_id = parse_int_file(0, + "/sys/devices/system/cpu/cpu%d/topology/die_id", i); + if (cpu_top.die_id < 0) + cpu_top.die_id = -1; + + cpu_top.cpu = i; + + if (fwrite(&cpu_top, sizeof(cpu_top), 1, fp) != 1) { + fprintf(stderr, "Can't write to:%s\n", pathname); + break; + 
} + } + + fclose(fp); +} + int get_physical_package_id(int cpu) { - return parse_int_file( - 0, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", - cpu); + int ret; + + ret = parse_int_file(0, + "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", + cpu); + if (ret < 0) { + int core_id, pkg_id, die_id; + + ret = get_stored_topology_info(cpu, &core_id, &pkg_id, &die_id); + if (!ret) + return pkg_id; + } + + return ret; } int get_physical_core_id(int cpu) { - return parse_int_file( - 0, "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu); + int ret; + + ret = parse_int_file(0, + "/sys/devices/system/cpu/cpu%d/topology/core_id", + cpu); + if (ret < 0) { + int core_id, pkg_id, die_id; + + ret = get_stored_topology_info(cpu, &core_id, &pkg_id, &die_id); + if (!ret) + return core_id; + } + + return ret; } int get_physical_die_id(int cpu) { int ret; - ret = parse_int_file(0, "/sys/devices/system/cpu/cpu%d/topology/die_id", - cpu); + ret = parse_int_file(0, + "/sys/devices/system/cpu/cpu%d/topology/die_id", + cpu); + if (ret < 0) { + int core_id, pkg_id, die_id; + + ret = get_stored_topology_info(cpu, &core_id, &pkg_id, &die_id); + if (!ret) + return die_id; + } + if (ret < 0) ret = 0; @@ -219,8 +352,14 @@ static void set_cpu_online_offline(int cpu, int state) "/sys/devices/system/cpu/cpu%d/online", cpu); fd = open(buffer, O_WRONLY); - if (fd < 0) + if (fd < 0) { + if (!cpu && state) { + fprintf(stderr, "This system is not configured for CPU 0 online/offline\n"); + fprintf(stderr, "Ignoring online request for CPU 0 as this is already online\n"); + return; + } err(-1, "%s open failed", buffer); + } if (state) ret = write(fd, "1\n", 2); @@ -259,7 +398,12 @@ static void for_each_online_package_in_set(void (*callback)(int, void *, void *, die_id = get_physical_die_id(i); if (die_id < 0) die_id = 0; - pkg_id = get_physical_package_id(i); + + pkg_id = parse_int_file(0, + "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", i); + if (pkg_id < 0) + continue; + /* Create an unique id for package, die combination to store */ pkg_id = (MAX_PACKAGE_COUNT * pkg_id + die_id); @@ -281,7 +425,7 @@ static void for_each_online_target_cpu_in_set( void (*callback)(int, void *, void *, void *, void *), void *arg1, void *arg2, void *arg3, void *arg4) { - int i; + int i, found = 0; for (i = 0; i < topo_max_cpus; ++i) { int online; @@ -295,9 +439,14 @@ static void for_each_online_target_cpu_in_set( online = 1; /* online entry for CPU 0 needs some special configs */ - if (online && callback) + if (online && callback) { callback(i, arg1, arg2, arg3, arg4); + found = 1; + } } + + if (!found) + fprintf(stderr, "No valid CPU in the list\n"); } #define BITMASK_SIZE 32 @@ -305,14 +454,27 @@ static void set_max_cpu_num(void) { FILE *filep; unsigned long dummy; + int i; topo_max_cpus = 0; - filep = fopen_or_exit( - "/sys/devices/system/cpu/cpu0/topology/thread_siblings", "r"); + for (i = 0; i < 256; ++i) { + char path[256]; + + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", i); + filep = fopen(path, "r"); + if (filep) + break; + } + + if (!filep) { + fprintf(stderr, "Can't get max cpu number\n"); + exit(0); + } + while (fscanf(filep, "%lx,", &dummy) == 1) topo_max_cpus += BITMASK_SIZE; fclose(filep); - topo_max_cpus--; /* 0 based */ debug_printf("max cpus %d\n", topo_max_cpus); } @@ -362,6 +524,10 @@ static void set_cpu_present_cpu_mask(void) die_id = 0; pkg_id = get_physical_package_id(i); + if (pkg_id < 0) { + fprintf(stderr, "Failed to get package id, 
CPU %d may be offline\n", i); + continue; + } if (pkg_id < MAX_PACKAGE_COUNT && die_id < MAX_DIE_PER_PACKAGE) { int core_id = get_physical_core_id(i); @@ -542,7 +708,11 @@ static int isst_send_mmio_command(unsigned int cpu, unsigned int reg, int write, } if (ioctl(fd, cmd, &io_regs) == -1) { - perror("ISST_IF_IO_CMD"); + if (errno == ENOTTY) { + perror("ISST_IF_IO_COMMAND\n"); + fprintf(stderr, "Check presence of kernel modules: isst_if_mmio\n"); + exit(0); + } fprintf(outf, "Error: mmio_cmd cpu:%d reg:%x read_write:%x\n", cpu, reg, write); } else { @@ -571,7 +741,8 @@ int isst_send_mbox_command(unsigned int cpu, unsigned char command, "mbox_send: cpu:%d command:%x sub_command:%x parameter:%x req_data:%x\n", cpu, command, sub_command, parameter, req_data); - if (isst_platform_info.mmio_supported && command == CONFIG_CLOS) { + if (!is_skx_based_platform() && command == CONFIG_CLOS && + sub_command != CLOS_PM_QOS_CONFIG) { unsigned int value; int write = 0; int clos_id, core_id, ret = 0; @@ -620,10 +791,14 @@ int isst_send_mbox_command(unsigned int cpu, unsigned char command, err(-1, "%s open failed", pathname); if (ioctl(fd, ISST_IF_MBOX_COMMAND, &mbox_cmds) == -1) { - perror("ISST_IF_MBOX_COMMAND"); - fprintf(outf, - "Error: mbox_cmd cpu:%d command:%x sub_command:%x parameter:%x req_data:%x\n", - cpu, command, sub_command, parameter, req_data); + if (errno == ENOTTY) { + perror("ISST_IF_MBOX_COMMAND\n"); + fprintf(stderr, "Check presence of kernel modules: isst_if_mbox_pci or isst_if_mbox_msr\n"); + exit(0); + } + debug_printf( + "Error: mbox_cmd cpu:%d command:%x sub_command:%x parameter:%x req_data:%x errorno:%d\n", + cpu, command, sub_command, parameter, req_data, errno); return -1; } else { *resp = mbox_cmds.mbox_cmd[0].resp_data; @@ -656,7 +831,7 @@ int isst_send_msr_command(unsigned int cpu, unsigned int msr, int write, msr_cmds.msr_cmd[0].data = *req_resp; if (ioctl(fd, ISST_IF_MSR_COMMAND, &msr_cmds) == -1) { - perror("ISST_IF_MSR_COMMAD"); + perror("ISST_IF_MSR_COMMAND"); fprintf(outf, "Error: msr_cmd cpu:%d msr:%x read_write:%d\n", cpu, msr, write); } else { @@ -697,12 +872,85 @@ static int isst_fill_platform_info(void) return 0; } +static void isst_print_extended_platform_info(void) +{ + int cp_state, cp_cap, fact_support = 0, pbf_support = 0; + struct isst_pkg_ctdp_level_info ctdp_level; + struct isst_pkg_ctdp pkg_dev; + int ret, i, j; + FILE *filep; + + for (i = 0; i < 256; ++i) { + char path[256]; + + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", i); + filep = fopen(path, "r"); + if (filep) + break; + } + + if (!filep) + return; + + fclose(filep); + + ret = isst_get_ctdp_levels(i, &pkg_dev); + if (ret) + return; + + if (pkg_dev.enabled) { + fprintf(outf, "Intel(R) SST-PP (feature perf-profile) is supported\n"); + } else { + fprintf(outf, "Intel(R) SST-PP (feature perf-profile) is not supported\n"); + fprintf(outf, "Only performance level 0 (base level) is present\n"); + } + + if (pkg_dev.locked) + fprintf(outf, "TDP level change control is locked\n"); + else + fprintf(outf, "TDP level change control is unlocked, max level: %d \n", pkg_dev.levels); + + for (j = 0; j <= pkg_dev.levels; ++j) { + ret = isst_get_ctdp_control(i, j, &ctdp_level); + if (ret) + continue; + + if (!fact_support && ctdp_level.fact_support) + fact_support = 1; + + if (!pbf_support && ctdp_level.pbf_support) + pbf_support = 1; + } + + if (fact_support) + fprintf(outf, "Intel(R) SST-TF (feature turbo-freq) is supported\n"); + else + fprintf(outf, "Intel(R) SST-TF 
(feature turbo-freq) is not supported\n"); + + if (pbf_support) + fprintf(outf, "Intel(R) SST-BF (feature base-freq) is supported\n"); + else + fprintf(outf, "Intel(R) SST-BF (feature base-freq) is not supported\n"); + + ret = isst_read_pm_config(i, &cp_state, &cp_cap); + if (cp_cap) + fprintf(outf, "Intel(R) SST-CP (feature core-power) is supported\n"); + else + fprintf(outf, "Intel(R) SST-CP (feature core-power) is not supported\n"); +} + static void isst_print_platform_information(void) { struct isst_if_platform_info platform_info; const char *pathname = "/dev/isst_interface"; int fd; + if (is_clx_n_platform()) { + fprintf(stderr, "\nThis option in not supported on this platform\n"); + exit(0); + } + fd = open(pathname, O_RDWR); if (fd < 0) err(-1, "%s open failed", pathname); @@ -718,6 +966,7 @@ static void isst_print_platform_information(void) platform_info.mbox_supported); fprintf(outf, "Platform: mmio supported : %d\n", platform_info.mmio_supported); + isst_print_extended_platform_info(); } close(fd); @@ -725,6 +974,7 @@ static void isst_print_platform_information(void) exit(0); } +static char *local_str0, *local_str1; static void exec_on_get_ctdp_cpu(int cpu, void *arg1, void *arg2, void *arg3, void *arg4) { @@ -734,13 +984,14 @@ static void exec_on_get_ctdp_cpu(int cpu, void *arg1, void *arg2, void *arg3, fn_ptr = arg1; ret = fn_ptr(cpu, arg2); if (ret) - perror("get_tdp_*"); + isst_display_error_info_message(1, "get_tdp_* failed", 0, 0); else isst_ctdp_display_core_info(cpu, outf, arg3, - *(unsigned int *)arg4); + *(unsigned int *)arg4, + local_str0, local_str1); } -#define _get_tdp_level(desc, suffix, object, help) \ +#define _get_tdp_level(desc, suffix, object, help, str0, str1) \ static void get_tdp_##object(int arg) \ { \ struct isst_pkg_ctdp ctdp; \ @@ -751,6 +1002,8 @@ static void exec_on_get_ctdp_cpu(int cpu, void *arg1, void *arg2, void *arg3, help); \ exit(0); \ } \ + local_str0 = str0; \ + local_str1 = str1; \ isst_ctdp_display_information_start(outf); \ if (max_target_cpus) \ for_each_online_target_cpu_in_set( \ @@ -764,12 +1017,12 @@ static void exec_on_get_ctdp_cpu(int cpu, void *arg1, void *arg2, void *arg3, isst_ctdp_display_information_end(outf); \ } -_get_tdp_level("get-config-levels", levels, levels, "TDP levels"); -_get_tdp_level("get-config-version", levels, version, "TDP version"); -_get_tdp_level("get-config-enabled", levels, enabled, "TDP enable status"); +_get_tdp_level("get-config-levels", levels, levels, "Max TDP level", NULL, NULL); +_get_tdp_level("get-config-version", levels, version, "TDP version", NULL, NULL); +_get_tdp_level("get-config-enabled", levels, enabled, "perf-profile enable status", "disabled", "enabled"); _get_tdp_level("get-config-current_level", levels, current_level, - "Current TDP Level"); -_get_tdp_level("get-lock-status", levels, locked, "TDP lock status"); + "Current TDP Level", NULL, NULL); +_get_tdp_level("get-lock-status", levels, locked, "TDP lock status", "unlocked", "locked"); struct isst_pkg_ctdp clx_n_pkg_dev; @@ -902,9 +1155,14 @@ static void dump_clx_n_config_for_cpu(int cpu, void *arg1, void *arg2, { int ret; + if (tdp_level != 0xff && tdp_level != 0) { + isst_display_error_info_message(1, "Invalid level", 1, tdp_level); + exit(0); + } + ret = clx_n_config(cpu); if (ret) { - perror("isst_get_process_ctdp"); + debug_printf("clx_n_config failed"); } else { struct isst_pkg_ctdp_level_info *ctdp_level; struct isst_pbf_info *pbf_info; @@ -926,7 +1184,9 @@ static void dump_isst_config_for_cpu(int cpu, void *arg1, void *arg2, 
memset(&pkg_dev, 0, sizeof(pkg_dev)); ret = isst_get_process_ctdp(cpu, tdp_level, &pkg_dev); if (ret) { - perror("isst_get_process_ctdp"); + isst_display_error_info_message(1, "Failed to get perf-profile info on cpu", 1, cpu); + isst_ctdp_display_information_end(outf); + exit(1); } else { isst_ctdp_display_information(cpu, outf, tdp_level, &pkg_dev); isst_get_process_ctdp_complete(cpu, &pkg_dev); @@ -969,9 +1229,11 @@ static void set_tdp_level_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, int ret; ret = isst_set_tdp_level(cpu, tdp_level); - if (ret) - perror("set_tdp_level_for_cpu"); - else { + if (ret) { + isst_display_error_info_message(1, "Set TDP level failed", 0, 0); + isst_ctdp_display_information_end(outf); + exit(1); + } else { isst_display_result(cpu, outf, "perf-profile", "set_tdp_level", ret); if (force_online_offline) { @@ -1009,11 +1271,13 @@ static void set_tdp_level(int arg) "\t Arguments: -l|--level : Specify tdp level\n"); fprintf(stderr, "\t Optional Arguments: -o | online : online/offline for the tdp level\n"); + fprintf(stderr, + "\t online/offline operation has limitations, refer to Linux hotplug documentation\n"); exit(0); } if (tdp_level == 0xff) { - fprintf(outf, "Invalid command: specify tdp_level\n"); + isst_display_error_info_message(1, "Invalid command: specify tdp_level", 0, 0); exit(1); } isst_ctdp_display_information_start(outf); @@ -1033,7 +1297,7 @@ static void clx_n_dump_pbf_config_for_cpu(int cpu, void *arg1, void *arg2, ret = clx_n_config(cpu); if (ret) { - perror("isst_get_process_ctdp"); + isst_display_error_info_message(1, "clx_n_config failed", 0, 0); } else { struct isst_pkg_ctdp_level_info *ctdp_level; struct isst_pbf_info *pbf_info; @@ -1054,7 +1318,9 @@ static void dump_pbf_config_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, ret = isst_get_pbf_info(cpu, tdp_level, &pbf_info); if (ret) { - perror("isst_get_pbf_info"); + isst_display_error_info_message(1, "Failed to get base-freq info at this level", 1, tdp_level); + isst_ctdp_display_information_end(outf); + exit(1); } else { isst_pbf_display_information(cpu, outf, tdp_level, &pbf_info); isst_get_pbf_info_complete(&pbf_info); @@ -1074,7 +1340,7 @@ static void dump_pbf_config(int arg) } if (tdp_level == 0xff) { - fprintf(outf, "Invalid command: specify tdp_level\n"); + isst_display_error_info_message(1, "Invalid command: specify tdp_level", 0, 0); exit(1); } @@ -1100,7 +1366,7 @@ static int set_clos_param(int cpu, int clos, int epp, int wt, int min, int max) ret = isst_pm_get_clos(cpu, clos, &clos_config); if (ret) { - perror("isst_pm_get_clos"); + isst_display_error_info_message(1, "isst_pm_get_clos failed", 0, 0); return ret; } clos_config.clos_min = min; @@ -1109,7 +1375,7 @@ static int set_clos_param(int cpu, int clos, int epp, int wt, int min, int max) clos_config.clos_prop_prio = wt; ret = isst_set_clos(cpu, clos, &clos_config); if (ret) { - perror("isst_pm_set_clos"); + isst_display_error_info_message(1, "isst_set_clos failed", 0, 0); return ret; } @@ -1153,7 +1419,7 @@ static int set_clx_pbf_cpufreq_scaling_min_max(int cpu) ret = clx_n_config(cpu); if (ret) { - perror("set_clx_pbf_cpufreq_scaling_min_max"); + debug_printf("cpufreq_scaling_min_max failed for CLX"); return ret; } @@ -1316,7 +1582,7 @@ static int set_core_priority_and_min(int cpu, int mask_size, debug_printf("Associate cpu: %d clos: %d\n", i, clos); ret = isst_clos_associate(i, clos); if (ret) { - perror("isst_clos_associate"); + isst_display_error_info_message(1, "isst_clos_associate failed", 0, 0); return ret; 
} } @@ -1332,14 +1598,14 @@ static int set_pbf_core_power(int cpu) ret = isst_get_ctdp_levels(cpu, &pkg_dev); if (ret) { - perror("isst_get_ctdp_levels"); + debug_printf("isst_get_ctdp_levels failed"); return ret; } debug_printf("Current_level: %d\n", pkg_dev.current_level); ret = isst_get_pbf_info(cpu, pkg_dev.current_level, &pbf_info); if (ret) { - perror("isst_get_pbf_info"); + debug_printf("isst_get_pbf_info failed"); return ret; } debug_printf("p1_high: %d p1_low: %d\n", pbf_info.p1_high, @@ -1349,13 +1615,13 @@ static int set_pbf_core_power(int cpu) pbf_info.core_cpumask, pbf_info.p1_high, pbf_info.p1_low); if (ret) { - perror("set_core_priority_and_min"); + debug_printf("set_core_priority_and_min failed"); return ret; } ret = isst_pm_qos_config(cpu, 1, 1); if (ret) { - perror("isst_pm_qos_config"); + debug_printf("isst_pm_qos_config failed"); return ret; } @@ -1369,17 +1635,13 @@ static void set_pbf_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, int status = *(int *)arg4; if (is_clx_n_platform()) { + ret = 0; if (status) { - ret = 0; - if (auto_mode) - set_clx_pbf_cpufreq_scaling_min_max(cpu); + set_clx_pbf_cpufreq_scaling_min_max(cpu); } else { - ret = -1; - if (auto_mode) { - set_scaling_max_to_cpuinfo_max(cpu); - set_scaling_min_to_cpuinfo_min(cpu); - } + set_scaling_max_to_cpuinfo_max(cpu); + set_scaling_min_to_cpuinfo_min(cpu); } goto disp_result; } @@ -1392,7 +1654,7 @@ static void set_pbf_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, ret = isst_set_pbf_fact_status(cpu, 1, status); if (ret) { - perror("isst_set_pbf"); + debug_printf("isst_set_pbf_fact_status failed"); if (auto_mode) isst_pm_qos_config(cpu, 0, 0); } else { @@ -1405,7 +1667,7 @@ static void set_pbf_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, } if (auto_mode && !status) - isst_pm_qos_config(cpu, 0, 0); + isst_pm_qos_config(cpu, 0, 1); disp_result: if (status) @@ -1424,10 +1686,25 @@ static void set_pbf_enable(int arg) if (enable) { fprintf(stderr, "Enable Intel Speed Select Technology base frequency feature\n"); + if (is_clx_n_platform()) { + fprintf(stderr, + "\tOn this platform this command doesn't enable feature in the hardware.\n"); + fprintf(stderr, + "\tIt updates the cpufreq scaling_min_freq to match cpufreq base_frequency.\n"); + exit(0); + + } fprintf(stderr, "\tOptional Arguments: -a|--auto : Use priority of cores to set core-power associations\n"); } else { + if (is_clx_n_platform()) { + fprintf(stderr, + "\tOn this platform this command doesn't disable feature in the hardware.\n"); + fprintf(stderr, + "\tIt updates the cpufreq scaling_min_freq to match cpuinfo_min_freq\n"); + exit(0); + } fprintf(stderr, "Disable Intel Speed Select Technology base frequency feature\n"); fprintf(stderr, @@ -1452,12 +1729,15 @@ static void dump_fact_config_for_cpu(int cpu, void *arg1, void *arg2, struct isst_fact_info fact_info; int ret; - ret = isst_get_fact_info(cpu, tdp_level, &fact_info); - if (ret) - perror("isst_get_fact_bucket_info"); - else + ret = isst_get_fact_info(cpu, tdp_level, fact_bucket, &fact_info); + if (ret) { + isst_display_error_info_message(1, "Failed to get turbo-freq info at this level", 1, tdp_level); + isst_ctdp_display_information_end(outf); + exit(1); + } else { isst_fact_display_information(cpu, outf, tdp_level, fact_bucket, fact_avx, &fact_info); + } } static void dump_fact_config(int arg) @@ -1475,7 +1755,7 @@ static void dump_fact_config(int arg) } if (tdp_level == 0xff) { - fprintf(outf, "Invalid command: specify tdp_level\n"); + isst_display_error_info_message(1, 
"Invalid command: specify tdp_level\n", 0, 0); exit(1); } @@ -1503,7 +1783,7 @@ static void set_fact_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, ret = isst_set_pbf_fact_status(cpu, 0, status); if (ret) { - perror("isst_set_fact"); + debug_printf("isst_set_pbf_fact_status failed"); if (auto_mode) isst_pm_qos_config(cpu, 0, 0); @@ -1527,6 +1807,8 @@ static void set_fact_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, disp_results: if (status) { isst_display_result(cpu, outf, "turbo-freq", "enable", ret); + if (ret) + fact_enable_fail = ret; } else { /* Since we modified TRL during Fact enable, restore it */ isst_set_trl_from_current_tdp(cpu, fact_trl); @@ -1568,7 +1850,7 @@ static void set_fact_enable(int arg) NULL, &enable); isst_ctdp_display_information_end(outf); - if (enable && auto_mode) { + if (!fact_enable_fail && enable && auto_mode) { /* * When we adjust CLOS param, we have to set for siblings also. * So for the each user specified CPU, also add the sibling @@ -1652,9 +1934,12 @@ static void enable_clos_qos_config(int cpu, void *arg1, void *arg2, void *arg3, int ret; int status = *(int *)arg4; + if (is_skx_based_platform()) + clos_priority_type = 1; + ret = isst_pm_qos_config(cpu, status, clos_priority_type); if (ret) - perror("isst_pm_qos_config"); + isst_display_error_info_message(1, "isst_pm_qos_config failed", 0, 0); if (status) isst_display_result(cpu, outf, "core-power", "enable", @@ -1672,9 +1957,11 @@ static void set_clos_enable(int arg) if (enable) { fprintf(stderr, "Enable core-power for a package/die\n"); - fprintf(stderr, - "\tClos Enable: Specify priority type with [--priority|-p]\n"); - fprintf(stderr, "\t\t 0: Proportional, 1: Ordered\n"); + if (!is_skx_based_platform()) { + fprintf(stderr, + "\tClos Enable: Specify priority type with [--priority|-p]\n"); + fprintf(stderr, "\t\t 0: Proportional, 1: Ordered\n"); + } } else { fprintf(stderr, "Disable core-power: [No command arguments are required]\n"); @@ -1705,7 +1992,7 @@ static void dump_clos_config_for_cpu(int cpu, void *arg1, void *arg2, ret = isst_pm_get_clos(cpu, current_clos, &clos_config); if (ret) - perror("isst_pm_get_clos"); + isst_display_error_info_message(1, "isst_pm_get_clos failed", 0, 0); else isst_clos_display_information(cpu, outf, current_clos, &clos_config); @@ -1721,7 +2008,8 @@ static void dump_clos_config(int arg) exit(0); } if (current_clos < 0 || current_clos > 3) { - fprintf(stderr, "Invalid clos id\n"); + isst_display_error_info_message(1, "Invalid clos id\n", 0, 0); + isst_ctdp_display_information_end(outf); exit(0); } @@ -1742,9 +2030,14 @@ static void get_clos_info_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, ret = isst_clos_get_clos_information(cpu, &enable, &prio_type); if (ret) - perror("isst_clos_get_info"); - else - isst_clos_display_clos_information(cpu, outf, enable, prio_type); + isst_display_error_info_message(1, "isst_clos_get_info failed", 0, 0); + else { + int cp_state, cp_cap; + + isst_read_pm_config(cpu, &cp_state, &cp_cap); + isst_clos_display_clos_information(cpu, outf, enable, prio_type, + cp_state, cp_cap); + } } static void dump_clos_info(int arg) @@ -1752,19 +2045,17 @@ static void dump_clos_info(int arg) if (cmd_help) { fprintf(stderr, "Print Intel Speed Select Technology core power information\n"); - fprintf(stderr, "\tSpecify targeted cpu id with [--cpu|-c]\n"); - exit(0); - } - - if (!max_target_cpus) { - fprintf(stderr, - "Invalid target cpu. 
Specify with [-c|--cpu]\n"); + fprintf(stderr, "\t Optionally specify targeted cpu id with [--cpu|-c]\n"); exit(0); } isst_ctdp_display_information_start(outf); - for_each_online_target_cpu_in_set(get_clos_info_for_cpu, NULL, - NULL, NULL, NULL); + if (max_target_cpus) + for_each_online_target_cpu_in_set(get_clos_info_for_cpu, NULL, + NULL, NULL, NULL); + else + for_each_online_package_in_set(get_clos_info_for_cpu, NULL, + NULL, NULL, NULL); isst_ctdp_display_information_end(outf); } @@ -1785,7 +2076,7 @@ static void set_clos_config_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, clos_config.clos_desired = clos_desired; ret = isst_set_clos(cpu, current_clos, &clos_config); if (ret) - perror("isst_set_clos"); + isst_display_error_info_message(1, "isst_set_clos failed", 0, 0); else isst_display_result(cpu, outf, "core-power", "config", ret); } @@ -1797,26 +2088,27 @@ static void set_clos_config(int arg) "Set core-power configuration for one of the four clos ids\n"); fprintf(stderr, "\tSpecify targeted clos id with [--clos|-c]\n"); - fprintf(stderr, "\tSpecify clos EPP with [--epp|-e]\n"); - fprintf(stderr, - "\tSpecify clos Proportional Priority [--weight|-w]\n"); + if (!is_skx_based_platform()) { + fprintf(stderr, "\tSpecify clos EPP with [--epp|-e]\n"); + fprintf(stderr, + "\tSpecify clos Proportional Priority [--weight|-w]\n"); + } fprintf(stderr, "\tSpecify clos min in MHz with [--min|-n]\n"); fprintf(stderr, "\tSpecify clos max in MHz with [--max|-m]\n"); - fprintf(stderr, "\tSpecify clos desired in MHz with [--desired|-d]\n"); exit(0); } if (current_clos < 0 || current_clos > 3) { - fprintf(stderr, "Invalid clos id\n"); + isst_display_error_info_message(1, "Invalid clos id\n", 0, 0); exit(0); } - if (clos_epp < 0 || clos_epp > 0x0F) { - fprintf(stderr, "clos epp is not specified, default: 0\n"); + if (!is_skx_based_platform() && (clos_epp < 0 || clos_epp > 0x0F)) { + fprintf(stderr, "clos epp is not specified or invalid, default: 0\n"); clos_epp = 0; } - if (clos_prop_prio < 0 || clos_prop_prio > 0x0F) { + if (!is_skx_based_platform() && (clos_prop_prio < 0 || clos_prop_prio > 0x0F)) { fprintf(stderr, - "clos frequency weight is not specified, default: 0\n"); + "clos frequency weight is not specified or invalid, default: 0\n"); clos_prop_prio = 0; } if (clos_min < 0) { @@ -1824,11 +2116,11 @@ static void set_clos_config(int arg) clos_min = 0; } if (clos_max < 0) { - fprintf(stderr, "clos max is not specified, default: 25500 MHz\n"); + fprintf(stderr, "clos max is not specified, default: Max frequency (ratio 0xff)\n"); clos_max = 0xff; } - if (clos_desired < 0) { - fprintf(stderr, "clos desired is not specified, default: 0\n"); + if (clos_desired) { + fprintf(stderr, "clos desired is not supported on this platform\n"); clos_desired = 0x00; } @@ -1849,7 +2141,7 @@ static void set_clos_assoc_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, ret = isst_clos_associate(cpu, current_clos); if (ret) - perror("isst_clos_associate"); + debug_printf("isst_clos_associate failed"); else isst_display_result(cpu, outf, "core-power", "assoc", ret); } @@ -1860,19 +2152,22 @@ static void set_clos_assoc(int arg) fprintf(stderr, "Associate a clos id to a CPU\n"); fprintf(stderr, "\tSpecify targeted clos id with [--clos|-c]\n"); + fprintf(stderr, + "\tFor example to associate clos 1 to CPU 0: issue\n"); + fprintf(stderr, + "\tintel-speed-select --cpu 0 core-power assoc --clos 1\n"); exit(0); } if (current_clos < 0 || current_clos > 3) { - fprintf(stderr, "Invalid clos id\n"); + 
isst_display_error_info_message(1, "Invalid clos id\n", 0, 0); exit(0); } if (max_target_cpus) for_each_online_target_cpu_in_set(set_clos_assoc_for_cpu, NULL, NULL, NULL, NULL); else { - fprintf(stderr, - "Invalid target cpu. Specify with [-c|--cpu]\n"); + isst_display_error_info_message(1, "Invalid target cpu. Specify with [-c|--cpu]", 0, 0); } } @@ -1883,7 +2178,7 @@ static void get_clos_assoc_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, ret = isst_clos_get_assoc_status(cpu, &clos); if (ret) - perror("isst_clos_get_assoc_status"); + isst_display_error_info_message(1, "isst_clos_get_assoc_status failed", 0, 0); else isst_clos_display_assoc_information(cpu, outf, clos); } @@ -1897,8 +2192,7 @@ static void get_clos_assoc(int arg) } if (!max_target_cpus) { - fprintf(stderr, - "Invalid target cpu. Specify with [-c|--cpu]\n"); + isst_display_error_info_message(1, "Invalid target cpu. Specify with [-c|--cpu]", 0, 0); exit(0); } @@ -2035,7 +2329,7 @@ static void parse_cmd_args(int argc, int start, char **argv) option_index = start; optind = start + 1; - while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:hoa", + while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:r:hoa", long_options, &option_index)) != -1) { switch (opt) { case 'a': @@ -2061,7 +2355,7 @@ static void parse_cmd_args(int argc, int start, char **argv) fact_avx = 0x01; } else if (!strncmp(optarg, "avx2", 4)) { fact_avx = 0x02; - } else if (!strncmp(optarg, "avx512", 4)) { + } else if (!strncmp(optarg, "avx512", 6)) { fact_avx = 0x04; } else { fprintf(outf, "Invalid sse,avx options\n"); @@ -2078,6 +2372,10 @@ static void parse_cmd_args(int argc, int start, char **argv) break; case 'e': clos_epp = atoi(optarg); + if (is_skx_based_platform()) { + isst_display_error_info_message(1, "epp can't be specified on this platform", 0, 0); + exit(0); + } break; case 'n': clos_min = atoi(optarg); @@ -2089,14 +2387,25 @@ static void parse_cmd_args(int argc, int start, char **argv) break; case 'p': clos_priority_type = atoi(optarg); + if (is_skx_based_platform() && !clos_priority_type) { + isst_display_error_info_message(1, "Invalid clos priority type: proportional for this platform", 0, 0); + exit(0); + } break; case 'w': clos_prop_prio = atoi(optarg); + if (is_skx_based_platform()) { + isst_display_error_info_message(1, "weight can't be specified on this platform", 0, 0); + exit(0); + } break; default: - printf("no match\n"); + printf("Unknown option: ignore\n"); } } + + if (argv[optind]) + printf("Garbage at the end of command: ignore\n"); } static void isst_help(void) @@ -2214,11 +2523,18 @@ void process_command(int argc, char **argv, static void usage(void) { - printf("Intel(R) Speed Select Technology\n"); + if (is_clx_n_platform()) { + fprintf(stderr, "\nThere is limited support of Intel Speed Select features on this platform.\n"); + fprintf(stderr, "Everything is pre-configured using BIOS options, this tool can't enable any feature in the hardware.\n\n"); + } + printf("\nUsage:\n"); printf("intel-speed-select [OPTIONS] FEATURE COMMAND COMMAND_ARGUMENTS\n"); - printf("\nUse this tool to enumerate and control the Intel Speed Select Technology features,\n"); - printf("\nFEATURE : [perf-profile|base-freq|turbo-freq|core-power]\n"); + printf("\nUse this tool to enumerate and control the Intel Speed Select Technology features:\n"); + if (is_clx_n_platform()) + printf("\nFEATURE : [perf-profile|base-freq]\n"); + else + printf("\nFEATURE : [perf-profile|base-freq|turbo-freq|core-power]\n"); printf("\nFor help on each feature, 
use -h|--help\n"); printf("\tFor example: intel-speed-select perf-profile -h\n"); @@ -2231,17 +2547,29 @@ static void usage(void) printf("\t\tDefault: Die scoped for all dies in the system with multiple dies/package\n"); printf("\t\t\t Or Package scoped for all Packages when each package contains one die\n"); printf("\t[-d|--debug] : Debug mode\n"); + printf("\t[-f|--format] : output format [json|text]. Default: text\n"); printf("\t[-h|--help] : Print help\n"); printf("\t[-i|--info] : Print platform information\n"); printf("\t[-o|--out] : Output file\n"); printf("\t\t\tDefault : stderr\n"); - printf("\t[-f|--format] : output format [json|text]. Default: text\n"); printf("\t[-v|--version] : Print version\n"); printf("\nResult format\n"); printf("\tResult display uses a common format for each command:\n"); printf("\tResults are formatted in text/JSON with\n"); printf("\t\tPackage, Die, CPU, and command specific results.\n"); + + printf("\nExamples\n"); + printf("\tTo get platform information:\n"); + printf("\t\tintel-speed-select --info\n"); + printf("\tTo get full perf-profile information dump:\n"); + printf("\t\tintel-speed-select perf-profile info\n"); + printf("\tTo get full base-freq information dump:\n"); + printf("\t\tintel-speed-select base-freq info -l 0\n"); + if (!is_clx_n_platform()) { + printf("\tTo get full turbo-freq information dump:\n"); + printf("\t\tintel-speed-select turbo-freq info -l 0\n"); + } exit(1); } @@ -2254,6 +2582,8 @@ static void print_version(void) static void cmdline(int argc, char **argv) { + const char *pathname = "/dev/isst_interface"; + FILE *fp; int opt; int option_index = 0; int ret; @@ -2269,6 +2599,28 @@ static void cmdline(int argc, char **argv) { 0, 0, 0, 0 } }; + if (geteuid() != 0) { + fprintf(stderr, "Must run as root\n"); + exit(0); + } + + ret = update_cpu_model(); + if (ret) + err(-1, "Invalid CPU model (%d)\n", cpu_model); + printf("Intel(R) Speed Select Technology\n"); + printf("Executing on CPU model:%d[0x%x]\n", cpu_model, cpu_model); + + if (!is_clx_n_platform()) { + fp = fopen(pathname, "rb"); + if (!fp) { + fprintf(stderr, "Intel speed select drivers are not loaded on this system.\n"); + fprintf(stderr, "Verify that kernel config includes CONFIG_INTEL_SPEED_SELECT_INTERFACE.\n"); + fprintf(stderr, "If the config is included then this is not a supported platform.\n"); + exit(0); + } + fclose(fp); + } + progname = argv[0]; while ((opt = getopt_long_only(argc, argv, "+c:df:hio:v", long_options, &option_index)) != -1) { @@ -2303,21 +2655,12 @@ static void cmdline(int argc, char **argv) } } - if (geteuid() != 0) { - fprintf(stderr, "Must run as root\n"); - exit(0); - } - if (optind > (argc - 2)) { - fprintf(stderr, "Feature name and|or command not specified\n"); + usage(); exit(0); } - ret = update_cpu_model(); - if (ret) - err(-1, "Invalid CPU model (%d)\n", cpu_model); - printf("Intel(R) Speed Select Technology\n"); - printf("Executing on CPU model:%d[0x%x]\n", cpu_model, cpu_model); set_max_cpu_num(); + store_cpu_topology(); set_cpu_present_cpu_mask(); set_cpu_target_cpu_mask(); diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c index 81a119f688a3..67c9b1139631 100644 --- a/tools/power/x86/intel-speed-select/isst-core.c +++ b/tools/power/x86/intel-speed-select/isst-core.c @@ -114,8 +114,10 @@ int isst_get_tdp_info(int cpu, int config_index, ret = isst_send_mbox_command(cpu, CONFIG_TDP, CONFIG_TDP_GET_TDP_INFO, 0, config_index, &resp); - if (ret) + if (ret) { + 
isst_display_error_info_message(1, "Invalid level, Can't get TDP information at level", 1, config_index); return ret; + } ctdp_level->pkg_tdp = resp & GENMASK(14, 0); ctdp_level->tdp_ratio = (resp & GENMASK(23, 16)) >> 16; @@ -352,7 +354,7 @@ int isst_set_tdp_level_msr(int cpu, int tdp_level) debug_printf("cpu: tdp_level via MSR %d\n", cpu, tdp_level); if (isst_get_config_tdp_lock_status(cpu)) { - debug_printf("cpu: tdp_locked %d\n", cpu); + isst_display_error_info_message(1, "tdp_locked", 0, 0); return -1; } @@ -373,19 +375,50 @@ int isst_set_tdp_level(int cpu, int tdp_level) unsigned int resp; int ret; + + if (isst_get_config_tdp_lock_status(cpu)) { + isst_display_error_info_message(1, "TDP is locked", 0, 0); + return -1; + + } + ret = isst_send_mbox_command(cpu, CONFIG_TDP, CONFIG_TDP_SET_LEVEL, 0, tdp_level, &resp); - if (ret) - return isst_set_tdp_level_msr(cpu, tdp_level); + if (ret) { + isst_display_error_info_message(1, "Set TDP level failed for level", 1, tdp_level); + return ret; + } return 0; } int isst_get_pbf_info(int cpu, int level, struct isst_pbf_info *pbf_info) { + struct isst_pkg_ctdp_level_info ctdp_level; + struct isst_pkg_ctdp pkg_dev; int i, ret, core_cnt, max; unsigned int req, resp; + ret = isst_get_ctdp_levels(cpu, &pkg_dev); + if (ret) { + isst_display_error_info_message(1, "Failed to get number of levels", 0, 0); + return ret; + } + + if (level > pkg_dev.levels) { + isst_display_error_info_message(1, "Invalid level", 1, level); + return -1; + } + + ret = isst_get_ctdp_control(cpu, level, &ctdp_level); + if (ret) + return ret; + + if (!ctdp_level.pbf_support) { + isst_display_error_info_message(1, "base-freq feature is not present at this level", 1, level); + return -1; + } + pbf_info->core_cpumask_size = alloc_cpu_set(&pbf_info->core_cpumask); core_cnt = get_core_count(get_physical_package_id(cpu), get_physical_die_id(cpu)); @@ -481,6 +514,10 @@ int isst_set_pbf_fact_status(int cpu, int pbf, int enable) else req &= ~BIT(17); } else { + + if (enable && !ctdp_level.sst_cp_enabled) + isst_display_error_info_message(0, "Make sure to execute before: core-power enable", 0, 0); + if (ctdp_level.pbf_enabled) req = BIT(17); @@ -566,10 +603,32 @@ int isst_get_fact_bucket_info(int cpu, int level, return 0; } -int isst_get_fact_info(int cpu, int level, struct isst_fact_info *fact_info) +int isst_get_fact_info(int cpu, int level, int fact_bucket, struct isst_fact_info *fact_info) { + struct isst_pkg_ctdp_level_info ctdp_level; + struct isst_pkg_ctdp pkg_dev; unsigned int resp; - int ret; + int j, ret, print; + + ret = isst_get_ctdp_levels(cpu, &pkg_dev); + if (ret) { + isst_display_error_info_message(1, "Failed to get number of levels", 0, 0); + return ret; + } + + if (level > pkg_dev.levels) { + isst_display_error_info_message(1, "Invalid level", 1, level); + return -1; + } + + ret = isst_get_ctdp_control(cpu, level, &ctdp_level); + if (ret) + return ret; + + if (!ctdp_level.fact_support) { + isst_display_error_info_message(1, "turbo-freq feature is not present at this level", 1, level); + return -1; + } ret = isst_send_mbox_command(cpu, CONFIG_TDP, CONFIG_TDP_GET_FACT_LP_CLIPPING_RATIO, 0, @@ -585,8 +644,25 @@ int isst_get_fact_info(int cpu, int level, struct isst_fact_info *fact_info) fact_info->lp_clipping_ratio_license_avx512 = (resp >> 16) & 0xff; ret = isst_get_fact_bucket_info(cpu, level, fact_info->bucket_info); + if (ret) + return ret; - return ret; + print = 0; + for (j = 0; j < ISST_FACT_MAX_BUCKETS; ++j) { + if (fact_bucket != 0xff && fact_bucket != j) + 
continue; + + if (!fact_info->bucket_info[j].high_priority_cores_count) + break; + + print = 1; + } + if (!print) { + isst_display_error_info_message(1, "Invalid bucket", 0, 0); + return -1; + } + + return 0; } int isst_set_trl(int cpu, unsigned long long trl) @@ -671,7 +747,7 @@ void isst_get_process_ctdp_complete(int cpu, struct isst_pkg_ctdp *pkg_dev) int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev) { - int i, ret; + int i, ret, valid = 0; if (pkg_dev->processed) return 0; @@ -684,6 +760,14 @@ int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev) cpu, pkg_dev->enabled, pkg_dev->current_level, pkg_dev->levels); + if (tdp_level != 0xff && tdp_level > pkg_dev->levels) { + isst_display_error_info_message(1, "Invalid level", 0, 0); + return -1; + } + + if (!pkg_dev->enabled) + isst_display_error_info_message(0, "perf-profile feature is not supported, just base-config level 0 is valid", 0, 0); + for (i = 0; i <= pkg_dev->levels; ++i) { struct isst_pkg_ctdp_level_info *ctdp_level; @@ -703,6 +787,7 @@ int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev) if (ret) continue; + valid = 1; pkg_dev->processed = 1; ctdp_level->processed = 1; @@ -713,7 +798,7 @@ int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev) } if (ctdp_level->fact_support) { - ret = isst_get_fact_info(cpu, i, + ret = isst_get_fact_info(cpu, i, 0xff, &ctdp_level->fact_info); if (ret) return ret; @@ -775,6 +860,9 @@ int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev) isst_get_uncore_mem_freq(cpu, i, ctdp_level); } + if (!valid) + isst_display_error_info_message(0, "Invalid level, Can't get TDP control information at specified levels on cpu", 1, cpu); + return 0; } @@ -829,17 +917,19 @@ int isst_pm_qos_config(int cpu, int enable_clos, int priority_type) } ret = isst_write_pm_config(cpu, 0); if (ret) - perror("isst_write_pm_config\n"); + isst_display_error_info_message(0, "WRITE_PM_CONFIG command failed, ignoring error\n", 0, 0); } else { ret = isst_write_pm_config(cpu, 1); if (ret) - perror("isst_write_pm_config\n"); + isst_display_error_info_message(0, "WRITE_PM_CONFIG command failed, ignoring error\n", 0, 0); } ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PM_QOS_CONFIG, 0, 0, &resp); - if (ret) + if (ret) { + isst_display_error_info_message(1, "CLOS_PM_QOS_CONFIG command failed", 0, 0); return ret; + } debug_printf("cpu:%d CLOS_PM_QOS_CONFIG resp:%x\n", cpu, resp); @@ -850,6 +940,9 @@ int isst_pm_qos_config(int cpu, int enable_clos, int priority_type) else req = req & ~BIT(1); + if (priority_type > 1) + isst_display_error_info_message(1, "Invalid priority type: Changing type to ordered", 0, 0); + if (priority_type) req = req | BIT(2); else diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index 4fb0c1d49d64..51dbaa5f02ec 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -158,10 +158,17 @@ static void format_and_print(FILE *outf, int level, char *header, char *value) last_level = level; } -static void print_package_info(int cpu, FILE *outf) +static int print_package_info(int cpu, FILE *outf) { char header[256]; + if (out_format_is_json()) { + snprintf(header, sizeof(header), "package-%d:die-%d:cpu-%d", + get_physical_package_id(cpu), get_physical_die_id(cpu), + cpu); + format_and_print(outf, 1, header, NULL); + return 1; + } snprintf(header, 
sizeof(header), "package-%d", get_physical_package_id(cpu)); format_and_print(outf, 1, header, NULL); @@ -169,6 +176,8 @@ static void print_package_info(int cpu, FILE *outf) format_and_print(outf, 2, header, NULL); snprintf(header, sizeof(header), "cpu-%d", cpu); format_and_print(outf, 3, header, NULL); + + return 3; } static void _isst_pbf_display_information(int cpu, FILE *outf, int level, @@ -178,7 +187,7 @@ static void _isst_pbf_display_information(int cpu, FILE *outf, int level, char header[256]; char value[256]; - snprintf(header, sizeof(header), "speed-select-base-freq"); + snprintf(header, sizeof(header), "speed-select-base-freq-properties"); format_and_print(outf, disp_level, header, NULL); snprintf(header, sizeof(header), "high-priority-base-frequency(MHz)"); @@ -222,9 +231,23 @@ static void _isst_fact_display_information(int cpu, FILE *outf, int level, struct isst_fact_bucket_info *bucket_info = fact_info->bucket_info; char header[256]; char value[256]; - int j; + int print = 0, j; + + for (j = 0; j < ISST_FACT_MAX_BUCKETS; ++j) { + if (fact_bucket != 0xff && fact_bucket != j) + continue; + + if (!bucket_info[j].high_priority_cores_count) + break; - snprintf(header, sizeof(header), "speed-select-turbo-freq"); + print = 1; + } + if (!print) { + fprintf(stderr, "Invalid bucket\n"); + return; + } + + snprintf(header, sizeof(header), "speed-select-turbo-freq-properties"); format_and_print(outf, base_level, header, NULL); for (j = 0; j < ISST_FACT_MAX_BUCKETS; ++j) { if (fact_bucket != 0xff && fact_bucket != j) @@ -289,7 +312,7 @@ static void _isst_fact_display_information(int cpu, FILE *outf, int level, } void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix, - unsigned int val) + unsigned int val, char *str0, char *str1) { char header[256]; char value[256]; @@ -301,8 +324,12 @@ void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix, format_and_print(outf, 2, header, NULL); snprintf(header, sizeof(header), "cpu-%d", cpu); format_and_print(outf, 3, header, NULL); - - snprintf(value, sizeof(value), "%u", val); + if (str0 && !val) + snprintf(value, sizeof(value), "%s", str0); + else if (str1 && val) + snprintf(value, sizeof(value), "%s", str1); + else + snprintf(value, sizeof(value), "%u", val); format_and_print(outf, 4, prefix, value); format_and_print(outf, 1, NULL, NULL); @@ -313,10 +340,11 @@ void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, { char header[256]; char value[256]; - int i, base_level = 1; + static int level; + int i; if (pkg_dev->processed) - print_package_info(cpu, outf); + level = print_package_info(cpu, outf); for (i = 0; i <= pkg_dev->levels; ++i) { struct isst_pkg_ctdp_level_info *ctdp_level; @@ -328,72 +356,80 @@ void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, snprintf(header, sizeof(header), "perf-profile-level-%d", ctdp_level->level); - format_and_print(outf, base_level + 3, header, NULL); + format_and_print(outf, level + 1, header, NULL); snprintf(header, sizeof(header), "cpu-count"); j = get_cpu_count(get_physical_die_id(cpu), get_physical_die_id(cpu)); snprintf(value, sizeof(value), "%d", j); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); + + j = CPU_COUNT_S(ctdp_level->core_cpumask_size, + ctdp_level->core_cpumask); + if (j) { + snprintf(header, sizeof(header), "enable-cpu-count"); + snprintf(value, sizeof(value), "%d", j); + format_and_print(outf, level + 2, header, value); + } if (ctdp_level->core_cpumask_size) { 
snprintf(header, sizeof(header), "enable-cpu-mask"); printcpumask(sizeof(value), value, ctdp_level->core_cpumask_size, ctdp_level->core_cpumask); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "enable-cpu-list"); printcpulist(sizeof(value), value, ctdp_level->core_cpumask_size, ctdp_level->core_cpumask); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); } snprintf(header, sizeof(header), "thermal-design-power-ratio"); snprintf(value, sizeof(value), "%d", ctdp_level->tdp_ratio); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "base-frequency(MHz)"); if (!ctdp_level->sse_p1) ctdp_level->sse_p1 = ctdp_level->tdp_ratio; snprintf(value, sizeof(value), "%d", ctdp_level->sse_p1 * DISP_FREQ_MULTIPLIER); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); if (ctdp_level->avx2_p1) { snprintf(header, sizeof(header), "base-frequency-avx2(MHz)"); snprintf(value, sizeof(value), "%d", ctdp_level->avx2_p1 * DISP_FREQ_MULTIPLIER); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); } if (ctdp_level->avx512_p1) { snprintf(header, sizeof(header), "base-frequency-avx512(MHz)"); snprintf(value, sizeof(value), "%d", ctdp_level->avx512_p1 * DISP_FREQ_MULTIPLIER); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); } if (ctdp_level->uncore_p1) { snprintf(header, sizeof(header), "uncore-frequency-min(MHz)"); snprintf(value, sizeof(value), "%d", ctdp_level->uncore_p1 * DISP_FREQ_MULTIPLIER); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); } if (ctdp_level->uncore_p0) { snprintf(header, sizeof(header), "uncore-frequency-max(MHz)"); snprintf(value, sizeof(value), "%d", ctdp_level->uncore_p0 * DISP_FREQ_MULTIPLIER); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); } if (ctdp_level->mem_freq) { snprintf(header, sizeof(header), "mem-frequency(MHz)"); snprintf(value, sizeof(value), "%d", ctdp_level->mem_freq * DISP_FREQ_MULTIPLIER); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); } snprintf(header, sizeof(header), @@ -405,7 +441,7 @@ void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, snprintf(value, sizeof(value), "disabled"); } else snprintf(value, sizeof(value), "unsupported"); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "speed-select-base-freq"); @@ -416,7 +452,7 @@ void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, snprintf(value, sizeof(value), "disabled"); } else snprintf(value, sizeof(value), "unsupported"); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "speed-select-core-power"); @@ -427,110 +463,115 @@ void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, snprintf(value, sizeof(value), "disabled"); } else snprintf(value, sizeof(value), "unsupported"); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); if 
(is_clx_n_platform()) { if (ctdp_level->pbf_support) _isst_pbf_display_information(cpu, outf, tdp_level, &ctdp_level->pbf_info, - base_level + 4); + level + 1); continue; } if (ctdp_level->pkg_tdp) { snprintf(header, sizeof(header), "thermal-design-power(W)"); snprintf(value, sizeof(value), "%d", ctdp_level->pkg_tdp); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); } if (ctdp_level->t_proc_hot) { snprintf(header, sizeof(header), "tjunction-max(C)"); snprintf(value, sizeof(value), "%d", ctdp_level->t_proc_hot); - format_and_print(outf, base_level + 4, header, value); + format_and_print(outf, level + 2, header, value); } snprintf(header, sizeof(header), "turbo-ratio-limits-sse"); - format_and_print(outf, base_level + 4, header, NULL); + format_and_print(outf, level + 2, header, NULL); for (j = 0; j < 8; ++j) { snprintf(header, sizeof(header), "bucket-%d", j); - format_and_print(outf, base_level + 5, header, NULL); + format_and_print(outf, level + 3, header, NULL); snprintf(header, sizeof(header), "core-count"); snprintf(value, sizeof(value), "%llu", (ctdp_level->buckets_info >> (j * 8)) & 0xff); - format_and_print(outf, base_level + 6, header, value); + format_and_print(outf, level + 4, header, value); snprintf(header, sizeof(header), "max-turbo-frequency(MHz)"); snprintf(value, sizeof(value), "%d", ctdp_level->trl_sse_active_cores[j] * DISP_FREQ_MULTIPLIER); - format_and_print(outf, base_level + 6, header, value); + format_and_print(outf, level + 4, header, value); } if (ctdp_level->trl_avx_active_cores[0]) { snprintf(header, sizeof(header), "turbo-ratio-limits-avx2"); - format_and_print(outf, base_level + 4, header, NULL); + format_and_print(outf, level + 2, header, NULL); for (j = 0; j < 8; ++j) { snprintf(header, sizeof(header), "bucket-%d", j); - format_and_print(outf, base_level + 5, header, NULL); + format_and_print(outf, level + 3, header, NULL); snprintf(header, sizeof(header), "core-count"); snprintf(value, sizeof(value), "%llu", (ctdp_level->buckets_info >> (j * 8)) & 0xff); - format_and_print(outf, base_level + 6, header, value); + format_and_print(outf, level + 4, header, value); snprintf(header, sizeof(header), "max-turbo-frequency(MHz)"); snprintf(value, sizeof(value), "%d", ctdp_level->trl_avx_active_cores[j] * DISP_FREQ_MULTIPLIER); - format_and_print(outf, base_level + 6, header, value); + format_and_print(outf, level + 4, header, value); } } if (ctdp_level->trl_avx_512_active_cores[0]) { snprintf(header, sizeof(header), "turbo-ratio-limits-avx512"); - format_and_print(outf, base_level + 4, header, NULL); + format_and_print(outf, level + 2, header, NULL); for (j = 0; j < 8; ++j) { snprintf(header, sizeof(header), "bucket-%d", j); - format_and_print(outf, base_level + 5, header, NULL); + format_and_print(outf, level + 3, header, NULL); snprintf(header, sizeof(header), "core-count"); snprintf(value, sizeof(value), "%llu", (ctdp_level->buckets_info >> (j * 8)) & 0xff); - format_and_print(outf, base_level + 6, header, value); + format_and_print(outf, level + 4, header, value); snprintf(header, sizeof(header), "max-turbo-frequency(MHz)"); snprintf(value, sizeof(value), "%d", ctdp_level->trl_avx_512_active_cores[j] * DISP_FREQ_MULTIPLIER); - format_and_print(outf, base_level + 6, header, value); + format_and_print(outf, level + 4, header, value); } } if (ctdp_level->pbf_support) _isst_pbf_display_information(cpu, outf, i, &ctdp_level->pbf_info, - base_level + 4); + level + 2); if (ctdp_level->fact_support) 
_isst_fact_display_information(cpu, outf, i, 0xff, 0xff, &ctdp_level->fact_info, - base_level + 4); + level + 2); } format_and_print(outf, 1, NULL, NULL); } +static int start; void isst_ctdp_display_information_start(FILE *outf) { last_level = 0; format_and_print(outf, 0, "start", NULL); + start = 1; } void isst_ctdp_display_information_end(FILE *outf) { format_and_print(outf, 0, NULL, NULL); + start = 0; } void isst_pbf_display_information(int cpu, FILE *outf, int level, struct isst_pbf_info *pbf_info) { - print_package_info(cpu, outf); - _isst_pbf_display_information(cpu, outf, level, pbf_info, 4); + int _level; + + _level = print_package_info(cpu, outf); + _isst_pbf_display_information(cpu, outf, level, pbf_info, _level + 1); format_and_print(outf, 1, NULL, NULL); } @@ -538,9 +579,11 @@ void isst_fact_display_information(int cpu, FILE *outf, int level, int fact_bucket, int fact_avx, struct isst_fact_info *fact_info) { - print_package_info(cpu, outf); + int _level; + + _level = print_package_info(cpu, outf); _isst_fact_display_information(cpu, outf, level, fact_bucket, fact_avx, - fact_info, 4); + fact_info, _level + 1); format_and_print(outf, 1, NULL, NULL); } @@ -549,94 +592,103 @@ void isst_clos_display_information(int cpu, FILE *outf, int clos, { char header[256]; char value[256]; + int level; - snprintf(header, sizeof(header), "package-%d", - get_physical_package_id(cpu)); - format_and_print(outf, 1, header, NULL); - snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); - format_and_print(outf, 2, header, NULL); - snprintf(header, sizeof(header), "cpu-%d", cpu); - format_and_print(outf, 3, header, NULL); + level = print_package_info(cpu, outf); snprintf(header, sizeof(header), "core-power"); - format_and_print(outf, 4, header, NULL); + format_and_print(outf, level + 1, header, NULL); snprintf(header, sizeof(header), "clos"); snprintf(value, sizeof(value), "%d", clos); - format_and_print(outf, 5, header, value); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "epp"); snprintf(value, sizeof(value), "%d", clos_config->epp); - format_and_print(outf, 5, header, value); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "clos-proportional-priority"); snprintf(value, sizeof(value), "%d", clos_config->clos_prop_prio); - format_and_print(outf, 5, header, value); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "clos-min"); snprintf(value, sizeof(value), "%d MHz", clos_config->clos_min * DISP_FREQ_MULTIPLIER); - format_and_print(outf, 5, header, value); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "clos-max"); - snprintf(value, sizeof(value), "%d MHz", clos_config->clos_max * DISP_FREQ_MULTIPLIER); - format_and_print(outf, 5, header, value); + if (clos_config->clos_max == 0xff) + snprintf(value, sizeof(value), "Max Turbo frequency"); + else + snprintf(value, sizeof(value), "%d MHz", clos_config->clos_max * DISP_FREQ_MULTIPLIER); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "clos-desired"); snprintf(value, sizeof(value), "%d MHz", clos_config->clos_desired * DISP_FREQ_MULTIPLIER); - format_and_print(outf, 5, header, value); + format_and_print(outf, level + 2, header, value); - format_and_print(outf, 1, NULL, NULL); + format_and_print(outf, level, NULL, NULL); } void isst_clos_display_clos_information(int cpu, FILE *outf, - int clos_enable, int type) + int clos_enable, int type, + int state, int 
cap) { char header[256]; char value[256]; + int level; - snprintf(header, sizeof(header), "package-%d", - get_physical_package_id(cpu)); - format_and_print(outf, 1, header, NULL); - snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); - format_and_print(outf, 2, header, NULL); - snprintf(header, sizeof(header), "cpu-%d", cpu); - format_and_print(outf, 3, header, NULL); + level = print_package_info(cpu, outf); snprintf(header, sizeof(header), "core-power"); - format_and_print(outf, 4, header, NULL); + format_and_print(outf, level + 1, header, NULL); + + snprintf(header, sizeof(header), "support-status"); + if (cap) + snprintf(value, sizeof(value), "supported"); + else + snprintf(value, sizeof(value), "unsupported"); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "enable-status"); - snprintf(value, sizeof(value), "%d", clos_enable); - format_and_print(outf, 5, header, value); + if (state) + snprintf(value, sizeof(value), "enabled"); + else + snprintf(value, sizeof(value), "disabled"); + format_and_print(outf, level + 2, header, value); + + snprintf(header, sizeof(header), "clos-enable-status"); + if (clos_enable) + snprintf(value, sizeof(value), "enabled"); + else + snprintf(value, sizeof(value), "disabled"); + format_and_print(outf, level + 2, header, value); snprintf(header, sizeof(header), "priority-type"); - snprintf(value, sizeof(value), "%d", type); - format_and_print(outf, 5, header, value); + if (type) + snprintf(value, sizeof(value), "ordered"); + else + snprintf(value, sizeof(value), "proportional"); + format_and_print(outf, level + 2, header, value); - format_and_print(outf, 1, NULL, NULL); + format_and_print(outf, level, NULL, NULL); } void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos) { char header[256]; char value[256]; + int level; - snprintf(header, sizeof(header), "package-%d", - get_physical_package_id(cpu)); - format_and_print(outf, 1, header, NULL); - snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); - format_and_print(outf, 2, header, NULL); - snprintf(header, sizeof(header), "cpu-%d", cpu); - format_and_print(outf, 3, header, NULL); + level = print_package_info(cpu, outf); snprintf(header, sizeof(header), "get-assoc"); - format_and_print(outf, 4, header, NULL); + format_and_print(outf, level + 1, header, NULL); snprintf(header, sizeof(header), "clos"); snprintf(value, sizeof(value), "%d", clos); - format_and_print(outf, 5, header, value); + format_and_print(outf, level + 2, header, value); - format_and_print(outf, 1, NULL, NULL); + format_and_print(outf, level, NULL, NULL); } void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, @@ -644,24 +696,60 @@ void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, { char header[256]; char value[256]; + int level = 3; + + if (cpu >= 0) + level = print_package_info(cpu, outf); - if (cpu >= 0) { - snprintf(header, sizeof(header), "package-%d", - get_physical_package_id(cpu)); - format_and_print(outf, 1, header, NULL); - snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); - format_and_print(outf, 2, header, NULL); - snprintf(header, sizeof(header), "cpu-%d", cpu); - format_and_print(outf, 3, header, NULL); - } snprintf(header, sizeof(header), "%s", feature); - format_and_print(outf, 4, header, NULL); + format_and_print(outf, level + 1, header, NULL); snprintf(header, sizeof(header), "%s", cmd); if (!result) snprintf(value, sizeof(value), "success"); else snprintf(value, sizeof(value), 
"failed(error %d)", result); - format_and_print(outf, 5, header, value); + format_and_print(outf, level + 2, header, value); + + format_and_print(outf, level, NULL, NULL); +} + +void isst_display_error_info_message(int error, char *msg, int arg_valid, int arg) +{ + FILE *outf = get_output_file(); + static int error_index; + char header[256]; + char value[256]; + + if (!out_format_is_json()) { + if (arg_valid) + snprintf(value, sizeof(value), "%s %d", msg, arg); + else + snprintf(value, sizeof(value), "%s", msg); + + if (error) + fprintf(outf, "Error: %s\n", value); + else + fprintf(outf, "Information: %s\n", value); + return; + } + + if (!start) + format_and_print(outf, 0, "start", NULL); + + if (error) + snprintf(header, sizeof(header), "Error%d", error_index++); + else + snprintf(header, sizeof(header), "Information:%d", error_index++); + format_and_print(outf, 1, header, NULL); + + snprintf(header, sizeof(header), "message"); + if (arg_valid) + snprintf(value, sizeof(value), "%s %d", msg, arg); + else + snprintf(value, sizeof(value), "%s", msg); + format_and_print(outf, 2, header, value); format_and_print(outf, 1, NULL, NULL); + if (!start) + format_and_print(outf, 0, NULL, NULL); } diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index ad5aa6341d0f..2e1afd856a78 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -172,6 +172,7 @@ extern int get_cpu_count(int pkg_id, int die_id); extern int get_core_count(int pkg_id, int die_id); /* Common interfaces */ +FILE *get_output_file(void); extern void debug_printf(const char *format, ...); extern int out_format_is_json(void); extern int get_physical_package_id(int cpu); @@ -196,6 +197,8 @@ extern int isst_send_msr_command(unsigned int cpu, unsigned int command, int write, unsigned long long *req_resp); extern int isst_get_ctdp_levels(int cpu, struct isst_pkg_ctdp *pkg_dev); +extern int isst_get_ctdp_control(int cpu, int config_index, + struct isst_pkg_ctdp_level_info *ctdp_level); extern int isst_get_coremask_info(int cpu, int config_index, struct isst_pkg_ctdp_level_info *ctdp_level); extern int isst_get_process_ctdp(int cpu, int tdp_level, @@ -205,7 +208,7 @@ extern void isst_get_process_ctdp_complete(int cpu, extern void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev); extern void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix, - unsigned int val); + unsigned int val, char *str0, char *str1); extern void isst_ctdp_display_information_start(FILE *outf); extern void isst_ctdp_display_information_end(FILE *outf); extern void isst_pbf_display_information(int cpu, FILE *outf, int level, @@ -216,7 +219,7 @@ extern int isst_set_pbf_fact_status(int cpu, int pbf, int enable); extern int isst_get_pbf_info(int cpu, int level, struct isst_pbf_info *pbf_info); extern void isst_get_pbf_info_complete(struct isst_pbf_info *pbf_info); -extern int isst_get_fact_info(int cpu, int level, +extern int isst_get_fact_info(int cpu, int level, int fact_bucket, struct isst_fact_info *fact_info); extern int isst_get_fact_bucket_info(int cpu, int level, struct isst_fact_bucket_info *bucket_info); @@ -245,7 +248,10 @@ extern void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, extern int isst_clos_get_clos_information(int cpu, int *enable, int *type); extern void isst_clos_display_clos_information(int cpu, FILE *outf, - int clos_enable, int type); + int clos_enable, int type, + int 
state, int cap); extern int is_clx_n_platform(void); extern int get_cpufreq_base_freq(int cpu); +extern int isst_read_pm_config(int cpu, int *cp_state, int *cp_cap); +extern void isst_display_error_info_message(int error, char *msg, int arg_valid, int arg); #endif diff --git a/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py b/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py index 256199c7a182..3c47865bb247 100755 --- a/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py +++ b/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py @@ -235,7 +235,6 @@ def plot_duration_cpu(): output_png = 'all_cpu_durations.png' g_plot = common_all_gnuplot_settings(output_png) # autoscale this one, no set y range - g_plot('set ytics 0, 500') g_plot('set ylabel "Timer Duration (MilliSeconds)"') g_plot('set title "{} : cpu durations : {:%F %H:%M}"'.format(testname, datetime.now())) diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 13f1e8b9ac52..2b6551269e43 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -16,7 +16,7 @@ override CFLAGS += -D_FORTIFY_SOURCE=2 %: %.c @mkdir -p $(BUILD_OUTPUT) - $(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) + $(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) -lcap .PHONY : clean clean : diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 31c1ca0bb3ee..33b370865d16 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -30,7 +30,7 @@ #include <sched.h> #include <time.h> #include <cpuid.h> -#include <linux/capability.h> +#include <sys/capability.h> #include <errno.h> #include <math.h> @@ -304,6 +304,10 @@ int *irqs_per_cpu; /* indexed by cpu_num */ void setup_all_buffers(void); +char *sys_lpi_file; +char *sys_lpi_file_sysfs = "/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us"; +char *sys_lpi_file_debugfs = "/sys/kernel/debug/pmc_core/slp_s0_residency_usec"; + int cpu_is_not_present(int cpu) { return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set); @@ -2916,8 +2920,6 @@ int snapshot_gfx_mhz(void) * * record snapshot of * /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us - * - * return 1 if config change requires a restart, else return 0 */ int snapshot_cpu_lpi_us(void) { @@ -2941,17 +2943,14 @@ int snapshot_cpu_lpi_us(void) /* * snapshot_sys_lpi() * - * record snapshot of - * /sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us - * - * return 1 if config change requires a restart, else return 0 + * record snapshot of sys_lpi_file */ int snapshot_sys_lpi_us(void) { FILE *fp; int retval; - fp = fopen_or_die("/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us", "r"); + fp = fopen_or_die(sys_lpi_file, "r"); retval = fscanf(fp, "%lld", &cpuidle_cur_sys_lpi_us); if (retval != 1) { @@ -3151,28 +3150,42 @@ void check_dev_msr() err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" "); } -void check_permissions() +/* + * check for CAP_SYS_RAWIO + * return 0 on success + * return 1 on fail + */ +int check_for_cap_sys_rawio(void) { - struct __user_cap_header_struct cap_header_data; - cap_user_header_t cap_header = &cap_header_data; - struct __user_cap_data_struct cap_data_data; - cap_user_data_t cap_data = &cap_data_data; - extern int capget(cap_user_header_t hdrp, cap_user_data_t datap); - int do_exit = 0; - char pathname[32]; + cap_t caps; + cap_flag_value_t cap_flag_value; - /* check for CAP_SYS_RAWIO */ - 
cap_header->pid = getpid(); - cap_header->version = _LINUX_CAPABILITY_VERSION; - if (capget(cap_header, cap_data) < 0) - err(-6, "capget(2) failed"); + caps = cap_get_proc(); + if (caps == NULL) + err(-6, "cap_get_proc\n"); - if ((cap_data->effective & (1 << CAP_SYS_RAWIO)) == 0) { - do_exit++; + if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) + err(-6, "cap_get\n"); + + if (cap_flag_value != CAP_SET) { warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname); + return 1; } + if (cap_free(caps) == -1) + err(-6, "cap_free\n"); + + return 0; +} +void check_permissions(void) +{ + int do_exit = 0; + char pathname[32]; + + /* check for CAP_SYS_RAWIO */ + do_exit += check_for_cap_sys_rawio(); + /* test file permissions */ sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (euidaccess(pathname, R_OK)) { @@ -3265,6 +3278,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ + case INTEL_FAM6_ATOM_TREMONT: /* EHL */ pkg_cstate_limits = glm_pkg_cstate_limits; break; default: @@ -3336,6 +3350,17 @@ int is_skx(unsigned int family, unsigned int model) } return 0; } +int is_ehl(unsigned int family, unsigned int model) +{ + if (!genuine_intel) + return 0; + + switch (model) { + case INTEL_FAM6_ATOM_TREMONT: + return 1; + } + return 0; +} int has_turbo_ratio_limit(unsigned int family, unsigned int model) { @@ -3478,6 +3503,23 @@ dump_cstate_pstate_config_info(unsigned int family, unsigned int model) dump_nhm_cst_cfg(); } +static void dump_sysfs_file(char *path) +{ + FILE *input; + char cpuidle_buf[64]; + + input = fopen(path, "r"); + if (input == NULL) { + if (debug) + fprintf(outf, "NSFOD %s\n", path); + return; + } + if (!fgets(cpuidle_buf, sizeof(cpuidle_buf), input)) + err(1, "%s: failed to read file", path); + fclose(input); + + fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); +} static void dump_sysfs_cstate_config(void) { @@ -3491,6 +3533,15 @@ dump_sysfs_cstate_config(void) if (!DO_BIC(BIC_sysfs)) return; + if (access("/sys/devices/system/cpu/cpuidle", R_OK)) { + fprintf(outf, "cpuidle not loaded\n"); + return; + } + + dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_driver"); + dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor"); + dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor_ro"); + for (state = 0; state < 10; ++state) { sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", @@ -3894,6 +3945,20 @@ void rapl_probe_intel(unsigned int family, unsigned int model) else BIC_PRESENT(BIC_PkgWatt); break; + case INTEL_FAM6_ATOM_TREMONT: /* EHL */ + do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO; + if (rapl_joules) { + BIC_PRESENT(BIC_Pkg_J); + BIC_PRESENT(BIC_Cor_J); + BIC_PRESENT(BIC_RAM_J); + BIC_PRESENT(BIC_GFX_J); + } else { + BIC_PRESENT(BIC_PkgWatt); + BIC_PRESENT(BIC_CorWatt); + BIC_PRESENT(BIC_RAMWatt); + BIC_PRESENT(BIC_GFXWatt); + } + break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO; @@ -4295,6 +4360,7 @@ int has_snb_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* 
DNV */ + case INTEL_FAM6_ATOM_TREMONT: /* EHL */ return 1; } return 0; @@ -4324,6 +4390,7 @@ int has_c8910_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_CANNONLAKE_L: /* CNL */ case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + case INTEL_FAM6_ATOM_TREMONT: /* EHL */ return 1; } return 0; @@ -4610,14 +4677,24 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_KABYLAKE_L: case INTEL_FAM6_KABYLAKE: + case INTEL_FAM6_COMETLAKE_L: + case INTEL_FAM6_COMETLAKE: return INTEL_FAM6_SKYLAKE_L; case INTEL_FAM6_ICELAKE_L: case INTEL_FAM6_ICELAKE_NNPI: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: return INTEL_FAM6_CANNONLAKE_L; case INTEL_FAM6_ATOM_TREMONT_D: return INTEL_FAM6_ATOM_GOLDMONT_D; + + case INTEL_FAM6_ATOM_TREMONT_L: + return INTEL_FAM6_ATOM_TREMONT; + + case INTEL_FAM6_ICELAKE_X: + return INTEL_FAM6_SKYLAKE_X; } return model; } @@ -4872,7 +4949,8 @@ void process_cpuid() do_slm_cstates = is_slm(family, model); do_knl_cstates = is_knl(family, model); - if (do_slm_cstates || do_knl_cstates || is_cnl(family, model)) + if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || + is_ehl(family, model)) BIC_NOT_PRESENT(BIC_CPU_c3); if (!quiet) @@ -4907,10 +4985,16 @@ void process_cpuid() else BIC_NOT_PRESENT(BIC_CPU_LPI); - if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us", R_OK)) + if (!access(sys_lpi_file_sysfs, R_OK)) { + sys_lpi_file = sys_lpi_file_sysfs; BIC_PRESENT(BIC_SYS_LPI); - else + } else if (!access(sys_lpi_file_debugfs, R_OK)) { + sys_lpi_file = sys_lpi_file_debugfs; + BIC_PRESENT(BIC_SYS_LPI); + } else { + sys_lpi_file_sysfs = NULL; BIC_NOT_PRESENT(BIC_SYS_LPI); + } if (!quiet) decode_misc_feature_control(); @@ -5306,7 +5390,7 @@ int get_and_dump_counters(void) } void print_version() { - fprintf(outf, "turbostat version 19.08.31" + fprintf(outf, "turbostat version 20.03.20" " - Len Brown <lenb@kernel.org>\n"); } @@ -5323,9 +5407,9 @@ int add_counter(unsigned int msr_num, char *path, char *name, } msrp->msr_num = msr_num; - strncpy(msrp->name, name, NAME_BYTES); + strncpy(msrp->name, name, NAME_BYTES - 1); if (path) - strncpy(msrp->path, path, PATH_BYTES); + strncpy(msrp->path, path, PATH_BYTES - 1); msrp->width = width; msrp->type = type; msrp->format = format; diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index ded7a950dc40..a7974638561c 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 ifneq ($(O),) ifeq ($(origin O), command line) - dummy := $(if $(shell test -d $(O) || echo $(O)),$(error O=$(O) does not exist),) - ABSOLUTE_O := $(shell cd $(O) ; pwd) + dummy := $(if $(shell cd $(PWD); test -d $(O) || echo $(O)),$(error O=$(O) does not exist),) + ABSOLUTE_O := $(shell cd $(PWD); cd $(O) ; pwd) OUTPUT := $(ABSOLUTE_O)/$(if $(subdir),$(subdir)/) COMMAND_O := O=$(ABSOLUTE_O) ifeq ($(objtree),) @@ -106,6 +106,7 @@ ifneq ($(silent),1) ifneq ($(V),1) QUIET_CC = @echo ' CC '$@; QUIET_CC_FPIC = @echo ' CC FPIC '$@; + QUIET_CLANG = @echo ' CLANG '$@; QUIET_AR = @echo ' AR '$@; QUIET_LINK = @echo ' LINK '$@; QUIET_MKDIR = @echo ' MKDIR '$@; diff --git a/tools/spi/Makefile b/tools/spi/Makefile index 5c342e655e55..2249a1546cc1 100644 --- a/tools/spi/Makefile +++ b/tools/spi/Makefile @@ -51,7 +51,7 @@ $(OUTPUT)spidev_fdx: $(SPIDEV_FDX_IN) clean: rm -f $(ALL_PROGRAMS) - rm -f $(OUTPUT)include/linux/spi/spidev.h + rm -rf $(OUTPUT)include/ 
find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete install: $(ALL_PROGRAMS) diff --git a/tools/spi/spidev_test.c b/tools/spi/spidev_test.c index 3559e7646256..27967dd90f8f 100644 --- a/tools/spi/spidev_test.c +++ b/tools/spi/spidev_test.c @@ -13,6 +13,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <errno.h> #include <getopt.h> #include <fcntl.h> #include <time.h> @@ -26,7 +27,11 @@ static void pabort(const char *s) { - perror(s); + if (errno != 0) + perror(s); + else + printf("%s\n", s); + abort(); } @@ -283,7 +288,6 @@ static void parse_opts(int argc, char *argv[]) break; default: print_usage(argv[0]); - break; } } if (mode & SPI_LOOP) { @@ -405,6 +409,9 @@ int main(int argc, char *argv[]) parse_opts(argc, argv); + if (input_tx && input_file) + pabort("only one of -p and --input may be selected"); + fd = open(device, O_RDWR); if (fd < 0) pabort("can't open device"); @@ -446,9 +453,6 @@ int main(int argc, char *argv[]) printf("bits per word: %d\n", bits); printf("max speed: %d Hz (%d KHz)\n", speed, speed/1000); - if (input_tx && input_file) - pabort("only one of -p and --input may be selected"); - if (input_tx) transfer_escaped_string(fd, input_tx); else if (input_file) diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 220d04f958a6..7570e36d636d 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -30,7 +30,7 @@ my %default = ( "EMAIL_WHEN_STARTED" => 0, "NUM_TESTS" => 1, "TEST_TYPE" => "build", - "BUILD_TYPE" => "randconfig", + "BUILD_TYPE" => "oldconfig", "MAKE_CMD" => "make", "CLOSE_CONSOLE_SIGNAL" => "INT", "TIMEOUT" => 120, @@ -1030,7 +1030,7 @@ sub __read_config { } if (!$skip && $rest !~ /^\s*$/) { - die "$name: $.: Gargbage found after $type\n$_"; + die "$name: $.: Garbage found after $type\n$_"; } if ($skip && $type eq "TEST_START") { @@ -1063,7 +1063,7 @@ sub __read_config { } if ($rest !~ /^\s*$/) { - die "$name: $.: Gargbage found after DEFAULTS\n$_"; + die "$name: $.: Garbage found after DEFAULTS\n$_"; } } elsif (/^\s*INCLUDE\s+(\S+)/) { @@ -1154,7 +1154,7 @@ sub __read_config { # on of these sections that have SKIP defined. # The save variable can be # defined multiple times and the new one simply overrides - # the prevous one. + # the previous one. set_variable($lvalue, $rvalue); } else { @@ -1234,7 +1234,7 @@ sub read_config { foreach my $option (keys %not_used) { print "$option\n"; } - print "Set IGRNORE_UNUSED = 1 to have ktest ignore unused variables\n"; + print "Set IGNORE_UNUSED = 1 to have ktest ignore unused variables\n"; if (!read_yn "Do you want to continue?") { exit -1; } @@ -1345,7 +1345,7 @@ sub eval_option { # Check for recursive evaluations. # 100 deep should be more than enough. if ($r++ > 100) { - die "Over 100 evaluations accurred with $option\n" . + die "Over 100 evaluations occurred with $option\n" . "Check for recursive variables\n"; } $prev = $option; @@ -1383,7 +1383,7 @@ sub reboot { } else { # Make sure everything has been written to disk - run_ssh("sync"); + run_ssh("sync", 10); if (defined($time)) { start_monitor; @@ -1461,7 +1461,7 @@ sub get_test_name() { sub dodie { - # avoid recusion + # avoid recursion return if ($in_die); $in_die = 1; diff --git a/tools/testing/ktest/sample.conf b/tools/testing/ktest/sample.conf index c3bc933d437b..27666b8007ed 100644 --- a/tools/testing/ktest/sample.conf +++ b/tools/testing/ktest/sample.conf @@ -10,7 +10,7 @@ # # Options set in the beginning of the file are considered to be -# default options. 
These options can be overriden by test specific +# default options. These options can be overridden by test specific # options, with the following exceptions: # # LOG_FILE @@ -204,7 +204,7 @@ # # This config file can also contain "config variables". # These are assigned with ":=" instead of the ktest option -# assigment "=". +# assignment "=". # # The difference between ktest options and config variables # is that config variables can be used multiple times, @@ -263,7 +263,7 @@ #### Using options in other options #### # # Options that are defined in the config file may also be used -# by other options. All options are evaulated at time of +# by other options. All options are evaluated at time of # use (except that config variables are evaluated at config # processing time). # @@ -505,7 +505,7 @@ #TEST = ssh user@machine /root/run_test # The build type is any make config type or special command -# (default randconfig) +# (default oldconfig) # nobuild - skip the clean and build step # useconfig:/path/to/config - use the given config and run # oldconfig on it. @@ -707,7 +707,7 @@ # Line to define a successful boot up in console output. # This is what the line contains, not the entire line. If you need -# the entire line to match, then use regural expression syntax like: +# the entire line to match, then use regular expression syntax like: # (do not add any quotes around it) # # SUCCESS_LINE = ^MyBox Login:$ @@ -839,7 +839,7 @@ # (ignored if POWEROFF_ON_SUCCESS is set) #REBOOT_ON_SUCCESS = 1 -# In case there are isses with rebooting, you can specify this +# In case there are issues with rebooting, you can specify this # to always powercycle after this amount of time after calling # reboot. # Note, POWERCYCLE_AFTER_REBOOT = 0 does NOT disable it. It just @@ -848,7 +848,7 @@ # (default undefined) #POWERCYCLE_AFTER_REBOOT = 5 -# In case there's isses with halting, you can specify this +# In case there's issues with halting, you can specify this # to always poweroff after this amount of time after calling # halt. # Note, POWEROFF_AFTER_HALT = 0 does NOT disable it. It just @@ -972,7 +972,7 @@ # # PATCHCHECK_START is required and is the first patch to # test (the SHA1 of the commit). You may also specify anything -# that git checkout allows (branch name, tage, HEAD~3). +# that git checkout allows (branch name, tag, HEAD~3). # # PATCHCHECK_END is the last patch to check (default HEAD) # @@ -994,7 +994,7 @@ # IGNORE_WARNINGS is set for the given commit's sha1 # # IGNORE_WARNINGS can be used to disable the failure of patchcheck -# on a particuler commit (SHA1). You can add more than one commit +# on a particular commit (SHA1). You can add more than one commit # by adding a list of SHA1s that are space delimited. # # If BUILD_NOCLEAN is set, then make mrproper will not be run on @@ -1093,7 +1093,7 @@ # whatever reason. (Can't reboot, want to inspect each iteration) # Doing a BISECT_MANUAL will have the test wait for you to # tell it if the test passed or failed after each iteration. -# This is basicall the same as running git bisect yourself +# This is basically the same as running git bisect yourself # but ktest will rebuild and install the kernel for you. # # BISECT_CHECK = 1 (optional, default 0) @@ -1239,7 +1239,7 @@ # # CONFIG_BISECT_EXEC (optional) # The config bisect is a separate program that comes with ktest.pl. -# By befault, it will look for: +# By default, it will look for: # `pwd`/config-bisect.pl # the location ktest.pl was executed from. 
# If it does not find it there, it will look for: # `dirname <ktest.pl>`/config-bisect.pl # The directory that holds ktest.pl diff --git a/tools/testing/selftests/.gitignore b/tools/testing/selftests/.gitignore index 61df01cdf0b2..304fdf1a21dc 100644 --- a/tools/testing/selftests/.gitignore +++ b/tools/testing/selftests/.gitignore @@ -3,4 +3,7 @@ gpiogpio-hammer gpioinclude/ gpiolsgpio tpm2/SpaceTest.log -tpm2/*.pyc + +# Python bytecode and cache +__pycache__/ +*.py[cod] diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 6ec503912bea..0635684cbf07 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -33,9 +33,9 @@ TARGETS += memory-hotplug TARGETS += mount TARGETS += mqueue TARGETS += net +TARGETS += net/forwarding TARGETS += net/mptcp TARGETS += netfilter -TARGETS += networking/timestamping TARGETS += nsfs TARGETS += pidfd TARGETS += powerpc @@ -91,7 +91,7 @@ override LDFLAGS = override MAKEFLAGS = endif -# Append kselftest to KBUILD_OUTPUT to avoid cluttering +# Append kselftest to KBUILD_OUTPUT and O to avoid cluttering # KBUILD_OUTPUT with selftest objects and headers installed # by selftests Makefile or lib.mk. ifdef building_out_of_srctree @@ -99,7 +99,7 @@ override LDFLAGS = endif ifneq ($(O),) - BUILD := $(O) + BUILD := $(O)/kselftest else ifneq ($(KBUILD_OUTPUT),) BUILD := $(KBUILD_OUTPUT)/kselftest diff --git a/tools/testing/selftests/android/Makefile b/tools/testing/selftests/android/Makefile index 7c462714b418..9258306cafe9 100644 --- a/tools/testing/selftests/android/Makefile +++ b/tools/testing/selftests/android/Makefile @@ -21,7 +21,7 @@ all: override define INSTALL_RULE mkdir -p $(INSTALL_PATH) - install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) +install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) @for SUBDIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$SUBDIR; \ diff --git a/tools/testing/selftests/android/ion/Makefile b/tools/testing/selftests/android/ion/Makefile index 0eb7ab626e1c..42b71f005332 100644 --- a/tools/testing/selftests/android/ion/Makefile +++ b/tools/testing/selftests/android/ion/Makefile @@ -17,4 +17,4 @@ include ../../lib.mk $(OUTPUT)/ionapp_export: ionapp_export.c ipcsocket.c ionutils.c $(OUTPUT)/ionapp_import: ionapp_import.c ipcsocket.c ionutils.c -$(OUTPUT)/ionmap_test: ionmap_test.c ionutils.c +$(OUTPUT)/ionmap_test: ionmap_test.c ionutils.c ipcsocket.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index ec464859c6b6..2198cd876675 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -31,6 +31,7 @@ test_tcp_check_syncookie_user test_sysctl test_hashmap test_btf_dump +test_current_pid_tgid_new_ns xdping test_cpp *.skel.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 257a1aaaa37d..7729892e0b04 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -20,8 +20,9 @@ CLANG ?= clang LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) -CFLAGS += -g -Wall -O2 $(GENFLAGS) -I$(CURDIR) -I$(APIDIR) \ +CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) -I$(CURDIR) \ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \ + -I$(APIDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread 
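For illustration of the CFLAGS block kept by the hunk above: the -Dbpf_prog_load=bpf_prog_test_load and -Dbpf_load_program=bpf_test_load_program defines reroute the plain libbpf entry points used by the tests into wrappers provided by the test harness. The sketch below shows roughly what such a wrapper amounts to; it is not the actual implementation from test_stub.c, and the log_level choice is only an example of what a wrapper might tweak.

#include <linux/bpf.h>
#include <bpf/libbpf.h>

/* Rough sketch of a bpf_prog_load() replacement; the real wrapper in
 * test_stub.c differs in detail. */
int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
		       struct bpf_object **pobj, int *prog_fd)
{
	struct bpf_prog_load_attr attr = {
		.file = file,
		.prog_type = type,
		.log_level = 0,	/* a test wrapper could raise this for debugging */
	};

	/* tests keep calling bpf_prog_load(); the -D define reroutes them here */
	return bpf_prog_load_attr(&attr, pobj, prog_fd);
}

Because the redirection happens in the preprocessor, test sources stay written against the ordinary libbpf API and need no changes when the harness behaviour evolves.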
@@ -32,7 +33,8 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \ test_cgroup_storage \ test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \ - test_progs-no_alu32 + test_progs-no_alu32 \ + test_current_pid_tgid_new_ns # Also test bpf-gcc, if present ifneq ($(BPF_GCC),) @@ -62,7 +64,8 @@ TEST_PROGS := test_kmod.sh \ test_tc_tunnel.sh \ test_tc_edt.sh \ test_xdping.sh \ - test_bpftool_build.sh + test_bpftool_build.sh \ + test_bpftool.sh TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ @@ -128,10 +131,13 @@ $(OUTPUT)/test_stub.o: test_stub.c $(BPFOBJ) $(call msg,CC,,$@) $(CC) -c $(CFLAGS) -o $@ $< -VMLINUX_BTF_PATHS := $(abspath ../../../../vmlinux) \ - /sys/kernel/btf/vmlinux \ - /boot/vmlinux-$(shell uname -r) -VMLINUX_BTF:= $(firstword $(wildcard $(VMLINUX_BTF_PATHS))) +VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF := $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + $(OUTPUT)/runqslower: $(BPFOBJ) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ OUTPUT=$(SCRATCH_DIR)/ VMLINUX_BTF=$(VMLINUX_BTF) \ @@ -171,6 +177,10 @@ $(BUILD_DIR)/libbpf $(BUILD_DIR)/bpftool $(INCLUDE_DIR): $(call msg,MKDIR,,$@) mkdir -p $@ +$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) | $(BPFTOOL) $(INCLUDE_DIR) + $(call msg,GEN,,$@) + $(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ + # Get Clang's default includes on this system, as opposed to those seen by # '-target bpf'. This fixes "missing" files on some architectures/distros, # such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. 
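The new $(INCLUDE_DIR)/vmlinux.h rule above generates a single self-contained header from the kernel's BTF ("bpftool btf dump file ... format c"), and the BPF object rule now depends on it. A minimal BPF program built on top of that header might look like the sketch below; the attach point, program name and counter are hypothetical and not taken from this patch, and BPF_KPROBE is assumed to come from <bpf/bpf_tracing.h>, the header the removed bpf_trace_helpers.h macros move to elsewhere in this series.

#include "vmlinux.h"		/* generated by: bpftool btf dump file <vmlinux BTF> format c */
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("kprobe/do_sys_open")	/* hypothetical attach point, for illustration only */
int BPF_KPROBE(handle_open, int dfd, const char *filename)
{
	/* kernel types (including the pt_regs hidden behind BPF_KPROBE)
	 * come from the generated vmlinux.h, so no kernel uapi headers
	 * are needed when compiling with clang -target bpf */
	int pid = bpf_get_current_pid_tgid() >> 32;

	bpf_printk("open: pid=%d\n", pid);
	return 0;
}

The practical effect of the Makefile rule is that every selftest .bpf.c built this way shares one header describing all in-kernel types, instead of pulling assorted kernel headers per program.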
@@ -189,8 +199,8 @@ MENDIAN=$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ - -I$(INCLUDE_DIR) -I$(CURDIR) -I$(CURDIR)/include/uapi \ - -I$(APIDIR) -I$(abspath $(OUTPUT)/../usr/include) + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types @@ -209,7 +219,7 @@ define CLANG_BPF_BUILD_RULE $(call msg,CLNG-LLC,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -target bpf -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -mattr=dwarfris -march=bpf -mcpu=probe $4 -filetype=obj -o $2 + $(LLC) -mattr=dwarfris -march=bpf -mcpu=v3 $4 -filetype=obj -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE @@ -223,7 +233,7 @@ define CLANG_NATIVE_BPF_BUILD_RULE $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -march=bpf -mcpu=probe $4 -filetype=obj -o $2 + $(LLC) -march=bpf -mcpu=v3 $4 -filetype=obj -o $2 endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE @@ -279,6 +289,7 @@ $(TRUNNER_BPF_PROGS_DIR)$(if $2,-)$2-bpfobjs := y $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ + $$(INCLUDE_DIR)/vmlinux.h \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS), \ diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index 8f21965ffc6c..5bf2fe9b1efa 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -6,7 +6,7 @@ #include <linux/types.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_core_read.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> #define BPF_STRUCT_OPS(name, args...) \ SEC("struct_ops/"#name) \ diff --git a/tools/testing/selftests/bpf/bpf_trace_helpers.h b/tools/testing/selftests/bpf/bpf_trace_helpers.h deleted file mode 100644 index c6f1354d93fb..000000000000 --- a/tools/testing/selftests/bpf/bpf_trace_helpers.h +++ /dev/null @@ -1,120 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -#ifndef __BPF_TRACE_HELPERS_H -#define __BPF_TRACE_HELPERS_H - -#include <bpf/bpf_helpers.h> - -#define ___bpf_concat(a, b) a ## b -#define ___bpf_apply(fn, n) ___bpf_concat(fn, n) -#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N -#define ___bpf_narg(...) \ - ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) -#define ___bpf_empty(...) \ - ___bpf_nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0) - -#define ___bpf_ctx_cast0() ctx -#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] -#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] -#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] -#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] -#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] -#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] -#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] -#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] -#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] -#define ___bpf_ctx_cast10(x, args...) 
___bpf_ctx_cast9(args), (void *)ctx[9] -#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] -#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] -#define ___bpf_ctx_cast(args...) \ - ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) - -/* - * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and - * similar kinds of BPF programs, that accept input arguments as a single - * pointer to untyped u64 array, where each u64 can actually be a typed - * pointer or integer of different size. Instead of requring user to write - * manual casts and work with array elements by index, BPF_PROG macro - * allows user to declare a list of named and typed input arguments in the - * same syntax as for normal C function. All the casting is hidden and - * performed transparently, while user code can just assume working with - * function arguments of specified type and name. - * - * Original raw context argument is preserved as well as 'ctx' argument. - * This is useful when using BPF helpers that expect original context - * as one of the parameters (e.g., for bpf_perf_event_output()). - */ -#define BPF_PROG(name, args...) \ -name(unsigned long long *ctx); \ -static __always_inline typeof(name(0)) \ -____##name(unsigned long long *ctx, ##args); \ -typeof(name(0)) name(unsigned long long *ctx) \ -{ \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - return ____##name(___bpf_ctx_cast(args)); \ - _Pragma("GCC diagnostic pop") \ -} \ -static __always_inline typeof(name(0)) \ -____##name(unsigned long long *ctx, ##args) - -struct pt_regs; - -#define ___bpf_kprobe_args0() ctx -#define ___bpf_kprobe_args1(x) \ - ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) -#define ___bpf_kprobe_args2(x, args...) \ - ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) -#define ___bpf_kprobe_args3(x, args...) \ - ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) -#define ___bpf_kprobe_args4(x, args...) \ - ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) -#define ___bpf_kprobe_args5(x, args...) \ - ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) -#define ___bpf_kprobe_args(args...) \ - ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) - -/* - * BPF_KPROBE serves the same purpose for kprobes as BPF_PROG for - * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific - * low-level way of getting kprobe input arguments from struct pt_regs, and - * provides a familiar typed and named function arguments syntax and - * semantics of accessing kprobe input paremeters. - * - * Original struct pt_regs* context is preserved as 'ctx' argument. This might - * be necessary when using BPF helpers like bpf_perf_event_output(). - */ -#define BPF_KPROBE(name, args...) \ -name(struct pt_regs *ctx); \ -static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args);\ -typeof(name(0)) name(struct pt_regs *ctx) \ -{ \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - return ____##name(___bpf_kprobe_args(args)); \ - _Pragma("GCC diagnostic pop") \ -} \ -static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) - -#define ___bpf_kretprobe_args0() ctx -#define ___bpf_kretprobe_argsN(x, args...) \ - ___bpf_kprobe_args(args), (void *)PT_REGS_RET(ctx) -#define ___bpf_kretprobe_args(args...) 
\ - ___bpf_apply(___bpf_kretprobe_args, ___bpf_empty(args))(args) - -/* - * BPF_KRETPROBE is similar to BPF_KPROBE, except, in addition to listing all - * input kprobe arguments, one last extra argument has to be specified, which - * captures kprobe return value. - */ -#define BPF_KRETPROBE(name, args...) \ -name(struct pt_regs *ctx); \ -static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args);\ -typeof(name(0)) name(struct pt_regs *ctx) \ -{ \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - return ____##name(___bpf_kretprobe_args(args)); \ - _Pragma("GCC diagnostic pop") \ -} \ -static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) -#endif diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 5dc109f4c097..60e3ae5d4e48 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -35,3 +35,5 @@ CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y +CONFIG_BPF_LSM=y +CONFIG_SECURITY=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 8482bbc67eec..9a8f47fc0b91 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -11,6 +11,7 @@ static const unsigned int total_bytes = 10 * 1024 * 1024; static const struct timeval timeo_sec = { .tv_sec = 10 }; static const size_t timeo_optlen = sizeof(timeo_sec); +static int expected_stg = 0xeB9F; static int stop, duration; static int settimeo(int fd) @@ -88,7 +89,7 @@ done: return NULL; } -static void do_test(const char *tcp_ca) +static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { struct sockaddr_in6 sa6 = {}; ssize_t nr_recv = 0, bytes = 0; @@ -126,14 +127,34 @@ static void do_test(const char *tcp_ca) err = listen(lfd, 1); if (CHECK(err == -1, "listen", "errno:%d\n", errno)) goto done; - err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd); - if (CHECK(err != 0, "pthread_create", "err:%d\n", err)) - goto done; + + if (sk_stg_map) { + err = bpf_map_update_elem(bpf_map__fd(sk_stg_map), &fd, + &expected_stg, BPF_NOEXIST); + if (CHECK(err, "bpf_map_update_elem(sk_stg_map)", + "err:%d errno:%d\n", err, errno)) + goto done; + } /* connect to server */ err = connect(fd, (struct sockaddr *)&sa6, addrlen); if (CHECK(err == -1, "connect", "errno:%d\n", errno)) - goto wait_thread; + goto done; + + if (sk_stg_map) { + int tmp_stg; + + err = bpf_map_lookup_elem(bpf_map__fd(sk_stg_map), &fd, + &tmp_stg); + if (CHECK(!err || errno != ENOENT, + "bpf_map_lookup_elem(sk_stg_map)", + "err:%d errno:%d\n", err, errno)) + goto done; + } + + err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd); + if (CHECK(err != 0, "pthread_create", "err:%d errno:%d\n", err, errno)) + goto done; /* recv total_bytes */ while (bytes < total_bytes && !READ_ONCE(stop)) { @@ -149,7 +170,6 @@ static void do_test(const char *tcp_ca) CHECK(bytes != total_bytes, "recv", "%zd != %u nr_recv:%zd errno:%d\n", bytes, total_bytes, nr_recv, errno); -wait_thread: WRITE_ONCE(stop, 1); pthread_join(srv_thread, &thread_ret); CHECK(IS_ERR(thread_ret), "pthread_join", "thread_ret:%ld", @@ -175,7 +195,7 @@ static void test_cubic(void) return; } - do_test("bpf_cubic"); + do_test("bpf_cubic", NULL); bpf_link__destroy(link); bpf_cubic__destroy(cubic_skel); @@ -197,7 +217,10 @@ static void test_dctcp(void) return; } - do_test("bpf_dctcp"); + 
do_test("bpf_dctcp", dctcp_skel->maps.sk_stg_map); + CHECK(dctcp_skel->bss->stg_result != expected_stg, + "Unexpected stg_result", "stg_result (%x) != expected_stg (%x)\n", + dctcp_skel->bss->stg_result, expected_stg); bpf_link__destroy(link); bpf_dctcp__destroy(dctcp_skel); diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 7390d3061065..cb33a7ee4e04 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -125,6 +125,6 @@ void test_btf_dump() { if (!test__start_subtest(t->name)) continue; - test_btf_dump_case(i, &btf_dump_test_cases[i]); + test_btf_dump_case(i, &btf_dump_test_cases[i]); } } diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c index 5b13f2c6c402..70e94e783070 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c @@ -6,7 +6,7 @@ #define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" -char bpf_log_buf[BPF_LOG_BUF_SIZE]; +static char bpf_log_buf[BPF_LOG_BUF_SIZE]; static int prog_load(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c index 2ff21dbce179..139f8e82c7c6 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -6,7 +6,7 @@ #define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" -char bpf_log_buf[BPF_LOG_BUF_SIZE]; +static char bpf_log_buf[BPF_LOG_BUF_SIZE]; static int map_fd = -1; diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c index 9d8cb48b99de..9e96f8d87fea 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c @@ -8,7 +8,7 @@ #define BAR "/foo/bar/" #define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" -char bpf_log_buf[BPF_LOG_BUF_SIZE]; +static char bpf_log_buf[BPF_LOG_BUF_SIZE]; static int prog_load(int verdict) { diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c new file mode 100644 index 000000000000..6e04f8d1d15b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "cgroup_helpers.h" +#include "test_cgroup_link.skel.h" + +static __u32 duration = 0; +#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" + +static struct test_cgroup_link *skel = NULL; + +int ping_and_check(int exp_calls, int exp_alt_calls) +{ + skel->bss->calls = 0; + skel->bss->alt_calls = 0; + CHECK_FAIL(system(PING_CMD)); + if (CHECK(skel->bss->calls != exp_calls, "call_cnt", + "exp %d, got %d\n", exp_calls, skel->bss->calls)) + return -EINVAL; + if (CHECK(skel->bss->alt_calls != exp_alt_calls, "alt_call_cnt", + "exp %d, got %d\n", exp_alt_calls, skel->bss->alt_calls)) + return -EINVAL; + return 0; +} + +void test_cgroup_link(void) +{ + struct { + const char *path; + int fd; + } cgs[] = { + { "/cg1" }, + { "/cg1/cg2" }, + { "/cg1/cg2/cg3" }, + { "/cg1/cg2/cg3/cg4" }, + }; + int last_cg = ARRAY_SIZE(cgs) - 1, cg_nr = ARRAY_SIZE(cgs); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, link_upd_opts); + struct bpf_link 
*links[ARRAY_SIZE(cgs)] = {}, *tmp_link; + __u32 prog_ids[ARRAY_SIZE(cgs)], prog_cnt = 0, attach_flags; + int i = 0, err, prog_fd; + bool detach_legacy = false; + + skel = test_cgroup_link__open_and_load(); + if (CHECK(!skel, "skel_open_load", "failed to open/load skeleton\n")) + return; + prog_fd = bpf_program__fd(skel->progs.egress); + + err = setup_cgroup_environment(); + if (CHECK(err, "cg_init", "failed: %d\n", err)) + goto cleanup; + + for (i = 0; i < cg_nr; i++) { + cgs[i].fd = create_and_get_cgroup(cgs[i].path); + if (CHECK(cgs[i].fd < 0, "cg_create", "fail: %d\n", cgs[i].fd)) + goto cleanup; + } + + err = join_cgroup(cgs[last_cg].path); + if (CHECK(err, "cg_join", "fail: %d\n", err)) + goto cleanup; + + for (i = 0; i < cg_nr; i++) { + links[i] = bpf_program__attach_cgroup(skel->progs.egress, + cgs[i].fd); + if (CHECK(IS_ERR(links[i]), "cg_attach", "i: %d, err: %ld\n", + i, PTR_ERR(links[i]))) + goto cleanup; + } + + ping_and_check(cg_nr, 0); + + /* query the number of effective progs and attach flags in root cg */ + err = bpf_prog_query(cgs[0].fd, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, &attach_flags, NULL, + &prog_cnt); + CHECK_FAIL(err); + CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI); + if (CHECK(prog_cnt != 1, "effect_cnt", "exp %d, got %d\n", 1, prog_cnt)) + goto cleanup; + + /* query the number of effective progs in last cg */ + err = bpf_prog_query(cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, NULL, NULL, + &prog_cnt); + CHECK_FAIL(err); + CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI); + if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n", + cg_nr, prog_cnt)) + goto cleanup; + + /* query the effective prog IDs in last cg */ + err = bpf_prog_query(cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, &attach_flags, + prog_ids, &prog_cnt); + CHECK_FAIL(err); + CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI); + if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n", + cg_nr, prog_cnt)) + goto cleanup; + for (i = 1; i < prog_cnt; i++) { + CHECK(prog_ids[i - 1] != prog_ids[i], "prog_id_check", + "idx %d, prev id %d, cur id %d\n", + i, prog_ids[i - 1], prog_ids[i]); + } + + /* detach bottom program and ping again */ + bpf_link__destroy(links[last_cg]); + links[last_cg] = NULL; + + ping_and_check(cg_nr - 1, 0); + + /* mix in with non link-based multi-attachments */ + err = bpf_prog_attach(prog_fd, cgs[last_cg].fd, + BPF_CGROUP_INET_EGRESS, BPF_F_ALLOW_MULTI); + if (CHECK(err, "cg_attach_legacy", "errno=%d\n", errno)) + goto cleanup; + detach_legacy = true; + + links[last_cg] = bpf_program__attach_cgroup(skel->progs.egress, + cgs[last_cg].fd); + if (CHECK(IS_ERR(links[last_cg]), "cg_attach", "err: %ld\n", + PTR_ERR(links[last_cg]))) + goto cleanup; + + ping_and_check(cg_nr + 1, 0); + + /* detach link */ + bpf_link__destroy(links[last_cg]); + links[last_cg] = NULL; + + /* detach legacy */ + err = bpf_prog_detach2(prog_fd, cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS); + if (CHECK(err, "cg_detach_legacy", "errno=%d\n", errno)) + goto cleanup; + detach_legacy = false; + + /* attach legacy exclusive prog attachment */ + err = bpf_prog_attach(prog_fd, cgs[last_cg].fd, + BPF_CGROUP_INET_EGRESS, 0); + if (CHECK(err, "cg_attach_exclusive", "errno=%d\n", errno)) + goto cleanup; + detach_legacy = true; + + /* attempt to mix in with multi-attach bpf_link */ + tmp_link = bpf_program__attach_cgroup(skel->progs.egress, + cgs[last_cg].fd); + if (CHECK(!IS_ERR(tmp_link), "cg_attach_fail", "unexpected success!\n")) { + bpf_link__destroy(tmp_link); 
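The effective-program checks above come down to one bpf_prog_query() call per cgroup. A hedged sketch of that call, assuming cg_fd is an open cgroup directory FD:

	#include <stdio.h>
	#include <bpf/bpf.h>

	static int count_effective_egress(int cg_fd)
	{
		__u32 prog_ids[16], prog_cnt = 16, attach_flags = 0;

		/* With BPF_F_QUERY_EFFECTIVE the kernel also reports programs
		 * inherited from ancestor cgroups, which is what the test
		 * relies on when it walks /cg1 .. /cg1/cg2/cg3/cg4.
		 */
		if (bpf_prog_query(cg_fd, BPF_CGROUP_INET_EGRESS,
				   BPF_F_QUERY_EFFECTIVE, &attach_flags,
				   prog_ids, &prog_cnt))
			return -1;

		printf("attach_flags=%u, %u effective program(s)\n",
		       attach_flags, prog_cnt);
		return prog_cnt;
	}
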
+ goto cleanup; + } + + ping_and_check(cg_nr, 0); + + /* detach */ + err = bpf_prog_detach2(prog_fd, cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS); + if (CHECK(err, "cg_detach_legacy", "errno=%d\n", errno)) + goto cleanup; + detach_legacy = false; + + ping_and_check(cg_nr - 1, 0); + + /* attach back link-based one */ + links[last_cg] = bpf_program__attach_cgroup(skel->progs.egress, + cgs[last_cg].fd); + if (CHECK(IS_ERR(links[last_cg]), "cg_attach", "err: %ld\n", + PTR_ERR(links[last_cg]))) + goto cleanup; + + ping_and_check(cg_nr, 0); + + /* check legacy exclusive prog can't be attached */ + err = bpf_prog_attach(prog_fd, cgs[last_cg].fd, + BPF_CGROUP_INET_EGRESS, 0); + if (CHECK(!err, "cg_attach_exclusive", "unexpected success")) { + bpf_prog_detach2(prog_fd, cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS); + goto cleanup; + } + + /* replace BPF programs inside their links for all but first link */ + for (i = 1; i < cg_nr; i++) { + err = bpf_link__update_program(links[i], skel->progs.egress_alt); + if (CHECK(err, "prog_upd", "link #%d\n", i)) + goto cleanup; + } + + ping_and_check(1, cg_nr - 1); + + /* Attempt program update with wrong expected BPF program */ + link_upd_opts.old_prog_fd = bpf_program__fd(skel->progs.egress_alt); + link_upd_opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(links[0]), + bpf_program__fd(skel->progs.egress_alt), + &link_upd_opts); + if (CHECK(err == 0 || errno != EPERM, "prog_cmpxchg1", + "unexpectedly succeeded, err %d, errno %d\n", err, -errno)) + goto cleanup; + + /* Compare-exchange single link program from egress to egress_alt */ + link_upd_opts.old_prog_fd = bpf_program__fd(skel->progs.egress); + link_upd_opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(links[0]), + bpf_program__fd(skel->progs.egress_alt), + &link_upd_opts); + if (CHECK(err, "prog_cmpxchg2", "errno %d\n", -errno)) + goto cleanup; + + /* ping */ + ping_and_check(0, cg_nr); + + /* close cgroup FDs before detaching links */ + for (i = 0; i < cg_nr; i++) { + if (cgs[i].fd > 0) { + close(cgs[i].fd); + cgs[i].fd = -1; + } + } + + /* BPF programs should still get called */ + ping_and_check(0, cg_nr); + + /* leave cgroup and remove them, don't detach programs */ + cleanup_cgroup_environment(); + + /* BPF programs should have been auto-detached */ + ping_and_check(0, 0); + +cleanup: + if (detach_legacy) + bpf_prog_detach2(prog_fd, cgs[last_cg].fd, + BPF_CGROUP_INET_EGRESS); + + for (i = 0; i < cg_nr; i++) { + if (!IS_ERR(links[i])) + bpf_link__destroy(links[i]); + } + test_cgroup_link__destroy(skel); + + for (i = 0; i < cg_nr; i++) { + if (cgs[i].fd > 0) + close(cgs[i].fd); + } + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 235ac4f67f5b..83493bd5745c 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -1,22 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> -#include "test_pkt_access.skel.h" #include "fentry_test.skel.h" #include "fexit_test.skel.h" void test_fentry_fexit(void) { - struct test_pkt_access *pkt_skel = NULL; struct fentry_test *fentry_skel = NULL; struct fexit_test *fexit_skel = NULL; __u64 *fentry_res, *fexit_res; __u32 duration = 0, retval; - int err, pkt_fd, i; + int err, prog_fd, i; - pkt_skel = test_pkt_access__open_and_load(); - if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) - return; 
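The BPF_F_REPLACE compare-exchange used by the cgroup_link test above maps onto a small amount of libbpf code. A sketch, assuming link_fd, old_prog_fd and new_prog_fd refer to an already attached link and two loaded programs:

	#include <bpf/bpf.h>
	#include <bpf/libbpf.h>

	static int swap_link_prog(int link_fd, int old_prog_fd, int new_prog_fd)
	{
		DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts,
			.old_prog_fd = old_prog_fd,
			.flags = BPF_F_REPLACE);

		/* Fails with EPERM when the link's current program is not
		 * old_prog_fd, which is the negative case the test checks first.
		 */
		return bpf_link_update(link_fd, new_prog_fd, &opts);
	}
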
fentry_skel = fentry_test__open_and_load(); if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto close_prog; @@ -31,8 +26,8 @@ void test_fentry_fexit(void) if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) goto close_prog; - pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fexit_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); CHECK(err || retval, "ipv6", "err %d errno %d retval %d duration %d\n", @@ -49,7 +44,6 @@ void test_fentry_fexit(void) } close_prog: - test_pkt_access__destroy(pkt_skel); fentry_test__destroy(fentry_skel); fexit_test__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index 5cc06021f27d..04ebbf1cb390 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -1,20 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> -#include "test_pkt_access.skel.h" #include "fentry_test.skel.h" void test_fentry_test(void) { - struct test_pkt_access *pkt_skel = NULL; struct fentry_test *fentry_skel = NULL; - int err, pkt_fd, i; + int err, prog_fd, i; __u32 duration = 0, retval; __u64 *result; - pkt_skel = test_pkt_access__open_and_load(); - if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) - return; fentry_skel = fentry_test__open_and_load(); if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto cleanup; @@ -23,10 +18,10 @@ void test_fentry_test(void) if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err)) goto cleanup; - pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fentry_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", + CHECK(err || retval, "test_run", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); @@ -39,5 +34,4 @@ void test_fentry_test(void) cleanup: fentry_test__destroy(fentry_skel); - test_pkt_access__destroy(pkt_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index d2c3655dd7a3..78d7a2765c27 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -1,64 +1,37 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> +#include "fexit_test.skel.h" void test_fexit_test(void) { - struct bpf_prog_load_attr attr = { - .file = "./fexit_test.o", - }; - - char prog_name[] = "fexit/bpf_fentry_testX"; - struct bpf_object *obj = NULL, *pkt_obj; - int err, pkt_fd, kfree_skb_fd, i; - struct bpf_link *link[6] = {}; - struct bpf_program *prog[6]; + struct fexit_test *fexit_skel = NULL; + int err, prog_fd, i; __u32 duration = 0, retval; - struct bpf_map *data_map; - const int zero = 0; - u64 result[6]; + __u64 *result; - err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, - &pkt_obj, &pkt_fd); - if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) - return; - err = bpf_prog_load_xattr(&attr, &obj, &kfree_skb_fd); - if (CHECK(err, "prog_load fail", "err %d errno %d\n", err, errno)) - goto close_prog; + fexit_skel 
= fexit_test__open_and_load(); + if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) + goto cleanup; - for (i = 0; i < 6; i++) { - prog_name[sizeof(prog_name) - 2] = '1' + i; - prog[i] = bpf_object__find_program_by_title(obj, prog_name); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name)) - goto close_prog; - link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) - goto close_prog; - } - data_map = bpf_object__find_map_by_name(obj, "fexit_te.bss"); - if (CHECK(!data_map, "find_data_map", "data map not found\n")) - goto close_prog; + err = fexit_test__attach(fexit_skel); + if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) + goto cleanup; - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fexit_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", + CHECK(err || retval, "test_run", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); - err = bpf_map_lookup_elem(bpf_map__fd(data_map), &zero, &result); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto close_prog; - - for (i = 0; i < 6; i++) - if (CHECK(result[i] != 1, "result", "bpf_fentry_test%d failed err %ld\n", - i + 1, result[i])) - goto close_prog; + result = (__u64 *)fexit_skel->bss; + for (i = 0; i < 6; i++) { + if (CHECK(result[i] != 1, "result", + "fexit_test%d failed err %lld\n", i + 1, result[i])) + goto cleanup; + } -close_prog: - for (i = 0; i < 6; i++) - if (!IS_ERR_OR_NULL(link[i])) - bpf_link__destroy(link[i]); - bpf_object__close(obj); - bpf_object__close(pkt_obj); +cleanup: + fexit_test__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c index eba9a970703b..925722217edf 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c @@ -82,6 +82,7 @@ static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size) void test_get_stack_raw_tp(void) { const char *file = "./test_get_stack_rawtp.o"; + const char *file_err = "./test_get_stack_rawtp_err.o"; const char *prog_name = "raw_tracepoint/sys_enter"; int i, err, prog_fd, exp_cnt = MAX_CNT_RAWTP; struct perf_buffer_opts pb_opts = {}; @@ -93,6 +94,10 @@ void test_get_stack_raw_tp(void) struct bpf_map *map; cpu_set_t cpu_set; + err = bpf_prog_load(file_err, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd); + if (CHECK(err >= 0, "prog_load raw tp", "err %d errno %d\n", err, errno)) + return; + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd); if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno)) return; diff --git a/tools/testing/selftests/bpf/prog_tests/global_data_init.c b/tools/testing/selftests/bpf/prog_tests/global_data_init.c new file mode 100644 index 000000000000..3bdaa5a40744 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/global_data_init.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> + +void test_global_data_init(void) +{ + const char *file = "./test_global_data.o"; + int err = -ENOMEM, map_fd, zero = 0; + __u8 *buff = NULL, *newval = NULL; + struct bpf_object *obj; + struct bpf_map *map; + __u32 duration = 0; + size_t sz; + + obj = bpf_object__open_file(file, NULL); + if (CHECK_FAIL(!obj)) + return; + + map = 
bpf_object__find_map_by_name(obj, "test_glo.rodata"); + if (CHECK_FAIL(!map || !bpf_map__is_internal(map))) + goto out; + + sz = bpf_map__def(map)->value_size; + newval = malloc(sz); + if (CHECK_FAIL(!newval)) + goto out; + + memset(newval, 0, sz); + /* wrong size, should fail */ + err = bpf_map__set_initial_value(map, newval, sz - 1); + if (CHECK(!err, "reject set initial value wrong size", "err %d\n", err)) + goto out; + + err = bpf_map__set_initial_value(map, newval, sz); + if (CHECK(err, "set initial value", "err %d\n", err)) + goto out; + + err = bpf_object__load(obj); + if (CHECK_FAIL(err)) + goto out; + + map_fd = bpf_map__fd(map); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + buff = malloc(sz); + if (buff) + err = bpf_map_lookup_elem(map_fd, &zero, buff); + if (CHECK(!buff || err || memcmp(buff, newval, sz), + "compare .rodata map data override", + "err %d errno %d\n", err, errno)) + goto out; + + memset(newval, 1, sz); + /* object loaded - should fail */ + err = bpf_map__set_initial_value(map, newval, sz); + CHECK(!err, "reject set initial value after load", "err %d\n", err); +out: + free(buff); + free(newval); + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/prog_tests/link_pinning.c b/tools/testing/selftests/bpf/prog_tests/link_pinning.c new file mode 100644 index 000000000000..a743288cf384 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/link_pinning.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <test_progs.h> +#include <sys/stat.h> + +#include "test_link_pinning.skel.h" + +static int duration = 0; + +void test_link_pinning_subtest(struct bpf_program *prog, + struct test_link_pinning__bss *bss) +{ + const char *link_pin_path = "/sys/fs/bpf/pinned_link_test"; + struct stat statbuf = {}; + struct bpf_link *link; + int err, i; + + link = bpf_program__attach(prog); + if (CHECK(IS_ERR(link), "link_attach", "err: %ld\n", PTR_ERR(link))) + goto cleanup; + + bss->in = 1; + usleep(1); + CHECK(bss->out != 1, "res_check1", "exp %d, got %d\n", 1, bss->out); + + /* pin link */ + err = bpf_link__pin(link, link_pin_path); + if (CHECK(err, "link_pin", "err: %d\n", err)) + goto cleanup; + + CHECK(strcmp(link_pin_path, bpf_link__pin_path(link)), "pin_path1", + "exp %s, got %s\n", link_pin_path, bpf_link__pin_path(link)); + + /* check that link was pinned */ + err = stat(link_pin_path, &statbuf); + if (CHECK(err, "stat_link", "err %d errno %d\n", err, errno)) + goto cleanup; + + bss->in = 2; + usleep(1); + CHECK(bss->out != 2, "res_check2", "exp %d, got %d\n", 2, bss->out); + + /* destroy link, pinned link should keep program attached */ + bpf_link__destroy(link); + link = NULL; + + bss->in = 3; + usleep(1); + CHECK(bss->out != 3, "res_check3", "exp %d, got %d\n", 3, bss->out); + + /* re-open link from BPFFS */ + link = bpf_link__open(link_pin_path); + if (CHECK(IS_ERR(link), "link_open", "err: %ld\n", PTR_ERR(link))) + goto cleanup; + + CHECK(strcmp(link_pin_path, bpf_link__pin_path(link)), "pin_path2", + "exp %s, got %s\n", link_pin_path, bpf_link__pin_path(link)); + + /* unpin link from BPFFS, program still attached */ + err = bpf_link__unpin(link); + if (CHECK(err, "link_unpin", "err: %d\n", err)) + goto cleanup; + + /* still active, as we have FD open now */ + bss->in = 4; + usleep(1); + CHECK(bss->out != 4, "res_check4", "exp %d, got %d\n", 4, bss->out); + + bpf_link__destroy(link); + link = NULL; + + /* Validate it's finally detached. 
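The pin/re-open round trip being validated here reduces to a handful of libbpf calls. A sketch, assuming link came from bpf_program__attach() and BPFFS is mounted at /sys/fs/bpf:

	#include <bpf/libbpf.h>

	static int pin_reopen_unpin(struct bpf_link *link)
	{
		const char *path = "/sys/fs/bpf/pinned_link_test";
		struct bpf_link *reopened;
		int err;

		err = bpf_link__pin(link, path);
		if (err)
			return err;

		/* The BPFFS pin holds its own reference, so destroying the
		 * original link no longer detaches the program.
		 */
		bpf_link__destroy(link);

		reopened = bpf_link__open(path);
		err = libbpf_get_error(reopened);
		if (err)
			return err;

		/* After unpinning, the program stays attached only while this
		 * re-opened link FD is alive.
		 */
		err = bpf_link__unpin(reopened);
		bpf_link__destroy(reopened);
		return err;
	}
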
+ * Actual detachment might get delayed a bit, so there is no reliable + * way to validate it immediately here, let's count up for long enough + * and see if eventually output stops being updated + */ + for (i = 5; i < 10000; i++) { + bss->in = i; + usleep(1); + if (bss->out == i - 1) + break; + } + CHECK(i == 10000, "link_attached", "got to iteration #%d\n", i); + +cleanup: + if (!IS_ERR(link)) + bpf_link__destroy(link); +} + +void test_link_pinning(void) +{ + struct test_link_pinning* skel; + + skel = test_link_pinning__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + + if (test__start_subtest("pin_raw_tp")) + test_link_pinning_subtest(skel->progs.raw_tp_prog, skel->bss); + if (test__start_subtest("pin_tp_btf")) + test_link_pinning_subtest(skel->progs.tp_btf_prog, skel->bss); + + test_link_pinning__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/modify_return.c b/tools/testing/selftests/bpf/prog_tests/modify_return.c new file mode 100644 index 000000000000..97fec70c600b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/modify_return.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 Google LLC. + */ + +#include <test_progs.h> +#include "modify_return.skel.h" + +#define LOWER(x) ((x) & 0xffff) +#define UPPER(x) ((x) >> 16) + + +static void run_test(__u32 input_retval, __u16 want_side_effect, __s16 want_ret) +{ + struct modify_return *skel = NULL; + int err, prog_fd; + __u32 duration = 0, retval; + __u16 side_effect; + __s16 ret; + + skel = modify_return__open_and_load(); + if (CHECK(!skel, "skel_load", "modify_return skeleton failed\n")) + goto cleanup; + + err = modify_return__attach(skel); + if (CHECK(err, "modify_return", "attach failed: %d\n", err)) + goto cleanup; + + skel->bss->input_retval = input_retval; + prog_fd = bpf_program__fd(skel->progs.fmod_ret_test); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, 0, + &retval, &duration); + + CHECK(err, "test_run", "err %d errno %d\n", err, errno); + + side_effect = UPPER(retval); + ret = LOWER(retval); + + CHECK(ret != want_ret, "test_run", + "unexpected ret: %d, expected: %d\n", ret, want_ret); + CHECK(side_effect != want_side_effect, "modify_return", + "unexpected side_effect: %d\n", side_effect); + + CHECK(skel->bss->fentry_result != 1, "modify_return", + "fentry failed\n"); + CHECK(skel->bss->fexit_result != 1, "modify_return", + "fexit failed\n"); + CHECK(skel->bss->fmod_ret_result != 1, "modify_return", + "fmod_ret failed\n"); + +cleanup: + modify_return__destroy(skel); +} + +void test_modify_return(void) +{ + run_test(0 /* input_retval */, + 1 /* want_side_effect */, + 4 /* want_ret */); + run_test(-EINVAL /* input_retval */, + 0 /* want_side_effect */, + -EINVAL /* want_ret */); +} + diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c new file mode 100644 index 000000000000..542240e16564 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Carlos Neira cneirabustos@gmail.com */ +#include <test_progs.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/syscall.h> + +struct bss { + __u64 dev; + __u64 ino; + __u64 pid_tgid; + __u64 user_pid_tgid; +}; + +void test_ns_current_pid_tgid(void) +{ + const char *probe_name = "raw_tracepoint/sys_enter"; + const char *file = "test_ns_current_pid_tgid.o"; + int err, 
key = 0, duration = 0; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_map *bss_map; + struct bpf_object *obj; + struct bss bss; + struct stat st; + __u64 id; + + obj = bpf_object__open_file(file, NULL); + if (CHECK(IS_ERR(obj), "obj_open", "err %ld\n", PTR_ERR(obj))) + return; + + err = bpf_object__load(obj); + if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno)) + goto cleanup; + + bss_map = bpf_object__find_map_by_name(obj, "test_ns_.bss"); + if (CHECK(!bss_map, "find_bss_map", "failed\n")) + goto cleanup; + + prog = bpf_object__find_program_by_title(obj, probe_name); + if (CHECK(!prog, "find_prog", "prog '%s' not found\n", + probe_name)) + goto cleanup; + + memset(&bss, 0, sizeof(bss)); + pid_t tid = syscall(SYS_gettid); + pid_t pid = getpid(); + + id = (__u64) tid << 32 | pid; + bss.user_pid_tgid = id; + + if (CHECK_FAIL(stat("/proc/self/ns/pid", &st))) { + perror("Failed to stat /proc/self/ns/pid"); + goto cleanup; + } + + bss.dev = st.st_dev; + bss.ino = st.st_ino; + + err = bpf_map_update_elem(bpf_map__fd(bss_map), &key, &bss, 0); + if (CHECK(err, "setting_bss", "failed to set bss : %d\n", err)) + goto cleanup; + + link = bpf_program__attach_raw_tracepoint(prog, "sys_enter"); + if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", + PTR_ERR(link))) { + link = NULL; + goto cleanup; + } + + /* trigger some syscalls */ + usleep(1); + + err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &key, &bss); + if (CHECK(err, "set_bss", "failed to get bss : %d\n", err)) + goto cleanup; + + if (CHECK(id != bss.pid_tgid, "Compare user pid/tgid vs. bpf pid/tgid", + "User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid)) + goto cleanup; +cleanup: + if (!link) { + bpf_link__destroy(link); + link = NULL; + } + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c new file mode 100644 index 000000000000..e35c444902a7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/perf_branches.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <pthread.h> +#include <sched.h> +#include <sys/socket.h> +#include <test_progs.h> +#include "bpf/libbpf_internal.h" +#include "test_perf_branches.skel.h" + +static void check_good_sample(struct test_perf_branches *skel) +{ + int written_global = skel->bss->written_global_out; + int required_size = skel->bss->required_size_out; + int written_stack = skel->bss->written_stack_out; + int pbe_size = sizeof(struct perf_branch_entry); + int duration = 0; + + if (CHECK(!skel->bss->valid, "output not valid", + "no valid sample from prog")) + return; + + /* + * It's hard to validate the contents of the branch entries b/c it + * would require some kind of disassembler and also encoding the + * valid jump instructions for supported architectures. So just check + * the easy stuff for now. 
+ */ + CHECK(required_size <= 0, "read_branches_size", "err %d\n", required_size); + CHECK(written_stack < 0, "read_branches_stack", "err %d\n", written_stack); + CHECK(written_stack % pbe_size != 0, "read_branches_stack", + "stack bytes written=%d not multiple of struct size=%d\n", + written_stack, pbe_size); + CHECK(written_global < 0, "read_branches_global", "err %d\n", written_global); + CHECK(written_global % pbe_size != 0, "read_branches_global", + "global bytes written=%d not multiple of struct size=%d\n", + written_global, pbe_size); + CHECK(written_global < written_stack, "read_branches_size", + "written_global=%d < written_stack=%d\n", written_global, written_stack); +} + +static void check_bad_sample(struct test_perf_branches *skel) +{ + int written_global = skel->bss->written_global_out; + int required_size = skel->bss->required_size_out; + int written_stack = skel->bss->written_stack_out; + int duration = 0; + + if (CHECK(!skel->bss->valid, "output not valid", + "no valid sample from prog")) + return; + + CHECK((required_size != -EINVAL && required_size != -ENOENT), + "read_branches_size", "err %d\n", required_size); + CHECK((written_stack != -EINVAL && written_stack != -ENOENT), + "read_branches_stack", "written %d\n", written_stack); + CHECK((written_global != -EINVAL && written_global != -ENOENT), + "read_branches_global", "written %d\n", written_global); +} + +static void test_perf_branches_common(int perf_fd, + void (*cb)(struct test_perf_branches *)) +{ + struct test_perf_branches *skel; + int err, i, duration = 0; + bool detached = false; + struct bpf_link *link; + volatile int j = 0; + cpu_set_t cpu_set; + + skel = test_perf_branches__open_and_load(); + if (CHECK(!skel, "test_perf_branches_load", + "perf_branches skeleton failed\n")) + return; + + /* attach perf_event */ + link = bpf_program__attach_perf_event(skel->progs.perf_branches, perf_fd); + if (CHECK(IS_ERR(link), "attach_perf_event", "err %ld\n", PTR_ERR(link))) + goto out_destroy_skel; + + /* generate some branches on cpu 0 */ + CPU_ZERO(&cpu_set); + CPU_SET(0, &cpu_set); + err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); + if (CHECK(err, "set_affinity", "cpu #0, err %d\n", err)) + goto out_destroy; + /* spin the loop for a while (random high number) */ + for (i = 0; i < 1000000; ++i) + ++j; + + test_perf_branches__detach(skel); + detached = true; + + cb(skel); +out_destroy: + bpf_link__destroy(link); +out_destroy_skel: + if (!detached) + test_perf_branches__detach(skel); + test_perf_branches__destroy(skel); +} + +static void test_perf_branches_hw(void) +{ + struct perf_event_attr attr = {0}; + int duration = 0; + int pfd; + + /* create perf event */ + attr.size = sizeof(attr); + attr.type = PERF_TYPE_HARDWARE; + attr.config = PERF_COUNT_HW_CPU_CYCLES; + attr.freq = 1; + attr.sample_freq = 4000; + attr.sample_type = PERF_SAMPLE_BRANCH_STACK; + attr.branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + + /* + * Some setups don't support branch records (virtual machines, !x86), + * so skip test in this case. 
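The perf_event_attr setup above is the whole trick to requesting branch records. Pulled out into a standalone sketch with the same values as the test (pid -1 with cpu 0 samples every task on CPU 0 and needs root):

	#define _GNU_SOURCE
	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_branch_sampling_event(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.freq = 1;
		attr.sample_freq = 4000;
		attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
		attr.branch_sample_type = PERF_SAMPLE_BRANCH_USER |
					  PERF_SAMPLE_BRANCH_ANY;

		/* Fails with ENOENT or EOPNOTSUPP when the setup cannot
		 * deliver branch records (VMs, non-x86), in which case the
		 * test skips itself.
		 */
		return syscall(__NR_perf_event_open, &attr, -1, 0, -1,
			       PERF_FLAG_FD_CLOEXEC);
	}
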
+ */ + if (pfd == -1) { + if (errno == ENOENT || errno == EOPNOTSUPP) { + printf("%s:SKIP:no PERF_SAMPLE_BRANCH_STACK\n", + __func__); + test__skip(); + return; + } + if (CHECK(pfd < 0, "perf_event_open", "err %d errno %d\n", + pfd, errno)) + return; + } + + test_perf_branches_common(pfd, check_good_sample); + + close(pfd); +} + +/* + * Tests negative case -- run bpf_read_branch_records() on improperly configured + * perf event. + */ +static void test_perf_branches_no_hw(void) +{ + struct perf_event_attr attr = {0}; + int duration = 0; + int pfd; + + /* create perf event */ + attr.size = sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 4000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (CHECK(pfd < 0, "perf_event_open", "err %d\n", pfd)) + return; + + test_perf_branches_common(pfd, check_bad_sample); + + close(pfd); +} + +void test_perf_branches(void) +{ + if (test__start_subtest("perf_branches_hw")) + test_perf_branches_hw(); + if (test__start_subtest("perf_branches_no_hw")) + test_perf_branches_no_hw(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 0800036ed654..821b4146b7b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -36,6 +36,7 @@ static int result_map, tmp_index_ovr_map, linum_map, data_check_map; static __u32 expected_results[NR_RESULTS]; static int sk_fds[REUSEPORT_ARRAY_SIZE]; static int reuseport_array = -1, outer_map = -1; +static enum bpf_map_type inner_map_type; static int select_by_skb_data_prog; static int saved_tcp_syncookie = -1; static struct bpf_object *obj; @@ -63,13 +64,15 @@ static union sa46 { } \ }) -static int create_maps(void) +static int create_maps(enum bpf_map_type inner_type) { struct bpf_create_map_attr attr = {}; + inner_map_type = inner_type; + /* Creating reuseport_array */ attr.name = "reuseport_array"; - attr.map_type = BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; + attr.map_type = inner_type; attr.key_size = sizeof(__u32); attr.value_size = sizeof(__u32); attr.max_entries = REUSEPORT_ARRAY_SIZE; @@ -506,11 +509,6 @@ static void test_syncookie(int type, sa_family_t family) .pass_on_failure = 0, }; - if (type != SOCK_STREAM) { - test__skip(); - return; - } - /* * +1 for TCP-SYN and * +1 for the TCP-ACK (ack the syncookie) @@ -728,12 +726,36 @@ static void cleanup_per_test(bool no_inner_map) static void cleanup(void) { - if (outer_map != -1) + if (outer_map != -1) { close(outer_map); - if (reuseport_array != -1) + outer_map = -1; + } + + if (reuseport_array != -1) { close(reuseport_array); - if (obj) + reuseport_array = -1; + } + + if (obj) { bpf_object__close(obj); + obj = NULL; + } + + memset(expected_results, 0, sizeof(expected_results)); +} + +static const char *maptype_str(enum bpf_map_type type) +{ + switch (type) { + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + return "reuseport_sockarray"; + case BPF_MAP_TYPE_SOCKMAP: + return "sockmap"; + case BPF_MAP_TYPE_SOCKHASH: + return "sockhash"; + default: + return "unknown"; + } } static const char *family_str(sa_family_t family) @@ -760,7 +782,7 @@ static const char *sotype_str(int sotype) } } -#define TEST_INIT(fn, ...) { fn, #fn, __VA_ARGS__ } +#define TEST_INIT(fn_, ...) 
{ .fn = fn_, .name = #fn_, __VA_ARGS__ } static void test_config(int sotype, sa_family_t family, bool inany) { @@ -768,12 +790,15 @@ static void test_config(int sotype, sa_family_t family, bool inany) void (*fn)(int sotype, sa_family_t family); const char *name; bool no_inner_map; + int need_sotype; } tests[] = { - TEST_INIT(test_err_inner_map, true /* no_inner_map */), + TEST_INIT(test_err_inner_map, + .no_inner_map = true), TEST_INIT(test_err_skb_data), TEST_INIT(test_err_sk_select_port), TEST_INIT(test_pass), - TEST_INIT(test_syncookie), + TEST_INIT(test_syncookie, + .need_sotype = SOCK_STREAM), TEST_INIT(test_pass_on_err), TEST_INIT(test_detach_bpf), }; @@ -781,7 +806,11 @@ static void test_config(int sotype, sa_family_t family, bool inany) const struct test *t; for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { - snprintf(s, sizeof(s), "%s/%s %s %s", + if (t->need_sotype && t->need_sotype != sotype) + continue; /* test not compatible with socket type */ + + snprintf(s, sizeof(s), "%s %s/%s %s %s", + maptype_str(inner_map_type), family_str(family), sotype_str(sotype), inany ? "INANY" : "LOOPBACK", t->name); @@ -816,13 +845,20 @@ static void test_all(void) test_config(c->sotype, c->family, c->inany); } -void test_select_reuseport(void) +void test_map_type(enum bpf_map_type mt) { - if (create_maps()) + if (create_maps(mt)) goto out; if (prepare_bpf_obj()) goto out; + test_all(); +out: + cleanup(); +} + +void test_select_reuseport(void) +{ saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); if (saved_tcp_fo < 0) goto out; @@ -835,8 +871,9 @@ void test_select_reuseport(void) if (disable_syncookie()) goto out; - test_all(); + test_map_type(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); + test_map_type(BPF_MAP_TYPE_SOCKMAP); + test_map_type(BPF_MAP_TYPE_SOCKHASH); out: - cleanup(); restore_sysctls(); } diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c b/tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c new file mode 100644 index 000000000000..189a34a7addb --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <pthread.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include "test_send_signal_kern.skel.h" + +static void sigusr1_handler(int signum) +{ +} + +#define THREAD_COUNT 100 + +static void *worker(void *p) +{ + int i; + + for ( i = 0; i < 1000; i++) + usleep(1); + + return NULL; +} + +void test_send_signal_sched_switch(void) +{ + struct test_send_signal_kern *skel; + pthread_t threads[THREAD_COUNT]; + u32 duration = 0; + int i, err; + + signal(SIGUSR1, sigusr1_handler); + + skel = test_send_signal_kern__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open_and_load failed\n")) + return; + + skel->bss->pid = getpid(); + skel->bss->sig = SIGUSR1; + + err = test_send_signal_kern__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed\n")) + goto destroy_skel; + + for (i = 0; i < THREAD_COUNT; i++) { + err = pthread_create(threads + i, NULL, worker, NULL); + if (CHECK(err, "pthread_create", "Error creating thread, %s\n", + strerror(errno))) + goto destroy_skel; + } + + for (i = 0; i < THREAD_COUNT; i++) + pthread_join(threads[i], NULL); + +destroy_skel: + test_send_signal_kern__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c new file mode 
100644 index 000000000000..d572e1a2c297 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook +// Copyright (c) 2019 Cloudflare +// Copyright (c) 2020 Isovalent, Inc. +/* + * Test that the socket assign program is able to redirect traffic towards a + * socket, regardless of whether the port or address destination of the traffic + * matches the port. + */ + +#define _GNU_SOURCE +#include <fcntl.h> +#include <signal.h> +#include <stdlib.h> +#include <unistd.h> + +#include "test_progs.h" + +#define BIND_PORT 1234 +#define CONNECT_PORT 4321 +#define TEST_DADDR (0xC0A80203) +#define NS_SELF "/proc/self/ns/net" + +static const struct timeval timeo_sec = { .tv_sec = 3 }; +static const size_t timeo_optlen = sizeof(timeo_sec); +static int stop, duration; + +static bool +configure_stack(void) +{ + char tc_cmd[BUFSIZ]; + + /* Move to a new networking namespace */ + if (CHECK_FAIL(unshare(CLONE_NEWNET))) + return false; + + /* Configure necessary links, routes */ + if (CHECK_FAIL(system("ip link set dev lo up"))) + return false; + if (CHECK_FAIL(system("ip route add local default dev lo"))) + return false; + if (CHECK_FAIL(system("ip -6 route add local default dev lo"))) + return false; + + /* Load qdisc, BPF program */ + if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) + return false; + sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf", + "direct-action object-file ./test_sk_assign.o", + "section classifier/sk_assign_test", + (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : ""); + if (CHECK(system(tc_cmd), "BPF load failed;", + "run with -vv for more info\n")) + return false; + + return true; +} + +static int +start_server(const struct sockaddr *addr, socklen_t len, int type) +{ + int fd; + + fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + goto out; + if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, + timeo_optlen))) + goto close_out; + if (CHECK_FAIL(bind(fd, addr, len) == -1)) + goto close_out; + if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) + goto close_out; + + goto out; +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static int +connect_to_server(const struct sockaddr *addr, socklen_t len, int type) +{ + int fd = -1; + + fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + goto out; + if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, + timeo_optlen))) + goto close_out; + if (CHECK_FAIL(connect(fd, addr, len))) + goto close_out; + + goto out; +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static in_port_t +get_port(int fd) +{ + struct sockaddr_storage ss; + socklen_t slen = sizeof(ss); + in_port_t port = 0; + + if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&ss, &slen))) + return port; + + switch (ss.ss_family) { + case AF_INET: + port = ((struct sockaddr_in *)&ss)->sin_port; + break; + case AF_INET6: + port = ((struct sockaddr_in6 *)&ss)->sin6_port; + break; + default: + CHECK(1, "Invalid address family", "%d\n", ss.ss_family); + } + return port; +} + +static ssize_t +rcv_msg(int srv_client, int type) +{ + struct sockaddr_storage ss; + char buf[BUFSIZ]; + socklen_t slen; + + if (type == SOCK_STREAM) + return read(srv_client, &buf, sizeof(buf)); + else + return recvfrom(srv_client, &buf, sizeof(buf), 0, + (struct sockaddr *)&ss, &slen); +} + +static int +run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) +{ + int client = -1, 
srv_client = -1; + char buf[] = "testing"; + in_port_t port; + int ret = 1; + + client = connect_to_server(addr, len, type); + if (client == -1) { + perror("Cannot connect to server"); + goto out; + } + + if (type == SOCK_STREAM) { + srv_client = accept(server_fd, NULL, NULL); + if (CHECK_FAIL(srv_client == -1)) { + perror("Can't accept connection"); + goto out; + } + } else { + srv_client = server_fd; + } + if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) { + perror("Can't write on client"); + goto out; + } + if (CHECK_FAIL(rcv_msg(srv_client, type) != sizeof(buf))) { + perror("Can't read on server"); + goto out; + } + + port = get_port(srv_client); + if (CHECK_FAIL(!port)) + goto out; + /* SOCK_STREAM is connected via accept(), so the server's local address + * will be the CONNECT_PORT rather than the BIND port that corresponds + * to the listen socket. SOCK_DGRAM on the other hand is connectionless + * so we can't really do the same check there; the server doesn't ever + * create a socket with CONNECT_PORT. + */ + if (type == SOCK_STREAM && + CHECK(port != htons(CONNECT_PORT), "Expected", "port %u but got %u", + CONNECT_PORT, ntohs(port))) + goto out; + else if (type == SOCK_DGRAM && + CHECK(port != htons(BIND_PORT), "Expected", + "port %u but got %u", BIND_PORT, ntohs(port))) + goto out; + + ret = 0; +out: + close(client); + if (srv_client != server_fd) + close(srv_client); + if (ret) + WRITE_ONCE(stop, 1); + return ret; +} + +static void +prepare_addr(struct sockaddr *addr, int family, __u16 port, bool rewrite_addr) +{ + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = family; + addr4->sin_port = htons(port); + if (rewrite_addr) + addr4->sin_addr.s_addr = htonl(TEST_DADDR); + else + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + addr6 = (struct sockaddr_in6 *)addr; + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = family; + addr6->sin6_port = htons(port); + addr6->sin6_addr = in6addr_loopback; + if (rewrite_addr) + addr6->sin6_addr.s6_addr32[3] = htonl(TEST_DADDR); + break; + default: + fprintf(stderr, "Invalid family %d", family); + } +} + +struct test_sk_cfg { + const char *name; + int family; + struct sockaddr *addr; + socklen_t len; + int type; + bool rewrite_addr; +}; + +#define TEST(NAME, FAMILY, TYPE, REWRITE) \ +{ \ + .name = NAME, \ + .family = FAMILY, \ + .addr = (FAMILY == AF_INET) ? (struct sockaddr *)&addr4 \ + : (struct sockaddr *)&addr6, \ + .len = (FAMILY == AF_INET) ? 
sizeof(addr4) : sizeof(addr6), \ + .type = TYPE, \ + .rewrite_addr = REWRITE, \ +} + +void test_sk_assign(void) +{ + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + struct test_sk_cfg tests[] = { + TEST("ipv4 tcp port redir", AF_INET, SOCK_STREAM, false), + TEST("ipv4 tcp addr redir", AF_INET, SOCK_STREAM, true), + TEST("ipv6 tcp port redir", AF_INET6, SOCK_STREAM, false), + TEST("ipv6 tcp addr redir", AF_INET6, SOCK_STREAM, true), + TEST("ipv4 udp port redir", AF_INET, SOCK_DGRAM, false), + TEST("ipv4 udp addr redir", AF_INET, SOCK_DGRAM, true), + TEST("ipv6 udp port redir", AF_INET6, SOCK_DGRAM, false), + TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true), + }; + int server = -1; + int self_net; + + self_net = open(NS_SELF, O_RDONLY); + if (CHECK_FAIL(self_net < 0)) { + perror("Unable to open "NS_SELF); + return; + } + + if (!configure_stack()) { + perror("configure_stack"); + goto cleanup; + } + + for (int i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) { + struct test_sk_cfg *test = &tests[i]; + const struct sockaddr *addr; + + if (!test__start_subtest(test->name)) + continue; + prepare_addr(test->addr, test->family, BIND_PORT, false); + addr = (const struct sockaddr *)test->addr; + server = start_server(addr, test->len, test->type); + if (server == -1) + goto cleanup; + + /* connect to unbound ports */ + prepare_addr(test->addr, test->family, CONNECT_PORT, + test->rewrite_addr); + if (run_test(server, addr, test->len, test->type)) + goto close; + + close(server); + server = -1; + } + +close: + close(server); +cleanup: + if (CHECK_FAIL(setns(self_net, CLONE_NEWNET))) + perror("Failed to setns("NS_SELF")"); + close(self_net); +} diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c index c6d6b685a946..4538bd08203f 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c @@ -14,6 +14,7 @@ void test_skb_ctx(void) .wire_len = 100, .gso_segs = 8, .mark = 9, + .gso_size = 10, }; struct bpf_prog_test_run_attr tattr = { .data_in = &pkt_v4, diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c new file mode 100644 index 000000000000..06b86addc181 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Cloudflare +/* + * Tests for sockmap/sockhash holding kTLS sockets. + */ + +#include "test_progs.h" + +#define MAX_TEST_NAME 80 +#define TCP_ULP 31 + +static int tcp_server(int family) +{ + int err, s; + + s = socket(family, SOCK_STREAM, 0); + if (CHECK_FAIL(s == -1)) { + perror("socket"); + return -1; + } + + err = listen(s, SOMAXCONN); + if (CHECK_FAIL(err)) { + perror("listen"); + return -1; + } + + return s; +} + +static int disconnect(int fd) +{ + struct sockaddr unspec = { AF_UNSPEC }; + + return connect(fd, &unspec, sizeof(unspec)); +} + +/* Disconnect (unhash) a kTLS socket after removing it from sockmap. 
*/ +static void test_sockmap_ktls_disconnect_after_delete(int family, int map) +{ + struct sockaddr_storage addr = {0}; + socklen_t len = sizeof(addr); + int err, cli, srv, zero = 0; + + srv = tcp_server(family); + if (srv == -1) + return; + + err = getsockname(srv, (struct sockaddr *)&addr, &len); + if (CHECK_FAIL(err)) { + perror("getsockopt"); + goto close_srv; + } + + cli = socket(family, SOCK_STREAM, 0); + if (CHECK_FAIL(cli == -1)) { + perror("socket"); + goto close_srv; + } + + err = connect(cli, (struct sockaddr *)&addr, len); + if (CHECK_FAIL(err)) { + perror("connect"); + goto close_cli; + } + + err = bpf_map_update_elem(map, &zero, &cli, 0); + if (CHECK_FAIL(err)) { + perror("bpf_map_update_elem"); + goto close_cli; + } + + err = setsockopt(cli, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls")); + if (CHECK_FAIL(err)) { + perror("setsockopt(TCP_ULP)"); + goto close_cli; + } + + err = bpf_map_delete_elem(map, &zero); + if (CHECK_FAIL(err)) { + perror("bpf_map_delete_elem"); + goto close_cli; + } + + err = disconnect(cli); + if (CHECK_FAIL(err)) + perror("disconnect"); + +close_cli: + close(cli); +close_srv: + close(srv); +} + +static void run_tests(int family, enum bpf_map_type map_type) +{ + char test_name[MAX_TEST_NAME]; + int map; + + map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0); + if (CHECK_FAIL(map == -1)) { + perror("bpf_map_create"); + return; + } + + snprintf(test_name, MAX_TEST_NAME, + "sockmap_ktls disconnect_after_delete %s %s", + family == AF_INET ? "IPv4" : "IPv6", + map_type == BPF_MAP_TYPE_SOCKMAP ? "SOCKMAP" : "SOCKHASH"); + if (!test__start_subtest(test_name)) + return; + + test_sockmap_ktls_disconnect_after_delete(family, map); + + close(map); +} + +void test_sockmap_ktls(void) +{ + run_tests(AF_INET, BPF_MAP_TYPE_SOCKMAP); + run_tests(AF_INET, BPF_MAP_TYPE_SOCKHASH); + run_tests(AF_INET6, BPF_MAP_TYPE_SOCKMAP); + run_tests(AF_INET6, BPF_MAP_TYPE_SOCKHASH); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c new file mode 100644 index 000000000000..d7d65a700799 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -0,0 +1,1635 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Cloudflare +/* + * Test suite for SOCKMAP/SOCKHASH holding listening sockets. + * Covers: + * 1. BPF map operations - bpf_map_{update,lookup delete}_elem + * 2. BPF redirect helpers - bpf_{sk,msg}_redirect_map + * 3. BPF reuseport helper - bpf_sk_select_reuseport + */ + +#include <linux/compiler.h> +#include <errno.h> +#include <error.h> +#include <limits.h> +#include <netinet/in.h> +#include <pthread.h> +#include <stdlib.h> +#include <string.h> +#include <sys/select.h> +#include <unistd.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "bpf_util.h" +#include "test_progs.h" +#include "test_sockmap_listen.skel.h" + +#define IO_TIMEOUT_SEC 30 +#define MAX_STRERR_LEN 256 +#define MAX_TEST_NAME 80 + +#define _FAIL(errnum, fmt...) \ + ({ \ + error_at_line(0, (errnum), __func__, __LINE__, fmt); \ + CHECK_FAIL(true); \ + }) +#define FAIL(fmt...) _FAIL(0, fmt) +#define FAIL_ERRNO(fmt...) _FAIL(errno, fmt) +#define FAIL_LIBBPF(err, msg) \ + ({ \ + char __buf[MAX_STRERR_LEN]; \ + libbpf_strerror((err), __buf, sizeof(__buf)); \ + FAIL("%s: %s", (msg), __buf); \ + }) + +/* Wrappers that fail the test on error and report it. 
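The disconnect-after-delete scenario boils down to three calls on an established client socket. A sketch, assuming map_fd is a one-entry SOCKMAP or SOCKHASH and cli is a connected TCP socket; TCP_ULP falls back to the same value the test defines:

	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <bpf/bpf.h>

	#ifndef TCP_ULP
	#define TCP_ULP 31	/* same fallback definition the test uses */
	#endif

	static int sockmap_then_ktls(int map_fd, int cli)
	{
		int zero = 0;

		/* Park the established socket in the map... */
		if (bpf_map_update_elem(map_fd, &zero, &cli, 0))
			return -1;

		/* ...switch it to the kTLS ULP... */
		if (setsockopt(cli, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls")))
			return -1;

		/* ...and remove it again, so a later connect(AF_UNSPEC)
		 * disconnect exercises the unhash path with kTLS callbacks
		 * installed.
		 */
		return bpf_map_delete_elem(map_fd, &zero);
	}
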
*/ + +#define xaccept_nonblock(fd, addr, len) \ + ({ \ + int __ret = \ + accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("accept"); \ + __ret; \ + }) + +#define xbind(fd, addr, len) \ + ({ \ + int __ret = bind((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("bind"); \ + __ret; \ + }) + +#define xclose(fd) \ + ({ \ + int __ret = close((fd)); \ + if (__ret == -1) \ + FAIL_ERRNO("close"); \ + __ret; \ + }) + +#define xconnect(fd, addr, len) \ + ({ \ + int __ret = connect((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("connect"); \ + __ret; \ + }) + +#define xgetsockname(fd, addr, len) \ + ({ \ + int __ret = getsockname((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockname"); \ + __ret; \ + }) + +#define xgetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = getsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockopt(" #name ")"); \ + __ret; \ + }) + +#define xlisten(fd, backlog) \ + ({ \ + int __ret = listen((fd), (backlog)); \ + if (__ret == -1) \ + FAIL_ERRNO("listen"); \ + __ret; \ + }) + +#define xsetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = setsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("setsockopt(" #name ")"); \ + __ret; \ + }) + +#define xsend(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = send((fd), (buf), (len), (flags)); \ + if (__ret == -1) \ + FAIL_ERRNO("send"); \ + __ret; \ + }) + +#define xrecv_nonblock(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \ + IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("recv"); \ + __ret; \ + }) + +#define xsocket(family, sotype, flags) \ + ({ \ + int __ret = socket(family, sotype, flags); \ + if (__ret == -1) \ + FAIL_ERRNO("socket"); \ + __ret; \ + }) + +#define xbpf_map_delete_elem(fd, key) \ + ({ \ + int __ret = bpf_map_delete_elem((fd), (key)); \ + if (__ret == -1) \ + FAIL_ERRNO("map_delete"); \ + __ret; \ + }) + +#define xbpf_map_lookup_elem(fd, key, val) \ + ({ \ + int __ret = bpf_map_lookup_elem((fd), (key), (val)); \ + if (__ret == -1) \ + FAIL_ERRNO("map_lookup"); \ + __ret; \ + }) + +#define xbpf_map_update_elem(fd, key, val, flags) \ + ({ \ + int __ret = bpf_map_update_elem((fd), (key), (val), (flags)); \ + if (__ret == -1) \ + FAIL_ERRNO("map_update"); \ + __ret; \ + }) + +#define xbpf_prog_attach(prog, target, type, flags) \ + ({ \ + int __ret = \ + bpf_prog_attach((prog), (target), (type), (flags)); \ + if (__ret == -1) \ + FAIL_ERRNO("prog_attach(" #type ")"); \ + __ret; \ + }) + +#define xbpf_prog_detach2(prog, target, type) \ + ({ \ + int __ret = bpf_prog_detach2((prog), (target), (type)); \ + if (__ret == -1) \ + FAIL_ERRNO("prog_detach2(" #type ")"); \ + __ret; \ + }) + +#define xpthread_create(thread, attr, func, arg) \ + ({ \ + int __ret = pthread_create((thread), (attr), (func), (arg)); \ + errno = __ret; \ + if (__ret) \ + FAIL_ERRNO("pthread_create"); \ + __ret; \ + }) + +#define xpthread_join(thread, retval) \ + ({ \ + int __ret = pthread_join((thread), (retval)); \ + errno = __ret; \ + if (__ret) \ + FAIL_ERRNO("pthread_join"); \ + __ret; \ + }) + +static int poll_read(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set rfds; + int r; + + FD_ZERO(&rfds); + FD_SET(fd, &rfds); + + r = select(fd + 1, &rfds, NULL, NULL, &timeout); + if (r == 0) + errno = ETIME; + + return r == 1 ? 
0 : -1; +} + +static int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return accept(fd, addr, len); +} + +static int recv_timeout(int fd, void *buf, size_t len, int flags, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return recv(fd, buf, len, flags); +} + +static void init_addr_loopback4(struct sockaddr_storage *ss, socklen_t *len) +{ + struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); + + addr4->sin_family = AF_INET; + addr4->sin_port = 0; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + *len = sizeof(*addr4); +} + +static void init_addr_loopback6(struct sockaddr_storage *ss, socklen_t *len) +{ + struct sockaddr_in6 *addr6 = memset(ss, 0, sizeof(*ss)); + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = 0; + addr6->sin6_addr = in6addr_loopback; + *len = sizeof(*addr6); +} + +static void init_addr_loopback(int family, struct sockaddr_storage *ss, + socklen_t *len) +{ + switch (family) { + case AF_INET: + init_addr_loopback4(ss, len); + return; + case AF_INET6: + init_addr_loopback6(ss, len); + return; + default: + FAIL("unsupported address family %d", family); + } +} + +static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss) +{ + return (struct sockaddr *)ss; +} + +static int enable_reuseport(int s, int progfd) +{ + int err, one = 1; + + err = xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + if (err) + return -1; + err = xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd, + sizeof(progfd)); + if (err) + return -1; + + return 0; +} + +static int socket_loopback_reuseport(int family, int sotype, int progfd) +{ + struct sockaddr_storage addr; + socklen_t len; + int err, s; + + init_addr_loopback(family, &addr, &len); + + s = xsocket(family, sotype, 0); + if (s == -1) + return -1; + + if (progfd >= 0) + enable_reuseport(s, progfd); + + err = xbind(s, sockaddr(&addr), len); + if (err) + goto close; + + if (sotype & SOCK_DGRAM) + return s; + + err = xlisten(s, SOMAXCONN); + if (err) + goto close; + + return s; +close: + xclose(s); + return -1; +} + +static int socket_loopback(int family, int sotype) +{ + return socket_loopback_reuseport(family, sotype, -1); +} + +static void test_insert_invalid(int family, int sotype, int mapfd) +{ + u32 key = 0; + u64 value; + int err; + + value = -1; + err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + if (!err || errno != EINVAL) + FAIL_ERRNO("map_update: expected EINVAL"); + + value = INT_MAX; + err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + if (!err || errno != EBADF) + FAIL_ERRNO("map_update: expected EBADF"); +} + +static void test_insert_opened(int family, int sotype, int mapfd) +{ + u32 key = 0; + u64 value; + int err, s; + + s = xsocket(family, sotype, 0); + if (s == -1) + return; + + errno = 0; + value = s; + err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + if (!err || errno != EOPNOTSUPP) + FAIL_ERRNO("map_update: expected EOPNOTSUPP"); + + xclose(s); +} + +static void test_insert_bound(int family, int sotype, int mapfd) +{ + struct sockaddr_storage addr; + socklen_t len; + u32 key = 0; + u64 value; + int err, s; + + init_addr_loopback(family, &addr, &len); + + s = xsocket(family, sotype, 0); + if (s == -1) + return; + + err = xbind(s, sockaddr(&addr), len); + if (err) + goto close; + + errno = 0; + value = s; + err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + if (!err || errno != EOPNOTSUPP) + 
FAIL_ERRNO("map_update: expected EOPNOTSUPP"); +close: + xclose(s); +} + +static void test_insert(int family, int sotype, int mapfd) +{ + u64 value; + u32 key; + int s; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + xclose(s); +} + +static void test_delete_after_insert(int family, int sotype, int mapfd) +{ + u64 value; + u32 key; + int s; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + xbpf_map_delete_elem(mapfd, &key); + xclose(s); +} + +static void test_delete_after_close(int family, int sotype, int mapfd) +{ + int err, s; + u64 value; + u32 key; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + + xclose(s); + + errno = 0; + err = bpf_map_delete_elem(mapfd, &key); + if (!err || (errno != EINVAL && errno != ENOENT)) + /* SOCKMAP and SOCKHASH return different error codes */ + FAIL_ERRNO("map_delete: expected EINVAL/EINVAL"); +} + +static void test_lookup_after_insert(int family, int sotype, int mapfd) +{ + u64 cookie, value; + socklen_t len; + u32 key; + int s; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + + len = sizeof(cookie); + xgetsockopt(s, SOL_SOCKET, SO_COOKIE, &cookie, &len); + + xbpf_map_lookup_elem(mapfd, &key, &value); + + if (value != cookie) { + FAIL("map_lookup: have %#llx, want %#llx", + (unsigned long long)value, (unsigned long long)cookie); + } + + xclose(s); +} + +static void test_lookup_after_delete(int family, int sotype, int mapfd) +{ + int err, s; + u64 value; + u32 key; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + xbpf_map_delete_elem(mapfd, &key); + + errno = 0; + err = bpf_map_lookup_elem(mapfd, &key, &value); + if (!err || errno != ENOENT) + FAIL_ERRNO("map_lookup: expected ENOENT"); + + xclose(s); +} + +static void test_lookup_32_bit_value(int family, int sotype, int mapfd) +{ + u32 key, value32; + int err, s; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + mapfd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(key), + sizeof(value32), 1, 0); + if (mapfd < 0) { + FAIL_ERRNO("map_create"); + goto close; + } + + key = 0; + value32 = s; + xbpf_map_update_elem(mapfd, &key, &value32, BPF_NOEXIST); + + errno = 0; + err = bpf_map_lookup_elem(mapfd, &key, &value32); + if (!err || errno != ENOSPC) + FAIL_ERRNO("map_lookup: expected ENOSPC"); + + xclose(mapfd); +close: + xclose(s); +} + +static void test_update_existing(int family, int sotype, int mapfd) +{ + int s1, s2; + u64 value; + u32 key; + + s1 = socket_loopback(family, sotype); + if (s1 < 0) + return; + + s2 = socket_loopback(family, sotype); + if (s2 < 0) + goto close_s1; + + key = 0; + value = s1; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + + value = s2; + xbpf_map_update_elem(mapfd, &key, &value, BPF_EXIST); + xclose(s2); +close_s1: + xclose(s1); +} + +/* Exercise the code path where we destroy child sockets that never + * got accept()'ed, aka orphans, when parent socket gets closed. 
+ */ +static void test_destroy_orphan_child(int family, int sotype, int mapfd) +{ + struct sockaddr_storage addr; + socklen_t len; + int err, s, c; + u64 value; + u32 key; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + + c = xsocket(family, sotype, 0); + if (c == -1) + goto close_srv; + + xconnect(c, sockaddr(&addr), len); + xclose(c); +close_srv: + xclose(s); +} + +/* Perform a passive open after removing listening socket from SOCKMAP + * to ensure that callbacks get restored properly. + */ +static void test_clone_after_delete(int family, int sotype, int mapfd) +{ + struct sockaddr_storage addr; + socklen_t len; + int err, s, c; + u64 value; + u32 key; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + xbpf_map_delete_elem(mapfd, &key); + + c = xsocket(family, sotype, 0); + if (c < 0) + goto close_srv; + + xconnect(c, sockaddr(&addr), len); + xclose(c); +close_srv: + xclose(s); +} + +/* Check that child socket that got created while parent was in a + * SOCKMAP, but got accept()'ed only after the parent has been removed + * from SOCKMAP, gets cloned without parent psock state or callbacks. + */ +static void test_accept_after_delete(int family, int sotype, int mapfd) +{ + struct sockaddr_storage addr; + const u32 zero = 0; + int err, s, c, p; + socklen_t len; + u64 value; + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s == -1) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + value = s; + err = xbpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST); + if (err) + goto close_srv; + + c = xsocket(family, sotype, 0); + if (c == -1) + goto close_srv; + + /* Create child while parent is in sockmap */ + err = xconnect(c, sockaddr(&addr), len); + if (err) + goto close_cli; + + /* Remove parent from sockmap */ + err = xbpf_map_delete_elem(mapfd, &zero); + if (err) + goto close_cli; + + p = xaccept_nonblock(s, NULL, NULL); + if (p == -1) + goto close_cli; + + /* Check that child sk_user_data is not set */ + value = p; + xbpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST); + + xclose(p); +close_cli: + xclose(c); +close_srv: + xclose(s); +} + +/* Check that child socket that got created and accepted while parent + * was in a SOCKMAP is cloned without parent psock state or callbacks. 
+ */ +static void test_accept_before_delete(int family, int sotype, int mapfd) +{ + struct sockaddr_storage addr; + const u32 zero = 0, one = 1; + int err, s, c, p; + socklen_t len; + u64 value; + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s == -1) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + value = s; + err = xbpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST); + if (err) + goto close_srv; + + c = xsocket(family, sotype, 0); + if (c == -1) + goto close_srv; + + /* Create & accept child while parent is in sockmap */ + err = xconnect(c, sockaddr(&addr), len); + if (err) + goto close_cli; + + p = xaccept_nonblock(s, NULL, NULL); + if (p == -1) + goto close_cli; + + /* Check that child sk_user_data is not set */ + value = p; + xbpf_map_update_elem(mapfd, &one, &value, BPF_NOEXIST); + + xclose(p); +close_cli: + xclose(c); +close_srv: + xclose(s); +} + +struct connect_accept_ctx { + int sockfd; + unsigned int done; + unsigned int nr_iter; +}; + +static bool is_thread_done(struct connect_accept_ctx *ctx) +{ + return READ_ONCE(ctx->done); +} + +static void *connect_accept_thread(void *arg) +{ + struct connect_accept_ctx *ctx = arg; + struct sockaddr_storage addr; + int family, socktype; + socklen_t len; + int err, i, s; + + s = ctx->sockfd; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto done; + + len = sizeof(family); + err = xgetsockopt(s, SOL_SOCKET, SO_DOMAIN, &family, &len); + if (err) + goto done; + + len = sizeof(socktype); + err = xgetsockopt(s, SOL_SOCKET, SO_TYPE, &socktype, &len); + if (err) + goto done; + + for (i = 0; i < ctx->nr_iter; i++) { + int c, p; + + c = xsocket(family, socktype, 0); + if (c < 0) + break; + + err = xconnect(c, (struct sockaddr *)&addr, sizeof(addr)); + if (err) { + xclose(c); + break; + } + + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) { + xclose(c); + break; + } + + xclose(p); + xclose(c); + } +done: + WRITE_ONCE(ctx->done, 1); + return NULL; +} + +static void test_syn_recv_insert_delete(int family, int sotype, int mapfd) +{ + struct connect_accept_ctx ctx = { 0 }; + struct sockaddr_storage addr; + socklen_t len; + u32 zero = 0; + pthread_t t; + int err, s; + u64 value; + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close; + + ctx.sockfd = s; + ctx.nr_iter = 1000; + + err = xpthread_create(&t, NULL, connect_accept_thread, &ctx); + if (err) + goto close; + + value = s; + while (!is_thread_done(&ctx)) { + err = xbpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST); + if (err) + break; + + err = xbpf_map_delete_elem(mapfd, &zero); + if (err) + break; + } + + xpthread_join(t, NULL); +close: + xclose(s); +} + +static void *listen_thread(void *arg) +{ + struct sockaddr unspec = { AF_UNSPEC }; + struct connect_accept_ctx *ctx = arg; + int err, i, s; + + s = ctx->sockfd; + + for (i = 0; i < ctx->nr_iter; i++) { + err = xlisten(s, 1); + if (err) + break; + err = xconnect(s, &unspec, sizeof(unspec)); + if (err) + break; + } + + WRITE_ONCE(ctx->done, 1); + return NULL; +} + +static void test_race_insert_listen(int family, int socktype, int mapfd) +{ + struct connect_accept_ctx ctx = { 0 }; + const u32 zero = 0; + const int one = 1; + pthread_t t; + int err, s; + u64 value; + + s = xsocket(family, socktype, 0); + if (s < 0) + return; + + err = xsetsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); + if 
(err) + goto close; + + ctx.sockfd = s; + ctx.nr_iter = 10000; + + err = pthread_create(&t, NULL, listen_thread, &ctx); + if (err) + goto close; + + value = s; + while (!is_thread_done(&ctx)) { + err = bpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST); + /* Expecting EOPNOTSUPP before listen() */ + if (err && errno != EOPNOTSUPP) { + FAIL_ERRNO("map_update"); + break; + } + + err = bpf_map_delete_elem(mapfd, &zero); + /* Expecting no entry after unhash on connect(AF_UNSPEC) */ + if (err && errno != EINVAL && errno != ENOENT) { + FAIL_ERRNO("map_delete"); + break; + } + } + + xpthread_join(t, NULL); +close: + xclose(s); +} + +static void zero_verdict_count(int mapfd) +{ + unsigned int zero = 0; + int key; + + key = SK_DROP; + xbpf_map_update_elem(mapfd, &key, &zero, BPF_ANY); + key = SK_PASS; + xbpf_map_update_elem(mapfd, &key, &zero, BPF_ANY); +} + +enum redir_mode { + REDIR_INGRESS, + REDIR_EGRESS, +}; + +static const char *redir_mode_str(enum redir_mode mode) +{ + switch (mode) { + case REDIR_INGRESS: + return "ingress"; + case REDIR_EGRESS: + return "egress"; + default: + return "unknown"; + } +} + +static void redir_to_connected(int family, int sotype, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + struct sockaddr_storage addr; + int s, c0, c1, p0, p1; + unsigned int pass; + socklen_t len; + int err, n; + u64 value; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + c0 = xsocket(family, sotype, 0); + if (c0 < 0) + goto close_srv; + err = xconnect(c0, sockaddr(&addr), len); + if (err) + goto close_cli0; + + p0 = xaccept_nonblock(s, NULL, NULL); + if (p0 < 0) + goto close_cli0; + + c1 = xsocket(family, sotype, 0); + if (c1 < 0) + goto close_peer0; + err = xconnect(c1, sockaddr(&addr), len); + if (err) + goto close_cli1; + + p1 = xaccept_nonblock(s, NULL, NULL); + if (p1 < 0) + goto close_cli1; + + key = 0; + value = p0; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_peer1; + + key = 1; + value = p1; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_peer1; + + n = write(mode == REDIR_INGRESS ? 
c1 : p1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close_peer1; + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); + if (err) + goto close_peer1; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); + + n = read(c0, &b, 1); + if (n < 0) + FAIL_ERRNO("%s: read", log_prefix); + if (n == 0) + FAIL("%s: incomplete read", log_prefix); + +close_peer1: + xclose(p1); +close_cli1: + xclose(c1); +close_peer0: + xclose(p0); +close_cli0: + xclose(c0); +close_srv: + xclose(s); +} + +static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(parser, sock_map, BPF_SK_SKB_STREAM_PARSER, 0); + if (err) + return; + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) + goto detach; + + redir_to_connected(family, sotype, sock_map, verdict_map, + REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT); +detach: + xbpf_prog_detach2(parser, sock_map, BPF_SK_SKB_STREAM_PARSER); +} + +static void test_msg_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_MSG_VERDICT, 0); + if (err) + return; + + redir_to_connected(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); +} + +static void redir_to_listening(int family, int sotype, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + struct sockaddr_storage addr; + int s, c, p, err, n; + unsigned int drop; + socklen_t len; + u64 value; + u32 key; + + zero_verdict_count(verd_mapfd); + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + c = xsocket(family, sotype, 0); + if (c < 0) + goto close_srv; + err = xconnect(c, sockaddr(&addr), len); + if (err) + goto close_cli; + + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) + goto close_cli; + + key = 0; + value = s; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_peer; + + key = 1; + value = p; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_peer; + + n = write(mode == REDIR_INGRESS ? 
c : p, "a", 1); + if (n < 0 && errno != EACCES) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close_peer; + + key = SK_DROP; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &drop); + if (err) + goto close_peer; + if (drop != 1) + FAIL("%s: want drop count 1, have %d", log_prefix, drop); + +close_peer: + xclose(p); +close_cli: + xclose(c); +close_srv: + xclose(s); +} + +static void test_skb_redir_to_listening(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(parser, sock_map, BPF_SK_SKB_STREAM_PARSER, 0); + if (err) + return; + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) + goto detach; + + redir_to_listening(family, sotype, sock_map, verdict_map, + REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT); +detach: + xbpf_prog_detach2(parser, sock_map, BPF_SK_SKB_STREAM_PARSER); +} + +static void test_msg_redir_to_listening(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_MSG_VERDICT, 0); + if (err) + return; + + redir_to_listening(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); +} + +static void test_reuseport_select_listening(int family, int sotype, + int sock_map, int verd_map, + int reuseport_prog) +{ + struct sockaddr_storage addr; + unsigned int pass; + int s, c, err; + socklen_t len; + u64 value; + u32 key; + + zero_verdict_count(verd_map); + + s = socket_loopback_reuseport(family, sotype | SOCK_NONBLOCK, + reuseport_prog); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + key = 0; + value = s; + err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); + if (err) + goto close_srv; + + c = xsocket(family, sotype, 0); + if (c < 0) + goto close_srv; + err = xconnect(c, sockaddr(&addr), len); + if (err) + goto close_cli; + + if (sotype == SOCK_STREAM) { + int p; + + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) + goto close_cli; + xclose(p); + } else { + char b = 'a'; + ssize_t n; + + n = xsend(c, &b, sizeof(b), 0); + if (n == -1) + goto close_cli; + + n = xrecv_nonblock(s, &b, sizeof(b), 0); + if (n == -1) + goto close_cli; + } + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_map, &key, &pass); + if (err) + goto close_cli; + if (pass != 1) + FAIL("want pass count 1, have %d", pass); + +close_cli: + xclose(c); +close_srv: + xclose(s); +} + +static void test_reuseport_select_connected(int family, int sotype, + int sock_map, int verd_map, + int reuseport_prog) +{ + struct sockaddr_storage addr; + int s, c0, c1, p0, err; + unsigned int drop; + socklen_t len; + u64 value; + u32 key; + + zero_verdict_count(verd_map); + + s = socket_loopback_reuseport(family, sotype, reuseport_prog); + if (s < 0) + return; + + /* Populate sock_map[0] to avoid ENOENT on first connection */ + key = 0; + value = s; + err = xbpf_map_update_elem(sock_map, &key, 
&value, BPF_NOEXIST); + if (err) + goto close_srv; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + c0 = xsocket(family, sotype, 0); + if (c0 < 0) + goto close_srv; + + err = xconnect(c0, sockaddr(&addr), len); + if (err) + goto close_cli0; + + if (sotype == SOCK_STREAM) { + p0 = xaccept_nonblock(s, NULL, NULL); + if (p0 < 0) + goto close_cli0; + } else { + p0 = xsocket(family, sotype, 0); + if (p0 < 0) + goto close_cli0; + + len = sizeof(addr); + err = xgetsockname(c0, sockaddr(&addr), &len); + if (err) + goto close_cli0; + + err = xconnect(p0, sockaddr(&addr), len); + if (err) + goto close_cli0; + } + + /* Update sock_map[0] to redirect to a connected socket */ + key = 0; + value = p0; + err = xbpf_map_update_elem(sock_map, &key, &value, BPF_EXIST); + if (err) + goto close_peer0; + + c1 = xsocket(family, sotype, 0); + if (c1 < 0) + goto close_peer0; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + errno = 0; + err = connect(c1, sockaddr(&addr), len); + if (sotype == SOCK_DGRAM) { + char b = 'a'; + ssize_t n; + + n = xsend(c1, &b, sizeof(b), 0); + if (n == -1) + goto close_cli1; + + n = recv_timeout(c1, &b, sizeof(b), 0, IO_TIMEOUT_SEC); + err = n == -1; + } + if (!err || errno != ECONNREFUSED) + FAIL_ERRNO("connect: expected ECONNREFUSED"); + + key = SK_DROP; + err = xbpf_map_lookup_elem(verd_map, &key, &drop); + if (err) + goto close_cli1; + if (drop != 1) + FAIL("want drop count 1, have %d", drop); + +close_cli1: + xclose(c1); +close_peer0: + xclose(p0); +close_cli0: + xclose(c0); +close_srv: + xclose(s); +} + +/* Check that redirecting across reuseport groups is not allowed. */ +static void test_reuseport_mixed_groups(int family, int sotype, int sock_map, + int verd_map, int reuseport_prog) +{ + struct sockaddr_storage addr; + int s1, s2, c, err; + unsigned int drop; + socklen_t len; + u64 value; + u32 key; + + zero_verdict_count(verd_map); + + /* Create two listeners, each in its own reuseport group */ + s1 = socket_loopback_reuseport(family, sotype, reuseport_prog); + if (s1 < 0) + return; + + s2 = socket_loopback_reuseport(family, sotype, reuseport_prog); + if (s2 < 0) + goto close_srv1; + + key = 0; + value = s1; + err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); + if (err) + goto close_srv2; + + key = 1; + value = s2; + err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); + + /* Connect to s2, reuseport BPF selects s1 via sock_map[0] */ + len = sizeof(addr); + err = xgetsockname(s2, sockaddr(&addr), &len); + if (err) + goto close_srv2; + + c = xsocket(family, sotype, 0); + if (c < 0) + goto close_srv2; + + err = connect(c, sockaddr(&addr), len); + if (sotype == SOCK_DGRAM) { + char b = 'a'; + ssize_t n; + + n = xsend(c, &b, sizeof(b), 0); + if (n == -1) + goto close_cli; + + n = recv_timeout(c, &b, sizeof(b), 0, IO_TIMEOUT_SEC); + err = n == -1; + } + if (!err || errno != ECONNREFUSED) { + FAIL_ERRNO("connect: expected ECONNREFUSED"); + goto close_cli; + } + + /* Expect drop, can't redirect outside of reuseport group */ + key = SK_DROP; + err = xbpf_map_lookup_elem(verd_map, &key, &drop); + if (err) + goto close_cli; + if (drop != 1) + FAIL("want drop count 1, have %d", drop); + +close_cli: + xclose(c); +close_srv2: + xclose(s2); +close_srv1: + xclose(s1); +} + +#define TEST(fn, ...) 
\ + { \ + fn, #fn, __VA_ARGS__ \ + } + +static void test_ops_cleanup(const struct bpf_map *map) +{ + const struct bpf_map_def *def; + int err, mapfd; + u32 key; + + def = bpf_map__def(map); + mapfd = bpf_map__fd(map); + + for (key = 0; key < def->max_entries; key++) { + err = bpf_map_delete_elem(mapfd, &key); + if (err && errno != EINVAL && errno != ENOENT) + FAIL_ERRNO("map_delete: expected EINVAL/ENOENT"); + } +} + +static const char *family_str(sa_family_t family) +{ + switch (family) { + case AF_INET: + return "IPv4"; + case AF_INET6: + return "IPv6"; + default: + return "unknown"; + } +} + +static const char *map_type_str(const struct bpf_map *map) +{ + const struct bpf_map_def *def; + + def = bpf_map__def(map); + if (IS_ERR(def)) + return "invalid"; + + switch (def->type) { + case BPF_MAP_TYPE_SOCKMAP: + return "sockmap"; + case BPF_MAP_TYPE_SOCKHASH: + return "sockhash"; + default: + return "unknown"; + } +} + +static const char *sotype_str(int sotype) +{ + switch (sotype) { + case SOCK_DGRAM: + return "UDP"; + case SOCK_STREAM: + return "TCP"; + default: + return "unknown"; + } +} + +static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, + int family, int sotype) +{ + const struct op_test { + void (*fn)(int family, int sotype, int mapfd); + const char *name; + int sotype; + } tests[] = { + /* insert */ + TEST(test_insert_invalid), + TEST(test_insert_opened), + TEST(test_insert_bound, SOCK_STREAM), + TEST(test_insert), + /* delete */ + TEST(test_delete_after_insert), + TEST(test_delete_after_close), + /* lookup */ + TEST(test_lookup_after_insert), + TEST(test_lookup_after_delete), + TEST(test_lookup_32_bit_value), + /* update */ + TEST(test_update_existing), + /* races with insert/delete */ + TEST(test_destroy_orphan_child, SOCK_STREAM), + TEST(test_syn_recv_insert_delete, SOCK_STREAM), + TEST(test_race_insert_listen, SOCK_STREAM), + /* child clone */ + TEST(test_clone_after_delete, SOCK_STREAM), + TEST(test_accept_after_delete, SOCK_STREAM), + TEST(test_accept_before_delete, SOCK_STREAM), + }; + const char *family_name, *map_name, *sotype_name; + const struct op_test *t; + char s[MAX_TEST_NAME]; + int map_fd; + + family_name = family_str(family); + map_name = map_type_str(map); + sotype_name = sotype_str(sotype); + map_fd = bpf_map__fd(map); + + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + snprintf(s, sizeof(s), "%s %s %s %s", map_name, family_name, + sotype_name, t->name); + + if (t->sotype != 0 && t->sotype != sotype) + continue; + + if (!test__start_subtest(s)) + continue; + + t->fn(family, sotype, map_fd); + test_ops_cleanup(map); + } +} + +static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int family, int sotype) +{ + const struct redir_test { + void (*fn)(struct test_sockmap_listen *skel, + struct bpf_map *map, int family, int sotype); + const char *name; + } tests[] = { + TEST(test_skb_redir_to_connected), + TEST(test_skb_redir_to_listening), + TEST(test_msg_redir_to_connected), + TEST(test_msg_redir_to_listening), + }; + const char *family_name, *map_name; + const struct redir_test *t; + char s[MAX_TEST_NAME]; + + family_name = family_str(family); + map_name = map_type_str(map); + + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, + t->name); + + if (!test__start_subtest(s)) + continue; + + t->fn(skel, map, family, sotype); + } +} + +static void test_reuseport(struct test_sockmap_listen *skel, + struct bpf_map *map, int family, int sotype) +{ + const 
struct reuseport_test { + void (*fn)(int family, int sotype, int socket_map, + int verdict_map, int reuseport_prog); + const char *name; + int sotype; + } tests[] = { + TEST(test_reuseport_select_listening), + TEST(test_reuseport_select_connected), + TEST(test_reuseport_mixed_groups), + }; + int socket_map, verdict_map, reuseport_prog; + const char *family_name, *map_name, *sotype_name; + const struct reuseport_test *t; + char s[MAX_TEST_NAME]; + + family_name = family_str(family); + map_name = map_type_str(map); + sotype_name = sotype_str(sotype); + + socket_map = bpf_map__fd(map); + verdict_map = bpf_map__fd(skel->maps.verdict_map); + reuseport_prog = bpf_program__fd(skel->progs.prog_reuseport); + + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + snprintf(s, sizeof(s), "%s %s %s %s", map_name, family_name, + sotype_name, t->name); + + if (t->sotype != 0 && t->sotype != sotype) + continue; + + if (!test__start_subtest(s)) + continue; + + t->fn(family, sotype, socket_map, verdict_map, reuseport_prog); + } +} + +static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, + int family) +{ + test_ops(skel, map, family, SOCK_STREAM); + test_ops(skel, map, family, SOCK_DGRAM); + test_redir(skel, map, family, SOCK_STREAM); + test_reuseport(skel, map, family, SOCK_STREAM); + test_reuseport(skel, map, family, SOCK_DGRAM); +} + +void test_sockmap_listen(void) +{ + struct test_sockmap_listen *skel; + + skel = test_sockmap_listen__open_and_load(); + if (!skel) { + FAIL("skeleton open/load failed"); + return; + } + + skel->bss->test_sockmap = true; + run_tests(skel, skel->maps.sock_map, AF_INET); + run_tests(skel, skel->maps.sock_map, AF_INET6); + + skel->bss->test_sockmap = false; + run_tests(skel, skel->maps.sock_hash, AF_INET); + run_tests(skel, skel->maps.sock_hash, AF_INET6); + + test_sockmap_listen__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index f4cd60d6fba2..e56b52ab41da 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -188,7 +188,7 @@ static int start_server(void) }; int fd; - fd = socket(AF_INET, SOCK_STREAM, 0); + fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0); if (fd < 0) { log_err("Failed to create server socket"); return -1; @@ -205,6 +205,7 @@ static int start_server(void) static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER; +static volatile bool server_done = false; static void *server_thread(void *arg) { @@ -222,23 +223,24 @@ static void *server_thread(void *arg) if (CHECK_FAIL(err < 0)) { perror("Failed to listed on socket"); - return NULL; + return ERR_PTR(err); } - client_fd = accept(fd, (struct sockaddr *)&addr, &len); + while (true) { + client_fd = accept(fd, (struct sockaddr *)&addr, &len); + if (client_fd == -1 && errno == EAGAIN) { + usleep(50); + continue; + } + break; + } if (CHECK_FAIL(client_fd < 0)) { perror("Failed to accept client"); - return NULL; + return ERR_PTR(err); } - /* Wait for the next connection (that never arrives) - * to keep this thread alive to prevent calling - * close() on client_fd. 
- */ - if (CHECK_FAIL(accept(fd, (struct sockaddr *)&addr, &len) >= 0)) { - perror("Unexpected success in second accept"); - return NULL; - } + while (!server_done) + usleep(50); close(client_fd); @@ -249,6 +251,7 @@ void test_tcp_rtt(void) { int server_fd, cgroup_fd; pthread_t tid; + void *server_res; cgroup_fd = test__join_cgroup("/tcp_rtt"); if (CHECK_FAIL(cgroup_fd < 0)) @@ -267,6 +270,11 @@ void test_tcp_rtt(void) pthread_mutex_unlock(&server_started_mtx); CHECK_FAIL(run_test(cgroup_fd, server_fd)); + + server_done = true; + CHECK_FAIL(pthread_join(tid, &server_res)); + CHECK_FAIL(IS_ERR(server_res)); + close_server_fd: close(server_fd); close_cgroup_fd: diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c new file mode 100644 index 000000000000..1e4c258de09d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. + */ + +#include <test_progs.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <unistd.h> +#include <malloc.h> +#include <stdlib.h> + +#include "lsm.skel.h" + +char *CMD_ARGS[] = {"true", NULL}; + +int heap_mprotect(void) +{ + void *buf; + long sz; + int ret; + + sz = sysconf(_SC_PAGESIZE); + if (sz < 0) + return sz; + + buf = memalign(sz, 2 * sz); + if (buf == NULL) + return -ENOMEM; + + ret = mprotect(buf, sz, PROT_READ | PROT_WRITE | PROT_EXEC); + free(buf); + return ret; +} + +int exec_cmd(int *monitored_pid) +{ + int child_pid, child_status; + + child_pid = fork(); + if (child_pid == 0) { + *monitored_pid = getpid(); + execvp(CMD_ARGS[0], CMD_ARGS); + return -EINVAL; + } else if (child_pid > 0) { + waitpid(child_pid, &child_status, 0); + return child_status; + } + + return -EINVAL; +} + +void test_test_lsm(void) +{ + struct lsm *skel = NULL; + int err, duration = 0; + + skel = lsm__open_and_load(); + if (CHECK(!skel, "skel_load", "lsm skeleton failed\n")) + goto close_prog; + + err = lsm__attach(skel); + if (CHECK(err, "attach", "lsm attach failed: %d\n", err)) + goto close_prog; + + err = exec_cmd(&skel->bss->monitored_pid); + if (CHECK(err < 0, "exec_cmd", "err %d errno %d\n", err, errno)) + goto close_prog; + + CHECK(skel->bss->bprm_count != 1, "bprm_count", "bprm_count = %d\n", + skel->bss->bprm_count); + + skel->bss->monitored_pid = getpid(); + + err = heap_mprotect(); + if (CHECK(errno != EPERM, "heap_mprotect", "want errno=EPERM, got %d\n", + errno)) + goto close_prog; + + CHECK(skel->bss->mprotect_count != 1, "mprotect_count", + "mprotect_count = %d\n", skel->bss->mprotect_count); + +close_prog: + lsm__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c index 1f6ccdaed1ac..781c8d11604b 100644 --- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c +++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c @@ -55,31 +55,40 @@ void test_trampoline_count(void) /* attach 'allowed' 40 trampoline programs */ for (i = 0; i < MAX_TRAMP_PROGS; i++) { obj = bpf_object__open_file(object, NULL); - if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) + if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) { + obj = NULL; goto cleanup; + } err = bpf_object__load(obj); if (CHECK(err, "obj_load", "err %d\n", err)) goto cleanup; inst[i].obj = obj; + obj = NULL; if (rand() % 2) { - link = load(obj, fentry_name); - if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", 
PTR_ERR(link))) + link = load(inst[i].obj, fentry_name); + if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) { + link = NULL; goto cleanup; + } inst[i].link_fentry = link; } else { - link = load(obj, fexit_name); - if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) + link = load(inst[i].obj, fexit_name); + if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) { + link = NULL; goto cleanup; + } inst[i].link_fexit = link; } } /* and try 1 extra.. */ obj = bpf_object__open_file(object, NULL); - if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) + if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) { + obj = NULL; goto cleanup; + } err = bpf_object__load(obj); if (CHECK(err, "obj_load", "err %d\n", err)) @@ -104,7 +113,9 @@ void test_trampoline_count(void) cleanup_extra: bpf_object__close(obj); cleanup: - while (--i) { + if (i >= MAX_TRAMP_PROGS) + i = MAX_TRAMP_PROGS - 1; + for (; i >= 0; i--) { bpf_link__destroy(inst[i].link_fentry); bpf_link__destroy(inst[i].link_fexit); bpf_object__close(inst[i].obj); diff --git a/tools/testing/selftests/bpf/prog_tests/vmlinux.c b/tools/testing/selftests/bpf/prog_tests/vmlinux.c new file mode 100644 index 000000000000..72310cfc6474 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/vmlinux.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <test_progs.h> +#include <time.h> +#include "test_vmlinux.skel.h" + +#define MY_TV_NSEC 1337 + +static void nsleep() +{ + struct timespec ts = { .tv_nsec = MY_TV_NSEC }; + + (void)syscall(__NR_nanosleep, &ts, NULL); +} + +void test_vmlinux(void) +{ + int duration = 0, err; + struct test_vmlinux* skel; + struct test_vmlinux__bss *bss; + + skel = test_vmlinux__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + bss = skel->bss; + + err = test_vmlinux__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* trigger everything */ + nsleep(); + + CHECK(!bss->tp_called, "tp", "not called\n"); + CHECK(!bss->raw_tp_called, "raw_tp", "not called\n"); + CHECK(!bss->tp_btf_called, "tp_btf", "not called\n"); + CHECK(!bss->kprobe_called, "kprobe", "not called\n"); + CHECK(!bss->fentry_called, "fentry", "not called\n"); + +cleanup: + test_vmlinux__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c new file mode 100644 index 000000000000..05b294d6b923 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> + +#define IFINDEX_LO 1 +#define XDP_FLAGS_REPLACE (1U << 4) + +void test_xdp_attach(void) +{ + struct bpf_object *obj1, *obj2, *obj3; + const char *file = "./test_xdp.o"; + int err, fd1, fd2, fd3; + __u32 duration = 0; + DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, + .old_fd = -1); + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj1, &fd1); + if (CHECK_FAIL(err)) + return; + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj2, &fd2); + if (CHECK_FAIL(err)) + goto out_1; + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj3, &fd3); + if (CHECK_FAIL(err)) + goto out_2; + + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd1, XDP_FLAGS_REPLACE, + &opts); + if (CHECK(err, "load_ok", "initial load failed")) + goto out_close; + + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd2, XDP_FLAGS_REPLACE, + &opts); + if (CHECK(!err, 
"load_fail", "load with expected id didn't fail")) + goto out; + + opts.old_fd = fd1; + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd2, 0, &opts); + if (CHECK(err, "replace_ok", "replace valid old_fd failed")) + goto out; + + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd3, 0, &opts); + if (CHECK(!err, "replace_fail", "replace invalid old_fd didn't fail")) + goto out; + + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, 0, &opts); + if (CHECK(!err, "remove_fail", "remove invalid old_fd didn't fail")) + goto out; + + opts.old_fd = fd2; + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, 0, &opts); + if (CHECK(err, "remove_ok", "remove valid old_fd failed")) + goto out; + +out: + bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0); +out_close: + bpf_object__close(obj3); +out_2: + bpf_object__close(obj2); +out_1: + bpf_object__close(obj1); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c index 6b56bdc73ebc..a0f688c37023 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -4,17 +4,51 @@ #include "test_xdp.skel.h" #include "test_xdp_bpf2bpf.skel.h" +struct meta { + int ifindex; + int pkt_len; +}; + +static void on_sample(void *ctx, int cpu, void *data, __u32 size) +{ + int duration = 0; + struct meta *meta = (struct meta *)data; + struct ipv4_packet *trace_pkt_v4 = data + sizeof(*meta); + + if (CHECK(size < sizeof(pkt_v4) + sizeof(*meta), + "check_size", "size %u < %zu\n", + size, sizeof(pkt_v4) + sizeof(*meta))) + return; + + if (CHECK(meta->ifindex != if_nametoindex("lo"), "check_meta_ifindex", + "meta->ifindex = %d\n", meta->ifindex)) + return; + + if (CHECK(meta->pkt_len != sizeof(pkt_v4), "check_meta_pkt_len", + "meta->pkt_len = %zd\n", sizeof(pkt_v4))) + return; + + if (CHECK(memcmp(trace_pkt_v4, &pkt_v4, sizeof(pkt_v4)), + "check_packet_content", "content not the same\n")) + return; + + *(bool *)ctx = true; +} + void test_xdp_bpf2bpf(void) { __u32 duration = 0, retval, size; char buf[128]; int err, pkt_fd, map_fd; + bool passed = false; struct iphdr *iph = (void *)buf + sizeof(struct ethhdr); struct iptnl_info value4 = {.family = AF_INET}; struct test_xdp *pkt_skel = NULL; struct test_xdp_bpf2bpf *ftrace_skel = NULL; struct vip key4 = {.protocol = 6, .family = AF_INET}; - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + struct bpf_program *prog; + struct perf_buffer *pb = NULL; + struct perf_buffer_opts pb_opts = {}; /* Load XDP program to introspect */ pkt_skel = test_xdp__open_and_load(); @@ -27,11 +61,21 @@ void test_xdp_bpf2bpf(void) bpf_map_update_elem(map_fd, &key4, &value4, 0); /* Load trace program */ - opts.attach_prog_fd = pkt_fd, - ftrace_skel = test_xdp_bpf2bpf__open_opts(&opts); + ftrace_skel = test_xdp_bpf2bpf__open(); if (CHECK(!ftrace_skel, "__open", "ftrace skeleton failed\n")) goto out; + /* Demonstrate the bpf_program__set_attach_target() API rather than + * the load with options, i.e. opts.attach_prog_fd. 
+ */ + prog = ftrace_skel->progs.trace_on_entry; + bpf_program__set_expected_attach_type(prog, BPF_TRACE_FENTRY); + bpf_program__set_attach_target(prog, pkt_fd, "_xdp_tx_iptunnel"); + + prog = ftrace_skel->progs.trace_on_exit; + bpf_program__set_expected_attach_type(prog, BPF_TRACE_FEXIT); + bpf_program__set_attach_target(prog, pkt_fd, "_xdp_tx_iptunnel"); + err = test_xdp_bpf2bpf__load(ftrace_skel); if (CHECK(err, "__load", "ftrace skeleton failed\n")) goto out; @@ -40,6 +84,14 @@ void test_xdp_bpf2bpf(void) if (CHECK(err, "ftrace_attach", "ftrace attach failed: %d\n", err)) goto out; + /* Set up perf buffer */ + pb_opts.sample_cb = on_sample; + pb_opts.ctx = &passed; + pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), + 1, &pb_opts); + if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb))) + goto out; + /* Run test program */ err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); @@ -50,6 +102,15 @@ void test_xdp_bpf2bpf(void) err, errno, retval, size)) goto out; + /* Make sure bpf_xdp_output() was triggered and it sent the expected + * data to the perf ring buffer. + */ + err = perf_buffer__poll(pb, 100); + if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err)) + goto out; + + CHECK_FAIL(!passed); + /* Verify test results */ if (CHECK(ftrace_skel->bss->test_result_fentry != if_nametoindex("lo"), "result", "fentry failed err %llu\n", @@ -60,6 +121,8 @@ void test_xdp_bpf2bpf(void) "fexit failed err %llu\n", ftrace_skel->bss->test_result_fexit); out: + if (pb) + perf_buffer__free(pb); test_xdp__destroy(pkt_skel); test_xdp_bpf2bpf__destroy(ftrace_skel); } diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index b631fb5032d2..3fb4260570b1 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -6,14 +6,24 @@ * the kernel BPF logic. 
*/ +#include <stddef.h> #include <linux/bpf.h> #include <linux/types.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> #include "bpf_tcp_helpers.h" char _license[] SEC("license") = "GPL"; +int stg_result = 0; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_stg_map SEC(".maps"); + #define DCTCP_MAX_ALPHA 1024U struct dctcp { @@ -43,12 +53,18 @@ void BPF_PROG(dctcp_init, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct dctcp *ca = inet_csk_ca(sk); + int *stg; ca->prior_rcv_nxt = tp->rcv_nxt; ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->loss_cwnd = 0; ca->ce_state = 0; + stg = bpf_sk_storage_get(&sk_stg_map, (void *)tp, NULL, 0); + if (stg) { + stg_result = *stg; + bpf_sk_storage_delete(&sk_stg_map, (void *)tp); + } dctcp_reset(tp, ca); } diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index d4a02fe44a12..31975c96e2c9 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -13,7 +13,7 @@ enum e1 { enum e2 { C = 100, - D = -100, + D = 4294967295, E = 0, }; diff --git a/tools/testing/selftests/bpf/progs/fentry_test.c b/tools/testing/selftests/bpf/progs/fentry_test.c index 38d3a82144ca..9365b686f84b 100644 --- a/tools/testing/selftests/bpf/progs/fentry_test.c +++ b/tools/testing/selftests/bpf/progs/fentry_test.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c index c329fccf9842..98e1efe14549 100644 --- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c @@ -5,7 +5,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> struct sk_buff { unsigned int len; diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c index 92f3fa47cf40..85c0b516d6ee 100644 --- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c +++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> struct sk_buff { unsigned int len; diff --git a/tools/testing/selftests/bpf/progs/fexit_test.c b/tools/testing/selftests/bpf/progs/fexit_test.c index 348109b9ea07..bd1e17d8024c 100644 --- a/tools/testing/selftests/bpf/progs/fexit_test.c +++ b/tools/testing/selftests/bpf/progs/fexit_test.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/kfree_skb.c b/tools/testing/selftests/bpf/progs/kfree_skb.c index 8f48a909f079..a46a264ce24e 100644 --- a/tools/testing/selftests/bpf/progs/kfree_skb.c +++ b/tools/testing/selftests/bpf/progs/kfree_skb.c @@ -4,7 +4,7 @@ #include <stdbool.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -#include 
"bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; struct { diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c new file mode 100644 index 000000000000..a4e3c223028d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 Google LLC. + */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <errno.h> + +char _license[] SEC("license") = "GPL"; + +int monitored_pid = 0; +int mprotect_count = 0; +int bprm_count = 0; + +SEC("lsm/file_mprotect") +int BPF_PROG(test_int_hook, struct vm_area_struct *vma, + unsigned long reqprot, unsigned long prot, int ret) +{ + if (ret != 0) + return ret; + + __u32 pid = bpf_get_current_pid_tgid() >> 32; + int is_heap = 0; + + is_heap = (vma->vm_start >= vma->vm_mm->start_brk && + vma->vm_end <= vma->vm_mm->brk); + + if (is_heap && monitored_pid == pid) { + mprotect_count++; + ret = -EPERM; + } + + return ret; +} + +SEC("lsm/bprm_committed_creds") +int BPF_PROG(test_void_hook, struct linux_binprm *bprm) +{ + __u32 pid = bpf_get_current_pid_tgid() >> 32; + + if (monitored_pid == pid) + bprm_count++; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/modify_return.c b/tools/testing/selftests/bpf/progs/modify_return.c new file mode 100644 index 000000000000..8b7466a15c6b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/modify_return.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 Google LLC. + */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +static int sequence = 0; +__s32 input_retval = 0; + +__u64 fentry_result = 0; +SEC("fentry/bpf_modify_return_test") +int BPF_PROG(fentry_test, int a, __u64 b) +{ + sequence++; + fentry_result = (sequence == 1); + return 0; +} + +__u64 fmod_ret_result = 0; +SEC("fmod_ret/bpf_modify_return_test") +int BPF_PROG(fmod_ret_test, int a, int *b, int ret) +{ + sequence++; + /* This is the first fmod_ret program, the ret passed should be 0 */ + fmod_ret_result = (sequence == 2 && ret == 0); + return input_retval; +} + +__u64 fexit_result = 0; +SEC("fexit/bpf_modify_return_test") +int BPF_PROG(fexit_test, int a, __u64 b, int ret) +{ + sequence++; + /* If the input_reval is non-zero a successful modification should have + * occurred. 
+ */ + if (input_retval) + fexit_result = (sequence == 3 && ret == input_retval); + else + fexit_result = (sequence == 3 && ret == 4); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c index a5c6d5903b22..ca283af80d4e 100644 --- a/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c +++ b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c @@ -12,7 +12,6 @@ int bpf_prog1(struct __sk_buff *skb) __u32 lport = skb->local_port; __u32 rport = skb->remote_port; __u8 *d = data; - __u32 len = (__u32) data_end - (__u32) data; int err; if (data + 10 > data_end) { diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index dd8fae6660ab..8056a4c6d918 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -4,6 +4,7 @@ #include <linux/ptrace.h> #include <linux/bpf.h> #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> int kprobe_res = 0; int kretprobe_res = 0; @@ -18,7 +19,7 @@ int handle_kprobe(struct pt_regs *ctx) } SEC("kretprobe/sys_nanosleep") -int handle_kretprobe(struct pt_regs *ctx) +int BPF_KRETPROBE(handle_kretprobe) { kretprobe_res = 2; return 0; diff --git a/tools/testing/selftests/bpf/progs/test_cgroup_link.c b/tools/testing/selftests/bpf/progs/test_cgroup_link.c new file mode 100644 index 000000000000..77e47b9e4446 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cgroup_link.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +int calls = 0; +int alt_calls = 0; + +SEC("cgroup_skb/egress1") +int egress(struct __sk_buff *skb) +{ + __sync_fetch_and_add(&calls, 1); + return 1; +} + +SEC("cgroup_skb/egress2") +int egress_alt(struct __sk_buff *skb) +{ + __sync_fetch_and_add(&alt_calls, 1); + return 1; +} + +char _license[] SEC("license") = "GPL"; + diff --git a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp_err.c b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp_err.c new file mode 100644 index 000000000000..8941a41c2a55 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp_err.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +#define MAX_STACK_RAWTP 10 + +SEC("raw_tracepoint/sys_enter") +int bpf_prog2(void *ctx) +{ + __u64 stack[MAX_STACK_RAWTP]; + int error; + + /* set all the flags which should return -EINVAL */ + error = bpf_get_stack(ctx, stack, 0, -1); + if (error < 0) + goto loop; + + return error; +loop: + while (1) { + error++; + } +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_global_data.c b/tools/testing/selftests/bpf/progs/test_global_data.c index dd7a4d3dbc0d..1319be1c54ba 100644 --- a/tools/testing/selftests/bpf/progs/test_global_data.c +++ b/tools/testing/selftests/bpf/progs/test_global_data.c @@ -68,7 +68,7 @@ static struct foo struct3 = { bpf_map_update_elem(&result_##map, &key, var, 0); \ } while (0) -SEC("static_data_load") +SEC("classifier/static_data_load") int load_static_data(struct __sk_buff *skb) { static const __u64 bar = ~0; diff --git a/tools/testing/selftests/bpf/progs/test_link_pinning.c b/tools/testing/selftests/bpf/progs/test_link_pinning.c new file mode 100644 index 000000000000..bbf2a5264dc0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_link_pinning.c @@ 
-0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <stdbool.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +int in = 0; +int out = 0; + +SEC("raw_tp/sys_enter") +int raw_tp_prog(const void *ctx) +{ + out = in; + return 0; +} + +SEC("tp_btf/sys_enter") +int tp_btf_prog(const void *ctx) +{ + out = in; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c new file mode 100644 index 000000000000..1dca70a6de2f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Carlos Neira cneirabustos@gmail.com */ + +#include <linux/bpf.h> +#include <stdint.h> +#include <bpf/bpf_helpers.h> + +static volatile struct { + __u64 dev; + __u64 ino; + __u64 pid_tgid; + __u64 user_pid_tgid; +} res; + +SEC("raw_tracepoint/sys_enter") +int trace(void *ctx) +{ + __u64 ns_pid_tgid, expected_pid; + struct bpf_pidns_info nsdata; + __u32 key = 0; + + if (bpf_get_ns_current_pid_tgid(res.dev, res.ino, &nsdata, + sizeof(struct bpf_pidns_info))) + return 0; + + ns_pid_tgid = (__u64)nsdata.tgid << 32 | nsdata.pid; + expected_pid = res.user_pid_tgid; + + if (expected_pid != ns_pid_tgid) + return 0; + + res.pid_tgid = ns_pid_tgid; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c index bfe9fbcb9684..56a50b25cd33 100644 --- a/tools/testing/selftests/bpf/progs/test_overhead.c +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -6,7 +6,6 @@ #include <linux/ptrace.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -#include "bpf_trace_helpers.h" struct task_struct; @@ -17,11 +16,9 @@ int BPF_KPROBE(prog1, struct task_struct *tsk, const char *buf, bool exec) } SEC("kretprobe/__set_task_comm") -int BPF_KRETPROBE(prog2, - struct task_struct *tsk, const char *buf, bool exec, - int ret) +int BPF_KRETPROBE(prog2, int ret) { - return !PT_REGS_PARM1(ctx) && ret; + return ret; } SEC("raw_tp/task_rename") diff --git a/tools/testing/selftests/bpf/progs/test_perf_branches.c b/tools/testing/selftests/bpf/progs/test_perf_branches.c new file mode 100644 index 000000000000..a1ccc831c882 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_perf_branches.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook + +#include <stddef.h> +#include <linux/ptrace.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +int valid = 0; +int required_size_out = 0; +int written_stack_out = 0; +int written_global_out = 0; + +struct { + __u64 _a; + __u64 _b; + __u64 _c; +} fpbe[30] = {0}; + +SEC("perf_event") +int perf_branches(void *ctx) +{ + __u64 entries[4 * 3] = {0}; + int required_size, written_stack, written_global; + + /* write to stack */ + written_stack = bpf_read_branch_records(ctx, entries, sizeof(entries), 0); + /* ignore spurious events */ + if (!written_stack) + return 1; + + /* get required size */ + required_size = bpf_read_branch_records(ctx, NULL, 0, + BPF_F_GET_BRANCH_RECORDS_SIZE); + + written_global = bpf_read_branch_records(ctx, fpbe, sizeof(fpbe), 0); + /* ignore spurious events */ + if (!written_global) + return 1; + + required_size_out = required_size; + written_stack_out = written_stack; + written_global_out = written_global; + 
valid = 1; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_perf_buffer.c b/tools/testing/selftests/bpf/progs/test_perf_buffer.c index ebfcc9f50c35..ad59c4c9aba8 100644 --- a/tools/testing/selftests/bpf/progs/test_perf_buffer.c +++ b/tools/testing/selftests/bpf/progs/test_perf_buffer.c @@ -4,7 +4,7 @@ #include <linux/ptrace.h> #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); diff --git a/tools/testing/selftests/bpf/progs/test_probe_user.c b/tools/testing/selftests/bpf/progs/test_probe_user.c index d556b1572cc6..89b3532ccc75 100644 --- a/tools/testing/selftests/bpf/progs/test_probe_user.c +++ b/tools/testing/selftests/bpf/progs/test_probe_user.c @@ -7,7 +7,6 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -#include "bpf_trace_helpers.h" static struct sockaddr_in old; diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c index 1acc91e87bfc..b4233d3efac2 100644 --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c @@ -31,6 +31,12 @@ int send_signal_tp(void *ctx) return bpf_send_signal_test(ctx); } +SEC("tracepoint/sched/sched_switch") +int send_signal_tp_sched(void *ctx) +{ + return bpf_send_signal_test(ctx); +} + SEC("perf_event") int send_signal_perf(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c new file mode 100644 index 000000000000..8f530843b4da --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Cloudflare Ltd. +// Copyright (c) 2020 Isovalent, Inc. + +#include <stddef.h> +#include <stdbool.h> +#include <string.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/pkt_cls.h> +#include <linux/tcp.h> +#include <sys/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. 
*/ +static inline struct bpf_sock_tuple * +get_tuple(struct __sk_buff *skb, bool *ipv4, bool *tcp) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct bpf_sock_tuple *result; + struct ethhdr *eth; + __u64 tuple_len; + __u8 proto = 0; + __u64 ihl_len; + + eth = (struct ethhdr *)(data); + if (eth + 1 > data_end) + return NULL; + + if (eth->h_proto == bpf_htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(data + sizeof(*eth)); + + if (iph + 1 > data_end) + return NULL; + if (iph->ihl != 5) + /* Options are not supported */ + return NULL; + ihl_len = iph->ihl * 4; + proto = iph->protocol; + *ipv4 = true; + result = (struct bpf_sock_tuple *)&iph->saddr; + } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + sizeof(*eth)); + + if (ip6h + 1 > data_end) + return NULL; + ihl_len = sizeof(*ip6h); + proto = ip6h->nexthdr; + *ipv4 = false; + result = (struct bpf_sock_tuple *)&ip6h->saddr; + } else { + return (struct bpf_sock_tuple *)data; + } + + if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) + return NULL; + + *tcp = (proto == IPPROTO_TCP); + return result; +} + +static inline int +handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) +{ + struct bpf_sock_tuple ln = {0}; + struct bpf_sock *sk; + size_t tuple_len; + int ret; + + tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); + if ((void *)tuple + tuple_len > (void *)(long)skb->data_end) + return TC_ACT_SHOT; + + sk = bpf_sk_lookup_udp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); + if (sk) + goto assign; + + if (ipv4) { + if (tuple->ipv4.dport != bpf_htons(4321)) + return TC_ACT_OK; + + ln.ipv4.daddr = bpf_htonl(0x7f000001); + ln.ipv4.dport = bpf_htons(1234); + + sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv4), + BPF_F_CURRENT_NETNS, 0); + } else { + if (tuple->ipv6.dport != bpf_htons(4321)) + return TC_ACT_OK; + + /* Upper parts of daddr are already zero. */ + ln.ipv6.daddr[3] = bpf_htonl(0x1); + ln.ipv6.dport = bpf_htons(1234); + + sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv6), + BPF_F_CURRENT_NETNS, 0); + } + + /* workaround: We can't do a single socket lookup here, because then + * the compiler will likely spill tuple_len to the stack. This makes it + * lose all bounds information in the verifier, which then rejects the + * call as unsafe. + */ + if (!sk) + return TC_ACT_SHOT; + +assign: + ret = bpf_sk_assign(skb, sk, 0); + bpf_sk_release(sk); + return ret; +} + +static inline int +handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) +{ + struct bpf_sock_tuple ln = {0}; + struct bpf_sock *sk; + size_t tuple_len; + int ret; + + tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); + if ((void *)tuple + tuple_len > (void *)(long)skb->data_end) + return TC_ACT_SHOT; + + sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); + if (sk) { + if (sk->state != BPF_TCP_LISTEN) + goto assign; + bpf_sk_release(sk); + } + + if (ipv4) { + if (tuple->ipv4.dport != bpf_htons(4321)) + return TC_ACT_OK; + + ln.ipv4.daddr = bpf_htonl(0x7f000001); + ln.ipv4.dport = bpf_htons(1234); + + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), + BPF_F_CURRENT_NETNS, 0); + } else { + if (tuple->ipv6.dport != bpf_htons(4321)) + return TC_ACT_OK; + + /* Upper parts of daddr are already zero. 
*/ + ln.ipv6.daddr[3] = bpf_htonl(0x1); + ln.ipv6.dport = bpf_htons(1234); + + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), + BPF_F_CURRENT_NETNS, 0); + } + + /* workaround: We can't do a single socket lookup here, because then + * the compiler will likely spill tuple_len to the stack. This makes it + * lose all bounds information in the verifier, which then rejects the + * call as unsafe. + */ + if (!sk) + return TC_ACT_SHOT; + + if (sk->state != BPF_TCP_LISTEN) { + bpf_sk_release(sk); + return TC_ACT_SHOT; + } + +assign: + ret = bpf_sk_assign(skb, sk, 0); + bpf_sk_release(sk); + return ret; +} + +SEC("classifier/sk_assign_test") +int bpf_sk_assign_test(struct __sk_buff *skb) +{ + struct bpf_sock_tuple *tuple, ln = {0}; + bool ipv4 = false; + bool tcp = false; + int tuple_len; + int ret = 0; + + tuple = get_tuple(skb, &ipv4, &tcp); + if (!tuple) + return TC_ACT_SHOT; + + /* Note that the verifier socket return type for bpf_skc_lookup_tcp() + * differs from bpf_sk_lookup_udp(), so even though the C-level type is + * the same here, if we try to share the implementations they will + * fail to verify because we're crossing pointer types. + */ + if (tcp) + ret = handle_tcp(skb, tuple, ipv4); + else + ret = handle_udp(skb, tuple, ipv4); + + return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT; +} diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c index 202de3938494..b02ea589ce7e 100644 --- a/tools/testing/selftests/bpf/progs/test_skb_ctx.c +++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c @@ -23,6 +23,8 @@ int process(struct __sk_buff *skb) return 1; if (skb->gso_segs != 8) return 1; + if (skb->gso_size != 10) + return 1; return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c new file mode 100644 index 000000000000..a3a366c57ce1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Cloudflare + +#include <errno.h> +#include <stdbool.h> +#include <linux/bpf.h> + +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_hash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, unsigned int); +} verdict_map SEC(".maps"); + +static volatile bool test_sockmap; /* toggled by user-space */ + +SEC("sk_skb/stream_parser") +int prog_skb_parser(struct __sk_buff *skb) +{ + return skb->len; +} + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + unsigned int *count; + __u32 zero = 0; + int verdict; + + if (test_sockmap) + verdict = bpf_sk_redirect_map(skb, &sock_map, zero, 0); + else + verdict = bpf_sk_redirect_hash(skb, &sock_hash, &zero, 0); + + count = bpf_map_lookup_elem(&verdict_map, &verdict); + if (count) + (*count)++; + + return verdict; +} + +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + unsigned int *count; + __u32 zero = 0; + int verdict; + + if (test_sockmap) + verdict = bpf_msg_redirect_map(msg, &sock_map, zero, 0); + else + verdict = bpf_msg_redirect_hash(msg, &sock_hash, &zero, 0); + + count = bpf_map_lookup_elem(&verdict_map, &verdict); + if (count) + (*count)++; + + return verdict; +} + 
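/* Illustrative sketch, not part of the patch: the sk_skb/sk_msg verdict
 * programs above count their outcomes in verdict_map, using the verdict
 * itself (SK_DROP == 0, SK_PASS == 1) as the array index. The user-space
 * half of this patch attaches them to the sock map and reads the counters
 * back roughly like this (names taken from the selftest earlier in this
 * patch):
 *
 *	int map_fd  = bpf_map__fd(skel->maps.sock_map);
 *	int prog_fd = bpf_program__fd(skel->progs.prog_msg_verdict);
 *	unsigned int pass = 0;
 *	int key = SK_PASS;
 *
 *	bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0);
 *	(drive traffic through the sockets stored in sock_map)
 *	bpf_map_lookup_elem(bpf_map__fd(skel->maps.verdict_map), &key, &pass);
 *	bpf_prog_detach2(prog_fd, map_fd, BPF_SK_MSG_VERDICT);
 */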
+SEC("sk_reuseport") +int prog_reuseport(struct sk_reuseport_md *reuse) +{ + unsigned int *count; + int err, verdict; + __u32 zero = 0; + + if (test_sockmap) + err = bpf_sk_select_reuseport(reuse, &sock_map, &zero, 0); + else + err = bpf_sk_select_reuseport(reuse, &sock_hash, &zero, 0); + verdict = err ? SK_DROP : SK_PASS; + + count = bpf_map_lookup_elem(&verdict_map, &verdict); + if (count) + (*count)++; + + return verdict; +} + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_trampoline_count.c b/tools/testing/selftests/bpf/progs/test_trampoline_count.c index e51e6e3a81c2..f030e469d05b 100644 --- a/tools/testing/selftests/bpf/progs/test_trampoline_count.c +++ b/tools/testing/selftests/bpf/progs/test_trampoline_count.c @@ -2,7 +2,8 @@ #include <stdbool.h> #include <stddef.h> #include <linux/bpf.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> struct task_struct; diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c b/tools/testing/selftests/bpf/progs/test_vmlinux.c new file mode 100644 index 000000000000..5611b564d3b1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include "vmlinux.h" +#include <asm/unistd.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +#define MY_TV_NSEC 1337 + +bool tp_called = false; +bool raw_tp_called = false; +bool tp_btf_called = false; +bool kprobe_called = false; +bool fentry_called = false; + +SEC("tp/syscalls/sys_enter_nanosleep") +int handle__tp(struct trace_event_raw_sys_enter *args) +{ + struct __kernel_timespec *ts; + + if (args->id != __NR_nanosleep) + return 0; + + ts = (void *)args->args[0]; + if (BPF_CORE_READ(ts, tv_nsec) != MY_TV_NSEC) + return 0; + + tp_called = true; + return 0; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(handle__raw_tp, struct pt_regs *regs, long id) +{ + struct __kernel_timespec *ts; + + if (id != __NR_nanosleep) + return 0; + + ts = (void *)PT_REGS_PARM1_CORE(regs); + if (BPF_CORE_READ(ts, tv_nsec) != MY_TV_NSEC) + return 0; + + raw_tp_called = true; + return 0; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(handle__tp_btf, struct pt_regs *regs, long id) +{ + struct __kernel_timespec *ts; + + if (id != __NR_nanosleep) + return 0; + + ts = (void *)PT_REGS_PARM1_CORE(regs); + if (BPF_CORE_READ(ts, tv_nsec) != MY_TV_NSEC) + return 0; + + tp_btf_called = true; + return 0; +} + +SEC("kprobe/hrtimer_nanosleep") +int BPF_KPROBE(handle__kprobe, + ktime_t rqtp, enum hrtimer_mode mode, clockid_t clockid) +{ + if (rqtp == MY_TV_NSEC) + kprobe_called = true; + return 0; +} + +SEC("fentry/hrtimer_nanosleep") +int BPF_PROG(handle__fentry, + ktime_t rqtp, enum hrtimer_mode mode, clockid_t clockid) +{ + if (rqtp == MY_TV_NSEC) + fentry_called = true; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c index cb8a04ab7a78..a038e827f850 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> +#include <bpf/bpf_tracing.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" + +char _license[] SEC("license") = "GPL"; struct net_device { /* Structure does not need to contain all entries, @@ 
-27,16 +29,38 @@ struct xdp_buff { struct xdp_rxq_info *rxq; } __attribute__((preserve_access_index)); +struct meta { + int ifindex; + int pkt_len; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} perf_buf_map SEC(".maps"); + __u64 test_result_fentry = 0; -SEC("fentry/_xdp_tx_iptunnel") +SEC("fentry/FUNC") int BPF_PROG(trace_on_entry, struct xdp_buff *xdp) { + struct meta meta; + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + + meta.ifindex = xdp->rxq->dev->ifindex; + meta.pkt_len = data_end - data; + bpf_xdp_output(xdp, &perf_buf_map, + ((__u64) meta.pkt_len << 32) | + BPF_F_CURRENT_CPU, + &meta, sizeof(meta)); + test_result_fentry = xdp->rxq->dev->ifindex; return 0; } __u64 test_result_fexit = 0; -SEC("fexit/_xdp_tx_iptunnel") +SEC("fexit/FUNC") int BPF_PROG(trace_on_exit, struct xdp_buff *xdp, int ret) { test_result_fexit = ret; diff --git a/tools/testing/selftests/bpf/test_bpftool.py b/tools/testing/selftests/bpf/test_bpftool.py new file mode 100644 index 000000000000..4fed2dc25c0a --- /dev/null +++ b/tools/testing/selftests/bpf/test_bpftool.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2020 SUSE LLC. + +import collections +import functools +import json +import os +import socket +import subprocess +import unittest + + +# Add the source tree of bpftool and /usr/local/sbin to PATH +cur_dir = os.path.dirname(os.path.realpath(__file__)) +bpftool_dir = os.path.abspath(os.path.join(cur_dir, "..", "..", "..", "..", + "tools", "bpf", "bpftool")) +os.environ["PATH"] = bpftool_dir + ":/usr/local/sbin:" + os.environ["PATH"] + + +class IfaceNotFoundError(Exception): + pass + + +class UnprivilegedUserError(Exception): + pass + + +def _bpftool(args, json=True): + _args = ["bpftool"] + if json: + _args.append("-j") + _args.extend(args) + + return subprocess.check_output(_args) + + +def bpftool(args): + return _bpftool(args, json=False).decode("utf-8") + + +def bpftool_json(args): + res = _bpftool(args) + return json.loads(res) + + +def get_default_iface(): + for iface in socket.if_nameindex(): + if iface[1] != "lo": + return iface[1] + raise IfaceNotFoundError("Could not find any network interface to probe") + + +def default_iface(f): + @functools.wraps(f) + def wrapper(*args, **kwargs): + iface = get_default_iface() + return f(*args, iface, **kwargs) + return wrapper + + +class TestBpftool(unittest.TestCase): + @classmethod + def setUpClass(cls): + if os.getuid() != 0: + raise UnprivilegedUserError( + "This test suite needs root privileges") + + @default_iface + def test_feature_dev_json(self, iface): + unexpected_helpers = [ + "bpf_probe_write_user", + "bpf_trace_printk", + ] + expected_keys = [ + "syscall_config", + "program_types", + "map_types", + "helpers", + "misc", + ] + + res = bpftool_json(["feature", "probe", "dev", iface]) + # Check if the result has all expected keys. + self.assertCountEqual(res.keys(), expected_keys) + # Check if unexpected helpers are not included in helpers probes + # result. 
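# A default probe (without "full") deliberately skips bpf_trace_printk()
# and bpf_probe_write_user(), since probing them can emit warnings to the
# kernel log; these two helpers should therefore never appear in the
# reported lists.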
+ for helpers in res["helpers"].values(): + for unexpected_helper in unexpected_helpers: + self.assertNotIn(unexpected_helper, helpers) + + def test_feature_kernel(self): + test_cases = [ + bpftool_json(["feature", "probe", "kernel"]), + bpftool_json(["feature", "probe"]), + bpftool_json(["feature"]), + ] + unexpected_helpers = [ + "bpf_probe_write_user", + "bpf_trace_printk", + ] + expected_keys = [ + "syscall_config", + "system_config", + "program_types", + "map_types", + "helpers", + "misc", + ] + + for tc in test_cases: + # Check if the result has all expected keys. + self.assertCountEqual(tc.keys(), expected_keys) + # Check if unexpected helpers are not included in helpers probes + # result. + for helpers in tc["helpers"].values(): + for unexpected_helper in unexpected_helpers: + self.assertNotIn(unexpected_helper, helpers) + + def test_feature_kernel_full(self): + test_cases = [ + bpftool_json(["feature", "probe", "kernel", "full"]), + bpftool_json(["feature", "probe", "full"]), + ] + expected_helpers = [ + "bpf_probe_write_user", + "bpf_trace_printk", + ] + + for tc in test_cases: + # Check if expected helpers are included at least once in any + # helpers list for any program type. Unfortunately we cannot assume + # that they will be included in all program types or a specific + # subset of programs. It depends on the kernel version and + # configuration. + found_helpers = False + + for helpers in tc["helpers"].values(): + if all(expected_helper in helpers + for expected_helper in expected_helpers): + found_helpers = True + break + + self.assertTrue(found_helpers) + + def test_feature_kernel_full_vs_not_full(self): + full_res = bpftool_json(["feature", "probe", "full"]) + not_full_res = bpftool_json(["feature", "probe"]) + not_full_set = set() + full_set = set() + + for helpers in full_res["helpers"].values(): + for helper in helpers: + full_set.add(helper) + + for helpers in not_full_res["helpers"].values(): + for helper in helpers: + not_full_set.add(helper) + + self.assertCountEqual(full_set - not_full_set, + {"bpf_probe_write_user", "bpf_trace_printk"}) + self.assertCountEqual(not_full_set - full_set, set()) + + def test_feature_macros(self): + expected_patterns = [ + r"/\*\*\* System call availability \*\*\*/", + r"#define HAVE_BPF_SYSCALL", + r"/\*\*\* eBPF program types \*\*\*/", + r"#define HAVE.*PROG_TYPE", + r"/\*\*\* eBPF map types \*\*\*/", + r"#define HAVE.*MAP_TYPE", + r"/\*\*\* eBPF helper functions \*\*\*/", + r"#define HAVE.*HELPER", + r"/\*\*\* eBPF misc features \*\*\*/", + ] + + res = bpftool(["feature", "probe", "macros"]) + for pattern in expected_patterns: + self.assertRegex(res, pattern) diff --git a/tools/testing/selftests/bpf/test_bpftool.sh b/tools/testing/selftests/bpf/test_bpftool.sh new file mode 100755 index 000000000000..66690778e36d --- /dev/null +++ b/tools/testing/selftests/bpf/test_bpftool.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2020 SUSE LLC. 
+ +python3 -m unittest -v test_bpftool.TestBpftool diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 93040ca83e60..8da77cda5f4a 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -1062,6 +1062,48 @@ static struct btf_raw_test raw_tests[] = { .err_str = "Member exceeds struct_size", }, +/* Test member unexceeds the size of struct + * + * enum E { + * E0, + * E1, + * }; + * + * struct A { + * char m; + * enum E __attribute__((packed)) n; + * }; + */ +{ + .descr = "size check test #5", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)), + /* char */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), + /* enum E { */ /* [3] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), 1), + BTF_ENUM_ENC(NAME_TBD, 0), + BTF_ENUM_ENC(NAME_TBD, 1), + /* } */ + /* struct A { */ /* [4] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 2), + BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* char m; */ + BTF_MEMBER_ENC(NAME_TBD, 3, 8),/* enum E __attribute__((packed)) n; */ + /* } */ + BTF_END_RAW, + }, + .str_sec = "\0E\0E0\0E1\0A\0m\0n", + .str_sec_size = sizeof("\0E\0E0\0E1\0A\0m\0n"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "size_check5_map", + .key_size = sizeof(int), + .value_size = 2, + .key_type_id = 1, + .value_type_id = 4, + .max_entries = 4, +}, + /* typedef const void * const_void_ptr; * struct A { * const_void_ptr m; diff --git a/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c b/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c new file mode 100644 index 000000000000..ed253f252cd0 --- /dev/null +++ b/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Carlos Neira cneirabustos@gmail.com */ +#define _GNU_SOURCE +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/syscall.h> +#include <sched.h> +#include <sys/wait.h> +#include <sys/mount.h> +#include "test_progs.h" + +#define CHECK_NEWNS(condition, tag, format...) ({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("%s:FAIL:%s ", __func__, tag); \ + printf(format); \ + } else { \ + printf("%s:PASS:%s\n", __func__, tag); \ + } \ + __ret; \ +}) + +struct bss { + __u64 dev; + __u64 ino; + __u64 pid_tgid; + __u64 user_pid_tgid; +}; + +int main(int argc, char **argv) +{ + pid_t pid; + int exit_code = 1; + struct stat st; + + printf("Testing bpf_get_ns_current_pid_tgid helper in new ns\n"); + + if (stat("/proc/self/ns/pid", &st)) { + perror("stat failed on /proc/self/ns/pid ns\n"); + printf("%s:FAILED\n", argv[0]); + return exit_code; + } + + if (CHECK_NEWNS(unshare(CLONE_NEWPID | CLONE_NEWNS), + "unshare CLONE_NEWPID | CLONE_NEWNS", "error errno=%d\n", errno)) + return exit_code; + + pid = fork(); + if (pid == -1) { + perror("Fork() failed\n"); + printf("%s:FAILED\n", argv[0]); + return exit_code; + } + + if (pid > 0) { + int status; + + usleep(5); + waitpid(pid, &status, 0); + return 0; + } else { + + pid = fork(); + if (pid == -1) { + perror("Fork() failed\n"); + printf("%s:FAILED\n", argv[0]); + return exit_code; + } + + if (pid > 0) { + int status; + waitpid(pid, &status, 0); + return 0; + } else { + if (CHECK_NEWNS(mount("none", "/proc", NULL, MS_PRIVATE|MS_REC, NULL), + "Unmounting proc", "Cannot umount proc! 
errno=%d\n", errno)) + return exit_code; + + if (CHECK_NEWNS(mount("proc", "/proc", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL), + "Mounting proc", "Cannot mount proc! errno=%d\n", errno)) + return exit_code; + + const char *probe_name = "raw_tracepoint/sys_enter"; + const char *file = "test_ns_current_pid_tgid.o"; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_map *bss_map; + struct bpf_object *obj; + int exit_code = 1; + int err, key = 0; + struct bss bss; + struct stat st; + __u64 id; + + obj = bpf_object__open_file(file, NULL); + if (CHECK_NEWNS(IS_ERR(obj), "obj_open", "err %ld\n", PTR_ERR(obj))) + return exit_code; + + err = bpf_object__load(obj); + if (CHECK_NEWNS(err, "obj_load", "err %d errno %d\n", err, errno)) + goto cleanup; + + bss_map = bpf_object__find_map_by_name(obj, "test_ns_.bss"); + if (CHECK_NEWNS(!bss_map, "find_bss_map", "failed\n")) + goto cleanup; + + prog = bpf_object__find_program_by_title(obj, probe_name); + if (CHECK_NEWNS(!prog, "find_prog", "prog '%s' not found\n", + probe_name)) + goto cleanup; + + memset(&bss, 0, sizeof(bss)); + pid_t tid = syscall(SYS_gettid); + pid_t pid = getpid(); + + id = (__u64) tid << 32 | pid; + bss.user_pid_tgid = id; + + if (CHECK_NEWNS(stat("/proc/self/ns/pid", &st), + "stat new ns", "Failed to stat /proc/self/ns/pid errno=%d\n", errno)) + goto cleanup; + + bss.dev = st.st_dev; + bss.ino = st.st_ino; + + err = bpf_map_update_elem(bpf_map__fd(bss_map), &key, &bss, 0); + if (CHECK_NEWNS(err, "setting_bss", "failed to set bss : %d\n", err)) + goto cleanup; + + link = bpf_program__attach_raw_tracepoint(prog, "sys_enter"); + if (CHECK_NEWNS(IS_ERR(link), "attach_raw_tp", "err %ld\n", + PTR_ERR(link))) { + link = NULL; + goto cleanup; + } + + /* trigger some syscalls */ + usleep(1); + + err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &key, &bss); + if (CHECK_NEWNS(err, "set_bss", "failed to get bss : %d\n", err)) + goto cleanup; + + if (CHECK_NEWNS(id != bss.pid_tgid, "Compare user pid/tgid vs. 
bpf pid/tgid", + "User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid)) + goto cleanup; + + exit_code = 0; + printf("%s:PASS\n", argv[0]); +cleanup: + if (!link) { + bpf_link__destroy(link); + link = NULL; + } + bpf_object__close(obj); + } + } +} diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 02eae1e864c2..c6766b2cff85 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -756,11 +756,7 @@ static void test_sockmap(unsigned int tasks, void *data) /* Test update without programs */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); - if (i < 2 && !err) { - printf("Allowed update sockmap '%i:%i' not in ESTABLISHED\n", - i, sfd[i]); - goto out_sockmap; - } else if (i >= 2 && err) { + if (err) { printf("Failed noprog update sockmap '%i:%i'\n", i, sfd[i]); goto out_sockmap; diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index bab1e6f1d8f1..b521e0a512b6 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1,11 +1,16 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook */ +#define _GNU_SOURCE #include "test_progs.h" #include "cgroup_helpers.h" #include "bpf_rlimit.h" #include <argp.h> +#include <pthread.h> +#include <sched.h> +#include <signal.h> #include <string.h> +#include <execinfo.h> /* backtrace */ /* defined in test_progs.h */ struct test_env env = {}; @@ -27,6 +32,20 @@ struct prog_test_def { int old_error_cnt; }; +/* Override C runtime library's usleep() implementation to ensure nanosleep() + * is always called. Usleep is frequently used in selftests as a way to + * trigger kprobe and tracepoints. + */ +int usleep(useconds_t usec) +{ + struct timespec ts = { + .tv_sec = usec / 1000000, + .tv_nsec = (usec % 1000000) * 1000, + }; + + return syscall(__NR_nanosleep, &ts, NULL); +} + static bool should_run(struct test_selector *sel, int num, const char *name) { int i; @@ -74,6 +93,34 @@ static void skip_account(void) } } +static void stdio_restore(void); + +/* A bunch of tests set custom affinity per-thread and/or per-process. Reset + * it after each test/sub-test. + */ +static void reset_affinity() { + + cpu_set_t cpuset; + int i, err; + + CPU_ZERO(&cpuset); + for (i = 0; i < env.nr_cpus; i++) + CPU_SET(i, &cpuset); + + err = sched_setaffinity(0, sizeof(cpuset), &cpuset); + if (err < 0) { + stdio_restore(); + fprintf(stderr, "Failed to reset process affinity: %d!\n", err); + exit(-1); + } + err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + if (err < 0) { + stdio_restore(); + fprintf(stderr, "Failed to reset thread affinity: %d!\n", err); + exit(-1); + } +} + void test__end_subtest() { struct prog_test_def *test = env.test; @@ -91,6 +138,8 @@ void test__end_subtest() test->test_num, test->subtest_num, test->subtest_name, sub_error_cnt ? 
"FAIL" : "OK"); + reset_affinity(); + free(test->subtest_name); test->subtest_name = NULL; } @@ -196,7 +245,7 @@ int bpf_find_map(const char *test, struct bpf_object *obj, const char *name) map = bpf_object__find_map_by_name(obj, name); if (!map) { - printf("%s:FAIL:map '%s' not found\n", test, name); + fprintf(stdout, "%s:FAIL:map '%s' not found\n", test, name); test__fail(); return -1; } @@ -367,7 +416,7 @@ static int libbpf_print_fn(enum libbpf_print_level level, { if (env.verbosity < VERBOSE_VERY && level == LIBBPF_DEBUG) return 0; - vprintf(format, args); + vfprintf(stdout, format, args); return 0; } @@ -408,7 +457,7 @@ err: int parse_num_list(const char *s, struct test_selector *sel) { - int i, set_len = 0, num, start = 0, end = -1; + int i, set_len = 0, new_len, num, start = 0, end = -1; bool *set = NULL, *tmp, parsing_end = false; char *next; @@ -443,18 +492,19 @@ int parse_num_list(const char *s, struct test_selector *sel) return -EINVAL; if (end + 1 > set_len) { - set_len = end + 1; - tmp = realloc(set, set_len); + new_len = end + 1; + tmp = realloc(set, new_len); if (!tmp) { free(set); return -ENOMEM; } + for (i = set_len; i < start; i++) + tmp[i] = false; set = tmp; + set_len = new_len; } - for (i = start; i <= end; i++) { + for (i = start; i <= end; i++) set[i] = true; - } - } if (!set) @@ -613,10 +663,27 @@ int cd_flavor_subdir(const char *exec_name) if (!flavor) return 0; flavor++; - printf("Switching to flavor '%s' subdirectory...\n", flavor); + fprintf(stdout, "Switching to flavor '%s' subdirectory...\n", flavor); return chdir(flavor); } +#define MAX_BACKTRACE_SZ 128 +void crash_handler(int signum) +{ + void *bt[MAX_BACKTRACE_SZ]; + size_t sz; + + sz = backtrace(bt, ARRAY_SIZE(bt)); + + if (env.test) + dump_test_log(env.test, true); + if (env.stdout) + stdio_restore(); + + fprintf(stderr, "Caught signal #%d!\nStack trace:\n", signum); + backtrace_symbols_fd(bt, sz, STDERR_FILENO); +} + int main(int argc, char **argv) { static const struct argp argp = { @@ -624,8 +691,14 @@ int main(int argc, char **argv) .parser = parse_arg, .doc = argp_program_doc, }; + struct sigaction sigact = { + .sa_handler = crash_handler, + .sa_flags = SA_RESETHAND, + }; int err, i; + sigaction(SIGSEGV, &sigact, NULL); + err = argp_parse(&argp, argc, argv, 0, NULL, &env); if (err) return err; @@ -639,6 +712,12 @@ int main(int argc, char **argv) srand(time(NULL)); env.jit_enabled = is_jit_enabled(); + env.nr_cpus = libbpf_num_possible_cpus(); + if (env.nr_cpus < 0) { + fprintf(stderr, "Failed to get number of CPUs: %d!\n", + env.nr_cpus); + return -1; + } stdio_hijack(); for (i = 0; i < prog_test_cnt; i++) { @@ -669,12 +748,13 @@ int main(int argc, char **argv) test->test_num, test->test_name, test->error_cnt ? 
"FAIL" : "OK"); + reset_affinity(); if (test->need_cgroup_cleanup) cleanup_cgroup_environment(); } stdio_restore(); - printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", - env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); + fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", + env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); free(env.test_selector.blacklist.strs); free(env.test_selector.whitelist.strs); diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index bcfa9ef23fda..f4aff6b8284b 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -71,6 +71,7 @@ struct test_env { FILE *stderr; char *log_buf; size_t log_cnt; + int nr_cpus; int succ_cnt; /* successful tests */ int sub_succ_cnt; /* successful sub-tests */ @@ -109,10 +110,10 @@ extern struct ipv6_packet pkt_v6; int __save_errno = errno; \ if (__ret) { \ test__fail(); \ - printf("%s:FAIL:%s ", __func__, tag); \ - printf(format); \ + fprintf(stdout, "%s:FAIL:%s ", __func__, tag); \ + fprintf(stdout, ##format); \ } else { \ - printf("%s:PASS:%s %d nsec\n", \ + fprintf(stdout, "%s:PASS:%s %d nsec\n", \ __func__, tag, duration); \ } \ errno = __save_errno; \ @@ -124,7 +125,7 @@ extern struct ipv6_packet pkt_v6; int __save_errno = errno; \ if (__ret) { \ test__fail(); \ - printf("%s:FAIL:%d\n", __func__, __LINE__); \ + fprintf(stdout, "%s:FAIL:%d\n", __func__, __LINE__); \ } \ errno = __save_errno; \ __ret; \ diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 7f989b3e4e22..4d0e913bbb22 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -4,12 +4,15 @@ #include <string.h> #include <assert.h> #include <errno.h> +#include <fcntl.h> #include <poll.h> #include <unistd.h> #include <linux/perf_event.h> #include <sys/mman.h> #include "trace_helpers.h" +#define DEBUGFS "/sys/kernel/debug/tracing/" + #define MAX_SYMS 300000 static struct ksym syms[MAX_SYMS]; static int sym_cnt; @@ -86,3 +89,23 @@ long ksym_get_addr(const char *name) return 0; } + +void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[4096]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 0383c9b8adc1..25ef597dd03f 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -12,5 +12,6 @@ struct ksym { int load_kallsyms(void); struct ksym *ksym_search(long key); long ksym_get_addr(const char *name); +void read_trace_pipe(void); #endif diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index d55f476f2237..4d0d09574bf4 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -257,17 +257,15 @@ * [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff] */ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), - /* no-op or OOB pointer computation */ + /* error on OOB pointer computation */ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), - /* potentially OOB access */ - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), /* exit */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, /* not actually fully 
unbounded, but the bound is very high */ - .errstr = "R0 unbounded memory access", + .errstr = "value 72057594021150720 makes map_value pointer be out of bounds", .result = REJECT }, { @@ -299,17 +297,15 @@ * [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff] */ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), - /* no-op or OOB pointer computation */ + /* error on OOB pointer computation */ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), - /* potentially OOB access */ - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), /* exit */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ - .errstr = "R0 unbounded memory access", + .errstr = "value 72057594021150720 makes map_value pointer be out of bounds", .result = REJECT }, { @@ -411,16 +407,14 @@ BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 31), /* r1 = 0xffff'fffe (NOT 0!) */ BPF_ALU32_IMM(BPF_SUB, BPF_REG_1, 2), - /* computes OOB pointer */ + /* error on computing OOB pointer */ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), - /* OOB access */ - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), /* exit */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 invalid mem access", + .errstr = "math between map_value pointer and 4294967294 is not allowed", .result = REJECT, }, { @@ -506,3 +500,42 @@ .errstr = "map_value pointer and 1000000000000", .result = REJECT }, +{ + "bounds check mixed 32bit and 64bit arithmatic. test1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + /* r1 = 0xffffFFFF00000001 */ + BPF_JMP32_IMM(BPF_JGT, BPF_REG_1, 1, 3), + /* check ALU64 op keeps 32bit bounds */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + BPF_JMP32_IMM(BPF_JGT, BPF_REG_1, 2, 1), + BPF_JMP_A(1), + /* invalid ldx if bounds are lost above */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT +}, +{ + "bounds check mixed 32bit and 64bit arithmatic. 
test2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + /* r1 = 0xffffFFFF00000001 */ + BPF_MOV64_IMM(BPF_REG_2, 3), + /* r1 = 0x2 */ + BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 1), + /* check ALU32 op zero extends 64bit bounds */ + BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 1), + BPF_JMP_A(1), + /* invalid ldx if bounds are lost above */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT +}, diff --git a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c index f24d50f09dbe..69b048cf46d9 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c +++ b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c @@ -9,17 +9,17 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 28), BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), - BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)), + BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)/2), BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), - BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)), + BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)/2), BPF_MOV64_IMM(BPF_REG_4, 256), BPF_EMIT_CALL(BPF_FUNC_get_stack), BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32), BPF_ALU64_IMM(BPF_ARSH, BPF_REG_8, 32), - BPF_JMP_REG(BPF_JSLT, BPF_REG_1, BPF_REG_8, 16), + BPF_JMP_REG(BPF_JSGT, BPF_REG_1, BPF_REG_8, 16), BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8), BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_8), @@ -29,7 +29,7 @@ BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1), BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), - BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)), + BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)/2), BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_5), BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 4), BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), diff --git a/tools/testing/selftests/bpf/verifier/ctx.c b/tools/testing/selftests/bpf/verifier/ctx.c index 92762c08f5e3..93d6b1641481 100644 --- a/tools/testing/selftests/bpf/verifier/ctx.c +++ b/tools/testing/selftests/bpf/verifier/ctx.c @@ -91,3 +91,108 @@ .result = REJECT, .errstr = "variable ctx access var_off=(0x0; 0x4)", }, +{ + "pass ctx or null check, 1: ctx", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, + .result = ACCEPT, +}, +{ + "pass ctx or null check, 2: null", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, + .result = ACCEPT, +}, +{ + "pass ctx or null check, 3: 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, + .result = REJECT, + .errstr = "R1 type=inv expected=ctx", +}, +{ + "pass ctx or null check, 4: ctx - const", + .insns = { + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612), + 
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, + .result = REJECT, + .errstr = "dereference of modified ctx ptr", +}, +{ + "pass ctx or null check, 5: null (connect)", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .expected_attach_type = BPF_CGROUP_INET4_CONNECT, + .result = ACCEPT, +}, +{ + "pass ctx or null check, 6: null (bind)", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = ACCEPT, +}, +{ + "pass ctx or null check, 7: ctx (bind)", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_socket_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = ACCEPT, +}, +{ + "pass ctx or null check, 8: null (bind)", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_socket_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = REJECT, + .errstr = "R1 type=inv expected=ctx", +}, diff --git a/tools/testing/selftests/bpf/verifier/ctx_skb.c b/tools/testing/selftests/bpf/verifier/ctx_skb.c index d438193804b2..2e16b8e268f2 100644 --- a/tools/testing/selftests/bpf/verifier/ctx_skb.c +++ b/tools/testing/selftests/bpf/verifier/ctx_skb.c @@ -1011,6 +1011,53 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { + "read gso_size from CGROUP_SKB", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, gso_size)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, +}, +{ + "read gso_size from CGROUP_SKB", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, gso_size)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, +}, +{ + "write gso_size from CGROUP_SKB", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, gso_size)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .result_unpriv = REJECT, + .errstr = "invalid bpf_context access off=176 size=4", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, +}, +{ + "read gso_size from CLS", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, gso_size)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, +}, +{ "check wire_len is not readable by sockets", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c index bf0322eb5346..bd5cae4a7f73 100644 --- a/tools/testing/selftests/bpf/verifier/jmp32.c +++ b/tools/testing/selftests/bpf/verifier/jmp32.c @@ -62,6 +62,21 @@ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { + "jset32: ignores 
upper bits", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_7, 0x8000000000000000), + BPF_LD_IMM64(BPF_REG_8, 0x8000000000000000), + BPF_JMP_REG(BPF_JSET, BPF_REG_7, BPF_REG_8, 1), + BPF_EXIT_INSN(), + BPF_JMP32_REG(BPF_JSET, BPF_REG_7, BPF_REG_8, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, +}, +{ "jset32: min/max deduction", .insns = { BPF_RAND_UEXT_R7, diff --git a/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh b/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh index 5ba5bef44d5b..bdffe698e1d1 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh @@ -45,6 +45,7 @@ ALL_TESTS=" blackhole_ipv6 " NUM_NETIFS=4 +: ${TIMEOUT:=20000} # ms source $lib_dir/tc_common.sh source $lib_dir/lib.sh @@ -123,7 +124,7 @@ blackhole_ipv4() skip_hw dst_ip 198.51.100.1 src_ip 192.0.2.1 ip_proto icmp \ action pass - ip -4 route show 198.51.100.0/30 | grep -q offload + busywait "$TIMEOUT" wait_for_offload ip -4 route show 198.51.100.0/30 check_err $? "route not marked as offloaded when should" ping_do $h1 198.51.100.1 @@ -147,7 +148,7 @@ blackhole_ipv6() skip_hw dst_ip 2001:db8:2::1 src_ip 2001:db8:1::1 \ ip_proto icmpv6 action pass - ip -6 route show 2001:db8:2::/120 | grep -q offload + busywait "$TIMEOUT" wait_for_offload ip -6 route show 2001:db8:2::/120 check_err $? "route not marked as offloaded when should" ping6_do $h1 2001:db8:2::1 diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh new file mode 100755 index 000000000000..26044e397157 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test devlink-trap ACL drops functionality over mlxsw. + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + ingress_flow_action_drop_test + egress_flow_action_drop_test +" +NUM_NETIFS=4 +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 +} + +h1_destroy() +{ + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 +} + +h2_destroy() +{ + simple_if_fini $h2 +} + +switch_create() +{ + ip link add dev br0 type bridge vlan_filtering 1 mcast_snooping 0 + + ip link set dev $swp1 master br0 + ip link set dev $swp2 master br0 + + ip link set dev br0 up + ip link set dev $swp1 up + ip link set dev $swp2 up + + tc qdisc add dev $swp1 clsact + tc qdisc add dev $swp2 clsact +} + +switch_destroy() +{ + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + + ip link set dev $swp2 down + ip link set dev $swp1 down + + ip link del dev br0 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + h1mac=$(mac_get $h1) + h2mac=$(mac_get $h2) + + vrf_prepare + + h1_create + h2_create + + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + + h2_destroy + h1_destroy + + vrf_cleanup +} + +ingress_flow_action_drop_test() +{ + local mz_pid + + tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \ + flower src_mac $h1mac action pass + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 \ + flower dst_ip 192.0.2.2 action drop + + $MZ $h1 -c 0 -p 100 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -d 1msec -q & + mz_pid=$! 
+ + RET=0 + + devlink_trap_drop_test ingress_flow_action_drop acl_drops $swp2 101 + + log_test "ingress_flow_action_drop" + + tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 +} + +egress_flow_action_drop_test() +{ + local mz_pid + + tc filter add dev $swp2 egress protocol ip pref 2 handle 102 \ + flower src_mac $h1mac action pass + + tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \ + flower dst_ip 192.0.2.2 action drop + + $MZ $h1 -c 0 -p 100 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -d 1msec -q & + mz_pid=$! + + RET=0 + + devlink_trap_drop_test egress_flow_action_drop acl_drops $swp2 102 + + log_test "egress_flow_action_drop" + + tc filter del dev $swp2 egress protocol ip pref 1 handle 101 flower + + devlink_trap_drop_cleanup $mz_pid $swp2 ip 2 102 +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh index 58cdbfb608e9..e7aecb065409 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh @@ -107,11 +107,11 @@ source_mac_is_multicast_test() RET=0 - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $group_name $swp2 101 log_test "Source MAC is multicast" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 } __vlan_tag_mismatch_test() @@ -132,7 +132,7 @@ __vlan_tag_mismatch_test() $MZ $h1 "$opt" -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $group_name $swp2 101 # Add PVID and make sure packets are no longer dropped. bridge vlan add vid 1 dev $swp1 pvid untagged master @@ -148,7 +148,7 @@ __vlan_tag_mismatch_test() devlink_trap_action_set $trap_name "drop" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 } vlan_tag_mismatch_untagged_test() @@ -193,7 +193,7 @@ ingress_vlan_filter_test() $MZ $h1 -Q $vid -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $group_name $swp2 101 # Add the VLAN on the bridge port and make sure packets are no longer # dropped. @@ -212,7 +212,7 @@ ingress_vlan_filter_test() log_test "Ingress VLAN filter" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 bridge vlan del vid $vid dev $swp1 master bridge vlan del vid $vid dev $swp2 master @@ -237,7 +237,7 @@ __ingress_stp_filter_test() $MZ $h1 -Q $vid -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $group_name $swp2 101 # Change STP state to forwarding and make sure packets are no longer # dropped. @@ -254,7 +254,7 @@ __ingress_stp_filter_test() devlink_trap_action_set $trap_name "drop" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 bridge vlan del vid $vid dev $swp1 master bridge vlan del vid $vid dev $swp2 master @@ -308,7 +308,7 @@ port_list_is_empty_uc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! 
- devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $group_name $swp2 101 # Allow packets to be flooded to one port. ip link set dev $swp2 type bridge_slave flood on @@ -326,7 +326,7 @@ port_list_is_empty_uc_test() log_test "Port list is empty - unicast" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 ip link set dev $swp1 type bridge_slave flood on } @@ -354,7 +354,7 @@ port_list_is_empty_mc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $group_name $swp2 101 # Allow packets to be flooded to one port. ip link set dev $swp2 type bridge_slave mcast_flood on @@ -372,7 +372,7 @@ port_list_is_empty_mc_test() log_test "Port list is empty - multicast" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 ip link set dev $swp1 type bridge_slave mcast_flood on } @@ -401,7 +401,7 @@ port_loopback_filter_uc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $group_name $swp2 101 # Allow packets to be flooded. ip link set dev $swp2 type bridge_slave flood on @@ -419,7 +419,7 @@ port_loopback_filter_uc_test() log_test "Port loopback filter - unicast" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 } port_loopback_filter_test() diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh index d88d8e47d11b..616f47d86a61 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh @@ -176,11 +176,11 @@ non_ip_test() 00:00 de:ad:be:ef" & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "Non IP" - devlink_trap_drop_cleanup $mz_pid $rp2 "ip" + devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101 } __uc_dip_over_mc_dmac_test() @@ -206,11 +206,11 @@ __uc_dip_over_mc_dmac_test() -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "Unicast destination IP over multicast destination MAC: $desc" - devlink_trap_drop_cleanup $mz_pid $rp2 $proto + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 } uc_dip_over_mc_dmac_test() @@ -242,11 +242,11 @@ __sip_is_loopback_test() -b $rp1mac -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "Source IP is loopback address: $desc" - devlink_trap_drop_cleanup $mz_pid $rp2 $proto + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 } sip_is_loopback_test() @@ -277,11 +277,11 @@ __dip_is_loopback_test() -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "Destination IP is loopback address: $desc" - devlink_trap_drop_cleanup $mz_pid $rp2 $proto + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 } dip_is_loopback_test() @@ -313,11 +313,11 @@ __sip_is_mc_test() -b $rp1mac -B $dip -d 1msec -q & mz_pid=$! 
- devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "Source IP is multicast: $desc" - devlink_trap_drop_cleanup $mz_pid $rp2 $proto + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 } sip_is_mc_test() @@ -345,11 +345,11 @@ ipv4_sip_is_limited_bc_test() -B $h2_ipv4 -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "IPv4 source IP is limited broadcast" - devlink_trap_drop_cleanup $mz_pid $rp2 "ip" + devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101 } ipv4_payload_get() @@ -399,11 +399,11 @@ __ipv4_header_corrupted_test() $MZ $h1 -c 0 -d 1msec -a $h1mac -b $rp1mac -q p=$payload & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "IP header corrupted: $desc: IPv4" - devlink_trap_drop_cleanup $mz_pid $rp2 "ip" + devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101 } ipv6_payload_get() @@ -446,11 +446,11 @@ __ipv6_header_corrupted_test() $MZ $h1 -c 0 -d 1msec -a $h1mac -b $rp1mac -q p=$payload & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "IP header corrupted: $desc: IPv6" - devlink_trap_drop_cleanup $mz_pid $rp2 "ip" + devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101 } ip_header_corrupted_test() @@ -485,11 +485,11 @@ ipv6_mc_dip_reserved_scope_test() "33:33:00:00:00:00" -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "IPv6 multicast destination IP reserved scope" - devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" + devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" 1 101 } ipv6_mc_dip_interface_local_scope_test() @@ -511,11 +511,11 @@ ipv6_mc_dip_interface_local_scope_test() "33:33:00:00:00:00" -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "IPv6 multicast destination IP interface-local scope" - devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" + devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" 1 101 } __blackhole_route_test() @@ -542,10 +542,10 @@ __blackhole_route_test() -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $group_name $rp2 101 log_test "Blackhole route: IPv$flags" - devlink_trap_drop_cleanup $mz_pid $rp2 $proto + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 ip -$flags route del blackhole $subnet } @@ -641,13 +641,9 @@ erif_disabled_test() mz_pid=$! sleep 5 - # In order to see this trap we need a route that points to disabled RIF. - # When ipv6 address is flushed, there is a delay and the routes are - # deleted before the RIF and we cannot get state that we have route - # to disabled RIF. - # Delete IPv6 address first and then check this trap with flushing IPv4. - ip -6 add flush dev br0 - ip -4 add flush dev br0 + # Unlinking the port from the bridge will disable the RIF associated + # with br0 as it is no longer an upper of any mlxsw port. 
+ ip link set dev $rp1 nomaster t1_packets=$(devlink_trap_rx_packets_get $trap_name) t1_bytes=$(devlink_trap_rx_bytes_get $trap_name) @@ -659,7 +655,6 @@ erif_disabled_test() log_test "Egress RIF disabled" kill $mz_pid && wait $mz_pid &> /dev/null - ip link set dev $rp1 nomaster __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 ip link del dev br0 type bridge devlink_trap_action_set $trap_name "drop" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh new file mode 100755 index 000000000000..47edf099a17e --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh @@ -0,0 +1,384 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test devlink-trap policer functionality over mlxsw. + +# +---------------------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/24 | +# | | | +# | | default via 192.0.2.2 | +# +----|----------------------------+ +# | +# +----|----------------------------------------------------------------------+ +# | SW | | +# | + $rp1 | +# | 192.0.2.2/24 | +# | | +# | 198.51.100.2/24 | +# | + $rp2 | +# | | | +# +----|----------------------------------------------------------------------+ +# | +# +----|----------------------------+ +# | | default via 198.51.100.2 | +# | | | +# | | 198.51.100.1/24 | +# | + $h2 | +# | H2 (vrf) | +# +---------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + rate_limits_test + burst_limits_test + rate_test + burst_test +" +NUM_NETIFS=4 +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 + mtu_set $h1 10000 + + ip -4 route add default vrf v$h1 nexthop via 192.0.2.2 +} + +h1_destroy() +{ + ip -4 route del default vrf v$h1 nexthop via 192.0.2.2 + + mtu_restore $h1 + simple_if_fini $h1 192.0.2.1/24 +} + +h2_create() +{ + simple_if_init $h2 198.51.100.1/24 + mtu_set $h2 10000 + + ip -4 route add default vrf v$h2 nexthop via 198.51.100.2 +} + +h2_destroy() +{ + ip -4 route del default vrf v$h2 nexthop via 198.51.100.2 + + mtu_restore $h2 + simple_if_fini $h2 198.51.100.1/24 +} + +router_create() +{ + ip link set dev $rp1 up + ip link set dev $rp2 up + + __addr_add_del $rp1 add 192.0.2.2/24 + __addr_add_del $rp2 add 198.51.100.2/24 + mtu_set $rp1 10000 + mtu_set $rp2 10000 + + ip -4 route add blackhole 198.51.100.100 + + devlink trap set $DEVLINK_DEV trap blackhole_route action trap +} + +router_destroy() +{ + devlink trap set $DEVLINK_DEV trap blackhole_route action drop + + ip -4 route del blackhole 198.51.100.100 + + mtu_restore $rp2 + mtu_restore $rp1 + __addr_add_del $rp2 del 198.51.100.2/24 + __addr_add_del $rp1 del 192.0.2.2/24 + + ip link set dev $rp2 down + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + rp1_mac=$(mac_get $rp1) + + vrf_prepare + + h1_create + h2_create + + router_create +} + +cleanup() +{ + pre_cleanup + + router_destroy + + h2_destroy + h1_destroy + + vrf_cleanup + + # Reload to ensure devlink-trap settings are back to default. + devlink_reload +} + +rate_limits_test() +{ + RET=0 + + devlink trap policer set $DEVLINK_DEV policer 1 rate 0 &> /dev/null + check_fail $? "Policer rate was changed to rate lower than limit" + devlink trap policer set $DEVLINK_DEV policer 1 \ + rate 2000000001 &> /dev/null + check_fail $? 
"Policer rate was changed to rate higher than limit" + + devlink trap policer set $DEVLINK_DEV policer 1 rate 1 + check_err $? "Failed to set policer rate to minimum" + devlink trap policer set $DEVLINK_DEV policer 1 rate 2000000000 + check_err $? "Failed to set policer rate to maximum" + + log_test "Trap policer rate limits" +} + +burst_limits_test() +{ + RET=0 + + devlink trap policer set $DEVLINK_DEV policer 1 burst 0 &> /dev/null + check_fail $? "Policer burst size was changed to 0" + devlink trap policer set $DEVLINK_DEV policer 1 burst 17 &> /dev/null + check_fail $? "Policer burst size was changed to burst size that is not power of 2" + devlink trap policer set $DEVLINK_DEV policer 1 burst 8 &> /dev/null + check_fail $? "Policer burst size was changed to burst size lower than limit" + devlink trap policer set $DEVLINK_DEV policer 1 \ + burst $((2**25)) &> /dev/null + check_fail $? "Policer burst size was changed to burst size higher than limit" + + devlink trap policer set $DEVLINK_DEV policer 1 burst 16 + check_err $? "Failed to set policer burst size to minimum" + devlink trap policer set $DEVLINK_DEV policer 1 burst $((2**24)) + check_err $? "Failed to set policer burst size to maximum" + + log_test "Trap policer burst size limits" +} + +trap_rate_get() +{ + local t0 t1 + + t0=$(devlink_trap_rx_packets_get blackhole_route) + sleep 10 + t1=$(devlink_trap_rx_packets_get blackhole_route) + + echo $(((t1 - t0) / 10)) +} + +policer_drop_rate_get() +{ + local id=$1; shift + local t0 t1 + + t0=$(devlink_trap_policer_rx_dropped_get $id) + sleep 10 + t1=$(devlink_trap_policer_rx_dropped_get $id) + + echo $(((t1 - t0) / 10)) +} + +__rate_test() +{ + local rate pct drop_rate + local id=$1; shift + + RET=0 + + devlink trap policer set $DEVLINK_DEV policer $id rate 1000 burst 16 + devlink trap group set $DEVLINK_DEV group l3_drops policer $id + + # Send packets at highest possible rate and make sure they are dropped + # by the policer. Make sure measured received rate is about 1000 pps + log_info "=== Tx rate: Highest, Policer rate: 1000 pps ===" + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac + + sleep 5 # Take measurements when rate is stable + + rate=$(trap_rate_get) + pct=$((100 * (rate - 1000) / 1000)) + ((-5 <= pct && pct <= 5)) + check_err $? "Expected rate 1000 pps, got $rate pps, which is $pct% off. Required accuracy is +-5%" + log_info "Expected rate 1000 pps, measured rate $rate pps" + + drop_rate=$(policer_drop_rate_get $id) + (( drop_rate > 0 )) + check_err $? "Expected non-zero policer drop rate, got 0" + log_info "Measured policer drop rate of $drop_rate pps" + + stop_traffic + + # Send packets at a rate of 1000 pps and make sure they are not dropped + # by the policer + log_info "=== Tx rate: 1000 pps, Policer rate: 1000 pps ===" + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -d 1msec + + sleep 5 # Take measurements when rate is stable + + drop_rate=$(policer_drop_rate_get $id) + (( drop_rate == 0 )) + check_err $? "Expected zero policer drop rate, got a drop rate of $drop_rate pps" + log_info "Measured policer drop rate of $drop_rate pps" + + stop_traffic + + # Unbind the policer and send packets at highest possible rate. 
Make + # sure they are not dropped by the policer and that the measured + # received rate is higher than 1000 pps + log_info "=== Tx rate: Highest, Policer rate: No policer ===" + + devlink trap group set $DEVLINK_DEV group l3_drops nopolicer + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac + + rate=$(trap_rate_get) + (( rate > 1000 )) + check_err $? "Expected rate higher than 1000 pps, got $rate pps" + log_info "Measured rate $rate pps" + + drop_rate=$(policer_drop_rate_get $id) + (( drop_rate == 0 )) + check_err $? "Expected zero policer drop rate, got a drop rate of $drop_rate pps" + log_info "Measured policer drop rate of $drop_rate pps" + + stop_traffic + + log_test "Trap policer rate" +} + +rate_test() +{ + local id + + for id in $(devlink_trap_policer_ids_get); do + echo + log_info "Running rate test for policer $id" + __rate_test $id + done +} + +__burst_test() +{ + local t0_rx t0_drop t1_rx t1_drop rx drop + local id=$1; shift + + RET=0 + + devlink trap policer set $DEVLINK_DEV policer $id rate 1000 burst 32 + devlink trap group set $DEVLINK_DEV group l3_drops policer $id + + # Send a burst of 64 packets and make sure that about 32 are received + # and the rest are dropped by the policer + log_info "=== Tx burst size: 64, Policer burst size: 32 pps ===" + + t0_rx=$(devlink_trap_rx_packets_get blackhole_route) + t0_drop=$(devlink_trap_policer_rx_dropped_get $id) + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -c 64 + + t1_rx=$(devlink_trap_rx_packets_get blackhole_route) + t1_drop=$(devlink_trap_policer_rx_dropped_get $id) + + rx=$((t1_rx - t0_rx)) + pct=$((100 * (rx - 32) / 32)) + ((-20 <= pct && pct <= 20)) + check_err $? "Expected burst size of 32 packets, got $rx packets, which is $pct% off. Required accuracy is +-20%" + log_info "Expected burst size of 32 packets, measured burst size of $rx packets" + + drop=$((t1_drop - t0_drop)) + (( drop > 0 )) + check_err $? "Expected non-zero policer drops, got 0" + log_info "Measured policer drops of $drop packets" + + # Send a burst of 16 packets and make sure that 16 are received + # and that none are dropped by the policer + log_info "=== Tx burst size: 16, Policer burst size: 32 pps ===" + + t0_rx=$(devlink_trap_rx_packets_get blackhole_route) + t0_drop=$(devlink_trap_policer_rx_dropped_get $id) + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -c 16 + + t1_rx=$(devlink_trap_rx_packets_get blackhole_route) + t1_drop=$(devlink_trap_policer_rx_dropped_get $id) + + rx=$((t1_rx - t0_rx)) + (( rx == 16 )) + check_err $? "Expected burst size of 16 packets, got $rx packets" + log_info "Expected burst size of 16 packets, measured burst size of $rx packets" + + drop=$((t1_drop - t0_drop)) + (( drop == 0 )) + check_err $? "Expected zero policer drops, got $drop" + log_info "Measured policer drops of $drop packets" + + # Unbind the policer and send a burst of 64 packets. Make sure that + # 64 packets are received and that none are dropped by the policer + log_info "=== Tx burst size: 64, Policer burst size: No policer ===" + + devlink trap group set $DEVLINK_DEV group l3_drops nopolicer + + t0_rx=$(devlink_trap_rx_packets_get blackhole_route) + t0_drop=$(devlink_trap_policer_rx_dropped_get $id) + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -c 64 + + t1_rx=$(devlink_trap_rx_packets_get blackhole_route) + t1_drop=$(devlink_trap_policer_rx_dropped_get $id) + + rx=$((t1_rx - t0_rx)) + (( rx == 64 )) + check_err $? 
"Expected burst size of 64 packets, got $rx packets" + log_info "Expected burst size of 64 packets, measured burst size of $rx packets" + + drop=$((t1_drop - t0_drop)) + (( drop == 0 )) + check_err $? "Expected zero policer drops, got $drop" + log_info "Measured policer drops of $drop packets" + + log_test "Trap policer burst size" +} + +burst_test() +{ + local id + + for id in $(devlink_trap_policer_ids_get); do + echo + log_info "Running burst size test for policer $id" + __burst_test $id + done +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh index fd19161dd4ec..e11a416323cf 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh @@ -314,11 +314,11 @@ overlay_smac_is_mc_test() -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp1 + devlink_trap_drop_test $trap_name $group_name $swp1 101 log_test "Overlay source MAC is multicast" - devlink_trap_drop_cleanup $mz_pid $swp1 "ip" + devlink_trap_drop_cleanup $mz_pid $swp1 "ip" 1 101 } trap cleanup EXIT diff --git a/tools/testing/selftests/drivers/net/mlxsw/extack.sh b/tools/testing/selftests/drivers/net/mlxsw/extack.sh index d72d8488a3b2..7a0a99c1d22f 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/extack.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/extack.sh @@ -8,7 +8,8 @@ lib_dir=$(dirname $0)/../../../net/forwarding ALL_TESTS=" netdev_pre_up_test vxlan_vlan_add_test - port_vlan_add_test + vxlan_bridge_create_test + bridge_create_test " NUM_NETIFS=2 source $lib_dir/lib.sh @@ -106,32 +107,56 @@ vxlan_vlan_add_test() ip link del dev br1 } -port_vlan_add_test() +vxlan_bridge_create_test() { RET=0 - ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 0 - # Unsupported configuration: mlxsw demands VXLAN with "noudpcsum". ip link add name vx1 up type vxlan id 1000 \ local 192.0.2.17 remote 192.0.2.18 \ dstport 4789 tos inherit ttl 100 - ip link set dev $swp1 master br1 - check_err $? - - bridge vlan del dev $swp1 vid 1 + # Test with VLAN-aware bridge. + ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 0 ip link set dev vx1 master br1 + + ip link set dev $swp1 master br1 2>&1 > /dev/null \ + | grep -q mlxsw_spectrum check_err $? - bridge vlan add dev $swp1 vid 1 pvid untagged 2>&1 >/dev/null \ + # Test with VLAN-unaware bridge. + ip link set dev br1 type bridge vlan_filtering 0 + + ip link set dev $swp1 master br1 2>&1 > /dev/null \ | grep -q mlxsw_spectrum check_err $? - log_test "extack - map VLAN at port" + log_test "extack - bridge creation with VXLAN" + ip link del dev br1 ip link del dev vx1 +} + +bridge_create_test() +{ + RET=0 + + ip link add name br1 up type bridge vlan_filtering 1 + ip link add name br2 up type bridge vlan_filtering 1 + + ip link set dev $swp1 master br1 + check_err $? + + # Only one VLAN-aware bridge is supported, so this should fail with + # an extack. + ip link set dev $swp2 master br2 2>&1 > /dev/null \ + | grep -q mlxsw_spectrum + check_err $? 
+ + log_test "extack - multiple VLAN-aware bridges creation" + + ip link del dev br2 ip link del dev br1 } diff --git a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh new file mode 100644 index 000000000000..cbe50f260a40 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +############################################################################## +# Defines + +if [[ ! -v MLXSW_CHIP ]]; then + MLXSW_CHIP=$(devlink -j dev info $DEVLINK_DEV | jq -r '.[][]["driver"]') + if [ -z "$MLXSW_CHIP" ]; then + echo "SKIP: Device $DEVLINK_DEV doesn't support devlink info command" + exit 1 + fi +fi diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh index eff6393ce974..71066bc4b886 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh @@ -114,23 +114,12 @@ ping_ipv4() ping_test $h1 192.0.2.2 } -wait_for_packets() -{ - local t0=$1; shift - local prio_observe=$1; shift - - local t1=$(ethtool_stats_get $swp1 rx_frames_prio_$prio_observe) - local delta=$((t1 - t0)) - echo $delta - ((delta >= 10)) -} - __test_defprio() { local prio_install=$1; shift local prio_observe=$1; shift - local delta local key + local t1 local i RET=0 @@ -139,9 +128,10 @@ __test_defprio() local t0=$(ethtool_stats_get $swp1 rx_frames_prio_$prio_observe) mausezahn -q $h1 -d 100m -c 10 -t arp reply - delta=$(busywait "$HIT_TIMEOUT" wait_for_packets $t0 $prio_observe) + t1=$(busywait "$HIT_TIMEOUT" until_counter_is ">= $((t0 + 10))" \ + ethtool_stats_get $swp1 rx_frames_prio_$prio_observe) - check_err $? "Default priority $prio_install/$prio_observe: Expected to capture 10 packets, got $delta." + check_err $? "Default priority $prio_install/$prio_observe: Expected to capture 10 packets, got $((t1 - t0))." log_test "Default priority $prio_install/$prio_observe" defprio_uninstall $swp1 $prio_install diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh index c745ce3befee..4cb2aa65278a 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh @@ -31,6 +31,7 @@ ALL_TESTS=" ping_ipv4 test_update test_no_update + test_pedit_norewrite test_dscp_leftover " @@ -56,6 +57,11 @@ zero() echo 0 } +three() +{ + echo 3 +} + h1_create() { simple_if_init $h1 192.0.2.1/28 @@ -103,6 +109,9 @@ switch_create() simple_if_init $swp1 192.0.2.2/28 __simple_if_init $swp2 v$swp1 192.0.2.17/28 + tc qdisc add dev $swp1 clsact + tc qdisc add dev $swp2 clsact + lldptool -T -i $swp1 -V APP $(dscp_map 0) >/dev/null lldptool -T -i $swp2 -V APP $(dscp_map 0) >/dev/null lldpad_app_wait_set $swp1 @@ -115,6 +124,9 @@ switch_destroy() lldptool -T -i $swp1 -V APP -d $(dscp_map 0) >/dev/null lldpad_app_wait_del + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + __simple_if_fini $swp2 192.0.2.17/28 simple_if_fini $swp1 192.0.2.2/28 } @@ -223,18 +235,36 @@ __test_update() test_update() { + echo "Test net.ipv4.ip_forward_update_priority=1" __test_update 1 reprioritize } test_no_update() { + echo "Test net.ipv4.ip_forward_update_priority=0" __test_update 0 echo } +# Test that when DSCP is updated in pedit, the DSCP rewrite is turned off. 
+test_pedit_norewrite() +{ + echo "Test no DSCP rewrite after DSCP is updated by pedit" + + tc filter add dev $swp1 ingress handle 101 pref 1 prot ip flower \ + action pedit ex munge ip dsfield set $((3 << 2)) retain 0xfc \ + action skbedit priority 3 + + __test_update 0 three + + tc filter del dev $swp1 ingress pref 1 +} + # Test that when the last APP rule is removed, the prio->DSCP map is properly # set to zeroes, and that the last APP rule does not stay active in the ASIC. test_dscp_leftover() { + echo "Test that last removed DSCP rule is deconfigured correctly" + lldptool -T -i $swp2 -V APP -d $(dscp_map 0) >/dev/null lldpad_app_wait_del diff --git a/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh index d231649b4f01..e93878d42596 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh @@ -2,16 +2,15 @@ # SPDX-License-Identifier: GPL-2.0 ROUTER_NUM_NETIFS=4 +: ${TIMEOUT:=20000} # ms router_h1_create() { simple_if_init $h1 192.0.1.1/24 - ip route add 193.0.0.0/8 via 192.0.1.2 dev $h1 } router_h1_destroy() { - ip route del 193.0.0.0/8 via 192.0.1.2 dev $h1 simple_if_fini $h1 192.0.1.1/24 } @@ -64,13 +63,15 @@ router_setup_prepare() router_create } -router_offload_validate() +wait_for_routes() { - local route_count=$1 - local offloaded_count + local t0=$1; shift + local route_count=$1; shift - offloaded_count=$(ip route | grep -o 'offload' | wc -l) - [[ $offloaded_count -ge $route_count ]] + local t1=$(ip route | grep -o 'offload' | wc -l) + local delta=$((t1 - t0)) + echo $delta + [[ $delta -ge $route_count ]] } router_routes_create() @@ -90,8 +91,8 @@ router_routes_create() break 3 fi - echo route add 193.${i}.${j}.${k}/32 via \ - 192.0.2.1 dev $rp2 >> $ROUTE_FILE + echo route add 193.${i}.${j}.${k}/32 dev $rp2 \ + >> $ROUTE_FILE ((count++)) done done @@ -111,45 +112,19 @@ router_test() { local route_count=$1 local should_fail=$2 - local count=0 + local delta RET=0 + local t0=$(ip route | grep -o 'offload' | wc -l) router_routes_create $route_count + delta=$(busywait "$TIMEOUT" wait_for_routes $t0 $route_count) - router_offload_validate $route_count - check_err_fail $should_fail $? "Offload of $route_count routes" + check_err_fail $should_fail $? "Offload routes: Expected $route_count, got $delta." if [[ $RET -ne 0 ]] || [[ $should_fail -eq 1 ]]; then return fi - tc filter add dev $h2 ingress protocol ip pref 1 flower \ - skip_sw dst_ip 193.0.0.0/8 action drop - - for i in {0..255} - do - for j in {0..255} - do - for k in {0..255} - do - if [[ $count -eq $route_count ]]; then - break 3 - fi - - $MZ $h1 -c 1 -p 64 -a $h1mac -b $rp1mac \ - -A 192.0.1.1 -B 193.${i}.${j}.${k} \ - -t ip -q - ((count++)) - done - done - done - - tc_check_packets "dev $h2 ingress" 1 $route_count - check_err $? 
"Offload mismatch" - - tc filter del dev $h2 ingress protocol ip pref 1 flower \ - skip_sw dst_ip 193.0.0.0/8 action drop - router_routes_destroy } diff --git a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh index 5c39e5f6a480..f4031002d5e9 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh @@ -32,6 +32,7 @@ ALL_TESTS=" devlink_reload_test " NUM_NETIFS=2 +: ${TIMEOUT:=20000} # ms source $lib_dir/lib.sh source $lib_dir/devlink_lib.sh @@ -360,20 +361,24 @@ vlan_rif_refcount_test() ip link add link br0 name br0.10 up type vlan id 10 ip -6 address add 2001:db8:1::1/64 dev br0.10 - ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 check_err $? "vlan rif was not created before adding port to vlan" bridge vlan add vid 10 dev $swp1 - ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 check_err $? "vlan rif was destroyed after adding port to vlan" bridge vlan del vid 10 dev $swp1 - ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 check_err $? "vlan rif was destroyed after removing port from vlan" ip link set dev $swp1 nomaster - ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 | grep -q offload - check_fail $? "vlan rif was not destroyed after unlinking port from bridge" + busywait "$TIMEOUT" not wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 + check_err $? "vlan rif was not destroyed after unlinking port from bridge" log_test "vlan rif refcount" @@ -401,22 +406,28 @@ subport_rif_refcount_test() ip -6 address add 2001:db8:1::1/64 dev bond1 ip -6 address add 2001:db8:2::1/64 dev bond1.10 - ip -6 route get fibmatch 2001:db8:1::2 dev bond1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev bond1 check_err $? "subport rif was not created on lag device" - ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 check_err $? "subport rif was not created on vlan device" ip link set dev $swp1 nomaster - ip -6 route get fibmatch 2001:db8:1::2 dev bond1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev bond1 check_err $? "subport rif of lag device was destroyed when should not" - ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 check_err $? "subport rif of vlan device was destroyed when should not" ip link set dev $swp2 nomaster - ip -6 route get fibmatch 2001:db8:1::2 dev bond1 | grep -q offload - check_fail $? "subport rif of lag device was not destroyed when should" - ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 | grep -q offload - check_fail $? "subport rif of vlan device was not destroyed when should" + busywait "$TIMEOUT" not wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev bond1 + check_err $? "subport rif of lag device was not destroyed when should" + busywait "$TIMEOUT" not wait_for_offload \ + ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 + check_err $? 
"subport rif of vlan device was not destroyed when should" log_test "subport rif refcount" @@ -575,7 +586,8 @@ bridge_extern_learn_test() bridge fdb add de:ad:be:ef:13:37 dev $swp1 master extern_learn - bridge fdb show brport $swp1 | grep de:ad:be:ef:13:37 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + bridge fdb show brport $swp1 de:ad:be:ef:13:37 check_err $? "fdb entry not marked as offloaded when should" log_test "externally learned fdb entry" @@ -595,9 +607,11 @@ neigh_offload_test() ip -6 neigh add 2001:db8:1::2 lladdr de:ad:be:ef:13:37 nud perm \ dev $swp1 - ip -4 neigh show dev $swp1 | grep 192.0.2.2 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -4 neigh show dev $swp1 192.0.2.2 check_err $? "ipv4 neigh entry not marked as offloaded when should" - ip -6 neigh show dev $swp1 | grep 2001:db8:1::2 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 neigh show dev $swp1 2001:db8:1::2 check_err $? "ipv6 neigh entry not marked as offloaded when should" log_test "neighbour offload indication" @@ -623,25 +637,31 @@ nexthop_offload_test() ip -6 route add 2001:db8:2::/64 vrf v$swp1 \ nexthop via 2001:db8:1::2 dev $swp1 - ip -4 route show 198.51.100.0/24 vrf v$swp1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -4 route show 198.51.100.0/24 vrf v$swp1 check_err $? "ipv4 nexthop not marked as offloaded when should" - ip -6 route show 2001:db8:2::/64 vrf v$swp1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route show 2001:db8:2::/64 vrf v$swp1 check_err $? "ipv6 nexthop not marked as offloaded when should" ip link set dev $swp2 down sleep 1 - ip -4 route show 198.51.100.0/24 vrf v$swp1 | grep -q offload - check_fail $? "ipv4 nexthop marked as offloaded when should not" - ip -6 route show 2001:db8:2::/64 vrf v$swp1 | grep -q offload - check_fail $? "ipv6 nexthop marked as offloaded when should not" + busywait "$TIMEOUT" not wait_for_offload \ + ip -4 route show 198.51.100.0/24 vrf v$swp1 + check_err $? "ipv4 nexthop marked as offloaded when should not" + busywait "$TIMEOUT" not wait_for_offload \ + ip -6 route show 2001:db8:2::/64 vrf v$swp1 + check_err $? "ipv6 nexthop marked as offloaded when should not" ip link set dev $swp2 up setup_wait - ip -4 route show 198.51.100.0/24 vrf v$swp1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -4 route show 198.51.100.0/24 vrf v$swp1 check_err $? "ipv4 nexthop not marked as offloaded after neigh add" - ip -6 route show 2001:db8:2::/64 vrf v$swp1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route show 2001:db8:2::/64 vrf v$swp1 check_err $? "ipv6 nexthop not marked as offloaded after neigh add" log_test "nexthop offload indication" diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh index c9fc4d4885c1..94c37124a840 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh @@ -56,11 +56,19 @@ switch_destroy() } # Callback from sch_ets_tests.sh -get_stats() +collect_stats() { - local band=$1; shift + local -a streams=("$@") + local stream - ethtool_stats_get "$h2" rx_octets_prio_$band + # Wait for qdisc counter update so that we don't get it mid-way through. 
+ busywait_for_counter 1000 +1 \ + qdisc_parent_stats_get $swp2 10:$((${streams[0]} + 1)) .bytes \ + > /dev/null + + for stream in ${streams[@]}; do + qdisc_parent_stats_get $swp2 10:$((stream + 1)) .bytes + done } bail_on_lldpad diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh new file mode 100644 index 000000000000..0d347d48c112 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh @@ -0,0 +1,533 @@ +# SPDX-License-Identifier: GPL-2.0 + +# This test sends a >1Gbps stream of traffic from H1, to the switch, which +# forwards it to a 1Gbps port. This 1Gbps stream is then looped back to the +# switch and forwarded to the port under test $swp3, which is also 1Gbps. +# +# This way, $swp3 should be 100% filled with traffic without any of it spilling +# to the backlog. Any extra packets sent should almost 1:1 go to backlog. That +# is what H2 is used for--it sends the extra traffic to create backlog. +# +# A RED Qdisc is installed on $swp3. The configuration is such that the minimum +# and maximum size are 1 byte apart, so there is a very clear border under which +# no marking or dropping takes place, and above which everything is marked or +# dropped. +# +# The test uses the buffer build-up behavior to test the installed RED. +# +# In order to test WRED, $swp3 actually contains RED under PRIO, with two +# different configurations. Traffic is prioritized using 802.1p and relies on +# the implicit mlxsw configuration, where packet priority is taken 1:1 from the +# 802.1p marking. +# +# +--------------------------+ +--------------------------+ +# | H1 | | H2 | +# | + $h1.10 | | + $h2.10 | +# | | 192.0.2.1/28 | | | 192.0.2.2/28 | +# | | | | | | +# | | $h1.11 + | | | $h2.11 + | +# | | 192.0.2.17/28 | | | | 192.0.2.18/28 | | +# | | | | | | | | +# | \______ ______/ | | \______ ______/ | +# | \ / | | \ / | +# | + $h1 | | + $h2 | +# +-------------|------------+ +-------------|------------+ +# | >1Gbps | +# +-------------|------------------------------------------------|------------+ +# | SW + $swp1 + $swp2 | +# | _______/ \___________ ___________/ \_______ | +# | / \ / \ | +# | +-|-----------------+ | +-|-----------------+ | | +# | | + $swp1.10 | | | + $swp2.10 | | | +# | | | | .-------------+ $swp5.10 | | | +# | | BR1_10 | | | | | | | +# | | | | | | BR2_10 | | | +# | | + $swp2.10 | | | | | | | +# | +-|-----------------+ | | | + $swp3.10 | | | +# | | | | +-|-----------------+ | | +# | | +-----------------|-+ | | +-----------------|-+ | +# | | | $swp1.11 + | | | | $swp2.11 + | | +# | | | | | .-----------------+ $swp5.11 | | +# | | | BR1_11 | | | | | | | +# | | | | | | | | BR2_11 | | +# | | | $swp2.11 + | | | | | | | +# | | +-----------------|-+ | | | | $swp3.11 + | | +# | | | | | | +-----------------|-+ | +# | \_______ ___________/ | | \___________ _______/ | +# | \ / \ / \ / | +# | + $swp4 + $swp5 + $swp3 | +# +-------------|----------------------|-------------------------|------------+ +# | | | 1Gbps +# \________1Gbps_________/ | +# +----------------------------|------------+ +# | H3 + $h3 | +# | _____________________/ \_______ | +# | / \ | +# | | | | +# | + $h3.10 $h3.11 + | +# | 192.0.2.3/28 192.0.2.19/28 | +# +-----------------------------------------+ + +NUM_NETIFS=8 +CHECK_TC="yes" +lib_dir=$(dirname $0)/../../../net/forwarding +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh +source qos_lib.sh + +ipaddr() +{ + local host=$1; shift + local vlan=$1; shift + + echo 192.0.2.$((16 * (vlan - 
10) + host)) +} + +host_create() +{ + local dev=$1; shift + local host=$1; shift + + simple_if_init $dev + mtu_set $dev 10000 + + vlan_create $dev 10 v$dev $(ipaddr $host 10)/28 + ip link set dev $dev.10 type vlan egress 0:0 + + vlan_create $dev 11 v$dev $(ipaddr $host 11)/28 + ip link set dev $dev.11 type vlan egress 0:1 +} + +host_destroy() +{ + local dev=$1; shift + + vlan_destroy $dev 11 + vlan_destroy $dev 10 + mtu_restore $dev + simple_if_fini $dev +} + +h1_create() +{ + host_create $h1 1 +} + +h1_destroy() +{ + host_destroy $h1 +} + +h2_create() +{ + host_create $h2 2 + + # Some of the tests in this suite use multicast traffic. As this traffic + # enters BR2_10 resp. BR2_11, it is flooded to all other ports. Thus + # e.g. traffic ingressing through $swp2 is flooded to $swp3 (the + # intended destination) and $swp5 (which is intended as ingress for + # another stream of traffic). + # + # This is generally not a problem, but if the $swp5 throughput is lower + # than $swp2 throughput, there will be a build-up at $swp5. That may + # cause packets to fail to queue up at $swp3 due to shared buffer + # quotas, and the test to spuriously fail. + # + # Prevent this by setting the speed of $h2 to 1Gbps. + + ethtool -s $h2 speed 1000 autoneg off +} + +h2_destroy() +{ + ethtool -s $h2 autoneg on + host_destroy $h2 +} + +h3_create() +{ + host_create $h3 3 + ethtool -s $h3 speed 1000 autoneg off +} + +h3_destroy() +{ + ethtool -s $h3 autoneg on + host_destroy $h3 +} + +switch_create() +{ + local intf + local vlan + + ip link add dev br1_10 type bridge + ip link add dev br1_11 type bridge + + ip link add dev br2_10 type bridge + ip link add dev br2_11 type bridge + + for intf in $swp1 $swp2 $swp3 $swp4 $swp5; do + ip link set dev $intf up + mtu_set $intf 10000 + done + + for intf in $swp1 $swp4; do + for vlan in 10 11; do + vlan_create $intf $vlan + ip link set dev $intf.$vlan master br1_$vlan + ip link set dev $intf.$vlan up + done + done + + for intf in $swp2 $swp3 $swp5; do + for vlan in 10 11; do + vlan_create $intf $vlan + ip link set dev $intf.$vlan master br2_$vlan + ip link set dev $intf.$vlan up + done + done + + ip link set dev $swp4.10 type vlan egress 0:0 + ip link set dev $swp4.11 type vlan egress 0:1 + for intf in $swp1 $swp2 $swp5; do + for vlan in 10 11; do + ip link set dev $intf.$vlan type vlan ingress 0:0 1:1 + done + done + + for intf in $swp2 $swp3 $swp4 $swp5; do + ethtool -s $intf speed 1000 autoneg off + done + + ip link set dev br1_10 up + ip link set dev br1_11 up + ip link set dev br2_10 up + ip link set dev br2_11 up + + local size=$(devlink_pool_size_thtype 0 | cut -d' ' -f 1) + devlink_port_pool_th_set $swp3 8 $size +} + +switch_destroy() +{ + local intf + local vlan + + devlink_port_pool_th_restore $swp3 8 + + tc qdisc del dev $swp3 root 2>/dev/null + + ip link set dev br2_11 down + ip link set dev br2_10 down + ip link set dev br1_11 down + ip link set dev br1_10 down + + for intf in $swp5 $swp4 $swp3 $swp2; do + ethtool -s $intf autoneg on + done + + for intf in $swp5 $swp3 $swp2 $swp4 $swp1; do + for vlan in 11 10; do + ip link set dev $intf.$vlan down + ip link set dev $intf.$vlan nomaster + vlan_destroy $intf $vlan + done + + mtu_restore $intf + ip link set dev $intf down + done + + ip link del dev br2_11 + ip link del dev br2_10 + ip link del dev br1_11 + ip link del dev br1_10 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + swp4=${NETIFS[p7]} + 
swp5=${NETIFS[p8]} + + h3_mac=$(mac_get $h3) + + vrf_prepare + + h1_create + h2_create + h3_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h3_destroy + h2_destroy + h1_destroy + + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1.10 $(ipaddr 3 10) " from host 1, vlan 10" + ping_test $h1.11 $(ipaddr 3 11) " from host 1, vlan 11" + ping_test $h2.10 $(ipaddr 3 10) " from host 2, vlan 10" + ping_test $h2.11 $(ipaddr 3 11) " from host 2, vlan 11" +} + +get_tc() +{ + local vlan=$1; shift + + echo $((vlan - 10)) +} + +get_qdisc_handle() +{ + local vlan=$1; shift + + local tc=$(get_tc $vlan) + local band=$((8 - tc)) + + # Handle is 107: for TC1, 108: for TC0. + echo "10$band:" +} + +get_qdisc_backlog() +{ + local vlan=$1; shift + + qdisc_stats_get $swp3 $(get_qdisc_handle $vlan) .backlog +} + +get_mc_transmit_queue() +{ + local vlan=$1; shift + + local tc=$(($(get_tc $vlan) + 8)) + ethtool_stats_get $swp3 tc_transmit_queue_tc_$tc +} + +get_nmarked() +{ + local vlan=$1; shift + + ethtool_stats_get $swp3 ecn_marked +} + +get_qdisc_npackets() +{ + local vlan=$1; shift + + busywait_for_counter 1100 +1 \ + qdisc_stats_get $swp3 $(get_qdisc_handle $vlan) .packets +} + +# This sends traffic in an attempt to build a backlog of $size. Returns 0 on +# success. After 10 failed attempts it bails out and returns 1. It dumps the +# backlog size to stdout. +build_backlog() +{ + local vlan=$1; shift + local size=$1; shift + local proto=$1; shift + + local tc=$((vlan - 10)) + local band=$((8 - tc)) + local cur=-1 + local i=0 + + while :; do + local cur=$(busywait 1100 until_counter_is "> $cur" \ + get_qdisc_backlog $vlan) + local diff=$((size - cur)) + local pkts=$(((diff + 7999) / 8000)) + + if ((cur >= size)); then + echo $cur + return 0 + elif ((i++ > 10)); then + echo $cur + return 1 + fi + + $MZ $h2.$vlan -p 8000 -a own -b $h3_mac \ + -A $(ipaddr 2 $vlan) -B $(ipaddr 3 $vlan) \ + -t $proto -q -c $pkts "$@" + done +} + +check_marking() +{ + local vlan=$1; shift + local cond=$1; shift + + local npackets_0=$(get_qdisc_npackets $vlan) + local nmarked_0=$(get_nmarked $vlan) + sleep 5 + local npackets_1=$(get_qdisc_npackets $vlan) + local nmarked_1=$(get_nmarked $vlan) + + local nmarked_d=$((nmarked_1 - nmarked_0)) + local npackets_d=$((npackets_1 - npackets_0)) + local pct=$((100 * nmarked_d / npackets_d)) + + echo $pct + ((pct $cond)) +} + +ecn_test_common() +{ + local name=$1; shift + local vlan=$1; shift + local limit=$1; shift + local backlog + local pct + + # Build the below-the-limit backlog using UDP. We could use TCP just + # fine, but this way we get a proof that UDP is accepted when queue + # length is below the limit. The main stream is using TCP, and if the + # limit is misconfigured, we would see this traffic being ECN marked. + RET=0 + backlog=$(build_backlog $vlan $((2 * limit / 3)) udp) + check_err $? "Could not build the requested backlog" + pct=$(check_marking $vlan "== 0") + check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0." + log_test "TC $((vlan - 10)): $name backlog < limit" + + # Now push TCP, because non-TCP traffic would be early-dropped after the + # backlog crosses the limit, and we want to make sure that the backlog + # is above the limit. + RET=0 + backlog=$(build_backlog $vlan $((3 * limit / 2)) tcp tos=0x01) + check_err $? "Could not build the requested backlog" + pct=$(check_marking $vlan ">= 95") + check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected >= 95." 
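
A short note on the condition strings ("== 0", ">= 95") passed to check_marking above: inside (( )), parameter expansion happens before the arithmetic is evaluated, so the comparison operator can be supplied as data. A standalone sketch of the same pattern, with a hypothetical helper name and illustrative values:

value_matches()
{
	local value=$1; shift
	local cond=$1; shift	# e.g. ">= 95" or "== 0"

	# $cond expands first, so this evaluates e.g. ((value >= 95)).
	((value $cond))
}

# value_matches 97 ">= 95" && echo "within expectation"
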
+ log_test "TC $((vlan - 10)): $name backlog > limit" +} + +do_ecn_test() +{ + local vlan=$1; shift + local limit=$1; shift + local name=ECN + + start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \ + $h3_mac tos=0x01 + sleep 1 + + ecn_test_common "$name" $vlan $limit + + # Up there we saw that UDP gets accepted when backlog is below the + # limit. Now that it is above, it should all get dropped, and backlog + # building should fail. + RET=0 + build_backlog $vlan $((2 * limit)) udp >/dev/null + check_fail $? "UDP traffic went into backlog instead of being early-dropped" + log_test "TC $((vlan - 10)): $name backlog > limit: UDP early-dropped" + + stop_traffic + sleep 1 +} + +do_ecn_nodrop_test() +{ + local vlan=$1; shift + local limit=$1; shift + local name="ECN nodrop" + + start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \ + $h3_mac tos=0x01 + sleep 1 + + ecn_test_common "$name" $vlan $limit + + # Up there we saw that UDP gets accepted when backlog is below the + # limit. Now that it is above, in nodrop mode, make sure it goes to + # backlog as well. + RET=0 + build_backlog $vlan $((2 * limit)) udp >/dev/null + check_err $? "UDP traffic was early-dropped instead of getting into backlog" + log_test "TC $((vlan - 10)): $name backlog > limit: UDP not dropped" + + stop_traffic + sleep 1 +} + +do_red_test() +{ + local vlan=$1; shift + local limit=$1; shift + local backlog + local pct + + # Use ECN-capable TCP to verify there's no marking even though the queue + # is above limit. + start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \ + $h3_mac tos=0x01 + + # Pushing below the queue limit should work. + RET=0 + backlog=$(build_backlog $vlan $((2 * limit / 3)) tcp tos=0x01) + check_err $? "Could not build the requested backlog" + pct=$(check_marking $vlan "== 0") + check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0." + log_test "TC $((vlan - 10)): RED backlog < limit" + + # Pushing above should not. + RET=0 + backlog=$(build_backlog $vlan $((3 * limit / 2)) tcp tos=0x01) + check_fail $? "Traffic went into backlog instead of being early-dropped" + pct=$(check_marking $vlan "== 0") + check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0." + local diff=$((limit - backlog)) + pct=$((100 * diff / limit)) + ((0 <= pct && pct <= 5)) + check_err $? "backlog $backlog / $limit expected <= 5% distance" + log_test "TC $((vlan - 10)): RED backlog > limit" + + stop_traffic + sleep 1 +} + +do_mc_backlog_test() +{ + local vlan=$1; shift + local limit=$1; shift + local backlog + local pct + + RET=0 + + start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) bc + start_tcp_traffic $h2.$vlan $(ipaddr 2 $vlan) $(ipaddr 3 $vlan) bc + + qbl=$(busywait 5000 until_counter_is ">= 500000" \ + get_qdisc_backlog $vlan) + check_err $? "Could not build MC backlog" + + # Verify that we actually see the backlog on BUM TC. Do a busywait as + # well, performance blips might cause false fail. + local ebl + ebl=$(busywait 5000 until_counter_is ">= 500000" \ + get_mc_transmit_queue $vlan) + check_err $? 
"MC backlog reported by qdisc not visible in ethtool" + + stop_traffic + stop_traffic + + log_test "TC $((vlan - 10)): Qdisc reports MC backlog" +} diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh new file mode 100755 index 000000000000..1c36c576613b --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + ping_ipv4 + ecn_test + ecn_nodrop_test + red_test + mc_backlog_test +" +: ${QDISC:=ets} +source sch_red_core.sh + +# do_ecn_test first build 2/3 of the requested backlog and expects no marking, +# and then builds 3/2 of it and does expect marking. The values of $BACKLOG1 and +# $BACKLOG2 are far enough not to overlap, so that we can assume that if we do +# see (do not see) marking, it is actually due to the configuration of that one +# TC, and not due to configuration of the other TC leaking over. +BACKLOG1=200000 +BACKLOG2=500000 + +install_qdisc() +{ + local -a args=("$@") + + tc qdisc add dev $swp3 root handle 10: $QDISC \ + bands 8 priomap 7 6 5 4 3 2 1 0 + tc qdisc add dev $swp3 parent 10:8 handle 108: red \ + limit 1000000 min $BACKLOG1 max $((BACKLOG1 + 1)) \ + probability 1.0 avpkt 8000 burst 38 "${args[@]}" + tc qdisc add dev $swp3 parent 10:7 handle 107: red \ + limit 1000000 min $BACKLOG2 max $((BACKLOG2 + 1)) \ + probability 1.0 avpkt 8000 burst 63 "${args[@]}" + sleep 1 +} + +uninstall_qdisc() +{ + tc qdisc del dev $swp3 parent 10:7 + tc qdisc del dev $swp3 parent 10:8 + tc qdisc del dev $swp3 root +} + +ecn_test() +{ + install_qdisc ecn + + do_ecn_test 10 $BACKLOG1 + do_ecn_test 11 $BACKLOG2 + + uninstall_qdisc +} + +ecn_nodrop_test() +{ + install_qdisc ecn nodrop + + do_ecn_nodrop_test 10 $BACKLOG1 + do_ecn_nodrop_test 11 $BACKLOG2 + + uninstall_qdisc +} + +red_test() +{ + install_qdisc + + do_red_test 10 $BACKLOG1 + do_red_test 11 $BACKLOG2 + + uninstall_qdisc +} + +mc_backlog_test() +{ + install_qdisc + + # Note that the backlog numbers here do not correspond to RED + # configuration, but are arbitrary. 
+ do_mc_backlog_test 10 $BACKLOG1 + do_mc_backlog_test 11 $BACKLOG2 + + uninstall_qdisc +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +bail_on_lldpad +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_prio.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_prio.sh new file mode 100755 index 000000000000..76820a0e9a1b --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_prio.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +QDISC=prio +source sch_red_ets.sh diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh new file mode 100755 index 000000000000..558667ea11ec --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + ping_ipv4 + ecn_test + ecn_nodrop_test + red_test + mc_backlog_test +" +source sch_red_core.sh + +BACKLOG=300000 + +install_qdisc() +{ + local -a args=("$@") + + tc qdisc add dev $swp3 root handle 108: red \ + limit 1000000 min $BACKLOG max $((BACKLOG + 1)) \ + probability 1.0 avpkt 8000 burst 38 "${args[@]}" + sleep 1 +} + +uninstall_qdisc() +{ + tc qdisc del dev $swp3 root +} + +ecn_test() +{ + install_qdisc ecn + do_ecn_test 10 $BACKLOG + uninstall_qdisc +} + +ecn_nodrop_test() +{ + install_qdisc ecn nodrop + do_ecn_nodrop_test 10 $BACKLOG + uninstall_qdisc +} + +red_test() +{ + install_qdisc + do_red_test 10 $BACKLOG + uninstall_qdisc +} + +mc_backlog_test() +{ + install_qdisc + # Note that the backlog value here does not correspond to RED + # configuration, but is arbitrary. + do_mc_backlog_test 10 $BACKLOG + uninstall_qdisc +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +bail_on_lldpad +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh new file mode 100755 index 000000000000..58f3a05f08af --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + port_pool_test + port_tc_ip_test + port_tc_arp_test +" + +NUM_NETIFS=2 +source ../../../net/forwarding/lib.sh +source ../../../net/forwarding/devlink_lib.sh +source mlxsw_lib.sh + +SB_POOL_ING=0 +SB_POOL_EGR_CPU=10 + +SB_ITC_CPU_IP=3 +SB_ITC_CPU_ARP=2 +SB_ITC=0 + +h1_create() +{ + simple_if_init $h1 192.0.1.1/24 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.1.1/24 +} + +h2_create() +{ + simple_if_init $h2 192.0.1.2/24 +} + +h2_destroy() +{ + simple_if_fini $h2 192.0.1.2/24 +} + +sb_occ_pool_check() +{ + local dl_port=$1; shift + local pool=$1; shift + local exp_max_occ=$1 + local max_occ + local err=0 + + max_occ=$(devlink sb -j occupancy show $dl_port \ + | jq -e ".[][][\"pool\"][\"$pool\"][\"max\"]") + + if [[ "$max_occ" -ne "$exp_max_occ" ]]; then + err=1 + fi + + echo $max_occ + return $err +} + +sb_occ_itc_check() +{ + local dl_port=$1; shift + local itc=$1; shift + local exp_max_occ=$1 + local max_occ + local err=0 + + max_occ=$(devlink sb -j occupancy show $dl_port \ + | jq -e ".[][][\"itc\"][\"$itc\"][\"max\"]") + + if [[ "$max_occ" -ne "$exp_max_occ" ]]; then + err=1 + fi + + echo $max_occ + return $err +} + +sb_occ_etc_check() +{ + local dl_port=$1; shift + local etc=$1; shift + local exp_max_occ=$1; shift + local max_occ + local err=0 + + max_occ=$(devlink sb -j occupancy show $dl_port \ + | jq -e 
".[][][\"etc\"][\"$etc\"][\"max\"]") + + if [[ "$max_occ" -ne "$exp_max_occ" ]]; then + err=1 + fi + + echo $max_occ + return $err +} + +port_pool_test() +{ + local exp_max_occ=288 + local max_occ + + devlink sb occupancy clearmax $DEVLINK_DEV + + $MZ $h1 -c 1 -p 160 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \ + -t ip -q + + devlink sb occupancy snapshot $DEVLINK_DEV + + RET=0 + max_occ=$(sb_occ_pool_check $dl_port1 $SB_POOL_ING $exp_max_occ) + check_err $? "Expected iPool($SB_POOL_ING) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h1) ingress pool" + + RET=0 + max_occ=$(sb_occ_pool_check $dl_port2 $SB_POOL_ING $exp_max_occ) + check_err $? "Expected iPool($SB_POOL_ING) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h2) ingress pool" + + RET=0 + max_occ=$(sb_occ_pool_check $cpu_dl_port $SB_POOL_EGR_CPU $exp_max_occ) + check_err $? "Expected ePool($SB_POOL_EGR_CPU) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "CPU port's egress pool" +} + +port_tc_ip_test() +{ + local exp_max_occ=288 + local max_occ + + devlink sb occupancy clearmax $DEVLINK_DEV + + $MZ $h1 -c 1 -p 160 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \ + -t ip -q + + devlink sb occupancy snapshot $DEVLINK_DEV + + RET=0 + max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) + check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h1) ingress TC - IP packet" + + RET=0 + max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) + check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h2) ingress TC - IP packet" + + RET=0 + max_occ=$(sb_occ_etc_check $cpu_dl_port $SB_ITC_CPU_IP $exp_max_occ) + check_err $? "Expected egress TC($SB_ITC_CPU_IP) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "CPU port's egress TC - IP packet" +} + +port_tc_arp_test() +{ + local exp_max_occ=96 + local max_occ + + if [[ $MLXSW_CHIP != "mlxsw_spectrum" ]]; then + exp_max_occ=144 + fi + + devlink sb occupancy clearmax $DEVLINK_DEV + + $MZ $h1 -c 1 -p 160 -a $h1mac -A 192.0.1.1 -t arp -q + + devlink sb occupancy snapshot $DEVLINK_DEV + + RET=0 + max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) + check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h1) ingress TC - ARP packet" + + RET=0 + max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) + check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h2) ingress TC - ARP packet" + + RET=0 + max_occ=$(sb_occ_etc_check $cpu_dl_port $SB_ITC_CPU_ARP $exp_max_occ) + check_err $? 
"Expected egress TC($SB_ITC_IP2ME) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "CPU port's egress TC - ARP packet" +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + h1mac=$(mac_get $h1) + h2mac=$(mac_get $h2) + + dl_port1=$(devlink_port_by_netdev $h1) + dl_port2=$(devlink_port_by_netdev $h2) + + cpu_dl_port=$(devlink_cpu_port_get) + + vrf_prepare + + h1_create + h2_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy + + vrf_cleanup +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer_configuration.py b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer_configuration.py new file mode 100755 index 000000000000..0d4b9327c9b3 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer_configuration.py @@ -0,0 +1,416 @@ +#!/usr/bin/python +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import json as j +import random + + +class SkipTest(Exception): + pass + + +class RandomValuePicker: + """ + Class for storing shared buffer configuration. Can handle 3 different + objects, pool, tcbind and portpool. Provide an interface to get random + values for a specific object type as the follow: + 1. Pool: + - random size + + 2. TcBind: + - random pool number + - random threshold + + 3. PortPool: + - random threshold + """ + def __init__(self, pools): + self._pools = [] + for pool in pools: + self._pools.append(pool) + + def _cell_size(self): + return self._pools[0]["cell_size"] + + def _get_static_size(self, th): + # For threshold of 16, this works out to be about 12MB on Spectrum-1, + # and about 17MB on Spectrum-2. + return th * 8000 * self._cell_size() + + def _get_size(self): + return self._get_static_size(16) + + def _get_thtype(self): + return "static" + + def _get_th(self, pool): + # Threshold value could be any integer between 3 to 16 + th = random.randint(3, 16) + if pool["thtype"] == "dynamic": + return th + else: + return self._get_static_size(th) + + def _get_pool(self, direction): + ing_pools = [] + egr_pools = [] + for pool in self._pools: + if pool["type"] == "ingress": + ing_pools.append(pool) + else: + egr_pools.append(pool) + if direction == "ingress": + arr = ing_pools + else: + arr = egr_pools + return arr[random.randint(0, len(arr) - 1)] + + def get_value(self, objid): + if isinstance(objid, Pool): + if objid["pool"] in [4, 8, 9, 10]: + # The threshold type of pools 4, 8, 9 and 10 cannot be changed + raise SkipTest() + else: + return (self._get_size(), self._get_thtype()) + if isinstance(objid, TcBind): + if objid["tc"] >= 8: + # Multicast TCs cannot be changed + raise SkipTest() + else: + pool = self._get_pool(objid["type"]) + th = self._get_th(pool) + pool_n = pool["pool"] + return (pool_n, th) + if isinstance(objid, PortPool): + pool_n = objid["pool"] + pool = self._pools[pool_n] + assert pool["pool"] == pool_n + th = self._get_th(pool) + return (th,) + + +class RecordValuePickerException(Exception): + pass + + +class RecordValuePicker: + """ + Class for storing shared buffer configuration. Can handle 2 different + objects, pool and tcbind. Provide an interface to get the stored values per + object type. 
+ """ + def __init__(self, objlist): + self._recs = [] + for item in objlist: + self._recs.append({"objid": item, "value": item.var_tuple()}) + + def get_value(self, objid): + if isinstance(objid, Pool) and objid["pool"] in [4, 8, 9, 10]: + # The threshold type of pools 4, 8, 9 and 10 cannot be changed + raise SkipTest() + if isinstance(objid, TcBind) and objid["tc"] >= 8: + # Multicast TCs cannot be changed + raise SkipTest() + for rec in self._recs: + if rec["objid"].weak_eq(objid): + return rec["value"] + raise RecordValuePickerException() + + +def run_cmd(cmd, json=False): + out = subprocess.check_output(cmd, shell=True) + if json: + return j.loads(out) + return out + + +def run_json_cmd(cmd): + return run_cmd(cmd, json=True) + + +def log_test(test_name, err_msg=None): + if err_msg: + print("\t%s" % err_msg) + print("TEST: %-80s [FAIL]" % test_name) + else: + print("TEST: %-80s [ OK ]" % test_name) + + +class CommonItem(dict): + varitems = [] + + def var_tuple(self): + ret = [] + self.varitems.sort() + for key in self.varitems: + ret.append(self[key]) + return tuple(ret) + + def weak_eq(self, other): + for key in self: + if key in self.varitems: + continue + if self[key] != other[key]: + return False + return True + + +class CommonList(list): + def get_by(self, by_obj): + for item in self: + if item.weak_eq(by_obj): + return item + return None + + def del_by(self, by_obj): + for item in self: + if item.weak_eq(by_obj): + self.remove(item) + + +class Pool(CommonItem): + varitems = ["size", "thtype"] + + def dl_set(self, dlname, size, thtype): + run_cmd("devlink sb pool set {} sb {} pool {} size {} thtype {}".format(dlname, self["sb"], + self["pool"], + size, thtype)) + + +class PoolList(CommonList): + pass + + +def get_pools(dlname, direction=None): + d = run_json_cmd("devlink sb pool show -j") + pools = PoolList() + for pooldict in d["pool"][dlname]: + if not direction or direction == pooldict["type"]: + pools.append(Pool(pooldict)) + return pools + + +def do_check_pools(dlname, pools, vp): + for pool in pools: + pre_pools = get_pools(dlname) + try: + (size, thtype) = vp.get_value(pool) + except SkipTest: + continue + pool.dl_set(dlname, size, thtype) + post_pools = get_pools(dlname) + pool = post_pools.get_by(pool) + + err_msg = None + if pool["size"] != size: + err_msg = "Incorrect pool size (got {}, expected {})".format(pool["size"], size) + if pool["thtype"] != thtype: + err_msg = "Incorrect pool threshold type (got {}, expected {})".format(pool["thtype"], thtype) + + pre_pools.del_by(pool) + post_pools.del_by(pool) + if pre_pools != post_pools: + err_msg = "Other pool setup changed as well" + log_test("pool {} of sb {} set verification".format(pool["pool"], + pool["sb"]), err_msg) + + +def check_pools(dlname, pools): + # Save defaults + record_vp = RecordValuePicker(pools) + + # For each pool, set random size and static threshold type + do_check_pools(dlname, pools, RandomValuePicker(pools)) + + # Restore defaults + do_check_pools(dlname, pools, record_vp) + + +class TcBind(CommonItem): + varitems = ["pool", "threshold"] + + def __init__(self, port, d): + super(TcBind, self).__init__(d) + self["dlportname"] = port.name + + def dl_set(self, pool, th): + run_cmd("devlink sb tc bind set {} sb {} tc {} type {} pool {} th {}".format(self["dlportname"], + self["sb"], + self["tc"], + self["type"], + pool, th)) + + +class TcBindList(CommonList): + pass + + +def get_tcbinds(ports, verify_existence=False): + d = run_json_cmd("devlink sb tc bind show -j -n") + tcbinds = TcBindList() + for 
port in ports: + err_msg = None + if port.name not in d["tc_bind"] or len(d["tc_bind"][port.name]) == 0: + err_msg = "No tc bind for port" + else: + for tcbinddict in d["tc_bind"][port.name]: + tcbinds.append(TcBind(port, tcbinddict)) + if verify_existence: + log_test("tc bind existence for port {} verification".format(port.name), err_msg) + return tcbinds + + +def do_check_tcbind(ports, tcbinds, vp): + for tcbind in tcbinds: + pre_tcbinds = get_tcbinds(ports) + try: + (pool, th) = vp.get_value(tcbind) + except SkipTest: + continue + tcbind.dl_set(pool, th) + post_tcbinds = get_tcbinds(ports) + tcbind = post_tcbinds.get_by(tcbind) + + err_msg = None + if tcbind["pool"] != pool: + err_msg = "Incorrect pool (got {}, expected {})".format(tcbind["pool"], pool) + if tcbind["threshold"] != th: + err_msg = "Incorrect threshold (got {}, expected {})".format(tcbind["threshold"], th) + + pre_tcbinds.del_by(tcbind) + post_tcbinds.del_by(tcbind) + if pre_tcbinds != post_tcbinds: + err_msg = "Other tc bind setup changed as well" + log_test("tc bind {}-{} of sb {} set verification".format(tcbind["dlportname"], + tcbind["tc"], + tcbind["sb"]), err_msg) + + +def check_tcbind(dlname, ports, pools): + tcbinds = get_tcbinds(ports, verify_existence=True) + + # Save defaults + record_vp = RecordValuePicker(tcbinds) + + # Bind each port and unicast TC (TCs < 8) to a random pool and a random + # threshold + do_check_tcbind(ports, tcbinds, RandomValuePicker(pools)) + + # Restore defaults + do_check_tcbind(ports, tcbinds, record_vp) + + +class PortPool(CommonItem): + varitems = ["threshold"] + + def __init__(self, port, d): + super(PortPool, self).__init__(d) + self["dlportname"] = port.name + + def dl_set(self, th): + run_cmd("devlink sb port pool set {} sb {} pool {} th {}".format(self["dlportname"], + self["sb"], + self["pool"], th)) + + +class PortPoolList(CommonList): + pass + + +def get_portpools(ports, verify_existence=False): + d = run_json_cmd("devlink sb port pool -j -n") + portpools = PortPoolList() + for port in ports: + err_msg = None + if port.name not in d["port_pool"] or len(d["port_pool"][port.name]) == 0: + err_msg = "No port pool for port" + else: + for portpooldict in d["port_pool"][port.name]: + portpools.append(PortPool(port, portpooldict)) + if verify_existence: + log_test("port pool existence for port {} verification".format(port.name), err_msg) + return portpools + + +def do_check_portpool(ports, portpools, vp): + for portpool in portpools: + pre_portpools = get_portpools(ports) + (th,) = vp.get_value(portpool) + portpool.dl_set(th) + post_portpools = get_portpools(ports) + portpool = post_portpools.get_by(portpool) + + err_msg = None + if portpool["threshold"] != th: + err_msg = "Incorrect threshold (got {}, expected {})".format(portpool["threshold"], th) + + pre_portpools.del_by(portpool) + post_portpools.del_by(portpool) + if pre_portpools != post_portpools: + err_msg = "Other port pool setup changed as well" + log_test("port pool {}-{} of sb {} set verification".format(portpool["dlportname"], + portpool["pool"], + portpool["sb"]), err_msg) + + +def check_portpool(dlname, ports, pools): + portpools = get_portpools(ports, verify_existence=True) + + # Save defaults + record_vp = RecordValuePicker(portpools) + + # For each port pool, set a random threshold + do_check_portpool(ports, portpools, RandomValuePicker(pools)) + + # Restore defaults + do_check_portpool(ports, portpools, record_vp) + + +class Port: + def __init__(self, name): + self.name = name + + +class PortList(list): + pass + + 
+def get_ports(dlname): + d = run_json_cmd("devlink port show -j") + ports = PortList() + for name in d["port"]: + if name.find(dlname) == 0 and d["port"][name]["flavour"] == "physical": + ports.append(Port(name)) + return ports + + +def get_device(): + devices_info = run_json_cmd("devlink -j dev info")["info"] + for d in devices_info: + if "mlxsw_spectrum" in devices_info[d]["driver"]: + return d + return None + + +class UnavailableDevlinkNameException(Exception): + pass + + +def test_sb_configuration(): + # Use static seed + random.seed(0) + + dlname = get_device() + if not dlname: + raise UnavailableDevlinkNameException() + + ports = get_ports(dlname) + pools = get_pools(dlname) + + check_pools(dlname, pools) + check_tcbind(dlname, ports, pools) + check_portpool(dlname, ports, pools) + + +test_sb_configuration() diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index 7b2acba82a49..fd583a171db7 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -8,8 +8,9 @@ source $lib_dir/lib.sh source $lib_dir/tc_common.sh source $lib_dir/devlink_lib.sh -if [ "$DEVLINK_VIDDID" != "15b3:cf6c" ]; then - echo "SKIP: test is tailored for Mellanox Spectrum-2" +if [[ "$DEVLINK_VIDDID" != "15b3:cf6c" && \ + "$DEVLINK_VIDDID" != "15b3:cf70" ]]; then + echo "SKIP: test is tailored for Mellanox Spectrum-2 and Spectrum-3" exit 1 fi diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh index a0795227216e..efd798a85931 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh @@ -8,9 +8,9 @@ tc_flower_get_target() # The driver associates a counter with each tc filter, which means the # number of supported filters is bounded by the number of available # counters. - # Currently, the driver supports 12K (12,288) flow counters and six of + # Currently, the driver supports 30K (30,720) flow counters and six of # these are used for multicast routing. - local target=12282 + local target=30714 if ((! 
should_fail)); then echo $target diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh new file mode 100755 index 000000000000..20ed98fe5a60 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + default_hw_stats_test + immediate_hw_stats_test + delayed_hw_stats_test + disabled_hw_stats_test +" +NUM_NETIFS=2 + +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/24 +} + +switch_create() +{ + simple_if_init $swp1 192.0.2.2/24 + tc qdisc add dev $swp1 clsact +} + +switch_destroy() +{ + tc qdisc del dev $swp1 clsact + simple_if_fini $swp1 192.0.2.2/24 +} + +hw_stats_test() +{ + RET=0 + + local name=$1 + local action_hw_stats=$2 + local occ_delta=$3 + local expected_packet_count=$4 + + local orig_occ=$(devlink_resource_get "counters" "flow" | jq '.["occ"]') + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop $action_hw_stats + check_err $? "Failed to add rule with $name hw_stats" + + local new_occ=$(devlink_resource_get "counters" "flow" | jq '.["occ"]') + local expected_occ=$((orig_occ + occ_delta)) + [ "$new_occ" == "$expected_occ" ] + check_err $? "Expected occupancy of $expected_occ, got $new_occ" + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $swp1mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -q + + tc_check_packets "dev $swp1 ingress" 101 $expected_packet_count + check_err $? "Did not match incoming packet" + + tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + + log_test "$name hw_stats" +} + +default_hw_stats_test() +{ + hw_stats_test "default" "" 2 1 +} + +immediate_hw_stats_test() +{ + hw_stats_test "immediate" "hw_stats immediate" 2 1 +} + +delayed_hw_stats_test() +{ + RET=0 + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop hw_stats delayed + check_fail $? 
"Unexpected success in adding rule with delayed hw_stats" + + log_test "delayed hw_stats" +} + +disabled_hw_stats_test() +{ + hw_stats_test "disabled" "hw_stats disabled" 0 0 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + h1mac=$(mac_get $h1) + swp1mac=$(mac_get $swp1) + + vrf_prepare + + h1_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h1_destroy + + vrf_cleanup +} + +check_tc_action_hw_stats_support + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_restrictions.sh new file mode 100755 index 000000000000..68c80d0ec1ec --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_restrictions.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + shared_block_drop_test + egress_redirect_test + multi_mirror_test +" +NUM_NETIFS=2 + +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh + +switch_create() +{ + simple_if_init $swp1 192.0.2.1/24 + simple_if_init $swp2 192.0.2.2/24 +} + +switch_destroy() +{ + simple_if_fini $swp2 192.0.2.2/24 + simple_if_fini $swp1 192.0.2.1/24 +} + +shared_block_drop_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have mixed-bound + # shared block with a drop rule. + + tc qdisc add dev $swp1 ingress_block 22 clsact + check_err $? "Failed to create clsact with ingress block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? "Failed to add drop rule to ingress bound block" + + tc qdisc add dev $swp2 ingress_block 22 clsact + check_err $? "Failed to create another clsact with ingress shared block" + + tc qdisc del dev $swp2 clsact + + tc qdisc add dev $swp2 egress_block 22 clsact + check_fail $? "Incorrect success to create another clsact with egress shared block" + + tc filter del block 22 protocol ip pref 1 handle 101 flower + + tc qdisc add dev $swp2 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block after blocker drop rule removed" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_fail $? "Incorrect success to add drop rule to mixed bound block" + + tc qdisc del dev $swp1 clsact + + tc qdisc add dev $swp1 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? "Failed to add drop rule to egress bound shared block" + + tc filter del block 22 protocol ip pref 1 handle 101 flower + + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + + log_test "shared block drop" +} + +egress_redirect_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have mirred redirect on + # egress-bound block. + + tc qdisc add dev $swp1 ingress_block 22 clsact + check_err $? "Failed to create clsact with ingress block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_err $? "Failed to add redirect rule to ingress bound block" + + tc qdisc add dev $swp2 ingress_block 22 clsact + check_err $? 
"Failed to create another clsact with ingress shared block" + + tc qdisc del dev $swp2 clsact + + tc qdisc add dev $swp2 egress_block 22 clsact + check_fail $? "Incorrect success to create another clsact with egress shared block" + + tc filter del block 22 protocol ip pref 1 handle 101 flower + + tc qdisc add dev $swp2 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block after blocker redirect rule removed" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_fail $? "Incorrect success to add redirect rule to mixed bound block" + + tc qdisc del dev $swp1 clsact + + tc qdisc add dev $swp1 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_fail $? "Incorrect success to add redirect rule to egress bound shared block" + + tc qdisc del dev $swp2 clsact + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_fail $? "Incorrect success to add redirect rule to egress bound block" + + tc qdisc del dev $swp1 clsact + + log_test "shared block drop" +} + +multi_mirror_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have multiple mirror + # actions in a single rule. + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress mirror dev $swp2 + check_err $? "Failed to add rule with single mirror action" + + tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress mirror dev $swp2 \ + action mirred egress mirror dev $swp1 + check_fail $? "Incorrect success to add rule with two mirror actions" + + tc qdisc del dev $swp1 clsact + + log_test "multi mirror" +} + +setup_prepare() +{ + swp1=${NETIFS[p1]} + swp2=${NETIFS[p2]} + + vrf_prepare + + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + + vrf_cleanup +} + +check_tc_shblock_support + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh index a6d733d2a4b4..cc0f07e72cf2 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh @@ -2,9 +2,9 @@ # SPDX-License-Identifier: GPL-2.0 # Test for resource limit of offloaded flower rules. The test adds a given -# number of flower matches for different IPv6 addresses, then generates traffic, -# and ensures each was hit exactly once. This file contains functions to set up -# a testing topology and run the test, and is meant to be sourced from a test +# number of flower matches for different IPv6 addresses, then check the offload +# indication for all of the tc flower rules. This file contains functions to set +# up a testing topology and run the test, and is meant to be sourced from a test # script that calls the testing routine with a given number of rules. 
TC_FLOWER_NUM_NETIFS=2 @@ -94,22 +94,15 @@ __tc_flower_test() tc_flower_rules_create $count $should_fail - for ((i = 0; i < count; ++i)); do - $MZ $h1 -q -c 1 -t ip -p 20 -b bc -6 \ - -A 2001:db8:2::1 \ - -B $(tc_flower_addr $i) - done - - MISMATCHES=$( - tc -j -s filter show dev $h2 ingress | - jq -r '[ .[] | select(.kind == "flower") | .options | - values as $rule | .actions[].stats.packets | - select(. != 1) | "\(.) on \($rule.keys.dst_ip)" ] | - join(", ")' - ) - - test -z "$MISMATCHES" - check_err $? "Expected to capture 1 packet for each IP, but got $MISMATCHES" + offload_count=$(tc -j -s filter show dev $h2 ingress | + jq -r '[ .[] | select(.kind == "flower") | + .options | .in_hw ]' | jq .[] | wc -l) + [[ $((offload_count - 1)) -eq $count ]] + if [[ $should_fail -eq 0 ]]; then + check_err $? "Offload mismatch" + else + check_err_fail $should_fail $? "Offload more than expacted" + fi } tc_flower_test() diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh index 4632f51af7ab..729a86cc4ede 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh @@ -9,6 +9,7 @@ lib_dir=$(dirname $0)/../../../net/forwarding ALL_TESTS="sanitization_test offload_indication_test \ sanitization_vlan_aware_test offload_indication_vlan_aware_test" NUM_NETIFS=2 +: ${TIMEOUT:=20000} # ms source $lib_dir/lib.sh setup_prepare() @@ -470,8 +471,8 @@ offload_indication_fdb_flood_test() bridge fdb append 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.2 - bridge fdb show brport vxlan0 | grep 00:00:00:00:00:00 \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb 00:00:00:00:00:00 \ + bridge fdb show brport vxlan0 check_err $? bridge fdb del 00:00:00:00:00:00 dev vxlan0 self @@ -486,11 +487,11 @@ offload_indication_fdb_bridge_test() bridge fdb add de:ad:be:ef:13:37 dev vxlan0 self master static \ dst 198.51.100.2 - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan0 check_err $? - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0 check_err $? log_test "vxlan entry offload indication - initial state" @@ -500,9 +501,9 @@ offload_indication_fdb_bridge_test() RET=0 bridge fdb del de:ad:be:ef:13:37 dev vxlan0 master - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan0 + check_err $? log_test "vxlan entry offload indication - after removal from bridge" @@ -511,11 +512,11 @@ offload_indication_fdb_bridge_test() RET=0 bridge fdb add de:ad:be:ef:13:37 dev vxlan0 master static - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan0 check_err $? - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0 check_err $? 
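# Note (illustration, not part of the patch): the conversions in this file
# replace one-shot "grep -q offload" pipelines with polling through the lib.sh
# busywait helper, so the checks can tolerate offload indications that arrive
# asynchronously. The positive and negative forms of the pattern, with TIMEOUT
# given in milliseconds and example messages of my own:
busywait "$TIMEOUT" wait_for_offload \
	ip route show table local 198.51.100.1
check_err $? "decap route not marked offloaded within $TIMEOUT ms"

busywait "$TIMEOUT" not wait_for_offload \
	ip route show table local 198.51.100.1
check_err $? "decap route still marked offloaded after $TIMEOUT ms"
# wait_for_offload and not are small helpers added to forwarding/lib.sh
# elsewhere in this diff.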
log_test "vxlan entry offload indication - after re-add to bridge" @@ -525,9 +526,9 @@ offload_indication_fdb_bridge_test() RET=0 bridge fdb del de:ad:be:ef:13:37 dev vxlan0 self - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0 + check_err $? log_test "vxlan entry offload indication - after removal from vxlan" @@ -536,11 +537,11 @@ offload_indication_fdb_bridge_test() RET=0 bridge fdb add de:ad:be:ef:13:37 dev vxlan0 self dst 198.51.100.2 - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan0 check_err $? - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0 check_err $? log_test "vxlan entry offload indication - after re-add to vxlan" @@ -558,27 +559,32 @@ offload_indication_decap_route_test() { RET=0 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link set dev vxlan0 down - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link set dev vxlan1 down - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? log_test "vxlan decap route - vxlan device down" RET=0 ip link set dev vxlan1 up - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link set dev vxlan0 up - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? log_test "vxlan decap route - vxlan device up" @@ -586,11 +592,13 @@ offload_indication_decap_route_test() RET=0 ip address delete 198.51.100.1/32 dev lo - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? ip address add 198.51.100.1/32 dev lo - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? log_test "vxlan decap route - add local route" @@ -598,16 +606,19 @@ offload_indication_decap_route_test() RET=0 ip link set dev $swp1 nomaster - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link set dev $swp2 nomaster - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? ip link set dev $swp1 master br0 ip link set dev $swp2 master br1 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? 
log_test "vxlan decap route - local ports enslavement" @@ -615,12 +626,14 @@ offload_indication_decap_route_test() RET=0 ip link del dev br0 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link del dev br1 - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? log_test "vxlan decap route - bridge device deletion" @@ -632,16 +645,19 @@ offload_indication_decap_route_test() ip link set dev $swp2 master br1 ip link set dev vxlan0 master br0 ip link set dev vxlan1 master br1 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link del dev vxlan0 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link del dev vxlan1 - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? log_test "vxlan decap route - vxlan device deletion" @@ -656,12 +672,15 @@ check_fdb_offloaded() local mac=00:11:22:33:44:55 local zmac=00:00:00:00:00:00 - bridge fdb show dev vxlan0 | grep $mac | grep self | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $mac self \ + bridge fdb show dev vxlan0 check_err $? - bridge fdb show dev vxlan0 | grep $mac | grep master | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $mac master \ + bridge fdb show dev vxlan0 check_err $? - bridge fdb show dev vxlan0 | grep $zmac | grep self | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show dev vxlan0 check_err $? } @@ -672,13 +691,15 @@ check_vxlan_fdb_not_offloaded() bridge fdb show dev vxlan0 | grep $mac | grep -q self check_err $? - bridge fdb show dev vxlan0 | grep $mac | grep self | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $mac self \ + bridge fdb show dev vxlan0 + check_err $? bridge fdb show dev vxlan0 | grep $zmac | grep -q self check_err $? - bridge fdb show dev vxlan0 | grep $zmac | grep self | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show dev vxlan0 + check_err $? } check_bridge_fdb_not_offloaded() @@ -688,8 +709,9 @@ check_bridge_fdb_not_offloaded() bridge fdb show dev vxlan0 | grep $mac | grep -q master check_err $? - bridge fdb show dev vxlan0 | grep $mac | grep master | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $mac master \ + bridge fdb show dev vxlan0 + check_err $? } __offload_indication_join_vxlan_first() @@ -771,12 +793,14 @@ __offload_indication_join_vxlan_last() ip link set dev $swp1 master br0 - bridge fdb show dev vxlan0 | grep $zmac | grep self | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show dev vxlan0 + check_err $? ip link set dev vxlan0 master br0 - bridge fdb show dev vxlan0 | grep $zmac | grep self | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show dev vxlan0 check_err $? 
log_test "offload indication - attach vxlan last" @@ -854,20 +878,26 @@ sanitization_vlan_aware_test() bridge vlan del vid 10 dev vxlan20 bridge vlan add vid 20 dev vxlan20 pvid untagged - # Test that offloading of an unsupported tunnel fails when it is - # triggered by addition of VLAN to a local port - RET=0 + # Test that when two VXLAN tunnels with conflicting configurations + # (i.e., different TTL) are enslaved to the same VLAN-aware bridge, + # then the enslavement of a port to the bridge is denied. - # TOS must be set to inherit - ip link set dev vxlan10 type vxlan tos 42 + # Use the offload indication of the local route to ensure the VXLAN + # configuration was correctly rollbacked. + ip address add 198.51.100.1/32 dev lo - ip link set dev $swp1 master br0 - bridge vlan add vid 10 dev $swp1 &> /dev/null + ip link set dev vxlan10 type vxlan ttl 10 + ip link set dev $swp1 master br0 &> /dev/null check_fail $? - log_test "vlan-aware - failed vlan addition to a local port" + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? + + log_test "vlan-aware - failed enslavement to bridge due to conflict" - ip link set dev vxlan10 type vxlan tos inherit + ip link set dev vxlan10 type vxlan ttl 20 + ip address del 198.51.100.1/32 dev lo ip link del dev vxlan20 ip link del dev vxlan10 @@ -924,11 +954,11 @@ offload_indication_vlan_aware_fdb_test() bridge fdb add de:ad:be:ef:13:37 dev vxlan10 self master static \ dst 198.51.100.2 vlan 10 - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan10 check_err $? - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10 check_err $? log_test "vxlan entry offload indication - initial state" @@ -938,9 +968,9 @@ offload_indication_vlan_aware_fdb_test() RET=0 bridge fdb del de:ad:be:ef:13:37 dev vxlan10 master vlan 10 - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan10 + check_err $? log_test "vxlan entry offload indication - after removal from bridge" @@ -949,11 +979,11 @@ offload_indication_vlan_aware_fdb_test() RET=0 bridge fdb add de:ad:be:ef:13:37 dev vxlan10 master static vlan 10 - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan10 check_err $? - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10 check_err $? log_test "vxlan entry offload indication - after re-add to bridge" @@ -963,9 +993,9 @@ offload_indication_vlan_aware_fdb_test() RET=0 bridge fdb del de:ad:be:ef:13:37 dev vxlan10 self - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10 + check_err $? 
log_test "vxlan entry offload indication - after removal from vxlan" @@ -974,11 +1004,11 @@ offload_indication_vlan_aware_fdb_test() RET=0 bridge fdb add de:ad:be:ef:13:37 dev vxlan10 self dst 198.51.100.2 - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan10 check_err $? - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10 check_err $? log_test "vxlan entry offload indication - after re-add to vxlan" @@ -990,28 +1020,31 @@ offload_indication_vlan_aware_decap_route_test() { RET=0 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? # Toggle PVID flag on one VxLAN device and make sure route is still # marked as offloaded bridge vlan add vid 10 dev vxlan10 untagged - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? # Toggle PVID flag on second VxLAN device and make sure route is no # longer marked as offloaded bridge vlan add vid 20 dev vxlan20 untagged - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? # Toggle PVID flag back and make sure route is marked as offloaded bridge vlan add vid 10 dev vxlan10 pvid untagged bridge vlan add vid 20 dev vxlan20 pvid untagged - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload ip route show table local 198.51.100.1 check_err $? log_test "vxlan decap route - vni map/unmap" @@ -1064,35 +1097,33 @@ offload_indication_vlan_aware_l3vni_test() ip link set dev vxlan0 master br0 bridge vlan add dev vxlan0 vid 10 pvid untagged - # No local port or router port is member in the VLAN, so tunnel should - # not be offloaded - bridge fdb show brport vxlan0 | grep $zmac | grep self \ - | grep -q offload - check_fail $? "vxlan tunnel offloaded when should not" + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show brport vxlan0 + check_err $? "vxlan tunnel not offloaded when should" # Configure a VLAN interface and make sure tunnel is offloaded ip link add link br0 name br10 up type vlan id 10 sysctl_set net.ipv6.conf.br10.disable_ipv6 0 ip -6 address add 2001:db8:1::1/64 dev br10 - bridge fdb show brport vxlan0 | grep $zmac | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show brport vxlan0 check_err $? "vxlan tunnel not offloaded when should" # Unlink the VXLAN device, make sure tunnel is no longer offloaded, # then add it back to the bridge and make sure it is offloaded ip link set dev vxlan0 nomaster - bridge fdb show brport vxlan0 | grep $zmac | grep self \ - | grep -q offload - check_fail $? "vxlan tunnel offloaded after unlinked from bridge" + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show brport vxlan0 + check_err $? "vxlan tunnel offloaded after unlinked from bridge" ip link set dev vxlan0 master br0 - bridge fdb show brport vxlan0 | grep $zmac | grep self \ - | grep -q offload - check_fail $? 
"vxlan tunnel offloaded despite no matching vid" + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show brport vxlan0 + check_err $? "vxlan tunnel offloaded despite no matching vid" bridge vlan add dev vxlan0 vid 10 pvid untagged - bridge fdb show brport vxlan0 | grep $zmac | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show brport vxlan0 check_err $? "vxlan tunnel not offloaded after adding vid" log_test "vxlan - l3 vni" diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 025a84c2ab5a..9f9741444549 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -141,6 +141,16 @@ regions_test() check_region_snapshot_count dummy post-first-delete 2 + devlink region new $DL_HANDLE/dummy snapshot 25 + check_err $? "Failed to create a new snapshot with id 25" + + check_region_snapshot_count dummy post-first-request 3 + + devlink region del $DL_HANDLE/dummy snapshot 25 + check_err $? "Failed to delete snapshot with id 25" + + check_region_snapshot_count dummy post-second-delete 2 + log_test "regions test" } @@ -367,6 +377,11 @@ dummy_reporter_test() { RET=0 + check_reporter_info dummy healthy 0 0 0 true + + devlink health set $DL_HANDLE reporter dummy auto_recover false + check_err $? "Failed to dummy reporter auto_recover option" + check_reporter_info dummy healthy 0 0 0 false local BREAK_MSG="foo bar" diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh index f101ab9441e2..dbd1e014ba17 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh @@ -16,6 +16,8 @@ ALL_TESTS=" trap_group_action_test bad_trap_group_test trap_group_stats_test + trap_policer_test + trap_policer_bind_test port_del_test dev_del_test " @@ -23,6 +25,7 @@ NETDEVSIM_PATH=/sys/bus/netdevsim/ DEV_ADDR=1337 DEV=netdevsim${DEV_ADDR} DEVLINK_DEV=netdevsim/${DEV} +DEBUGFS_DIR=/sys/kernel/debug/netdevsim/$DEV/ SLEEP_TIME=1 NETDEV="" NUM_NETIFS=0 @@ -103,6 +106,11 @@ trap_metadata_test() for trap_name in $(devlink_traps_get); do devlink_trap_metadata_test $trap_name "input_port" check_err $? "Input port not reported as metadata of trap $trap_name" + if [ $trap_name == "ingress_flow_action_drop" ] || + [ $trap_name == "egress_flow_action_drop" ]; then + devlink_trap_metadata_test $trap_name "flow_action_cookie" + check_err $? "Flow action cookie not reported as metadata of trap $trap_name" + fi done log_test "Trap metadata" @@ -251,6 +259,119 @@ trap_group_stats_test() log_test "Trap group statistics" } +trap_policer_test() +{ + local packets_t0 + local packets_t1 + + if [ $(devlink_trap_policers_num_get) -eq 0 ]; then + check_err 1 "Failed to dump policers" + fi + + devlink trap policer set $DEVLINK_DEV policer 1337 &> /dev/null + check_fail $? "Did not get an error for setting a non-existing policer" + devlink trap policer show $DEVLINK_DEV policer 1337 &> /dev/null + check_fail $? "Did not get an error for getting a non-existing policer" + + devlink trap policer set $DEVLINK_DEV policer 1 rate 2000 burst 16 + check_err $? 
"Failed to set valid parameters for a valid policer" + if [ $(devlink_trap_policer_rate_get 1) -ne 2000 ]; then + check_err 1 "Policer rate was not changed" + fi + if [ $(devlink_trap_policer_burst_get 1) -ne 16 ]; then + check_err 1 "Policer burst size was not changed" + fi + + devlink trap policer set $DEVLINK_DEV policer 1 rate 0 &> /dev/null + check_fail $? "Policer rate was changed to rate lower than limit" + devlink trap policer set $DEVLINK_DEV policer 1 rate 9000 &> /dev/null + check_fail $? "Policer rate was changed to rate higher than limit" + devlink trap policer set $DEVLINK_DEV policer 1 burst 2 &> /dev/null + check_fail $? "Policer burst size was changed to burst size lower than limit" + devlink trap policer set $DEVLINK_DEV policer 1 rate 65537 &> /dev/null + check_fail $? "Policer burst size was changed to burst size higher than limit" + echo "y" > $DEBUGFS_DIR/fail_trap_policer_set + devlink trap policer set $DEVLINK_DEV policer 1 rate 3000 &> /dev/null + check_fail $? "Managed to set policer rate when should not" + echo "n" > $DEBUGFS_DIR/fail_trap_policer_set + if [ $(devlink_trap_policer_rate_get 1) -ne 2000 ]; then + check_err 1 "Policer rate was changed to an invalid value" + fi + if [ $(devlink_trap_policer_burst_get 1) -ne 16 ]; then + check_err 1 "Policer burst size was changed to an invalid value" + fi + + packets_t0=$(devlink_trap_policer_rx_dropped_get 1) + sleep .5 + packets_t1=$(devlink_trap_policer_rx_dropped_get 1) + if [ ! $packets_t1 -gt $packets_t0 ]; then + check_err 1 "Policer drop counter was not incremented" + fi + + echo "y"> $DEBUGFS_DIR/fail_trap_policer_counter_get + devlink -s trap policer show $DEVLINK_DEV policer 1 &> /dev/null + check_fail $? "Managed to read policer drop counter when should not" + echo "n"> $DEBUGFS_DIR/fail_trap_policer_counter_get + devlink -s trap policer show $DEVLINK_DEV policer 1 &> /dev/null + check_err $? "Did not manage to read policer drop counter when should" + + log_test "Trap policer" +} + +trap_group_check_policer() +{ + local group_name=$1; shift + + devlink -j -p trap group show $DEVLINK_DEV group $group_name \ + | jq -e '.[][][]["policer"]' &> /dev/null +} + +trap_policer_bind_test() +{ + devlink trap group set $DEVLINK_DEV group l2_drops policer 1 + check_err $? "Failed to bind a valid policer" + if [ $(devlink_trap_group_policer_get "l2_drops") -ne 1 ]; then + check_err 1 "Bound policer was not changed" + fi + + devlink trap group set $DEVLINK_DEV group l2_drops policer 1337 \ + &> /dev/null + check_fail $? "Did not get an error for binding a non-existing policer" + if [ $(devlink_trap_group_policer_get "l2_drops") -ne 1 ]; then + check_err 1 "Bound policer was changed when should not" + fi + + devlink trap group set $DEVLINK_DEV group l2_drops policer 0 + check_err $? "Failed to unbind a policer when using ID 0" + trap_group_check_policer "l2_drops" + check_fail $? "Trap group has a policer after unbinding with ID 0" + + devlink trap group set $DEVLINK_DEV group l2_drops policer 1 + check_err $? "Failed to bind a valid policer" + + devlink trap group set $DEVLINK_DEV group l2_drops nopolicer + check_err $? "Failed to unbind a policer when using 'nopolicer' keyword" + trap_group_check_policer "l2_drops" + check_fail $? "Trap group has a policer after unbinding with 'nopolicer' keyword" + + devlink trap group set $DEVLINK_DEV group l2_drops policer 1 + check_err $? 
"Failed to bind a valid policer" + + echo "y"> $DEBUGFS_DIR/fail_trap_group_set + devlink trap group set $DEVLINK_DEV group l2_drops policer 2 \ + &> /dev/null + check_fail $? "Managed to bind a policer when should not" + echo "n"> $DEBUGFS_DIR/fail_trap_group_set + devlink trap group set $DEVLINK_DEV group l2_drops policer 2 + check_err $? "Did not manage to bind a policer when should" + + devlink trap group set $DEVLINK_DEV group l2_drops action drop \ + policer 1337 &> /dev/null + check_fail $? "Did not get an error for partially modified trap group" + + log_test "Trap policer binding" +} + port_del_test() { local group_name diff --git a/tools/testing/selftests/firmware/Makefile b/tools/testing/selftests/firmware/Makefile index 012b2cf69c11..40211cd8f0e6 100644 --- a/tools/testing/selftests/firmware/Makefile +++ b/tools/testing/selftests/firmware/Makefile @@ -1,13 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only # Makefile for firmware loading selftests - -# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" -all: +CFLAGS = -Wall \ + -O2 TEST_PROGS := fw_run_tests.sh TEST_FILES := fw_fallback.sh fw_filesystem.sh fw_lib.sh +TEST_GEN_FILES := fw_namespace include ../lib.mk - -# Nothing to clean up. -clean: diff --git a/tools/testing/selftests/firmware/fw_filesystem.sh b/tools/testing/selftests/firmware/fw_filesystem.sh index 56894477c8bd..fcc281373b4d 100755 --- a/tools/testing/selftests/firmware/fw_filesystem.sh +++ b/tools/testing/selftests/firmware/fw_filesystem.sh @@ -86,6 +86,29 @@ else fi fi +# Try platform (EFI embedded fw) loading too +if [ ! -e "$DIR"/trigger_request_platform ]; then + echo "$0: firmware loading: platform trigger not present, ignoring test" >&2 +else + if printf '\000' >"$DIR"/trigger_request_platform 2> /dev/null; then + echo "$0: empty filename should not succeed (platform)" >&2 + exit 1 + fi + + # Note we echo a non-existing name, since files on the file-system + # are preferred over firmware embedded inside the platform's firmware + # The test adds a fake entry with the requested name to the platform's + # fw list, so the name does not matter as long as it does not exist + if ! echo -n "nope-$NAME" >"$DIR"/trigger_request_platform ; then + echo "$0: could not trigger request platform" >&2 + exit 1 + fi + + # The test verifies itself that the loaded firmware contents matches + # the contents for the fake platform fw entry it added. + echo "$0: platform loading works" +fi + ### Batched requests tests test_config_present() { diff --git a/tools/testing/selftests/firmware/fw_namespace.c b/tools/testing/selftests/firmware/fw_namespace.c new file mode 100644 index 000000000000..5ebc1aec7923 --- /dev/null +++ b/tools/testing/selftests/firmware/fw_namespace.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Test triggering of loading of firmware from different mount + * namespaces. Expect firmware to be always loaded from the mount + * namespace of PID 1. */ +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <sched.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#ifndef CLONE_NEWNS +# define CLONE_NEWNS 0x00020000 +#endif + +static char *fw_path = NULL; + +static void die(char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + if (fw_path) + unlink(fw_path); + umount("/lib/firmware"); + exit(EXIT_FAILURE); +} + +static void trigger_fw(const char *fw_name, const char *sys_path) +{ + int fd; + + fd = open(sys_path, O_WRONLY); + if (fd < 0) + die("open failed: %s\n", + strerror(errno)); + if (write(fd, fw_name, strlen(fw_name)) != strlen(fw_name)) + exit(EXIT_FAILURE); + close(fd); +} + +static void setup_fw(const char *fw_path) +{ + int fd; + const char fw[] = "ABCD0123"; + + fd = open(fw_path, O_WRONLY | O_CREAT, 0600); + if (fd < 0) + die("open failed: %s\n", + strerror(errno)); + if (write(fd, fw, sizeof(fw) -1) != sizeof(fw) -1) + die("write failed: %s\n", + strerror(errno)); + close(fd); +} + +static bool test_fw_in_ns(const char *fw_name, const char *sys_path, bool block_fw_in_parent_ns) +{ + pid_t child; + + if (block_fw_in_parent_ns) + if (mount("test", "/lib/firmware", "tmpfs", MS_RDONLY, NULL) == -1) + die("blocking firmware in parent ns failed\n"); + + child = fork(); + if (child == -1) { + die("fork failed: %s\n", + strerror(errno)); + } + if (child != 0) { /* parent */ + pid_t pid; + int status; + + pid = waitpid(child, &status, 0); + if (pid == -1) { + die("waitpid failed: %s\n", + strerror(errno)); + } + if (pid != child) { + die("waited for %d got %d\n", + child, pid); + } + if (!WIFEXITED(status)) { + die("child did not terminate cleanly\n"); + } + if (block_fw_in_parent_ns) + umount("/lib/firmware"); + return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false; + } + + if (unshare(CLONE_NEWNS) != 0) { + die("unshare(CLONE_NEWNS) failed: %s\n", + strerror(errno)); + } + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) == -1) + die("remount root in child ns failed\n"); + + if (!block_fw_in_parent_ns) { + if (mount("test", "/lib/firmware", "tmpfs", MS_RDONLY, NULL) == -1) + die("blocking firmware in child ns failed\n"); + } else + umount("/lib/firmware"); + + trigger_fw(fw_name, sys_path); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + const char *fw_name = "test-firmware.bin"; + char *sys_path; + if (argc != 2) + die("usage: %s sys_path\n", argv[0]); + + /* Mount tmpfs to /lib/firmware so we don't have to assume + that it is writable for us.*/ + if (mount("test", "/lib/firmware", "tmpfs", 0, NULL) == -1) + die("mounting tmpfs to /lib/firmware failed\n"); + + sys_path = argv[1]; + asprintf(&fw_path, "/lib/firmware/%s", fw_name); + + setup_fw(fw_path); + + setvbuf(stdout, NULL, _IONBF, 0); + /* Positive case: firmware in PID1 mount namespace */ + printf("Testing with firmware in parent namespace (assumed to be same file system as PID1)\n"); + if (!test_fw_in_ns(fw_name, sys_path, false)) + die("error: failed to access firmware\n"); + + /* Negative case: firmware in child mount namespace, expected to fail */ + printf("Testing with firmware in child namespace\n"); + if (test_fw_in_ns(fw_name, sys_path, true)) + die("error: firmware access did not fail\n"); + + unlink(fw_path); + free(fw_path); + umount("/lib/firmware"); + exit(EXIT_SUCCESS); +} diff --git a/tools/testing/selftests/firmware/fw_run_tests.sh b/tools/testing/selftests/firmware/fw_run_tests.sh index 8e14d555c197..777377078d5e 100755 --- a/tools/testing/selftests/firmware/fw_run_tests.sh +++ b/tools/testing/selftests/firmware/fw_run_tests.sh @@ -61,6 +61,10 @@ run_test_config_0003() check_mods check_setup +echo "Running namespace test: " +$TEST_DIR/fw_namespace $DIR/trigger_request +echo "OK" + if [ -f $FW_FORCE_SYSFS_FALLBACK ]; then 
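# Note (illustration, not part of the patch): fw_namespace mounts a tmpfs over
# /lib/firmware, writes test-firmware.bin into it, and then triggers a
# firmware load from a child mount namespace, expecting the kernel to resolve
# the firmware path only in the mount namespace of PID 1. A manual run,
# assuming the test_firmware module is loaded and the sysfs trigger path below
# (which is an assumption here):
fw_sysfs=/sys/devices/virtual/misc/test_firmware/trigger_request
./fw_namespace "$fw_sysfs" && echo "firmware namespace semantics OK"
# fw_run_tests.sh above invokes the same binary as
# "$TEST_DIR/fw_namespace $DIR/trigger_request".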
run_test_config_0001 run_test_config_0002 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc index 18fdaab9f570..68ff3f45c720 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc @@ -23,7 +23,7 @@ if [ ! -f events/sched/sched_process_fork/hist ]; then exit_unsupported fi -echo "Test histogram multiple tiggers" +echo "Test histogram multiple triggers" echo 'hist:keys=parent_pid:vals=child_pid' > events/sched/sched_process_fork/trigger echo 'hist:keys=parent_comm:vals=child_pid' >> events/sched/sched_process_fork/trigger diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 5336b26506ab..2902f6a78f8a 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -635,10 +635,12 @@ struct __test_metadata { const char *name; void (*fn)(struct __test_metadata *); + pid_t pid; /* pid of test when being run */ int termsig; int passed; int trigger; /* extra handler after the evaluation */ - int timeout; + int timeout; /* seconds to wait for test timeout */ + bool timed_out; /* did this test timeout instead of exiting? */ __u8 step; bool no_print; /* manual trigger when TH_LOG_STREAM is not available */ struct __test_metadata *prev, *next; @@ -695,64 +697,116 @@ static inline int __bail(int for_realz, bool no_print, __u8 step) return 0; } -void __run_test(struct __test_metadata *t) +struct __test_metadata *__active_test; +static void __timeout_handler(int sig, siginfo_t *info, void *ucontext) { - pid_t child_pid; + struct __test_metadata *t = __active_test; + + /* Sanity check handler execution environment. */ + if (!t) { + fprintf(TH_LOG_STREAM, + "no active test in SIGARLM handler!?\n"); + abort(); + } + if (sig != SIGALRM || sig != info->si_signo) { + fprintf(TH_LOG_STREAM, + "%s: SIGALRM handler caught signal %d!?\n", + t->name, sig != SIGALRM ? sig : info->si_signo); + abort(); + } + + t->timed_out = true; + kill(t->pid, SIGKILL); +} + +void __wait_for_test(struct __test_metadata *t) +{ + struct sigaction action = { + .sa_sigaction = __timeout_handler, + .sa_flags = SA_SIGINFO, + }; + struct sigaction saved_action; int status; + if (sigaction(SIGALRM, &action, &saved_action)) { + t->passed = 0; + fprintf(TH_LOG_STREAM, + "%s: unable to install SIGARLM handler\n", + t->name); + return; + } + __active_test = t; + t->timed_out = false; + alarm(t->timeout); + waitpid(t->pid, &status, 0); + alarm(0); + if (sigaction(SIGALRM, &saved_action, NULL)) { + t->passed = 0; + fprintf(TH_LOG_STREAM, + "%s: unable to uninstall SIGARLM handler\n", + t->name); + return; + } + __active_test = NULL; + + if (t->timed_out) { + t->passed = 0; + fprintf(TH_LOG_STREAM, + "%s: Test terminated by timeout\n", t->name); + } else if (WIFEXITED(status)) { + t->passed = t->termsig == -1 ? 
!WEXITSTATUS(status) : 0; + if (t->termsig != -1) { + fprintf(TH_LOG_STREAM, + "%s: Test exited normally " + "instead of by signal (code: %d)\n", + t->name, + WEXITSTATUS(status)); + } else if (!t->passed) { + fprintf(TH_LOG_STREAM, + "%s: Test failed at step #%d\n", + t->name, + WEXITSTATUS(status)); + } + } else if (WIFSIGNALED(status)) { + t->passed = 0; + if (WTERMSIG(status) == SIGABRT) { + fprintf(TH_LOG_STREAM, + "%s: Test terminated by assertion\n", + t->name); + } else if (WTERMSIG(status) == t->termsig) { + t->passed = 1; + } else { + fprintf(TH_LOG_STREAM, + "%s: Test terminated unexpectedly " + "by signal %d\n", + t->name, + WTERMSIG(status)); + } + } else { + fprintf(TH_LOG_STREAM, + "%s: Test ended in some other way [%u]\n", + t->name, + status); + } +} + +void __run_test(struct __test_metadata *t) +{ t->passed = 1; t->trigger = 0; printf("[ RUN ] %s\n", t->name); - alarm(t->timeout); - child_pid = fork(); - if (child_pid < 0) { + t->pid = fork(); + if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; - } else if (child_pid == 0) { + } else if (t->pid == 0) { t->fn(t); /* return the step that failed or 0 */ _exit(t->passed ? 0 : t->step); } else { - /* TODO(wad) add timeout support. */ - waitpid(child_pid, &status, 0); - if (WIFEXITED(status)) { - t->passed = t->termsig == -1 ? !WEXITSTATUS(status) : 0; - if (t->termsig != -1) { - fprintf(TH_LOG_STREAM, - "%s: Test exited normally " - "instead of by signal (code: %d)\n", - t->name, - WEXITSTATUS(status)); - } else if (!t->passed) { - fprintf(TH_LOG_STREAM, - "%s: Test failed at step #%d\n", - t->name, - WEXITSTATUS(status)); - } - } else if (WIFSIGNALED(status)) { - t->passed = 0; - if (WTERMSIG(status) == SIGABRT) { - fprintf(TH_LOG_STREAM, - "%s: Test terminated by assertion\n", - t->name); - } else if (WTERMSIG(status) == t->termsig) { - t->passed = 1; - } else { - fprintf(TH_LOG_STREAM, - "%s: Test terminated unexpectedly " - "by signal %d\n", - t->name, - WTERMSIG(status)); - } - } else { - fprintf(TH_LOG_STREAM, - "%s: Test ended in some other way [%u]\n", - t->name, - status); - } + __wait_for_test(t); } printf("[ %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name); - alarm(0); } static int test_harness_run(int __attribute__((unused)) argc, diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 3ed0134a764d..b0556c752443 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -137,7 +137,8 @@ endif # Selftest makefiles can override those targets by setting # OVERRIDE_TARGETS = 1. 
ifeq ($(OVERRIDE_TARGETS),) -$(OUTPUT)/%:%.c +LOCAL_HDRS := $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h +$(OUTPUT)/%:%.c $(LOCAL_HDRS) $(LINK.c) $^ $(LDLIBS) -o $@ $(OUTPUT)/%.o:%.S diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile index 53a848109f7b..0a15f9e23431 100644 --- a/tools/testing/selftests/memfd/Makefile +++ b/tools/testing/selftests/memfd/Makefile @@ -4,9 +4,8 @@ CFLAGS += -I../../../../include/uapi/ CFLAGS += -I../../../../include/ CFLAGS += -I../../../../usr/include/ -TEST_GEN_PROGS := memfd_test +TEST_GEN_PROGS := memfd_test fuse_test fuse_mnt TEST_PROGS := run_fuse_test.sh run_hugetlbfs_test.sh -TEST_GEN_FILES := fuse_mnt fuse_test fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags) @@ -14,7 +13,7 @@ include ../lib.mk $(OUTPUT)/fuse_mnt: LDLIBS += $(shell pkg-config fuse --libs) -$(OUTPUT)/memfd_test: memfd_test.c common.o -$(OUTPUT)/fuse_test: fuse_test.c common.o +$(OUTPUT)/memfd_test: memfd_test.c common.c +$(OUTPUT)/fuse_test: fuse_test.c common.c -EXTRA_CLEAN = common.o +EXTRA_CLEAN = $(OUTPUT)/common.o diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index ecc52d4c034d..997c65dcad68 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -23,3 +23,8 @@ so_txtime tcp_fastopen_backup_key nettest fin_ack_lat +reuseaddr_ports_exhausted +hwtstamp_config +rxtimestamp +timestamping +txtimestamp diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 287ae916ec0b..3f386eb9e7d7 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -11,7 +11,11 @@ TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh -TEST_PROGS += fin_ack_lat.sh +TEST_PROGS += fin_ack_lat.sh fib_nexthop_multiprefix.sh fib_nexthops.sh +TEST_PROGS += altnames.sh icmp_redirect.sh ip6_gre_headroom.sh +TEST_PROGS += route_localnet.sh +TEST_PROGS += reuseaddr_ports_exhausted.sh +TEST_PROGS += txtimestamp.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any @@ -20,6 +24,8 @@ TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx ip_defrag TEST_GEN_FILES += so_txtime ipv6_flowlabel ipv6_flowlabel_mgr TEST_GEN_FILES += tcp_fastopen_backup_key TEST_GEN_FILES += fin_ack_lat +TEST_GEN_FILES += reuseaddr_ports_exhausted +TEST_GEN_FILES += hwtstamp_config rxtimestamp timestamping txtimestamp TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index b8503a8119b0..3b42c06b5985 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -12,6 +12,7 @@ CONFIG_IPV6_VTI=y CONFIG_DUMMY=y CONFIG_BRIDGE=y CONFIG_VLAN_8021Q=y +CONFIG_IFB=y CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y CONFIG_NF_CONNTRACK=m @@ -27,5 +28,6 @@ CONFIG_NFT_CHAIN_NAT_IPV6=m CONFIG_NFT_CHAIN_NAT_IPV4=m CONFIG_NET_SCH_FQ=m CONFIG_NET_SCH_ETF=m +CONFIG_NET_SCH_NETEM=y CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_KALLSYMS=y diff --git a/tools/testing/selftests/net/fib_tests.sh 
b/tools/testing/selftests/net/fib_tests.sh index 60273f1bc7d9..b7616704b55e 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -1041,6 +1041,27 @@ ipv6_addr_metric_test() fi log_test $rc 0 "Prefix route with metric on link up" + # verify peer metric added correctly + set -e + run_cmd "$IP -6 addr flush dev dummy2" + run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::1 peer 2001:db8:104::2 metric 260" + set +e + + check_route6 "2001:db8:104::1 dev dummy2 proto kernel metric 260" + log_test $? 0 "Set metric with peer route on local side" + log_test $? 0 "User specified metric on local address" + check_route6 "2001:db8:104::2 dev dummy2 proto kernel metric 260" + log_test $? 0 "Set metric with peer route on peer side" + + set -e + run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::1 peer 2001:db8:104::3 metric 261" + set +e + + check_route6 "2001:db8:104::1 dev dummy2 proto kernel metric 261" + log_test $? 0 "Modify metric and peer address on local side" + check_route6 "2001:db8:104::3 dev dummy2 proto kernel metric 261" + log_test $? 0 "Modify metric and peer address on peer side" + $IP li del dummy1 $IP li del dummy2 cleanup @@ -1457,13 +1478,20 @@ ipv4_addr_metric_test() run_cmd "$IP addr flush dev dummy2" run_cmd "$IP addr add dev dummy2 172.16.104.1/32 peer 172.16.104.2 metric 260" - run_cmd "$IP addr change dev dummy2 172.16.104.1/32 peer 172.16.104.2 metric 261" rc=$? if [ $rc -eq 0 ]; then - check_route "172.16.104.2 dev dummy2 proto kernel scope link src 172.16.104.1 metric 261" + check_route "172.16.104.2 dev dummy2 proto kernel scope link src 172.16.104.1 metric 260" + rc=$? + fi + log_test $rc 0 "Set metric of address with peer route" + + run_cmd "$IP addr change dev dummy2 172.16.104.1/32 peer 172.16.104.3 metric 261" + rc=$? + if [ $rc -eq 0 ]; then + check_route "172.16.104.3 dev dummy2 proto kernel scope link src 172.16.104.1 metric 261" rc=$? 
fi - log_test $rc 0 "Modify metric of address with peer route" + log_test $rc 0 "Modify metric and peer address for peer route" $IP li del dummy1 $IP li del dummy2 diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile new file mode 100644 index 000000000000..250fbb2d1625 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: GPL-2.0+ OR MIT + +TEST_PROGS = bridge_igmp.sh \ + bridge_port_isolation.sh \ + bridge_sticky_fdb.sh \ + bridge_vlan_aware.sh \ + bridge_vlan_unaware.sh \ + ethtool.sh \ + gre_inner_v4_multipath.sh \ + gre_inner_v6_multipath.sh \ + gre_multipath.sh \ + ip6gre_inner_v4_multipath.sh \ + ip6gre_inner_v6_multipath.sh \ + ipip_flat_gre_key.sh \ + ipip_flat_gre_keys.sh \ + ipip_flat_gre.sh \ + ipip_hier_gre_key.sh \ + ipip_hier_gre_keys.sh \ + ipip_hier_gre.sh \ + loopback.sh \ + mirror_gre_bound.sh \ + mirror_gre_bridge_1d.sh \ + mirror_gre_bridge_1d_vlan.sh \ + mirror_gre_bridge_1q_lag.sh \ + mirror_gre_bridge_1q.sh \ + mirror_gre_changes.sh \ + mirror_gre_flower.sh \ + mirror_gre_lag_lacp.sh \ + mirror_gre_neigh.sh \ + mirror_gre_nh.sh \ + mirror_gre.sh \ + mirror_gre_vlan_bridge_1q.sh \ + mirror_gre_vlan.sh \ + mirror_vlan.sh \ + router_bridge.sh \ + router_bridge_vlan.sh \ + router_broadcast.sh \ + router_mpath_nh.sh \ + router_multicast.sh \ + router_multipath.sh \ + router.sh \ + router_vid_1.sh \ + sch_ets.sh \ + sch_tbf_ets.sh \ + sch_tbf_prio.sh \ + sch_tbf_root.sh \ + tc_actions.sh \ + tc_chains.sh \ + tc_flower_router.sh \ + tc_flower.sh \ + tc_shblocks.sh \ + tc_vlan_modify.sh \ + vxlan_asymmetric.sh \ + vxlan_bridge_1d_port_8472.sh \ + vxlan_bridge_1d.sh \ + vxlan_bridge_1q_port_8472.sh \ + vxlan_bridge_1q.sh \ + vxlan_symmetric.sh + +TEST_PROGS_EXTENDED := devlink_lib.sh \ + ethtool_lib.sh \ + fib_offload_lib.sh \ + forwarding.config.sample \ + ipip_lib.sh \ + lib.sh \ + mirror_gre_lib.sh \ + mirror_gre_topo_lib.sh \ + mirror_lib.sh \ + mirror_topo_lib.sh \ + sch_ets_core.sh \ + sch_ets_tests.sh \ + sch_tbf_core.sh \ + sch_tbf_etsprio.sh \ + tc_common.sh + +include ../../lib.mk diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh index 40b076983239..155d48bd4d9e 100644 --- a/tools/testing/selftests/net/forwarding/devlink_lib.sh +++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh @@ -35,6 +35,12 @@ if [ $? -ne 0 ]; then exit 1 fi +devlink dev help 2>&1 | grep info &> /dev/null +if [ $? -ne 0 ]; then + echo "SKIP: iproute2 too old, missing devlink dev info support" + exit 1 +fi + ############################################################################## # Devlink helpers @@ -373,6 +379,7 @@ devlink_trap_drop_test() local trap_name=$1; shift local group_name=$1; shift local dev=$1; shift + local handle=$1; shift # This is the common part of all the tests. It checks that stats are # initially idle, then non-idle after changing the trap action and @@ -397,7 +404,7 @@ devlink_trap_drop_test() devlink_trap_group_stats_idle_test $group_name check_err $? "Trap group stats not idle after setting action to drop" - tc_check_packets "dev $dev egress" 101 0 + tc_check_packets "dev $dev egress" $handle 0 check_err $? 
"Packets were not dropped" } @@ -406,7 +413,68 @@ devlink_trap_drop_cleanup() local mz_pid=$1; shift local dev=$1; shift local proto=$1; shift + local pref=$1; shift + local handle=$1; shift kill $mz_pid && wait $mz_pid &> /dev/null - tc filter del dev $dev egress protocol $proto pref 1 handle 101 flower + tc filter del dev $dev egress protocol $proto pref $pref handle $handle flower +} + +devlink_trap_policers_num_get() +{ + devlink -j -p trap policer show | jq '.[]["'$DEVLINK_DEV'"] | length' +} + +devlink_trap_policer_rate_get() +{ + local policer_id=$1; shift + + devlink -j -p trap policer show $DEVLINK_DEV policer $policer_id \ + | jq '.[][][]["rate"]' +} + +devlink_trap_policer_burst_get() +{ + local policer_id=$1; shift + + devlink -j -p trap policer show $DEVLINK_DEV policer $policer_id \ + | jq '.[][][]["burst"]' +} + +devlink_trap_policer_rx_dropped_get() +{ + local policer_id=$1; shift + + devlink -j -p -s trap policer show $DEVLINK_DEV policer $policer_id \ + | jq '.[][][]["stats"]["rx"]["dropped"]' +} + +devlink_trap_group_policer_get() +{ + local group_name=$1; shift + + devlink -j -p trap group show $DEVLINK_DEV group $group_name \ + | jq '.[][][]["policer"]' +} + +devlink_trap_policer_ids_get() +{ + devlink -j -p trap policer show \ + | jq '.[]["'$DEVLINK_DEV'"][]["policer"]' +} + +devlink_port_by_netdev() +{ + local if_name=$1 + + devlink -j port show $if_name | jq -e '.[] | keys' | jq -r '.[]' +} + +devlink_cpu_port_get() +{ + local cpu_dl_port_num=$(devlink port list | grep "$DEVLINK_DEV" | + grep cpu | cut -d/ -f3 | cut -d: -f1 | + sed -n '1p') + + echo "$DEVLINK_DEV/$cpu_dl_port_num" } diff --git a/tools/testing/selftests/net/forwarding/ethtool_lib.sh b/tools/testing/selftests/net/forwarding/ethtool_lib.sh index 925d229a59d8..925d229a59d8 100755..100644 --- a/tools/testing/selftests/net/forwarding/ethtool_lib.sh +++ b/tools/testing/selftests/net/forwarding/ethtool_lib.sh diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 2f5da414aaa7..977fc2b326a2 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -60,6 +60,15 @@ check_tc_chain_support() fi } +check_tc_action_hw_stats_support() +{ + tc actions help 2>&1 | grep -q hw_stats + if [[ $? -ne 0 ]]; then + echo "SKIP: iproute2 too old; tc is missing action hw_stats support" + exit 1 + fi +} + if [[ "$(id -u)" -ne 0 ]]; then echo "SKIP: need root privileges" exit 0 @@ -248,13 +257,40 @@ busywait() done } +not() +{ + "$@" + [[ $? 
!= 0 ]] +} + +grep_bridge_fdb() +{ + local addr=$1; shift + local word + local flag + + if [ "$1" == "self" ] || [ "$1" == "master" ]; then + word=$1; shift + if [ "$1" == "-v" ]; then + flag=$1; shift + fi + fi + + $@ | grep $addr | grep $flag "$word" +} + +wait_for_offload() +{ + "$@" | grep -q offload +} + until_counter_is() { - local value=$1; shift + local expr=$1; shift local current=$("$@") echo $((current)) - ((current >= value)) + ((current $expr)) } busywait_for_counter() @@ -263,7 +299,7 @@ busywait_for_counter() local delta=$1; shift local base=$("$@") - busywait "$timeout" until_counter_is $((base + delta)) "$@" + busywait "$timeout" until_counter_is ">= $((base + delta))" "$@" } setup_wait_dev() @@ -599,6 +635,17 @@ tc_rule_stats_get() | jq ".[1].options.actions[].stats$selector" } +tc_rule_handle_stats_get() +{ + local id=$1; shift + local handle=$1; shift + local selector=${1:-.packets}; shift + + tc -j -s filter show $id \ + | jq ".[] | select(.options.handle == $handle) | \ + .options.actions[0].stats$selector" +} + ethtool_stats_get() { local dev=$1; shift @@ -607,6 +654,26 @@ ethtool_stats_get() ethtool -S $dev | grep "^ *$stat:" | head -n 1 | cut -d: -f2 } +qdisc_stats_get() +{ + local dev=$1; shift + local handle=$1; shift + local selector=$1; shift + + tc -j -s qdisc show dev "$dev" \ + | jq '.[] | select(.handle == "'"$handle"'") | '"$selector" +} + +qdisc_parent_stats_get() +{ + local dev=$1; shift + local parent=$1; shift + local selector=$1; shift + + tc -j -s qdisc show dev "$dev" invisible \ + | jq '.[] | select(.parent == "'"$parent"'") | '"$selector" +} + humanize() { local speed=$1; shift @@ -1132,18 +1199,29 @@ flood_test() flood_multicast_test $br_port $host1_if $host2_if } -start_traffic() +__start_traffic() { + local proto=$1; shift local h_in=$1; shift # Where the traffic egresses the host local sip=$1; shift local dip=$1; shift local dmac=$1; shift $MZ $h_in -p 8000 -A $sip -B $dip -c 0 \ - -a own -b $dmac -t udp -q & + -a own -b $dmac -t "$proto" -q "$@" & sleep 1 } +start_traffic() +{ + __start_traffic udp "$@" +} + +start_tcp_traffic() +{ + __start_traffic tcp "$@" +} + stop_traffic() { # Suppress noise from killing mausezahn. diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh new file mode 100755 index 000000000000..b50081855913 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh @@ -0,0 +1,238 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on +# egress of $swp2, the traffic is acted upon by a pedit action. An ingress +# filter installed on $h2 verifies that the packet looks like expected. 
+# +# +----------------------+ +----------------------+ +# | H1 | | H2 | +# | + $h1 | | $h2 + | +# | | 192.0.2.1/28 | | 192.0.2.2/28 | | +# +----|-----------------+ +----------------|-----+ +# | | +# +----|----------------------------------------------------------------|-----+ +# | SW | | | +# | +-|----------------------------------------------------------------|-+ | +# | | + $swp1 BR $swp2 + | | +# | +--------------------------------------------------------------------+ | +# +---------------------------------------------------------------------------+ + +ALL_TESTS=" + ping_ipv4 + test_ip_dsfield + test_ip_dscp + test_ip_ecn + test_ip_dscp_ecn +" + +NUM_NETIFS=4 +source lib.sh +source tc_common.sh + +: ${HIT_TIMEOUT:=2000} # ms + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.2/28 2001:db8:1::2/64 + tc qdisc add dev $h2 clsact +} + +h2_destroy() +{ + tc qdisc del dev $h2 clsact + simple_if_fini $h2 192.0.2.2/28 2001:db8:1::2/64 +} + +switch_create() +{ + ip link add name br1 up type bridge vlan_filtering 1 + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + ip link set dev $swp2 master br1 + ip link set dev $swp2 up + + tc qdisc add dev $swp1 clsact + tc qdisc add dev $swp2 clsact +} + +switch_destroy() +{ + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + + ip link set dev $swp2 nomaster + ip link set dev $swp1 nomaster + ip link del dev br1 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + h2mac=$(mac_get $h2) + + vrf_prepare + h1_create + h2_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h2_destroy + h1_destroy + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.2 +} + +do_test_pedit_dsfield_common() +{ + local pedit_locus=$1; shift + local pedit_action=$1; shift + local mz_flags=$1; shift + + RET=0 + + # TOS 125: DSCP 31, ECN 1. Used for testing that the relevant part is + # overwritten when zero is selected. + $MZ $mz_flags $h1 -c 10 -d 20msec -p 100 \ + -a own -b $h2mac -q -t tcp tos=0x7d,sp=54321,dp=12345 + + local pkts + pkts=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= 10" \ + tc_rule_handle_stats_get "dev $h2 ingress" 101) + check_err $? "Expected to get 10 packets, but got $pkts." 
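# Note (illustration, not part of the patch): the dsfield byte carries DSCP in
# its upper six bits and ECN in the lower two, which is why the injected TOS of
# 0x7d decomposes as DSCP 31, ECN 1, and why the match values in this file are
# built with shell arithmetic:
dsfield=0x7d
dscp=$((dsfield >> 2))   # 31
ecn=$((dsfield & 0x3))   # 1
echo "dscp=$dscp ecn=$ecn recombined=$(( (dscp << 2) | ecn ))"   # prints 125
# Rewriting only DSCP with "retain 0xfc" leaves ECN intact, hence matches such
# as "ip_tos $(((dscp << 2) | 1))" in the loops further down.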
+ log_test "$pedit_locus pedit $pedit_action" +} + +do_test_pedit_dsfield() +{ + local pedit_locus=$1; shift + local pedit_action=$1; shift + local match_prot=$1; shift + local match_flower=$1; shift + local mz_flags=$1; shift + local saddr=$1; shift + local daddr=$1; shift + + tc filter add $pedit_locus handle 101 pref 1 \ + flower action pedit ex munge $pedit_action + tc filter add dev $h2 ingress handle 101 pref 1 prot $match_prot \ + flower skip_hw $match_flower action pass + + do_test_pedit_dsfield_common "$pedit_locus" "$pedit_action" "$mz_flags" + + tc filter del dev $h2 ingress pref 1 + tc filter del $pedit_locus pref 1 +} + +do_test_ip_dsfield() +{ + local locus=$1; shift + local dsfield + + for dsfield in 0 1 2 3 128 252 253 254 255; do + do_test_pedit_dsfield "$locus" \ + "ip dsfield set $dsfield" \ + ip "ip_tos $dsfield" \ + "-A 192.0.2.1 -B 192.0.2.2" + done +} + +test_ip_dsfield() +{ + do_test_ip_dsfield "dev $swp1 ingress" + do_test_ip_dsfield "dev $swp2 egress" +} + +do_test_ip_dscp() +{ + local locus=$1; shift + local dscp + + for dscp in 0 1 2 3 32 61 62 63; do + do_test_pedit_dsfield "$locus" \ + "ip dsfield set $((dscp << 2)) retain 0xfc" \ + ip "ip_tos $(((dscp << 2) | 1))" \ + "-A 192.0.2.1 -B 192.0.2.2" + done +} + +test_ip_dscp() +{ + do_test_ip_dscp "dev $swp1 ingress" + do_test_ip_dscp "dev $swp2 egress" +} + +do_test_ip_ecn() +{ + local locus=$1; shift + local ecn + + for ecn in 0 1 2 3; do + do_test_pedit_dsfield "$locus" \ + "ip dsfield set $ecn retain 0x03" \ + ip "ip_tos $((124 | $ecn))" \ + "-A 192.0.2.1 -B 192.0.2.2" + done +} + +test_ip_ecn() +{ + do_test_ip_ecn "dev $swp1 ingress" + do_test_ip_ecn "dev $swp2 egress" +} + +do_test_ip_dscp_ecn() +{ + local locus=$1; shift + + tc filter add $locus handle 101 pref 1 \ + flower action pedit ex munge ip dsfield set 124 retain 0xfc \ + action pedit ex munge ip dsfield set 1 retain 0x03 + tc filter add dev $h2 ingress handle 101 pref 1 prot ip \ + flower skip_hw ip_tos 125 action pass + + do_test_pedit_dsfield_common "$locus" "set DSCP + set ECN" \ + "-A 192.0.2.1 -B 192.0.2.2" + + tc filter del dev $h2 ingress pref 1 + tc filter del $locus pref 1 +} + +test_ip_dscp_ecn() +{ + do_test_ip_dscp_ecn "dev $swp1 ingress" + do_test_ip_dscp_ecn "dev $swp2 egress" +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/sch_ets.sh b/tools/testing/selftests/net/forwarding/sch_ets.sh index 40e0ad1bc4f2..e60c8b4818cc 100755 --- a/tools/testing/selftests/net/forwarding/sch_ets.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets.sh @@ -34,11 +34,14 @@ switch_destroy() } # Callback from sch_ets_tests.sh -get_stats() +collect_stats() { - local stream=$1; shift + local -a streams=("$@") + local stream - link_stats_get $h2.1$stream rx bytes + for stream in ${streams[@]}; do + qdisc_parent_stats_get $swp2 10:$((stream + 1)) .bytes + done } ets_run diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh index 3c3b204d47e8..cdf689e99458 100644 --- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh @@ -2,7 +2,7 @@ # Global interface: # $put -- port under test (e.g. $swp2) -# get_stats($band) -- A function to collect stats for band +# collect_stats($streams...) 
-- A function to get stats for individual streams # ets_start_traffic($band) -- Start traffic for this band # ets_change_qdisc($op, $dev, $nstrict, $quanta...) -- Add or change qdisc @@ -94,15 +94,11 @@ __ets_dwrr_test() sleep 10 - t0=($(for stream in ${streams[@]}; do - get_stats $stream - done)) + t0=($(collect_stats "${streams[@]}")) sleep 10 - t1=($(for stream in ${streams[@]}; do - get_stats $stream - done)) + t1=($(collect_stats "${streams[@]}")) d=($(for ((i = 0; i < ${#streams[@]}; i++)); do echo $((${t1[$i]} - ${t0[$i]})) done)) diff --git a/tools/testing/selftests/net/forwarding/skbedit_priority.sh b/tools/testing/selftests/net/forwarding/skbedit_priority.sh new file mode 100755 index 000000000000..e3bd8a6bb8b4 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/skbedit_priority.sh @@ -0,0 +1,168 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on +# egress of $swp2, the traffic is acted upon by an action skbedit priority. The +# new priority should be taken into account when classifying traffic on the PRIO +# qdisc at $swp2. The test verifies that for different priority values, the +# traffic ends up in expected PRIO band. +# +# +----------------------+ +----------------------+ +# | H1 | | H2 | +# | + $h1 | | $h2 + | +# | | 192.0.2.1/28 | | 192.0.2.2/28 | | +# +----|-----------------+ +----------------|-----+ +# | | +# +----|----------------------------------------------------------------|-----+ +# | SW | | | +# | +-|----------------------------------------------------------------|-+ | +# | | + $swp1 BR $swp2 + | | +# | | PRIO | | +# | +--------------------------------------------------------------------+ | +# +---------------------------------------------------------------------------+ + +ALL_TESTS=" + ping_ipv4 + test_ingress + test_egress +" + +NUM_NETIFS=4 +source lib.sh + +: ${HIT_TIMEOUT:=2000} # ms + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/28 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.2/28 +} + +h2_destroy() +{ + simple_if_fini $h2 192.0.2.2/28 +} + +switch_create() +{ + ip link add name br1 up type bridge vlan_filtering 1 + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + ip link set dev $swp2 master br1 + ip link set dev $swp2 up + + tc qdisc add dev $swp1 clsact + tc qdisc add dev $swp2 clsact + tc qdisc add dev $swp2 root handle 10: \ + prio bands 8 priomap 7 6 5 4 3 2 1 0 +} + +switch_destroy() +{ + tc qdisc del dev $swp2 root + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + + ip link set dev $swp2 nomaster + ip link set dev $swp1 nomaster + ip link del dev br1 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + h2mac=$(mac_get $h2) + + vrf_prepare + h1_create + h2_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h2_destroy + h1_destroy + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.2 +} + +test_skbedit_priority_one() +{ + local locus=$1; shift + local prio=$1; shift + local classid=$1; shift + + RET=0 + + tc filter add $locus handle 101 pref 1 \ + flower action skbedit priority $prio + + local pkt0=$(qdisc_parent_stats_get $swp2 $classid .packets) + local pkt2=$(tc_rule_handle_stats_get "$locus" 101) + $MZ $h1 -t udp "sp=54321,dp=12345" -c 10 -d 20msec -p 100 \ + -a own -b $h2mac -A 192.0.2.1 -B 192.0.2.2 -q + + local pkt1 + pkt1=$(busywait "$HIT_TIMEOUT" until_counter_is ">= $((pkt0 + 
10))" \ + qdisc_parent_stats_get $swp2 $classid .packets) + check_err $? "Expected to get 10 packets on class $classid, but got $((pkt1 - pkt0))." + + local pkt3=$(tc_rule_handle_stats_get "$locus" 101) + ((pkt3 >= pkt2 + 10)) + check_err $? "Expected to get 10 packets on skbedit rule but got $((pkt3 - pkt2))." + + log_test "$locus skbedit priority $prio -> classid $classid" + + tc filter del $locus pref 1 +} + +test_ingress() +{ + local prio + + for prio in {0..7}; do + test_skbedit_priority_one "dev $swp1 ingress" \ + $prio 10:$((8 - prio)) + done +} + +test_egress() +{ + local prio + + for prio in {0..7}; do + test_skbedit_priority_one "dev $swp2 egress" \ + $prio 10:$((8 - prio)) + done +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh index 64f652633585..0e18e8be6e2a 100644 --- a/tools/testing/selftests/net/forwarding/tc_common.sh +++ b/tools/testing/selftests/net/forwarding/tc_common.sh @@ -6,39 +6,14 @@ CHECK_TC="yes" # Can be overridden by the configuration file. See lib.sh TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms -__tc_check_packets() -{ - local id=$1 - local handle=$2 - local count=$3 - local operator=$4 - - start_time="$(date -u +%s%3N)" - while true - do - cmd_jq "tc -j -s filter show $id" \ - ".[] | select(.options.handle == $handle) | \ - select(.options.actions[0].stats.packets $operator $count)" \ - &> /dev/null - ret=$? - if [[ $ret -eq 0 ]]; then - return $ret - fi - current_time="$(date -u +%s%3N)" - diff=$(expr $current_time - $start_time) - if [ "$diff" -gt "$TC_HIT_TIMEOUT" ]; then - return 1 - fi - done -} - tc_check_packets() { local id=$1 local handle=$2 local count=$3 - __tc_check_packets "$id" "$handle" "$count" "==" + busywait "$TC_HIT_TIMEOUT" until_counter_is "== $count" \ + tc_rule_handle_stats_get "$id" "$handle" > /dev/null } tc_check_packets_hitting() @@ -46,5 +21,6 @@ tc_check_packets_hitting() local id=$1 local handle=$2 - __tc_check_packets "$id" "$handle" 0 ">" + busywait "$TC_HIT_TIMEOUT" until_counter_is "> 0" \ + tc_rule_handle_stats_get "$id" "$handle" > /dev/null } diff --git a/tools/testing/selftests/networking/timestamping/hwtstamp_config.c b/tools/testing/selftests/net/hwtstamp_config.c index e1fdee841021..e1fdee841021 100644 --- a/tools/testing/selftests/networking/timestamping/hwtstamp_config.c +++ b/tools/testing/selftests/net/hwtstamp_config.c diff --git a/tools/testing/selftests/net/mptcp/.gitignore b/tools/testing/selftests/net/mptcp/.gitignore index d72f07642738..ea13b255a99d 100644 --- a/tools/testing/selftests/net/mptcp/.gitignore +++ b/tools/testing/selftests/net/mptcp/.gitignore @@ -1,2 +1,3 @@ mptcp_connect +pm_nl_ctl *.pcap diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index ba450e62dc5b..f50976ee7d44 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -1,12 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 top_srcdir = ../../../../.. 
+KSFT_KHDR_INSTALL := 1 -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g +CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include -TEST_PROGS := mptcp_connect.sh +TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh -TEST_GEN_FILES = mptcp_connect +TEST_GEN_FILES = mptcp_connect pm_nl_ctl TEST_FILES := settings diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 99579c0223c1..cedee5b952ba 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -34,8 +34,8 @@ extern int optind; #define TCP_ULP 31 #endif +static int poll_timeout = 10 * 1000; static bool listen_mode; -static int poll_timeout; enum cfg_mode { CFG_MODE_POLL, @@ -50,11 +50,21 @@ static int cfg_sock_proto = IPPROTO_MPTCP; static bool tcpulp_audit; static int pf = AF_INET; static int cfg_sndbuf; +static int cfg_rcvbuf; +static bool cfg_join; static void die_usage(void) { - fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] -m mode]" - "[ -l ] [ -t timeout ] connect_address\n"); + fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] [-m mode]" + "[-l] connect_address\n"); + fprintf(stderr, "\t-6 use ipv6\n"); + fprintf(stderr, "\t-t num -- set poll timeout to num\n"); + fprintf(stderr, "\t-S num -- set SO_SNDBUF to num\n"); + fprintf(stderr, "\t-R num -- set SO_RCVBUF to num\n"); + fprintf(stderr, "\t-p num -- use port num\n"); + fprintf(stderr, "\t-m [MPTCP|TCP] -- use tcp or mptcp sockets\n"); + fprintf(stderr, "\t-s [mmap|poll] -- use poll (default) or mmap\n"); + fprintf(stderr, "\t-u -- check mptcp ulp\n"); exit(1); } @@ -97,6 +107,17 @@ static void xgetaddrinfo(const char *node, const char *service, } } +static void set_rcvbuf(int fd, unsigned int size) +{ + int err; + + err = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)); + if (err) { + perror("set SO_RCVBUF"); + exit(1); + } +} + static void set_sndbuf(int fd, unsigned int size) { int err; @@ -230,6 +251,7 @@ static int sock_connect_mptcp(const char * const remoteaddr, static size_t do_rnd_write(const int fd, char *buf, const size_t len) { + static bool first = true; unsigned int do_w; ssize_t bw; @@ -237,10 +259,19 @@ static size_t do_rnd_write(const int fd, char *buf, const size_t len) if (do_w == 0 || do_w > len) do_w = len; + if (cfg_join && first && do_w > 100) + do_w = 100; + bw = write(fd, buf, do_w); if (bw < 0) perror("write"); + /* let the join handshake complete, before going on */ + if (cfg_join && first) { + usleep(200000); + first = false; + } + return bw; } @@ -365,8 +396,11 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd) break; /* ... but we still receive. - * Close our write side. + * Close our write side, ev. 
give some time + * for address notification */ + if (cfg_join) + usleep(400000); shutdown(peerfd, SHUT_WR); } else { if (errno == EINTR) @@ -383,6 +417,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd) } } + /* leave some time for late join/announce */ + if (cfg_join) + usleep(400000); + close(peerfd); return 0; } @@ -638,7 +676,7 @@ static void maybe_close(int fd) { unsigned int r = rand(); - if (r & 1) + if (!cfg_join && (r & 1)) close(fd); } @@ -704,6 +742,8 @@ int main_loop(void) check_getpeername_connect(fd); + if (cfg_rcvbuf) + set_rcvbuf(fd, cfg_rcvbuf); if (cfg_sndbuf) set_sndbuf(fd, cfg_sndbuf); @@ -745,7 +785,7 @@ int parse_mode(const char *mode) return 0; } -int parse_sndbuf(const char *size) +static int parse_int(const char *size) { unsigned long s; @@ -765,17 +805,19 @@ int parse_sndbuf(const char *size) die_usage(); } - cfg_sndbuf = s; - - return 0; + return (int)s; } static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6lp:s:hut:m:b:")) != -1) { + while ((c = getopt(argc, argv, "6jlp:s:hut:m:S:R:")) != -1) { switch (c) { + case 'j': + cfg_join = true; + cfg_mode = CFG_MODE_POLL; + break; case 'l': listen_mode = true; break; @@ -802,8 +844,11 @@ static void parse_opts(int argc, char **argv) case 'm': cfg_mode = parse_mode(optarg); break; - case 'b': - cfg_sndbuf = parse_sndbuf(optarg); + case 'S': + cfg_sndbuf = parse_int(optarg); + break; + case 'R': + cfg_rcvbuf = parse_int(optarg); break; } } @@ -831,6 +876,8 @@ int main(int argc, char *argv[]) if (fd < 0) return 1; + if (cfg_rcvbuf) + set_rcvbuf(fd, cfg_rcvbuf); if (cfg_sndbuf) set_sndbuf(fd, cfg_sndbuf); diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index d573a0feb98d..acf02e156d20 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -3,7 +3,7 @@ time_start=$(date +%s) -optstring="b:d:e:l:r:h4cm:" +optstring="S:R:d:e:l:r:h4cm:" ret=0 sin="" sout="" @@ -19,6 +19,7 @@ tc_loss=$((RANDOM%101)) tc_reorder="" testmode="" sndbuf=0 +rcvbuf=0 options_log=true if [ $tc_loss -eq 100 ];then @@ -39,7 +40,8 @@ usage() { echo -e "\t-e: ethtool features to disable, e.g.: \"-e tso -e gso\" (default: randomly disable any of tso/gso/gro)" echo -e "\t-4: IPv4 only: disable IPv6 tests (default: test both IPv4 and IPv6)" echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)" - echo -e "\t-b: set sndbuf value (default: use kernel default)" + echo -e "\t-S: set sndbuf value (default: use kernel default)" + echo -e "\t-R: set rcvbuf value (default: use kernel default)" echo -e "\t-m: test mode (poll, sendfile; default: poll)" } @@ -73,11 +75,19 @@ while getopts "$optstring" option;do "c") capture=true ;; - "b") + "S") if [ $OPTARG -ge 0 ];then sndbuf="$OPTARG" else - echo "-s requires numeric argument, got \"$OPTARG\"" 1>&2 + echo "-S requires numeric argument, got \"$OPTARG\"" 1>&2 + exit 1 + fi + ;; + "R") + if [ $OPTARG -ge 0 ];then + rcvbuf="$OPTARG" + else + echo "-R requires numeric argument, got \"$OPTARG\"" 1>&2 exit 1 fi ;; @@ -342,8 +352,12 @@ do_transfer() port=$((10000+$TEST_COUNT)) TEST_COUNT=$((TEST_COUNT+1)) + if [ "$rcvbuf" -gt 0 ]; then + extra_args="$extra_args -R $rcvbuf" + fi + if [ "$sndbuf" -gt 0 ]; then - extra_args="$extra_args -b $sndbuf" + extra_args="$extra_args -S $sndbuf" fi if [ -n "$testmode" ]; then diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh 
b/tools/testing/selftests/net/mptcp/mptcp_join.sh new file mode 100755 index 000000000000..dd42c2f692d0 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -0,0 +1,357 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ret=0 +sin="" +sout="" +cin="" +cout="" +ksft_skip=4 +timeout=30 +capture=0 + +TEST_COUNT=0 + +init() +{ + capout=$(mktemp) + + rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) + + ns1="ns1-$rndh" + ns2="ns2-$rndh" + + for netns in "$ns1" "$ns2";do + ip netns add $netns || exit $ksft_skip + ip -net $netns link set lo up + ip netns exec $netns sysctl -q net.mptcp.enabled=1 + ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0 + ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0 + done + + # ns1 ns2 + # ns1eth1 ns2eth1 + # ns1eth2 ns2eth2 + # ns1eth3 ns2eth3 + # ns1eth4 ns2eth4 + + for i in `seq 1 4`; do + ip link add ns1eth$i netns "$ns1" type veth peer name ns2eth$i netns "$ns2" + ip -net "$ns1" addr add 10.0.$i.1/24 dev ns1eth$i + ip -net "$ns1" addr add dead:beef:$i::1/64 dev ns1eth$i nodad + ip -net "$ns1" link set ns1eth$i up + + ip -net "$ns2" addr add 10.0.$i.2/24 dev ns2eth$i + ip -net "$ns2" addr add dead:beef:$i::2/64 dev ns2eth$i nodad + ip -net "$ns2" link set ns2eth$i up + + # let $ns2 reach any $ns1 address from any interface + ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i + done +} + +cleanup_partial() +{ + rm -f "$capout" + + for netns in "$ns1" "$ns2"; do + ip netns del $netns + done +} + +cleanup() +{ + rm -f "$cin" "$cout" + rm -f "$sin" "$sout" + cleanup_partial +} + +reset() +{ + cleanup_partial + init +} + +for arg in "$@"; do + if [ "$arg" = "-c" ]; then + capture=1 + fi +done + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + + +check_transfer() +{ + in=$1 + out=$2 + what=$3 + + cmp "$in" "$out" > /dev/null 2>&1 + if [ $? -ne 0 ] ;then + echo "[ FAIL ] $what does not match (in, out):" + print_file_err "$in" + print_file_err "$out" + + return 1 + fi + + return 0 +} + +do_ping() +{ + listener_ns="$1" + connector_ns="$2" + connect_addr="$3" + + ip netns exec ${connector_ns} ping -q -c 1 $connect_addr >/dev/null + if [ $? -ne 0 ] ; then + echo "$listener_ns -> $connect_addr connectivity [ FAIL ]" 1>&2 + ret=1 + fi +} + +do_transfer() +{ + listener_ns="$1" + connector_ns="$2" + cl_proto="$3" + srv_proto="$4" + connect_addr="$5" + + port=$((10000+$TEST_COUNT)) + TEST_COUNT=$((TEST_COUNT+1)) + + :> "$cout" + :> "$sout" + :> "$capout" + + if [ $capture -eq 1 ]; then + if [ -z $SUDO_USER ] ; then + capuser="" + else + capuser="-Z $SUDO_USER" + fi + + capfile="mp_join-${listener_ns}.pcap" + + echo "Capturing traffic for test $TEST_COUNT into $capfile" + ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 & + cappid=$! + + sleep 1 + fi + + ip netns exec ${listener_ns} ./mptcp_connect -j -t $timeout -l -p $port -s ${srv_proto} 0.0.0.0 < "$sin" > "$sout" & + spid=$! + + sleep 1 + + ip netns exec ${connector_ns} ./mptcp_connect -j -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" & + cpid=$! + + wait $cpid + retc=$? + wait $spid + rets=$? 
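+ # stop a pending capture, then validate the exit codes and the transferred files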
+ + if [ $capture -eq 1 ]; then + sleep 1 + kill $cappid + fi + + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " client exit code $retc, server $rets" 1>&2 + echo "\nnetns ${listener_ns} socket stat for $port:" 1>&2 + ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port" + echo "\nnetns ${connector_ns} socket stat for $port:" 1>&2 + ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port" + + cat "$capout" + return 1 + fi + + check_transfer $sin $cout "file received by client" + retc=$? + check_transfer $cin $sout "file received by server" + rets=$? + + if [ $retc -eq 0 ] && [ $rets -eq 0 ];then + cat "$capout" + return 0 + fi + + cat "$capout" + return 1 +} + +make_file() +{ + name=$1 + who=$2 + + SIZE=1 + + dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null + echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name" + + echo "Created $name (size $SIZE KB) containing data sent by $who" +} + +run_tests() +{ + listener_ns="$1" + connector_ns="$2" + connect_addr="$3" + lret=0 + + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} + lret=$? + if [ $lret -ne 0 ]; then + ret=$lret + return + fi +} + +chk_join_nr() +{ + local msg="$1" + local syn_nr=$2 + local syn_ack_nr=$3 + local ack_nr=$4 + local count + local dump_stats + + printf "%-36s %s" "$msg" "syn" + count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinSynRx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$syn_nr" ]; then + echo "[fail] got $count JOIN[s] syn expected $syn_nr" + ret=1 + dump_stats=1 + else + echo -n "[ ok ]" + fi + + echo -n " - synack" + count=`ip netns exec $ns2 nstat -as | grep MPTcpExtMPJoinSynAckRx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$syn_ack_nr" ]; then + echo "[fail] got $count JOIN[s] synack expected $syn_ack_nr" + ret=1 + dump_stats=1 + else + echo -n "[ ok ]" + fi + + echo -n " - ack" + count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinAckRx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$ack_nr" ]; then + echo "[fail] got $count JOIN[s] ack expected $ack_nr" + ret=1 + dump_stats=1 + else + echo "[ ok ]" + fi + if [ "${dump_stats}" = 1 ]; then + echo Server ns stats + ip netns exec $ns1 nstat -as | grep MPTcp + echo Client ns stats + ip netns exec $ns2 nstat -as | grep MPTcp + fi +} + +sin=$(mktemp) +sout=$(mktemp) +cin=$(mktemp) +cout=$(mktemp) +init +make_file "$cin" "client" +make_file "$sin" "server" +trap cleanup EXIT + +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "no JOIN" "0" "0" "0" + +# subflow limted by client +reset +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "single subflow, limited by client" 0 0 0 + +# subflow limted by server +reset +ip netns exec $ns2 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "single subflow, limited by server" 1 1 0 + +# subflow +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "single subflow" 1 1 1 + +# multiple subflows +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "multiple subflows" 2 2 2 + +# multiple subflows limited by serverf +reset +ip netns exec 
$ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "multiple subflows, limited by server" 2 2 1 + +# add_address, unused +reset +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "unused signal address" 0 0 0 + +# accept and use add_addr +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 1 1 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "signal address" 1 1 1 + +# accept and use add_addr with an additional subflow +# note: signal address in server ns and local addresses in client ns must +# belong to different subnets or one of the listed local address could be +# used for 'add_addr' subflow +reset +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +ip netns exec $ns1 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl limits 1 2 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "subflow and signal" 2 2 2 + +# accept and use add_addr with additional subflows +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 3 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +ip netns exec $ns2 ./pm_nl_ctl limits 1 3 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "multiple subflows and signal" 3 3 3 + +exit $ret diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh new file mode 100755 index 000000000000..9172746b6cf0 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ksft_skip=4 +ret=0 + +usage() { + echo "Usage: $0 [ -h ]" +} + + +while getopts "$optstring" option;do + case "$option" in + "h") + usage $0 + exit 0 + ;; + "?") + usage $0 + exit 1 + ;; + esac +done + +sec=$(date +%s) +rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) +ns1="ns1-$rndh" +err=$(mktemp) +ret=0 + +cleanup() +{ + rm -f $out + ip netns del $ns1 +} + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add $ns1 || exit $ksft_skip +ip -net $ns1 link set lo up +ip netns exec $ns1 sysctl -q net.mptcp.enabled=1 + +check() +{ + local cmd="$1" + local expected="$2" + local msg="$3" + local out=`$cmd 2>$err` + local cmd_ret=$? 
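+ # stderr lands in $err and is dumped only if the command itself fails;
+ # otherwise stdout is compared verbatim against the expected string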
+ + printf "%-50s %s" "$msg" + if [ $cmd_ret -ne 0 ]; then + echo "[FAIL] command execution '$cmd' stderr " + cat $err + ret=1 + elif [ "$out" = "$expected" ]; then + echo "[ OK ]" + else + echo -n "[FAIL] " + echo "expected '$expected' got '$out'" + ret=1 + fi +} + +check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list" +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "defaults limits" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup +check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1" "simple add/get addr" + +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +"id 1 flags 10.0.1.1 +id 2 flags subflow dev lo 10.0.1.2 +id 3 flags signal,backup 10.0.1.3" "dump addrs" + +ip netns exec $ns1 ./pm_nl_ctl del 2 +check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr" +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +"id 1 flags 10.0.1.1 +id 3 flags signal,backup 10.0.1.3" "dump addrs after del" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 +check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 id 10 flags signal +check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4" "id addr increment" + +for i in `seq 5 9`; do + ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1 +done +check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit" +check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" + +for i in `seq 9 256`; do + ip netns exec $ns1 ./pm_nl_ctl del $i + ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 +done +check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 +id 3 flags signal,backup 10.0.1.3 +id 4 flags signal 10.0.1.4 +id 5 flags signal 10.0.1.5 +id 6 flags signal 10.0.1.6 +id 7 flags signal 10.0.1.7 +id 8 flags signal 10.0.1.8" "id limit" + +ip netns exec $ns1 ./pm_nl_ctl flush +check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" + +ip netns exec $ns1 ./pm_nl_ctl limits 9 1 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "rcv addrs above hard limit" + +ip netns exec $ns1 ./pm_nl_ctl limits 1 9 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "subflows above hard limit" + +ip netns exec $ns1 ./pm_nl_ctl limits 8 8 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8 +subflows 8" "set limits" + +exit $ret diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c new file mode 100644 index 000000000000..b24a2f17d415 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <errno.h> +#include <error.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/socket.h> +#include <sys/types.h> + +#include <arpa/inet.h> +#include <net/if.h> + +#include <linux/rtnetlink.h> +#include <linux/genetlink.h> + +#include "linux/mptcp.h" + +#ifndef MPTCP_PM_NAME +#define MPTCP_PM_NAME "mptcp_pm" +#endif + +static void syntax(char *argv[]) +{ + fprintf(stderr, "%s add|get|del|flush|dump|accept [<args>]\n", argv[0]); + fprintf(stderr, "\tadd [flags signal|subflow|backup] [id <nr>] [dev <name>] <ip>\n"); + fprintf(stderr, "\tdel <id>\n"); + fprintf(stderr, "\tget <id>\n"); + fprintf(stderr, "\tflush\n"); + fprintf(stderr, "\tdump\n"); + fprintf(stderr, 
"\tlimits [<rcv addr max> <subflow max>]\n"); + exit(0); +} + +static int init_genl_req(char *data, int family, int cmd, int version) +{ + struct nlmsghdr *nh = (void *)data; + struct genlmsghdr *gh; + int off = 0; + + nh->nlmsg_type = family; + nh->nlmsg_flags = NLM_F_REQUEST; + nh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + off += NLMSG_ALIGN(sizeof(*nh)); + + gh = (void *)(data + off); + gh->cmd = cmd; + gh->version = version; + off += NLMSG_ALIGN(sizeof(*gh)); + return off; +} + +static void nl_error(struct nlmsghdr *nh) +{ + struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(nh); + int len = nh->nlmsg_len - sizeof(*nh); + uint32_t off; + + if (len < sizeof(struct nlmsgerr)) + error(1, 0, "netlink error message truncated %d min %ld", len, + sizeof(struct nlmsgerr)); + + if (!err->error) { + /* check messages from kernel */ + struct rtattr *attrs = (struct rtattr *)NLMSG_DATA(nh); + + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == NLMSGERR_ATTR_MSG) + fprintf(stderr, "netlink ext ack msg: %s\n", + (char *)RTA_DATA(attrs)); + if (attrs->rta_type == NLMSGERR_ATTR_OFFS) { + memcpy(&off, RTA_DATA(attrs), 4); + fprintf(stderr, "netlink err off %d\n", + (int)off); + } + attrs = RTA_NEXT(attrs, len); + } + } else { + fprintf(stderr, "netlink error %d", err->error); + } +} + +/* do a netlink command and, if max > 0, fetch the reply */ +static int do_nl_req(int fd, struct nlmsghdr *nh, int len, int max) +{ + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; + socklen_t addr_len; + void *data = nh; + int rem, ret; + int err = 0; + + nh->nlmsg_len = len; + ret = sendto(fd, data, len, 0, (void *)&nladdr, sizeof(nladdr)); + if (ret != len) + error(1, errno, "send netlink: %uB != %uB\n", ret, len); + if (max == 0) + return 0; + + addr_len = sizeof(nladdr); + rem = ret = recvfrom(fd, data, max, 0, (void *)&nladdr, &addr_len); + if (ret < 0) + error(1, errno, "recv netlink: %uB\n", ret); + + /* Beware: the NLMSG_NEXT macro updates the 'rem' argument */ + for (; NLMSG_OK(nh, rem); nh = NLMSG_NEXT(nh, rem)) { + if (nh->nlmsg_type == NLMSG_ERROR) { + nl_error(nh); + err = 1; + } + } + if (err) + error(1, 0, "bailing out due to netlink error[s]"); + return ret; +} + +static int genl_parse_getfamily(struct nlmsghdr *nlh) +{ + struct genlmsghdr *ghdr = NLMSG_DATA(nlh); + int len = nlh->nlmsg_len; + struct rtattr *attrs; + + if (nlh->nlmsg_type != GENL_ID_CTRL) + error(1, errno, "Not a controller message, len=%d type=0x%x\n", + nlh->nlmsg_len, nlh->nlmsg_type); + + len -= NLMSG_LENGTH(GENL_HDRLEN); + + if (len < 0) + error(1, errno, "wrong controller message len %d\n", len); + + if (ghdr->cmd != CTRL_CMD_NEWFAMILY) + error(1, errno, "Unknown controller command %d\n", ghdr->cmd); + + attrs = (struct rtattr *) ((char *) ghdr + GENL_HDRLEN); + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == CTRL_ATTR_FAMILY_ID) + return *(__u16 *)RTA_DATA(attrs); + attrs = RTA_NEXT(attrs, len); + } + + error(1, errno, "can't find CTRL_ATTR_FAMILY_ID attr"); + return -1; +} + +static int resolve_mptcp_pm_netlink(int fd) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct nlmsghdr *nh; + struct rtattr *rta; + int namelen; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, GENL_ID_CTRL, CTRL_CMD_GETFAMILY, 0); + + rta = (void *)(data + off); + namelen = strlen(MPTCP_PM_NAME) + 1; + rta->rta_type = CTRL_ATTR_FAMILY_NAME; + rta->rta_len = RTA_LENGTH(namelen); + memcpy(RTA_DATA(rta), MPTCP_PM_NAME, 
namelen); + off += NLMSG_ALIGN(rta->rta_len); + + do_nl_req(fd, nh, off, sizeof(data)); + return genl_parse_getfamily((void *)data); +} + +int add_addr(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct rtattr *rta, *nest; + struct nlmsghdr *nh; + u_int16_t family; + u_int32_t flags; + int nest_start; + u_int8_t id; + int off = 0; + int arg; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_ADD_ADDR, + MPTCP_PM_VER); + + if (argc < 3) + syntax(argv); + + nest_start = off; + nest = (void *)(data + off); + nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR; + nest->rta_len = RTA_LENGTH(0); + off += NLMSG_ALIGN(nest->rta_len); + + /* addr data */ + rta = (void *)(data + off); + if (inet_pton(AF_INET, argv[2], RTA_DATA(rta))) { + family = AF_INET; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4; + rta->rta_len = RTA_LENGTH(4); + } else if (inet_pton(AF_INET6, argv[2], RTA_DATA(rta))) { + family = AF_INET6; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6; + rta->rta_len = RTA_LENGTH(16); + } else + error(1, errno, "can't parse ip %s", argv[2]); + off += NLMSG_ALIGN(rta->rta_len); + + /* family */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY; + rta->rta_len = RTA_LENGTH(2); + memcpy(RTA_DATA(rta), &family, 2); + off += NLMSG_ALIGN(rta->rta_len); + + for (arg = 3; arg < argc; arg++) { + if (!strcmp(argv[arg], "flags")) { + char *tok, *str; + + /* flags */ + flags = 0; + if (++arg >= argc) + error(1, 0, " missing flags value"); + + /* do not support flag list yet */ + for (str = argv[arg]; (tok = strtok(str, ",")); + str = NULL) { + if (!strcmp(tok, "subflow")) + flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW; + else if (!strcmp(tok, "signal")) + flags |= MPTCP_PM_ADDR_FLAG_SIGNAL; + else if (!strcmp(tok, "backup")) + flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + error(1, errno, + "unknown flag %s", argv[arg]); + } + + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_FLAGS; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &flags, 4); + off += NLMSG_ALIGN(rta->rta_len); + } else if (!strcmp(argv[arg], "id")) { + if (++arg >= argc) + error(1, 0, " missing id value"); + + id = atoi(argv[arg]); + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_ID; + rta->rta_len = RTA_LENGTH(1); + memcpy(RTA_DATA(rta), &id, 1); + off += NLMSG_ALIGN(rta->rta_len); + } else if (!strcmp(argv[arg], "dev")) { + int32_t ifindex; + + if (++arg >= argc) + error(1, 0, " missing dev name"); + + ifindex = if_nametoindex(argv[arg]); + if (!ifindex) + error(1, errno, "unknown device %s", argv[arg]); + + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_IF_IDX; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &ifindex, 4); + off += NLMSG_ALIGN(rta->rta_len); + } else + error(1, 0, "unknown keyword %s", argv[arg]); + } + nest->rta_len = off - nest_start; + + do_nl_req(fd, nh, off, 0); + return 0; +} + +int del_addr(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct rtattr *rta, *nest; + struct nlmsghdr *nh; + int nest_start; + u_int8_t id; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_DEL_ADDR, + MPTCP_PM_VER); + + /* the only argument is the address id */ + if (argc != 3) + syntax(argv); + + id = atoi(argv[2]); + 
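+ /* the address is carried in a nested MPTCP_PM_ATTR_ADDR attribute;
+  * the nest's rta_len is fixed up after its members are appended
+  */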
+ nest_start = off; + nest = (void *)(data + off); + nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR; + nest->rta_len = RTA_LENGTH(0); + off += NLMSG_ALIGN(nest->rta_len); + + /* build a dummy addr with only the ID set */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_ID; + rta->rta_len = RTA_LENGTH(1); + memcpy(RTA_DATA(rta), &id, 1); + off += NLMSG_ALIGN(rta->rta_len); + nest->rta_len = off - nest_start; + + do_nl_req(fd, nh, off, 0); + return 0; +} + +static void print_addr(struct rtattr *attrs, int len) +{ + uint16_t family = 0; + char str[1024]; + uint32_t flags; + uint8_t id; + + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FAMILY) + memcpy(&family, RTA_DATA(attrs), 2); + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR4) { + if (family != AF_INET) + error(1, errno, "wrong IP (v4) for family %d", + family); + inet_ntop(AF_INET, RTA_DATA(attrs), str, sizeof(str)); + printf("%s", str); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR6) { + if (family != AF_INET6) + error(1, errno, "wrong IP (v6) for family %d", + family); + inet_ntop(AF_INET6, RTA_DATA(attrs), str, sizeof(str)); + printf("%s", str); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ID) { + memcpy(&id, RTA_DATA(attrs), 1); + printf("id %d ", id); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FLAGS) { + memcpy(&flags, RTA_DATA(attrs), 4); + + printf("flags "); + if (flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + printf("signal"); + flags &= ~MPTCP_PM_ADDR_FLAG_SIGNAL; + if (flags) + printf(","); + } + + if (flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + printf("subflow"); + flags &= ~MPTCP_PM_ADDR_FLAG_SUBFLOW; + if (flags) + printf(","); + } + + if (flags & MPTCP_PM_ADDR_FLAG_BACKUP) { + printf("backup"); + flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + if (flags) + printf(","); + } + + /* bump unknown flags, if any */ + if (flags) + printf("0x%x", flags); + printf(" "); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_IF_IDX) { + char name[IF_NAMESIZE], *ret; + int32_t ifindex; + + memcpy(&ifindex, RTA_DATA(attrs), 4); + ret = if_indextoname(ifindex, name); + if (ret) + printf("dev %s ", ret); + else + printf("dev unknown/%d", ifindex); + } + + attrs = RTA_NEXT(attrs, len); + } + printf("\n"); +} + +static void print_addrs(struct nlmsghdr *nh, int pm_family, int total_len) +{ + struct rtattr *attrs; + + for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) { + int len = nh->nlmsg_len; + + if (nh->nlmsg_type == NLMSG_DONE) + break; + if (nh->nlmsg_type == NLMSG_ERROR) + nl_error(nh); + if (nh->nlmsg_type != pm_family) + continue; + + len -= NLMSG_LENGTH(GENL_HDRLEN); + attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) + + GENL_HDRLEN); + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == + (MPTCP_PM_ATTR_ADDR | NLA_F_NESTED)) + print_addr((void *)RTA_DATA(attrs), + attrs->rta_len); + attrs = RTA_NEXT(attrs, len); + } + } +} + +int get_addr(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct rtattr *rta, *nest; + struct nlmsghdr *nh; + int nest_start; + u_int8_t id; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR, + MPTCP_PM_VER); + + /* the only argument is the address id */ + if (argc != 3) + syntax(argv); + + id = atoi(argv[2]); + + nest_start = off; + nest = (void *)(data + off); + nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR; + nest->rta_len = RTA_LENGTH(0); + off += 
NLMSG_ALIGN(nest->rta_len); + + /* build a dummy addr with only the ID set */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_ID; + rta->rta_len = RTA_LENGTH(1); + memcpy(RTA_DATA(rta), &id, 1); + off += NLMSG_ALIGN(rta->rta_len); + nest->rta_len = off - nest_start; + + print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data))); + return 0; +} + +int dump_addrs(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + pid_t pid = getpid(); + struct nlmsghdr *nh; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR, + MPTCP_PM_VER); + nh->nlmsg_flags |= NLM_F_DUMP; + nh->nlmsg_seq = 1; + nh->nlmsg_pid = pid; + nh->nlmsg_len = off; + + print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data))); + return 0; +} + +int flush_addrs(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct nlmsghdr *nh; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_FLUSH_ADDRS, + MPTCP_PM_VER); + + do_nl_req(fd, nh, off, 0); + return 0; +} + +static void print_limits(struct nlmsghdr *nh, int pm_family, int total_len) +{ + struct rtattr *attrs; + uint32_t max; + + for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) { + int len = nh->nlmsg_len; + + if (nh->nlmsg_type == NLMSG_DONE) + break; + if (nh->nlmsg_type == NLMSG_ERROR) + nl_error(nh); + if (nh->nlmsg_type != pm_family) + continue; + + len -= NLMSG_LENGTH(GENL_HDRLEN); + attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) + + GENL_HDRLEN); + while (RTA_OK(attrs, len)) { + int type = attrs->rta_type; + + if (type != MPTCP_PM_ATTR_RCV_ADD_ADDRS && + type != MPTCP_PM_ATTR_SUBFLOWS) + goto next; + + memcpy(&max, RTA_DATA(attrs), 4); + printf("%s %u\n", type == MPTCP_PM_ATTR_SUBFLOWS ? 
+ "subflows" : "accept", max); + +next: + attrs = RTA_NEXT(attrs, len); + } + } +} + +int get_set_limits(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + uint32_t rcv_addr = 0, subflows = 0; + int cmd, len = sizeof(data); + struct nlmsghdr *nh; + int off = 0; + + /* limit */ + if (argc == 4) { + rcv_addr = atoi(argv[2]); + subflows = atoi(argv[3]); + cmd = MPTCP_PM_CMD_SET_LIMITS; + } else { + cmd = MPTCP_PM_CMD_GET_LIMITS; + } + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, cmd, MPTCP_PM_VER); + + /* limit */ + if (cmd == MPTCP_PM_CMD_SET_LIMITS) { + struct rtattr *rta = (void *)(data + off); + + rta->rta_type = MPTCP_PM_ATTR_RCV_ADD_ADDRS; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &rcv_addr, 4); + off += NLMSG_ALIGN(rta->rta_len); + + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ATTR_SUBFLOWS; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &subflows, 4); + off += NLMSG_ALIGN(rta->rta_len); + + /* do not expect a reply */ + len = 0; + } + + len = do_nl_req(fd, nh, off, len); + if (cmd == MPTCP_PM_CMD_GET_LIMITS) + print_limits(nh, pm_family, len); + return 0; +} + +int main(int argc, char *argv[]) +{ + int fd, pm_family; + + if (argc < 2) + syntax(argv); + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (fd == -1) + error(1, errno, "socket netlink"); + + pm_family = resolve_mptcp_pm_netlink(fd); + + if (!strcmp(argv[1], "add")) + return add_addr(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "del")) + return del_addr(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "flush")) + return flush_addrs(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "get")) + return get_addr(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "dump")) + return dump_addrs(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "limits")) + return get_set_limits(fd, pm_family, argc, argv); + + fprintf(stderr, "unknown sub-command: %s", argv[1]); + syntax(argv); + return 0; +} diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c new file mode 100644 index 000000000000..7b01b7c2ec10 --- /dev/null +++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Check if we can fully utilize 4-tuples for connect(). + * + * Rules to bind sockets to the same port when all ephemeral ports are + * exhausted. + * + * 1. if there are TCP_LISTEN sockets on the port, fail to bind. + * 2. if there are sockets without SO_REUSEADDR, fail to bind. + * 3. if SO_REUSEADDR is disabled, fail to bind. + * 4. if SO_REUSEADDR is enabled and SO_REUSEPORT is disabled, + * succeed to bind. + * 5. if SO_REUSEADDR and SO_REUSEPORT are enabled and + * there is no socket having the both options and the same EUID, + * succeed to bind. + * 6. fail to bind. 
+ * + * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp> + */ +#include <arpa/inet.h> +#include <netinet/in.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <unistd.h> +#include "../kselftest_harness.h" + +struct reuse_opts { + int reuseaddr[2]; + int reuseport[2]; +}; + +struct reuse_opts unreusable_opts[12] = { + {0, 0, 0, 0}, + {0, 0, 0, 1}, + {0, 0, 1, 0}, + {0, 0, 1, 1}, + {0, 1, 0, 0}, + {0, 1, 0, 1}, + {0, 1, 1, 0}, + {0, 1, 1, 1}, + {1, 0, 0, 0}, + {1, 0, 0, 1}, + {1, 0, 1, 0}, + {1, 0, 1, 1}, +}; + +struct reuse_opts reusable_opts[4] = { + {1, 1, 0, 0}, + {1, 1, 0, 1}, + {1, 1, 1, 0}, + {1, 1, 1, 1}, +}; + +int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport) +{ + struct sockaddr_in local_addr; + int len = sizeof(local_addr); + int fd, ret; + + fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_NE(-1, fd) TH_LOG("failed to open socket."); + + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(int)); + ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEADDR."); + + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(int)); + ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEPORT."); + + local_addr.sin_family = AF_INET; + local_addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + local_addr.sin_port = 0; + + if (bind(fd, (struct sockaddr *)&local_addr, len) == -1) { + close(fd); + return -1; + } + + return fd; +} + +TEST(reuseaddr_ports_exhausted_unreusable) +{ + struct reuse_opts *opts; + int i, j, fd[2]; + + for (i = 0; i < 12; i++) { + opts = &unreusable_opts[i]; + + for (j = 0; j < 2; j++) + fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]); + + ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind."); + EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind."); + + for (j = 0; j < 2; j++) + if (fd[j] != -1) + close(fd[j]); + } +} + +TEST(reuseaddr_ports_exhausted_reusable_same_euid) +{ + struct reuse_opts *opts; + int i, j, fd[2]; + + for (i = 0; i < 4; i++) { + opts = &reusable_opts[i]; + + for (j = 0; j < 2; j++) + fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]); + + ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind."); + + if (opts->reuseport[0] && opts->reuseport[1]) { + EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets succeed to be listened."); + } else { + EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind to connect to different destinations."); + } + + for (j = 0; j < 2; j++) + if (fd[j] != -1) + close(fd[j]); + } +} + +TEST(reuseaddr_ports_exhausted_reusable_different_euid) +{ + struct reuse_opts *opts; + int i, j, ret, fd[2]; + uid_t euid[2] = {10, 20}; + + for (i = 0; i < 4; i++) { + opts = &reusable_opts[i]; + + for (j = 0; j < 2; j++) { + ret = seteuid(euid[j]); + ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: %d.", euid[j]); + + fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]); + + ret = seteuid(0); + ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: 0."); + } + + ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind."); + EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind because one socket can be bound in each euid."); + + if (fd[1] != -1) { + ret = listen(fd[0], 5); + ASSERT_EQ(0, ret) TH_LOG("failed to listen."); + + ret = listen(fd[1], 5); + EXPECT_EQ(-1, ret) TH_LOG("should fail to listen because only one uid reserves the port in TCP_LISTEN."); + } + + for (j = 0; j < 2; j++) + if (fd[j] != -1) + close(fd[j]); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh 
b/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh new file mode 100755 index 000000000000..20e3a2913d06 --- /dev/null +++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Run tests when all ephemeral ports are exhausted. +# +# Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp> + +set +x +set -e + +readonly NETNS="ns-$(mktemp -u XXXXXX)" + +setup() { + ip netns add "${NETNS}" + ip -netns "${NETNS}" link set lo up + ip netns exec "${NETNS}" \ + sysctl -w net.ipv4.ip_local_port_range="32768 32768" \ + > /dev/null 2>&1 + ip netns exec "${NETNS}" \ + sysctl -w net.ipv4.ip_autobind_reuse=1 > /dev/null 2>&1 +} + +cleanup() { + ip netns del "${NETNS}" +} + +trap cleanup EXIT +setup + +do_test() { + ip netns exec "${NETNS}" ./reuseaddr_ports_exhausted +} + +do_test +echo "tests done" diff --git a/tools/testing/selftests/net/reuseport_addr_any.c b/tools/testing/selftests/net/reuseport_addr_any.c index c6233935fed1..b8475cb29be7 100644 --- a/tools/testing/selftests/net/reuseport_addr_any.c +++ b/tools/testing/selftests/net/reuseport_addr_any.c @@ -21,6 +21,10 @@ #include <sys/socket.h> #include <unistd.h> +#ifndef SOL_DCCP +#define SOL_DCCP 269 +#endif + static const char *IP4_ADDR = "127.0.0.1"; static const char *IP6_ADDR = "::1"; static const char *IP4_MAPPED6 = "::ffff:127.0.0.1"; diff --git a/tools/testing/selftests/networking/timestamping/rxtimestamp.c b/tools/testing/selftests/net/rxtimestamp.c index 6dee9e636a95..6dee9e636a95 100644 --- a/tools/testing/selftests/networking/timestamping/rxtimestamp.c +++ b/tools/testing/selftests/net/rxtimestamp.c diff --git a/tools/testing/selftests/networking/timestamping/timestamping.c b/tools/testing/selftests/net/timestamping.c index aca3491174a1..aca3491174a1 100644 --- a/tools/testing/selftests/networking/timestamping/timestamping.c +++ b/tools/testing/selftests/net/timestamping.c diff --git a/tools/testing/selftests/networking/timestamping/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index 7e386be47120..011b0da6b033 100644 --- a/tools/testing/selftests/networking/timestamping/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -41,6 +41,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/epoll.h> #include <sys/ioctl.h> #include <sys/select.h> #include <sys/socket.h> @@ -49,6 +50,10 @@ #include <time.h> #include <unistd.h> +#define NSEC_PER_USEC 1000L +#define USEC_PER_SEC 1000000L +#define NSEC_PER_SEC 1000000000LL + /* command line parameters */ static int cfg_proto = SOCK_STREAM; static int cfg_ipproto = IPPROTO_TCP; @@ -61,12 +66,16 @@ static int cfg_delay_snd; static int cfg_delay_ack; static bool cfg_show_payload; static bool cfg_do_pktinfo; +static bool cfg_busy_poll; +static int cfg_sleep_usec = 50 * 1000; static bool cfg_loop_nodata; -static bool cfg_no_delay; static bool cfg_use_cmsg; static bool cfg_use_pf_packet; +static bool cfg_use_epoll; +static bool cfg_epollet; static bool cfg_do_listen; static uint16_t dest_port = 9000; +static bool cfg_print_nsec; static struct sockaddr_in daddr; static struct sockaddr_in6 daddr6; @@ -75,11 +84,48 @@ static struct timespec ts_usr; static int saved_tskey = -1; static int saved_tskey_type = -1; +struct timing_event { + int64_t min; + int64_t max; + int64_t total; + int count; +}; + +static struct timing_event usr_enq; +static struct timing_event usr_snd; +static struct timing_event usr_ack; + static bool test_failed; +static int64_t timespec_to_ns64(struct timespec 
*ts) +{ + return ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec; +} + static int64_t timespec_to_us64(struct timespec *ts) { - return ts->tv_sec * 1000 * 1000 + ts->tv_nsec / 1000; + return ts->tv_sec * USEC_PER_SEC + ts->tv_nsec / NSEC_PER_USEC; +} + +static void init_timing_event(struct timing_event *te) +{ + te->min = INT64_MAX; + te->max = 0; + te->total = 0; + te->count = 0; +} + +static void add_timing_event(struct timing_event *te, + struct timespec *t_start, struct timespec *t_end) +{ + int64_t ts_delta = timespec_to_ns64(t_end) - timespec_to_ns64(t_start); + + te->count++; + if (ts_delta < te->min) + te->min = ts_delta; + if (ts_delta > te->max) + te->max = ts_delta; + te->total += ts_delta; } static void validate_key(int tskey, int tstype) @@ -113,25 +159,43 @@ static void validate_timestamp(struct timespec *cur, int min_delay) start64 = timespec_to_us64(&ts_usr); if (cur64 < start64 + min_delay || cur64 > start64 + max_delay) { - fprintf(stderr, "ERROR: delay %lu expected between %d and %d\n", + fprintf(stderr, "ERROR: %lu us expected between %d and %d\n", cur64 - start64, min_delay, max_delay); test_failed = true; } } +static void __print_ts_delta_formatted(int64_t ts_delta) +{ + if (cfg_print_nsec) + fprintf(stderr, "%lu ns", ts_delta); + else + fprintf(stderr, "%lu us", ts_delta / NSEC_PER_USEC); +} + static void __print_timestamp(const char *name, struct timespec *cur, uint32_t key, int payload_len) { + int64_t ts_delta; + if (!(cur->tv_sec | cur->tv_nsec)) return; - fprintf(stderr, " %s: %lu s %lu us (seq=%u, len=%u)", - name, cur->tv_sec, cur->tv_nsec / 1000, - key, payload_len); - - if (cur != &ts_usr) - fprintf(stderr, " (USR %+" PRId64 " us)", - timespec_to_us64(cur) - timespec_to_us64(&ts_usr)); + if (cfg_print_nsec) + fprintf(stderr, " %s: %lu s %lu ns (seq=%u, len=%u)", + name, cur->tv_sec, cur->tv_nsec, + key, payload_len); + else + fprintf(stderr, " %s: %lu s %lu us (seq=%u, len=%u)", + name, cur->tv_sec, cur->tv_nsec / NSEC_PER_USEC, + key, payload_len); + + if (cur != &ts_usr) { + ts_delta = timespec_to_ns64(cur) - timespec_to_ns64(&ts_usr); + fprintf(stderr, " (USR +"); + __print_ts_delta_formatted(ts_delta); + fprintf(stderr, ")"); + } fprintf(stderr, "\n"); } @@ -155,14 +219,17 @@ static void print_timestamp(struct scm_timestamping *tss, int tstype, case SCM_TSTAMP_SCHED: tsname = " ENQ"; validate_timestamp(&tss->ts[0], 0); + add_timing_event(&usr_enq, &ts_usr, &tss->ts[0]); break; case SCM_TSTAMP_SND: tsname = " SND"; validate_timestamp(&tss->ts[0], cfg_delay_snd); + add_timing_event(&usr_snd, &ts_usr, &tss->ts[0]); break; case SCM_TSTAMP_ACK: tsname = " ACK"; validate_timestamp(&tss->ts[0], cfg_delay_ack); + add_timing_event(&usr_ack, &ts_usr, &tss->ts[0]); break; default: error(1, 0, "unknown timestamp type: %u", @@ -171,6 +238,21 @@ static void print_timestamp(struct scm_timestamping *tss, int tstype, __print_timestamp(tsname, &tss->ts[0], tskey, payload_len); } +static void print_timing_event(char *name, struct timing_event *te) +{ + if (!te->count) + return; + + fprintf(stderr, " %s: count=%d", name, te->count); + fprintf(stderr, ", avg="); + __print_ts_delta_formatted((int64_t)(te->total / te->count)); + fprintf(stderr, ", min="); + __print_ts_delta_formatted(te->min); + fprintf(stderr, ", max="); + __print_ts_delta_formatted(te->max); + fprintf(stderr, "\n"); +} + /* TODO: convert to check_and_print payload once API is stable */ static void print_payload(char *data, int len) { @@ -198,6 +280,17 @@ static void print_pktinfo(int family, int ifindex, void 
*saddr, void *daddr) daddr ? inet_ntop(family, daddr, da, sizeof(da)) : "unknown"); } +static void __epoll(int epfd) +{ + struct epoll_event events; + int ret; + + memset(&events, 0, sizeof(events)); + ret = epoll_wait(epfd, &events, 1, cfg_poll_timeout); + if (ret != 1) + error(1, errno, "epoll_wait"); +} + static void __poll(int fd) { struct pollfd pollfd; @@ -391,7 +484,11 @@ static void do_test(int family, unsigned int report_opt) struct msghdr msg; struct iovec iov; char *buf; - int fd, i, val = 1, total_len; + int fd, i, val = 1, total_len, epfd = 0; + + init_timing_event(&usr_enq); + init_timing_event(&usr_snd); + init_timing_event(&usr_ack); total_len = cfg_payload_len; if (cfg_use_pf_packet || cfg_proto == SOCK_RAW) { @@ -418,6 +515,20 @@ static void do_test(int family, unsigned int report_opt) if (fd < 0) error(1, errno, "socket"); + if (cfg_use_epoll) { + struct epoll_event ev; + + memset(&ev, 0, sizeof(ev)); + ev.data.fd = fd; + if (cfg_epollet) + ev.events |= EPOLLET; + epfd = epoll_create(1); + if (epfd <= 0) + error(1, errno, "epoll_create"); + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev)) + error(1, errno, "epoll_ctl"); + } + /* reset expected key on each new socket */ saved_tskey = -1; @@ -525,19 +636,28 @@ static void do_test(int family, unsigned int report_opt) error(1, errno, "send"); /* wait for all errors to be queued, else ACKs arrive OOO */ - if (!cfg_no_delay) - usleep(50 * 1000); + if (cfg_sleep_usec) + usleep(cfg_sleep_usec); - __poll(fd); + if (!cfg_busy_poll) { + if (cfg_use_epoll) + __epoll(epfd); + else + __poll(fd); + } while (!recv_errmsg(fd)) {} } + print_timing_event("USR-ENQ", &usr_enq); + print_timing_event("USR-SND", &usr_snd); + print_timing_event("USR-ACK", &usr_ack); + if (close(fd)) error(1, errno, "close"); free(buf); - usleep(100 * 1000); + usleep(100 * NSEC_PER_USEC); } static void __attribute__((noreturn)) usage(const char *filepath) @@ -547,18 +667,22 @@ static void __attribute__((noreturn)) usage(const char *filepath) " -4: only IPv4\n" " -6: only IPv6\n" " -h: show this message\n" + " -b: busy poll to read from error queue\n" " -c N: number of packets for each test\n" " -C: use cmsg to set tstamp recording options\n" - " -D: no delay between packets\n" - " -F: poll() waits forever for an event\n" + " -e: use level-triggered epoll() instead of poll()\n" + " -E: use event-triggered epoll() instead of poll()\n" + " -F: poll()/epoll() waits forever for an event\n" " -I: request PKTINFO\n" " -l N: send N bytes at a time\n" " -L listen on hostname and port\n" " -n: set no-payload option\n" + " -N: print timestamps and durations in nsec (instead of usec)\n" " -p N: connect to port N\n" " -P: use PF_PACKET\n" " -r: use raw\n" " -R: use raw (IP_HDRINCL)\n" + " -S N: usec to sleep before reading error queue\n" " -u: use udp\n" " -v: validate SND delay (usec)\n" " -V: validate ACK delay (usec)\n" @@ -572,7 +696,8 @@ static void parse_opt(int argc, char **argv) int proto_count = 0; int c; - while ((c = getopt(argc, argv, "46c:CDFhIl:Lnp:PrRuv:V:x")) != -1) { + while ((c = getopt(argc, argv, + "46bc:CeEFhIl:LnNp:PrRS:uv:V:x")) != -1) { switch (c) { case '4': do_ipv6 = 0; @@ -580,15 +705,21 @@ static void parse_opt(int argc, char **argv) case '6': do_ipv4 = 0; break; + case 'b': + cfg_busy_poll = true; + break; case 'c': cfg_num_pkts = strtoul(optarg, NULL, 10); break; case 'C': cfg_use_cmsg = true; break; - case 'D': - cfg_no_delay = true; + case 'e': + cfg_use_epoll = true; break; + case 'E': + cfg_use_epoll = true; + cfg_epollet = true; case 'F': 
cfg_poll_timeout = -1; break; @@ -604,6 +735,9 @@ static void parse_opt(int argc, char **argv) case 'n': cfg_loop_nodata = true; break; + case 'N': + cfg_print_nsec = true; + break; case 'p': dest_port = strtoul(optarg, NULL, 10); break; @@ -623,6 +757,9 @@ static void parse_opt(int argc, char **argv) cfg_proto = SOCK_RAW; cfg_ipproto = IPPROTO_RAW; break; + case 'S': + cfg_sleep_usec = strtoul(optarg, NULL, 10); + break; case 'u': proto_count++; cfg_proto = SOCK_DGRAM; @@ -653,6 +790,8 @@ static void parse_opt(int argc, char **argv) error(1, 0, "pass -P, -r, -R or -u, not multiple"); if (cfg_do_pktinfo && cfg_use_pf_packet) error(1, 0, "cannot ask for pktinfo over pf_packet"); + if (cfg_busy_poll && cfg_use_epoll) + error(1, 0, "pass epoll or busy_poll, not both"); if (optind != argc - 1) error(1, 0, "missing required hostname argument"); diff --git a/tools/testing/selftests/networking/timestamping/txtimestamp.sh b/tools/testing/selftests/net/txtimestamp.sh index df0d86ca72b7..eea6f5193693 100755 --- a/tools/testing/selftests/networking/timestamping/txtimestamp.sh +++ b/tools/testing/selftests/net/txtimestamp.sh @@ -43,15 +43,40 @@ run_test_tcpudpraw() { } run_test_all() { + setup run_test_tcpudpraw # setsockopt run_test_tcpudpraw -C # cmsg run_test_tcpudpraw -n # timestamp w/o data + echo "OK. All tests passed" +} + +run_test_one() { + setup + ./txtimestamp $@ +} + +usage() { + echo "Usage: $0 [ -r | --run ] <txtimestamp args> | [ -h | --help ]" + echo " (no args) Run all tests" + echo " -r|--run Run an individual test with arguments" + echo " -h|--help Help" +} + +main() { + if [[ $# -eq 0 ]]; then + run_test_all + else + if [[ "$1" = "-r" || "$1" == "--run" ]]; then + shift + run_test_one $@ + else + usage + fi + fi } if [[ "$(ip netns identify)" == "root" ]]; then - ../../net/in_netns.sh $0 $@ + ./in_netns.sh $0 $@ else - setup - run_test_all - echo "OK. 
All tests passed" + main $@ fi diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 08194aa44006..9c0f758310fe 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -3,6 +3,10 @@ TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ - nft_concat_range.sh + nft_concat_range.sh \ + nft_queue.sh + +LDLIBS = -lmnl +TEST_GEN_FILES = nf-queue include ../lib.mk diff --git a/tools/testing/selftests/netfilter/config b/tools/testing/selftests/netfilter/config index 59caa8f71cd8..4faf2ce021d9 100644 --- a/tools/testing/selftests/netfilter/config +++ b/tools/testing/selftests/netfilter/config @@ -1,2 +1,8 @@ CONFIG_NET_NS=y CONFIG_NF_TABLES_INET=y +CONFIG_NFT_QUEUE=m +CONFIG_NFT_NAT=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NF_CT_NETLINK=m diff --git a/tools/testing/selftests/netfilter/nf-queue.c b/tools/testing/selftests/netfilter/nf-queue.c new file mode 100644 index 000000000000..29c73bce38fa --- /dev/null +++ b/tools/testing/selftests/netfilter/nf-queue.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <time.h> +#include <arpa/inet.h> + +#include <libmnl/libmnl.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_queue.h> + +struct options { + bool count_packets; + int verbose; + unsigned int queue_num; + unsigned int timeout; +}; + +static unsigned int queue_stats[5]; +static struct options opts; + +static void help(const char *p) +{ + printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num]\n", p); +} + +static int parse_attr_cb(const struct nlattr *attr, void *data) +{ + const struct nlattr **tb = data; + int type = mnl_attr_get_type(attr); + + /* skip unsupported attribute in user-space */ + if (mnl_attr_type_valid(attr, NFQA_MAX) < 0) + return MNL_CB_OK; + + switch (type) { + case NFQA_MARK: + case NFQA_IFINDEX_INDEV: + case NFQA_IFINDEX_OUTDEV: + case NFQA_IFINDEX_PHYSINDEV: + case NFQA_IFINDEX_PHYSOUTDEV: + if (mnl_attr_validate(attr, MNL_TYPE_U32) < 0) { + perror("mnl_attr_validate"); + return MNL_CB_ERROR; + } + break; + case NFQA_TIMESTAMP: + if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC, + sizeof(struct nfqnl_msg_packet_timestamp)) < 0) { + perror("mnl_attr_validate2"); + return MNL_CB_ERROR; + } + break; + case NFQA_HWADDR: + if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC, + sizeof(struct nfqnl_msg_packet_hw)) < 0) { + perror("mnl_attr_validate2"); + return MNL_CB_ERROR; + } + break; + case NFQA_PAYLOAD: + break; + } + tb[type] = attr; + return MNL_CB_OK; +} + +static int queue_cb(const struct nlmsghdr *nlh, void *data) +{ + struct nlattr *tb[NFQA_MAX+1] = { 0 }; + struct nfqnl_msg_packet_hdr *ph = NULL; + uint32_t id = 0; + + (void)data; + + mnl_attr_parse(nlh, sizeof(struct nfgenmsg), parse_attr_cb, tb); + if (tb[NFQA_PACKET_HDR]) { + ph = mnl_attr_get_payload(tb[NFQA_PACKET_HDR]); + id = ntohl(ph->packet_id); + + if (opts.verbose > 0) + printf("packet hook=%u, hwproto 0x%x", + ntohs(ph->hw_protocol), ph->hook); + + if (ph->hook >= 5) { + fprintf(stderr, "Unknown hook %d\n", ph->hook); + return MNL_CB_ERROR; + } + + if (opts.verbose > 0) { + uint32_t skbinfo = 0; + + if (tb[NFQA_SKB_INFO]) + skbinfo = ntohl(mnl_attr_get_u32(tb[NFQA_SKB_INFO])); + if (skbinfo & 
NFQA_SKB_CSUMNOTREADY) + printf(" csumnotready"); + if (skbinfo & NFQA_SKB_GSO) + printf(" gso"); + if (skbinfo & NFQA_SKB_CSUM_NOTVERIFIED) + printf(" csumnotverified"); + puts(""); + } + + if (opts.count_packets) + queue_stats[ph->hook]++; + } + + return MNL_CB_OK + id; +} + +static struct nlmsghdr * +nfq_build_cfg_request(char *buf, uint8_t command, int queue_num) +{ + struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf); + struct nfqnl_msg_config_cmd cmd = { + .command = command, + .pf = htons(AF_INET), + }; + struct nfgenmsg *nfg; + + nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG; + nlh->nlmsg_flags = NLM_F_REQUEST; + + nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); + + nfg->nfgen_family = AF_UNSPEC; + nfg->version = NFNETLINK_V0; + nfg->res_id = htons(queue_num); + + mnl_attr_put(nlh, NFQA_CFG_CMD, sizeof(cmd), &cmd); + + return nlh; +} + +static struct nlmsghdr * +nfq_build_cfg_params(char *buf, uint8_t mode, int range, int queue_num) +{ + struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf); + struct nfqnl_msg_config_params params = { + .copy_range = htonl(range), + .copy_mode = mode, + }; + struct nfgenmsg *nfg; + + nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG; + nlh->nlmsg_flags = NLM_F_REQUEST; + + nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); + nfg->nfgen_family = AF_UNSPEC; + nfg->version = NFNETLINK_V0; + nfg->res_id = htons(queue_num); + + mnl_attr_put(nlh, NFQA_CFG_PARAMS, sizeof(params), ¶ms); + + return nlh; +} + +static struct nlmsghdr * +nfq_build_verdict(char *buf, int id, int queue_num, int verd) +{ + struct nfqnl_msg_verdict_hdr vh = { + .verdict = htonl(verd), + .id = htonl(id), + }; + struct nlmsghdr *nlh; + struct nfgenmsg *nfg; + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_VERDICT; + nlh->nlmsg_flags = NLM_F_REQUEST; + nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); + nfg->nfgen_family = AF_UNSPEC; + nfg->version = NFNETLINK_V0; + nfg->res_id = htons(queue_num); + + mnl_attr_put(nlh, NFQA_VERDICT_HDR, sizeof(vh), &vh); + + return nlh; +} + +static void print_stats(void) +{ + unsigned int last, total; + int i; + + if (!opts.count_packets) + return; + + total = 0; + last = queue_stats[0]; + + for (i = 0; i < 5; i++) { + printf("hook %d packets %08u\n", i, queue_stats[i]); + last = queue_stats[i]; + total += last; + } + + printf("%u packets total\n", total); +} + +struct mnl_socket *open_queue(void) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + unsigned int queue_num; + struct mnl_socket *nl; + struct nlmsghdr *nlh; + struct timeval tv; + uint32_t flags; + + nl = mnl_socket_open(NETLINK_NETFILTER); + if (nl == NULL) { + perror("mnl_socket_open"); + exit(EXIT_FAILURE); + } + + if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) { + perror("mnl_socket_bind"); + exit(EXIT_FAILURE); + } + + queue_num = opts.queue_num; + nlh = nfq_build_cfg_request(buf, NFQNL_CFG_CMD_BIND, queue_num); + + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + perror("mnl_socket_sendto"); + exit(EXIT_FAILURE); + } + + nlh = nfq_build_cfg_params(buf, NFQNL_COPY_PACKET, 0xFFFF, queue_num); + + flags = NFQA_CFG_F_GSO | NFQA_CFG_F_UID_GID; + mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags)); + mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags)); + + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + perror("mnl_socket_sendto"); + exit(EXIT_FAILURE); + } + + memset(&tv, 0, sizeof(tv)); + tv.tv_sec = opts.timeout; + if (opts.timeout && setsockopt(mnl_socket_get_fd(nl), + SOL_SOCKET, SO_RCVTIMEO, + &tv, 
sizeof(tv))) { + perror("setsockopt(SO_RCVTIMEO)"); + exit(EXIT_FAILURE); + } + + return nl; +} + +static int mainloop(void) +{ + unsigned int buflen = 64 * 1024 + MNL_SOCKET_BUFFER_SIZE; + struct mnl_socket *nl; + struct nlmsghdr *nlh; + unsigned int portid; + char *buf; + int ret; + + buf = malloc(buflen); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + nl = open_queue(); + portid = mnl_socket_get_portid(nl); + + for (;;) { + uint32_t id; + + ret = mnl_socket_recvfrom(nl, buf, buflen); + if (ret == -1) { + if (errno == ENOBUFS) + continue; + + if (errno == EAGAIN) { + errno = 0; + ret = 0; + break; + } + + perror("mnl_socket_recvfrom"); + exit(EXIT_FAILURE); + } + + ret = mnl_cb_run(buf, ret, 0, portid, queue_cb, NULL); + if (ret < 0) { + perror("mnl_cb_run"); + exit(EXIT_FAILURE); + } + + id = ret - MNL_CB_OK; + nlh = nfq_build_verdict(buf, id, opts.queue_num, NF_ACCEPT); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + perror("mnl_socket_sendto"); + exit(EXIT_FAILURE); + } + } + + mnl_socket_close(nl); + + return ret; +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "chvt:q:")) != -1) { + switch (c) { + case 'c': + opts.count_packets = true; + break; + case 'h': + help(argv[0]); + exit(0); + break; + case 'q': + opts.queue_num = atoi(optarg); + if (opts.queue_num > 0xffff) + opts.queue_num = 0; + break; + case 't': + opts.timeout = atoi(optarg); + break; + case 'v': + opts.verbose++; + break; + } + } +} + +int main(int argc, char *argv[]) +{ + int ret; + + parse_opts(argc, argv); + + ret = mainloop(); + if (opts.count_packets) + print_stats(); + + return ret; +} diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh new file mode 100755 index 000000000000..6898448b4266 --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_queue.sh @@ -0,0 +1,332 @@ +#!/bin/bash +# +# This tests nf_queue: +# 1. can process packets from all hooks +# 2. support running nfqueue from more than one base chain +# +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +nsrouter="nsrouter-$sfx" + +cleanup() +{ + ip netns del ${ns1} + ip netns del ${ns2} + ip netns del ${nsrouter} + rm -f "$TMPFILE0" + rm -f "$TMPFILE1" +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add ${nsrouter} +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace" + exit $ksft_skip +fi + +TMPFILE0=$(mktemp) +TMPFILE1=$(mktemp) +trap cleanup EXIT + +ip netns add ${ns1} +ip netns add ${ns2} + +ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 +if [ $? 
-ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} + +ip -net ${nsrouter} link set lo up +ip -net ${nsrouter} link set veth0 up +ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 +ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 + +ip -net ${nsrouter} link set veth1 up +ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 +ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 + +ip -net ${ns1} link set lo up +ip -net ${ns1} link set eth0 up + +ip -net ${ns2} link set lo up +ip -net ${ns2} link set eth0 up + +ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 +ip -net ${ns1} addr add dead:1::99/64 dev eth0 +ip -net ${ns1} route add default via 10.0.1.1 +ip -net ${ns1} route add default via dead:1::1 + +ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 +ip -net ${ns2} addr add dead:2::99/64 dev eth0 +ip -net ${ns2} route add default via 10.0.2.1 +ip -net ${ns2} route add default via dead:2::1 + +load_ruleset() { + local name=$1 + local prio=$2 + +ip netns exec ${nsrouter} nft -f - <<EOF +table inet $name { + chain nfq { + ip protocol icmp queue bypass + icmpv6 type { "echo-request", "echo-reply" } queue num 1 bypass + } + chain pre { + type filter hook prerouting priority $prio; policy accept; + jump nfq + } + chain input { + type filter hook input priority $prio; policy accept; + jump nfq + } + chain forward { + type filter hook forward priority $prio; policy accept; + tcp dport 12345 queue num 2 + jump nfq + } + chain output { + type filter hook output priority $prio; policy accept; + tcp dport 12345 queue num 3 + jump nfq + } + chain post { + type filter hook postrouting priority $prio; policy accept; + jump nfq + } +} +EOF +} + +load_counter_ruleset() { + local prio=$1 + +ip netns exec ${nsrouter} nft -f - <<EOF +table inet countrules { + chain pre { + type filter hook prerouting priority $prio; policy accept; + counter + } + chain input { + type filter hook input priority $prio; policy accept; + counter + } + chain forward { + type filter hook forward priority $prio; policy accept; + counter + } + chain output { + type filter hook output priority $prio; policy accept; + counter + } + chain post { + type filter hook postrouting priority $prio; policy accept; + counter + } +} +EOF +} + +test_ping() { + ip netns exec ${ns1} ping -c 1 -q 10.0.2.99 > /dev/null + if [ $? -ne 0 ];then + return 1 + fi + + ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null + if [ $? -ne 0 ];then + return 1 + fi + + return 0 +} + +test_ping_router() { + ip netns exec ${ns1} ping -c 1 -q 10.0.2.1 > /dev/null + if [ $? -ne 0 ];then + return 1 + fi + + ip netns exec ${ns1} ping -c 1 -q dead:2::1 > /dev/null + if [ $? -ne 0 ];then + return 1 + fi + + return 0 +} + +test_queue_blackhole() { + local proto=$1 + +ip netns exec ${nsrouter} nft -f - <<EOF +table $proto blackh { + chain forward { + type filter hook forward priority 0; policy accept; + queue num 600 + } +} +EOF + if [ $proto = "ip" ] ;then + ip netns exec ${ns1} ping -c 1 -q 10.0.2.99 > /dev/null + lret=$? + elif [ $proto = "ip6" ]; then + ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null + lret=$? + else + lret=111 + fi + + # queue without bypass keyword should drop traffic if no listener exists. + if [ $lret -eq 0 ];then + echo "FAIL: $proto expected failure, got $lret" 1>&2 + exit 1 + fi + + ip netns exec ${nsrouter} nft delete table $proto blackh + if [ $? 
-ne 0 ] ;then + echo "FAIL: $proto: Could not delete blackh table" + exit 1 + fi + + echo "PASS: $proto: statement with no listener results in packet drop" +} + +test_queue() +{ + local expected=$1 + local last="" + + # spawn nf-queue listeners + ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t 3 > "$TMPFILE0" & + ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t 3 > "$TMPFILE1" & + sleep 1 + test_ping + ret=$? + if [ $ret -ne 0 ];then + echo "FAIL: netns routing/connectivity with active listener on queue $queue: $ret" 1>&2 + exit $ret + fi + + test_ping_router + ret=$? + if [ $ret -ne 0 ];then + echo "FAIL: netns router unreachable listener on queue $queue: $ret" 1>&2 + exit $ret + fi + + wait + ret=$? + + for file in $TMPFILE0 $TMPFILE1; do + last=$(tail -n1 "$file") + if [ x"$last" != x"$expected packets total" ]; then + echo "FAIL: Expected $expected packets total, but got $last" 1>&2 + cat "$file" 1>&2 + + ip netns exec ${nsrouter} nft list ruleset + exit 1 + fi + done + + echo "PASS: Expected and received $last" +} + +test_tcp_forward() +{ + ip netns exec ${nsrouter} ./nf-queue -q 2 -t 10 & + local nfqpid=$! + + tmpfile=$(mktemp) || exit 1 + dd conv=sparse status=none if=/dev/zero bs=1M count=100 of=$tmpfile + ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & + local rpid=$! + + sleep 1 + ip netns exec ${ns1} nc -w 5 10.0.2.99 12345 <"$tmpfile" >/dev/null & + + rm -f "$tmpfile" + + wait $rpid + wait $lpid + [ $? -eq 0 ] && echo "PASS: tcp and nfqueue in forward chain" +} + +test_tcp_localhost() +{ + tc -net "${nsrouter}" qdisc add dev lo root netem loss random 1% + + tmpfile=$(mktemp) || exit 1 + + dd conv=sparse status=none if=/dev/zero bs=1M count=900 of=$tmpfile + ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & + local rpid=$! + + ip netns exec ${nsrouter} ./nf-queue -q 3 -t 30 & + local nfqpid=$! + + sleep 1 + ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null + rm -f "$tmpfile" + + wait $rpid + [ $? -eq 0 ] && echo "PASS: tcp via loopback" +} + +ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + +load_ruleset "filter" 0 + +sleep 3 + +test_ping +ret=$? +if [ $ret -eq 0 ];then + # queue bypass works (rules were skipped, no listener) + echo "PASS: ${ns1} can reach ${ns2}" +else + echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 + exit $ret +fi + +test_queue_blackhole ip +test_queue_blackhole ip6 + +# dummy ruleset to add base chains between the +# queueing rules. We don't want the second reinject +# to re-execute the old hooks. +load_counter_ruleset 10 + +# we are hooking all: prerouting/input/forward/output/postrouting. +# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so: +# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply). +# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply. +# so we expect that userspace program receives 10 packets. +test_queue 10 + +# same. We queue to a second program as well. 
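# In other words, with both the "filter" (priority 0) and "filter2"
# (priority 20) tables in place, each hook traversal should queue the packet
# once per table, doubling the per-queue counts worked out above:
#   2 rulesets * (6 forwarded + 4 router-destined) = 20 packets on each queue,
# which is the value passed to test_queue below.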
+load_ruleset "filter2" 20 +test_queue 20 + +test_tcp_forward +test_tcp_localhost + +exit $ret diff --git a/tools/testing/selftests/networking/timestamping/.gitignore b/tools/testing/selftests/networking/timestamping/.gitignore deleted file mode 100644 index d9355035e746..000000000000 --- a/tools/testing/selftests/networking/timestamping/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -timestamping -rxtimestamp -txtimestamp -hwtstamp_config diff --git a/tools/testing/selftests/networking/timestamping/Makefile b/tools/testing/selftests/networking/timestamping/Makefile deleted file mode 100644 index 1de8bd8ccf5d..000000000000 --- a/tools/testing/selftests/networking/timestamping/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -CFLAGS += -I../../../../../usr/include - -TEST_GEN_FILES := hwtstamp_config rxtimestamp timestamping txtimestamp -TEST_PROGS := txtimestamp.sh - -all: $(TEST_PROGS) - -top_srcdir = ../../../../.. -KSFT_KHDR_INSTALL := 1 -include ../../lib.mk diff --git a/tools/testing/selftests/networking/timestamping/config b/tools/testing/selftests/networking/timestamping/config deleted file mode 100644 index a13e3169b0a4..000000000000 --- a/tools/testing/selftests/networking/timestamping/config +++ /dev/null @@ -1,2 +0,0 @@ -CONFIG_IFB=y -CONFIG_NET_SCH_NETEM=y diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index c3a49fb4d6f6..12810229fddc 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -12,7 +12,7 @@ # Returns 1 if the specified boot-parameter string tells rcutorture to # test CPU-hotplug operations. bootparam_hotplug_cpu () { - echo "$1" | grep -q "rcutorture\.onoff_" + echo "$1" | grep -q "torture\.onoff_" } # checkarg --argname argtype $# arg mustmatch cannotmatch diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh index 1871d00bccd7..6f50722f251f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh @@ -20,7 +20,9 @@ rundir="${1}" if test -z "$rundir" -o ! -d "$rundir" then + echo Directory "$rundir" not found. echo Usage: $0 directory + exit 1 fi editor=${EDITOR-vi} diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index e5edd5198725..0326f4a5ff9c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -13,6 +13,9 @@ # # Authors: Paul E. McKenney <paulmck@linux.ibm.com> +T=/tmp/kvm-recheck.sh.$$ +trap 'rm -f $T' 0 2 + PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH . functions.sh for rd in "$@" @@ -68,4 +71,16 @@ do fi done done -EDITOR=echo kvm-find-errors.sh "${@: -1}" > /dev/null 2>&1 +EDITOR=echo kvm-find-errors.sh "${@: -1}" > $T 2>&1 +ret=$? +builderrors="`tr ' ' '\012' < $T | grep -c '/Make.out.diags'`" +if test "$builderrors" -gt 0 +then + echo $builderrors runs with build errors. +fi +runerrors="`tr ' ' '\012' < $T | grep -c '/console.log.diags'`" +if test "$runerrors" -gt 0 +then + echo $runerrors runs with runtime errors. 
+fi +exit $ret diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 78d18ab8e954..2315e2ec12d6 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -39,7 +39,7 @@ TORTURE_TRUST_MAKE="" resdir="" configs="" cpus=0 -ds=`date +%Y.%m.%d-%H:%M:%S` +ds=`date +%Y.%m.%d-%H.%M.%S` jitter="-1" usage () { diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon index e19a444a0684..0e92d85313aa 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon @@ -3,3 +3,5 @@ CONFIG_PRINTK_TIME=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y CONFIG_KVM_GUEST=y +CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n +CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 new file mode 100644 index 000000000000..2debe7891aeb --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -0,0 +1,18 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=100 +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_TREE_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +#CHECK#CONFIG_PROVE_RCU=n +CONFIG_DEBUG_OBJECTS=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=n diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile new file mode 100644 index 000000000000..d585cc1948cc --- /dev/null +++ b/tools/testing/selftests/resctrl/Makefile @@ -0,0 +1,17 @@ +CC = $(CROSS_COMPILE)gcc +CFLAGS = -g -Wall +SRCS=$(wildcard *.c) +OBJS=$(SRCS:.c=.o) + +all: resctrl_tests + +$(OBJS): $(SRCS) + $(CC) $(CFLAGS) -c $(SRCS) + +resctrl_tests: $(OBJS) + $(CC) $(CFLAGS) -o $@ $^ + +.PHONY: clean + +clean: + $(RM) $(OBJS) resctrl_tests diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README new file mode 100644 index 000000000000..6e5a0ffa18e8 --- /dev/null +++ b/tools/testing/selftests/resctrl/README @@ -0,0 +1,53 @@ +resctrl_tests - resctrl file system test suit + +Authors: + Fenghua Yu <fenghua.yu@intel.com> + Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + +resctrl_tests tests various resctrl functionalities and interfaces including +both software and hardware. + +Currently it supports Memory Bandwidth Monitoring test and Memory Bandwidth +Allocation test on Intel RDT hardware. More tests will be added in the future. +And the test suit can be extended to cover AMD QoS and ARM MPAM hardware +as well. + +BUILD +----- + +Run "make" to build executable file "resctrl_tests". + +RUN +--- + +To use resctrl_tests, root or sudoer privileges are required. This is because +the test needs to mount resctrl file system and change contents in the file +system. + +Executing the test without any parameter will run all supported tests: + + sudo ./resctrl_tests + +OVERVIEW OF EXECUTION +--------------------- + +A test case has four stages: + + - setup: mount resctrl file system, create group, setup schemata, move test + process pids to tasks, start benchmark. + - execute: let benchmark run + - verify: get resctrl data and verify the data with another source, e.g. + perf event. + - teardown: umount resctrl and clear temporary files. 
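For illustration, the setup and verify stages correspond roughly to the
following manual steps against the resctrl interface (a sketch only; the real
group names, schemata lines and benchmark are taken from the test parameters):

  mount -t resctrl resctrl /sys/fs/resctrl
  mkdir /sys/fs/resctrl/c1                       # create a con_mon group
  echo "MB:0=100" > /sys/fs/resctrl/c1/schemata  # e.g. 100% memory b/w on domain 0
  echo <benchmark pid> > /sys/fs/resctrl/c1/tasks
  cat /sys/fs/resctrl/c1/mon_data/mon_L3_00/mbm_local_bytes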
+ +ARGUMENTS +--------- + +Parameter '-h' shows usage information. + +usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits] + -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM default benchmark is builtin fill_buf + -t test list: run tests specified in the test list, e.g. -t mbm, mba, cqm, cat + -n no_of_bits: run cache tests using specified no of bits in cache bit mask + -p cpu_no: specify CPU number to run the test. 1 is default + -h: help diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c new file mode 100644 index 000000000000..38dbf4962e33 --- /dev/null +++ b/tools/testing/selftests/resctrl/cache.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <stdint.h> +#include "resctrl.h" + +struct read_format { + __u64 nr; /* The number of events */ + struct { + __u64 value; /* The value of the event */ + } values[2]; +}; + +static struct perf_event_attr pea_llc_miss; +static struct read_format rf_cqm; +static int fd_lm; +char llc_occup_path[1024]; + +static void initialize_perf_event_attr(void) +{ + pea_llc_miss.type = PERF_TYPE_HARDWARE; + pea_llc_miss.size = sizeof(struct perf_event_attr); + pea_llc_miss.read_format = PERF_FORMAT_GROUP; + pea_llc_miss.exclude_kernel = 1; + pea_llc_miss.exclude_hv = 1; + pea_llc_miss.exclude_idle = 1; + pea_llc_miss.exclude_callchain_kernel = 1; + pea_llc_miss.inherit = 1; + pea_llc_miss.exclude_guest = 1; + pea_llc_miss.disabled = 1; +} + +static void ioctl_perf_event_ioc_reset_enable(void) +{ + ioctl(fd_lm, PERF_EVENT_IOC_RESET, 0); + ioctl(fd_lm, PERF_EVENT_IOC_ENABLE, 0); +} + +static int perf_event_open_llc_miss(pid_t pid, int cpu_no) +{ + fd_lm = perf_event_open(&pea_llc_miss, pid, cpu_no, -1, + PERF_FLAG_FD_CLOEXEC); + if (fd_lm == -1) { + perror("Error opening leader"); + ctrlc_handler(0, NULL, NULL); + return -1; + } + + return 0; +} + +static int initialize_llc_perf(void) +{ + memset(&pea_llc_miss, 0, sizeof(struct perf_event_attr)); + memset(&rf_cqm, 0, sizeof(struct read_format)); + + /* Initialize perf_event_attr structures for HW_CACHE_MISSES */ + initialize_perf_event_attr(); + + pea_llc_miss.config = PERF_COUNT_HW_CACHE_MISSES; + + rf_cqm.nr = 1; + + return 0; +} + +static int reset_enable_llc_perf(pid_t pid, int cpu_no) +{ + int ret = 0; + + ret = perf_event_open_llc_miss(pid, cpu_no); + if (ret < 0) + return ret; + + /* Start counters to log values */ + ioctl_perf_event_ioc_reset_enable(); + + return 0; +} + +/* + * get_llc_perf: llc cache miss through perf events + * @cpu_no: CPU number that the benchmark PID is binded to + * + * Perf events like HW_CACHE_MISSES could be used to validate number of + * cache lines allocated. + * + * Return: =0 on success. <0 on failure. + */ +static int get_llc_perf(unsigned long *llc_perf_miss) +{ + __u64 total_misses; + + /* Stop counters after one span to get miss rate */ + + ioctl(fd_lm, PERF_EVENT_IOC_DISABLE, 0); + + if (read(fd_lm, &rf_cqm, sizeof(struct read_format)) == -1) { + perror("Could not get llc misses through perf"); + + return -1; + } + + total_misses = rf_cqm.values[0].value; + + close(fd_lm); + + *llc_perf_miss = total_misses; + + return 0; +} + +/* + * Get LLC Occupancy as reported by RESCTRL FS + * For CQM, + * 1. If con_mon grp and mon grp given, then read from mon grp in + * con_mon grp + * 2. If only con_mon grp given, then read from con_mon grp + * 3. If both not given, then read from root con_mon grp + * For CAT, + * 1. 
If con_mon grp given, then read from it + * 2. If con_mon grp not given, then read from root con_mon grp + * + * Return: =0 on success. <0 on failure. + */ +static int get_llc_occu_resctrl(unsigned long *llc_occupancy) +{ + FILE *fp; + + fp = fopen(llc_occup_path, "r"); + if (!fp) { + perror("Failed to open results file"); + + return errno; + } + if (fscanf(fp, "%lu", llc_occupancy) <= 0) { + perror("Could not get llc occupancy"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * print_results_cache: the cache results are stored in a file + * @filename: file that stores the results + * @bm_pid: child pid that runs benchmark + * @llc_value: perf miss value / + * llc occupancy value reported by resctrl FS + * + * Return: 0 on success. non-zero on failure. + */ +static int print_results_cache(char *filename, int bm_pid, + unsigned long llc_value) +{ + FILE *fp; + + if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) { + printf("Pid: %d \t LLC_value: %lu\n", bm_pid, + llc_value); + } else { + fp = fopen(filename, "a"); + if (!fp) { + perror("Cannot open results file"); + + return errno; + } + fprintf(fp, "Pid: %d \t llc_value: %lu\n", bm_pid, llc_value); + fclose(fp); + } + + return 0; +} + +int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) +{ + unsigned long llc_perf_miss = 0, llc_occu_resc = 0, llc_value = 0; + int ret; + + /* + * Measure cache miss from perf. + */ + if (!strcmp(param->resctrl_val, "cat")) { + ret = get_llc_perf(&llc_perf_miss); + if (ret < 0) + return ret; + llc_value = llc_perf_miss; + } + + /* + * Measure llc occupancy from resctrl. + */ + if (!strcmp(param->resctrl_val, "cqm")) { + ret = get_llc_occu_resctrl(&llc_occu_resc); + if (ret < 0) + return ret; + llc_value = llc_occu_resc; + } + ret = print_results_cache(param->filename, bm_pid, llc_value); + if (ret) + return ret; + + return 0; +} + +/* + * cache_val: execute benchmark and measure LLC occupancy resctrl + * and perf cache miss for the benchmark + * @param: parameters passed to cache_val() + * + * Return: 0 on success. non-zero on failure. + */ +int cat_val(struct resctrl_val_param *param) +{ + int malloc_and_init_memory = 1, memflush = 1, operation = 0, ret = 0; + char *resctrl_val = param->resctrl_val; + pid_t bm_pid; + + if (strcmp(param->filename, "") == 0) + sprintf(param->filename, "stdio"); + + bm_pid = getpid(); + + /* Taskset benchmark to specified cpu */ + ret = taskset_benchmark(bm_pid, param->cpu_no); + if (ret) + return ret; + + /* Write benchmark to specified con_mon grp, mon_grp in resctrl FS*/ + ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp, + resctrl_val); + if (ret) + return ret; + + if ((strcmp(resctrl_val, "cat") == 0)) { + ret = initialize_llc_perf(); + if (ret) + return ret; + } + + /* Test runs until the callback setup() tells the test to stop. 
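 * (cat_setup() returns non-zero once NUM_OF_RUNS iterations have completed;
 * the loop below treats that as the normal termination condition and resets
 * ret to 0 instead of reporting an error.)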
*/ + while (1) { + if (strcmp(resctrl_val, "cat") == 0) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + ret = reset_enable_llc_perf(bm_pid, param->cpu_no); + if (ret) + break; + + if (run_fill_buf(param->span, malloc_and_init_memory, + memflush, operation, resctrl_val)) { + fprintf(stderr, "Error-running fill buffer\n"); + ret = -1; + break; + } + + sleep(1); + ret = measure_cache_vals(param, bm_pid); + if (ret) + break; + } else { + break; + } + } + + return ret; +} diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c new file mode 100644 index 000000000000..5da43767b973 --- /dev/null +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cache Allocation Technology (CAT) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" +#include <unistd.h> + +#define RESULT_FILE_NAME1 "result_cat1" +#define RESULT_FILE_NAME2 "result_cat2" +#define NUM_OF_RUNS 5 +#define MAX_DIFF_PERCENT 4 +#define MAX_DIFF 1000000 + +int count_of_bits; +char cbm_mask[256]; +unsigned long long_mask; +unsigned long cache_size; + +/* + * Change schemata. Write schemata to specified + * con_mon grp, mon_grp in resctrl FS. + * Run 5 times in order to get average values. + */ +static int cat_setup(int num, ...) +{ + struct resctrl_val_param *p; + char schemata[64]; + va_list param; + int ret = 0; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Run NUM_OF_RUNS times */ + if (p->num_of_runs >= NUM_OF_RUNS) + return -1; + + if (p->num_of_runs == 0) { + sprintf(schemata, "%lx", p->mask); + ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no, + p->resctrl_val); + } + p->num_of_runs++; + + return ret; +} + +static void show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, + unsigned long span) +{ + unsigned long allocated_cache_lines = span / 64; + unsigned long avg_llc_perf_miss = 0; + float diff_percent; + + avg_llc_perf_miss = sum_llc_perf_miss / (NUM_OF_RUNS - 1); + diff_percent = ((float)allocated_cache_lines - avg_llc_perf_miss) / + allocated_cache_lines * 100; + + printf("%sok CAT: cache miss rate within %d%%\n", + !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT ? + "not " : "", MAX_DIFF_PERCENT); + tests_run++; + printf("# Percent diff=%d\n", abs((int)diff_percent)); + printf("# Number of bits: %d\n", no_of_bits); + printf("# Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss); + printf("# Allocated cache lines: %lu\n", allocated_cache_lines); +} + +static int check_results(struct resctrl_val_param *param) +{ + char *token_array[8], temp[512]; + unsigned long sum_llc_perf_miss = 0; + int runs = 0, no_of_bits = 0; + FILE *fp; + + printf("# Checking for pass/fail\n"); + fp = fopen(param->filename, "r"); + if (!fp) { + perror("# Cannot open file"); + + return errno; + } + + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + /* + * Discard the first value which is inaccurate due to monitoring + * setup transition phase. 
+ */ + if (runs > 0) + sum_llc_perf_miss += strtoul(token_array[3], NULL, 0); + runs++; + } + + fclose(fp); + no_of_bits = count_bits(param->mask); + + show_cache_info(sum_llc_perf_miss, no_of_bits, param->span); + + return 0; +} + +void cat_test_cleanup(void) +{ + remove(RESULT_FILE_NAME1); + remove(RESULT_FILE_NAME2); +} + +int cat_perf_miss_val(int cpu_no, int n, char *cache_type) +{ + unsigned long l_mask, l_mask_1; + int ret, pipefd[2], sibling_cpu_no; + char pipe_message; + pid_t bm_pid; + + cache_size = 0; + + ret = remount_resctrlfs(true); + if (ret) + return ret; + + if (!validate_resctrl_feature_request("cat")) + return -1; + + /* Get default cbm mask for L3/L2 cache */ + ret = get_cbm_mask(cache_type); + if (ret) + return ret; + + long_mask = strtoul(cbm_mask, NULL, 16); + + /* Get L3/L2 cache size */ + ret = get_cache_size(cpu_no, cache_type, &cache_size); + if (ret) + return ret; + printf("cache size :%lu\n", cache_size); + + /* Get max number of bits from default-cabm mask */ + count_of_bits = count_bits(long_mask); + + if (n < 1 || n > count_of_bits - 1) { + printf("Invalid input value for no_of_bits n!\n"); + printf("Please Enter value in range 1 to %d\n", + count_of_bits - 1); + return -1; + } + + /* Get core id from same socket for running another thread */ + sibling_cpu_no = get_core_sibling(cpu_no); + if (sibling_cpu_no < 0) + return -1; + + struct resctrl_val_param param = { + .resctrl_val = "cat", + .cpu_no = cpu_no, + .mum_resctrlfs = 0, + .setup = cat_setup, + }; + + l_mask = long_mask >> n; + l_mask_1 = ~l_mask & long_mask; + + /* Set param values for parent thread which will be allocated bitmask + * with (max_bits - n) bits + */ + param.span = cache_size * (count_of_bits - n) / count_of_bits; + strcpy(param.ctrlgrp, "c2"); + strcpy(param.mongrp, "m2"); + strcpy(param.filename, RESULT_FILE_NAME2); + param.mask = l_mask; + param.num_of_runs = 0; + + if (pipe(pipefd)) { + perror("# Unable to create pipe"); + return errno; + } + + bm_pid = fork(); + + /* Set param values for child thread which will be allocated bitmask + * with n bits + */ + if (bm_pid == 0) { + param.mask = l_mask_1; + strcpy(param.ctrlgrp, "c1"); + strcpy(param.mongrp, "m1"); + param.span = cache_size * n / count_of_bits; + strcpy(param.filename, RESULT_FILE_NAME1); + param.num_of_runs = 0; + param.cpu_no = sibling_cpu_no; + } + + remove(param.filename); + + ret = cat_val(¶m); + if (ret) + return ret; + + ret = check_results(¶m); + if (ret) + return ret; + + if (bm_pid == 0) { + /* Tell parent that child is ready */ + close(pipefd[0]); + pipe_message = 1; + if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + close(pipefd[1]); + perror("# failed signaling parent process"); + return errno; + } + + close(pipefd[1]); + while (1) + ; + } else { + /* Parent waits for child to be ready. 
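 * (The child never exits on its own: after writing the ready byte it spins
 * in an endless loop, so the parent sends it SIGKILL once the byte is read.)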
*/ + close(pipefd[1]); + pipe_message = 0; + while (pipe_message != 1) { + if (read(pipefd[0], &pipe_message, + sizeof(pipe_message)) < sizeof(pipe_message)) { + perror("# failed reading from child process"); + break; + } + } + close(pipefd[0]); + kill(bm_pid, SIGKILL); + } + + cat_test_cleanup(); + if (bm_pid) + umount_resctrlfs(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c new file mode 100644 index 000000000000..c8756152bd61 --- /dev/null +++ b/tools/testing/selftests/resctrl/cqm_test.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cache Monitoring Technology (CQM) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" +#include <unistd.h> + +#define RESULT_FILE_NAME "result_cqm" +#define NUM_OF_RUNS 5 +#define MAX_DIFF 2000000 +#define MAX_DIFF_PERCENT 15 + +int count_of_bits; +char cbm_mask[256]; +unsigned long long_mask; +unsigned long cache_size; + +static int cqm_setup(int num, ...) +{ + struct resctrl_val_param *p; + va_list param; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Run NUM_OF_RUNS times */ + if (p->num_of_runs >= NUM_OF_RUNS) + return -1; + + p->num_of_runs++; + + return 0; +} + +static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, + unsigned long span) +{ + unsigned long avg_llc_occu_resc = 0; + float diff_percent; + long avg_diff = 0; + bool res; + + avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1); + avg_diff = (long)abs(span - avg_llc_occu_resc); + + diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100; + + if ((abs((int)diff_percent) <= MAX_DIFF_PERCENT) || + (abs(avg_diff) <= MAX_DIFF)) + res = true; + else + res = false; + + printf("%sok CQM: diff within %d, %d\%%\n", res ? 
"" : "not", + MAX_DIFF, (int)MAX_DIFF_PERCENT); + + printf("# diff: %ld\n", avg_diff); + printf("# percent diff=%d\n", abs((int)diff_percent)); + printf("# Results are displayed in (Bytes)\n"); + printf("# Number of bits: %d\n", no_of_bits); + printf("# Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); + printf("# llc_occu_exp (span): %lu\n", span); + + tests_run++; +} + +static int check_results(struct resctrl_val_param *param, int no_of_bits) +{ + char *token_array[8], temp[512]; + unsigned long sum_llc_occu_resc = 0; + int runs = 0; + FILE *fp; + + printf("# checking for pass/fail\n"); + fp = fopen(param->filename, "r"); + if (!fp) { + perror("# Error in opening file\n"); + + return errno; + } + + while (fgets(temp, 1024, fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + + /* Field 3 is llc occ resc value */ + if (runs > 0) + sum_llc_occu_resc += strtoul(token_array[3], NULL, 0); + runs++; + } + fclose(fp); + show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); + + return 0; +} + +void cqm_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) +{ + int ret, mum_resctrlfs; + + cache_size = 0; + mum_resctrlfs = 1; + + ret = remount_resctrlfs(mum_resctrlfs); + if (ret) + return ret; + + if (!validate_resctrl_feature_request("cqm")) + return -1; + + ret = get_cbm_mask("L3"); + if (ret) + return ret; + + long_mask = strtoul(cbm_mask, NULL, 16); + + ret = get_cache_size(cpu_no, "L3", &cache_size); + if (ret) + return ret; + printf("cache size :%lu\n", cache_size); + + count_of_bits = count_bits(long_mask); + + if (n < 1 || n > count_of_bits) { + printf("Invalid input value for numbr_of_bits n!\n"); + printf("Please Enter value in range 1 to %d\n", count_of_bits); + return -1; + } + + struct resctrl_val_param param = { + .resctrl_val = "cqm", + .ctrlgrp = "c1", + .mongrp = "m1", + .cpu_no = cpu_no, + .mum_resctrlfs = 0, + .filename = RESULT_FILE_NAME, + .mask = ~(long_mask << n) & long_mask, + .span = cache_size * n / count_of_bits, + .num_of_runs = 0, + .setup = cqm_setup, + }; + + if (strcmp(benchmark_cmd[0], "fill_buf") == 0) + sprintf(benchmark_cmd[1], "%lu", param.span); + + remove(RESULT_FILE_NAME); + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(¶m, n); + if (ret) + return ret; + + cqm_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c new file mode 100644 index 000000000000..79c611c99a3d --- /dev/null +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fill_buf benchmark + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <inttypes.h> +#include <malloc.h> +#include <string.h> + +#include "resctrl.h" + +#define CL_SIZE (64) +#define PAGE_SIZE (4 * 1024) +#define MB (1024 * 1024) + +static unsigned char *startptr; + +static void sb(void) +{ +#if defined(__i386) || defined(__x86_64) + asm volatile("sfence\n\t" + : : : "memory"); +#endif +} + +static void ctrl_handler(int signo) +{ + free(startptr); + printf("\nEnding\n"); + sb(); + exit(EXIT_SUCCESS); +} + +static void cl_flush(void *p) +{ +#if defined(__i386) || 
defined(__x86_64) + asm volatile("clflush (%0)\n\t" + : : "r"(p) : "memory"); +#endif +} + +static void mem_flush(void *p, size_t s) +{ + char *cp = (char *)p; + size_t i = 0; + + s = s / CL_SIZE; /* mem size in cache llines */ + + for (i = 0; i < s; i++) + cl_flush(&cp[i * CL_SIZE]); + + sb(); +} + +static void *malloc_and_init_memory(size_t s) +{ + uint64_t *p64; + size_t s64; + + void *p = memalign(PAGE_SIZE, s); + + p64 = (uint64_t *)p; + s64 = s / sizeof(uint64_t); + + while (s64 > 0) { + *p64 = (uint64_t)rand(); + p64 += (CL_SIZE / sizeof(uint64_t)); + s64 -= (CL_SIZE / sizeof(uint64_t)); + } + + return p; +} + +static int fill_one_span_read(unsigned char *start_ptr, unsigned char *end_ptr) +{ + unsigned char sum, *p; + + sum = 0; + p = start_ptr; + while (p < end_ptr) { + sum += *p; + p += (CL_SIZE / 2); + } + + return sum; +} + +static +void fill_one_span_write(unsigned char *start_ptr, unsigned char *end_ptr) +{ + unsigned char *p; + + p = start_ptr; + while (p < end_ptr) { + *p = '1'; + p += (CL_SIZE / 2); + } +} + +static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr, + char *resctrl_val) +{ + int ret = 0; + FILE *fp; + + while (1) { + ret = fill_one_span_read(start_ptr, end_ptr); + if (!strcmp(resctrl_val, "cat")) + break; + } + + /* Consume read result so that reading memory is not optimized out. */ + fp = fopen("/dev/null", "w"); + if (!fp) + perror("Unable to write to /dev/null"); + fprintf(fp, "Sum: %d ", ret); + fclose(fp); + + return 0; +} + +static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr, + char *resctrl_val) +{ + while (1) { + fill_one_span_write(start_ptr, end_ptr); + if (!strcmp(resctrl_val, "cat")) + break; + } + + return 0; +} + +static int +fill_cache(unsigned long long buf_size, int malloc_and_init, int memflush, + int op, char *resctrl_val) +{ + unsigned char *start_ptr, *end_ptr; + unsigned long long i; + int ret; + + if (malloc_and_init) + start_ptr = malloc_and_init_memory(buf_size); + else + start_ptr = malloc(buf_size); + + if (!start_ptr) + return -1; + + startptr = start_ptr; + end_ptr = start_ptr + buf_size; + + /* + * It's better to touch the memory once to avoid any compiler + * optimizations + */ + if (!malloc_and_init) { + for (i = 0; i < buf_size; i++) + *start_ptr++ = (unsigned char)rand(); + } + + start_ptr = startptr; + + /* Flush the memory before using to avoid "cache hot pages" effect */ + if (memflush) + mem_flush(start_ptr, buf_size); + + if (op == 0) + ret = fill_cache_read(start_ptr, end_ptr, resctrl_val); + else + ret = fill_cache_write(start_ptr, end_ptr, resctrl_val); + + if (ret) { + printf("\n Error in fill cache read/write...\n"); + return -1; + } + + free(startptr); + + return 0; +} + +int run_fill_buf(unsigned long span, int malloc_and_init_memory, + int memflush, int op, char *resctrl_val) +{ + unsigned long long cache_size = span; + int ret; + + /* set up ctrl-c handler */ + if (signal(SIGINT, ctrl_handler) == SIG_ERR) + printf("Failed to catch SIGINT!\n"); + if (signal(SIGHUP, ctrl_handler) == SIG_ERR) + printf("Failed to catch SIGHUP!\n"); + + ret = fill_cache(cache_size, malloc_and_init_memory, memflush, op, + resctrl_val); + if (ret) { + printf("\n Error in fill cache\n"); + return -1; + } + + return 0; +} diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c new file mode 100644 index 000000000000..7bf8eaa6204b --- /dev/null +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0 
+/* + * Memory Bandwidth Allocation (MBA) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" + +#define RESULT_FILE_NAME "result_mba" +#define NUM_OF_RUNS 5 +#define MAX_DIFF 300 +#define ALLOCATION_MAX 100 +#define ALLOCATION_MIN 10 +#define ALLOCATION_STEP 10 + +/* + * Change schemata percentage from 100 to 10%. Write schemata to specified + * con_mon grp, mon_grp in resctrl FS. + * For each allocation, run 5 times in order to get average values. + */ +static int mba_setup(int num, ...) +{ + static int runs_per_allocation, allocation = 100; + struct resctrl_val_param *p; + char allocation_str[64]; + va_list param; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + if (runs_per_allocation >= NUM_OF_RUNS) + runs_per_allocation = 0; + + /* Only set up schemata once every NUM_OF_RUNS of allocations */ + if (runs_per_allocation++ != 0) + return 0; + + if (allocation < ALLOCATION_MIN || allocation > ALLOCATION_MAX) + return -1; + + sprintf(allocation_str, "%d", allocation); + + write_schemata(p->ctrlgrp, allocation_str, p->cpu_no, p->resctrl_val); + allocation -= ALLOCATION_STEP; + + return 0; +} + +static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) +{ + int allocation, runs; + bool failed = false; + + printf("# Results are displayed in (MB)\n"); + /* Memory bandwidth from 100% down to 10% */ + for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP; + allocation++) { + unsigned long avg_bw_imc, avg_bw_resc; + unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + unsigned long avg_diff; + + /* + * The first run is discarded due to inaccurate value from + * phase transition. + */ + for (runs = NUM_OF_RUNS * allocation + 1; + runs < NUM_OF_RUNS * allocation + NUM_OF_RUNS ; runs++) { + sum_bw_imc += bw_imc[runs]; + sum_bw_resc += bw_resc[runs]; + } + + avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1); + avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1); + avg_diff = labs((long)(avg_bw_resc - avg_bw_imc)); + + printf("%sok MBA schemata percentage %u smaller than %d %%\n", + avg_diff > MAX_DIFF ? "not " : "", + ALLOCATION_MAX - ALLOCATION_STEP * allocation, + MAX_DIFF); + tests_run++; + printf("# avg_diff: %lu\n", avg_diff); + printf("# avg_bw_imc: %lu\n", avg_bw_imc); + printf("# avg_bw_resc: %lu\n", avg_bw_resc); + if (avg_diff > MAX_DIFF) + failed = true; + } + + printf("%sok schemata change using MBA%s\n", failed ? "not " : "", + failed ? 
" # at least one test failed" : ""); + tests_run++; +} + +static int check_results(void) +{ + char *token_array[8], output[] = RESULT_FILE_NAME, temp[512]; + unsigned long bw_imc[1024], bw_resc[1024]; + int runs; + FILE *fp; + + fp = fopen(output, "r"); + if (!fp) { + perror(output); + + return errno; + } + + runs = 0; + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + + /* Field 3 is perf imc value */ + bw_imc[runs] = strtoul(token_array[3], NULL, 0); + /* Field 5 is resctrl value */ + bw_resc[runs] = strtoul(token_array[5], NULL, 0); + runs++; + } + + fclose(fp); + + show_mba_info(bw_imc, bw_resc); + + return 0; +} + +void mba_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) +{ + struct resctrl_val_param param = { + .resctrl_val = "mba", + .ctrlgrp = "c1", + .mongrp = "m1", + .cpu_no = cpu_no, + .mum_resctrlfs = 1, + .filename = RESULT_FILE_NAME, + .bw_report = bw_report, + .setup = mba_setup + }; + int ret; + + remove(RESULT_FILE_NAME); + + if (!validate_resctrl_feature_request("mba")) + return -1; + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(); + if (ret) + return ret; + + mba_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c new file mode 100644 index 000000000000..4700f7453f81 --- /dev/null +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory Bandwidth Monitoring (MBM) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" + +#define RESULT_FILE_NAME "result_mbm" +#define MAX_DIFF 300 +#define NUM_OF_RUNS 5 + +static void +show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) +{ + unsigned long avg_bw_imc = 0, avg_bw_resc = 0; + unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + long avg_diff = 0; + int runs; + + /* + * Discard the first value which is inaccurate due to monitoring setup + * transition phase. + */ + for (runs = 1; runs < NUM_OF_RUNS ; runs++) { + sum_bw_imc += bw_imc[runs]; + sum_bw_resc += bw_resc[runs]; + } + + avg_bw_imc = sum_bw_imc / 4; + avg_bw_resc = sum_bw_resc / 4; + avg_diff = avg_bw_resc - avg_bw_imc; + + printf("%sok MBM: diff within %d%%\n", + labs(avg_diff) > MAX_DIFF ? 
"not " : "", MAX_DIFF); + tests_run++; + printf("# avg_diff: %lu\n", labs(avg_diff)); + printf("# Span (MB): %d\n", span); + printf("# avg_bw_imc: %lu\n", avg_bw_imc); + printf("# avg_bw_resc: %lu\n", avg_bw_resc); +} + +static int check_results(int span) +{ + unsigned long bw_imc[NUM_OF_RUNS], bw_resc[NUM_OF_RUNS]; + char temp[1024], *token_array[8]; + char output[] = RESULT_FILE_NAME; + int runs; + FILE *fp; + + printf("# Checking for pass/fail\n"); + + fp = fopen(output, "r"); + if (!fp) { + perror(output); + + return errno; + } + + runs = 0; + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int i = 0; + + while (token) { + token_array[i++] = token; + token = strtok(NULL, ":\t"); + } + + bw_resc[runs] = strtoul(token_array[5], NULL, 0); + bw_imc[runs] = strtoul(token_array[3], NULL, 0); + runs++; + } + + show_bw_info(bw_imc, bw_resc, span); + + fclose(fp); + + return 0; +} + +static int mbm_setup(int num, ...) +{ + struct resctrl_val_param *p; + static int num_of_runs; + va_list param; + int ret = 0; + + /* Run NUM_OF_RUNS times */ + if (num_of_runs++ >= NUM_OF_RUNS) + return -1; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Set up shemata with 100% allocation on the first run. */ + if (num_of_runs == 0) + ret = write_schemata(p->ctrlgrp, "100", p->cpu_no, + p->resctrl_val); + + return ret; +} + +void mbm_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) +{ + struct resctrl_val_param param = { + .resctrl_val = "mbm", + .ctrlgrp = "c1", + .mongrp = "m1", + .span = span, + .cpu_no = cpu_no, + .mum_resctrlfs = 1, + .filename = RESULT_FILE_NAME, + .bw_report = bw_report, + .setup = mbm_setup + }; + int ret; + + remove(RESULT_FILE_NAME); + + if (!validate_resctrl_feature_request("mbm")) + return -1; + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(span); + if (ret) + return ret; + + mbm_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h new file mode 100644 index 000000000000..39bf59c6b9c5 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define _GNU_SOURCE +#ifndef RESCTRL_H +#define RESCTRL_H +#include <stdio.h> +#include <stdarg.h> +#include <math.h> +#include <errno.h> +#include <sched.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <signal.h> +#include <dirent.h> +#include <stdbool.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/mount.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/select.h> +#include <sys/time.h> +#include <sys/eventfd.h> +#include <asm/unistd.h> +#include <linux/perf_event.h> + +#define MB (1024 * 1024) +#define RESCTRL_PATH "/sys/fs/resctrl" +#define PHYS_ID_PATH "/sys/devices/system/cpu/cpu" +#define CBM_MASK_PATH "/sys/fs/resctrl/info" + +#define PARENT_EXIT(err_msg) \ + do { \ + perror(err_msg); \ + kill(ppid, SIGKILL); \ + exit(EXIT_FAILURE); \ + } while (0) + +/* + * resctrl_val_param: resctrl test parameters + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. 
etc) + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number to which the benchmark would be binded + * @span: Memory bytes accessed in each benchmark iteration + * @mum_resctrlfs: Should the resctrl FS be remounted? + * @filename: Name of file to which the o/p should be written + * @bw_report: Bandwidth report type (reads vs writes) + * @setup: Call back function to setup test environment + */ +struct resctrl_val_param { + char *resctrl_val; + char ctrlgrp[64]; + char mongrp[64]; + int cpu_no; + unsigned long span; + int mum_resctrlfs; + char filename[64]; + char *bw_report; + unsigned long mask; + int num_of_runs; + int (*setup)(int num, ...); +}; + +pid_t bm_pid, ppid; +int tests_run; + +char llc_occup_path[1024]; +bool is_amd; + +bool check_resctrlfs_support(void); +int filter_dmesg(void); +int remount_resctrlfs(bool mum_resctrlfs); +int get_resource_id(int cpu_no, int *resource_id); +int umount_resctrlfs(void); +int validate_bw_report_request(char *bw_report); +bool validate_resctrl_feature_request(char *resctrl_val); +char *fgrep(FILE *inf, const char *str); +int taskset_benchmark(pid_t bm_pid, int cpu_no); +void run_benchmark(int signum, siginfo_t *info, void *ucontext); +int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, + char *resctrl_val); +int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, + char *resctrl_val); +int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, + int group_fd, unsigned long flags); +int run_fill_buf(unsigned long span, int malloc_and_init_memory, int memflush, + int op, char *resctrl_va); +int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param); +int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd); +void tests_cleanup(void); +void mbm_test_cleanup(void); +int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd); +void mba_test_cleanup(void); +int get_cbm_mask(char *cache_type); +int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size); +void ctrlc_handler(int signum, siginfo_t *info, void *ptr); +int cat_val(struct resctrl_val_param *param); +void cat_test_cleanup(void); +int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type); +int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd); +unsigned int count_bits(unsigned long n); +void cqm_test_cleanup(void); +int get_core_sibling(int cpu_no); +int measure_cache_vals(struct resctrl_val_param *param, int bm_pid); + +#endif /* RESCTRL_H */ diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c new file mode 100644 index 000000000000..425cc85ac883 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resctrl tests + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" + +#define BENCHMARK_ARGS 64 +#define BENCHMARK_ARG_SIZE 64 + +bool is_amd; + +void detect_amd(void) +{ + FILE *inf = fopen("/proc/cpuinfo", "r"); + char *res; + + if (!inf) + return; + + res = fgrep(inf, "vendor_id"); + + if (res) { + char *s = strchr(res, ':'); + + is_amd = s && !strcmp(s, ": AuthenticAMD\n"); + free(res); + } + fclose(inf); +} + +static void cmd_help(void) +{ + printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n 
no_of_bits]\n"); + printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM"); + printf("\t default benchmark is builtin fill_buf\n"); + printf("\t-t test list: run tests specified in the test list, "); + printf("e.g. -t mbm, mba, cqm, cat\n"); + printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); + printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n"); + printf("\t-h: help\n"); +} + +void tests_cleanup(void) +{ + mbm_test_cleanup(); + mba_test_cleanup(); + cqm_test_cleanup(); + cat_test_cleanup(); +} + +int main(int argc, char **argv) +{ + bool has_ben = false, mbm_test = true, mba_test = true, cqm_test = true; + int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5; + char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; + char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; + int ben_ind, ben_count; + bool cat_test = true; + + for (i = 0; i < argc; i++) { + if (strcmp(argv[i], "-b") == 0) { + ben_ind = i + 1; + ben_count = argc - ben_ind; + argc_new = ben_ind - 1; + has_ben = true; + break; + } + } + + while ((c = getopt(argc_new, argv, "ht:b:")) != -1) { + char *token; + + switch (c) { + case 't': + token = strtok(optarg, ","); + + mbm_test = false; + mba_test = false; + cqm_test = false; + cat_test = false; + while (token) { + if (!strcmp(token, "mbm")) { + mbm_test = true; + } else if (!strcmp(token, "mba")) { + mba_test = true; + } else if (!strcmp(token, "cqm")) { + cqm_test = true; + } else if (!strcmp(token, "cat")) { + cat_test = true; + } else { + printf("invalid argument\n"); + + return -1; + } + token = strtok(NULL, ":\t"); + } + break; + case 'p': + cpu_no = atoi(optarg); + break; + case 'n': + no_of_bits = atoi(optarg); + break; + case 'h': + cmd_help(); + + return 0; + default: + printf("invalid argument\n"); + + return -1; + } + } + + printf("TAP version 13\n"); + + /* + * Typically we need root privileges, because: + * 1. We write to resctrl FS + * 2. We execute perf commands + */ + if (geteuid() != 0) + printf("# WARNING: not running as root, tests may fail.\n"); + + /* Detect AMD vendor */ + detect_amd(); + + if (has_ben) { + /* Extract benchmark command from command line. */ + for (i = ben_ind; i < argc; i++) { + benchmark_cmd[i - ben_ind] = benchmark_cmd_area[i]; + sprintf(benchmark_cmd[i - ben_ind], "%s", argv[i]); + } + benchmark_cmd[ben_count] = NULL; + } else { + /* If no benchmark is given by "-b" argument, use fill_buf. */ + for (i = 0; i < 6; i++) + benchmark_cmd[i] = benchmark_cmd_area[i]; + + strcpy(benchmark_cmd[0], "fill_buf"); + sprintf(benchmark_cmd[1], "%d", span); + strcpy(benchmark_cmd[2], "1"); + strcpy(benchmark_cmd[3], "1"); + strcpy(benchmark_cmd[4], "0"); + strcpy(benchmark_cmd[5], ""); + benchmark_cmd[6] = NULL; + } + + sprintf(bw_report, "reads"); + sprintf(bm_type, "fill_buf"); + + check_resctrlfs_support(); + filter_dmesg(); + + if (!is_amd && mbm_test) { + printf("# Starting MBM BW change ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", "mba"); + res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); + printf("%sok MBM: bw change\n", res ? "not " : ""); + mbm_test_cleanup(); + tests_run++; + } + + if (!is_amd && mba_test) { + printf("# Starting MBA Schemata change ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[1], "%d", span); + res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); + printf("%sok MBA: schemata change\n", res ? 
"not " : ""); + mba_test_cleanup(); + tests_run++; + } + + if (cqm_test) { + printf("# Starting CQM test ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", "cqm"); + res = cqm_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); + printf("%sok CQM: test\n", res ? "not " : ""); + cqm_test_cleanup(); + tests_run++; + } + + if (cat_test) { + printf("# Starting CAT test ...\n"); + res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); + printf("%sok CAT: test\n", res ? "not " : ""); + tests_run++; + cat_test_cleanup(); + } + + printf("1..%d\n", tests_run); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c new file mode 100644 index 000000000000..520fea3606d1 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -0,0 +1,744 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory bandwidth monitoring and allocation library + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" + +#define UNCORE_IMC "uncore_imc" +#define READ_FILE_NAME "events/cas_count_read" +#define WRITE_FILE_NAME "events/cas_count_write" +#define DYN_PMU_PATH "/sys/bus/event_source/devices" +#define SCALE 0.00006103515625 +#define MAX_IMCS 20 +#define MAX_TOKENS 5 +#define READ 0 +#define WRITE 1 +#define CON_MON_MBM_LOCAL_BYTES_PATH \ + "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define CON_MBM_LOCAL_BYTES_PATH \ + "%s/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define MON_MBM_LOCAL_BYTES_PATH \ + "%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define MBM_LOCAL_BYTES_PATH \ + "%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define CON_MON_LCC_OCCUP_PATH \ + "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define CON_LCC_OCCUP_PATH \ + "%s/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define MON_LCC_OCCUP_PATH \ + "%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define LCC_OCCUP_PATH \ + "%s/mon_data/mon_L3_%02d/llc_occupancy" + +struct membw_read_format { + __u64 value; /* The value of the event */ + __u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ + __u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ + __u64 id; /* if PERF_FORMAT_ID */ +}; + +struct imc_counter_config { + __u32 type; + __u64 event; + __u64 umask; + struct perf_event_attr pe; + struct membw_read_format return_value; + int fd; +}; + +static char mbm_total_path[1024]; +static int imcs; +static struct imc_counter_config imc_counters_config[MAX_IMCS][2]; + +void membw_initialize_perf_event_attr(int i, int j) +{ + memset(&imc_counters_config[i][j].pe, 0, + sizeof(struct perf_event_attr)); + imc_counters_config[i][j].pe.type = imc_counters_config[i][j].type; + imc_counters_config[i][j].pe.size = sizeof(struct perf_event_attr); + imc_counters_config[i][j].pe.disabled = 1; + imc_counters_config[i][j].pe.inherit = 1; + imc_counters_config[i][j].pe.exclude_guest = 0; + imc_counters_config[i][j].pe.config = + imc_counters_config[i][j].umask << 8 | + imc_counters_config[i][j].event; + imc_counters_config[i][j].pe.sample_type = PERF_SAMPLE_IDENTIFIER; + imc_counters_config[i][j].pe.read_format = + PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; +} + +void membw_ioctl_perf_event_ioc_reset_enable(int i, int j) +{ + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_RESET, 0); + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_ENABLE, 0); +} + +void 
membw_ioctl_perf_event_ioc_disable(int i, int j) +{ + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_DISABLE, 0); +} + +/* + * get_event_and_umask: Parse config into event and umask + * @cas_count_cfg: Config + * @count: iMC number + * @op: Operation (read/write) + */ +void get_event_and_umask(char *cas_count_cfg, int count, bool op) +{ + char *token[MAX_TOKENS]; + int i = 0; + + strcat(cas_count_cfg, ","); + token[0] = strtok(cas_count_cfg, "=,"); + + for (i = 1; i < MAX_TOKENS; i++) + token[i] = strtok(NULL, "=,"); + + for (i = 0; i < MAX_TOKENS; i++) { + if (!token[i]) + break; + if (strcmp(token[i], "event") == 0) { + if (op == READ) + imc_counters_config[count][READ].event = + strtol(token[i + 1], NULL, 16); + else + imc_counters_config[count][WRITE].event = + strtol(token[i + 1], NULL, 16); + } + if (strcmp(token[i], "umask") == 0) { + if (op == READ) + imc_counters_config[count][READ].umask = + strtol(token[i + 1], NULL, 16); + else + imc_counters_config[count][WRITE].umask = + strtol(token[i + 1], NULL, 16); + } + } +} + +static int open_perf_event(int i, int cpu_no, int j) +{ + imc_counters_config[i][j].fd = + perf_event_open(&imc_counters_config[i][j].pe, -1, cpu_no, -1, + PERF_FLAG_FD_CLOEXEC); + + if (imc_counters_config[i][j].fd == -1) { + fprintf(stderr, "Error opening leader %llx\n", + imc_counters_config[i][j].pe.config); + + return -1; + } + + return 0; +} + +/* Get type and config (read and write) of an iMC counter */ +static int read_from_imc_dir(char *imc_dir, int count) +{ + char cas_count_cfg[1024], imc_counter_cfg[1024], imc_counter_type[1024]; + FILE *fp; + + /* Get type of iMC counter */ + sprintf(imc_counter_type, "%s%s", imc_dir, "type"); + fp = fopen(imc_counter_type, "r"); + if (!fp) { + perror("Failed to open imc counter type file"); + + return -1; + } + if (fscanf(fp, "%u", &imc_counters_config[count][READ].type) <= 0) { + perror("Could not get imc type"); + fclose(fp); + + return -1; + } + fclose(fp); + + imc_counters_config[count][WRITE].type = + imc_counters_config[count][READ].type; + + /* Get read config */ + sprintf(imc_counter_cfg, "%s%s", imc_dir, READ_FILE_NAME); + fp = fopen(imc_counter_cfg, "r"); + if (!fp) { + perror("Failed to open imc config file"); + + return -1; + } + if (fscanf(fp, "%s", cas_count_cfg) <= 0) { + perror("Could not get imc cas count read"); + fclose(fp); + + return -1; + } + fclose(fp); + + get_event_and_umask(cas_count_cfg, count, READ); + + /* Get write config */ + sprintf(imc_counter_cfg, "%s%s", imc_dir, WRITE_FILE_NAME); + fp = fopen(imc_counter_cfg, "r"); + if (!fp) { + perror("Failed to open imc config file"); + + return -1; + } + if (fscanf(fp, "%s", cas_count_cfg) <= 0) { + perror("Could not get imc cas count write"); + fclose(fp); + + return -1; + } + fclose(fp); + + get_event_and_umask(cas_count_cfg, count, WRITE); + + return 0; +} + +/* + * A system can have 'n' number of iMC (Integrated Memory Controller) + * counters, get that 'n'. For each iMC counter get it's type and config. + * Also, each counter has two configs, one for read and the other for write. + * A config again has two parts, event and umask. + * Enumerate all these details into an array of structures. + * + * Return: >= 0 on success. < 0 on failure. 
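+ *
+ * Counters are discovered by scanning DYN_PMU_PATH (/sys/bus/event_source/devices)
+ * for PMU entries whose name contains "uncore_imc"; each match is parsed by
+ * read_from_imc_dir().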
+ */ +static int num_of_imcs(void) +{ + unsigned int count = 0; + char imc_dir[512]; + struct dirent *ep; + int ret; + DIR *dp; + + dp = opendir(DYN_PMU_PATH); + if (dp) { + while ((ep = readdir(dp))) { + if (strstr(ep->d_name, UNCORE_IMC)) { + sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH, + ep->d_name); + ret = read_from_imc_dir(imc_dir, count); + if (ret) { + closedir(dp); + + return ret; + } + count++; + } + } + closedir(dp); + if (count == 0) { + perror("Unable find iMC counters!\n"); + + return -1; + } + } else { + perror("Unable to open PMU directory!\n"); + + return -1; + } + + return count; +} + +static int initialize_mem_bw_imc(void) +{ + int imc, j; + + imcs = num_of_imcs(); + if (imcs <= 0) + return imcs; + + /* Initialize perf_event_attr structures for all iMC's */ + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) + membw_initialize_perf_event_attr(imc, j); + } + + return 0; +} + +/* + * get_mem_bw_imc: Memory band width as reported by iMC counters + * @cpu_no: CPU number that the benchmark PID is binded to + * @bw_report: Bandwidth report type (reads, writes) + * + * Memory B/W utilized by a process on a socket can be calculated using + * iMC counters. Perf events are used to read these counters. + * + * Return: >= 0 on success. < 0 on failure. + */ +static float get_mem_bw_imc(int cpu_no, char *bw_report) +{ + float reads, writes, of_mul_read, of_mul_write; + int imc, j, ret; + + /* Start all iMC counters to log values (both read and write) */ + reads = 0, writes = 0, of_mul_read = 1, of_mul_write = 1; + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) { + ret = open_perf_event(imc, cpu_no, j); + if (ret) + return -1; + } + for (j = 0; j < 2; j++) + membw_ioctl_perf_event_ioc_reset_enable(imc, j); + } + + sleep(1); + + /* Stop counters after a second to get results (both read and write) */ + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) + membw_ioctl_perf_event_ioc_disable(imc, j); + } + + /* + * Get results which are stored in struct type imc_counter_config + * Take over flow into consideration before calculating total b/w + */ + for (imc = 0; imc < imcs; imc++) { + struct imc_counter_config *r = + &imc_counters_config[imc][READ]; + struct imc_counter_config *w = + &imc_counters_config[imc][WRITE]; + + if (read(r->fd, &r->return_value, + sizeof(struct membw_read_format)) == -1) { + perror("Couldn't get read b/w through iMC"); + + return -1; + } + + if (read(w->fd, &w->return_value, + sizeof(struct membw_read_format)) == -1) { + perror("Couldn't get write bw through iMC"); + + return -1; + } + + __u64 r_time_enabled = r->return_value.time_enabled; + __u64 r_time_running = r->return_value.time_running; + + if (r_time_enabled != r_time_running) + of_mul_read = (float)r_time_enabled / + (float)r_time_running; + + __u64 w_time_enabled = w->return_value.time_enabled; + __u64 w_time_running = w->return_value.time_running; + + if (w_time_enabled != w_time_running) + of_mul_write = (float)w_time_enabled / + (float)w_time_running; + reads += r->return_value.value * of_mul_read * SCALE; + writes += w->return_value.value * of_mul_write * SCALE; + } + + for (imc = 0; imc < imcs; imc++) { + close(imc_counters_config[imc][READ].fd); + close(imc_counters_config[imc][WRITE].fd); + } + + if (strcmp(bw_report, "reads") == 0) + return reads; + + if (strcmp(bw_report, "writes") == 0) + return writes; + + return (reads + writes); +} + +void set_mbm_path(const char *ctrlgrp, const char *mongrp, int resource_id) +{ + if (ctrlgrp && mongrp) + 
sprintf(mbm_total_path, CON_MON_MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, ctrlgrp, mongrp, resource_id); + else if (!ctrlgrp && mongrp) + sprintf(mbm_total_path, MON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + mongrp, resource_id); + else if (ctrlgrp && !mongrp) + sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + ctrlgrp, resource_id); + else if (!ctrlgrp && !mongrp) + sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + resource_id); +} + +/* + * initialize_mem_bw_resctrl: Appropriately populate "mbm_total_path" + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc) + */ +static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp, + int cpu_no, char *resctrl_val) +{ + int resource_id; + + if (get_resource_id(cpu_no, &resource_id) < 0) { + perror("Could not get resource_id"); + return; + } + + if (strcmp(resctrl_val, "mbm") == 0) + set_mbm_path(ctrlgrp, mongrp, resource_id); + + if ((strcmp(resctrl_val, "mba") == 0)) { + if (ctrlgrp) + sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, ctrlgrp, resource_id); + else + sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, resource_id); + } +} + +/* + * Get MBM Local bytes as reported by resctrl FS + * For MBM, + * 1. If con_mon grp and mon grp are given, then read from con_mon grp's mon grp + * 2. If only con_mon grp is given, then read from con_mon grp + * 3. If both are not given, then read from root con_mon grp + * For MBA, + * 1. If con_mon grp is given, then read from it + * 2. If con_mon grp is not given, then read from root con_mon grp + */ +static unsigned long get_mem_bw_resctrl(void) +{ + unsigned long mbm_total = 0; + FILE *fp; + + fp = fopen(mbm_total_path, "r"); + if (!fp) { + perror("Failed to open total bw file"); + + return -1; + } + if (fscanf(fp, "%lu", &mbm_total) <= 0) { + perror("Could not get mbm local bytes"); + fclose(fp); + + return -1; + } + fclose(fp); + + return mbm_total; +} + +pid_t bm_pid, ppid; + +void ctrlc_handler(int signum, siginfo_t *info, void *ptr) +{ + kill(bm_pid, SIGKILL); + umount_resctrlfs(); + tests_cleanup(); + printf("Ending\n\n"); + + exit(EXIT_SUCCESS); +} + +/* + * print_results_bw: the memory bandwidth results are stored in a file + * @filename: file that stores the results + * @bm_pid: child pid that runs benchmark + * @bw_imc: perf imc counter value + * @bw_resc: memory bandwidth value + * + * Return: 0 on success. non-zero on failure. 
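+ *
+ * Each record is a single line of the form
+ *   Pid: <pid> \t Mem_BW_iMC: <float> \t Mem_BW_resc: <MB> \t Difference: <diff>
+ * which check_results() later splits on ':' and '\t' to recover the iMC and
+ * resctrl bandwidth values.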
+ */ +static int print_results_bw(char *filename, int bm_pid, float bw_imc, + unsigned long bw_resc) +{ + unsigned long diff = fabs(bw_imc - bw_resc); + FILE *fp; + + if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) { + printf("Pid: %d \t Mem_BW_iMC: %f \t ", bm_pid, bw_imc); + printf("Mem_BW_resc: %lu \t Difference: %lu\n", bw_resc, diff); + } else { + fp = fopen(filename, "a"); + if (!fp) { + perror("Cannot open results file"); + + return errno; + } + if (fprintf(fp, "Pid: %d \t Mem_BW_iMC: %f \t Mem_BW_resc: %lu \t Difference: %lu\n", + bm_pid, bw_imc, bw_resc, diff) <= 0) { + fclose(fp); + perror("Could not log results."); + + return errno; + } + fclose(fp); + } + + return 0; +} + +static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num) +{ + if (strlen(ctrlgrp) && strlen(mongrp)) + sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH, RESCTRL_PATH, + ctrlgrp, mongrp, sock_num); + else if (!strlen(ctrlgrp) && strlen(mongrp)) + sprintf(llc_occup_path, MON_LCC_OCCUP_PATH, RESCTRL_PATH, + mongrp, sock_num); + else if (strlen(ctrlgrp) && !strlen(mongrp)) + sprintf(llc_occup_path, CON_LCC_OCCUP_PATH, RESCTRL_PATH, + ctrlgrp, sock_num); + else if (!strlen(ctrlgrp) && !strlen(mongrp)) + sprintf(llc_occup_path, LCC_OCCUP_PATH, RESCTRL_PATH, sock_num); +} + +/* + * initialize_llc_occu_resctrl: Appropriately populate "llc_occup_path" + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: cat, cqm.. etc) + */ +static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, + int cpu_no, char *resctrl_val) +{ + int resource_id; + + if (get_resource_id(cpu_no, &resource_id) < 0) { + perror("# Unable to resource_id"); + return; + } + + if (strcmp(resctrl_val, "cqm") == 0) + set_cqm_path(ctrlgrp, mongrp, resource_id); +} + +static int +measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) +{ + unsigned long bw_imc, bw_resc, bw_resc_end; + int ret; + + /* + * Measure memory bandwidth from resctrl and from + * another source which is perf imc value or could + * be something else if perf imc event is not available. + * Compare the two values to validate resctrl value. + * It takes 1sec to measure the data. + */ + bw_imc = get_mem_bw_imc(param->cpu_no, param->bw_report); + if (bw_imc <= 0) + return bw_imc; + + bw_resc_end = get_mem_bw_resctrl(); + if (bw_resc_end <= 0) + return bw_resc_end; + + bw_resc = (bw_resc_end - *bw_resc_start) / MB; + ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc); + if (ret) + return ret; + + *bw_resc_start = bw_resc_end; + + return 0; +} + +/* + * resctrl_val: execute benchmark and measure memory bandwidth on + * the benchmark + * @benchmark_cmd: benchmark command and its arguments + * @param: parameters passed to resctrl_val() + * + * Return: 0 on success. non-zero on failure. 
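+ *
+ * The parent forks the benchmark, pins it to @param->cpu_no, writes its PID
+ * into the requested resctrl group(s), and signals it with SIGUSR1 once the
+ * pipe handshake completes; measurements are then taken in a loop until the
+ * setup() callback reports that enough runs have been done.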
+ */ +int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) +{ + char *resctrl_val = param->resctrl_val; + unsigned long bw_resc_start = 0; + struct sigaction sigact; + int ret = 0, pipefd[2]; + char pipe_message = 0; + union sigval value; + + if (strcmp(param->filename, "") == 0) + sprintf(param->filename, "stdio"); + + if ((strcmp(resctrl_val, "mba")) == 0 || + (strcmp(resctrl_val, "mbm")) == 0) { + ret = validate_bw_report_request(param->bw_report); + if (ret) + return ret; + } + + ret = remount_resctrlfs(param->mum_resctrlfs); + if (ret) + return ret; + + /* + * If benchmark wasn't successfully started by child, then child should + * kill parent, so save parent's pid + */ + ppid = getpid(); + + if (pipe(pipefd)) { + perror("# Unable to create pipe"); + + return -1; + } + + /* + * Fork to start benchmark, save child's pid so that it can be killed + * when needed + */ + bm_pid = fork(); + if (bm_pid == -1) { + perror("# Unable to fork"); + + return -1; + } + + if (bm_pid == 0) { + /* + * Mask all signals except SIGUSR1, parent uses SIGUSR1 to + * start benchmark + */ + sigfillset(&sigact.sa_mask); + sigdelset(&sigact.sa_mask, SIGUSR1); + + sigact.sa_sigaction = run_benchmark; + sigact.sa_flags = SA_SIGINFO; + + /* Register for "SIGUSR1" signal from parent */ + if (sigaction(SIGUSR1, &sigact, NULL)) + PARENT_EXIT("Can't register child for signal"); + + /* Tell parent that child is ready */ + close(pipefd[0]); + pipe_message = 1; + if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + perror("# failed signaling parent process"); + close(pipefd[1]); + return -1; + } + close(pipefd[1]); + + /* Suspend child until delivery of "SIGUSR1" from parent */ + sigsuspend(&sigact.sa_mask); + + PARENT_EXIT("Child is done"); + } + + printf("# benchmark PID: %d\n", bm_pid); + + /* + * Register CTRL-C handler for parent, as it has to kill benchmark + * before exiting + */ + sigact.sa_sigaction = ctrlc_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = SA_SIGINFO; + if (sigaction(SIGINT, &sigact, NULL) || + sigaction(SIGHUP, &sigact, NULL)) { + perror("# sigaction"); + ret = errno; + goto out; + } + + value.sival_ptr = benchmark_cmd; + + /* Taskset benchmark to specified cpu */ + ret = taskset_benchmark(bm_pid, param->cpu_no); + if (ret) + goto out; + + /* Write benchmark to specified control&monitoring grp in resctrl FS */ + ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp, + resctrl_val); + if (ret) + goto out; + + if ((strcmp(resctrl_val, "mbm") == 0) || + (strcmp(resctrl_val, "mba") == 0)) { + ret = initialize_mem_bw_imc(); + if (ret) + goto out; + + initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, + param->cpu_no, resctrl_val); + } else if (strcmp(resctrl_val, "cqm") == 0) + initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp, + param->cpu_no, resctrl_val); + + /* Parent waits for child to be ready. */ + close(pipefd[1]); + while (pipe_message != 1) { + if (read(pipefd[0], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + perror("# failed reading message from child process"); + close(pipefd[0]); + goto out; + } + } + close(pipefd[0]); + + /* Signal child to start benchmark */ + if (sigqueue(bm_pid, SIGUSR1, value) == -1) { + perror("# sigqueue SIGUSR1 to child"); + ret = errno; + goto out; + } + + /* Give benchmark enough time to fully run */ + sleep(1); + + /* Test runs until the callback setup() tells the test to stop. 
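+	 * For MBM, for example, the callback is mbm_setup(), which returns
+	 * non-zero once NUM_OF_RUNS iterations have completed.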
*/ + while (1) { + if ((strcmp(resctrl_val, "mbm") == 0) || + (strcmp(resctrl_val, "mba") == 0)) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + + ret = measure_vals(param, &bw_resc_start); + if (ret) + break; + } else if (strcmp(resctrl_val, "cqm") == 0) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + sleep(1); + ret = measure_cache_vals(param, bm_pid); + if (ret) + break; + } else { + break; + } + } + +out: + kill(bm_pid, SIGKILL); + umount_resctrlfs(); + + return ret; +} diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c new file mode 100644 index 000000000000..19c0ec4045a4 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -0,0 +1,722 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Basic resctrl file system operations + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" + +int tests_run; + +static int find_resctrl_mount(char *buffer) +{ + FILE *mounts; + char line[256], *fs, *mntpoint; + + mounts = fopen("/proc/mounts", "r"); + if (!mounts) { + perror("/proc/mounts"); + return -ENXIO; + } + while (!feof(mounts)) { + if (!fgets(line, 256, mounts)) + break; + fs = strtok(line, " \t"); + if (!fs) + continue; + mntpoint = strtok(NULL, " \t"); + if (!mntpoint) + continue; + fs = strtok(NULL, " \t"); + if (!fs) + continue; + if (strcmp(fs, "resctrl")) + continue; + + fclose(mounts); + if (buffer) + strncpy(buffer, mntpoint, 256); + + return 0; + } + + fclose(mounts); + + return -ENOENT; +} + +char cbm_mask[256]; + +/* + * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl + * @mum_resctrlfs: Should the resctrl FS be remounted? + * + * If not mounted, mount it. + * If mounted and mum_resctrlfs then remount resctrl FS. + * If mounted and !mum_resctrlfs then noop + * + * Return: 0 on success, non-zero on failure + */ +int remount_resctrlfs(bool mum_resctrlfs) +{ + char mountpoint[256]; + int ret; + + ret = find_resctrl_mount(mountpoint); + if (ret) + strcpy(mountpoint, RESCTRL_PATH); + + if (!ret && mum_resctrlfs && umount(mountpoint)) { + printf("not ok unmounting \"%s\"\n", mountpoint); + perror("# umount"); + tests_run++; + } + + if (!ret && !mum_resctrlfs) + return 0; + + ret = mount("resctrl", RESCTRL_PATH, "resctrl", 0, NULL); + printf("%sok mounting resctrl to \"%s\"\n", ret ? "not " : "", + RESCTRL_PATH); + if (ret) + perror("# mount"); + + tests_run++; + + return ret; +} + +int umount_resctrlfs(void) +{ + if (umount(RESCTRL_PATH)) { + perror("# Unable to umount resctrl"); + + return errno; + } + + return 0; +} + +/* + * get_resource_id - Get socket number/l3 id for a specified CPU + * @cpu_no: CPU number + * @resource_id: Socket number or l3_id + * + * Return: >= 0 on success, < 0 on failure. 
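+ *
+ * On Intel the socket number is read from topology/physical_package_id;
+ * on AMD the L3 id from cache/index3/id is used instead.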
+ */ +int get_resource_id(int cpu_no, int *resource_id) +{ + char phys_pkg_path[1024]; + FILE *fp; + + if (is_amd) + sprintf(phys_pkg_path, "%s%d/cache/index3/id", + PHYS_ID_PATH, cpu_no); + else + sprintf(phys_pkg_path, "%s%d/topology/physical_package_id", + PHYS_ID_PATH, cpu_no); + + fp = fopen(phys_pkg_path, "r"); + if (!fp) { + perror("Failed to open physical_package_id"); + + return -1; + } + if (fscanf(fp, "%d", resource_id) <= 0) { + perror("Could not get socket number or l3 id"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * get_cache_size - Get cache size for a specified CPU + * @cpu_no: CPU number + * @cache_type: Cache level L2/L3 + * @cache_size: pointer to cache_size + * + * Return: = 0 on success, < 0 on failure. + */ +int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size) +{ + char cache_path[1024], cache_str[64]; + int length, i, cache_num; + FILE *fp; + + if (!strcmp(cache_type, "L3")) { + cache_num = 3; + } else if (!strcmp(cache_type, "L2")) { + cache_num = 2; + } else { + perror("Invalid cache level"); + return -1; + } + + sprintf(cache_path, "/sys/bus/cpu/devices/cpu%d/cache/index%d/size", + cpu_no, cache_num); + fp = fopen(cache_path, "r"); + if (!fp) { + perror("Failed to open cache size"); + + return -1; + } + if (fscanf(fp, "%s", cache_str) <= 0) { + perror("Could not get cache_size"); + fclose(fp); + + return -1; + } + fclose(fp); + + length = (int)strlen(cache_str); + + *cache_size = 0; + + for (i = 0; i < length; i++) { + if ((cache_str[i] >= '0') && (cache_str[i] <= '9')) + + *cache_size = *cache_size * 10 + (cache_str[i] - '0'); + + else if (cache_str[i] == 'K') + + *cache_size = *cache_size * 1024; + + else if (cache_str[i] == 'M') + + *cache_size = *cache_size * 1024 * 1024; + + else + break; + } + + return 0; +} + +#define CORE_SIBLINGS_PATH "/sys/bus/cpu/devices/cpu" + +/* + * get_cbm_mask - Get cbm mask for given cache + * @cache_type: Cache level L2/L3 + * + * Mask is stored in cbm_mask which is global variable. + * + * Return: = 0 on success, < 0 on failure. + */ +int get_cbm_mask(char *cache_type) +{ + char cbm_mask_path[1024]; + FILE *fp; + + sprintf(cbm_mask_path, "%s/%s/cbm_mask", CBM_MASK_PATH, cache_type); + + fp = fopen(cbm_mask_path, "r"); + if (!fp) { + perror("Failed to open cache level"); + + return -1; + } + if (fscanf(fp, "%s", cbm_mask) <= 0) { + perror("Could not get max cbm_mask"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * get_core_sibling - Get sibling core id from the same socket for given CPU + * @cpu_no: CPU number + * + * Return: > 0 on success, < 0 on failure. + */ +int get_core_sibling(int cpu_no) +{ + char core_siblings_path[1024], cpu_list_str[64]; + int sibling_cpu_no = -1; + FILE *fp; + + sprintf(core_siblings_path, "%s%d/topology/core_siblings_list", + CORE_SIBLINGS_PATH, cpu_no); + + fp = fopen(core_siblings_path, "r"); + if (!fp) { + perror("Failed to open core siblings path"); + + return -1; + } + if (fscanf(fp, "%s", cpu_list_str) <= 0) { + perror("Could not get core_siblings list"); + fclose(fp); + + return -1; + } + fclose(fp); + + char *token = strtok(cpu_list_str, "-,"); + + while (token) { + sibling_cpu_no = atoi(token); + /* Skipping core 0 as we don't want to run test on core 0 */ + if (sibling_cpu_no != 0) + break; + token = strtok(NULL, "-,"); + } + + return sibling_cpu_no; +} + +/* + * taskset_benchmark - Taskset PID (i.e. 
benchmark) to a specified cpu + * @bm_pid: PID that should be binded + * @cpu_no: CPU number at which the PID would be binded + * + * Return: 0 on success, non-zero on failure + */ +int taskset_benchmark(pid_t bm_pid, int cpu_no) +{ + cpu_set_t my_set; + + CPU_ZERO(&my_set); + CPU_SET(cpu_no, &my_set); + + if (sched_setaffinity(bm_pid, sizeof(cpu_set_t), &my_set)) { + perror("Unable to taskset benchmark"); + + return -1; + } + + return 0; +} + +/* + * run_benchmark - Run a specified benchmark or fill_buf (default benchmark) + * in specified signal. Direct benchmark stdio to /dev/null. + * @signum: signal number + * @info: signal info + * @ucontext: user context in signal handling + * + * Return: void + */ +void run_benchmark(int signum, siginfo_t *info, void *ucontext) +{ + int operation, ret, malloc_and_init_memory, memflush; + unsigned long span, buffer_span; + char **benchmark_cmd; + char resctrl_val[64]; + FILE *fp; + + benchmark_cmd = info->si_ptr; + + /* + * Direct stdio of child to /dev/null, so that only parent writes to + * stdio (console) + */ + fp = freopen("/dev/null", "w", stdout); + if (!fp) + PARENT_EXIT("Unable to direct benchmark status to /dev/null"); + + if (strcmp(benchmark_cmd[0], "fill_buf") == 0) { + /* Execute default fill_buf benchmark */ + span = strtoul(benchmark_cmd[1], NULL, 10); + malloc_and_init_memory = atoi(benchmark_cmd[2]); + memflush = atoi(benchmark_cmd[3]); + operation = atoi(benchmark_cmd[4]); + sprintf(resctrl_val, "%s", benchmark_cmd[5]); + + if (strcmp(resctrl_val, "cqm") != 0) + buffer_span = span * MB; + else + buffer_span = span; + + if (run_fill_buf(buffer_span, malloc_and_init_memory, memflush, + operation, resctrl_val)) + fprintf(stderr, "Error in running fill buffer\n"); + } else { + /* Execute specified benchmark */ + ret = execvp(benchmark_cmd[0], benchmark_cmd); + if (ret) + perror("wrong\n"); + } + + fclose(stdout); + PARENT_EXIT("Unable to run specified benchmark"); +} + +/* + * create_grp - Create a group only if one doesn't exist + * @grp_name: Name of the group + * @grp: Full path and name of the group + * @parent_grp: Full path and name of the parent group + * + * Return: 0 on success, non-zero on failure + */ +static int create_grp(const char *grp_name, char *grp, const char *parent_grp) +{ + int found_grp = 0; + struct dirent *ep; + DIR *dp; + + /* + * At this point, we are guaranteed to have resctrl FS mounted and if + * length of grp_name == 0, it means, user wants to use root con_mon + * grp, so do nothing + */ + if (strlen(grp_name) == 0) + return 0; + + /* Check if requested grp exists or not */ + dp = opendir(parent_grp); + if (dp) { + while ((ep = readdir(dp)) != NULL) { + if (strcmp(ep->d_name, grp_name) == 0) + found_grp = 1; + } + closedir(dp); + } else { + perror("Unable to open resctrl for group"); + + return -1; + } + + /* Requested grp doesn't exist, hence create it */ + if (found_grp == 0) { + if (mkdir(grp, 0) == -1) { + perror("Unable to create group"); + + return -1; + } + } + + return 0; +} + +static int write_pid_to_tasks(char *tasks, pid_t pid) +{ + FILE *fp; + + fp = fopen(tasks, "w"); + if (!fp) { + perror("Failed to open tasks file"); + + return -1; + } + if (fprintf(fp, "%d\n", pid) < 0) { + perror("Failed to wr pid to tasks file"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * write_bm_pid_to_resctrl - Write a PID (i.e. 
benchmark) to resctrl FS + * @bm_pid: PID that should be written + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc) + * + * If a con_mon grp is requested, create it and write pid to it, otherwise + * write pid to root con_mon grp. + * If a mon grp is requested, create it and write pid to it, otherwise + * pid is not written, this means that pid is in con_mon grp and hence + * should consult con_mon grp's mon_data directory for results. + * + * Return: 0 on success, non-zero on failure + */ +int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, + char *resctrl_val) +{ + char controlgroup[128], monitorgroup[512], monitorgroup_p[256]; + char tasks[1024]; + int ret = 0; + + if (strlen(ctrlgrp)) + sprintf(controlgroup, "%s/%s", RESCTRL_PATH, ctrlgrp); + else + sprintf(controlgroup, "%s", RESCTRL_PATH); + + /* Create control and monitoring group and write pid into it */ + ret = create_grp(ctrlgrp, controlgroup, RESCTRL_PATH); + if (ret) + goto out; + sprintf(tasks, "%s/tasks", controlgroup); + ret = write_pid_to_tasks(tasks, bm_pid); + if (ret) + goto out; + + /* Create mon grp and write pid into it for "mbm" and "cqm" test */ + if ((strcmp(resctrl_val, "cqm") == 0) || + (strcmp(resctrl_val, "mbm") == 0)) { + if (strlen(mongrp)) { + sprintf(monitorgroup_p, "%s/mon_groups", controlgroup); + sprintf(monitorgroup, "%s/%s", monitorgroup_p, mongrp); + ret = create_grp(mongrp, monitorgroup, monitorgroup_p); + if (ret) + goto out; + + sprintf(tasks, "%s/mon_groups/%s/tasks", + controlgroup, mongrp); + ret = write_pid_to_tasks(tasks, bm_pid); + if (ret) + goto out; + } + } + +out: + printf("%sok writing benchmark parameters to resctrl FS\n", + ret ? "not " : ""); + if (ret) + perror("# writing to resctrlfs"); + + tests_run++; + + return ret; +} + +/* + * write_schemata - Update schemata of a con_mon grp + * @ctrlgrp: Name of the con_mon grp + * @schemata: Schemata that should be updated to + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. 
etc) + * + * Update schemata of a con_mon grp *only* if requested resctrl feature is + * allocation type + * + * Return: 0 on success, non-zero on failure + */ +int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) +{ + char controlgroup[1024], schema[1024], reason[64]; + int resource_id, ret = 0; + FILE *fp; + + if ((strcmp(resctrl_val, "mba") != 0) && + (strcmp(resctrl_val, "cat") != 0) && + (strcmp(resctrl_val, "cqm") != 0)) + return -ENOENT; + + if (!schemata) { + printf("# Skipping empty schemata update\n"); + + return -1; + } + + if (get_resource_id(cpu_no, &resource_id) < 0) { + sprintf(reason, "Failed to get resource id"); + ret = -1; + + goto out; + } + + if (strlen(ctrlgrp) != 0) + sprintf(controlgroup, "%s/%s/schemata", RESCTRL_PATH, ctrlgrp); + else + sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); + + if (!strcmp(resctrl_val, "cat") || !strcmp(resctrl_val, "cqm")) + sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata); + if (strcmp(resctrl_val, "mba") == 0) + sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); + + fp = fopen(controlgroup, "w"); + if (!fp) { + sprintf(reason, "Failed to open control group"); + ret = -1; + + goto out; + } + + if (fprintf(fp, "%s\n", schema) < 0) { + sprintf(reason, "Failed to write schemata in control group"); + fclose(fp); + ret = -1; + + goto out; + } + fclose(fp); + +out: + printf("%sok Write schema \"%s\" to resctrl FS%s%s\n", + ret ? "not " : "", schema, ret ? " # " : "", + ret ? reason : ""); + tests_run++; + + return ret; +} + +bool check_resctrlfs_support(void) +{ + FILE *inf = fopen("/proc/filesystems", "r"); + DIR *dp; + char *res; + bool ret = false; + + if (!inf) + return false; + + res = fgrep(inf, "nodev\tresctrl\n"); + + if (res) { + ret = true; + free(res); + } + + fclose(inf); + + printf("%sok kernel supports resctrl filesystem\n", ret ? "" : "not "); + tests_run++; + + dp = opendir(RESCTRL_PATH); + printf("%sok resctrl mountpoint \"%s\" exists\n", + dp ? "" : "not ", RESCTRL_PATH); + if (dp) + closedir(dp); + tests_run++; + + printf("# resctrl filesystem %s mounted\n", + find_resctrl_mount(NULL) ? "not" : "is"); + + return ret; +} + +char *fgrep(FILE *inf, const char *str) +{ + char line[256]; + int slen = strlen(str); + + while (!feof(inf)) { + if (!fgets(line, 256, inf)) + break; + if (strncmp(line, str, slen)) + continue; + + return strdup(line); + } + + return NULL; +} + +/* + * validate_resctrl_feature_request - Check if requested feature is valid. 
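+ * The feature name is looked up in the "flags" line of /proc/cpuinfo.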
+ * @resctrl_val: Requested feature + * + * Return: 0 on success, non-zero on failure + */ +bool validate_resctrl_feature_request(char *resctrl_val) +{ + FILE *inf = fopen("/proc/cpuinfo", "r"); + bool found = false; + char *res; + + if (!inf) + return false; + + res = fgrep(inf, "flags"); + + if (res) { + char *s = strchr(res, ':'); + + found = s && !strstr(s, resctrl_val); + free(res); + } + fclose(inf); + + return found; +} + +int filter_dmesg(void) +{ + char line[1024]; + FILE *fp; + int pipefds[2]; + pid_t pid; + int ret; + + ret = pipe(pipefds); + if (ret) { + perror("pipe"); + return ret; + } + pid = fork(); + if (pid == 0) { + close(pipefds[0]); + dup2(pipefds[1], STDOUT_FILENO); + execlp("dmesg", "dmesg", NULL); + perror("executing dmesg"); + exit(1); + } + close(pipefds[1]); + fp = fdopen(pipefds[0], "r"); + if (!fp) { + perror("fdopen(pipe)"); + kill(pid, SIGTERM); + + return -1; + } + + while (fgets(line, 1024, fp)) { + if (strstr(line, "intel_rdt:")) + printf("# dmesg: %s", line); + if (strstr(line, "resctrl:")) + printf("# dmesg: %s", line); + } + fclose(fp); + waitpid(pid, NULL, 0); + + return 0; +} + +int validate_bw_report_request(char *bw_report) +{ + if (strcmp(bw_report, "reads") == 0) + return 0; + if (strcmp(bw_report, "writes") == 0) + return 0; + if (strcmp(bw_report, "nt-writes") == 0) { + strcpy(bw_report, "writes"); + return 0; + } + if (strcmp(bw_report, "total") == 0) + return 0; + + fprintf(stderr, "Requested iMC B/W report type unavailable\n"); + + return -1; +} + +int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, + int group_fd, unsigned long flags) +{ + int ret; + + ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, + group_fd, flags); + return ret; +} + +unsigned int count_bits(unsigned long n) +{ + unsigned int count = 0; + + while (n) { + count += n & 1; + n >>= 1; + } + + return count; +} diff --git a/tools/testing/selftests/seccomp/Makefile b/tools/testing/selftests/seccomp/Makefile index 1760b3e39730..0ebfe8b0e147 100644 --- a/tools/testing/selftests/seccomp/Makefile +++ b/tools/testing/selftests/seccomp/Makefile @@ -1,17 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -all: - -include ../lib.mk - -.PHONY: all clean - -BINARIES := seccomp_bpf seccomp_benchmark CFLAGS += -Wl,-no-as-needed -Wall +LDFLAGS += -lpthread -seccomp_bpf: seccomp_bpf.c ../kselftest_harness.h - $(CC) $(CFLAGS) $(LDFLAGS) $< -lpthread -o $@ - -TEST_PROGS += $(BINARIES) -EXTRA_CLEAN := $(BINARIES) - -all: $(BINARIES) +TEST_GEN_PROGS := seccomp_bpf seccomp_benchmark +include ../lib.mk diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index ee1b727ede04..89fb3e0b552e 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -212,6 +212,10 @@ struct seccomp_notif_sizes { #define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001 #endif +#ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH +#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4) +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { @@ -909,7 +913,7 @@ TEST(ERRNO_order) EXPECT_EQ(12, errno); } -FIXTURE_DATA(TRAP) { +FIXTURE(TRAP) { struct sock_fprog prog; }; @@ -1020,7 +1024,7 @@ TEST_F(TRAP, handler) EXPECT_NE(0, (unsigned long)sigsys->_call_addr); } -FIXTURE_DATA(precedence) { +FIXTURE(precedence) { struct sock_fprog allow; struct sock_fprog log; struct sock_fprog trace; @@ -1509,7 +1513,7 @@ void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status, 
EXPECT_EQ(0, ret); } -FIXTURE_DATA(TRACE_poke) { +FIXTURE(TRACE_poke) { struct sock_fprog prog; pid_t tracer; long poked; @@ -1817,7 +1821,7 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, change_syscall(_metadata, tracee, -1, -ESRCH); } -FIXTURE_DATA(TRACE_syscall) { +FIXTURE(TRACE_syscall) { struct sock_fprog prog; pid_t tracer, mytid, mypid, parent; }; @@ -2187,7 +2191,8 @@ TEST(detect_seccomp_filter_flags) unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_SPEC_ALLOW, - SECCOMP_FILTER_FLAG_NEW_LISTENER }; + SECCOMP_FILTER_FLAG_NEW_LISTENER, + SECCOMP_FILTER_FLAG_TSYNC_ESRCH }; unsigned int exclusive[] = { SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_NEW_LISTENER }; @@ -2321,7 +2326,7 @@ struct tsync_sibling { } \ } while (0) -FIXTURE_DATA(TSYNC) { +FIXTURE(TSYNC) { struct sock_fprog root_prog, apply_prog; struct tsync_sibling sibling[TSYNC_SIBLINGS]; sem_t started; @@ -2645,6 +2650,55 @@ TEST_F(TSYNC, two_siblings_with_one_divergence) EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); } +TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err) +{ + long ret, flags; + void *status; + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!"); + } + self->sibling[0].diverge = 1; + tsync_start_sibling(&self->sibling[0]); + tsync_start_sibling(&self->sibling[1]); + + while (self->sibling_count < TSYNC_SIBLINGS) { + sem_wait(&self->started); + self->sibling_count++; + } + + flags = SECCOMP_FILTER_FLAG_TSYNC | \ + SECCOMP_FILTER_FLAG_TSYNC_ESRCH; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog); + ASSERT_EQ(ESRCH, errno) { + TH_LOG("Did not return ESRCH for diverged sibling."); + } + ASSERT_EQ(-1, ret) { + TH_LOG("Did not fail on diverged sibling."); + } + + /* Wake the threads */ + pthread_mutex_lock(&self->mutex); + ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) { + TH_LOG("cond broadcast non-zero"); + } + pthread_mutex_unlock(&self->mutex); + + /* Ensure they are both unkilled. 
*/ + PTHREAD_JOIN(self->sibling[0].tid, &status); + EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); + PTHREAD_JOIN(self->sibling[1].tid, &status); + EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); +} + TEST_F(TSYNC, two_siblings_not_under_filter) { long ret, sib; @@ -3196,6 +3250,24 @@ TEST(user_notification_basic) EXPECT_EQ(0, WEXITSTATUS(status)); } +TEST(user_notification_with_tsync) +{ + int ret; + unsigned int flags; + + /* these were exclusive */ + flags = SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_TSYNC; + ASSERT_EQ(-1, user_trap_syscall(__NR_getppid, flags)); + ASSERT_EQ(EINVAL, errno); + + /* but now they're not */ + flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH; + ret = user_trap_syscall(__NR_getppid, flags); + close(ret); + ASSERT_LE(0, ret); +} + TEST(user_notification_kill_in_middle) { pid_t pid; diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config index 477bc61b374a..c33a7aac27ff 100644 --- a/tools/testing/selftests/tc-testing/config +++ b/tools/testing/selftests/tc-testing/config @@ -31,6 +31,7 @@ CONFIG_NET_EMATCH_U32=m CONFIG_NET_EMATCH_META=m CONFIG_NET_EMATCH_TEXT=m CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_CANID=m CONFIG_NET_EMATCH_IPT=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=m @@ -57,3 +58,9 @@ CONFIG_NET_IFE_SKBMARK=m CONFIG_NET_IFE_SKBPRIO=m CONFIG_NET_IFE_SKBTCINDEX=m CONFIG_NET_SCH_FIFO=y +CONFIG_NET_SCH_ETS=m + +# +## Network testing +# +CONFIG_CAN=m diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json b/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json index 98a20faf3198..e788c114a484 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json @@ -372,5 +372,907 @@ "teardown": [ "$TC qdisc del dev $DEV1 ingress" ] + }, + { + "id": "bae4", + "name": "Add basic filter with u32 ematch u8/zero offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x11 0x0f at 0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(01000000/0f000000 at 0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "e6cb", + "name": "Add basic filter with u32 ematch u8/zero offset and invalid value >0xFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x1122 0x0f at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11220000/0f000000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "7727", + "name": "Add basic filter with u32 ematch u8/positive offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent 
ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0x1f at 12)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(17000000/1f000000 at 12\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "a429", + "name": "Add basic filter with u32 ematch u8/invalid mask >0xFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0xff00 at 12)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77000000/ff000000 at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "8373", + "name": "Add basic filter with u32 ematch u8/missing offset", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0xff at)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77000000 at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "ab8e", + "name": "Add basic filter with u32 ematch u8/missing AT keyword", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0xff 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77000000 at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "712d", + "name": "Add basic filter with u32 ematch u8/missing value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 at 12)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "350f", + "name": "Add basic filter with u32 ematch u8/non-numeric value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 zero 0xff at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip 
basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(00000000/ff000000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "e28f", + "name": "Add basic filter with u32 ematch u8/non-numeric mask", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x11 mask at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11000000/00000000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "6d5f", + "name": "Add basic filter with u32 ematch u8/negative offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0xaa 0xf0 at -14)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(0000a000/0000f000 at -16\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "12dc", + "name": "Add basic filter with u32 ematch u8/nexthdr+ offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0xaa 0xf0 at nexthdr+0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(a0000000/f0000000 at nexthdr\\+0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "1d85", + "name": "Add basic filter with u32 ematch u16/zero offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x1122 0xffff at 0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11220000/ffff0000 at 0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "3672", + "name": "Add basic filter with u32 ematch u16/zero offset and invalid value >0xFFFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x112233 0xffff at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent 
ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11223300/ffff0000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "7fb0", + "name": "Add basic filter with u32 ematch u16/positive offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0x1fff at 12)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(17880000/1fff0000 at 12\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "19af", + "name": "Add basic filter with u32 ematch u16/invalid mask >0xFFFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0xffffffff at 12)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77880000/ffffffff at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "446d", + "name": "Add basic filter with u32 ematch u16/missing offset", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0xffff at)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77880000 at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "151b", + "name": "Add basic filter with u32 ematch u16/missing AT keyword", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0xffff 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77880000/ffff0000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "bb23", + "name": "Add basic filter with u32 ematch u16/missing value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 at 12)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC 
qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "decc", + "name": "Add basic filter with u32 ematch u16/non-numeric value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 zero 0xffff at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(00000000/ffff0000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "e988", + "name": "Add basic filter with u32 ematch u16/non-numeric mask", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x1122 mask at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11220000/00000000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "07d8", + "name": "Add basic filter with u32 ematch u16/negative offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0xaabb 0xffff at -12)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aabb0000/ffff0000 at -12\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "f474", + "name": "Add basic filter with u32 ematch u16/nexthdr+ offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0xaabb 0xf0f0 at nexthdr+0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(a0b00000/f0f00000 at nexthdr\\+0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "47a0", + "name": "Add basic filter with u32 ematch u32/zero offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0xaabbccdd 0xffffffff at 0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aabbccdd/ffffffff at 0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + 
"id": "849f", + "name": "Add basic filter with u32 ematch u32/positive offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x11227788 0x1ffff0f0 at 12)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11227080/1ffff0f0 at 12\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "d288", + "name": "Add basic filter with u32 ematch u32/missing offset", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x11227788 0xffffffff at)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11227788/ffffffff at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "4998", + "name": "Add basic filter with u32 ematch u32/missing AT keyword", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x77889900 0xfffff0f0 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77889900/fffff0f0 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "1f0a", + "name": "Add basic filter with u32 ematch u32/missing value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 at 12)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "848e", + "name": "Add basic filter with u32 ematch u32/non-numeric value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 zero 0xffff at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(00000000/ffff0000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "f748", + "name": "Add basic filter with u32 ematch u32/non-numeric mask", + "category": [ + "filter", + 
"basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x11223344 mask at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11223344/00000000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "55a6", + "name": "Add basic filter with u32 ematch u32/negative offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0xaabbccdd 0xff00ff00 at -12)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aa00cc00/ff00ff00 at -12\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "7282", + "name": "Add basic filter with u32 ematch u32/nexthdr+ offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0xaabbccdd 0xffffffff at nexthdr+0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aabbccdd/ffffffff at nexthdr\\+0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "b2b6", + "name": "Add basic filter with canid ematch and single SFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 1)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0x1\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "f67f", + "name": "Add basic filter with canid ematch and single SFF with mask", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0xaabb:0x00ff)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0x2BB:0xFF\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "bd5c", + "name": "Add basic filter with canid ematch and multiple SFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC 
qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 1 sff 2 sff 3)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0x1 sff 0x2 sff 0x3\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "83c7", + "name": "Add basic filter with canid ematch and multiple SFF with masks", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0xaa:0x01 sff 0xbb:0x02 sff 0xcc:0x03)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0xAA:0x1 sff 0xBB:0x2 sff 0xCC:0x3\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "a8f5", + "name": "Add basic filter with canid ematch and single EFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 1)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x1\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "98ae", + "name": "Add basic filter with canid ematch and single EFF with mask", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 0xaabb:0xf1f1)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0xAABB:0xF1F1\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "6056", + "name": "Add basic filter with canid ematch and multiple EFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 1 eff 2 eff 3)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x1 eff 0x2 eff 0x3\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "d188", + "name": "Add basic filter with canid ematch and multiple EFF with masks", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: 
handle 1 protocol ip prio 1 basic match 'canid(eff 0xaa:0x01 eff 0xbb:0x02 eff 0xcc:0x03)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0xAA:0x1 eff 0xBB:0x2 eff 0xCC:0x3\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "25d1", + "name": "Add basic filter with canid ematch and a combination of SFF/EFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0x01 eff 0x02)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x2 sff 0x1\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "b438", + "name": "Add basic filter with canid ematch and a combination of SFF/EFF with masks", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0x01:0xf eff 0x02:0xf)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x2:0xF sff 0x1:0xF\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/red.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/red.json new file mode 100644 index 000000000000..0703a2a255eb --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/red.json @@ -0,0 +1,185 @@ +[ + { + "id": "8b6e", + "name": "Create RED with no flags", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "342e", + "name": "Create RED with adaptive flag", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red adaptive limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb adaptive $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "2d4b", + "name": "Create RED with ECN flag", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + 
"cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "650f", + "name": "Create RED with flags ECN, adaptive", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn adaptive limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn adaptive $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "5f15", + "name": "Create RED with flags ECN, harddrop", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn harddrop limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn harddrop $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "53e8", + "name": "Create RED with flags ECN, nodrop", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn nodrop limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn nodrop $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "d091", + "name": "Fail to create RED with only nodrop flag", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red nodrop limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red", + "matchCount": "0", + "teardown": [ + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "af8e", + "name": "Create RED with flags ECN, nodrop, harddrop", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn harddrop nodrop limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn harddrop nodrop $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + } +] diff --git a/tools/testing/selftests/timens/exec.c b/tools/testing/selftests/timens/exec.c index 87b47b557a7a..e40dc5be2f66 100644 --- 
a/tools/testing/selftests/timens/exec.c +++ b/tools/testing/selftests/timens/exec.c @@ -11,7 +11,6 @@ #include <sys/wait.h> #include <time.h> #include <unistd.h> -#include <time.h> #include <string.h> #include "log.h" diff --git a/tools/testing/selftests/timens/procfs.c b/tools/testing/selftests/timens/procfs.c index 43d93f4006b9..7f14f0fdac84 100644 --- a/tools/testing/selftests/timens/procfs.c +++ b/tools/testing/selftests/timens/procfs.c @@ -12,7 +12,6 @@ #include <sys/types.h> #include <time.h> #include <unistd.h> -#include <time.h> #include "log.h" #include "timens.h" diff --git a/tools/testing/selftests/timens/timens.c b/tools/testing/selftests/timens/timens.c index 559d26e21ba0..098be7c83be3 100644 --- a/tools/testing/selftests/timens/timens.c +++ b/tools/testing/selftests/timens/timens.c @@ -10,7 +10,6 @@ #include <sys/types.h> #include <time.h> #include <unistd.h> -#include <time.h> #include <string.h> #include "log.h" diff --git a/tools/testing/selftests/timens/timer.c b/tools/testing/selftests/timens/timer.c index 0cca7aafc4bd..96dba11ebe44 100644 --- a/tools/testing/selftests/timens/timer.c +++ b/tools/testing/selftests/timens/timer.c @@ -11,7 +11,6 @@ #include <stdio.h> #include <stdint.h> #include <signal.h> -#include <time.h> #include "log.h" #include "timens.h" diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 138d46b3f330..936e1ca9410e 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -527,11 +527,16 @@ n0 wg set wg0 peer "$pub2" allowed-ips 0.0.0.0/0 n0 wg set wg0 peer "$pub2" allowed-ips ::/0,1700::/111,5000::/4,e000::/37,9000::/75 n0 wg set wg0 peer "$pub2" allowed-ips ::/0 n0 wg set wg0 peer "$pub2" remove -low_order_points=( AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= AQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= 4Ot6fDtBuK4WVuP68Z/EatoJjeucMrH9hmIFFl9JuAA= X5yVvKNQjCSx0LFVnIPvWwREXMRYHI6G2CJO3dCfEVc= 7P///////////////////////////////////////38= 7f///////////////////////////////////////38= 7v///////////////////////////////////////38= ) -n0 wg set wg0 private-key /dev/null ${low_order_points[@]/#/peer } -[[ -z $(n0 wg show wg0 peers) ]] -n0 wg set wg0 private-key <(echo "$key1") ${low_order_points[@]/#/peer } -[[ -z $(n0 wg show wg0 peers) ]] +for low_order_point in AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= AQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= 4Ot6fDtBuK4WVuP68Z/EatoJjeucMrH9hmIFFl9JuAA= X5yVvKNQjCSx0LFVnIPvWwREXMRYHI6G2CJO3dCfEVc= 7P///////////////////////////////////////38= 7f///////////////////////////////////////38= 7v///////////////////////////////////////38=; do + n0 wg set wg0 peer "$low_order_point" persistent-keepalive 1 endpoint 127.0.0.1:1111 +done +[[ -n $(n0 wg show wg0 peers) ]] +exec 4< <(n0 ncat -l -u -p 1111) +ncat_pid=$! +waitncatudp $netns0 $ncat_pid +ip0 link set wg0 up +! 
read -r -n 1 -t 2 <&4 || false +kill $ncat_pid ip0 link del wg0 declare -A objects diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index 28d477683e8a..90598a425c18 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -41,7 +41,7 @@ $(DISTFILES_PATH)/$(1): flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -O $$@.tmp $(2)$(1) || rm -f $$@.tmp; [ -f $$@.tmp ] || exit 1; if echo "$(3) $$@.tmp" | sha256sum -c -; then mv $$@.tmp $$@; else rm -f $$@.tmp; exit 71; fi' endef -$(eval $(call tar_download,MUSL,musl,1.1.24,.tar.gz,https://www.musl-libc.org/releases/,1370c9a812b2cf2a7d92802510cca0058cc37e66a7bedd70051f0a34015022a3)) +$(eval $(call tar_download,MUSL,musl,1.2.0,.tar.gz,https://musl.libc.org/releases/,c6de7b191139142d3f9a7b5b702c9cae1b5ee6e7f57e582da9328629408fd4e8)) $(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c)) $(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d)) $(eval $(call tar_download,IPROUTE2,iproute2,5.4.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,fe97aa60a0d4c5ac830be18937e18dc3400ca713a33a89ad896ff1e3d46086ae)) diff --git a/tools/testing/selftests/wireguard/qemu/init.c b/tools/testing/selftests/wireguard/qemu/init.c index 90bc9813cadc..c9698120ac9d 100644 --- a/tools/testing/selftests/wireguard/qemu/init.c +++ b/tools/testing/selftests/wireguard/qemu/init.c @@ -13,7 +13,6 @@ #include <fcntl.h> #include <sys/wait.h> #include <sys/mount.h> -#include <sys/types.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/io.h> diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index af9323a0b6e0..d531de13c95b 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -56,7 +56,6 @@ CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_HZ_PERIODIC=n CONFIG_HIGH_RES_TIMERS=y -CONFIG_COMPAT_32BIT_TIME=y CONFIG_ARCH_RANDOM=y CONFIG_FILE_LOCKING=y CONFIG_POSIX_TIMERS=y diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index 6f22238f3217..12aaa063196e 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -414,8 +414,12 @@ int main() #if defined(__i386__) && (!defined(__GLIBC__) || __GLIBC__ > 2 || __GLIBC_MINOR__ >= 16) vsyscall32 = (void *)getauxval(AT_SYSINFO); - printf("[RUN]\tCheck AT_SYSINFO return regs\n"); - test_sys32_regs(do_full_vsyscall32); + if (vsyscall32) { + printf("[RUN]\tCheck AT_SYSINFO return regs\n"); + test_sys32_regs(do_full_vsyscall32); + } else { + printf("[SKIP]\tAT_SYSINFO is not available\n"); + } #endif test_ptrace_syscall_restart(); diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c index 35edd61d1663..42052db0f870 100644 --- a/tools/testing/selftests/x86/test_vdso.c +++ b/tools/testing/selftests/x86/test_vdso.c @@ -259,6 +259,11 @@ static void test_one_clock_gettime(int clock, const char *name) static void test_clock_gettime(void) { + if (!vdso_clock_gettime) { + printf("[SKIP]\tNo vDSO, so skipping clock_gettime() tests\n"); + return; + } + for (int clock = 0; clock < sizeof(clocknames) / 
sizeof(clocknames[0]); clock++) { test_one_clock_gettime(clock, clocknames[clock]); diff --git a/tools/testing/selftests/x86/vdso_restorer.c b/tools/testing/selftests/x86/vdso_restorer.c index 29a5c94c4b50..fe99f2434155 100644 --- a/tools/testing/selftests/x86/vdso_restorer.c +++ b/tools/testing/selftests/x86/vdso_restorer.c @@ -15,6 +15,7 @@ #include <err.h> #include <stdio.h> +#include <dlfcn.h> #include <string.h> #include <signal.h> #include <unistd.h> @@ -46,11 +47,23 @@ int main() int nerrs = 0; struct real_sigaction sa; + void *vdso = dlopen("linux-vdso.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-gate.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) { + printf("[SKIP]\tFailed to find vDSO. Tests are not expected to work.\n"); + return 0; + } + memset(&sa, 0, sizeof(sa)); sa.handler = handler_with_siginfo; sa.flags = SA_SIGINFO; sa.restorer = NULL; /* request kernel-provided restorer */ + printf("[RUN]\tRaise a signal, SA_SIGINFO, sa.restorer == NULL\n"); + if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0) err(1, "raw rt_sigaction syscall"); @@ -63,6 +76,8 @@ int main() nerrs++; } + printf("[RUN]\tRaise a signal, !SA_SIGINFO, sa.restorer == NULL\n"); + sa.flags = 0; sa.handler = handler_without_siginfo; if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 1d8b93f1af31..5a4fb80fa832 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -55,6 +55,78 @@ static void test_stream_connection_reset(const struct test_opts *opts) close(fd); } +static void test_stream_bind_only_client(const struct test_opts *opts) +{ + union { + struct sockaddr sa; + struct sockaddr_vm svm; + } addr = { + .svm = { + .svm_family = AF_VSOCK, + .svm_port = 1234, + .svm_cid = opts->peer_cid, + }, + }; + int ret; + int fd; + + /* Wait for the server to be ready */ + control_expectln("BIND"); + + fd = socket(AF_VSOCK, SOCK_STREAM, 0); + + timeout_begin(TIMEOUT); + do { + ret = connect(fd, &addr.sa, sizeof(addr.svm)); + timeout_check("connect"); + } while (ret < 0 && errno == EINTR); + timeout_end(); + + if (ret != -1) { + fprintf(stderr, "expected connect(2) failure, got %d\n", ret); + exit(EXIT_FAILURE); + } + if (errno != ECONNRESET) { + fprintf(stderr, "unexpected connect(2) errno %d\n", errno); + exit(EXIT_FAILURE); + } + + /* Notify the server that the client has finished */ + control_writeln("DONE"); + + close(fd); +} + +static void test_stream_bind_only_server(const struct test_opts *opts) +{ + union { + struct sockaddr sa; + struct sockaddr_vm svm; + } addr = { + .svm = { + .svm_family = AF_VSOCK, + .svm_port = 1234, + .svm_cid = VMADDR_CID_ANY, + }, + }; + int fd; + + fd = socket(AF_VSOCK, SOCK_STREAM, 0); + + if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) { + perror("bind"); + exit(EXIT_FAILURE); + } + + /* Notify the client that the server is ready */ + control_writeln("BIND"); + + /* Wait for the client to finish */ + control_expectln("DONE"); + + close(fd); +} + static void test_stream_client_close_client(const struct test_opts *opts) { int fd; @@ -213,6 +285,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_connection_reset, }, { + .name = "SOCK_STREAM bind only", + .run_client = test_stream_bind_only_client, + .run_server = test_stream_bind_only_server, + }, + { .name = "SOCK_STREAM client close", .run_client = test_stream_client_close_client, .run_server = test_stream_client_close_server, |
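
Note (not part of the patch): the vdso_restorer.c hunk above skips the test when no vDSO is mapped by probing with dlopen() and RTLD_NOLOAD, which succeeds only if the named object is already present in the process image and never maps anything new. Below is a minimal standalone C sketch of that same probe, for illustration only; the file name probe_vdso.c and the output strings are made up here, while the soname fallback from "linux-vdso.so.1" to "linux-gate.so.1" mirrors the hunk above.

/* probe_vdso.c - illustrative sketch of the RTLD_NOLOAD vDSO probe.
 * Build with: cc probe_vdso.c -ldl
 */
#include <dlfcn.h>
#include <stdio.h>

int main(void)
{
	/* RTLD_NOLOAD: return a handle only if the object is already mapped. */
	void *vdso = dlopen("linux-vdso.so.1",
			    RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);

	if (!vdso)	/* 32-bit kernels expose the vDSO under this name */
		vdso = dlopen("linux-gate.so.1",
			      RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);

	if (!vdso) {
		printf("[SKIP]\tno vDSO mapped in this process\n");
		return 0;
	}

	printf("vDSO handle: %p\n", vdso);
	dlclose(vdso);
	return 0;
}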