diff options
Diffstat (limited to 'samples')
-rw-r--r-- | samples/bpf/Makefile | 16 | ||||
-rw-r--r-- | samples/bpf/bpf_load.c | 29 | ||||
-rw-r--r-- | samples/bpf/bpf_load.h | 1 | ||||
-rw-r--r-- | samples/bpf/map_perf_test_kern.c | 46 | ||||
-rw-r--r-- | samples/bpf/map_perf_test_user.c | 89 | ||||
-rw-r--r-- | samples/bpf/sock_flags_kern.c | 5 | ||||
-rw-r--r-- | samples/bpf/syscall_tp_kern.c | 62 | ||||
-rw-r--r-- | samples/bpf/syscall_tp_user.c | 71 | ||||
-rw-r--r-- | samples/bpf/tcbpf2_kern.c | 63 | ||||
-rw-r--r-- | samples/bpf/test_cgrp2_sock.c | 255 | ||||
-rwxr-xr-x | samples/bpf/test_cgrp2_sock.sh | 162 | ||||
-rwxr-xr-x | samples/bpf/test_tunnel_bpf.sh | 29 | ||||
-rw-r--r-- | samples/bpf/xdp_monitor_kern.c | 88 | ||||
-rw-r--r-- | samples/bpf/xdp_monitor_user.c | 295 | ||||
-rw-r--r-- | samples/bpf/xdp_redirect_kern.c | 90 | ||||
-rw-r--r-- | samples/bpf/xdp_redirect_map_kern.c | 92 | ||||
-rw-r--r-- | samples/bpf/xdp_redirect_map_user.c | 145 | ||||
-rw-r--r-- | samples/bpf/xdp_redirect_user.c | 143 | ||||
-rw-r--r-- | samples/sockmap/Makefile | 78 | ||||
-rw-r--r-- | samples/sockmap/sockmap_kern.c | 108 | ||||
-rw-r--r-- | samples/sockmap/sockmap_user.c | 294 |
21 files changed, 2074 insertions, 87 deletions
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 87246be6feb8..cf17c7932a6e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -37,6 +37,10 @@ hostprogs-y += xdp_tx_iptunnel hostprogs-y += test_map_in_map hostprogs-y += per_socket_stats_example hostprogs-y += load_sock_ops +hostprogs-y += xdp_redirect +hostprogs-y += xdp_redirect_map +hostprogs-y += xdp_monitor +hostprogs-y += syscall_tp # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o @@ -78,6 +82,10 @@ lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o +xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o +xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o +xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o +syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -119,6 +127,10 @@ always += tcp_bufs_kern.o always += tcp_cong_kern.o always += tcp_iw_kern.o always += tcp_clamp_kern.o +always += xdp_redirect_kern.o +always += xdp_redirect_map_kern.o +always += xdp_monitor_kern.o +always += syscall_tp_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -155,6 +167,10 @@ HOSTLOADLIBES_tc_l2_redirect += -l elf HOSTLOADLIBES_lwt_len_hist += -l elf HOSTLOADLIBES_xdp_tx_iptunnel += -lelf HOSTLOADLIBES_test_map_in_map += -lelf +HOSTLOADLIBES_xdp_redirect += -lelf +HOSTLOADLIBES_xdp_redirect_map += -lelf +HOSTLOADLIBES_xdp_monitor += -lelf +HOSTLOADLIBES_syscall_tp += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 899f40310bc3..6aa50098dfb8 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -65,6 +65,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; bool is_sockops = strncmp(event, "sockops", 7) == 0; + bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -92,6 +93,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_CGROUP_SOCK; } else if (is_sockops) { prog_type = BPF_PROG_TYPE_SOCK_OPS; + } else if (is_sk_skb) { + prog_type = BPF_PROG_TYPE_SK_SKB; } else { printf("Unknown event '%s'\n", event); return -1; @@ -109,7 +112,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) return 0; - if (is_socket || is_sockops) { + if (is_socket || is_sockops || is_sk_skb) { if (is_socket) event += 6; else @@ -198,7 +201,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) static int load_maps(struct bpf_map_data *maps, int nr_maps, fixup_map_cb fixup_map) { - int i; + int i, numa_node; for (i = 0; i < nr_maps; i++) { if (fixup_map) { @@ -210,21 +213,26 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps, } } + numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ? + maps[i].def.numa_node : -1; + if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; - map_fd[i] = bpf_create_map_in_map(maps[i].def.type, + map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, maps[i].def.key_size, inner_map_fd, maps[i].def.max_entries, - maps[i].def.map_flags); + maps[i].def.map_flags, + numa_node); } else { - map_fd[i] = bpf_create_map(maps[i].def.type, - maps[i].def.key_size, - maps[i].def.value_size, - maps[i].def.max_entries, - maps[i].def.map_flags); + map_fd[i] = bpf_create_map_node(maps[i].def.type, + maps[i].def.key_size, + maps[i].def.value_size, + maps[i].def.max_entries, + maps[i].def.map_flags, + numa_node); } if (map_fd[i] < 0) { printf("failed to create a map: %d %s\n", @@ -567,7 +575,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0 || memcmp(shname, "cgroup/", 7) == 0 || - memcmp(shname, "sockops", 7) == 0) { + memcmp(shname, "sockops", 7) == 0 || + memcmp(shname, "sk_skb", 6) == 0) { ret = load_and_attach(shname, data->d_buf, data->d_size); if (ret != 0) diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h index ca0563d04744..453e3226b4ce 100644 --- a/samples/bpf/bpf_load.h +++ b/samples/bpf/bpf_load.h @@ -13,6 +13,7 @@ struct bpf_map_def { unsigned int max_entries; unsigned int map_flags; unsigned int inner_map_idx; + unsigned int numa_node; }; struct bpf_map_data { diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 245165817fbe..098c857f1eda 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -40,6 +40,8 @@ struct bpf_map_def SEC("maps") inner_lru_hash_map = { .key_size = sizeof(u32), .value_size = sizeof(long), .max_entries = MAX_ENTRIES, + .map_flags = BPF_F_NUMA_NODE, + .numa_node = 0, }; struct bpf_map_def SEC("maps") array_of_lru_hashs = { @@ -86,6 +88,13 @@ struct bpf_map_def SEC("maps") array_map = { .max_entries = MAX_ENTRIES, }; +struct bpf_map_def SEC("maps") lru_hash_lookup_map = { + .type = BPF_MAP_TYPE_LRU_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = MAX_ENTRIES, +}; + SEC("kprobe/sys_getuid") int stress_hmap(struct pt_regs *ctx) { @@ -146,12 +155,23 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx) SEC("kprobe/sys_connect") int stress_lru_hmap_alloc(struct pt_regs *ctx) { + char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn"; + union { + u16 dst6[8]; + struct { + u16 magic0; + u16 magic1; + u16 tcase; + u16 unused16; + u32 unused32; + u32 key; + }; + } test_params; struct sockaddr_in6 *in6; - u16 test_case, dst6[8]; + u16 test_case; int addrlen, ret; - char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%d\n"; long val = 1; - u32 key = bpf_get_prandom_u32(); + u32 key = 0; in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); addrlen = (int)PT_REGS_PARM3(ctx); @@ -159,14 +179,18 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx) if (addrlen != sizeof(*in6)) return 0; - ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr); + ret = bpf_probe_read(test_params.dst6, sizeof(test_params.dst6), + &in6->sin6_addr); if (ret) goto done; - if (dst6[0] != 0xdead || dst6[1] != 0xbeef) + if (test_params.magic0 != 0xdead || + test_params.magic1 != 0xbeef) return 0; - test_case = dst6[7]; + test_case = test_params.tcase; + if (test_case != 3) + key = bpf_get_prandom_u32(); if (test_case == 0) { ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); @@ -186,6 +210,16 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx) ret = bpf_map_update_elem(nolocal_lru_map, &key, &val, BPF_ANY); + } else if (test_case == 3) { + u32 i; + + key = test_params.key; + +#pragma clang loop unroll(full) + for (i = 0; i < 32; i++) { + bpf_map_lookup_elem(&lru_hash_lookup_map, &key); + key++; + } } else { ret = -EINVAL; } diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 1a8894b5ac51..f388254896f6 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -46,6 +46,7 @@ enum test_type { HASH_LOOKUP, ARRAY_LOOKUP, INNER_LRU_HASH_PREALLOC, + LRU_HASH_LOOKUP, NR_TESTS, }; @@ -60,6 +61,7 @@ const char *test_map_names[NR_TESTS] = { [HASH_LOOKUP] = "hash_map", [ARRAY_LOOKUP] = "array_map", [INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map", + [LRU_HASH_LOOKUP] = "lru_hash_lookup_map", }; static int test_flags = ~0; @@ -67,6 +69,8 @@ static uint32_t num_map_entries; static uint32_t inner_lru_hash_size; static int inner_lru_hash_idx = -1; static int array_of_lru_hashs_idx = -1; +static int lru_hash_lookup_idx = -1; +static int lru_hash_lookup_test_entries = 32; static uint32_t max_cnt = 1000000; static int check_test_flags(enum test_type t) @@ -86,6 +90,32 @@ static void test_hash_prealloc(int cpu) cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); } +static int pre_test_lru_hash_lookup(int tasks) +{ + int fd = map_fd[lru_hash_lookup_idx]; + uint32_t key; + long val = 1; + int ret; + + if (num_map_entries > lru_hash_lookup_test_entries) + lru_hash_lookup_test_entries = num_map_entries; + + /* Populate the lru_hash_map for LRU_HASH_LOOKUP perf test. + * + * It is fine that the user requests for a map with + * num_map_entries < 32 and some of the later lru hash lookup + * may return not found. For LRU map, we are not interested + * in such small map performance. + */ + for (key = 0; key < lru_hash_lookup_test_entries; key++) { + ret = bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST); + if (ret) + return ret; + } + + return 0; +} + static void do_test_lru(enum test_type test, int cpu) { static int inner_lru_map_fds[MAX_NR_CPUS]; @@ -97,14 +127,20 @@ static void do_test_lru(enum test_type test, int cpu) if (test == INNER_LRU_HASH_PREALLOC) { int outer_fd = map_fd[array_of_lru_hashs_idx]; + unsigned int mycpu, mynode; assert(cpu < MAX_NR_CPUS); if (cpu) { + ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL); + assert(!ret); + inner_lru_map_fds[cpu] = - bpf_create_map(BPF_MAP_TYPE_LRU_HASH, - sizeof(uint32_t), sizeof(long), - inner_lru_hash_size, 0); + bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH, + sizeof(uint32_t), + sizeof(long), + inner_lru_hash_size, 0, + mynode); if (inner_lru_map_fds[cpu] == -1) { printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n", strerror(errno), errno); @@ -129,13 +165,17 @@ static void do_test_lru(enum test_type test, int cpu) if (test == LRU_HASH_PREALLOC) { test_name = "lru_hash_map_perf"; - in6.sin6_addr.s6_addr16[7] = 0; + in6.sin6_addr.s6_addr16[2] = 0; } else if (test == NOCOMMON_LRU_HASH_PREALLOC) { test_name = "nocommon_lru_hash_map_perf"; - in6.sin6_addr.s6_addr16[7] = 1; + in6.sin6_addr.s6_addr16[2] = 1; } else if (test == INNER_LRU_HASH_PREALLOC) { test_name = "inner_lru_hash_map_perf"; - in6.sin6_addr.s6_addr16[7] = 2; + in6.sin6_addr.s6_addr16[2] = 2; + } else if (test == LRU_HASH_LOOKUP) { + test_name = "lru_hash_lookup_perf"; + in6.sin6_addr.s6_addr16[2] = 3; + in6.sin6_addr.s6_addr32[3] = 0; } else { assert(0); } @@ -144,6 +184,11 @@ static void do_test_lru(enum test_type test, int cpu) for (i = 0; i < max_cnt; i++) { ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6)); assert(ret == -1 && errno == EBADF); + if (in6.sin6_addr.s6_addr32[3] < + lru_hash_lookup_test_entries - 32) + in6.sin6_addr.s6_addr32[3] += 32; + else + in6.sin6_addr.s6_addr32[3] = 0; } printf("%d:%s pre-alloc %lld events per sec\n", cpu, test_name, @@ -165,6 +210,11 @@ static void test_inner_lru_hash_prealloc(int cpu) do_test_lru(INNER_LRU_HASH_PREALLOC, cpu); } +static void test_lru_hash_lookup(int cpu) +{ + do_test_lru(LRU_HASH_LOOKUP, cpu); +} + static void test_percpu_hash_prealloc(int cpu) { __u64 start_time; @@ -237,6 +287,11 @@ static void test_array_lookup(int cpu) cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); } +typedef int (*pre_test_func)(int tasks); +const pre_test_func pre_test_funcs[] = { + [LRU_HASH_LOOKUP] = pre_test_lru_hash_lookup, +}; + typedef void (*test_func)(int cpu); const test_func test_funcs[] = { [HASH_PREALLOC] = test_hash_prealloc, @@ -249,8 +304,25 @@ const test_func test_funcs[] = { [HASH_LOOKUP] = test_hash_lookup, [ARRAY_LOOKUP] = test_array_lookup, [INNER_LRU_HASH_PREALLOC] = test_inner_lru_hash_prealloc, + [LRU_HASH_LOOKUP] = test_lru_hash_lookup, }; +static int pre_test(int tasks) +{ + int i; + + for (i = 0; i < NR_TESTS; i++) { + if (pre_test_funcs[i] && check_test_flags(i)) { + int ret = pre_test_funcs[i](tasks); + + if (ret) + return ret; + } + } + + return 0; +} + static void loop(int cpu) { cpu_set_t cpuset; @@ -271,6 +343,8 @@ static void run_perf_test(int tasks) pid_t pid[tasks]; int i; + assert(!pre_test(tasks)); + for (i = 0; i < tasks; i++) { pid[i] = fork(); if (pid[i] == 0) { @@ -338,6 +412,9 @@ static void fixup_map(struct bpf_map_data *map, int idx) array_of_lru_hashs_idx = idx; } + if (!strcmp("lru_hash_lookup_map", map->name)) + lru_hash_lookup_idx = idx; + if (num_map_entries <= 0) return; diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c index 533dd11a6baa..05dcdf8a4baa 100644 --- a/samples/bpf/sock_flags_kern.c +++ b/samples/bpf/sock_flags_kern.c @@ -9,8 +9,13 @@ SEC("cgroup/sock1") int bpf_prog1(struct bpf_sock *sk) { char fmt[] = "socket: family %d type %d protocol %d\n"; + char fmt2[] = "socket: uid %u gid %u\n"; + __u64 gid_uid = bpf_get_current_uid_gid(); + __u32 uid = gid_uid & 0xffffffff; + __u32 gid = gid_uid >> 32; bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol); + bpf_trace_printk(fmt2, sizeof(fmt2), uid, gid); /* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets * ie., make ping6 fail diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c new file mode 100644 index 000000000000..9149c524d279 --- /dev/null +++ b/samples/bpf/syscall_tp_kern.c @@ -0,0 +1,62 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +struct syscalls_enter_open_args { + unsigned long long unused; + long syscall_nr; + long filename_ptr; + long flags; + long mode; +}; + +struct syscalls_exit_open_args { + unsigned long long unused; + long syscall_nr; + long ret; +}; + +struct bpf_map_def SEC("maps") enter_open_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") exit_open_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1, +}; + +static __always_inline void count(void *map) +{ + u32 key = 0; + u32 *value, init_val = 1; + + value = bpf_map_lookup_elem(map, &key); + if (value) + *value += 1; + else + bpf_map_update_elem(map, &key, &init_val, BPF_NOEXIST); +} + +SEC("tracepoint/syscalls/sys_enter_open") +int trace_enter_open(struct syscalls_enter_open_args *ctx) +{ + count((void *)&enter_open_map); + return 0; +} + +SEC("tracepoint/syscalls/sys_exit_open") +int trace_enter_exit(struct syscalls_exit_open_args *ctx) +{ + count((void *)&exit_open_map); + return 0; +} diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c new file mode 100644 index 000000000000..a3cb91ebf4e7 --- /dev/null +++ b/samples/bpf/syscall_tp_user.c @@ -0,0 +1,71 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdlib.h> +#include <signal.h> +#include <linux/bpf.h> +#include <string.h> +#include <linux/perf_event.h> +#include <errno.h> +#include <assert.h> +#include <stdbool.h> +#include <sys/resource.h> +#include "libbpf.h" +#include "bpf_load.h" + +/* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*. + * This requires kernel CONFIG_FTRACE_SYSCALLS to be set. + */ + +static void verify_map(int map_id) +{ + __u32 key = 0; + __u32 val; + + if (bpf_map_lookup_elem(map_id, &key, &val) != 0) { + fprintf(stderr, "map_lookup failed: %s\n", strerror(errno)); + return; + } + if (val == 0) + fprintf(stderr, "failed: map #%d returns value 0\n", map_id); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256]; + int fd; + + setrlimit(RLIMIT_MEMLOCK, &r); + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + fprintf(stderr, "%s", bpf_log_buf); + return 1; + } + + /* current load_bpf_file has perf_event_open default pid = -1 + * and cpu = 0, which permits attached bpf execution on + * all cpus for all pid's. bpf program execution ignores + * cpu affinity. + */ + /* trigger some "open" operations */ + fd = open(filename, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "open failed: %s\n", strerror(errno)); + return 1; + } + close(fd); + + /* verify the map */ + verify_map(map_fd[0]); + verify_map(map_fd[1]); + + return 0; +} diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index 270edcc149a1..370b749f5ee6 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -17,6 +17,7 @@ #include <uapi/linux/pkt_cls.h> #include <net/ipv6.h> #include "bpf_helpers.h" +#include "bpf_endian.h" #define _htonl __builtin_bswap32 #define ERROR(ret) do {\ @@ -38,6 +39,10 @@ struct vxlan_metadata { u32 gbp; }; +struct erspan_metadata { + __be32 index; +}; + SEC("gre_set_tunnel") int _gre_set_tunnel(struct __sk_buff *skb) { @@ -76,6 +81,63 @@ int _gre_get_tunnel(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("erspan_set_tunnel") +int _erspan_set_tunnel(struct __sk_buff *skb) +{ + struct bpf_tunnel_key key; + struct erspan_metadata md; + int ret; + + __builtin_memset(&key, 0x0, sizeof(key)); + key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ + key.tunnel_id = 2; + key.tunnel_tos = 0; + key.tunnel_ttl = 64; + + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + md.index = htonl(123); + ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + +SEC("erspan_get_tunnel") +int _erspan_get_tunnel(struct __sk_buff *skb) +{ + char fmt[] = "key %d remote ip 0x%x erspan index 0x%x\n"; + struct bpf_tunnel_key key; + struct erspan_metadata md; + u32 index; + int ret; + + ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md)); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + index = bpf_ntohl(md.index); + bpf_trace_printk(fmt, sizeof(fmt), + key.tunnel_id, key.remote_ipv4, index); + + return TC_ACT_OK; +} + SEC("vxlan_set_tunnel") int _vxlan_set_tunnel(struct __sk_buff *skb) { @@ -378,5 +440,4 @@ int _ip6ip6_get_tunnel(struct __sk_buff *skb) return TC_ACT_OK; } - char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index c3cfb23e23b5..e79594dd629b 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -19,68 +19,271 @@ #include <errno.h> #include <fcntl.h> #include <net/if.h> +#include <inttypes.h> #include <linux/bpf.h> #include "libbpf.h" char bpf_log_buf[BPF_LOG_BUF_SIZE]; -static int prog_load(int idx) +static int prog_load(__u32 idx, __u32 mark, __u32 prio) { - struct bpf_insn prog[] = { + /* save pointer to context */ + struct bpf_insn prog_start[] = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + }; + struct bpf_insn prog_end[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + + /* set sk_bound_dev_if on socket */ + struct bpf_insn prog_dev[] = { BPF_MOV64_IMM(BPF_REG_3, idx), BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)), BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)), - BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */ - BPF_EXIT_INSN(), }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - return bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt, + /* set mark on socket */ + struct bpf_insn prog_mark[] = { + /* get uid of process */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_current_uid_gid), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffffffff), + + /* if uid is 0, use given mark, else use the uid as the mark */ + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_3, mark), + + /* set the mark on the new socket */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)), + }; + + /* set priority on socket */ + struct bpf_insn prog_prio[] = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_3, prio), + BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)), + }; + + struct bpf_insn *prog; + size_t insns_cnt; + void *p; + int ret; + + insns_cnt = sizeof(prog_start) + sizeof(prog_end); + if (idx) + insns_cnt += sizeof(prog_dev); + + if (mark) + insns_cnt += sizeof(prog_mark); + + if (prio) + insns_cnt += sizeof(prog_prio); + + p = prog = malloc(insns_cnt); + if (!prog) { + fprintf(stderr, "Failed to allocate memory for instructions\n"); + return EXIT_FAILURE; + } + + memcpy(p, prog_start, sizeof(prog_start)); + p += sizeof(prog_start); + + if (idx) { + memcpy(p, prog_dev, sizeof(prog_dev)); + p += sizeof(prog_dev); + } + + if (mark) { + memcpy(p, prog_mark, sizeof(prog_mark)); + p += sizeof(prog_mark); + } + + if (prio) { + memcpy(p, prog_prio, sizeof(prog_prio)); + p += sizeof(prog_prio); + } + + memcpy(p, prog_end, sizeof(prog_end)); + p += sizeof(prog_end); + + insns_cnt /= sizeof(struct bpf_insn); + + ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt, "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); + + free(prog); + + return ret; +} + +static int get_bind_to_device(int sd, char *name, size_t len) +{ + socklen_t optlen = len; + int rc; + + name[0] = '\0'; + rc = getsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, &optlen); + if (rc < 0) + perror("setsockopt(SO_BINDTODEVICE)"); + + return rc; +} + +static unsigned int get_somark(int sd) +{ + unsigned int mark = 0; + socklen_t optlen = sizeof(mark); + int rc; + + rc = getsockopt(sd, SOL_SOCKET, SO_MARK, &mark, &optlen); + if (rc < 0) + perror("getsockopt(SO_MARK)"); + + return mark; +} + +static unsigned int get_priority(int sd) +{ + unsigned int prio = 0; + socklen_t optlen = sizeof(prio); + int rc; + + rc = getsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, &optlen); + if (rc < 0) + perror("getsockopt(SO_PRIORITY)"); + + return prio; +} + +static int show_sockopts(int family) +{ + unsigned int mark, prio; + char name[16]; + int sd; + + sd = socket(family, SOCK_DGRAM, 17); + if (sd < 0) { + perror("socket"); + return 1; + } + + if (get_bind_to_device(sd, name, sizeof(name)) < 0) + return 1; + + mark = get_somark(sd); + prio = get_priority(sd); + + close(sd); + + printf("sd %d: dev %s, mark %u, priority %u\n", sd, name, mark, prio); + + return 0; } static int usage(const char *argv0) { - printf("Usage: %s cg-path device-index\n", argv0); + printf("Usage:\n"); + printf(" Attach a program\n"); + printf(" %s -b bind-to-dev -m mark -p prio cg-path\n", argv0); + printf("\n"); + printf(" Detach a program\n"); + printf(" %s -d cg-path\n", argv0); + printf("\n"); + printf(" Show inherited socket settings (mark, priority, and device)\n"); + printf(" %s [-6]\n", argv0); return EXIT_FAILURE; } int main(int argc, char **argv) { + __u32 idx = 0, mark = 0, prio = 0; + const char *cgrp_path = NULL; int cg_fd, prog_fd, ret; - unsigned int idx; + int family = PF_INET; + int do_attach = 1; + int rc; + + while ((rc = getopt(argc, argv, "db:m:p:6")) != -1) { + switch (rc) { + case 'd': + do_attach = 0; + break; + case 'b': + idx = if_nametoindex(optarg); + if (!idx) { + idx = strtoumax(optarg, NULL, 0); + if (!idx) { + printf("Invalid device name\n"); + return EXIT_FAILURE; + } + } + break; + case 'm': + mark = strtoumax(optarg, NULL, 0); + break; + case 'p': + prio = strtoumax(optarg, NULL, 0); + break; + case '6': + family = PF_INET6; + break; + default: + return usage(argv[0]); + } + } - if (argc < 2) - return usage(argv[0]); + if (optind == argc) + return show_sockopts(family); - idx = if_nametoindex(argv[2]); - if (!idx) { - printf("Invalid device name\n"); + cgrp_path = argv[optind]; + if (!cgrp_path) { + fprintf(stderr, "cgroup path not given\n"); return EXIT_FAILURE; } - cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); - if (cg_fd < 0) { - printf("Failed to open cgroup path: '%s'\n", strerror(errno)); + if (do_attach && !idx && !mark && !prio) { + fprintf(stderr, + "One of device, mark or priority must be given\n"); return EXIT_FAILURE; } - prog_fd = prog_load(idx); - printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); - - if (prog_fd < 0) { - printf("Failed to load prog: '%s'\n", strerror(errno)); + cg_fd = open(cgrp_path, O_DIRECTORY | O_RDONLY); + if (cg_fd < 0) { + printf("Failed to open cgroup path: '%s'\n", strerror(errno)); return EXIT_FAILURE; } - ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0); - if (ret < 0) { - printf("Failed to attach prog to cgroup: '%s'\n", - strerror(errno)); - return EXIT_FAILURE; + if (do_attach) { + prog_fd = prog_load(idx, mark, prio); + if (prog_fd < 0) { + printf("Failed to load prog: '%s'\n", strerror(errno)); + printf("Output from kernel verifier:\n%s\n-------\n", + bpf_log_buf); + return EXIT_FAILURE; + } + + ret = bpf_prog_attach(prog_fd, cg_fd, + BPF_CGROUP_INET_SOCK_CREATE, 0); + if (ret < 0) { + printf("Failed to attach prog to cgroup: '%s'\n", + strerror(errno)); + return EXIT_FAILURE; + } + } else { + ret = bpf_prog_detach(cg_fd, BPF_CGROUP_INET_SOCK_CREATE); + if (ret < 0) { + printf("Failed to detach prog from cgroup: '%s'\n", + strerror(errno)); + return EXIT_FAILURE; + } } + close(cg_fd); return EXIT_SUCCESS; } diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh index 925fd467c7cc..a81f38eef417 100755 --- a/samples/bpf/test_cgrp2_sock.sh +++ b/samples/bpf/test_cgrp2_sock.sh @@ -1,47 +1,133 @@ -#!/bin/bash - -function config_device { - ip netns add at_ns0 - ip link add veth0 type veth peer name veth0b - ip link set veth0b up - ip link set veth0 netns at_ns0 - ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 - ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad - ip netns exec at_ns0 ip link set dev veth0 up - ip link add foo type vrf table 1234 - ip link set foo up - ip addr add 172.16.1.101/24 dev veth0b - ip addr add 2401:db00::2/64 dev veth0b nodad - ip link set veth0b master foo +#!/bin/sh + +# Test various socket options that can be set by attaching programs to cgroups. + +CGRP_MNT="/tmp/cgroupv2-test_cgrp2_sock" + +################################################################################ +# +print_result() +{ + local rc=$1 + local status=" OK " + + [ $rc -ne 0 ] && status="FAIL" + + printf "%-50s [%4s]\n" "$2" "$status" } -function attach_bpf { - rm -rf /tmp/cgroupv2 - mkdir -p /tmp/cgroupv2 - mount -t cgroup2 none /tmp/cgroupv2 - mkdir -p /tmp/cgroupv2/foo - test_cgrp2_sock /tmp/cgroupv2/foo foo - echo $$ >> /tmp/cgroupv2/foo/cgroup.procs +check_sock() +{ + out=$(test_cgrp2_sock) + echo $out | grep -q "$1" + if [ $? -ne 0 ]; then + print_result 1 "IPv4: $2" + echo " expected: $1" + echo " have: $out" + rc=1 + else + print_result 0 "IPv4: $2" + fi } -function cleanup { - set +ex - ip netns delete at_ns0 - ip link del veth0 - ip link del foo - umount /tmp/cgroupv2 - rm -rf /tmp/cgroupv2 - set -ex +check_sock6() +{ + out=$(test_cgrp2_sock -6) + echo $out | grep -q "$1" + if [ $? -ne 0 ]; then + print_result 1 "IPv6: $2" + echo " expected: $1" + echo " have: $out" + rc=1 + else + print_result 0 "IPv6: $2" + fi } -function do_test { - ping -c1 -w1 172.16.1.100 - ping6 -c1 -w1 2401:db00::1 +################################################################################ +# + +cleanup() +{ + echo $$ >> ${CGRP_MNT}/cgroup.procs + rmdir ${CGRP_MNT}/sockopts } +cleanup_and_exit() +{ + local rc=$1 + local msg="$2" + + [ -n "$msg" ] && echo "ERROR: $msg" + + ip li del cgrp2_sock + umount ${CGRP_MNT} + + exit $rc +} + + +################################################################################ +# main + +rc=0 + +ip li add cgrp2_sock type dummy 2>/dev/null + +set -e +mkdir -p ${CGRP_MNT} +mount -t cgroup2 none ${CGRP_MNT} +set +e + + +# make sure we have a known start point cleanup 2>/dev/null -config_device -attach_bpf -do_test -cleanup -echo "*** PASS ***" + +mkdir -p ${CGRP_MNT}/sockopts +[ $? -ne 0 ] && cleanup_and_exit 1 "Failed to create cgroup hierarchy" + + +# set pid into cgroup +echo $$ > ${CGRP_MNT}/sockopts/cgroup.procs + +# no bpf program attached, so socket should show no settings +check_sock "dev , mark 0, priority 0" "No programs attached" +check_sock6 "dev , mark 0, priority 0" "No programs attached" + +# verify device is set +# +test_cgrp2_sock -b cgrp2_sock ${CGRP_MNT}/sockopts +if [ $? -ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set device" +fi +check_sock "dev cgrp2_sock, mark 0, priority 0" "Device set" +check_sock6 "dev cgrp2_sock, mark 0, priority 0" "Device set" + +# verify mark is set +# +test_cgrp2_sock -m 666 ${CGRP_MNT}/sockopts +if [ $? -ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set mark" +fi +check_sock "dev , mark 666, priority 0" "Mark set" +check_sock6 "dev , mark 666, priority 0" "Mark set" + +# verify priority is set +# +test_cgrp2_sock -p 123 ${CGRP_MNT}/sockopts +if [ $? -ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set priority" +fi +check_sock "dev , mark 0, priority 123" "Priority set" +check_sock6 "dev , mark 0, priority 123" "Priority set" + +# all 3 at once +# +test_cgrp2_sock -b cgrp2_sock -m 666 -p 123 ${CGRP_MNT}/sockopts +if [ $? -ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set device, mark and priority" +fi +check_sock "dev cgrp2_sock, mark 666, priority 123" "Priority set" +check_sock6 "dev cgrp2_sock, mark 666, priority 123" "Priority set" + +cleanup_and_exit $rc diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index a70d2ea90313..410052d9fc37 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ b/samples/bpf/test_tunnel_bpf.sh @@ -32,6 +32,19 @@ function add_gre_tunnel { ip addr add dev $DEV 10.1.1.200/24 } +function add_erspan_tunnel { + # in namespace + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 local 172.16.1.100 remote 172.16.1.200 erspan 123 + ip netns exec at_ns0 ip link set dev $DEV_NS up + ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + + # out of namespace + ip link add dev $DEV type $TYPE external + ip link set dev $DEV up + ip addr add dev $DEV 10.1.1.200/24 +} + function add_vxlan_tunnel { # Set static ARP entry here because iptables set-mark works # on L3 packet, as a result not applying to ARP packets, @@ -99,6 +112,18 @@ function test_gre { cleanup } +function test_erspan { + TYPE=erspan + DEV_NS=erspan00 + DEV=erspan11 + config_device + add_erspan_tunnel + attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel + ping -c 1 10.1.1.100 + ip netns exec at_ns0 ping -c 1 10.1.1.200 + cleanup +} + function test_vxlan { TYPE=vxlan DEV_NS=vxlan00 @@ -151,14 +176,18 @@ function cleanup { ip link del gretap11 ip link del vxlan11 ip link del geneve11 + ip link del erspan11 pkill tcpdump pkill cat set -ex } +trap cleanup 0 2 3 6 9 cleanup echo "Testing GRE tunnel..." test_gre +echo "Testing ERSPAN tunnel..." +test_erspan echo "Testing VXLAN tunnel..." test_vxlan echo "Testing GENEVE tunnel..." diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c new file mode 100644 index 000000000000..74f3fd8ed729 --- /dev/null +++ b/samples/bpf/xdp_monitor_kern.c @@ -0,0 +1,88 @@ +/* XDP monitor tool, based on tracepoints + * + * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") redirect_err_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = 2, + /* TODO: have entries for all possible errno's */ +}; + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_redirect_ctx { + unsigned short common_type; // offset:0; size:2; signed:0; + unsigned char common_flags; // offset:2; size:1; signed:0; + unsigned char common_preempt_count;// offset:3; size:1; signed:0; + int common_pid; // offset:4; size:4; signed:1; + + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12 size:4; signed:0; + int ifindex; // offset:16 size:4; signed:1; + int err; // offset:20 size:4; signed:1; + int to_ifindex; // offset:24 size:4; signed:1; + u32 map_id; // offset:28 size:4; signed:0; + int map_index; // offset:32 size:4; signed:1; +}; // offset:36 + +enum { + XDP_REDIRECT_SUCCESS = 0, + XDP_REDIRECT_ERROR = 1 +}; + +static __always_inline +int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) +{ + u32 key = XDP_REDIRECT_ERROR; + int err = ctx->err; + u64 *cnt; + + if (!err) + key = XDP_REDIRECT_SUCCESS; + + cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key); + if (!cnt) + return 0; + *cnt += 1; + + return 0; /* Indicate event was filtered (no further processing)*/ + /* + * Returning 1 here would allow e.g. a perf-record tracepoint + * to see and record these events, but it doesn't work well + * in-practice as stopping perf-record also unload this + * bpf_prog. Plus, there is additional overhead of doing so. + */ +} + +SEC("tracepoint/xdp/xdp_redirect_err") +int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + + +SEC("tracepoint/xdp/xdp_redirect_map_err") +int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +/* Likely unloaded when prog starts */ +SEC("tracepoint/xdp/xdp_redirect") +int trace_xdp_redirect(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +/* Likely unloaded when prog starts */ +SEC("tracepoint/xdp/xdp_redirect_map") +int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c new file mode 100644 index 000000000000..b51b4f5e3257 --- /dev/null +++ b/samples/bpf/xdp_monitor_user.c @@ -0,0 +1,295 @@ +/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. + */ +static const char *__doc__= + "XDP monitor tool, based on tracepoints\n" +; + +static const char *__doc_err_only__= + " NOTICE: Only tracking XDP redirect errors\n" + " Enable TX success stats via '--stats'\n" + " (which comes with a per packet processing overhead)\n" +; + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> +#include <string.h> +#include <ctype.h> +#include <unistd.h> +#include <locale.h> + +#include <getopt.h> +#include <net/if.h> +#include <time.h> + +#include "libbpf.h" +#include "bpf_load.h" +#include "bpf_util.h" + +static int verbose = 1; +static bool debug = false; + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"debug", no_argument, NULL, 'D' }, + {"stats", no_argument, NULL, 'S' }, + {"sec", required_argument, NULL, 's' }, + {0, 0, NULL, 0 } +}; + +static void usage(char *argv[]) +{ + int i; + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf("\n"); + printf(" Usage: %s (options-see-below)\n", + argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-15s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf("(internal short-option: -%c)", + long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ +__u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! (%i)\n", res); + exit(EXIT_FAILURE); + } + return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +enum { + REDIR_SUCCESS = 0, + REDIR_ERROR = 1, +}; +#define REDIR_RES_MAX 2 +static const char *redir_names[REDIR_RES_MAX] = { + [REDIR_SUCCESS] = "Success", + [REDIR_ERROR] = "Error", +}; +static const char *err2str(int err) +{ + if (err < REDIR_RES_MAX) + return redir_names[err]; + return NULL; +} + +struct record { + __u64 counter; + __u64 timestamp; +}; + +struct stats_record { + struct record xdp_redir[REDIR_RES_MAX]; +}; + +static void stats_print_headers(bool err_only) +{ + if (err_only) + printf("\n%s\n", __doc_err_only__); + + printf("%-14s %-10s %-18s %-9s\n", + "XDP_REDIRECT", "pps ", "pps-human-readable", "measure-period"); +} + +static void stats_print(struct stats_record *rec, + struct stats_record *prev, + bool err_only) +{ + int i = 0; + + if (err_only) + i = REDIR_ERROR; + + for (; i < REDIR_RES_MAX; i++) { + struct record *r = &rec->xdp_redir[i]; + struct record *p = &prev->xdp_redir[i]; + __u64 period = 0; + __u64 packets = 0; + double pps = 0; + double period_ = 0; + + if (p->timestamp) { + packets = r->counter - p->counter; + period = r->timestamp - p->timestamp; + if (period > 0) { + period_ = ((double) period / NANOSEC_PER_SEC); + pps = packets / period_; + } + } + + printf("%-14s %-10.0f %'-18.0f %f\n", + err2str(i), pps, pps, period_); + } +} + +static __u64 get_key32_value64_percpu(int fd, __u32 key) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u64 values[nr_cpus]; + __u64 sum = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return 0; + } + + /* Sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + sum += values[i]; + } + return sum; +} + +static bool stats_collect(int fd, struct stats_record *rec) +{ + int i; + + /* TODO: Detect if someone unloaded the perf event_fd's, as + * this can happen by someone running perf-record -e + */ + + for (i = 0; i < REDIR_RES_MAX; i++) { + rec->xdp_redir[i].timestamp = gettime(); + rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i); + } + return true; +} + +static void stats_poll(int interval, bool err_only) +{ + struct stats_record rec, prev; + int map_fd; + + memset(&rec, 0, sizeof(rec)); + + /* Trick to pretty printf with thousands separators use %' */ + setlocale(LC_NUMERIC, "en_US"); + + /* Header */ + if (verbose) + printf("\n%s", __doc__); + + /* TODO Need more advanced stats on error types */ + if (verbose) + printf(" - Stats map: %s\n", map_data[0].name); + map_fd = map_data[0].fd; + + stats_print_headers(err_only); + fflush(stdout); + + while (1) { + memcpy(&prev, &rec, sizeof(rec)); + stats_collect(map_fd, &rec); + stats_print(&rec, &prev, err_only); + fflush(stdout); + sleep(interval); + } +} + +void print_bpf_prog_info(void) +{ + int i; + + /* Prog info */ + printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt); + for (i = 0; i < prog_cnt; i++) { + printf(" - prog_fd[%d] = fd(%d)\n", i, prog_fd[i]); + } + + /* Maps info */ + printf("Loaded BPF prog have %d map(s)\n", map_data_count); + for (i = 0; i < map_data_count; i++) { + char *name = map_data[i].name; + int fd = map_data[i].fd; + + printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name); + } + + /* Event info */ + printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt); + for (i = 0; i < prog_cnt; i++) { + if (event_fd[i] != -1) + printf(" - event_fd[%d] = fd(%d)\n", i, event_fd[i]); + } +} + +int main(int argc, char **argv) +{ + int longindex = 0, opt; + int ret = EXIT_SUCCESS; + char bpf_obj_file[256]; + + /* Default settings: */ + bool errors_only = true; + int interval = 2; + + snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]); + + /* Parse commands line args */ + while ((opt = getopt_long(argc, argv, "h", + long_options, &longindex)) != -1) { + switch (opt) { + case 'D': + debug = true; + break; + case 'S': + errors_only = false; + break; + case 's': + interval = atoi(optarg); + break; + case 'h': + default: + usage(argv); + return EXIT_FAILURE; + } + } + + if (load_bpf_file(bpf_obj_file)) { + printf("ERROR - bpf_log_buf: %s", bpf_log_buf); + return 1; + } + if (!prog_fd[0]) { + printf("ERROR - load_bpf_file: %s\n", strerror(errno)); + return 1; + } + + if (debug) { + print_bpf_prog_info(); + } + + /* Unload/stop tracepoint event by closing fd's */ + if (errors_only) { + /* The prog_fd[i] and event_fd[i] depend on the + * order the functions was defined in _kern.c + */ + close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */ + close(prog_fd[2]); /* func: trace_xdp_redirect */ + close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */ + close(prog_fd[3]); /* func: trace_xdp_redirect_map */ + } + + stats_poll(interval, errors_only); + + return ret; +} diff --git a/samples/bpf/xdp_redirect_kern.c b/samples/bpf/xdp_redirect_kern.c new file mode 100644 index 000000000000..8abb151e385f --- /dev/null +++ b/samples/bpf/xdp_redirect_kern.c @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") tx_port = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1, +}; + +/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success + * feedback. Redirect TX errors can be caught via a tracepoint. + */ +struct bpf_map_def SEC("maps") rxcnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 1, +}; + +static void swap_src_dst_mac(void *data) +{ + unsigned short *p = data; + unsigned short dst[3]; + + dst[0] = p[0]; + dst[1] = p[1]; + dst[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = dst[0]; + p[4] = dst[1]; + p[5] = dst[2]; +} + +SEC("xdp_redirect") +int xdp_redirect_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + int rc = XDP_DROP; + int *ifindex, port = 0; + long *value; + u32 key = 0; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return rc; + + ifindex = bpf_map_lookup_elem(&tx_port, &port); + if (!ifindex) + return rc; + + value = bpf_map_lookup_elem(&rxcnt, &key); + if (value) + *value += 1; + + swap_src_dst_mac(data); + return bpf_redirect(*ifindex, 0); +} + +/* Redirect require an XDP bpf_prog loaded on the TX device */ +SEC("xdp_redirect_dummy") +int xdp_redirect_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c new file mode 100644 index 000000000000..740a529ba84f --- /dev/null +++ b/samples/bpf/xdp_redirect_map_kern.c @@ -0,0 +1,92 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") tx_port = { + .type = BPF_MAP_TYPE_DEVMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 100, +}; + +/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success + * feedback. Redirect TX errors can be caught via a tracepoint. + */ +struct bpf_map_def SEC("maps") rxcnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 1, +}; + +static void swap_src_dst_mac(void *data) +{ + unsigned short *p = data; + unsigned short dst[3]; + + dst[0] = p[0]; + dst[1] = p[1]; + dst[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = dst[0]; + p[4] = dst[1]; + p[5] = dst[2]; +} + +SEC("xdp_redirect_map") +int xdp_redirect_map_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + int rc = XDP_DROP; + int vport, port = 0, m = 0; + long *value; + u32 key = 0; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return rc; + + /* constant virtual port */ + vport = 0; + + /* count packet in global counter */ + value = bpf_map_lookup_elem(&rxcnt, &key); + if (value) + *value += 1; + + swap_src_dst_mac(data); + + /* send packet out physical port */ + return bpf_redirect_map(&tx_port, vport, 0); +} + +/* Redirect require an XDP bpf_prog loaded on the TX device */ +SEC("xdp_redirect_dummy") +int xdp_redirect_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c new file mode 100644 index 000000000000..d4d86a273fba --- /dev/null +++ b/samples/bpf/xdp_redirect_map_user.c @@ -0,0 +1,145 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <libgen.h> + +#include "bpf_load.h" +#include "bpf_util.h" +#include "libbpf.h" + +static int ifindex_in; +static int ifindex_out; +static bool ifindex_out_xdp_dummy_attached = true; + +static __u32 xdp_flags; + +static void int_exit(int sig) +{ + set_link_xdp_fd(ifindex_in, -1, xdp_flags); + if (ifindex_out_xdp_dummy_attached) + set_link_xdp_fd(ifindex_out, -1, xdp_flags); + exit(0); +} + +static void poll_stats(int interval, int ifindex) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u64 values[nr_cpus], prev[nr_cpus]; + + memset(prev, 0, sizeof(prev)); + + while (1) { + __u64 sum = 0; + __u32 key = 0; + int i; + + sleep(interval); + assert(bpf_map_lookup_elem(map_fd[1], &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += (values[i] - prev[i]); + if (sum) + printf("ifindex %i: %10llu pkt/s\n", + ifindex, sum / interval); + memcpy(prev, values, sizeof(values)); + } +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n" + "OPTS:\n" + " -S use skb-mode\n" + " -N enforce native mode\n", + prog); +} + +int main(int argc, char **argv) +{ + const char *optstr = "SN"; + char filename[256]; + int ret, opt, key = 0; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'N': + xdp_flags |= XDP_FLAGS_DRV_MODE; + break; + default: + usage(basename(argv[0])); + return 1; + } + } + + if (optind == argc) { + printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]); + return 1; + } + + ifindex_in = strtoul(argv[optind], NULL, 0); + ifindex_out = strtoul(argv[optind + 1], NULL, 0); + printf("input: %d output: %d\n", ifindex_in, ifindex_out); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + if (!prog_fd[0]) { + printf("load_bpf_file: %s\n", strerror(errno)); + return 1; + } + + if (set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) { + printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); + return 1; + } + + /* Loading dummy XDP prog on out-device */ + if (set_link_xdp_fd(ifindex_out, prog_fd[1], + (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { + printf("WARN: link set xdp fd failed on %d\n", ifindex_out); + ifindex_out_xdp_dummy_attached = false; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + printf("map[0] (vports) = %i, map[1] (map) = %i, map[2] (count) = %i\n", + map_fd[0], map_fd[1], map_fd[2]); + + /* populate virtual to physical port map */ + ret = bpf_map_update_elem(map_fd[0], &key, &ifindex_out, 0); + if (ret) { + perror("bpf_update_elem"); + goto out; + } + + poll_stats(2, ifindex_out); + +out: + return 0; +} diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c new file mode 100644 index 000000000000..4475d837bf2c --- /dev/null +++ b/samples/bpf/xdp_redirect_user.c @@ -0,0 +1,143 @@ +/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <libgen.h> + +#include "bpf_load.h" +#include "bpf_util.h" +#include "libbpf.h" + +static int ifindex_in; +static int ifindex_out; +static bool ifindex_out_xdp_dummy_attached = true; + +static __u32 xdp_flags; + +static void int_exit(int sig) +{ + set_link_xdp_fd(ifindex_in, -1, xdp_flags); + if (ifindex_out_xdp_dummy_attached) + set_link_xdp_fd(ifindex_out, -1, xdp_flags); + exit(0); +} + +static void poll_stats(int interval, int ifindex) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u64 values[nr_cpus], prev[nr_cpus]; + + memset(prev, 0, sizeof(prev)); + + while (1) { + __u64 sum = 0; + __u32 key = 0; + int i; + + sleep(interval); + assert(bpf_map_lookup_elem(map_fd[1], &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += (values[i] - prev[i]); + if (sum) + printf("ifindex %i: %10llu pkt/s\n", + ifindex, sum / interval); + memcpy(prev, values, sizeof(values)); + } +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n" + "OPTS:\n" + " -S use skb-mode\n" + " -N enforce native mode\n", + prog); +} + + +int main(int argc, char **argv) +{ + const char *optstr = "SN"; + char filename[256]; + int ret, opt, key = 0; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'N': + xdp_flags |= XDP_FLAGS_DRV_MODE; + break; + default: + usage(basename(argv[0])); + return 1; + } + } + + if (optind == argc) { + printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]); + return 1; + } + + ifindex_in = strtoul(argv[optind], NULL, 0); + ifindex_out = strtoul(argv[optind + 1], NULL, 0); + printf("input: %d output: %d\n", ifindex_in, ifindex_out); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + if (!prog_fd[0]) { + printf("load_bpf_file: %s\n", strerror(errno)); + return 1; + } + + if (set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) { + printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); + return 1; + } + + /* Loading dummy XDP prog on out-device */ + if (set_link_xdp_fd(ifindex_out, prog_fd[1], + (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { + printf("WARN: link set xdp fd failed on %d\n", ifindex_out); + ifindex_out_xdp_dummy_attached = false; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + /* bpf redirect port */ + ret = bpf_map_update_elem(map_fd[0], &key, &ifindex_out, 0); + if (ret) { + perror("bpf_update_elem"); + goto out; + } + + poll_stats(2, ifindex_out); + +out: + return 0; +} diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile new file mode 100644 index 000000000000..9291ab8e0f8c --- /dev/null +++ b/samples/sockmap/Makefile @@ -0,0 +1,78 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. +obj- := dummy.o + +# List of programs to build +hostprogs-y := sockmap + +# Libbpf dependencies +LIBBPF := ../../tools/lib/bpf/bpf.o + +HOSTCFLAGS += -I$(objtree)/usr/include +HOSTCFLAGS += -I$(srctree)/tools/lib/ +HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ +HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include +HOSTCFLAGS += -I$(srctree)/tools/perf + +sockmap-objs := ../bpf/bpf_load.o $(LIBBPF) sockmap_user.o + +# Tell kbuild to always build the programs +always := $(hostprogs-y) +always += sockmap_kern.o + +HOSTLOADLIBES_sockmap += -lelf -lpthread + +# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: +# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang +LLC ?= llc +CLANG ?= clang + +# Trick to allow make to be run from this directory +all: + $(MAKE) -C ../../ $(CURDIR)/ + +clean: + $(MAKE) -C ../../ M=$(CURDIR) clean + @rm -f *~ + +$(obj)/syscall_nrs.s: $(src)/syscall_nrs.c + $(call if_changed_dep,cc_s_c) + +$(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE + $(call filechk,offsets,__SYSCALL_NRS_H__) + +clean-files += syscall_nrs.h + +FORCE: + + +# Verify LLVM compiler tools are available and bpf target is supported by llc +.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC) + +verify_cmds: $(CLANG) $(LLC) + @for TOOL in $^ ; do \ + if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \ + echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\ + exit 1; \ + else true; fi; \ + done + +verify_target_bpf: verify_cmds + @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \ + echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\ + echo " NOTICE: LLVM version >= 3.7.1 required" ;\ + exit 2; \ + else true; fi + +$(src)/*.c: verify_target_bpf + +# asm/sysreg.h - inline assembly used by it is incompatible with llvm. +# But, there is no easy way to fix it, so just exclude it since it is +# useless for BPF samples. +$(obj)/%.o: $(src)/%.c + $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \ + -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ + -Wno-compare-distinct-pointer-types \ + -Wno-gnu-variable-sized-type-not-at-end \ + -Wno-address-of-packed-member -Wno-tautological-compare \ + -Wno-unknown-warning-option \ + -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c new file mode 100644 index 000000000000..f9b38ef82dc2 --- /dev/null +++ b/samples/sockmap/sockmap_kern.c @@ -0,0 +1,108 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include "../../tools/testing/selftests/bpf/bpf_helpers.h" +#include "../../tools/testing/selftests/bpf/bpf_endian.h" + +/* Sockmap sample program connects a client and a backend together + * using cgroups. + * + * client:X <---> frontend:80 client:X <---> backend:80 + * + * For simplicity we hard code values here and bind 1:1. The hard + * coded values are part of the setup in sockmap.sh script that + * is associated with this BPF program. + * + * The bpf_printk is verbose and prints information as connections + * are established and verdicts are decided. + */ + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +struct bpf_map_def SEC("maps") sock_map = { + .type = BPF_MAP_TYPE_SOCKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 20, +}; + +SEC("sk_skb1") +int bpf_prog1(struct __sk_buff *skb) +{ + return skb->len; +} + +SEC("sk_skb2") +int bpf_prog2(struct __sk_buff *skb) +{ + __u32 lport = skb->local_port; + __u32 rport = skb->remote_port; + int ret = 0; + + if (lport == 10000) + ret = 10; + else + ret = 1; + + bpf_printk("sockmap: %d -> %d @ %d\n", lport, bpf_ntohl(rport), ret); + return bpf_sk_redirect_map(&sock_map, ret, 0); +} + +SEC("sockops") +int bpf_sockmap(struct bpf_sock_ops *skops) +{ + __u32 lport, rport; + int op, err = 0, index, key, ret; + + + op = (int) skops->op; + + switch (op) { + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + lport = skops->local_port; + rport = skops->remote_port; + + if (lport == 10000) { + ret = 1; + err = bpf_sock_map_update(skops, &sock_map, &ret, + BPF_NOEXIST); + bpf_printk("passive(%i -> %i) map ctx update err: %d\n", + lport, bpf_ntohl(rport), err); + } + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + lport = skops->local_port; + rport = skops->remote_port; + + if (bpf_ntohl(rport) == 10001) { + ret = 10; + err = bpf_sock_map_update(skops, &sock_map, &ret, + BPF_NOEXIST); + bpf_printk("active(%i -> %i) map ctx update err: %d\n", + lport, bpf_ntohl(rport), err); + } + break; + default: + break; + } + + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c new file mode 100644 index 000000000000..7cc9d228216f --- /dev/null +++ b/samples/sockmap/sockmap_user.c @@ -0,0 +1,294 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/select.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <sys/ioctl.h> +#include <stdbool.h> +#include <signal.h> +#include <fcntl.h> + +#include <sys/time.h> +#include <sys/types.h> + +#include <linux/netlink.h> +#include <linux/socket.h> +#include <linux/sock_diag.h> +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> +#include <libgen.h> + +#include "../bpf/bpf_load.h" +#include "../bpf/bpf_util.h" +#include "../bpf/libbpf.h" + +int running; +void running_handler(int a); + +/* randomly selected ports for testing on lo */ +#define S1_PORT 10000 +#define S2_PORT 10001 + +static int sockmap_test_sockets(int rate, int dot) +{ + int i, sc, err, max_fd, one = 1; + int s1, s2, c1, c2, p1, p2; + struct sockaddr_in addr; + struct timeval timeout; + char buf[1024] = {0}; + int *fds[4] = {&s1, &s2, &c1, &c2}; + fd_set w; + + s1 = s2 = p1 = p2 = c1 = c2 = 0; + + /* Init sockets */ + for (i = 0; i < 4; i++) { + *fds[i] = socket(AF_INET, SOCK_STREAM, 0); + if (*fds[i] < 0) { + perror("socket s1 failed()"); + err = *fds[i]; + goto out; + } + } + + /* Allow reuse */ + for (i = 0; i < 2; i++) { + err = setsockopt(*fds[i], SOL_SOCKET, SO_REUSEADDR, + (char *)&one, sizeof(one)); + if (err) { + perror("setsockopt failed()"); + goto out; + } + } + + /* Non-blocking sockets */ + for (i = 0; i < 4; i++) { + err = ioctl(*fds[i], FIONBIO, (char *)&one); + if (err < 0) { + perror("ioctl s1 failed()"); + goto out; + } + } + + /* Bind server sockets */ + memset(&addr, 0, sizeof(struct sockaddr_in)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + addr.sin_port = htons(S1_PORT); + err = bind(s1, (struct sockaddr *)&addr, sizeof(addr)); + if (err < 0) { + perror("bind s1 failed()\n"); + goto out; + } + + addr.sin_port = htons(S2_PORT); + err = bind(s2, (struct sockaddr *)&addr, sizeof(addr)); + if (err < 0) { + perror("bind s2 failed()\n"); + goto out; + } + + /* Listen server sockets */ + addr.sin_port = htons(S1_PORT); + err = listen(s1, 32); + if (err < 0) { + perror("listen s1 failed()\n"); + goto out; + } + + addr.sin_port = htons(S2_PORT); + err = listen(s2, 32); + if (err < 0) { + perror("listen s1 failed()\n"); + goto out; + } + + /* Initiate Connect */ + addr.sin_port = htons(S1_PORT); + err = connect(c1, (struct sockaddr *)&addr, sizeof(addr)); + if (err < 0 && errno != EINPROGRESS) { + perror("connect c1 failed()\n"); + goto out; + } + + addr.sin_port = htons(S2_PORT); + err = connect(c2, (struct sockaddr *)&addr, sizeof(addr)); + if (err < 0 && errno != EINPROGRESS) { + perror("connect c2 failed()\n"); + goto out; + } + + /* Accept Connecrtions */ + p1 = accept(s1, NULL, NULL); + if (p1 < 0) { + perror("accept s1 failed()\n"); + goto out; + } + + p2 = accept(s2, NULL, NULL); + if (p2 < 0) { + perror("accept s1 failed()\n"); + goto out; + } + + max_fd = p2; + timeout.tv_sec = 10; + timeout.tv_usec = 0; + + printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); + printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", + c1, s1, c2, s2); + + /* Ping/Pong data from client to server */ + sc = send(c1, buf, sizeof(buf), 0); + if (sc < 0) { + perror("send failed()\n"); + goto out; + } + + do { + int s, rc, i; + + /* FD sets */ + FD_ZERO(&w); + FD_SET(c1, &w); + FD_SET(c2, &w); + FD_SET(p1, &w); + FD_SET(p2, &w); + + s = select(max_fd + 1, &w, NULL, NULL, &timeout); + if (s == -1) { + perror("select()"); + break; + } else if (!s) { + fprintf(stderr, "unexpected timeout\n"); + break; + } + + for (i = 0; i <= max_fd && s > 0; ++i) { + if (!FD_ISSET(i, &w)) + continue; + + s--; + + rc = recv(i, buf, sizeof(buf), 0); + if (rc < 0) { + if (errno != EWOULDBLOCK) { + perror("recv failed()\n"); + break; + } + } + + if (rc == 0) { + close(i); + break; + } + + sc = send(i, buf, rc, 0); + if (sc < 0) { + perror("send failed()\n"); + break; + } + } + sleep(rate); + if (dot) { + printf("."); + fflush(stdout); + + } + } while (running); + +out: + close(s1); + close(s2); + close(p1); + close(p2); + close(c1); + close(c2); + return err; +} + +int main(int argc, char **argv) +{ + int rate = 1, dot = 1; + char filename[256]; + int err, cg_fd; + char *cg_path; + + cg_path = argv[argc - 1]; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + running = 1; + + /* catch SIGINT */ + signal(SIGINT, running_handler); + + if (load_bpf_file(filename)) { + fprintf(stderr, "load_bpf_file: (%s) %s\n", + filename, strerror(errno)); + return 1; + } + + /* Cgroup configuration */ + cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); + if (cg_fd < 0) { + fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n", + cg_fd, cg_path); + return cg_fd; + } + + /* Attach programs to sockmap */ + err = bpf_prog_attach(prog_fd[0], map_fd[0], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", + err, strerror(errno)); + return err; + } + + err = bpf_prog_attach(prog_fd[1], map_fd[0], + BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", + err, strerror(errno)); + return err; + } + + /* Attach to cgroups */ + err = bpf_prog_attach(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n", + err, strerror(errno)); + return err; + } + + err = sockmap_test_sockets(rate, dot); + if (err) { + fprintf(stderr, "ERROR: test socket failed: %d\n", err); + return err; + } + return 0; +} + +void running_handler(int a) +{ + running = 0; +} |