Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/Makefile             |   3
-rw-r--r--  kernel/bpf/arraymap.c           |  40
-rw-r--r--  kernel/bpf/bpf_inode_storage.c  |   2
-rw-r--r--  kernel/bpf/bpf_iter.c           |  16
-rw-r--r--  kernel/bpf/bpf_local_storage.c  |  39
-rw-r--r--  kernel/bpf/bpf_lsm.c            |   4
-rw-r--r--  kernel/bpf/bpf_task_storage.c   | 100
-rw-r--r--  kernel/bpf/btf.c                | 108
-rw-r--r--  kernel/bpf/cpumap.c             |   9
-rw-r--r--  kernel/bpf/devmap.c             |  17
-rw-r--r--  kernel/bpf/hashtab.c            |  65
-rw-r--r--  kernel/bpf/helpers.c            |   2
-rw-r--r--  kernel/bpf/verifier.c           | 393
-rw-r--r--  kernel/fork.c                   |   5
-rw-r--r--  kernel/trace/bpf_trace.c        |   6
15 files changed, 669 insertions(+), 140 deletions(-)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d1249340fd6b..7f33098ca63f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,8 +9,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o -obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o @@ -18,7 +18,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1f8453343bf2..463d25e1e67e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -625,6 +625,42 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; +static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags) +{ + u32 i, key, num_elems = 0; + struct bpf_array *array; + bool is_percpu; + u64 ret = 0; + void *val; + + if (flags != 0) + return -EINVAL; + + is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + array = container_of(map, struct bpf_array, map); + if (is_percpu) + migrate_disable(); + for (i = 0; i < map->max_entries; i++) { + if (is_percpu) + val = this_cpu_ptr(array->pptrs[i]); + else + val = array->value + array->elem_size * i; + num_elems++; + key = i; + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)&key, (u64)(long)val, + (u64)(long)callback_ctx, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) + break; + } + + if (is_percpu) + migrate_enable(); + return num_elems; +} + static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, @@ -643,6 +679,8 @@ const struct bpf_map_ops array_map_ops = { .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_array_elem, .map_btf_name = "bpf_array", .map_btf_id = &array_map_btf_id, .iter_seq_info = &iter_seq_info, @@ -660,6 +698,8 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_array_elem, .map_btf_name = "bpf_array", .map_btf_id = &percpu_array_map_btf_id, .iter_seq_info = &iter_seq_info, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index b58b2efb9b43..2921ca39a93e 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -237,7 +237,7 @@ static void inode_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&inode_cache, 
smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static int inode_storage_map_btf_id; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index a0d9eade9c80..931870f9cf56 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) */ return ret == 0 ? 0 : -EAGAIN; } + +BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, + void *, callback_ctx, u64, flags) +{ + return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); +} + +const struct bpf_func_proto bpf_for_each_map_elem_proto = { + .func = bpf_for_each_map_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index dd5aedee99e7..b305270b7a4b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; + unsigned long flags; if (unlikely(!selem_linked_to_storage(selem))) /* selem has already been unlinked from sk */ return; local_storage = rcu_dereference(selem->local_storage); - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true); - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) kfree_rcu(local_storage, rcu); @@ -167,6 +168,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; + unsigned long flags; if (unlikely(!selem_linked_to_map(selem))) /* selem has already be unlinked from smap */ @@ -174,21 +176,22 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) smap = rcu_dereference(SDATA(selem)->smap); b = select_bucket(smap, selem); - raw_spin_lock_bh(&b->lock); + raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_bh(&b->lock); + raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + unsigned long flags; - raw_spin_lock_bh(&b->lock); + raw_spin_lock_irqsave(&b->lock, flags); RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_bh(&b->lock); + raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_unlink(struct bpf_local_storage_elem *selem) @@ -224,16 +227,18 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, sdata = SDATA(selem); if (cacheit_lockit) { + unsigned long flags; + /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). 
*/ - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], sdata); - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); } return sdata; @@ -327,6 +332,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; + unsigned long flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ @@ -374,7 +380,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -428,11 +434,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } unlock: - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); return SDATA(selem); unlock_err: - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); return ERR_PTR(err); } @@ -468,7 +474,8 @@ void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, spin_unlock(&cache->idx_lock); } -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, + int __percpu *busy_counter) { struct bpf_local_storage_elem *selem; struct bpf_local_storage_map_bucket *b; @@ -497,7 +504,15 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { + if (busy_counter) { + migrate_disable(); + __this_cpu_inc(*busy_counter); + } bpf_selem_unlink(selem); + if (busy_counter) { + __this_cpu_dec(*busy_counter); + migrate_enable(); + } cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 1622a44d1617..9829f381b51c 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -115,10 +115,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; - case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_proto; case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index e0da0258b732..fd3c74ef608e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -15,21 +15,41 @@ #include <linux/bpf_local_storage.h> #include <linux/filter.h> #include <uapi/linux/btf.h> -#include <linux/bpf_lsm.h> #include <linux/btf_ids.h> #include <linux/fdtable.h> DEFINE_BPF_STORAGE_CACHE(task_cache); +DEFINE_PER_CPU(int, bpf_task_storage_busy); + +static void bpf_task_storage_lock(void) +{ + migrate_disable(); + __this_cpu_inc(bpf_task_storage_busy); +} + +static void bpf_task_storage_unlock(void) +{ + __this_cpu_dec(bpf_task_storage_busy); + migrate_enable(); +} + +static bool bpf_task_storage_trylock(void) +{ + migrate_disable(); + if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { + 
__this_cpu_dec(bpf_task_storage_busy); + migrate_enable(); + return false; + } + return true; +} + static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; - struct bpf_storage_blob *bsb; - bsb = bpf_task(task); - if (!bsb) - return NULL; - return &bsb->storage; + return &task->bpf_storage; } static struct bpf_local_storage_data * @@ -38,13 +58,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, { struct bpf_local_storage *task_storage; struct bpf_local_storage_map *smap; - struct bpf_storage_blob *bsb; - - bsb = bpf_task(task); - if (!bsb) - return NULL; - task_storage = rcu_dereference(bsb->storage); + task_storage = rcu_dereference(task->bpf_storage); if (!task_storage) return NULL; @@ -57,16 +72,12 @@ void bpf_task_storage_free(struct task_struct *task) struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_task_storage = false; - struct bpf_storage_blob *bsb; struct hlist_node *n; - - bsb = bpf_task(task); - if (!bsb) - return; + unsigned long flags; rcu_read_lock(); - local_storage = rcu_dereference(bsb->storage); + local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) { rcu_read_unlock(); return; @@ -81,7 +92,8 @@ void bpf_task_storage_free(struct task_struct *task) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_bh(&local_storage->lock); + bpf_task_storage_lock(); + raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. @@ -90,7 +102,8 @@ void bpf_task_storage_free(struct task_struct *task) free_task_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, false); } - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_task_storage_unlock(); rcu_read_unlock(); /* free_task_storage should always be true as long as @@ -123,7 +136,9 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) goto out; } + bpf_task_storage_lock(); sdata = task_storage_lookup(task, map, true); + bpf_task_storage_unlock(); put_pid(pid); return sdata ? sdata->data : NULL; out: @@ -150,13 +165,15 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); - if (!task || !task_storage_ptr(task)) { + if (!task) { err = -ENOENT; goto out; } + bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags); + bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); out: @@ -199,7 +216,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) goto out; } + bpf_task_storage_lock(); err = task_storage_delete(task, map); + bpf_task_storage_unlock(); out: put_pid(pid); return err; @@ -213,44 +232,47 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; - /* explicitly check that the task_storage_ptr is not - * NULL as task_storage_lookup returns NULL in this case and - * bpf_local_storage_update expects the owner to have a - * valid storage pointer. 
- */ - if (!task || !task_storage_ptr(task)) + if (!task) + return (unsigned long)NULL; + + if (!bpf_task_storage_trylock()) return (unsigned long)NULL; sdata = task_storage_lookup(task, map, true); if (sdata) - return (unsigned long)sdata->data; + goto unlock; - /* This helper must only be called from places where the lifetime of the task - * is guaranteed. Either by being refcounted or by being protected - * by an RCU read-side critical section. - */ - if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + /* only allocate new storage, when the task is refcounted */ + if (refcount_read(&task->usage) && + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); - return IS_ERR(sdata) ? (unsigned long)NULL : - (unsigned long)sdata->data; - } - return (unsigned long)NULL; +unlock: + bpf_task_storage_unlock(); + return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; } BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { + int ret; + if (!task) return -EINVAL; + if (!bpf_task_storage_trylock()) + return -EBUSY; + /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. */ - return task_storage_delete(task, map); + ret = task_storage_delete(task, map); + bpf_task_storage_unlock(); + return ret; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -276,7 +298,7 @@ static void task_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, &bpf_task_storage_busy); } static int task_storage_map_btf_id; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b1a76fe046cb..369faeddf1df 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -173,7 +173,7 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x8f00ffff +#define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) @@ -280,6 +280,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; static const char *btf_type_str(const struct btf_type *t) @@ -574,6 +575,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: return true; } @@ -1704,6 +1706,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: size = type->size; goto resolved; @@ -1849,7 +1852,7 @@ static int btf_df_check_kflag_member(struct btf_verifier_env *env, return -EINVAL; } -/* Used for ptr, array and struct/union type members. +/* Used for ptr, array struct/union and float type members. * int, enum and modifier types have their specific callback functions. 
*/ static int btf_generic_check_kflag_member(struct btf_verifier_env *env, @@ -3675,6 +3678,81 @@ static const struct btf_kind_operations datasec_ops = { .show = btf_datasec_show, }; +static s32 btf_float_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 && + t->size != 16) { + btf_verifier_log_type(env, t, "Invalid type_size"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static int btf_float_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u64 start_offset_bytes; + u64 end_offset_bytes; + u64 misalign_bits; + u64 align_bytes; + u64 align_bits; + + /* Different architectures have different alignment requirements, so + * here we check only for the reasonable minimum. This way we ensure + * that types after CO-RE can pass the kernel BTF verifier. + */ + align_bytes = min_t(u64, sizeof(void *), member_type->size); + align_bits = align_bytes * BITS_PER_BYTE; + div64_u64_rem(member->offset, align_bits, &misalign_bits); + if (misalign_bits) { + btf_verifier_log_member(env, struct_type, member, + "Member is not properly aligned"); + return -EINVAL; + } + + start_offset_bytes = member->offset / BITS_PER_BYTE; + end_offset_bytes = start_offset_bytes + member_type->size; + if (end_offset_bytes > struct_type->size) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static void btf_float_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u", t->size); +} + +static const struct btf_kind_operations float_ops = { + .check_meta = btf_float_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_float_check_member, + .check_kflag_member = btf_generic_check_kflag_member, + .log_details = btf_float_log, + .show = btf_df_show, +}; + static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { @@ -3808,6 +3886,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = &func_proto_ops, [BTF_KIND_VAR] = &var_ops, [BTF_KIND_DATASEC] = &datasec_ops, + [BTF_KIND_FLOAT] = &float_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -4592,8 +4671,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } arg = off / 8; args = (const struct btf_param *)(t + 1); - /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */ - nr_args = t ? btf_type_vlen(t) : 5; + /* if (t == NULL) Fall back to default BPF prog with + * MAX_BPF_FUNC_REG_ARGS u64 arguments. + */ + nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; @@ -4649,7 +4730,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } else { if (!t) - /* Default prog with 5 args */ + /* Default prog with MAX_BPF_FUNC_REG_ARGS args */ return true; t = btf_type_by_id(btf, args[arg].type); } @@ -5100,12 +5181,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, if (!func) { /* BTF function prototype doesn't match the verifier types. 
- * Fall back to 5 u64 args. + * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. */ - for (i = 0; i < 5; i++) + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) m->arg_size[i] = 8; m->ret_size = 8; - m->nr_args = 5; + m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; } args = (const struct btf_param *)(func + 1); @@ -5328,8 +5409,9 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); - if (nargs > 5) { - bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Function %s has %d > %d args\n", tname, nargs, + MAX_BPF_FUNC_REG_ARGS); goto out; } @@ -5458,9 +5540,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); - if (nargs > 5) { - bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n", - tname, nargs); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n", + tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } /* check that function returns int */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 5d1469de6921..0cf2791d5099 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -543,7 +543,6 @@ static void cpu_map_free(struct bpf_map *map) * complete. */ - bpf_clear_redirect_map(map); synchronize_rcu(); /* For cpu_map the remote CPUs can still be using the entries @@ -563,7 +562,7 @@ static void cpu_map_free(struct bpf_map *map) kfree(cmap); } -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -600,6 +599,11 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem); +} + static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -612,6 +616,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_cpu_map", .map_btf_id = &cpu_map_btf_id, + .map_redirect = cpu_map_redirect, }; static void bq_flush_to_queue(struct xdp_bulk_queue *bq) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 85d9d1b72a33..7a5ad7331c3b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -197,7 +197,6 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); - bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ @@ -258,7 +257,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); @@ -392,7 +391,7 @@ void __dev_flush(void) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. 
*/ -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; @@ -735,6 +734,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem); +} + +static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem); +} + static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -747,6 +756,7 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_btf_id, + .map_redirect = dev_map_redirect, }; static int dev_map_hash_map_btf_id; @@ -761,6 +771,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_hash_map_btf_id, + .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d63912e73ad9..330d721dd2af 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1869,6 +1869,63 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), }; +static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + struct htab_elem *elem; + u32 roundup_key_size; + int i, num_elems = 0; + void __percpu *pptr; + struct bucket *b; + void *key, *val; + bool is_percpu; + u64 ret = 0; + + if (flags != 0) + return -EINVAL; + + is_percpu = htab_is_percpu(htab); + + roundup_key_size = round_up(map->key_size, 8); + /* disable migration so percpu value prepared here will be the + * same as the one seen by the bpf program with bpf_map_lookup_elem(). 
+ */ + if (is_percpu) + migrate_disable(); + for (i = 0; i < htab->n_buckets; i++) { + b = &htab->buckets[i]; + rcu_read_lock(); + head = &b->head; + hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + key = elem->key; + if (is_percpu) { + /* current cpu value for percpu map */ + pptr = htab_elem_get_ptr(elem, map->key_size); + val = this_cpu_ptr(pptr); + } else { + val = elem->key + roundup_key_size; + } + num_elems++; + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)key, (u64)(long)val, + (u64)(long)callback_ctx, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) { + rcu_read_unlock(); + goto out; + } + } + rcu_read_unlock(); + } +out: + if (is_percpu) + migrate_enable(); + return num_elems; +} + static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -1881,6 +1938,8 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab), .map_btf_name = "bpf_htab", .map_btf_id = &htab_map_btf_id, @@ -1900,6 +1959,8 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_map_btf_id, @@ -2019,6 +2080,8 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_percpu_map_btf_id, @@ -2036,6 +2099,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_percpu_map_btf_id, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 308427fe03a3..074800226327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -708,6 +708,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 44e4ec1640f1..999bf36ffeb1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_func(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && + insn->src_reg == BPF_PSEUDO_FUNC; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -248,6 +254,7 @@ struct bpf_call_arg_meta { u32 btf_id; struct btf *ret_btf; u32 ret_btf_id; + u32 subprogno; }; struct btf *btf_vmlinux; 
@@ -390,6 +397,24 @@ __printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, env->prev_linfo = linfo; } +static void verbose_invalid_scalar(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + struct tnum *range, const char *ctx, + const char *reg_name) +{ + char tn_buf[48]; + + verbose(env, "At %s the register %s ", ctx, reg_name); + if (!tnum_is_unknown(reg->var_off)) { + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "has value %s", tn_buf); + } else { + verbose(env, "has unknown scalar value"); + } + tnum_strn(tn_buf, sizeof(tn_buf), *range); + verbose(env, " should have been in %s\n", tn_buf); +} + static bool type_is_pkt_pointer(enum bpf_reg_type type) { return type == PTR_TO_PACKET || @@ -409,6 +434,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || + type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON; } @@ -451,7 +477,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type) type == ARG_PTR_TO_MEM_OR_NULL || type == ARG_PTR_TO_CTX_OR_NULL || type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || + type == ARG_PTR_TO_STACK_OR_NULL; } /* Determine whether the function releases some resources allocated by another @@ -541,6 +568,8 @@ static const char * const reg_type_str[] = { [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", [PTR_TO_RDWR_BUF] = "rdwr_buf", [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", }; static char slot_type_char[] = { @@ -612,6 +641,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || + t == PTR_TO_MAP_KEY || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose(env, ",ks=%d,vs=%d", @@ -1519,7 +1549,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) } ret = find_subprog(env, off); if (ret >= 0) - return 0; + return ret; if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { verbose(env, "too many subprograms\n"); return -E2BIG; @@ -1527,7 +1557,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) env->subprog_info[env->subprog_cnt++].start = off; sort(env->subprog_info, env->subprog_cnt, sizeof(env->subprog_info[0]), cmp_subprogs, NULL); - return 0; + return env->subprog_cnt - 1; } static int check_subprogs(struct bpf_verifier_env *env) @@ -1544,6 +1574,19 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. 
The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { + if (bpf_pseudo_func(insn + i)) { + if (!env->bpf_capable) { + verbose(env, + "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + return -EPERM; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + /* remember subprog */ + insn[i + 1].imm = ret; + continue; + } if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { @@ -2295,6 +2338,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_MEM_OR_NULL: + case PTR_TO_FUNC: + case PTR_TO_MAP_KEY: return true; default: return false; @@ -2899,6 +2944,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, reg = &cur_regs(env)[regno]; switch (reg->type) { + case PTR_TO_MAP_KEY: + verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", + mem_size, off, size); + break; case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", mem_size, off, size); @@ -3304,6 +3353,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_FLOW_KEYS: pointer_desc = "flow keys "; break; + case PTR_TO_MAP_KEY: + pointer_desc = "key "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -3405,7 +3457,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (!bpf_pseudo_call(insn + i)) + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3842,7 +3894,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* for access checks, reg->off is just part of off */ off += reg->off; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_KEY) { + if (t == BPF_WRITE) { + verbose(env, "write to change key R%d not allowed\n", regno); + return -EACCES; + } + + err = check_mem_region_access(env, regno, off, size, + reg->map_ptr->key_size, false); + if (err) + return err; + if (value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4258,6 +4322,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MAP_KEY: + return check_mem_region_access(env, regno, reg->off, access_size, + reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, regno, reg->off, access_size, meta && meta->raw_mode ? 
BPF_WRITE : @@ -4474,6 +4541,7 @@ static const struct bpf_reg_types map_key_value_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4505,6 +4573,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_RDONLY_BUF, @@ -4517,6 +4586,7 @@ static const struct bpf_reg_types int_ptr_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4529,6 +4599,8 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; +static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4557,6 +4629,8 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, + [ARG_PTR_TO_FUNC] = &func_ptr_types, + [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4738,6 +4812,8 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_FUNC) { + meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. @@ -5258,13 +5334,19 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, } } -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx) +typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx); + +static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) { struct bpf_verifier_state *state = env->cur_state; struct bpf_func_info_aux *func_info_aux; struct bpf_func_state *caller, *callee; - int i, err, subprog, target_insn; + int err; bool is_global = false; if (state->curframe + 1 >= MAX_CALL_FRAMES) { @@ -5273,14 +5355,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -E2BIG; } - target_insn = *insn_idx + insn->imm; - subprog = find_subprog(env, target_insn + 1); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", - target_insn + 1); - return -EFAULT; - } - caller = state->frame[state->curframe]; if (state->frame[state->curframe + 1]) { verbose(env, "verifier bug. Frame %d already allocated\n", @@ -5335,11 +5409,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err) return err; - /* copy r1 - r5 args that callee can access. 
The copy includes parent - * pointers, which connects us up to the liveness chain - */ - for (i = BPF_REG_1; i <= BPF_REG_5; i++) - callee->regs[i] = caller->regs[i]; + err = set_callee_state_cb(env, caller, callee, *insn_idx); + if (err) + return err; clear_caller_saved_regs(env, caller->regs); @@ -5347,7 +5419,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe++; /* and go analyze first insn of the callee */ - *insn_idx = target_insn; + *insn_idx = env->subprog_info[subprog].start - 1; if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); @@ -5358,6 +5430,92 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } +int map_set_for_each_callback_args(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee) +{ + /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, + * void *callback_ctx, u64 flags); + * callback_fn(struct bpf_map *map, void *key, void *value, + * void *callback_ctx); + */ + callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1]; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr; + + /* pointer to stack or null */ + callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + return 0; +} + +static int set_callee_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, int insn_idx) +{ + int i; + + /* copy r1 - r5 args that callee can access. The copy includes parent + * pointers, which connects us up to the liveness chain + */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + callee->regs[i] = caller->regs[i]; + return 0; +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + int subprog, target_insn; + + target_insn = *insn_idx + insn->imm + 1; + subprog = find_subprog(env, target_insn); + if (subprog < 0) { + verbose(env, "verifier bug. 
No program starts at insn %d\n", + target_insn); + return -EFAULT; + } + + return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); +} + +static int set_map_elem_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map; + int err; + + if (bpf_map_ptr_poisoned(insn_aux)) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + + map = BPF_MAP_PTR(insn_aux->map_ptr_state); + if (!map->ops->map_set_for_each_callback_args || + !map->ops->map_for_each_callback) { + verbose(env, "callback function not allowed for map\n"); + return -ENOTSUPP; + } + + err = map->ops->map_set_for_each_callback_args(env, caller, callee); + if (err) + return err; + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -5380,8 +5538,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) state->curframe--; caller = state->frame[state->curframe]; - /* return to the caller whatever r0 had in the callee */ - caller->regs[BPF_REG_0] = *r0; + if (callee->in_callback_fn) { + /* enforce R0 return value range [0, 1]. */ + struct tnum range = tnum_range(0, 1); + + if (r0->type != SCALAR_VALUE) { + verbose(env, "R0 not a scalar value\n"); + return -EACCES; + } + if (!tnum_in(range, r0->var_off)) { + verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + return -EINVAL; + } + } else { + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + } /* Transfer references to the caller */ err = transfer_reference_state(caller, callee); @@ -5436,7 +5608,9 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && - func_id != BPF_FUNC_map_peek_elem) + func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_for_each_map_elem && + func_id != BPF_FUNC_redirect_map) return 0; if (map == NULL) { @@ -5517,15 +5691,18 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? 
-EINVAL : 0; } -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; + int insn_idx = *insn_idx_p; bool changes_data; - int i, err; + int i, err, func_id; /* find function prototype */ + func_id = insn->imm; if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); @@ -5571,7 +5748,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ - for (i = 0; i < 5; i++) { + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { err = check_func_arg(env, i, &meta, fn); if (err) return err; @@ -5621,6 +5798,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (func_id == BPF_FUNC_for_each_map_elem) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_map_elem_callback_state); + if (err < 0) + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -5878,6 +6062,19 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, else *ptr_limit = -off - 1; return *ptr_limit >= max ? -ERANGE : 0; + case PTR_TO_MAP_KEY: + /* Currently, this code is not exercised as the only use + * is bpf_for_each_map_elem() helper which requires + * bpf_capble. The code has been tested manually for + * future use. + */ + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->key_size - off; + } + return 0; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; if (mask_to_left) { @@ -5909,7 +6106,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, aux->alu_limit != alu_limit)) return -EACCES; - /* Corresponding fixup done in fixup_bpf_calls(). */ + /* Corresponding fixup done in do_misc_fixups(). 
*/ aux->alu_state = alu_state; aux->alu_limit = alu_limit; return 0; @@ -6084,6 +6281,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", @@ -8263,6 +8461,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } + if (insn->src_reg == BPF_PSEUDO_FUNC) { + struct bpf_prog_aux *aux = env->prog->aux; + u32 subprogno = insn[1].imm; + + if (!aux->func_info) { + verbose(env, "missing btf func_info\n"); + return -EINVAL; + } + if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) { + verbose(env, "callback function not static\n"); + return -EINVAL; + } + + dst_reg->type = PTR_TO_FUNC; + dst_reg->subprogno = subprogno; + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; @@ -8491,17 +8707,7 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { - char tn_buf[48]; - - verbose(env, "At program exit the register R0 "); - if (!tnum_is_unknown(reg->var_off)) { - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "has value %s", tn_buf); - } else { - verbose(env, "has unknown scalar value"); - } - tnum_strn(tn_buf, sizeof(tn_buf), range); - verbose(env, " should have been in %s\n", tn_buf); + verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); return -EINVAL; } @@ -8628,6 +8834,27 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return DONE_EXPLORING; } +static int visit_func_call_insn(int t, int insn_cnt, + struct bpf_insn *insns, + struct bpf_verifier_env *env, + bool visit_callee) +{ + int ret; + + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + if (ret) + return ret; + + if (t + 1 < insn_cnt) + init_explored_state(env, t + 1); + if (visit_callee) { + init_explored_state(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); + } + return ret; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -8638,6 +8865,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; int ret; + if (bpf_pseudo_func(insns + t)) + return visit_func_call_insn(t, insn_cnt, insns, env, true); + /* All non-branch instructions have a single fall-through edge. 
*/ if (BPF_CLASS(insns[t].code) != BPF_JMP && BPF_CLASS(insns[t].code) != BPF_JMP32) @@ -8648,18 +8878,8 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); - if (ret) - return ret; - - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); - if (insns[t].src_reg == BPF_PSEUDO_CALL) { - init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, - env, false); - } - return ret; + return visit_func_call_insn(t, insn_cnt, insns, env, + insns[t].src_reg == BPF_PSEUDO_CALL); case BPF_JA: if (BPF_SRC(insns[t].code) != BPF_K) @@ -9272,6 +9492,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, */ return false; } + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. @@ -10118,10 +10339,9 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, env->insn_idx); + err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; - } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || @@ -10550,6 +10770,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) goto next_insn; } + if (insn[0].src_reg == BPF_PSEUDO_FUNC) { + aux = &env->insn_aux_data[i]; + aux->ptr_type = PTR_TO_FUNC; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ @@ -10682,9 +10908,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i; - for (i = 0; i < insn_cnt; i++, insn++) - if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) - insn->src_reg = 0; + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + if (insn->src_reg == BPF_PSEUDO_FUNC) + continue; + insn->src_reg = 0; + } } /* single env->prog->insni[off] instruction was replaced with the range @@ -11323,6 +11553,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + env->insn_aux_data[i].call_imm = insn->imm; + /* subprog is encoded in insn[1].imm */ + continue; + } + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but @@ -11452,6 +11688,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { + if (bpf_pseudo_func(insn)) { + subprog = insn[1].imm; + insn[0].imm = (u32)(long)func[subprog]->bpf_func; + insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; + continue; + } if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; @@ -11497,6 +11739,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. 
*/ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + insn[0].imm = env->insn_aux_data[i].call_imm; + insn[1].imm = find_subprog(env, i + insn[0].imm + 1); + continue; + } if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; @@ -11561,6 +11808,14 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* When JIT fails the progs with callback calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "callbacks are not allowed in non-JITed programs\n"); + return -EINVAL; + } + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); @@ -11573,12 +11828,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } -/* fixup insn->imm field of bpf_call instructions - * and inline eligible helpers as explicit sequence of BPF instructions - * - * this function is called after eBPF program passed verification +/* Do various post-verification rewrites in a single program pass. + * These rewrites simplify JIT and interpreter implementations. */ -static int fixup_bpf_calls(struct bpf_verifier_env *env) +static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; bool expect_blinding = bpf_jit_blinding_enabled(prog); @@ -11593,6 +11846,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, ret, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + /* Make divide-by-zero exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || insn->code == (BPF_ALU | BPF_MOD | BPF_X) || @@ -11633,6 +11887,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ if (BPF_CLASS(insn->code) == BPF_LD && (BPF_MODE(insn->code) == BPF_ABS || BPF_MODE(insn->code) == BPF_IND)) { @@ -11652,6 +11907,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + /* Rewrite pointer arithmetic to mitigate speculation attacks. */ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; @@ -11800,7 +12056,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_delete_elem || insn->imm == BPF_FUNC_map_push_elem || insn->imm == BPF_FUNC_map_pop_elem || - insn->imm == BPF_FUNC_map_peek_elem)) { + insn->imm == BPF_FUNC_map_peek_elem || + insn->imm == BPF_FUNC_redirect_map)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -11842,6 +12099,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_redirect, + (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: @@ -11868,11 +12128,16 @@ patch_map_ops_generic: insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - __bpf_call_base; continue; + case BPF_FUNC_redirect_map: + insn->imm = BPF_CAST_CALL(ops->map_redirect) - + __bpf_call_base; + continue; } goto patch_call_imm; } + /* Implement bpf_jiffies64 inline. 
*/ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_jiffies64) { struct bpf_insn ld_jiffies_addr[2] = { @@ -12683,7 +12948,7 @@ skip_full_check: ret = convert_ctx_accesses(env); if (ret == 0) - ret = fixup_bpf_calls(env); + ret = do_misc_fixups(env); /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. diff --git a/kernel/fork.c b/kernel/fork.c index 54cc905e5fe0..50209691f21a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,6 +96,7 @@ #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> +#include <linux/bpf.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk) cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); + bpf_task_storage_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); @@ -2072,6 +2074,9 @@ static __latent_entropy struct task_struct *copy_process( p->sequential_io = 0; p->sequential_io_avg = 0; #endif +#ifdef CONFIG_BPF_SYSCALL + RCU_INIT_POINTER(p->bpf_storage, NULL); +#endif /* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b0c45d923f0f..0d23755c2747 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1367,6 +1367,12 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: return NULL; } |
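For reference, the bpf_for_each_map_elem() helper introduced above (helper proto in kernel/bpf/bpf_iter.c, per-map callbacks in arraymap.c and hashtab.c, verifier support for PTR_TO_FUNC and PTR_TO_MAP_KEY) is driven entirely from BPF program code. What follows is a minimal usage sketch, not part of this patch: it assumes a libbpf-style build whose bpf_helpers.h already declares bpf_for_each_map_elem(), and the map, section and function names (count_map, sum_values, sum_on_getpgid) are illustrative only.

/* Illustrative sketch only: exercises the new bpf_for_each_map_elem()
 * helper from a BPF program. Assumes bpf_helpers.h generated against a
 * kernel that includes this series.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u64);
} count_map SEC(".maps");

struct callback_ctx {
	__u64 sum;
};

/* Callback shape set up by map_set_for_each_callback_args():
 * (map, key, value, callback_ctx). The verifier requires it to be a
 * static (BTF_FUNC_STATIC) function whose return value stays in [0, 1].
 */
static __u64 sum_values(struct bpf_map *map, __u32 *key, __u64 *val,
			struct callback_ctx *data)
{
	data->sum += *val;
	return 0;	/* 0 - continue, 1 - stop and return */
}

SEC("tracepoint/syscalls/sys_enter_getpgid")
int sum_on_getpgid(void *ctx)
{
	struct callback_ctx data = { .sum = 0 };
	long visited;

	/* flags must be 0; the helper returns the number of elements
	 * the callback was invoked on.
	 */
	visited = bpf_for_each_map_elem(&count_map, sum_values, &data, 0);
	bpf_printk("visited %ld elems, sum %llu", visited, data.sum);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

As enforced by the verifier changes above, the callback must be static, its R0 return value is restricted to the range [0, 1], flags must be 0, and callbacks are rejected for non-JITed programs. The same series also moves the bpf_task_storage_get()/bpf_task_storage_delete() protos from bpf_lsm.c to bpf_trace.c, making task-local storage usable from tracing programs as well.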