summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/cgroup.c19
-rw-r--r--kernel/bpf/helpers.c64
-rw-r--r--kernel/bpf/inode.c5
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/memalloc.c14
-rw-r--r--kernel/bpf/ringbuf.c2
-rw-r--r--kernel/bpf/syscall.c16
-rw-r--r--kernel/bpf/verifier.c98
-rw-r--r--kernel/cgroup/cgroup.c4
-rw-r--r--kernel/fork.c12
-rw-r--r--kernel/irq/msi.c2
-rw-r--r--kernel/resource.c4
-rw-r--r--kernel/sched/ext.c29
-rw-r--r--kernel/time/posix-clock.c6
-rw-r--r--kernel/trace/bpf_trace.c6
-rw-r--r--kernel/trace/fgraph.c12
-rw-r--r--kernel/trace/trace_eprobe.c7
-rw-r--r--kernel/trace/trace_fprobe.c6
-rw-r--r--kernel/trace/trace_kprobe.c6
-rw-r--r--kernel/trace/trace_probe.c2
-rw-r--r--kernel/trace/trace_uprobe.c13
21 files changed, 201 insertions, 128 deletions
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index e7113d700b87..025d7e2214ae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -24,6 +24,23 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+/*
+ * cgroup bpf destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup bpf
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_bpf_destroy_wq;
+
+static int __init cgroup_bpf_wq_init(void)
+{
+ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
+ if (!cgroup_bpf_destroy_wq)
+ panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
+ return 0;
+}
+core_initcall(cgroup_bpf_wq_init);
+
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
- queue_work(system_wq, &cgrp->bpf.release_work);
+ queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
}
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1a43d06eab28..3d45ebe8afb4 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -111,7 +111,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
@@ -124,7 +124,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
@@ -538,7 +538,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
.arg4_size = sizeof(s64),
};
@@ -566,7 +566,7 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
.arg4_size = sizeof(u64),
};
@@ -1742,7 +1742,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
@@ -2851,21 +2851,47 @@ struct bpf_iter_bits {
__u64 __opaque[2];
} __aligned(8);
+#define BITS_ITER_NR_WORDS_MAX 511
+
struct bpf_iter_bits_kern {
union {
- unsigned long *bits;
- unsigned long bits_copy;
+ __u64 *bits;
+ __u64 bits_copy;
};
- u32 nr_bits;
+ int nr_bits;
int bit;
} __aligned(8);
+/* On 64-bit hosts, unsigned long and u64 have the same size, so passing
+ * a u64 pointer and an unsigned long pointer to find_next_bit() will
+ * return the same result, as both point to the same 8-byte area.
+ *
+ * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
+ * pointer also makes no difference. This is because the first iterated
+ * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
+ * long is composed of bits 32-63 of the u64.
+ *
+ * However, for 32-bit big-endian hosts, this is not the case. The first
+ * iterated unsigned long will be bits 32-63 of the u64, so swap these two
+ * ulong values within the u64.
+ */
+static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
+{
+#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
+ unsigned int i;
+
+ for (i = 0; i < nr; i++)
+ bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
+#endif
+}
+
/**
* bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
* @it: The new bpf_iter_bits to be created
* @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
* @nr_words: The size of the specified memory area, measured in 8-byte units.
- * Due to the limitation of memalloc, it can't be greater than 512.
+ * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
+ * further reduced by the BPF memory allocator implementation.
*
* This function initializes a new bpf_iter_bits structure for iterating over
* a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
@@ -2892,6 +2918,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
if (!unsafe_ptr__ign || !nr_words)
return -EINVAL;
+ if (nr_words > BITS_ITER_NR_WORDS_MAX)
+ return -E2BIG;
/* Optimization for u64 mask */
if (nr_bits == 64) {
@@ -2899,10 +2927,15 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
if (err)
return -EFAULT;
+ swap_ulong_in_u64(&kit->bits_copy, nr_words);
+
kit->nr_bits = nr_bits;
return 0;
}
+ if (bpf_mem_alloc_check_size(false, nr_bytes))
+ return -E2BIG;
+
/* Fallback to memalloc */
kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
if (!kit->bits)
@@ -2914,6 +2947,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
return err;
}
+ swap_ulong_in_u64(kit->bits, nr_words);
+
kit->nr_bits = nr_bits;
return 0;
}
@@ -2930,17 +2965,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
{
struct bpf_iter_bits_kern *kit = (void *)it;
- u32 nr_bits = kit->nr_bits;
- const unsigned long *bits;
- int bit;
+ int bit = kit->bit, nr_bits = kit->nr_bits;
+ const void *bits;
- if (nr_bits == 0)
+ if (!nr_bits || bit >= nr_bits)
return NULL;
bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
- bit = find_next_bit(bits, nr_bits, kit->bit + 1);
+ bit = find_next_bit(bits, nr_bits, bit + 1);
if (bit >= nr_bits) {
- kit->nr_bits = 0;
+ kit->bit = bit;
return NULL;
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index d8fc5eba529d..9aaf5124648b 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -880,7 +880,7 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
const struct btf_type *enum_t;
const char *enum_pfx;
u64 *delegate_msk, msk = 0;
- char *p;
+ char *p, *str;
int val;
/* ignore errors, fallback to hex */
@@ -911,7 +911,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
}
- while ((p = strsep(&param->string, ":"))) {
+ str = param->string;
+ while ((p = strsep(&str, ":"))) {
if (strcmp(p, "any") == 0) {
msk |= ~0ULL;
} else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 0218a5132ab5..9b60eda0f727 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
if (!key || key->prefixlen > trie->max_prefixlen)
goto find_leftmost;
- node_stack = kmalloc_array(trie->max_prefixlen,
+ node_stack = kmalloc_array(trie->max_prefixlen + 1,
sizeof(struct lpm_trie_node *),
GFP_ATOMIC | __GFP_NOWARN);
if (!node_stack)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index b3858a76e0b3..146f5b57cfb1 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -35,6 +35,8 @@
*/
#define LLIST_NODE_SZ sizeof(struct llist_node)
+#define BPF_MEM_ALLOC_SIZE_MAX 4096
+
/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
3, /* 8 */
@@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = {
static int bpf_mem_cache_idx(size_t size)
{
- if (!size || size > 4096)
+ if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
return -1;
if (size <= 192)
@@ -1005,3 +1007,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+
+int bpf_mem_alloc_check_size(bool percpu, size_t size)
+{
+ /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
+ if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
+ (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
+ return -E2BIG;
+
+ return 0;
+}
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index de3b681d1d13..e1cfe890e0be 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -632,7 +632,7 @@ const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE,
};
BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8cfa7183d2ef..c5aa127ed4cc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3069,13 +3069,17 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_link *link = filp->private_data;
const struct bpf_prog *prog = link->prog;
+ enum bpf_link_type type = link->type;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- seq_printf(m,
- "link_type:\t%s\n"
- "link_id:\t%u\n",
- bpf_link_type_strs[link->type],
- link->id);
+ if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) {
+ seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
+ } else {
+ WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type);
+ seq_printf(m, "link_type:\t<%u>\n", type);
+ }
+ seq_printf(m, "link_id:\t%u\n", link->id);
+
if (prog) {
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
@@ -5892,7 +5896,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
.arg4_size = sizeof(u64),
};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 411ab1b57af4..bb99bada7e2e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6804,20 +6804,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
struct bpf_func_state *state,
enum bpf_access_type t)
{
- struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx];
- int min_valid_off, max_bpf_stack;
-
- /* If accessing instruction is a spill/fill from bpf_fastcall pattern,
- * add room for all caller saved registers below MAX_BPF_STACK.
- * In case if bpf_fastcall rewrite won't happen maximal stack depth
- * would be checked by check_max_stack_depth_subprog().
- */
- max_bpf_stack = MAX_BPF_STACK;
- if (aux->fastcall_pattern)
- max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE;
+ int min_valid_off;
if (t == BPF_WRITE || env->allow_uninit_stack)
- min_valid_off = -max_bpf_stack;
+ min_valid_off = -MAX_BPF_STACK;
else
min_valid_off = -state->allocated_stack;
@@ -7438,7 +7428,8 @@ mark:
}
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
- int access_size, bool zero_size_allowed,
+ int access_size, enum bpf_access_type access_type,
+ bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
@@ -7450,7 +7441,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
case PTR_TO_MAP_KEY:
- if (meta && meta->raw_mode) {
+ if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type));
return -EACCES;
@@ -7458,15 +7449,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_mem_region_access(env, regno, reg->off, access_size,
reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
- if (check_map_access_type(env, regno, reg->off, access_size,
- meta && meta->raw_mode ? BPF_WRITE :
- BPF_READ))
+ if (check_map_access_type(env, regno, reg->off, access_size, access_type))
return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER);
case PTR_TO_MEM:
if (type_is_rdonly_mem(reg->type)) {
- if (meta && meta->raw_mode) {
+ if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type));
return -EACCES;
@@ -7477,7 +7466,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
zero_size_allowed);
case PTR_TO_BUF:
if (type_is_rdonly_mem(reg->type)) {
- if (meta && meta->raw_mode) {
+ if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type));
return -EACCES;
@@ -7505,7 +7494,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
* Dynamically check it now.
*/
if (!env->ops->convert_ctx_access) {
- enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
int offset = access_size - 1;
/* Allow zero-byte read from PTR_TO_CTX */
@@ -7513,7 +7501,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return zero_size_allowed ? 0 : -EACCES;
return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
- atype, -1, false, false);
+ access_type, -1, false, false);
}
fallthrough;
@@ -7538,6 +7526,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
*/
static int check_mem_size_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno,
+ enum bpf_access_type access_type,
bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
@@ -7553,15 +7542,12 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
*/
meta->msize_max_value = reg->umax_value;
- /* The register is SCALAR_VALUE; the access check
- * happens using its boundaries.
+ /* The register is SCALAR_VALUE; the access check happens using
+ * its boundaries. For unprivileged variable accesses, disable
+ * raw mode so that the program is required to initialize all
+ * the memory that the helper could just partially fill up.
*/
if (!tnum_is_const(reg->var_off))
- /* For unprivileged variable accesses, disable raw
- * mode so that the program is required to
- * initialize all the memory that the helper could
- * just partially fill up.
- */
meta = NULL;
if (reg->smin_value < 0) {
@@ -7581,9 +7567,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
regno);
return -EACCES;
}
- err = check_helper_mem_access(env, regno - 1,
- reg->umax_value,
- zero_size_allowed, meta);
+ err = check_helper_mem_access(env, regno - 1, reg->umax_value,
+ access_type, zero_size_allowed, meta);
if (!err)
err = mark_chain_precision(env, regno);
return err;
@@ -7594,13 +7579,11 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
{
bool may_be_null = type_may_be_null(reg->type);
struct bpf_reg_state saved_reg;
- struct bpf_call_arg_meta meta;
int err;
if (register_is_null(reg))
return 0;
- memset(&meta, 0, sizeof(meta));
/* Assuming that the register contains a value check if the memory
* access is safe. Temporarily save and restore the register's state as
* the conversion shouldn't be visible to a caller.
@@ -7610,10 +7593,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
mark_ptr_not_null_reg(reg);
}
- err = check_helper_mem_access(env, regno, mem_size, true, &meta);
- /* Check access for BPF_WRITE */
- meta.raw_mode = true;
- err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta);
+ err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL);
+ err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL);
if (may_be_null)
*reg = saved_reg;
@@ -7639,13 +7620,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
mark_ptr_not_null_reg(mem_reg);
}
- err = check_mem_size_reg(env, reg, regno, true, &meta);
- /* Check access for BPF_WRITE */
- meta.raw_mode = true;
- err = err ?: check_mem_size_reg(env, reg, regno, true, &meta);
+ err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta);
+ err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta);
if (may_be_null)
*mem_reg = saved_reg;
+
return err;
}
@@ -8948,9 +8928,8 @@ skip_type_check:
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
- err = check_helper_mem_access(env, regno,
- meta->map_ptr->key_size, false,
- NULL);
+ err = check_helper_mem_access(env, regno, meta->map_ptr->key_size,
+ BPF_READ, false, NULL);
break;
case ARG_PTR_TO_MAP_VALUE:
if (type_may_be_null(arg_type) && register_is_null(reg))
@@ -8965,9 +8944,9 @@ skip_type_check:
return -EACCES;
}
meta->raw_mode = arg_type & MEM_UNINIT;
- err = check_helper_mem_access(env, regno,
- meta->map_ptr->value_size, false,
- meta);
+ err = check_helper_mem_access(env, regno, meta->map_ptr->value_size,
+ arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
+ false, meta);
break;
case ARG_PTR_TO_PERCPU_BTF_ID:
if (!reg->btf_id) {
@@ -9009,7 +8988,9 @@ skip_type_check:
*/
meta->raw_mode = arg_type & MEM_UNINIT;
if (arg_type & MEM_FIXED_SIZE) {
- err = check_helper_mem_access(env, regno, fn->arg_size[arg], false, meta);
+ err = check_helper_mem_access(env, regno, fn->arg_size[arg],
+ arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
+ false, meta);
if (err)
return err;
if (arg_type & MEM_ALIGNED)
@@ -9017,10 +8998,16 @@ skip_type_check:
}
break;
case ARG_CONST_SIZE:
- err = check_mem_size_reg(env, reg, regno, false, meta);
+ err = check_mem_size_reg(env, reg, regno,
+ fn->arg_type[arg - 1] & MEM_WRITE ?
+ BPF_WRITE : BPF_READ,
+ false, meta);
break;
case ARG_CONST_SIZE_OR_ZERO:
- err = check_mem_size_reg(env, reg, regno, true, meta);
+ err = check_mem_size_reg(env, reg, regno,
+ fn->arg_type[arg - 1] & MEM_WRITE ?
+ BPF_WRITE : BPF_READ,
+ true, meta);
break;
case ARG_PTR_TO_DYNPTR:
err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
@@ -17889,9 +17876,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
int i, j, n, err, states_cnt = 0;
- bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
- bool add_new_state = force_new_state;
- bool force_exact;
+ bool force_new_state, add_new_state, force_exact;
+
+ force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
+ /* Avoid accumulating infinitely long jmp history */
+ cur->jmp_history_cnt > 40;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -17901,6 +17890,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* In tests that amounts to up to 50% reduction into total verifier
* memory consumption and 20% verifier time speedup.
*/
+ add_new_state = force_new_state;
if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
env->insn_processed - env->prev_insn_processed >= 8)
add_new_state = true;
@@ -21213,7 +21203,7 @@ patch_map_ops_generic:
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
- continue;
+ goto next_insn;
}
/* Implement bpf_kptr_xchg inline */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5886b95c6eae..044c7ba1cc48 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5789,7 +5789,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
struct cgroup *cgroup;
int ret = false;
- int level = 1;
+ int level = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -5797,7 +5797,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
if (cgroup->nr_descendants >= cgroup->max_descendants)
goto fail;
- if (level > cgroup->max_depth)
+ if (level >= cgroup->max_depth)
goto fail;
level++;
diff --git a/kernel/fork.c b/kernel/fork.c
index 89ceb4a68af2..3bf38d260cb3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -653,11 +653,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
- retval = ksm_fork(mm, oldmm);
- if (retval)
- goto out;
- khugepaged_fork(mm, oldmm);
-
/* Use __mt_dup() to efficiently build an identical maple tree. */
retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
if (unlikely(retval))
@@ -760,6 +755,8 @@ loop_out:
vma_iter_free(&vmi);
if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
+ ksm_fork(mm, oldmm);
+ khugepaged_fork(mm, oldmm);
} else if (mpnt) {
/*
* The entire maple tree has already been duplicated. If the
@@ -775,7 +772,10 @@ out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
mmap_write_unlock(oldmm);
- dup_userfaultfd_complete(&uf);
+ if (!retval)
+ dup_userfaultfd_complete(&uf);
+ else
+ dup_userfaultfd_fail(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3a24d6b5f559..396a067a8a56 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -718,7 +718,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
if (ret < 0) {
if (ops->msi_free) {
- for (i--; i > 0; i--)
+ for (i--; i >= 0; i--)
ops->msi_free(domain, info, virq + i);
}
irq_domain_free_irqs_top(domain, virq, nr_irqs);
diff --git a/kernel/resource.c b/kernel/resource.c
index b730bd28b422..4101016e8b20 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
rams_size += 16;
}
- rams[i].start = res.start;
- rams[i++].end = res.end;
-
+ rams[i++] = res;
start = res.end + 1;
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5900b06fd036..8b98ab2dca5a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -862,7 +862,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
-static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
+static int scx_ops_bypass_depth;
+static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
static bool scx_ops_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -4298,18 +4299,20 @@ bool task_should_scx(struct task_struct *p)
*/
static void scx_ops_bypass(bool bypass)
{
- int depth, cpu;
+ int cpu;
+ unsigned long flags;
+ raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
if (bypass) {
- depth = atomic_inc_return(&scx_ops_bypass_depth);
- WARN_ON_ONCE(depth <= 0);
- if (depth != 1)
- return;
+ scx_ops_bypass_depth++;
+ WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
+ if (scx_ops_bypass_depth != 1)
+ goto unlock;
} else {
- depth = atomic_dec_return(&scx_ops_bypass_depth);
- WARN_ON_ONCE(depth < 0);
- if (depth != 0)
- return;
+ scx_ops_bypass_depth--;
+ WARN_ON_ONCE(scx_ops_bypass_depth < 0);
+ if (scx_ops_bypass_depth != 0)
+ goto unlock;
}
/*
@@ -4326,7 +4329,7 @@ static void scx_ops_bypass(bool bypass)
struct rq_flags rf;
struct task_struct *p, *n;
- rq_lock_irqsave(rq, &rf);
+ rq_lock(rq, &rf);
if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@@ -4362,11 +4365,13 @@ static void scx_ops_bypass(bool bypass)
sched_enq_and_set_task(&ctx);
}
- rq_unlock_irqrestore(rq, &rf);
+ rq_unlock(rq, &rf);
/* resched to restore ticks and idle state */
resched_cpu(cpu);
}
+unlock:
+ raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
}
static void free_exit_info(struct scx_exit_info *ei)
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 316a4e8c97d3..1af0bb2cc45c 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -309,6 +309,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
struct posix_clock_desc cd;
int err;
+ if (!timespec64_valid_strict(ts))
+ return -EINVAL;
+
err = get_clock_desc(id, &cd);
if (err)
return err;
@@ -318,9 +321,6 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
goto out;
}
- if (!timespec64_valid_strict(ts))
- return -EINVAL;
-
if (cd.clk->ops.clock_settime)
err = cd.clk->ops.clock_settime(cd.clk, ts);
else
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3bd402fa62a4..630b763e5240 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1202,7 +1202,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
.arg3_size = sizeof(u64),
};
@@ -1219,7 +1219,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = {
.func = get_func_ret,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
.arg2_size = sizeof(u64),
};
@@ -2216,8 +2216,6 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
- if (ret == -ENOENT)
- goto unlock;
if (ret < 0) {
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 41e7a15dcb50..69e226a48daa 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1252,10 +1252,10 @@ int register_ftrace_graph(struct fgraph_ops *gops)
int ret = 0;
int i = -1;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
if (!fgraph_initialized) {
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init",
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph:online",
fgraph_cpu_init, NULL);
if (ret < 0) {
pr_warn("fgraph: Error to init cpu hotplug support\n");
@@ -1273,10 +1273,8 @@ int register_ftrace_graph(struct fgraph_ops *gops)
}
i = fgraph_lru_alloc_index();
- if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub)) {
- ret = -ENOSPC;
- goto out;
- }
+ if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub))
+ return -ENOSPC;
gops->idx = i;
ftrace_graph_active++;
@@ -1313,8 +1311,6 @@ error:
gops->saved_func = NULL;
fgraph_lru_release_index(i);
}
-out:
- mutex_unlock(&ftrace_lock);
return ret;
}
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index b0e0ec85912e..ebda68ee9abf 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -912,6 +912,11 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
}
+ if (argc - 2 > MAX_TRACE_ARGS) {
+ ret = -E2BIG;
+ goto error;
+ }
+
mutex_lock(&event_mutex);
event_call = find_and_get_event(sys_name, sys_event);
ep = alloc_event_probe(group, event, event_call, argc - 2);
@@ -937,7 +942,7 @@ static int __trace_eprobe_create(int argc, const char *argv[])
argc -= 2; argv += 2;
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
ret = trace_eprobe_tp_update_arg(ep, argv, i);
if (ret)
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index a079abd8955b..c62d1629cffe 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -1187,6 +1187,10 @@ static int __trace_fprobe_create(int argc, const char *argv[])
argc = new_argc;
argv = new_argv;
}
+ if (argc > MAX_TRACE_ARGS) {
+ ret = -E2BIG;
+ goto out;
+ }
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
@@ -1203,7 +1207,7 @@ static int __trace_fprobe_create(int argc, const char *argv[])
}
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
ctx.offset = 0;
ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 61a6da808203..263fac44d3ca 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1013,6 +1013,10 @@ static int __trace_kprobe_create(int argc, const char *argv[])
argc = new_argc;
argv = new_argv;
}
+ if (argc > MAX_TRACE_ARGS) {
+ ret = -E2BIG;
+ goto out;
+ }
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
@@ -1029,7 +1033,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
ctx.offset = 0;
ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 39877c80d6cb..16a5e368e7b7 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -276,7 +276,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
}
trace_probe_log_err(offset, NO_EVENT_NAME);
return -EINVAL;
- } else if (len > MAX_EVENT_NAME_LEN) {
+ } else if (len >= MAX_EVENT_NAME_LEN) {
trace_probe_log_err(offset, EVENT_TOO_LONG);
return -EINVAL;
}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c40531d2cbad..b30fc8fcd095 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -565,6 +565,8 @@ static int __trace_uprobe_create(int argc, const char **argv)
if (argc < 2)
return -ECANCELED;
+ if (argc - 2 > MAX_TRACE_ARGS)
+ return -E2BIG;
if (argv[0][1] == ':')
event = &argv[0][2];
@@ -690,7 +692,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
tu->filename = filename;
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
struct traceprobe_parse_context ctx = {
.flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
};
@@ -875,6 +877,7 @@ struct uprobe_cpu_buffer {
};
static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
static int uprobe_buffer_refcnt;
+#define MAX_UCB_BUFFER_SIZE PAGE_SIZE
static int uprobe_buffer_init(void)
{
@@ -979,6 +982,11 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
ucb = uprobe_buffer_get();
ucb->dsize = tu->tp.size + dsize;
+ if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) {
+ ucb->dsize = MAX_UCB_BUFFER_SIZE;
+ dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size;
+ }
+
store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
*ucbp = ucb;
@@ -998,9 +1006,6 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
WARN_ON(call != trace_file->event_call);
- if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE))
- return;
-
if (trace_trigger_soft_disabled(trace_file))
return;