summaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-10-31 14:56:19 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2024-10-31 14:56:19 -1000
commit5635f189425e328097714c38341944fc40731f3d (patch)
tree5b5a78c5d6d976cdac85435ca751306c91ed1ee2 /kernel/bpf
parent90602c251cda8a1e526efb250f28c1ea3f87cd78 (diff)
parentc40dd8c4732551605712985bc5b7045094c6458d (diff)
Merge tag 'bpf-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf
Pull bpf fixes from Daniel Borkmann: - Fix BPF verifier to force a checkpoint when the program's jump history becomes too long (Eduard Zingerman) - Add several fixes to the BPF bits iterator addressing issues like memory leaks and overflow problems (Hou Tao) - Fix an out-of-bounds write in trie_get_next_key (Byeonguk Jeong) - Fix BPF test infra's LIVE_FRAME frame update after a page has been recycled (Toke Høiland-Jørgensen) - Fix BPF verifier and undo the 40-bytes extra stack space for bpf_fastcall patterns due to various bugs (Eduard Zingerman) - Fix a BPF sockmap race condition which could trigger a NULL pointer dereference in sock_map_link_update_prog (Cong Wang) - Fix tcp_bpf_recvmsg_parser to retrieve seq_copied from tcp_sk under the socket lock (Jiayuan Chen) * tag 'bpf-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf: bpf, test_run: Fix LIVE_FRAME frame update after a page has been recycled selftests/bpf: Add three test cases for bits_iter bpf: Use __u64 to save the bits in bits iterator bpf: Check the validity of nr_words in bpf_iter_bits_new() bpf: Add bpf_mem_alloc_check_size() helper bpf: Free dynamically allocated bits in bpf_iter_bits_destroy() bpf: disallow 40-bytes extra stack for bpf_fastcall patterns selftests/bpf: Add test for trie_get_next_key() bpf: Fix out-of-bounds write in trie_get_next_key() selftests/bpf: Test with a very short loop bpf: Force checkpoint when jmp history is too long bpf: fix filed access without lock sock_map: fix a NULL pointer dereference in sock_map_link_update_prog()
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/helpers.c54
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/memalloc.c14
-rw-r--r--kernel/bpf/verifier.c23
4 files changed, 66 insertions, 27 deletions
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ca3f0a2e5ed5..3d45ebe8afb4 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2851,21 +2851,47 @@ struct bpf_iter_bits {
__u64 __opaque[2];
} __aligned(8);
+#define BITS_ITER_NR_WORDS_MAX 511
+
struct bpf_iter_bits_kern {
union {
- unsigned long *bits;
- unsigned long bits_copy;
+ __u64 *bits;
+ __u64 bits_copy;
};
- u32 nr_bits;
+ int nr_bits;
int bit;
} __aligned(8);
+/* On 64-bit hosts, unsigned long and u64 have the same size, so passing
+ * a u64 pointer and an unsigned long pointer to find_next_bit() will
+ * return the same result, as both point to the same 8-byte area.
+ *
+ * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
+ * pointer also makes no difference. This is because the first iterated
+ * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
+ * long is composed of bits 32-63 of the u64.
+ *
+ * However, for 32-bit big-endian hosts, this is not the case. The first
+ * iterated unsigned long will be bits 32-63 of the u64, so swap these two
+ * ulong values within the u64.
+ */
+static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
+{
+#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
+ unsigned int i;
+
+ for (i = 0; i < nr; i++)
+ bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
+#endif
+}
+
/**
* bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
* @it: The new bpf_iter_bits to be created
* @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
* @nr_words: The size of the specified memory area, measured in 8-byte units.
- * Due to the limitation of memalloc, it can't be greater than 512.
+ * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
+ * further reduced by the BPF memory allocator implementation.
*
* This function initializes a new bpf_iter_bits structure for iterating over
* a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
@@ -2892,6 +2918,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
if (!unsafe_ptr__ign || !nr_words)
return -EINVAL;
+ if (nr_words > BITS_ITER_NR_WORDS_MAX)
+ return -E2BIG;
/* Optimization for u64 mask */
if (nr_bits == 64) {
@@ -2899,10 +2927,15 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
if (err)
return -EFAULT;
+ swap_ulong_in_u64(&kit->bits_copy, nr_words);
+
kit->nr_bits = nr_bits;
return 0;
}
+ if (bpf_mem_alloc_check_size(false, nr_bytes))
+ return -E2BIG;
+
/* Fallback to memalloc */
kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
if (!kit->bits)
@@ -2914,6 +2947,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
return err;
}
+ swap_ulong_in_u64(kit->bits, nr_words);
+
kit->nr_bits = nr_bits;
return 0;
}
@@ -2930,17 +2965,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
{
struct bpf_iter_bits_kern *kit = (void *)it;
- u32 nr_bits = kit->nr_bits;
- const unsigned long *bits;
- int bit;
+ int bit = kit->bit, nr_bits = kit->nr_bits;
+ const void *bits;
- if (nr_bits == 0)
+ if (!nr_bits || bit >= nr_bits)
return NULL;
bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
- bit = find_next_bit(bits, nr_bits, kit->bit + 1);
+ bit = find_next_bit(bits, nr_bits, bit + 1);
if (bit >= nr_bits) {
- kit->nr_bits = 0;
+ kit->bit = bit;
return NULL;
}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 0218a5132ab5..9b60eda0f727 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
if (!key || key->prefixlen > trie->max_prefixlen)
goto find_leftmost;
- node_stack = kmalloc_array(trie->max_prefixlen,
+ node_stack = kmalloc_array(trie->max_prefixlen + 1,
sizeof(struct lpm_trie_node *),
GFP_ATOMIC | __GFP_NOWARN);
if (!node_stack)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index b3858a76e0b3..146f5b57cfb1 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -35,6 +35,8 @@
*/
#define LLIST_NODE_SZ sizeof(struct llist_node)
+#define BPF_MEM_ALLOC_SIZE_MAX 4096
+
/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
3, /* 8 */
@@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = {
static int bpf_mem_cache_idx(size_t size)
{
- if (!size || size > 4096)
+ if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
return -1;
if (size <= 192)
@@ -1005,3 +1007,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+
+int bpf_mem_alloc_check_size(bool percpu, size_t size)
+{
+ /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
+ if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
+ (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
+ return -E2BIG;
+
+ return 0;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 587a6c76e564..bb99bada7e2e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6804,20 +6804,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
struct bpf_func_state *state,
enum bpf_access_type t)
{
- struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx];
- int min_valid_off, max_bpf_stack;
-
- /* If accessing instruction is a spill/fill from bpf_fastcall pattern,
- * add room for all caller saved registers below MAX_BPF_STACK.
- * In case if bpf_fastcall rewrite won't happen maximal stack depth
- * would be checked by check_max_stack_depth_subprog().
- */
- max_bpf_stack = MAX_BPF_STACK;
- if (aux->fastcall_pattern)
- max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE;
+ int min_valid_off;
if (t == BPF_WRITE || env->allow_uninit_stack)
- min_valid_off = -max_bpf_stack;
+ min_valid_off = -MAX_BPF_STACK;
else
min_valid_off = -state->allocated_stack;
@@ -17886,9 +17876,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
int i, j, n, err, states_cnt = 0;
- bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
- bool add_new_state = force_new_state;
- bool force_exact;
+ bool force_new_state, add_new_state, force_exact;
+
+ force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
+ /* Avoid accumulating infinitely long jmp history */
+ cur->jmp_history_cnt > 40;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -17898,6 +17890,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* In tests that amounts to up to 50% reduction into total verifier
* memory consumption and 20% verifier time speedup.
*/
+ add_new_state = force_new_state;
if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
env->insn_processed - env->prev_insn_processed >= 8)
add_new_state = true;