summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2017-08-11 12:12:57 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2017-08-11 12:12:57 +1000
commit2d81258a7bf556b9e93697f7538d8d2aeaf8d543 (patch)
tree8881d9abc6a2d2911fb299fd552749c39ae7d740 /kernel
parent9ad008b4d8649b08897b8a633ab6a89e2a5da3a1 (diff)
parent3b2b69efeca734b78bc85fd02253b0465bb2bec7 (diff)
Merge remote-tracking branch 'net-next/master'
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/core.c60
-rw-r--r--kernel/bpf/devmap.c434
-rw-r--r--kernel/bpf/syscall.c78
-rw-r--r--kernel/bpf/tnum.c180
-rw-r--r--kernel/bpf/verifier.c2223
-rw-r--r--kernel/events/core.c10
-rw-r--r--kernel/trace/trace_syscalls.c53
8 files changed, 2069 insertions, 974 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1e5e658f2db..2f0bcda40e90 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,10 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+ifeq ($(CONFIG_NET),y)
+obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ad5f55922a13..c69e7f5bfde7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -595,9 +595,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_JMP | BPF_JEQ | BPF_K:
case BPF_JMP | BPF_JNE | BPF_K:
case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
case BPF_JMP | BPF_JSET | BPF_K:
/* Accommodate for extra offset in case of a backjump. */
off = from->off;
@@ -833,12 +837,20 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
+ [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
+ [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
+ [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
+ [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
/* Program return */
@@ -1073,6 +1085,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLT_X:
+ if (DST < SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLT_K:
+ if (DST < IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JGE_X:
if (DST >= SRC) {
insn += insn->off;
@@ -1085,6 +1109,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLE_X:
+ if (DST <= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLE_K:
+ if (DST <= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGT_X:
if (((s64) DST) > ((s64) SRC)) {
insn += insn->off;
@@ -1097,6 +1133,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLT_X:
+ if (((s64) DST) < ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLT_K:
+ if (((s64) DST) < ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGE_X:
if (((s64) DST) >= ((s64) SRC)) {
insn += insn->off;
@@ -1109,6 +1157,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLE_X:
+ if (((s64) DST) <= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLE_K:
+ if (((s64) DST) <= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSET_X:
if (DST & SRC) {
insn += insn->off;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
new file mode 100644
index 000000000000..7192fb67d4de
--- /dev/null
+++ b/kernel/bpf/devmap.c
@@ -0,0 +1,434 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* Devmaps primary use is as a backend map for XDP BPF helper call
+ * bpf_redirect_map(). Because XDP is mostly concerned with performance we
+ * spent some effort to ensure the datapath with redirect maps does not use
+ * any locking. This is a quick note on the details.
+ *
+ * We have three possible paths to get into the devmap control plane bpf
+ * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
+ * will invoke an update, delete, or lookup operation. To ensure updates and
+ * deletes appear atomic from the datapath side xchg() is used to modify the
+ * netdev_map array. Then because the datapath does a lookup into the netdev_map
+ * array (read-only) from an RCU critical section we use call_rcu() to wait for
+ * an rcu grace period before free'ing the old data structures. This ensures the
+ * datapath always has a valid copy. However, the datapath does a "flush"
+ * operation that pushes any pending packets in the driver outside the RCU
+ * critical section. Each bpf_dtab_netdev tracks these pending operations using
+ * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
+ * until all bits are cleared indicating outstanding flush operations have
+ * completed.
+ *
+ * BPF syscalls may race with BPF program calls on any of the update, delete
+ * or lookup operations. As noted above the xchg() operation also keep the
+ * netdev_map consistent in this case. From the devmap side BPF programs
+ * calling into these operations are the same as multiple user space threads
+ * making system calls.
+ *
+ * Finally, any of the above may race with a netdev_unregister notifier. The
+ * unregister notifier must search for net devices in the map structure that
+ * contain a reference to the net device and remove them. This is a two step
+ * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
+ * check to see if the ifindex is the same as the net_device being removed.
+ * When removing the dev a cmpxchg() is used to ensure the correct dev is
+ * removed, in the case of a concurrent update or delete operation it is
+ * possible that the initially referenced dev is no longer in the map. As the
+ * notifier hook walks the map we know that new dev references can not be
+ * added by the user because core infrastructure ensures dev_get_by_index()
+ * calls will fail at this point.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/rculist_nulls.h>
+#include "percpu_freelist.h"
+#include "bpf_lru_list.h"
+#include "map_in_map.h"
+
+struct bpf_dtab_netdev {
+ struct net_device *dev;
+ int key;
+ struct rcu_head rcu;
+ struct bpf_dtab *dtab;
+};
+
+struct bpf_dtab {
+ struct bpf_map map;
+ struct bpf_dtab_netdev **netdev_map;
+ unsigned long int __percpu *flush_needed;
+ struct list_head list;
+};
+
+static DEFINE_SPINLOCK(dev_map_lock);
+static LIST_HEAD(dev_map_list);
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_dtab *dtab;
+ u64 cost;
+ int err;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags)
+ return ERR_PTR(-EINVAL);
+
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ dtab = kzalloc(sizeof(*dtab), GFP_USER);
+ if (!dtab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ dtab->map.map_type = attr->map_type;
+ dtab->map.key_size = attr->key_size;
+ dtab->map.value_size = attr->value_size;
+ dtab->map.max_entries = attr->max_entries;
+ dtab->map.map_flags = attr->map_flags;
+
+ err = -ENOMEM;
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+ cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_dtab;
+
+ dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(dtab->map.pages);
+ if (err)
+ goto free_dtab;
+
+ err = -ENOMEM;
+ /* A per cpu bitfield with a bit per possible net device */
+ dtab->flush_needed = __alloc_percpu(
+ BITS_TO_LONGS(attr->max_entries) *
+ sizeof(unsigned long),
+ __alignof__(unsigned long));
+ if (!dtab->flush_needed)
+ goto free_dtab;
+
+ dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+ sizeof(struct bpf_dtab_netdev *));
+ if (!dtab->netdev_map)
+ goto free_dtab;
+
+ spin_lock(&dev_map_lock);
+ list_add_tail_rcu(&dtab->list, &dev_map_list);
+ spin_unlock(&dev_map_lock);
+ return &dtab->map;
+
+free_dtab:
+ free_percpu(dtab->flush_needed);
+ kfree(dtab);
+ return ERR_PTR(err);
+}
+
+static void dev_map_free(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int i, cpu;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further reads against netdev_map. It does __not__ ensure pending
+ * flush operations (if any) are complete.
+ */
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, dtab->map.max_entries))
+ cpu_relax();
+ }
+
+ /* Although we should no longer have datapath or bpf syscall operations
+ * at this point we we can still race with netdev notifier, hence the
+ * lock.
+ */
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev;
+
+ dev = dtab->netdev_map[i];
+ if (!dev)
+ continue;
+
+ dev_put(dev->dev);
+ kfree(dev);
+ }
+
+ /* At this point bpf program is detached and all pending operations
+ * _must_ be complete
+ */
+ spin_lock(&dev_map_lock);
+ list_del_rcu(&dtab->list);
+ spin_unlock(&dev_map_lock);
+ free_percpu(dtab->flush_needed);
+ bpf_map_area_free(dtab->netdev_map);
+ kfree(dtab);
+}
+
+static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= dtab->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == dtab->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+void __dev_map_insert_ctx(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+
+ __set_bit(key, bitmap);
+}
+
+struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dev;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ dev = READ_ONCE(dtab->netdev_map[key]);
+ return dev ? dev->dev : NULL;
+}
+
+/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+ * from the driver before returning from its napi->poll() routine. The poll()
+ * routine is called either from busy_poll context or net_rx_action signaled
+ * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
+ * net device can be torn down. On devmap tear down we ensure the ctx bitmap
+ * is zeroed before completing to ensure all flush operations have completed.
+ */
+void __dev_map_flush(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+ u32 bit;
+
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+ struct net_device *netdev;
+
+ /* This is possible if the dev entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!dev))
+ continue;
+
+ netdev = dev->dev;
+
+ __clear_bit(bit, bitmap);
+ if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush))
+ continue;
+
+ netdev->netdev_ops->ndo_xdp_flush(netdev);
+ }
+}
+
+/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
+ * update happens in parallel here a dev_put wont happen until after reading the
+ * ifindex.
+ */
+static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dev;
+ u32 i = *(u32 *)key;
+
+ if (i >= map->max_entries)
+ return NULL;
+
+ dev = READ_ONCE(dtab->netdev_map[i]);
+ return dev ? &dev->dev->ifindex : NULL;
+}
+
+static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev)
+{
+ if (old_dev->dev->netdev_ops->ndo_xdp_flush) {
+ struct net_device *fl = old_dev->dev;
+ unsigned long *bitmap;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu);
+ __clear_bit(old_dev->key, bitmap);
+
+ fl->netdev_ops->ndo_xdp_flush(old_dev->dev);
+ }
+ }
+}
+
+static void __dev_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_dtab_netdev *old_dev;
+
+ old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ dev_map_flush_old(old_dev);
+ dev_put(old_dev->dev);
+ kfree(old_dev);
+}
+
+static int dev_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *old_dev;
+ int k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ /* Use synchronize_rcu() here to ensure any rcu critical sections
+ * have completed, but this does not guarantee a flush has happened
+ * yet. Because driver side rcu_read_lock/unlock only protects the
+ * running XDP program. However, for pending flush operations the
+ * dev and ctx are stored in another per cpu map. And additionally,
+ * the driver tear down ensures all soft irqs are complete before
+ * removing the net device in the case of dev_put equals zero.
+ */
+ old_dev = xchg(&dtab->netdev_map[k], NULL);
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ return 0;
+}
+
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_dtab_netdev *dev, *old_dev;
+ u32 i = *(u32 *)key;
+ u32 ifindex = *(u32 *)value;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+
+ if (unlikely(i >= dtab->map.max_entries))
+ return -E2BIG;
+
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+
+ if (!ifindex) {
+ dev = NULL;
+ } else {
+ dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->dev = dev_get_by_index(net, ifindex);
+ if (!dev->dev) {
+ kfree(dev);
+ return -EINVAL;
+ }
+
+ dev->key = i;
+ dev->dtab = dtab;
+ }
+
+ /* Use call_rcu() here to ensure rcu critical sections have completed
+ * Remembering the driver side flush operation will happen before the
+ * net device is removed.
+ */
+ old_dev = xchg(&dtab->netdev_map[i], dev);
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+ return 0;
+}
+
+const struct bpf_map_ops dev_map_ops = {
+ .map_alloc = dev_map_alloc,
+ .map_free = dev_map_free,
+ .map_get_next_key = dev_map_get_next_key,
+ .map_lookup_elem = dev_map_lookup_elem,
+ .map_update_elem = dev_map_update_elem,
+ .map_delete_elem = dev_map_delete_elem,
+};
+
+static int dev_map_notification(struct notifier_block *notifier,
+ ulong event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_dtab *dtab;
+ int i;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* This rcu_read_lock/unlock pair is needed because
+ * dev_map_list is an RCU list AND to ensure a delete
+ * operation does not free a netdev_map entry while we
+ * are comparing it against the netdev being unregistered.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(dtab, &dev_map_list, list) {
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev, *odev;
+
+ dev = READ_ONCE(dtab->netdev_map[i]);
+ if (!dev ||
+ dev->dev->ifindex != netdev->ifindex)
+ continue;
+ odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
+ if (dev == odev)
+ call_rcu(&dev->rcu,
+ __dev_map_entry_free);
+ }
+ }
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_map_notifier = {
+ .notifier_call = dev_map_notification,
+};
+
+static int __init dev_map_init(void)
+{
+ register_netdevice_notifier(&dev_map_notifier);
+ return 0;
+}
+
+subsys_initcall(dev_map_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6c772adabad2..fbe09a0cccf4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -48,6 +48,47 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
#undef BPF_MAP_TYPE
};
+/*
+ * If we're handed a bigger struct than we know of, ensure all the unknown bits
+ * are 0 - i.e. new user-space does not rely on any kernel feature extensions
+ * we don't know about yet.
+ *
+ * There is a ToCToU between this function call and the following
+ * copy_from_user() call. However, this is not a concern since this function is
+ * meant to be a future-proofing of bits.
+ */
+static int check_uarg_tail_zero(void __user *uaddr,
+ size_t expected_size,
+ size_t actual_size)
+{
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+ int err;
+
+ if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
+ return -E2BIG;
+
+ if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
+ return -EFAULT;
+
+ if (actual_size <= expected_size)
+ return 0;
+
+ addr = uaddr + expected_size;
+ end = uaddr + actual_size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
struct bpf_map *map;
@@ -1246,32 +1287,6 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
return fd;
}
-static int check_uarg_tail_zero(void __user *uaddr,
- size_t expected_size,
- size_t actual_size)
-{
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
- int err;
-
- if (actual_size <= expected_size)
- return 0;
-
- addr = uaddr + expected_size;
- end = uaddr + actual_size;
-
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
- return err;
- if (val)
- return -E2BIG;
- }
-
- return 0;
-}
-
static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1393,17 +1408,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
return -EPERM;
- if (!access_ok(VERIFY_READ, uattr, 1))
- return -EFAULT;
-
- if (size > PAGE_SIZE) /* silly large */
- return -E2BIG;
-
- /* If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
err = check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
new file mode 100644
index 000000000000..1f4bf68c12db
--- /dev/null
+++ b/kernel/bpf/tnum.c
@@ -0,0 +1,180 @@
+/* tnum: tracked (or tristate) numbers
+ *
+ * A tnum tracks knowledge about the bits of a value. Each bit can be either
+ * known (0 or 1), or unknown (x). Arithmetic operations on tnums will
+ * propagate the unknown bits such that the tnum result represents all the
+ * possible results for possible values of the operands.
+ */
+#include <linux/kernel.h>
+#include <linux/tnum.h>
+
+#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}
+/* A completely unknown value */
+const struct tnum tnum_unknown = { .value = 0, .mask = -1 };
+
+struct tnum tnum_const(u64 value)
+{
+ return TNUM(value, 0);
+}
+
+struct tnum tnum_range(u64 min, u64 max)
+{
+ u64 chi = min ^ max, delta;
+ u8 bits = fls64(chi);
+
+ /* special case, needed because 1ULL << 64 is undefined */
+ if (bits > 63)
+ return tnum_unknown;
+ /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7.
+ * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return
+ * constant min (since min == max).
+ */
+ delta = (1ULL << bits) - 1;
+ return TNUM(min & ~delta, delta);
+}
+
+struct tnum tnum_lshift(struct tnum a, u8 shift)
+{
+ return TNUM(a.value << shift, a.mask << shift);
+}
+
+struct tnum tnum_rshift(struct tnum a, u8 shift)
+{
+ return TNUM(a.value >> shift, a.mask >> shift);
+}
+
+struct tnum tnum_add(struct tnum a, struct tnum b)
+{
+ u64 sm, sv, sigma, chi, mu;
+
+ sm = a.mask + b.mask;
+ sv = a.value + b.value;
+ sigma = sm + sv;
+ chi = sigma ^ sv;
+ mu = chi | a.mask | b.mask;
+ return TNUM(sv & ~mu, mu);
+}
+
+struct tnum tnum_sub(struct tnum a, struct tnum b)
+{
+ u64 dv, alpha, beta, chi, mu;
+
+ dv = a.value - b.value;
+ alpha = dv + a.mask;
+ beta = dv - b.mask;
+ chi = alpha ^ beta;
+ mu = chi | a.mask | b.mask;
+ return TNUM(dv & ~mu, mu);
+}
+
+struct tnum tnum_and(struct tnum a, struct tnum b)
+{
+ u64 alpha, beta, v;
+
+ alpha = a.value | a.mask;
+ beta = b.value | b.mask;
+ v = a.value & b.value;
+ return TNUM(v, alpha & beta & ~v);
+}
+
+struct tnum tnum_or(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value | b.value;
+ mu = a.mask | b.mask;
+ return TNUM(v, mu & ~v);
+}
+
+struct tnum tnum_xor(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value ^ b.value;
+ mu = a.mask | b.mask;
+ return TNUM(v & ~mu, mu);
+}
+
+/* half-multiply add: acc += (unknown * mask * value).
+ * An intermediate step in the multiply algorithm.
+ */
+static struct tnum hma(struct tnum acc, u64 value, u64 mask)
+{
+ while (mask) {
+ if (mask & 1)
+ acc = tnum_add(acc, TNUM(0, value));
+ mask >>= 1;
+ value <<= 1;
+ }
+ return acc;
+}
+
+struct tnum tnum_mul(struct tnum a, struct tnum b)
+{
+ struct tnum acc;
+ u64 pi;
+
+ pi = a.value * b.value;
+ acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value);
+ return hma(acc, b.mask, a.value);
+}
+
+/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
+ * a 'known 0' - this will return a 'known 1' for that bit.
+ */
+struct tnum tnum_intersect(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value | b.value;
+ mu = a.mask & b.mask;
+ return TNUM(v & ~mu, mu);
+}
+
+struct tnum tnum_cast(struct tnum a, u8 size)
+{
+ a.value &= (1ULL << (size * 8)) - 1;
+ a.mask &= (1ULL << (size * 8)) - 1;
+ return a;
+}
+
+bool tnum_is_aligned(struct tnum a, u64 size)
+{
+ if (!size)
+ return true;
+ return !((a.value | a.mask) & (size - 1));
+}
+
+bool tnum_in(struct tnum a, struct tnum b)
+{
+ if (b.mask & ~a.mask)
+ return false;
+ b.value &= ~a.mask;
+ return a.value == b.value;
+}
+
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+ return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
+int tnum_sbin(char *str, size_t size, struct tnum a)
+{
+ size_t n;
+
+ for (n = 64; n; n--) {
+ if (n < size) {
+ if (a.mask & 1)
+ str[n - 1] = 'x';
+ else if (a.value & 1)
+ str[n - 1] = '1';
+ else
+ str[n - 1] = '0';
+ }
+ a.mask >>= 1;
+ a.value >>= 1;
+ }
+ str[min(size - 1, (size_t)64)] = 0;
+ return 64;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 664d93972373..ecc590e01a1d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -61,12 +61,12 @@
* (and -20 constant is saved for further stack bounds checking).
* Meaning that this reg is a pointer to stack plus known immediate constant.
*
- * Most of the time the registers have UNKNOWN_VALUE type, which
+ * Most of the time the registers have SCALAR_VALUE type, which
* means the register has some value, but it's not a valid pointer.
- * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ * (like pointer plus pointer becomes SCALAR_VALUE type)
*
* When verifier sees load or store instructions the type of base register
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
* types recognized by check_mem_access() function.
*
* PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
@@ -140,7 +140,7 @@ struct bpf_verifier_stack_elem {
struct bpf_verifier_stack_elem *next;
};
-#define BPF_COMPLEXITY_LIMIT_INSNS 98304
+#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
@@ -180,15 +180,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
- [UNKNOWN_VALUE] = "inv",
+ [SCALAR_VALUE] = "inv",
[PTR_TO_CTX] = "ctx",
[CONST_PTR_TO_MAP] = "map_ptr",
[PTR_TO_MAP_VALUE] = "map_value",
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj",
- [FRAME_PTR] = "fp",
[PTR_TO_STACK] = "fp",
- [CONST_IMM] = "imm",
[PTR_TO_PACKET] = "pkt",
[PTR_TO_PACKET_END] = "pkt_end",
};
@@ -221,32 +218,52 @@ static void print_verifier_state(struct bpf_verifier_state *state)
if (t == NOT_INIT)
continue;
verbose(" R%d=%s", i, reg_type_str[t]);
- if (t == CONST_IMM || t == PTR_TO_STACK)
- verbose("%lld", reg->imm);
- else if (t == PTR_TO_PACKET)
- verbose("(id=%d,off=%d,r=%d)",
- reg->id, reg->off, reg->range);
- else if (t == UNKNOWN_VALUE && reg->imm)
- verbose("%lld", reg->imm);
- else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL ||
- t == PTR_TO_MAP_VALUE_ADJ)
- verbose("(ks=%d,vs=%d,id=%u)",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size,
- reg->id);
- if (reg->min_value != BPF_REGISTER_MIN_RANGE)
- verbose(",min_value=%lld",
- (long long)reg->min_value);
- if (reg->max_value != BPF_REGISTER_MAX_RANGE)
- verbose(",max_value=%llu",
- (unsigned long long)reg->max_value);
- if (reg->min_align)
- verbose(",min_align=%u", reg->min_align);
- if (reg->aux_off)
- verbose(",aux_off=%u", reg->aux_off);
- if (reg->aux_off_align)
- verbose(",aux_off_align=%u", reg->aux_off_align);
+ if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
+ tnum_is_const(reg->var_off)) {
+ /* reg->off should be 0 for SCALAR_VALUE */
+ verbose("%lld", reg->var_off.value + reg->off);
+ } else {
+ verbose("(id=%d", reg->id);
+ if (t != SCALAR_VALUE)
+ verbose(",off=%d", reg->off);
+ if (t == PTR_TO_PACKET)
+ verbose(",r=%d", reg->range);
+ else if (t == CONST_PTR_TO_MAP ||
+ t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose(",ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
+ if (tnum_is_const(reg->var_off)) {
+ /* Typically an immediate SCALAR_VALUE, but
+ * could be a pointer whose offset is too big
+ * for reg->off
+ */
+ verbose(",imm=%llx", reg->var_off.value);
+ } else {
+ if (reg->smin_value != reg->umin_value &&
+ reg->smin_value != S64_MIN)
+ verbose(",smin_value=%lld",
+ (long long)reg->smin_value);
+ if (reg->smax_value != reg->umax_value &&
+ reg->smax_value != S64_MAX)
+ verbose(",smax_value=%lld",
+ (long long)reg->smax_value);
+ if (reg->umin_value != 0)
+ verbose(",umin_value=%llu",
+ (unsigned long long)reg->umin_value);
+ if (reg->umax_value != U64_MAX)
+ verbose(",umax_value=%llu",
+ (unsigned long long)reg->umax_value);
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(",var_off=%s", tn_buf);
+ }
+ }
+ verbose(")");
+ }
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
@@ -295,11 +312,15 @@ static const char *const bpf_jmp_string[16] = {
[BPF_JA >> 4] = "jmp",
[BPF_JEQ >> 4] = "==",
[BPF_JGT >> 4] = ">",
+ [BPF_JLT >> 4] = "<",
[BPF_JGE >> 4] = ">=",
+ [BPF_JLE >> 4] = "<=",
[BPF_JSET >> 4] = "&",
[BPF_JNE >> 4] = "!=",
[BPF_JSGT >> 4] = "s>",
+ [BPF_JSLT >> 4] = "s<",
[BPF_JSGE >> 4] = "s>=",
+ [BPF_JSLE >> 4] = "s<=",
[BPF_CALL >> 4] = "call",
[BPF_EXIT >> 4] = "exit",
};
@@ -463,56 +484,161 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void __mark_reg_not_init(struct bpf_reg_state *reg);
+
+/* Mark the unknown part of a register (variable offset or scalar value) as
+ * known to have the value @imm.
+ */
+static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- BUG_ON(regno >= MAX_BPF_REG);
+ reg->id = 0;
+ reg->var_off = tnum_const(imm);
+ reg->smin_value = (s64)imm;
+ reg->smax_value = (s64)imm;
+ reg->umin_value = imm;
+ reg->umax_value = imm;
+}
- memset(&regs[regno], 0, sizeof(regs[regno]));
- regs[regno].type = NOT_INIT;
- regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
- regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+/* Mark the 'variable offset' part of a register as zero. This should be
+ * used only on registers holding a pointer type.
+ */
+static void __mark_reg_known_zero(struct bpf_reg_state *reg)
+{
+ __mark_reg_known(reg, 0);
}
-static void init_reg_state(struct bpf_reg_state *regs)
+static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
{
- int i;
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_known_zero(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_known_zero(regs + regno);
+}
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_reg_not_init(regs, i);
+/* Attempts to improve min/max values based on var_off information */
+static void __update_reg_bounds(struct bpf_reg_state *reg)
+{
+ /* min signed is max(sign bit) | min(other bits) */
+ reg->smin_value = max_t(s64, reg->smin_value,
+ reg->var_off.value | (reg->var_off.mask & S64_MIN));
+ /* max signed is min(sign bit) | max(other bits) */
+ reg->smax_value = min_t(s64, reg->smax_value,
+ reg->var_off.value | (reg->var_off.mask & S64_MAX));
+ reg->umin_value = max(reg->umin_value, reg->var_off.value);
+ reg->umax_value = min(reg->umax_value,
+ reg->var_off.value | reg->var_off.mask);
+}
- /* frame pointer */
- regs[BPF_REG_FP].type = FRAME_PTR;
+/* Uses signed min/max values to inform unsigned, and vice-versa */
+static void __reg_deduce_bounds(struct bpf_reg_state *reg)
+{
+ /* Learn sign from signed bounds.
+ * If we cannot cross the sign boundary, then signed and unsigned bounds
+ * are the same, so combine. This works even in the negative case, e.g.
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ */
+ if (reg->smin_value >= 0 || reg->smax_value < 0) {
+ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
+ reg->umin_value);
+ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
+ reg->umax_value);
+ return;
+ }
+ /* Learn sign from unsigned bounds. Signed bounds cross the sign
+ * boundary, so we must be careful.
+ */
+ if ((s64)reg->umax_value >= 0) {
+ /* Positive. We can't learn anything from the smin, but smax
+ * is positive, hence safe.
+ */
+ reg->smin_value = reg->umin_value;
+ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
+ reg->umax_value);
+ } else if ((s64)reg->umin_value < 0) {
+ /* Negative. We can't learn anything from the smax, but smin
+ * is negative, hence safe.
+ */
+ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
+ reg->umin_value);
+ reg->smax_value = reg->umax_value;
+ }
+}
- /* 1st arg to a function */
- regs[BPF_REG_1].type = PTR_TO_CTX;
+/* Attempts to improve var_off based on unsigned min/max information */
+static void __reg_bound_offset(struct bpf_reg_state *reg)
+{
+ reg->var_off = tnum_intersect(reg->var_off,
+ tnum_range(reg->umin_value,
+ reg->umax_value));
+}
+
+/* Reset the min/max bounds of a register */
+static void __mark_reg_unbounded(struct bpf_reg_state *reg)
+{
+ reg->smin_value = S64_MIN;
+ reg->smax_value = S64_MAX;
+ reg->umin_value = 0;
+ reg->umax_value = U64_MAX;
}
-static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+/* Mark a register as having a completely unknown (scalar) value. */
+static void __mark_reg_unknown(struct bpf_reg_state *reg)
{
- regs[regno].type = UNKNOWN_VALUE;
- regs[regno].id = 0;
- regs[regno].imm = 0;
+ reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->off = 0;
+ reg->var_off = tnum_unknown;
+ __mark_reg_unbounded(reg);
}
-static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
{
- BUG_ON(regno >= MAX_BPF_REG);
- __mark_reg_unknown_value(regs, regno);
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_unknown(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_unknown(regs + regno);
}
-static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
+static void __mark_reg_not_init(struct bpf_reg_state *reg)
{
- regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
- regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
- regs[regno].value_from_signed = false;
- regs[regno].min_align = 0;
+ __mark_reg_unknown(reg);
+ reg->type = NOT_INIT;
}
-static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
- u32 regno)
+static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
{
- mark_reg_unknown_value(regs, regno);
- reset_reg_range_values(regs, regno);
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_not_init(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_not_init(regs + regno);
+}
+
+static void init_reg_state(struct bpf_reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ mark_reg_not_init(regs, i);
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = PTR_TO_STACK;
+ mark_reg_known_zero(regs, BPF_REG_FP);
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+ mark_reg_known_zero(regs, BPF_REG_1);
}
enum reg_arg_type {
@@ -542,7 +668,7 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
return -EACCES;
}
if (t == DST_OP)
- mark_reg_unknown_value(regs, regno);
+ mark_reg_unknown(regs, regno);
}
return 0;
}
@@ -552,12 +678,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
switch (type) {
case PTR_TO_MAP_VALUE:
case PTR_TO_MAP_VALUE_OR_NULL:
- case PTR_TO_MAP_VALUE_ADJ:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
case PTR_TO_PACKET_END:
- case FRAME_PTR:
case CONST_PTR_TO_MAP:
return true;
default:
@@ -637,14 +761,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
return 0;
}
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size)
{
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
@@ -657,49 +780,55 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
return 0;
}
-/* check read/write into an adjusted map element */
-static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno,
+/* check read/write into a map element with possible variable offset */
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
int off, int size)
{
struct bpf_verifier_state *state = &env->cur_state;
struct bpf_reg_state *reg = &state->regs[regno];
int err;
- /* We adjusted the register to this map value, so we
- * need to change off and size to min_value and max_value
- * respectively to make sure our theoretical access will be
- * safe.
+ /* We may have adjusted the register to this map value, so we
+ * need to try adding each of min_value and max_value to off
+ * to make sure our theoretical access will be safe.
*/
if (log_level)
print_verifier_state(state);
- env->varlen_map_value_access = true;
+ /* If the offset is variable, we will need to be stricter in state
+ * pruning from now on.
+ */
+ if (!tnum_is_const(reg->var_off))
+ env->varlen_map_value_access = true;
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
* will have a set floor within our range.
*/
- if (reg->min_value < 0) {
+ if (reg->smin_value < 0) {
verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = check_map_access(env, regno, reg->min_value + off, size);
+ err = __check_map_access(env, regno, reg->smin_value + off, size);
if (err) {
- verbose("R%d min value is outside of the array range\n",
- regno);
+ verbose("R%d min value is outside of the array range\n", regno);
return err;
}
- /* If we haven't set a max value then we need to bail
- * since we can't be sure we won't do bad things.
+ /* If we haven't set a max value then we need to bail since we can't be
+ * sure we won't do bad things.
+ * If reg->umax_value + off could overflow, treat that as unbounded too.
*/
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ if (reg->umax_value >= BPF_MAX_VAR_OFF) {
verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
regno);
return -EACCES;
}
- return check_map_access(env, regno, reg->max_value + off, size);
+ err = __check_map_access(env, regno, reg->umax_value + off, size);
+ if (err)
+ verbose("R%d max value is outside of the array range\n", regno);
+ return err;
}
#define MAX_PACKET_OFF 0xffff
@@ -729,14 +858,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
}
-static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size)
{
struct bpf_reg_state *regs = env->cur_state.regs;
struct bpf_reg_state *reg = &regs[regno];
- off += reg->off;
- if (off < 0 || size <= 0 || off + size > reg->range) {
+ if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
@@ -744,7 +872,35 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
return 0;
}
-/* check access to 'struct bpf_context' fields */
+static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *reg = &regs[regno];
+ int err;
+
+ /* We may have added a variable offset to the packet pointer; but any
+ * reg->range we have comes after that. We are only checking the fixed
+ * offset.
+ */
+
+ /* We don't allow negative numbers, because we aren't tracking enough
+ * detail to prove they're safe.
+ */
+ if (reg->smin_value < 0) {
+ verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+ err = __check_packet_access(env, regno, off, size);
+ if (err) {
+ verbose("R%d offset is outside of the packet\n", regno);
+ return err;
+ }
+ return err;
+}
+
+/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
@@ -784,13 +940,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
if (allow_ptr_leaks)
return false;
- switch (reg->type) {
- case UNKNOWN_VALUE:
- case CONST_IMM:
- return false;
- default:
- return true;
- }
+ return reg->type != SCALAR_VALUE;
}
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
@@ -801,23 +951,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
+ struct tnum reg_off;
int ip_align;
- int reg_off;
/* Byte size accesses are always allowed. */
if (!strict || size == 1)
return 0;
- reg_off = reg->off;
- if (reg->id) {
- if (reg->aux_off_align % size) {
- verbose("Packet access is only %u byte aligned, %d byte access not allowed\n",
- reg->aux_off_align, size);
- return -EACCES;
- }
- reg_off += reg->aux_off;
- }
-
/* For platforms that do not have a Kconfig enabling
* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
* NET_IP_ALIGN is universally set to '2'. And on platforms
@@ -827,20 +967,37 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
* unconditional IP align value of '2'.
*/
ip_align = 2;
- if ((ip_align + reg_off + off) % size != 0) {
- verbose("misaligned packet access off %d+%d+%d size %d\n",
- ip_align, reg_off, off, size);
+
+ reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+ ip_align, tn_buf, reg->off, off, size);
return -EACCES;
}
return 0;
}
-static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
- int size, bool strict)
+static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+ const char *pointer_desc,
+ int off, int size, bool strict)
{
- if (strict && size != 1) {
- verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+ struct tnum reg_off;
+
+ /* Byte size accesses are always allowed. */
+ if (!strict || size == 1)
+ return 0;
+
+ reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("misaligned %saccess off %s+%d+%d size %d\n",
+ pointer_desc, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -852,21 +1009,25 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
int off, int size)
{
bool strict = env->strict_alignment;
+ const char *pointer_desc = "";
switch (reg->type) {
case PTR_TO_PACKET:
+ /* special case, because of NET_IP_ALIGN */
return check_pkt_ptr_alignment(reg, off, size, strict);
- case PTR_TO_MAP_VALUE_ADJ:
- return check_val_ptr_alignment(reg, size, strict);
+ case PTR_TO_MAP_VALUE:
+ pointer_desc = "value ";
+ break;
+ case PTR_TO_CTX:
+ pointer_desc = "context ";
+ break;
+ case PTR_TO_STACK:
+ pointer_desc = "stack ";
+ break;
default:
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n",
- off, size);
- return -EACCES;
- }
-
- return 0;
+ break;
}
+ return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -883,52 +1044,79 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
struct bpf_reg_state *reg = &state->regs[regno];
int size, err = 0;
- if (reg->type == PTR_TO_STACK)
- off += reg->imm;
-
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
+ /* alignment checks will add in reg->off themselves */
err = check_ptr_alignment(env, reg, off, size);
if (err)
return err;
- if (reg->type == PTR_TO_MAP_VALUE ||
- reg->type == PTR_TO_MAP_VALUE_ADJ) {
+ /* for access checks, reg->off is just part of off */
+ off += reg->off;
+
+ if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into map\n", value_regno);
return -EACCES;
}
- if (reg->type == PTR_TO_MAP_VALUE_ADJ)
- err = check_map_access_adj(env, regno, off, size);
- else
- err = check_map_access(env, regno, off, size);
+ err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
- enum bpf_reg_type reg_type = UNKNOWN_VALUE;
+ enum bpf_reg_type reg_type = SCALAR_VALUE;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
+ /* ctx accesses must be at a fixed offset, so that we can
+ * determine what type of data were returned.
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("variable ctx access var_off=%s off=%d size=%d",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+ off += reg->var_off.value;
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
- /* note that reg.[id|off|range] == 0 */
+ /* ctx access returns either a scalar, or a
+ * PTR_TO_PACKET[_END]. In the latter case, we know
+ * the offset is zero.
+ */
+ if (reg_type == SCALAR_VALUE)
+ mark_reg_unknown(state->regs, value_regno);
+ else
+ mark_reg_known_zero(state->regs, value_regno);
+ state->regs[value_regno].id = 0;
+ state->regs[value_regno].off = 0;
+ state->regs[value_regno].range = 0;
state->regs[value_regno].type = reg_type;
- state->regs[value_regno].aux_off = 0;
- state->regs[value_regno].aux_off_align = 0;
}
- } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
+ } else if (reg->type == PTR_TO_STACK) {
+ /* stack accesses must be at a fixed offset, so that we can
+ * determine what type of data were returned.
+ * See check_stack_read().
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("variable stack access var_off=%s off=%d size=%d",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+ off += reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
@@ -948,7 +1136,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
err = check_stack_read(state, off, size, value_regno);
}
- } else if (state->regs[regno].type == PTR_TO_PACKET) {
+ } else if (reg->type == PTR_TO_PACKET) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
return -EACCES;
@@ -960,21 +1148,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
}
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
} else {
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[reg->type]);
return -EACCES;
}
- if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks &&
- state->regs[value_regno].type == UNKNOWN_VALUE) {
- /* 1 or 2 byte load zero-extends, determine the number of
- * zero upper bits. Not doing it fo 4 byte load, since
- * such values cannot be added to ptr_to_packet anyway.
- */
- state->regs[value_regno].imm = 64 - size * 8;
+ if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
+ state->regs[value_regno].type == SCALAR_VALUE) {
+ /* b/h/w load zero-extends, mark upper bits as known 0 */
+ state->regs[value_regno].var_off = tnum_cast(
+ state->regs[value_regno].var_off, size);
+ __update_reg_bounds(&state->regs[value_regno]);
}
return err;
}
@@ -1016,9 +1202,17 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
BPF_SIZE(insn->code), BPF_WRITE, -1);
}
+/* Does this register contain a constant zero? */
+static bool register_is_null(struct bpf_reg_state reg)
+{
+ return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0);
+}
+
/* when register 'regno' is passed into function that will read 'access_size'
* bytes from that pointer, make sure that it's within stack boundary
- * and all elements of stack are initialized
+ * and all elements of stack are initialized.
+ * Unlike most pointer bounds-checking functions, this one doesn't take an
+ * 'off' argument, so it has to add in reg->off itself.
*/
static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
@@ -1029,9 +1223,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int off, i;
if (regs[regno].type != PTR_TO_STACK) {
+ /* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
- regs[regno].type == CONST_IMM &&
- regs[regno].imm == 0)
+ register_is_null(regs[regno]))
return 0;
verbose("R%d type=%s expected=%s\n", regno,
@@ -1040,7 +1234,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
- off = regs[regno].imm;
+ /* Only allow fixed-offset stack reads */
+ if (!tnum_is_const(regs[regno].var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
+ verbose("invalid variable stack read R%d var_off=%s\n",
+ regno, tn_buf);
+ }
+ off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
access_size <= 0) {
verbose("invalid stack type R%d off=%d access_size=%d\n",
@@ -1071,16 +1273,14 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
- switch (regs[regno].type) {
+ switch (reg->type) {
case PTR_TO_PACKET:
- return check_packet_access(env, regno, 0, access_size);
+ return check_packet_access(env, regno, reg->off, access_size);
case PTR_TO_MAP_VALUE:
- return check_map_access(env, regno, 0, access_size);
- case PTR_TO_MAP_VALUE_ADJ:
- return check_map_access_adj(env, regno, 0, access_size);
- default: /* const_imm|ptr_to_stack or invalid ptr */
+ return check_map_access(env, regno, reg->off, access_size);
+ default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
}
@@ -1123,11 +1323,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
- expected_type = CONST_IMM;
- /* One exception. Allow UNKNOWN_VALUE registers when the
- * boundaries are known and don't cause unsafe memory accesses
- */
- if (type != UNKNOWN_VALUE && type != expected_type)
+ expected_type = SCALAR_VALUE;
+ if (type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
@@ -1141,13 +1338,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
- * passed in as argument, it's a CONST_IMM type. Final test
+ * passed in as argument, it's a SCALAR_VALUE type. Final test
* happens during stack boundary checking.
*/
- if (type == CONST_IMM && reg->imm == 0)
+ if (register_is_null(*reg))
/* final test in check_stack_boundary() */;
else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
- type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
+ type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
@@ -1173,7 +1370,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
if (type == PTR_TO_PACKET)
- err = check_packet_access(env, regno, 0,
+ err = check_packet_access(env, regno, reg->off,
meta->map_ptr->key_size);
else
err = check_stack_boundary(env, regno,
@@ -1189,7 +1386,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
if (type == PTR_TO_PACKET)
- err = check_packet_access(env, regno, 0,
+ err = check_packet_access(env, regno, reg->off,
meta->map_ptr->value_size);
else
err = check_stack_boundary(env, regno,
@@ -1209,10 +1406,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
- /* If the register is UNKNOWN_VALUE, the access check happens
- * using its boundaries. Otherwise, just use its imm
+ /* The register is SCALAR_VALUE; the access check
+ * happens using its boundaries.
*/
- if (type == UNKNOWN_VALUE) {
+
+ if (!tnum_is_const(reg->var_off))
/* For unprivileged variable accesses, disable raw
* mode so that the program is required to
* initialize all the memory that the helper could
@@ -1220,35 +1418,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
meta = NULL;
- if (reg->min_value < 0) {
- verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
- return -EACCES;
- }
-
- if (reg->min_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
- if (err)
- return err;
- }
+ if (reg->smin_value < 0) {
+ verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
- verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
- }
- err = check_helper_mem_access(env, regno - 1,
- reg->max_value,
- zero_size_allowed, meta);
+ if (reg->umin_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
if (err)
return err;
- } else {
- /* register is CONST_IMM */
- err = check_helper_mem_access(env, regno - 1, reg->imm,
- zero_size_allowed, meta);
}
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->umax_value,
+ zero_size_allowed, meta);
}
return err;
@@ -1283,6 +1474,14 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
+ /* devmap returns a pointer to a live net_device ifindex that we cannot
+ * allow to be modified from bpf side. So do not allow lookup elements
+ * for now.
+ */
+ case BPF_MAP_TYPE_DEVMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
@@ -1311,6 +1510,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
break;
+ case BPF_FUNC_redirect_map:
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+ goto error;
+ break;
default:
break;
}
@@ -1340,6 +1543,9 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
+/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
+ * so turn them into unknown SCALAR_VALUE.
+ */
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
struct bpf_verifier_state *state = &env->cur_state;
@@ -1349,7 +1555,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET ||
regs[i].type == PTR_TO_PACKET_END)
- mark_reg_unknown_value(regs, i);
+ mark_reg_unknown(regs, i);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
@@ -1358,8 +1564,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
if (reg->type != PTR_TO_PACKET &&
reg->type != PTR_TO_PACKET_END)
continue;
- __mark_reg_unknown_value(state->spilled_regs,
- i / BPF_REG_SIZE);
+ __mark_reg_unknown(reg);
}
}
@@ -1439,14 +1644,17 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
/* update return register */
if (fn->ret_type == RET_INTEGER) {
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ /* sets type to SCALAR_VALUE */
+ mark_reg_unknown(regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
struct bpf_insn_aux_data *insn_aux;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
- regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
+ /* There is no offset yet applied, variable or fixed */
+ mark_reg_known_zero(regs, BPF_REG_0);
+ regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
@@ -1477,494 +1685,551 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return 0;
}
-static int check_packet_ptr_add(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static void coerce_reg_to_32(struct bpf_reg_state *reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- struct bpf_reg_state tmp_reg;
- s32 imm;
-
- if (BPF_SRC(insn->code) == BPF_K) {
- /* pkt_ptr += imm */
- imm = insn->imm;
-
-add_imm:
- if (imm < 0) {
- verbose("addition of negative constant to packet pointer is not allowed\n");
- return -EACCES;
- }
- if (imm >= MAX_PACKET_OFF ||
- imm + dst_reg->off >= MAX_PACKET_OFF) {
- verbose("constant %d is too large to add to packet pointer\n",
- imm);
- return -EACCES;
- }
- /* a constant was added to pkt_ptr.
- * Remember it while keeping the same 'id'
- */
- dst_reg->off += imm;
- } else {
- bool had_id;
-
- if (src_reg->type == PTR_TO_PACKET) {
- /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
- tmp_reg = *dst_reg; /* save r7 state */
- *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */
- src_reg = &tmp_reg; /* pretend it's src_reg state */
- /* if the checks below reject it, the copy won't matter,
- * since we're rejecting the whole program. If all ok,
- * then imm22 state will be added to r7
- * and r7 will be pkt(id=0,off=22,r=62) while
- * r6 will stay as pkt(id=0,off=0,r=62)
- */
- }
+ /* clear high 32 bits */
+ reg->var_off = tnum_cast(reg->var_off, 4);
+ /* Update bounds */
+ __update_reg_bounds(reg);
+}
- if (src_reg->type == CONST_IMM) {
- /* pkt_ptr += reg where reg is known constant */
- imm = src_reg->imm;
- goto add_imm;
- }
- /* disallow pkt_ptr += reg
- * if reg is not uknown_value with guaranteed zero upper bits
- * otherwise pkt_ptr may overflow and addition will become
- * subtraction which is not allowed
- */
- if (src_reg->type != UNKNOWN_VALUE) {
- verbose("cannot add '%s' to ptr_to_packet\n",
- reg_type_str[src_reg->type]);
- return -EACCES;
- }
- if (src_reg->imm < 48) {
- verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n",
- src_reg->imm);
- return -EACCES;
- }
+static bool signed_add_overflows(s64 a, s64 b)
+{
+ /* Do the add in u64, where overflow is well-defined */
+ s64 res = (s64)((u64)a + (u64)b);
- had_id = (dst_reg->id != 0);
+ if (b < 0)
+ return res > a;
+ return res < a;
+}
- /* dst_reg stays as pkt_ptr type and since some positive
- * integer value was added to the pointer, increment its 'id'
- */
- dst_reg->id = ++env->id_gen;
-
- /* something was added to pkt_ptr, set range to zero */
- dst_reg->aux_off += dst_reg->off;
- dst_reg->off = 0;
- dst_reg->range = 0;
- if (had_id)
- dst_reg->aux_off_align = min(dst_reg->aux_off_align,
- src_reg->min_align);
- else
- dst_reg->aux_off_align = src_reg->min_align;
- }
- return 0;
+static bool signed_sub_overflows(s64 a, s64 b)
+{
+ /* Do the sub in u64, where overflow is well-defined */
+ s64 res = (s64)((u64)a - (u64)b);
+
+ if (b < 0)
+ return res < a;
+ return res > a;
}
-static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
+/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
+ * Caller should also handle BPF_MOV case separately.
+ * If we return -EACCES, caller may want to try again treating pointer as a
+ * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks.
+ */
+static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ bool known = tnum_is_const(off_reg->var_off);
+ s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
+ smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
+ u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
+ umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
u8 opcode = BPF_OP(insn->code);
- s64 imm_log2;
+ u32 dst = insn->dst_reg;
- /* for type == UNKNOWN_VALUE:
- * imm > 0 -> number of zero upper bits
- * imm == 0 -> don't track which is the same as all bits can be non-zero
- */
+ dst_reg = &regs[dst];
- if (BPF_SRC(insn->code) == BPF_X) {
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
-
- if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
- dst_reg->imm && opcode == BPF_ADD) {
- /* dreg += sreg
- * where both have zero upper bits. Adding them
- * can only result making one more bit non-zero
- * in the larger value.
- * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
- * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
- */
- dst_reg->imm = min(dst_reg->imm, src_reg->imm);
- dst_reg->imm--;
- return 0;
- }
- if (src_reg->type == CONST_IMM && src_reg->imm > 0 &&
- dst_reg->imm && opcode == BPF_ADD) {
- /* dreg += sreg
- * where dreg has zero upper bits and sreg is const.
- * Adding them can only result making one more bit
- * non-zero in the larger value.
- */
- imm_log2 = __ilog2_u64((long long)src_reg->imm);
- dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- return 0;
- }
- /* all other cases non supported yet, just mark dst_reg */
- dst_reg->imm = 0;
- return 0;
+ if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: known but bad sbounds\n");
+ return -EINVAL;
+ }
+ if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: known but bad ubounds\n");
+ return -EINVAL;
}
- /* sign extend 32-bit imm into 64-bit to make sure that
- * negative values occupy bit 63. Note ilog2() would have
- * been incorrect, since sizeof(insn->imm) == 4
- */
- imm_log2 = __ilog2_u64((long long)insn->imm);
-
- if (dst_reg->imm && opcode == BPF_LSH) {
- /* reg <<= imm
- * if reg was a result of 2 byte load, then its imm == 48
- * which means that upper 48 bits are zero and shifting this reg
- * left by 4 would mean that upper 44 bits are still zero
- */
- dst_reg->imm -= insn->imm;
- } else if (dst_reg->imm && opcode == BPF_MUL) {
- /* reg *= imm
- * if multiplying by 14 subtract 4
- * This is conservative calculation of upper zero bits.
- * It's not trying to special case insn->imm == 1 or 0 cases
- */
- dst_reg->imm -= imm_log2 + 1;
- } else if (opcode == BPF_AND) {
- /* reg &= imm */
- dst_reg->imm = 63 - imm_log2;
- } else if (dst_reg->imm && opcode == BPF_ADD) {
- /* reg += imm */
- dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- } else if (opcode == BPF_RSH) {
- /* reg >>= imm
- * which means that after right shift, upper bits will be zero
- * note that verifier already checked that
- * 0 <= imm < 64 for shift insn
- */
- dst_reg->imm += insn->imm;
- if (unlikely(dst_reg->imm > 64))
- /* some dumb code did:
- * r2 = *(u32 *)mem;
- * r2 >>= 32;
- * and all bits are zero now */
- dst_reg->imm = 64;
- } else {
- /* all other alu ops, means that we don't know what will
- * happen to the value, mark it with unknown number of zero bits
- */
- dst_reg->imm = 0;
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops on pointers produce (meaningless) scalars */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d 32-bit pointer arithmetic prohibited\n",
+ dst);
+ return -EACCES;
}
- if (dst_reg->imm < 0) {
- /* all 64 bits of the register can contain non-zero bits
- * and such value cannot be added to ptr_to_packet, since it
- * may overflow, mark it as unknown to avoid further eval
- */
- dst_reg->imm = 0;
+ if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ dst);
+ return -EACCES;
+ }
+ if (ptr_reg->type == CONST_PTR_TO_MAP) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (ptr_reg->type == PTR_TO_PACKET_END) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ dst);
+ return -EACCES;
}
- return 0;
-}
-static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
-{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- u8 opcode = BPF_OP(insn->code);
- s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
-
- /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
- if (src_reg->imm > 0 && dst_reg->imm) {
- switch (opcode) {
- case BPF_ADD:
- /* dreg += sreg
- * where both have zero upper bits. Adding them
- * can only result making one more bit non-zero
- * in the larger value.
- * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
- * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
- */
- dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- break;
- case BPF_AND:
- /* dreg &= sreg
- * AND can not extend zero bits only shrink
- * Ex. 0x00..00ffffff
- * & 0x0f..ffffffff
- * ----------------
- * 0x00..00ffffff
- */
- dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+ /* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
+ * The id may be overwritten later if we create a new variable offset.
+ */
+ dst_reg->type = ptr_reg->type;
+ dst_reg->id = ptr_reg->id;
+
+ switch (opcode) {
+ case BPF_ADD:
+ /* We can take a fixed offset as long as it doesn't overflow
+ * the s32 'off' field
+ */
+ if (known && (ptr_reg->off + smin_val ==
+ (s64)(s32)(ptr_reg->off + smin_val))) {
+ /* pointer += K. Accumulate it into fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->off = ptr_reg->off + smin_val;
+ dst_reg->range = ptr_reg->range;
break;
- case BPF_OR:
- /* dreg |= sreg
- * OR can only extend zero bits
- * Ex. 0x00..00ffffff
- * | 0x0f..ffffffff
- * ----------------
- * 0x0f..00ffffff
- */
- dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ }
+ /* A new variable offset is created. Note that off_reg->off
+ * == 0, since it's a scalar.
+ * dst_reg gets the pointer type and since some positive
+ * integer value was added to the pointer, give it a new 'id'
+ * if it's a PTR_TO_PACKET.
+ * this creates a new 'base' pointer, off_reg (variable) gets
+ * added into the variable offset, and we copy the fixed offset
+ * from ptr_reg.
+ */
+ if (signed_add_overflows(smin_ptr, smin_val) ||
+ signed_add_overflows(smax_ptr, smax_val)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = smin_ptr + smin_val;
+ dst_reg->smax_value = smax_ptr + smax_val;
+ }
+ if (umin_ptr + umin_val < umin_ptr ||
+ umax_ptr + umax_val < umax_ptr) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value = umin_ptr + umin_val;
+ dst_reg->umax_value = umax_ptr + umax_val;
+ }
+ dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ if (ptr_reg->type == PTR_TO_PACKET) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ dst_reg->range = 0;
+ }
+ break;
+ case BPF_SUB:
+ if (dst_reg == off_reg) {
+ /* scalar -= pointer. Creates an unknown scalar */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d tried to subtract pointer from scalar\n",
+ dst);
+ return -EACCES;
+ }
+ /* We don't allow subtraction from FP, because (according to
+ * test_verifier.c test "invalid fp arithmetic", JITs might not
+ * be able to deal with it.
+ */
+ if (ptr_reg->type == PTR_TO_STACK) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d subtraction from stack pointer prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (known && (ptr_reg->off - smin_val ==
+ (s64)(s32)(ptr_reg->off - smin_val))) {
+ /* pointer -= K. Subtract it from fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->id = ptr_reg->id;
+ dst_reg->off = ptr_reg->off - smin_val;
+ dst_reg->range = ptr_reg->range;
break;
- case BPF_SUB:
- case BPF_MUL:
- case BPF_RSH:
- case BPF_LSH:
- /* These may be flushed out later */
- default:
- mark_reg_unknown_value(regs, insn->dst_reg);
}
- } else {
- mark_reg_unknown_value(regs, insn->dst_reg);
+ /* A new variable offset is created. If the subtrahend is known
+ * nonnegative, then any reg->range we had before is still good.
+ */
+ if (signed_sub_overflows(smin_ptr, smax_val) ||
+ signed_sub_overflows(smax_ptr, smin_val)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = smin_ptr - smax_val;
+ dst_reg->smax_value = smax_ptr - smin_val;
+ }
+ if (umin_ptr < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value = umin_ptr - umax_val;
+ dst_reg->umax_value = umax_ptr - umin_val;
+ }
+ dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ if (ptr_reg->type == PTR_TO_PACKET) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ if (smin_val < 0)
+ dst_reg->range = 0;
+ }
+ break;
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* bitwise ops on pointers are troublesome, prohibit for now.
+ * (However, in principle we could allow some cases, e.g.
+ * ptr &= ~3 which would reduce min_value by 3.)
+ */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d bitwise operator %s on pointer prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ default:
+ /* other operators (e.g. MUL,LSH) produce non-pointer results */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic with %s operator prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
}
- dst_reg->type = UNKNOWN_VALUE;
+ __update_reg_bounds(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
return 0;
}
-static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state src_reg)
{
struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
u8 opcode = BPF_OP(insn->code);
- u64 dst_imm = dst_reg->imm;
-
- if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
- return evaluate_reg_imm_alu_unknown(env, insn);
-
- /* dst_reg->type == CONST_IMM here. Simulate execution of insns
- * containing ALU ops. Don't care about overflow or negative
- * values, just add/sub/... them; registers are in u64.
- */
- if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) {
- dst_imm += insn->imm;
- } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm += src_reg->imm;
- } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) {
- dst_imm -= insn->imm;
- } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm -= src_reg->imm;
- } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) {
- dst_imm *= insn->imm;
- } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm *= src_reg->imm;
- } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) {
- dst_imm |= insn->imm;
- } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm |= src_reg->imm;
- } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) {
- dst_imm &= insn->imm;
- } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm &= src_reg->imm;
- } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) {
- dst_imm >>= insn->imm;
- } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm >>= src_reg->imm;
- } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) {
- dst_imm <<= insn->imm;
- } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm <<= src_reg->imm;
- } else {
- mark_reg_unknown_value(regs, insn->dst_reg);
- goto out;
- }
-
- dst_reg->imm = dst_imm;
-out:
- return 0;
-}
-
-static void check_reg_overflow(struct bpf_reg_state *reg)
-{
- if (reg->max_value > BPF_REGISTER_MAX_RANGE)
- reg->max_value = BPF_REGISTER_MAX_RANGE;
- if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
- reg->min_value > BPF_REGISTER_MAX_RANGE)
- reg->min_value = BPF_REGISTER_MIN_RANGE;
-}
-
-static u32 calc_align(u32 imm)
-{
- if (!imm)
- return 1U << 31;
- return imm - ((imm - 1) & imm);
-}
-
-static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
-{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
- s64 min_val = BPF_REGISTER_MIN_RANGE;
- u64 max_val = BPF_REGISTER_MAX_RANGE;
- u8 opcode = BPF_OP(insn->code);
- u32 dst_align, src_align;
-
- dst_reg = &regs[insn->dst_reg];
- src_align = 0;
- if (BPF_SRC(insn->code) == BPF_X) {
- check_reg_overflow(&regs[insn->src_reg]);
- min_val = regs[insn->src_reg].min_value;
- max_val = regs[insn->src_reg].max_value;
-
- /* If the source register is a random pointer then the
- * min_value/max_value values represent the range of the known
- * accesses into that value, not the actual min/max value of the
- * register itself. In this case we have to reset the reg range
- * values so we know it is not safe to look at.
- */
- if (regs[insn->src_reg].type != CONST_IMM &&
- regs[insn->src_reg].type != UNKNOWN_VALUE) {
- min_val = BPF_REGISTER_MIN_RANGE;
- max_val = BPF_REGISTER_MAX_RANGE;
- src_align = 0;
- } else {
- src_align = regs[insn->src_reg].min_align;
- }
- } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
- (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
- min_val = max_val = insn->imm;
- src_align = calc_align(insn->imm);
- }
-
- dst_align = dst_reg->min_align;
-
- /* We don't know anything about what was done to this register, mark it
- * as unknown. Also, if both derived bounds came from signed/unsigned
- * mixed compares and one side is unbounded, we cannot really do anything
- * with them as boundaries cannot be trusted. Thus, arithmetic of two
- * regs of such kind will get invalidated bounds on the dst side.
- */
- if ((min_val == BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) ||
- (BPF_SRC(insn->code) == BPF_X &&
- ((min_val != BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) ||
- (min_val == BPF_REGISTER_MIN_RANGE &&
- max_val != BPF_REGISTER_MAX_RANGE) ||
- (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
- dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
- (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
- dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
- regs[insn->dst_reg].value_from_signed !=
- regs[insn->src_reg].value_from_signed)) {
- reset_reg_range_values(regs, insn->dst_reg);
- return;
- }
-
- /* If one of our values was at the end of our ranges then we can't just
- * do our normal operations to the register, we need to set the values
- * to the min/max since they are undefined.
- */
- if (opcode != BPF_SUB) {
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ bool src_known, dst_known;
+ s64 smin_val, smax_val;
+ u64 umin_val, umax_val;
+
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops are (32,32)->64 */
+ coerce_reg_to_32(dst_reg);
+ coerce_reg_to_32(&src_reg);
}
+ smin_val = src_reg.smin_value;
+ smax_val = src_reg.smax_value;
+ umin_val = src_reg.umin_value;
+ umax_val = src_reg.umax_value;
+ src_known = tnum_is_const(src_reg.var_off);
+ dst_known = tnum_is_const(dst_reg->var_off);
switch (opcode) {
case BPF_ADD:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value += min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value += max_val;
- dst_reg->min_align = min(src_align, dst_align);
+ if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
+ signed_add_overflows(dst_reg->smax_value, smax_val)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value += smin_val;
+ dst_reg->smax_value += smax_val;
+ }
+ if (dst_reg->umin_value + umin_val < umin_val ||
+ dst_reg->umax_value + umax_val < umax_val) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value += umin_val;
+ dst_reg->umax_value += umax_val;
+ }
+ dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
- /* If one of our values was at the end of our ranges, then the
- * _opposite_ value in the dst_reg goes to the end of our range.
- */
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value -= max_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value -= min_val;
- dst_reg->min_align = min(src_align, dst_align);
+ if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
+ signed_sub_overflows(dst_reg->smax_value, smin_val)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value -= smax_val;
+ dst_reg->smax_value -= smin_val;
+ }
+ if (dst_reg->umin_value < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value -= umax_val;
+ dst_reg->umax_value -= umin_val;
+ }
+ dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
break;
case BPF_MUL:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value *= min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value *= max_val;
- dst_reg->min_align = max(src_align, dst_align);
+ dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
+ if (smin_val < 0 || dst_reg->smin_value < 0) {
+ /* Ain't nobody got time to multiply that sign */
+ __mark_reg_unbounded(dst_reg);
+ __update_reg_bounds(dst_reg);
+ break;
+ }
+ /* Both values are positive, so we can work with unsigned and
+ * copy the result to signed (unless it exceeds S64_MAX).
+ */
+ if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
+ /* Potential overflow, we know nothing */
+ __mark_reg_unbounded(dst_reg);
+ /* (except what we can learn from the var_off) */
+ __update_reg_bounds(dst_reg);
+ break;
+ }
+ dst_reg->umin_value *= umin_val;
+ dst_reg->umax_value *= umax_val;
+ if (dst_reg->umax_value > S64_MAX) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
break;
case BPF_AND:
- /* Disallow AND'ing of negative numbers, ain't nobody got time
- * for that. Otherwise the minimum is 0 and the max is the max
- * value we could AND against.
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value &
+ src_reg.var_off.value);
+ break;
+ }
+ /* We get our minimum from the var_off, since that's inherently
+ * bitwise. Our maximum is the minimum of the operands' maxima.
*/
- if (min_val < 0)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- else
- dst_reg->min_value = 0;
- dst_reg->max_value = max_val;
- dst_reg->min_align = max(src_align, dst_align);
+ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
+ dst_reg->umin_value = dst_reg->var_off.value;
+ dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
+ /* Lose signed bounds when ANDing negative numbers,
+ * ain't nobody got time for that.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ /* ANDing two positives gives a positive, so safe to
+ * cast result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
- case BPF_LSH:
- /* Gotta have special overflow logic here, if we're shifting
- * more than MAX_RANGE then just assume we have an invalid
- * range.
+ case BPF_OR:
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value |
+ src_reg.var_off.value);
+ break;
+ }
+ /* We get our maximum from the var_off, and our minimum is the
+ * maximum of the operands' minima
*/
- if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) {
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- dst_reg->min_align = 1;
+ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
+ dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
+ dst_reg->umax_value = dst_reg->var_off.value |
+ dst_reg->var_off.mask;
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
+ /* Lose signed bounds when ORing negative numbers,
+ * ain't nobody got time for that.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
} else {
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value <<= min_val;
- if (!dst_reg->min_align)
- dst_reg->min_align = 1;
- dst_reg->min_align <<= min_val;
- }
- if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
- else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value <<= max_val;
+ /* ORing two positives gives a positive, so safe to
+ * cast result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
- case BPF_RSH:
- /* RSH by a negative number is undefined, and the BPF_RSH is an
- * unsigned shift, so make the appropriate casts.
+ case BPF_LSH:
+ if (umax_val > 63) {
+ /* Shifts greater than 63 are undefined. This includes
+ * shifts by a negative number.
+ */
+ mark_reg_unknown(regs, insn->dst_reg);
+ break;
+ }
+ /* We lose all sign bit information (except what we can pick
+ * up from var_off)
*/
- if (min_val < 0 || dst_reg->min_value < 0) {
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ /* If we might shift our top bit out, then we know nothing */
+ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
} else {
- dst_reg->min_value =
- (u64)(dst_reg->min_value) >> min_val;
+ dst_reg->umin_value <<= umin_val;
+ dst_reg->umax_value <<= umax_val;
+ }
+ if (src_known)
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
+ else
+ dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
+ break;
+ case BPF_RSH:
+ if (umax_val > 63) {
+ /* Shifts greater than 63 are undefined. This includes
+ * shifts by a negative number.
+ */
+ mark_reg_unknown(regs, insn->dst_reg);
+ break;
}
- if (min_val < 0) {
- dst_reg->min_align = 1;
+ /* BPF_RSH is an unsigned shift, so make the appropriate casts */
+ if (dst_reg->smin_value < 0) {
+ if (umin_val) {
+ /* Sign bit will be cleared */
+ dst_reg->smin_value = 0;
+ } else {
+ /* Lost sign bit information */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
} else {
- dst_reg->min_align >>= (u64) min_val;
- if (!dst_reg->min_align)
- dst_reg->min_align = 1;
+ dst_reg->smin_value =
+ (u64)(dst_reg->smin_value) >> umax_val;
}
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value >>= max_val;
+ if (src_known)
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off,
+ umin_val);
+ else
+ dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+ dst_reg->umin_value >>= umax_val;
+ dst_reg->umax_value >>= umin_val;
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
default:
- reset_reg_range_values(regs, insn->dst_reg);
+ mark_reg_unknown(regs, insn->dst_reg);
break;
}
- check_reg_overflow(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
+ return 0;
+}
+
+/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max
+ * and var_off.
+ */
+static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+ struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ u8 opcode = BPF_OP(insn->code);
+ int rc;
+
+ dst_reg = &regs[insn->dst_reg];
+ src_reg = NULL;
+ if (dst_reg->type != SCALAR_VALUE)
+ ptr_reg = dst_reg;
+ if (BPF_SRC(insn->code) == BPF_X) {
+ src_reg = &regs[insn->src_reg];
+ if (src_reg->type != SCALAR_VALUE) {
+ if (dst_reg->type != SCALAR_VALUE) {
+ /* Combining two pointers by any ALU op yields
+ * an arbitrary scalar.
+ */
+ if (!env->allow_ptr_leaks) {
+ verbose("R%d pointer %s pointer prohibited\n",
+ insn->dst_reg,
+ bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ }
+ mark_reg_unknown(regs, insn->dst_reg);
+ return 0;
+ } else {
+ /* scalar += pointer
+ * This is legal, but we have to reverse our
+ * src/dest handling in computing the range
+ */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ src_reg, dst_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* scalar += unknown scalar */
+ __mark_reg_unknown(&off_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn,
+ dst_reg, off_reg);
+ }
+ return rc;
+ }
+ } else if (ptr_reg) {
+ /* pointer += scalar */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ dst_reg, src_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* unknown scalar += scalar */
+ __mark_reg_unknown(dst_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn, dst_reg, *src_reg);
+ }
+ return rc;
+ }
+ } else {
+ /* Pretend the src is a reg with a known value, since we only
+ * need to be able to read from this state.
+ */
+ off_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&off_reg, insn->imm);
+ src_reg = &off_reg;
+ if (ptr_reg) { /* pointer += K */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ ptr_reg, src_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* unknown scalar += K */
+ __mark_reg_unknown(dst_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn, dst_reg, off_reg);
+ }
+ return rc;
+ }
+ }
+
+ /* Got here implies adding two SCALAR_VALUEs */
+ if (WARN_ON_ONCE(ptr_reg)) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: unexpected ptr_reg\n");
+ return -EINVAL;
+ }
+ if (WARN_ON(!src_reg)) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: no src_reg\n");
+ return -EINVAL;
+ }
+ return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
}
/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = env->cur_state.regs;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -2024,11 +2289,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
- /* we are setting our register to something new, we need to
- * reset its range values.
- */
- reset_reg_range_values(regs, insn->dst_reg);
-
if (BPF_SRC(insn->code) == BPF_X) {
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
@@ -2036,24 +2296,24 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
regs[insn->dst_reg] = regs[insn->src_reg];
} else {
+ /* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
verbose("R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
- mark_reg_unknown_value(regs, insn->dst_reg);
+ mark_reg_unknown(regs, insn->dst_reg);
+ /* high 32 bits are known zero. */
+ regs[insn->dst_reg].var_off = tnum_cast(
+ regs[insn->dst_reg].var_off, 4);
+ __update_reg_bounds(&regs[insn->dst_reg]);
}
} else {
/* case: R = imm
* remember the value we stored into this reg
*/
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
- regs[insn->dst_reg].id = 0;
- regs[insn->dst_reg].max_value = insn->imm;
- regs[insn->dst_reg].min_value = insn->imm;
- regs[insn->dst_reg].min_align = calc_align(insn->imm);
- regs[insn->dst_reg].value_from_signed = false;
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ __mark_reg_known(regs + insn->dst_reg, insn->imm);
}
} else if (opcode > BPF_END) {
@@ -2104,68 +2364,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
- dst_reg = &regs[insn->dst_reg];
-
- /* first we want to adjust our ranges. */
- adjust_reg_min_max_vals(env, insn);
-
- /* pattern match 'bpf_add Rx, imm' instruction */
- if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
- dst_reg->type = PTR_TO_STACK;
- dst_reg->imm = insn->imm;
- return 0;
- } else if (opcode == BPF_ADD &&
- BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == PTR_TO_STACK &&
- ((BPF_SRC(insn->code) == BPF_X &&
- regs[insn->src_reg].type == CONST_IMM) ||
- BPF_SRC(insn->code) == BPF_K)) {
- if (BPF_SRC(insn->code) == BPF_X)
- dst_reg->imm += regs[insn->src_reg].imm;
- else
- dst_reg->imm += insn->imm;
- return 0;
- } else if (opcode == BPF_ADD &&
- BPF_CLASS(insn->code) == BPF_ALU64 &&
- (dst_reg->type == PTR_TO_PACKET ||
- (BPF_SRC(insn->code) == BPF_X &&
- regs[insn->src_reg].type == PTR_TO_PACKET))) {
- /* ptr_to_packet += K|X */
- return check_packet_ptr_add(env, insn);
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == UNKNOWN_VALUE &&
- env->allow_ptr_leaks) {
- /* unknown += K|X */
- return evaluate_reg_alu(env, insn);
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == CONST_IMM &&
- env->allow_ptr_leaks) {
- /* reg_imm += K|X */
- return evaluate_reg_imm_alu(env, insn);
- } else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
- insn->dst_reg);
- return -EACCES;
- } else if (BPF_SRC(insn->code) == BPF_X &&
- is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
- insn->src_reg);
- return -EACCES;
- }
-
- /* If we did pointer math on a map value then just set it to our
- * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or
- * loads to this register appropriately, otherwise just mark the
- * register as unknown.
- */
- if (env->allow_ptr_leaks &&
- BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
- (dst_reg->type == PTR_TO_MAP_VALUE ||
- dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
- dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
- else
- mark_reg_unknown_value(regs, insn->dst_reg);
+ return adjust_reg_min_max_vals(env, insn);
}
return 0;
@@ -2177,27 +2376,48 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
struct bpf_reg_state *regs = state->regs, *reg;
int i;
- /* LLVM can generate two kind of checks:
+ if (dst_reg->off < 0)
+ /* This doesn't give us any range */
+ return;
+
+ if (dst_reg->umax_value > MAX_PACKET_OFF ||
+ dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
+ /* Risk of overflow. For instance, ptr + (1<<63) may be less
+ * than pkt_end, but that's because it's also less than pkt.
+ */
+ return;
+
+ /* LLVM can generate four kind of checks:
*
- * Type 1:
+ * Type 1/2:
*
* r2 = r3;
* r2 += 8;
* if (r2 > pkt_end) goto <handle exception>
* <access okay>
*
+ * r2 = r3;
+ * r2 += 8;
+ * if (r2 < pkt_end) goto <access okay>
+ * <handle exception>
+ *
* Where:
* r2 == dst_reg, pkt_end == src_reg
* r2=pkt(id=n,off=8,r=0)
* r3=pkt(id=n,off=0,r=0)
*
- * Type 2:
+ * Type 3/4:
*
* r2 = r3;
* r2 += 8;
* if (pkt_end >= r2) goto <access okay>
* <handle exception>
*
+ * r2 = r3;
+ * r2 += 8;
+ * if (pkt_end <= r2) goto <handle exception>
+ * <access okay>
+ *
* Where:
* pkt_end == dst_reg, r2 == src_reg
* r2=pkt(id=n,off=8,r=0)
@@ -2207,193 +2427,247 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* so that range of bytes [r3, r3 + 8) is safe to access.
*/
+ /* If our ids match, then we must have the same max_value. And we
+ * don't care about the other reg's fixed offset, since if it's too big
+ * the range won't allow anything.
+ * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
+ */
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
- regs[i].range = max(regs[i].range, dst_reg->off);
+ regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
- reg->range = max(reg->range, dst_reg->off);
+ reg->range = max_t(u16, reg->range, dst_reg->off);
}
}
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
+ * In JEQ/JNE cases we also adjust the var_off values.
*/
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
- bool value_from_signed = true;
- bool is_range = true;
+ /* If the dst_reg is a pointer, we can't learn anything about its
+ * variable offset from the compare (unless src_reg were a pointer into
+ * the same object, but we don't bother with that.
+ * Since false_reg and true_reg have the same type by construction, we
+ * only need to check one of them for pointerness.
+ */
+ if (__is_pointer_value(false, false_reg))
+ return;
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
- true_reg->max_value = true_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(true_reg, val);
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
- false_reg->max_value = false_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(false_reg, val);
break;
case BPF_JGT:
- value_from_signed = false;
- /* fallthrough */
+ false_reg->umax_value = min(false_reg->umax_value, val);
+ true_reg->umin_value = max(true_reg->umin_value, val + 1);
+ break;
case BPF_JSGT:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGT) {
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
- }
- /* If this is false then we know the maximum val is val,
- * otherwise we know the min val is val+1.
- */
- false_reg->max_value = val;
- false_reg->value_from_signed = value_from_signed;
- true_reg->min_value = val + 1;
- true_reg->value_from_signed = value_from_signed;
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
+ break;
+ case BPF_JLT:
+ false_reg->umin_value = max(false_reg->umin_value, val);
+ true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ break;
+ case BPF_JSLT:
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
break;
case BPF_JGE:
- value_from_signed = false;
- /* fallthrough */
+ false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ true_reg->umin_value = max(true_reg->umin_value, val);
+ break;
case BPF_JSGE:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGE) {
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
- }
- /* If this is false then we know the maximum value is val - 1,
- * otherwise we know the mimimum value is val.
- */
- false_reg->max_value = val - 1;
- false_reg->value_from_signed = value_from_signed;
- true_reg->min_value = val;
- true_reg->value_from_signed = value_from_signed;
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ break;
+ case BPF_JLE:
+ false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ true_reg->umax_value = min(true_reg->umax_value, val);
+ break;
+ case BPF_JSLE:
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
break;
default:
break;
}
- check_reg_overflow(false_reg);
- check_reg_overflow(true_reg);
- if (is_range) {
- if (__is_pointer_value(false, false_reg))
- reset_reg_range_values(false_reg, 0);
- if (__is_pointer_value(false, true_reg))
- reset_reg_range_values(true_reg, 0);
- }
+ __reg_deduce_bounds(false_reg);
+ __reg_deduce_bounds(true_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(false_reg);
+ __reg_bound_offset(true_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(false_reg);
+ __update_reg_bounds(true_reg);
}
-/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
- * is the variable reg.
+/* Same as above, but for the case that dst_reg holds a constant and src_reg is
+ * the variable reg.
*/
static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
- bool value_from_signed = true;
- bool is_range = true;
+ if (__is_pointer_value(false, false_reg))
+ return;
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
- true_reg->max_value = true_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(true_reg, val);
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
- false_reg->max_value = false_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(false_reg, val);
break;
case BPF_JGT:
- value_from_signed = false;
- /* fallthrough */
+ true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ false_reg->umin_value = max(false_reg->umin_value, val);
+ break;
case BPF_JSGT:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGT) {
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
- }
- /*
- * If this is false, then the val is <= the register, if it is
- * true the register <= to the val.
- */
- false_reg->min_value = val;
- false_reg->value_from_signed = value_from_signed;
- true_reg->max_value = val - 1;
- true_reg->value_from_signed = value_from_signed;
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
+ break;
+ case BPF_JLT:
+ true_reg->umin_value = max(true_reg->umin_value, val + 1);
+ false_reg->umax_value = min(false_reg->umax_value, val);
+ break;
+ case BPF_JSLT:
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
break;
case BPF_JGE:
- value_from_signed = false;
- /* fallthrough */
+ true_reg->umax_value = min(true_reg->umax_value, val);
+ false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ break;
case BPF_JSGE:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGE) {
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
- }
- /* If this is false then constant < register, if it is true then
- * the register < constant.
- */
- false_reg->min_value = val + 1;
- false_reg->value_from_signed = value_from_signed;
- true_reg->max_value = val;
- true_reg->value_from_signed = value_from_signed;
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ break;
+ case BPF_JLE:
+ true_reg->umin_value = max(true_reg->umin_value, val);
+ false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ break;
+ case BPF_JSLE:
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
break;
default:
break;
}
- check_reg_overflow(false_reg);
- check_reg_overflow(true_reg);
- if (is_range) {
- if (__is_pointer_value(false, false_reg))
- reset_reg_range_values(false_reg, 0);
- if (__is_pointer_value(false, true_reg))
- reset_reg_range_values(true_reg, 0);
+ __reg_deduce_bounds(false_reg);
+ __reg_deduce_bounds(true_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(false_reg);
+ __reg_bound_offset(true_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(false_reg);
+ __update_reg_bounds(true_reg);
+}
+
+/* Regs are known to be equal, so intersect their min/max/var_off */
+static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
+ struct bpf_reg_state *dst_reg)
+{
+ src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
+ dst_reg->umin_value);
+ src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
+ dst_reg->umax_value);
+ src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
+ dst_reg->smin_value);
+ src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
+ dst_reg->smax_value);
+ src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
+ dst_reg->var_off);
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(src_reg);
+ __update_reg_bounds(dst_reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(src_reg);
+ __reg_deduce_bounds(dst_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(src_reg);
+ __reg_bound_offset(dst_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(src_reg);
+ __update_reg_bounds(dst_reg);
+}
+
+static void reg_combine_min_max(struct bpf_reg_state *true_src,
+ struct bpf_reg_state *true_dst,
+ struct bpf_reg_state *false_src,
+ struct bpf_reg_state *false_dst,
+ u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ:
+ __reg_combine_min_max(true_src, true_dst);
+ break;
+ case BPF_JNE:
+ __reg_combine_min_max(false_src, false_dst);
+ break;
}
}
static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
- enum bpf_reg_type type)
+ bool is_null)
{
struct bpf_reg_state *reg = &regs[regno];
if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
- if (type == UNKNOWN_VALUE) {
- __mark_reg_unknown_value(regs, regno);
+ /* Old offset (both fixed and variable parts) should
+ * have been known-zero, because we don't allow pointer
+ * arithmetic on pointers that might be NULL.
+ */
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
+ !tnum_equals_const(reg->var_off, 0) ||
+ reg->off)) {
+ __mark_reg_known_zero(reg);
+ reg->off = 0;
+ }
+ if (is_null) {
+ reg->type = SCALAR_VALUE;
} else if (reg->map_ptr->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = reg->map_ptr->inner_map_meta;
} else {
- reg->type = type;
+ reg->type = PTR_TO_MAP_VALUE;
}
/* We don't need id from this point onwards anymore, thus we
* should better reset it, so that state pruning has chances
@@ -2407,19 +2681,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
* be folded together at some point.
*/
static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
- enum bpf_reg_type type)
+ bool is_null)
{
struct bpf_reg_state *regs = state->regs;
u32 id = regs[regno].id;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- mark_map_reg(regs, i, id, type);
+ mark_map_reg(regs, i, id, is_null);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type);
+ mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
}
}
@@ -2431,7 +2705,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
int err;
- if (opcode > BPF_EXIT) {
+ if (opcode > BPF_JSLE) {
verbose("invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}
@@ -2469,7 +2743,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if R == 0 where R was initialized to zero earlier */
if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE) &&
- dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) {
+ dst_reg->type == SCALAR_VALUE &&
+ tnum_equals_const(dst_reg->var_off, insn->imm)) {
if (opcode == BPF_JEQ) {
/* if (imm == imm) goto pc+off;
* only follow the goto, ignore fall-through
@@ -2491,17 +2766,30 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
+ * this is only legit if both are scalars (or pointers to the same
+ * object, I suppose, but we don't support that right now), because
+ * otherwise the different base pointers mean the offsets aren't
+ * comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
- if (regs[insn->src_reg].type == CONST_IMM)
- reg_set_min_max(&other_branch->regs[insn->dst_reg],
- dst_reg, regs[insn->src_reg].imm,
- opcode);
- else if (dst_reg->type == CONST_IMM)
- reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
- &regs[insn->src_reg], dst_reg->imm,
- opcode);
- } else {
+ if (dst_reg->type == SCALAR_VALUE &&
+ regs[insn->src_reg].type == SCALAR_VALUE) {
+ if (tnum_is_const(regs[insn->src_reg].var_off))
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, regs[insn->src_reg].var_off.value,
+ opcode);
+ else if (tnum_is_const(dst_reg->var_off))
+ reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
+ &regs[insn->src_reg],
+ dst_reg->var_off.value, opcode);
+ else if (opcode == BPF_JEQ || opcode == BPF_JNE)
+ /* Comparing for equality, we can combine knowledge */
+ reg_combine_min_max(&other_branch->regs[insn->src_reg],
+ &other_branch->regs[insn->dst_reg],
+ &regs[insn->src_reg],
+ &regs[insn->dst_reg], opcode);
+ }
+ } else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch->regs[insn->dst_reg],
dst_reg, insn->imm, opcode);
}
@@ -2513,18 +2801,24 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* Mark all identical map registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
- mark_map_regs(this_branch, insn->dst_reg,
- opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE);
- mark_map_regs(other_branch, insn->dst_reg,
- opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);
+ mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
+ mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
find_good_pkt_pointers(this_branch, dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+ dst_reg->type == PTR_TO_PACKET &&
+ regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+ find_good_pkt_pointers(other_branch, dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+ dst_reg->type == PTR_TO_PACKET_END &&
+ regs[insn->src_reg].type == PTR_TO_PACKET) {
+ find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
@@ -2564,9 +2858,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->src_reg == 0) {
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = imm;
- regs[insn->dst_reg].id = 0;
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ __mark_reg_known(&regs[insn->dst_reg], imm);
return 0;
}
@@ -2647,7 +2940,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* mark destination R0 register as readable, since it contains
* the value fetched from the packet
*/
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ mark_reg_unknown(regs, BPF_REG_0);
return 0;
}
@@ -2850,57 +3143,149 @@ err_free:
return ret;
}
-/* the following conditions reduce the number of explored insns
- * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
+/* check %cur's range satisfies %old's */
+static bool range_within(struct bpf_reg_state *old,
+ struct bpf_reg_state *cur)
+{
+ return old->umin_value <= cur->umin_value &&
+ old->umax_value >= cur->umax_value &&
+ old->smin_value <= cur->smin_value &&
+ old->smax_value >= cur->smax_value;
+}
+
+/* Maximum number of register states that can exist at once */
+#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
+struct idpair {
+ u32 old;
+ u32 cur;
+};
+
+/* If in the old state two registers had the same id, then they need to have
+ * the same id in the new state as well. But that id could be different from
+ * the old state, so we need to track the mapping from old to new ids.
+ * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
+ * regs with old id 5 must also have new id 9 for the new state to be safe. But
+ * regs with a different old id could still have new id 9, we don't care about
+ * that.
+ * So we look through our idmap to see if this old id has been seen before. If
+ * so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool compare_ptrs_to_packet(struct bpf_verifier_env *env,
- struct bpf_reg_state *old,
- struct bpf_reg_state *cur)
+static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
{
- if (old->id != cur->id)
- return false;
+ unsigned int i;
+
+ for (i = 0; i < ID_MAP_SIZE; i++) {
+ if (!idmap[i].old) {
+ /* Reached an empty slot; haven't seen this id before */
+ idmap[i].old = old_id;
+ idmap[i].cur = cur_id;
+ return true;
+ }
+ if (idmap[i].old == old_id)
+ return idmap[i].cur == cur_id;
+ }
+ /* We ran out of idmap slots, which should be impossible */
+ WARN_ON_ONCE(1);
+ return false;
+}
- /* old ptr_to_packet is more conservative, since it allows smaller
- * range. Ex:
- * old(off=0,r=10) is equal to cur(off=0,r=20), because
- * old(off=0,r=10) means that with range=10 the verifier proceeded
- * further and found no issues with the program. Now we're in the same
- * spot with cur(off=0,r=20), so we're safe too, since anything further
- * will only be looking at most 10 bytes after this pointer.
- */
- if (old->off == cur->off && old->range < cur->range)
+/* Returns true if (rold safe implies rcur safe) */
+static bool regsafe(struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur,
+ bool varlen_map_access, struct idpair *idmap)
+{
+ if (memcmp(rold, rcur, sizeof(*rold)) == 0)
return true;
- /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0)
- * since both cannot be used for packet access and safe(old)
- * pointer has smaller off that could be used for further
- * 'if (ptr > data_end)' check
- * Ex:
- * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean
- * that we cannot access the packet.
- * The safe range is:
- * [ptr, ptr + range - off)
- * so whenever off >=range, it means no safe bytes from this pointer.
- * When comparing old->off <= cur->off, it means that older code
- * went with smaller offset and that offset was later
- * used to figure out the safe range after 'if (ptr > data_end)' check
- * Say, 'old' state was explored like:
- * ... R3(off=0, r=0)
- * R4 = R3 + 20
- * ... now R4(off=20,r=0) <-- here
- * if (R4 > data_end)
- * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access.
- * ... the code further went all the way to bpf_exit.
- * Now the 'cur' state at the mark 'here' has R4(off=30,r=0).
- * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier
- * goes further, such cur_R4 will give larger safe packet range after
- * 'if (R4 > data_end)' and all further insn were already good with r=20,
- * so they will be good with r=30 and we can prune the search.
- */
- if (!env->strict_alignment && old->off <= cur->off &&
- old->off >= old->range && cur->off >= cur->range)
+ if (rold->type == NOT_INIT)
+ /* explored state can't have used this */
return true;
+ if (rcur->type == NOT_INIT)
+ return false;
+ switch (rold->type) {
+ case SCALAR_VALUE:
+ if (rcur->type == SCALAR_VALUE) {
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ } else {
+ /* if we knew anything about the old value, we're not
+ * equal, because we can't know anything about the
+ * scalar value of the pointer in the new value.
+ */
+ return rold->umin_value == 0 &&
+ rold->umax_value == U64_MAX &&
+ rold->smin_value == S64_MIN &&
+ rold->smax_value == S64_MAX &&
+ tnum_is_unknown(rold->var_off);
+ }
+ case PTR_TO_MAP_VALUE:
+ if (varlen_map_access) {
+ /* If the new min/max/var_off satisfy the old ones and
+ * everything else matches, we are OK.
+ * We don't care about the 'id' value, because nothing
+ * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ } else {
+ /* If the ranges/var_off were not the same, but
+ * everything else was and we didn't do a variable
+ * access into a map then we are a-ok.
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0;
+ }
+ case PTR_TO_MAP_VALUE_OR_NULL:
+ /* a PTR_TO_MAP_VALUE could be safe to use as a
+ * PTR_TO_MAP_VALUE_OR_NULL into the same map.
+ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
+ * checked, doing so could have affected others with the same
+ * id, and we can't check for that because we lost the id when
+ * we converted to a PTR_TO_MAP_VALUE.
+ */
+ if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
+ return false;
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
+ return false;
+ /* Check our ids match any regs they're supposed to */
+ return check_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_PACKET:
+ if (rcur->type != PTR_TO_PACKET)
+ return false;
+ /* We must have at least as much range as the old ptr
+ * did, so that any accesses which were safe before are
+ * still safe. This is true even if old range < old off,
+ * since someone could have accessed through (ptr - k), or
+ * even done ptr -= k in a register, to get a safe access.
+ */
+ if (rold->range > rcur->range)
+ return false;
+ /* If the offsets don't match, we can't trust our alignment;
+ * nor can we be sure that we won't fall out of range.
+ */
+ if (rold->off != rcur->off)
+ return false;
+ /* id relations must be preserved */
+ if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+ return false;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_CTX:
+ case CONST_PTR_TO_MAP:
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET_END:
+ /* Only valid matches are exact, which memcmp() above
+ * would have accepted
+ */
+ default:
+ /* Don't know what's going on, just say it's not safe */
+ return false;
+ }
+ /* Shouldn't get here; if we do, say it's not safe */
+ WARN_ON_ONCE(1);
return false;
}
@@ -2935,43 +3320,19 @@ static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur)
{
bool varlen_map_access = env->varlen_map_value_access;
- struct bpf_reg_state *rold, *rcur;
+ struct idpair *idmap;
+ bool ret = false;
int i;
- for (i = 0; i < MAX_BPF_REG; i++) {
- rold = &old->regs[i];
- rcur = &cur->regs[i];
-
- if (memcmp(rold, rcur, sizeof(*rold)) == 0)
- continue;
-
- /* If the ranges were not the same, but everything else was and
- * we didn't do a variable access into a map then we are a-ok.
- */
- if (!varlen_map_access &&
- memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)
- continue;
-
- /* If we didn't map access then again we don't care about the
- * mismatched range values and it's ok if our old type was
- * UNKNOWN and we didn't go to a NOT_INIT'ed reg.
- */
- if (rold->type == NOT_INIT ||
- (!varlen_map_access && rold->type == UNKNOWN_VALUE &&
- rcur->type != NOT_INIT))
- continue;
-
- /* Don't care about the reg->id in this case. */
- if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
- rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
- rold->map_ptr == rcur->map_ptr)
- continue;
-
- if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
- compare_ptrs_to_packet(env, rold, rcur))
- continue;
-
+ idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
+ /* If we failed to allocate the idmap, just say it's not safe */
+ if (!idmap)
return false;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (!regsafe(&old->regs[i], &cur->regs[i], varlen_map_access,
+ idmap))
+ goto out_free;
}
for (i = 0; i < MAX_BPF_STACK; i++) {
@@ -2983,29 +3344,32 @@ static bool states_equal(struct bpf_verifier_env *env,
* this verifier states are not equivalent,
* return false to continue verification of this path
*/
- return false;
+ goto out_free;
if (i % BPF_REG_SIZE)
continue;
if (old->stack_slot_type[i] != STACK_SPILL)
continue;
- if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
- &cur->spilled_regs[i / BPF_REG_SIZE],
- sizeof(old->spilled_regs[0])))
- /* when explored and current stack slot types are
- * the same, check that stored pointers types
+ if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
+ &cur->spilled_regs[i / BPF_REG_SIZE],
+ varlen_map_access, idmap))
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointers types
* are the same as well.
* Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
* but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
- return false;
+ goto out_free;
else
continue;
}
- return true;
+ ret = true;
+out_free:
+ kfree(idmap);
+ return ret;
}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
@@ -3319,7 +3683,6 @@ process_bpf_exit:
verbose("invalid BPF_LD mode\n");
return -EINVAL;
}
- reset_reg_range_values(regs, insn->dst_reg);
} else {
verbose("unknown insn class %d\n", class);
return -EINVAL;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ffba16d..a7a6c1d19a49 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8050,7 +8050,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
- bool is_kprobe, is_tracepoint;
+ bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -8061,7 +8061,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
- if (!is_kprobe && !is_tracepoint)
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+ if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
@@ -8070,13 +8071,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return PTR_ERR(prog);
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
- (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+ (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
- if (is_tracepoint) {
+ if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
if (prog->aux->max_ctx_offset > off) {
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e10395da88e..7a1a92036563 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
+static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
+ struct syscall_metadata *sys_data,
+ struct syscall_trace_enter *rec) {
+ struct syscall_tp_t {
+ unsigned long long regs;
+ unsigned long syscall_nr;
+ unsigned long args[sys_data->nb_args];
+ } param;
+ int i;
+
+ *(struct pt_regs **)&param = regs;
+ param.syscall_nr = rec->nr;
+ for (i = 0; i < sys_data->nb_args; i++)
+ param.args[i] = rec->args[i];
+ return trace_call_bpf(prog, &param);
+}
+
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ struct bpf_prog *prog;
int syscall_nr;
int rctx;
int size;
@@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data)
return;
+ prog = READ_ONCE(sys_data->enter_event->prog);
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- if (hlist_empty(head))
+ if (!prog && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
@@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
+
+ if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+ hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
head, NULL);
@@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call)
mutex_unlock(&syscall_trace_lock);
}
+static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
+ struct syscall_trace_exit *rec) {
+ struct syscall_tp_t {
+ unsigned long long regs;
+ unsigned long syscall_nr;
+ unsigned long ret;
+ } param;
+
+ *(struct pt_regs **)&param = regs;
+ param.syscall_nr = rec->nr;
+ param.ret = rec->ret;
+ return trace_call_bpf(prog, &param);
+}
+
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct hlist_head *head;
+ struct bpf_prog *prog;
int syscall_nr;
int rctx;
int size;
@@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
+ prog = READ_ONCE(sys_data->exit_event->prog);
head = this_cpu_ptr(sys_data->exit_event->perf_events);
- if (hlist_empty(head))
+ if (!prog && hlist_empty(head))
return;
/* We can probably do that at build time */
@@ -666,6 +708,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
+
+ if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+ hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1, regs, head, NULL);
}