mirror of
https://github.com/raspberrypi/linux.git
synced 2025-12-17 15:24:33 +00:00
This patch fixes an incorrect assumption made in the original
bpf_refcount series [0], specifically that the BPF program calling
bpf_refcount_acquire on some node can always guarantee that the node is
alive. In that series, the patch adding failure behavior to rbtree_add
and list_push_{front, back} breaks this assumption for non-owning
references.
Consider the following program:
n = bpf_kptr_xchg(&mapval, NULL);
/* skip error checking */
bpf_spin_lock(&l);
if(bpf_rbtree_add(&t, &n->rb, less)) {
bpf_refcount_acquire(n);
/* Failed to add, do something else with the node */
}
bpf_spin_unlock(&l);
It's incorrect to assume that bpf_refcount_acquire will always succeed in this
scenario. bpf_refcount_acquire is being called in a critical section
here, but the lock being held is associated with rbtree t, which isn't
necessarily the lock associated with the tree that the node is already
in. So after bpf_rbtree_add fails to add the node and calls bpf_obj_drop
in it, the program has no ownership of the node's lifetime. Therefore
the node's refcount can be decr'd to 0 at any time after the failing
rbtree_add. If this happens before the refcount_acquire above, the node
might be free'd, and regardless refcount_acquire will be incrementing a
0 refcount.
Later patches in the series exercise this scenario, resulting in the
expected complaint from the kernel (without this patch's changes):
refcount_t: addition on 0; use-after-free.
WARNING: CPU: 1 PID: 207 at lib/refcount.c:25 refcount_warn_saturate+0xbc/0x110
Modules linked in: bpf_testmod(O)
CPU: 1 PID: 207 Comm: test_progs Tainted: G O 6.3.0-rc7-02231-g723de1a718a2-dirty #371
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
RIP: 0010:refcount_warn_saturate+0xbc/0x110
Code: 6f 64 f6 02 01 e8 84 a3 5c ff 0f 0b eb 9d 80 3d 5e 64 f6 02 00 75 94 48 c7 c7 e0 13 d2 82 c6 05 4e 64 f6 02 01 e8 64 a3 5c ff <0f> 0b e9 7a ff ff ff 80 3d 38 64 f6 02 00 0f 85 6d ff ff ff 48 c7
RSP: 0018:ffff88810b9179b0 EFLAGS: 00010082
RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000
RDX: 0000000000000202 RSI: 0000000000000008 RDI: ffffffff857c3680
RBP: ffff88810027d3c0 R08: ffffffff8125f2a4 R09: ffff88810b9176e7
R10: ffffed1021722edc R11: 746e756f63666572 R12: ffff88810027d388
R13: ffff88810027d3c0 R14: ffffc900005fe030 R15: ffffc900005fe048
FS: 00007fee0584a700(0000) GS:ffff88811b280000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005634a96f6c58 CR3: 0000000108ce9002 CR4: 0000000000770ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<TASK>
bpf_refcount_acquire_impl+0xb5/0xc0
(rest of output snipped)
The patch addresses this by changing bpf_refcount_acquire_impl to use
refcount_inc_not_zero instead of refcount_inc and marking
bpf_refcount_acquire KF_RET_NULL.
For owning references, though, we know the above scenario is not possible
and thus that bpf_refcount_acquire will always succeed. Some verifier
bookkeeping is added to track "is input owning ref?" for bpf_refcount_acquire
calls and return false from is_kfunc_ret_null for bpf_refcount_acquire on
owning refs despite it being marked KF_RET_NULL.
Existing selftests using bpf_refcount_acquire are modified where
necessary to NULL-check its return value.
[0]: https://lore.kernel.org/bpf/20230415201811.343116-1-davemarchevsky@fb.com/
Fixes: d2dcc67df9 ("bpf: Migrate bpf_rbtree_add and bpf_list_push_{front,back} to possibly fail")
Reported-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230602022647.1571784-5-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
409 lines
9.4 KiB
C
409 lines
9.4 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
|
|
|
|
#include <vmlinux.h>
|
|
#include <bpf/bpf_tracing.h>
|
|
#include <bpf/bpf_helpers.h>
|
|
#include <bpf/bpf_core_read.h>
|
|
#include "bpf_misc.h"
|
|
#include "bpf_experimental.h"
|
|
|
|
struct node_data {
|
|
long key;
|
|
long list_data;
|
|
struct bpf_rb_node r;
|
|
struct bpf_list_node l;
|
|
struct bpf_refcount ref;
|
|
};
|
|
|
|
struct map_value {
|
|
struct node_data __kptr *node;
|
|
};
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_ARRAY);
|
|
__type(key, int);
|
|
__type(value, struct map_value);
|
|
__uint(max_entries, 1);
|
|
} stashed_nodes SEC(".maps");
|
|
|
|
struct node_acquire {
|
|
long key;
|
|
long data;
|
|
struct bpf_rb_node node;
|
|
struct bpf_refcount refcount;
|
|
};
|
|
|
|
#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8)))
|
|
private(A) struct bpf_spin_lock lock;
|
|
private(A) struct bpf_rb_root root __contains(node_data, r);
|
|
private(A) struct bpf_list_head head __contains(node_data, l);
|
|
|
|
private(B) struct bpf_spin_lock alock;
|
|
private(B) struct bpf_rb_root aroot __contains(node_acquire, node);
|
|
|
|
static bool less(struct bpf_rb_node *node_a, const struct bpf_rb_node *node_b)
|
|
{
|
|
struct node_data *a;
|
|
struct node_data *b;
|
|
|
|
a = container_of(node_a, struct node_data, r);
|
|
b = container_of(node_b, struct node_data, r);
|
|
|
|
return a->key < b->key;
|
|
}
|
|
|
|
static bool less_a(struct bpf_rb_node *a, const struct bpf_rb_node *b)
|
|
{
|
|
struct node_acquire *node_a;
|
|
struct node_acquire *node_b;
|
|
|
|
node_a = container_of(a, struct node_acquire, node);
|
|
node_b = container_of(b, struct node_acquire, node);
|
|
|
|
return node_a->key < node_b->key;
|
|
}
|
|
|
|
static long __insert_in_tree_and_list(struct bpf_list_head *head,
|
|
struct bpf_rb_root *root,
|
|
struct bpf_spin_lock *lock)
|
|
{
|
|
struct node_data *n, *m;
|
|
|
|
n = bpf_obj_new(typeof(*n));
|
|
if (!n)
|
|
return -1;
|
|
|
|
m = bpf_refcount_acquire(n);
|
|
m->key = 123;
|
|
m->list_data = 456;
|
|
|
|
bpf_spin_lock(lock);
|
|
if (bpf_rbtree_add(root, &n->r, less)) {
|
|
/* Failure to insert - unexpected */
|
|
bpf_spin_unlock(lock);
|
|
bpf_obj_drop(m);
|
|
return -2;
|
|
}
|
|
bpf_spin_unlock(lock);
|
|
|
|
bpf_spin_lock(lock);
|
|
if (bpf_list_push_front(head, &m->l)) {
|
|
/* Failure to insert - unexpected */
|
|
bpf_spin_unlock(lock);
|
|
return -3;
|
|
}
|
|
bpf_spin_unlock(lock);
|
|
return 0;
|
|
}
|
|
|
|
static long __stash_map_insert_tree(int idx, int val, struct bpf_rb_root *root,
|
|
struct bpf_spin_lock *lock)
|
|
{
|
|
struct map_value *mapval;
|
|
struct node_data *n, *m;
|
|
|
|
mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);
|
|
if (!mapval)
|
|
return -1;
|
|
|
|
n = bpf_obj_new(typeof(*n));
|
|
if (!n)
|
|
return -2;
|
|
|
|
n->key = val;
|
|
m = bpf_refcount_acquire(n);
|
|
|
|
n = bpf_kptr_xchg(&mapval->node, n);
|
|
if (n) {
|
|
bpf_obj_drop(n);
|
|
bpf_obj_drop(m);
|
|
return -3;
|
|
}
|
|
|
|
bpf_spin_lock(lock);
|
|
if (bpf_rbtree_add(root, &m->r, less)) {
|
|
/* Failure to insert - unexpected */
|
|
bpf_spin_unlock(lock);
|
|
return -4;
|
|
}
|
|
bpf_spin_unlock(lock);
|
|
return 0;
|
|
}
|
|
|
|
static long __read_from_tree(struct bpf_rb_root *root,
|
|
struct bpf_spin_lock *lock,
|
|
bool remove_from_tree)
|
|
{
|
|
struct bpf_rb_node *rb;
|
|
struct node_data *n;
|
|
long res = -99;
|
|
|
|
bpf_spin_lock(lock);
|
|
|
|
rb = bpf_rbtree_first(root);
|
|
if (!rb) {
|
|
bpf_spin_unlock(lock);
|
|
return -1;
|
|
}
|
|
|
|
n = container_of(rb, struct node_data, r);
|
|
res = n->key;
|
|
|
|
if (!remove_from_tree) {
|
|
bpf_spin_unlock(lock);
|
|
return res;
|
|
}
|
|
|
|
rb = bpf_rbtree_remove(root, rb);
|
|
bpf_spin_unlock(lock);
|
|
if (!rb)
|
|
return -2;
|
|
n = container_of(rb, struct node_data, r);
|
|
bpf_obj_drop(n);
|
|
return res;
|
|
}
|
|
|
|
static long __read_from_list(struct bpf_list_head *head,
|
|
struct bpf_spin_lock *lock,
|
|
bool remove_from_list)
|
|
{
|
|
struct bpf_list_node *l;
|
|
struct node_data *n;
|
|
long res = -99;
|
|
|
|
bpf_spin_lock(lock);
|
|
|
|
l = bpf_list_pop_front(head);
|
|
if (!l) {
|
|
bpf_spin_unlock(lock);
|
|
return -1;
|
|
}
|
|
|
|
n = container_of(l, struct node_data, l);
|
|
res = n->list_data;
|
|
|
|
if (!remove_from_list) {
|
|
if (bpf_list_push_back(head, &n->l)) {
|
|
bpf_spin_unlock(lock);
|
|
return -2;
|
|
}
|
|
}
|
|
|
|
bpf_spin_unlock(lock);
|
|
|
|
if (remove_from_list)
|
|
bpf_obj_drop(n);
|
|
return res;
|
|
}
|
|
|
|
static long __read_from_unstash(int idx)
|
|
{
|
|
struct node_data *n = NULL;
|
|
struct map_value *mapval;
|
|
long val = -99;
|
|
|
|
mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);
|
|
if (!mapval)
|
|
return -1;
|
|
|
|
n = bpf_kptr_xchg(&mapval->node, n);
|
|
if (!n)
|
|
return -2;
|
|
|
|
val = n->key;
|
|
bpf_obj_drop(n);
|
|
return val;
|
|
}
|
|
|
|
#define INSERT_READ_BOTH(rem_tree, rem_list, desc) \
|
|
SEC("tc") \
|
|
__description(desc) \
|
|
__success __retval(579) \
|
|
long insert_and_remove_tree_##rem_tree##_list_##rem_list(void *ctx) \
|
|
{ \
|
|
long err, tree_data, list_data; \
|
|
\
|
|
err = __insert_in_tree_and_list(&head, &root, &lock); \
|
|
if (err) \
|
|
return err; \
|
|
\
|
|
err = __read_from_tree(&root, &lock, rem_tree); \
|
|
if (err < 0) \
|
|
return err; \
|
|
else \
|
|
tree_data = err; \
|
|
\
|
|
err = __read_from_list(&head, &lock, rem_list); \
|
|
if (err < 0) \
|
|
return err; \
|
|
else \
|
|
list_data = err; \
|
|
\
|
|
return tree_data + list_data; \
|
|
}
|
|
|
|
/* After successful insert of struct node_data into both collections:
|
|
* - it should have refcount = 2
|
|
* - removing / not removing the node_data from a collection after
|
|
* reading should have no effect on ability to read / remove from
|
|
* the other collection
|
|
*/
|
|
INSERT_READ_BOTH(true, true, "insert_read_both: remove from tree + list");
|
|
INSERT_READ_BOTH(false, false, "insert_read_both: remove from neither");
|
|
INSERT_READ_BOTH(true, false, "insert_read_both: remove from tree");
|
|
INSERT_READ_BOTH(false, true, "insert_read_both: remove from list");
|
|
|
|
#undef INSERT_READ_BOTH
|
|
#define INSERT_READ_BOTH(rem_tree, rem_list, desc) \
|
|
SEC("tc") \
|
|
__description(desc) \
|
|
__success __retval(579) \
|
|
long insert_and_remove_lf_tree_##rem_tree##_list_##rem_list(void *ctx) \
|
|
{ \
|
|
long err, tree_data, list_data; \
|
|
\
|
|
err = __insert_in_tree_and_list(&head, &root, &lock); \
|
|
if (err) \
|
|
return err; \
|
|
\
|
|
err = __read_from_list(&head, &lock, rem_list); \
|
|
if (err < 0) \
|
|
return err; \
|
|
else \
|
|
list_data = err; \
|
|
\
|
|
err = __read_from_tree(&root, &lock, rem_tree); \
|
|
if (err < 0) \
|
|
return err; \
|
|
else \
|
|
tree_data = err; \
|
|
\
|
|
return tree_data + list_data; \
|
|
}
|
|
|
|
/* Similar to insert_read_both, but list data is read and possibly removed
|
|
* first
|
|
*
|
|
* Results should be no different than reading and possibly removing rbtree
|
|
* node first
|
|
*/
|
|
INSERT_READ_BOTH(true, true, "insert_read_both_list_first: remove from tree + list");
|
|
INSERT_READ_BOTH(false, false, "insert_read_both_list_first: remove from neither");
|
|
INSERT_READ_BOTH(true, false, "insert_read_both_list_first: remove from tree");
|
|
INSERT_READ_BOTH(false, true, "insert_read_both_list_first: remove from list");
|
|
|
|
#define INSERT_DOUBLE_READ_AND_DEL(read_fn, read_root, desc) \
|
|
SEC("tc") \
|
|
__description(desc) \
|
|
__success __retval(-1) \
|
|
long insert_double_##read_fn##_and_del_##read_root(void *ctx) \
|
|
{ \
|
|
long err, list_data; \
|
|
\
|
|
err = __insert_in_tree_and_list(&head, &root, &lock); \
|
|
if (err) \
|
|
return err; \
|
|
\
|
|
err = read_fn(&read_root, &lock, true); \
|
|
if (err < 0) \
|
|
return err; \
|
|
else \
|
|
list_data = err; \
|
|
\
|
|
err = read_fn(&read_root, &lock, true); \
|
|
if (err < 0) \
|
|
return err; \
|
|
\
|
|
return err + list_data; \
|
|
}
|
|
|
|
/* Insert into both tree and list, then try reading-and-removing from either twice
|
|
*
|
|
* The second read-and-remove should fail on read step since the node has
|
|
* already been removed
|
|
*/
|
|
INSERT_DOUBLE_READ_AND_DEL(__read_from_tree, root, "insert_double_del: 2x read-and-del from tree");
|
|
INSERT_DOUBLE_READ_AND_DEL(__read_from_list, head, "insert_double_del: 2x read-and-del from list");
|
|
|
|
#define INSERT_STASH_READ(rem_tree, desc) \
|
|
SEC("tc") \
|
|
__description(desc) \
|
|
__success __retval(84) \
|
|
long insert_rbtree_and_stash__del_tree_##rem_tree(void *ctx) \
|
|
{ \
|
|
long err, tree_data, map_data; \
|
|
\
|
|
err = __stash_map_insert_tree(0, 42, &root, &lock); \
|
|
if (err) \
|
|
return err; \
|
|
\
|
|
err = __read_from_tree(&root, &lock, rem_tree); \
|
|
if (err < 0) \
|
|
return err; \
|
|
else \
|
|
tree_data = err; \
|
|
\
|
|
err = __read_from_unstash(0); \
|
|
if (err < 0) \
|
|
return err; \
|
|
else \
|
|
map_data = err; \
|
|
\
|
|
return tree_data + map_data; \
|
|
}
|
|
|
|
/* Stash a refcounted node in map_val, insert same node into tree, then try
|
|
* reading data from tree then unstashed map_val, possibly removing from tree
|
|
*
|
|
* Removing from tree should have no effect on map_val kptr validity
|
|
*/
|
|
INSERT_STASH_READ(true, "insert_stash_read: remove from tree");
|
|
INSERT_STASH_READ(false, "insert_stash_read: don't remove from tree");
|
|
|
|
SEC("tc")
|
|
__success
|
|
long rbtree_refcounted_node_ref_escapes(void *ctx)
|
|
{
|
|
struct node_acquire *n, *m;
|
|
|
|
n = bpf_obj_new(typeof(*n));
|
|
if (!n)
|
|
return 1;
|
|
|
|
bpf_spin_lock(&alock);
|
|
bpf_rbtree_add(&aroot, &n->node, less_a);
|
|
m = bpf_refcount_acquire(n);
|
|
bpf_spin_unlock(&alock);
|
|
if (!m)
|
|
return 2;
|
|
|
|
m->key = 2;
|
|
bpf_obj_drop(m);
|
|
return 0;
|
|
}
|
|
|
|
SEC("tc")
|
|
__success
|
|
long rbtree_refcounted_node_ref_escapes_owning_input(void *ctx)
|
|
{
|
|
struct node_acquire *n, *m;
|
|
|
|
n = bpf_obj_new(typeof(*n));
|
|
if (!n)
|
|
return 1;
|
|
|
|
m = bpf_refcount_acquire(n);
|
|
m->key = 2;
|
|
|
|
bpf_spin_lock(&alock);
|
|
bpf_rbtree_add(&aroot, &n->node, less_a);
|
|
bpf_spin_unlock(&alock);
|
|
|
|
bpf_obj_drop(m);
|
|
|
|
return 0;
|
|
}
|
|
|
|
char _license[] SEC("license") = "GPL";
|