linux/kernel/bpf/bpf_iter.c
Andrii Nakryiko 66c8473135 bpf: move sleepable flag from bpf_prog_aux to bpf_prog
prog->aux->sleepable is checked very frequently as part of (some) BPF
program run hot paths. So this extra aux indirection seems wasteful and
on busy systems might cause unnecessary memory cache misses.

Let's move sleepable flag into prog itself to eliminate unnecessary
pointer dereference.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Message-ID: <20240309004739.2961431-1-andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-03-11 16:41:25 -07:00

845 lines
19 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/rcupdate_trace.h>
struct bpf_iter_target_info {
struct list_head list;
const struct bpf_iter_reg *reg_info;
u32 btf_id; /* cached value */
};
struct bpf_iter_link {
struct bpf_link link;
struct bpf_iter_aux_info aux;
struct bpf_iter_target_info *tinfo;
};
struct bpf_iter_priv_data {
struct bpf_iter_target_info *tinfo;
const struct bpf_iter_seq_info *seq_info;
struct bpf_prog *prog;
u64 session_id;
u64 seq_num;
bool done_stop;
u8 target_private[] __aligned(8);
};
static struct list_head targets = LIST_HEAD_INIT(targets);
static DEFINE_MUTEX(targets_mutex);
/* protect bpf_iter_link changes */
static DEFINE_MUTEX(link_mutex);
/* incremented on every opened seq_file */
static atomic64_t session_id;
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
const struct bpf_iter_seq_info *seq_info);
static void bpf_iter_inc_seq_num(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->seq_num++;
}
static void bpf_iter_dec_seq_num(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->seq_num--;
}
static void bpf_iter_done_stop(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->done_stop = true;
}
static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo)
{
return tinfo->reg_info->feature & BPF_ITER_RESCHED;
}
static bool bpf_iter_support_resched(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
return bpf_iter_target_support_resched(iter_priv->tinfo);
}
/* maximum visited objects before bailing out */
#define MAX_ITER_OBJECTS 1000000
/* bpf_seq_read, a customized and simpler version for bpf iterator.
* The following are differences from seq_read():
* . fixed buffer size (PAGE_SIZE)
* . assuming NULL ->llseek()
* . stop() may call bpf program, handling potential overflow there
*/
static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
loff_t *ppos)
{
struct seq_file *seq = file->private_data;
size_t n, offs, copied = 0;
int err = 0, num_objs = 0;
bool can_resched;
void *p;
mutex_lock(&seq->lock);
if (!seq->buf) {
seq->size = PAGE_SIZE << 3;
seq->buf = kvmalloc(seq->size, GFP_KERNEL);
if (!seq->buf) {
err = -ENOMEM;
goto done;
}
}
if (seq->count) {
n = min(seq->count, size);
err = copy_to_user(buf, seq->buf + seq->from, n);
if (err) {
err = -EFAULT;
goto done;
}
seq->count -= n;
seq->from += n;
copied = n;
goto done;
}
seq->from = 0;
p = seq->op->start(seq, &seq->index);
if (!p)
goto stop;
if (IS_ERR(p)) {
err = PTR_ERR(p);
seq->op->stop(seq, p);
seq->count = 0;
goto done;
}
err = seq->op->show(seq, p);
if (err > 0) {
/* object is skipped, decrease seq_num, so next
* valid object can reuse the same seq_num.
*/
bpf_iter_dec_seq_num(seq);
seq->count = 0;
} else if (err < 0 || seq_has_overflowed(seq)) {
if (!err)
err = -E2BIG;
seq->op->stop(seq, p);
seq->count = 0;
goto done;
}
can_resched = bpf_iter_support_resched(seq);
while (1) {
loff_t pos = seq->index;
num_objs++;
offs = seq->count;
p = seq->op->next(seq, p, &seq->index);
if (pos == seq->index) {
pr_info_ratelimited("buggy seq_file .next function %ps "
"did not updated position index\n",
seq->op->next);
seq->index++;
}
if (IS_ERR_OR_NULL(p))
break;
/* got a valid next object, increase seq_num */
bpf_iter_inc_seq_num(seq);
if (seq->count >= size)
break;
if (num_objs >= MAX_ITER_OBJECTS) {
if (offs == 0) {
err = -EAGAIN;
seq->op->stop(seq, p);
goto done;
}
break;
}
err = seq->op->show(seq, p);
if (err > 0) {
bpf_iter_dec_seq_num(seq);
seq->count = offs;
} else if (err < 0 || seq_has_overflowed(seq)) {
seq->count = offs;
if (offs == 0) {
if (!err)
err = -E2BIG;
seq->op->stop(seq, p);
goto done;
}
break;
}
if (can_resched)
cond_resched();
}
stop:
offs = seq->count;
if (IS_ERR(p)) {
seq->op->stop(seq, NULL);
err = PTR_ERR(p);
goto done;
}
/* bpf program called if !p */
seq->op->stop(seq, p);
if (!p) {
if (!seq_has_overflowed(seq)) {
bpf_iter_done_stop(seq);
} else {
seq->count = offs;
if (offs == 0) {
err = -E2BIG;
goto done;
}
}
}
n = min(seq->count, size);
err = copy_to_user(buf, seq->buf, n);
if (err) {
err = -EFAULT;
goto done;
}
copied = n;
seq->count -= n;
seq->from = n;
done:
if (!copied)
copied = err;
else
*ppos += copied;
mutex_unlock(&seq->lock);
return copied;
}
static const struct bpf_iter_seq_info *
__get_seq_info(struct bpf_iter_link *link)
{
const struct bpf_iter_seq_info *seq_info;
if (link->aux.map) {
seq_info = link->aux.map->ops->iter_seq_info;
if (seq_info)
return seq_info;
}
return link->tinfo->reg_info->seq_info;
}
static int iter_open(struct inode *inode, struct file *file)
{
struct bpf_iter_link *link = inode->i_private;
return prepare_seq_file(file, link, __get_seq_info(link));
}
static int iter_release(struct inode *inode, struct file *file)
{
struct bpf_iter_priv_data *iter_priv;
struct seq_file *seq;
seq = file->private_data;
if (!seq)
return 0;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
if (iter_priv->seq_info->fini_seq_private)
iter_priv->seq_info->fini_seq_private(seq->private);
bpf_prog_put(iter_priv->prog);
seq->private = iter_priv;
return seq_release_private(inode, file);
}
const struct file_operations bpf_iter_fops = {
.open = iter_open,
.llseek = no_llseek,
.read = bpf_seq_read,
.release = iter_release,
};
/* The argument reg_info will be cached in bpf_iter_target_info.
* The common practice is to declare target reg_info as
* a const static variable and passed as an argument to
* bpf_iter_reg_target().
*/
int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
{
struct bpf_iter_target_info *tinfo;
tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
return -ENOMEM;
tinfo->reg_info = reg_info;
INIT_LIST_HEAD(&tinfo->list);
mutex_lock(&targets_mutex);
list_add(&tinfo->list, &targets);
mutex_unlock(&targets_mutex);
return 0;
}
void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info)
{
struct bpf_iter_target_info *tinfo;
bool found = false;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (reg_info == tinfo->reg_info) {
list_del(&tinfo->list);
kfree(tinfo);
found = true;
break;
}
}
mutex_unlock(&targets_mutex);
WARN_ON(found == false);
}
static void cache_btf_id(struct bpf_iter_target_info *tinfo,
struct bpf_prog *prog)
{
tinfo->btf_id = prog->aux->attach_btf_id;
}
bool bpf_iter_prog_supported(struct bpf_prog *prog)
{
const char *attach_fname = prog->aux->attach_func_name;
struct bpf_iter_target_info *tinfo = NULL, *iter;
u32 prog_btf_id = prog->aux->attach_btf_id;
const char *prefix = BPF_ITER_FUNC_PREFIX;
int prefix_len = strlen(prefix);
if (strncmp(attach_fname, prefix, prefix_len))
return false;
mutex_lock(&targets_mutex);
list_for_each_entry(iter, &targets, list) {
if (iter->btf_id && iter->btf_id == prog_btf_id) {
tinfo = iter;
break;
}
if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) {
cache_btf_id(iter, prog);
tinfo = iter;
break;
}
}
mutex_unlock(&targets_mutex);
if (tinfo) {
prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
}
return tinfo != NULL;
}
const struct bpf_func_proto *
bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
const struct bpf_iter_target_info *tinfo;
const struct bpf_func_proto *fn = NULL;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (tinfo->btf_id == prog->aux->attach_btf_id) {
const struct bpf_iter_reg *reg_info;
reg_info = tinfo->reg_info;
if (reg_info->get_func_proto)
fn = reg_info->get_func_proto(func_id, prog);
break;
}
}
mutex_unlock(&targets_mutex);
return fn;
}
static void bpf_iter_link_release(struct bpf_link *link)
{
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
if (iter_link->tinfo->reg_info->detach_target)
iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
}
static void bpf_iter_link_dealloc(struct bpf_link *link)
{
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
kfree(iter_link);
}
static int bpf_iter_link_replace(struct bpf_link *link,
struct bpf_prog *new_prog,
struct bpf_prog *old_prog)
{
int ret = 0;
mutex_lock(&link_mutex);
if (old_prog && link->prog != old_prog) {
ret = -EPERM;
goto out_unlock;
}
if (link->prog->type != new_prog->type ||
link->prog->expected_attach_type != new_prog->expected_attach_type ||
link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
ret = -EINVAL;
goto out_unlock;
}
old_prog = xchg(&link->prog, new_prog);
bpf_prog_put(old_prog);
out_unlock:
mutex_unlock(&link_mutex);
return ret;
}
static void bpf_iter_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
bpf_iter_show_fdinfo_t show_fdinfo;
seq_printf(seq,
"target_name:\t%s\n",
iter_link->tinfo->reg_info->target);
show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo;
if (show_fdinfo)
show_fdinfo(&iter_link->aux, seq);
}
static int bpf_iter_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
char __user *ubuf = u64_to_user_ptr(info->iter.target_name);
bpf_iter_fill_link_info_t fill_link_info;
u32 ulen = info->iter.target_name_len;
const char *target_name;
u32 target_len;
if (!ulen ^ !ubuf)
return -EINVAL;
target_name = iter_link->tinfo->reg_info->target;
target_len = strlen(target_name);
info->iter.target_name_len = target_len + 1;
if (ubuf) {
if (ulen >= target_len + 1) {
if (copy_to_user(ubuf, target_name, target_len + 1))
return -EFAULT;
} else {
char zero = '\0';
if (copy_to_user(ubuf, target_name, ulen - 1))
return -EFAULT;
if (put_user(zero, ubuf + ulen - 1))
return -EFAULT;
return -ENOSPC;
}
}
fill_link_info = iter_link->tinfo->reg_info->fill_link_info;
if (fill_link_info)
return fill_link_info(&iter_link->aux, info);
return 0;
}
static const struct bpf_link_ops bpf_iter_link_lops = {
.release = bpf_iter_link_release,
.dealloc = bpf_iter_link_dealloc,
.update_prog = bpf_iter_link_replace,
.show_fdinfo = bpf_iter_link_show_fdinfo,
.fill_link_info = bpf_iter_link_fill_link_info,
};
bool bpf_link_is_iter(struct bpf_link *link)
{
return link->ops == &bpf_iter_link_lops;
}
int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
struct bpf_prog *prog)
{
struct bpf_iter_target_info *tinfo = NULL, *iter;
struct bpf_link_primer link_primer;
union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
u32 prog_btf_id, linfo_len;
bpfptr_t ulinfo;
int err;
if (attr->link_create.target_fd || attr->link_create.flags)
return -EINVAL;
memset(&linfo, 0, sizeof(union bpf_iter_link_info));
ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel);
linfo_len = attr->link_create.iter_info_len;
if (bpfptr_is_null(ulinfo) ^ !linfo_len)
return -EINVAL;
if (!bpfptr_is_null(ulinfo)) {
err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
linfo_len);
if (err)
return err;
linfo_len = min_t(u32, linfo_len, sizeof(linfo));
if (copy_from_bpfptr(&linfo, ulinfo, linfo_len))
return -EFAULT;
}
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
list_for_each_entry(iter, &targets, list) {
if (iter->btf_id == prog_btf_id) {
tinfo = iter;
break;
}
}
mutex_unlock(&targets_mutex);
if (!tinfo)
return -ENOENT;
/* Only allow sleepable program for resched-able iterator */
if (prog->sleepable && !bpf_iter_target_support_resched(tinfo))
return -EINVAL;
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
link->tinfo = tinfo;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
return err;
}
if (tinfo->reg_info->attach_target) {
err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
if (err) {
bpf_link_cleanup(&link_primer);
return err;
}
}
return bpf_link_settle(&link_primer);
}
static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
struct bpf_iter_target_info *tinfo,
const struct bpf_iter_seq_info *seq_info,
struct bpf_prog *prog)
{
priv_data->tinfo = tinfo;
priv_data->seq_info = seq_info;
priv_data->prog = prog;
priv_data->session_id = atomic64_inc_return(&session_id);
priv_data->seq_num = 0;
priv_data->done_stop = false;
}
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
const struct bpf_iter_seq_info *seq_info)
{
struct bpf_iter_priv_data *priv_data;
struct bpf_iter_target_info *tinfo;
struct bpf_prog *prog;
u32 total_priv_dsize;
struct seq_file *seq;
int err = 0;
mutex_lock(&link_mutex);
prog = link->link.prog;
bpf_prog_inc(prog);
mutex_unlock(&link_mutex);
tinfo = link->tinfo;
total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
seq_info->seq_priv_size;
priv_data = __seq_open_private(file, seq_info->seq_ops,
total_priv_dsize);
if (!priv_data) {
err = -ENOMEM;
goto release_prog;
}
if (seq_info->init_seq_private) {
err = seq_info->init_seq_private(priv_data->target_private, &link->aux);
if (err)
goto release_seq_file;
}
init_seq_meta(priv_data, tinfo, seq_info, prog);
seq = file->private_data;
seq->private = priv_data->target_private;
return 0;
release_seq_file:
seq_release_private(file->f_inode, file);
file->private_data = NULL;
release_prog:
bpf_prog_put(prog);
return err;
}
int bpf_iter_new_fd(struct bpf_link *link)
{
struct bpf_iter_link *iter_link;
struct file *file;
unsigned int flags;
int err, fd;
if (link->ops != &bpf_iter_link_lops)
return -EINVAL;
flags = O_RDONLY | O_CLOEXEC;
fd = get_unused_fd_flags(flags);
if (fd < 0)
return fd;
file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto free_fd;
}
iter_link = container_of(link, struct bpf_iter_link, link);
err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link));
if (err)
goto free_file;
fd_install(fd, file);
return fd;
free_file:
fput(file);
free_fd:
put_unused_fd(fd);
return err;
}
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
{
struct bpf_iter_priv_data *iter_priv;
struct seq_file *seq;
void *seq_priv;
seq = meta->seq;
if (seq->file->f_op != &bpf_iter_fops)
return NULL;
seq_priv = seq->private;
iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
target_private);
if (in_stop && iter_priv->done_stop)
return NULL;
meta->session_id = iter_priv->session_id;
meta->seq_num = iter_priv->seq_num;
return iter_priv->prog;
}
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
struct bpf_run_ctx run_ctx, *old_run_ctx;
int ret;
if (prog->sleepable) {
rcu_read_lock_trace();
migrate_disable();
might_fault();
old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock_trace();
} else {
rcu_read_lock();
migrate_disable();
old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock();
}
/* bpf program can only return 0 or 1:
* 0 : okay
* 1 : retry the same object
* The bpf_iter_run_prog() return value
* will be seq_ops->show() return value.
*/
return ret == 0 ? 0 : -EAGAIN;
}
BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
void *, callback_ctx, u64, flags)
{
return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
}
const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.func = bpf_for_each_map_elem,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_FUNC,
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
u64, flags)
{
bpf_callback_t callback = (bpf_callback_t)callback_fn;
u64 ret;
u32 i;
/* Note: these safety checks are also verified when bpf_loop
* is inlined, be careful to modify this code in sync. See
* function verifier.c:inline_bpf_loop.
*/
if (flags)
return -EINVAL;
if (nr_loops > BPF_MAX_LOOPS)
return -E2BIG;
for (i = 0; i < nr_loops; i++) {
ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret)
return i + 1;
}
return i;
}
const struct bpf_func_proto bpf_loop_proto = {
.func = bpf_loop,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_PTR_TO_FUNC,
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
struct bpf_iter_num_kern {
int cur; /* current value, inclusive */
int end; /* final value, exclusive */
} __aligned(8);
__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
{
struct bpf_iter_num_kern *s = (void *)it;
BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
/* start == end is legit, it's an empty range and we'll just get NULL
* on first (and any subsequent) bpf_iter_num_next() call
*/
if (start > end) {
s->cur = s->end = 0;
return -EINVAL;
}
/* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
s->cur = s->end = 0;
return -E2BIG;
}
/* user will call bpf_iter_num_next() first,
* which will set s->cur to exactly start value;
* underflow shouldn't matter
*/
s->cur = start - 1;
s->end = end;
return 0;
}
__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
{
struct bpf_iter_num_kern *s = (void *)it;
/* check failed initialization or if we are done (same behavior);
* need to be careful about overflow, so convert to s64 for checks,
* e.g., if s->cur == s->end == INT_MAX, we can't just do
* s->cur + 1 >= s->end
*/
if ((s64)(s->cur + 1) >= s->end) {
s->cur = s->end = 0;
return NULL;
}
s->cur++;
return &s->cur;
}
__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
{
struct bpf_iter_num_kern *s = (void *)it;
s->cur = s->end = 0;
}
__bpf_kfunc_end_defs();