bpf-for-netdev

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQTFp0I1jqZrAX+hPRXbK58LschIgwUCZalBVQAKCRDbK58LschI
 gyfQAP4+KhkJiJiOXsECo0f3JcuzDgCqEMnylNx0Wujzgs2s9wD+LEjYr8zztqUd
 E9rkjGKUoSYYfarEJ0KKfy6Lv61BlgY=
 =xI6t
 -----END PGP SIGNATURE-----

Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2024-01-18

We've added 10 non-merge commits during the last 5 day(s) which contain
a total of 12 files changed, 806 insertions(+), 51 deletions(-).

The main changes are:

1) Fix an issue in bpf_iter_udp where the iterator could make backward
   progress, preventing the user space process from finishing iteration,
   from Martin KaFai Lau.

2) Fix the BPF verifier to reject variable-offset ALU on registers of type
   PTR_TO_FLOW_KEYS in order to prevent out-of-bounds access, from Hao Sun.

3) Follow-up fixes for the kernel- and libbpf-side logic around handling
   arg:ctx tagged arguments of BPF global subprogs, from Andrii Nakryiko.
   A usage sketch follows this list.
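
For orientation, a minimal sketch (not part of this series; function names
are illustrative, and it assumes a bpf_helpers.h that defines the __arg_ctx
macro) of what an arg:ctx tagged global subprog looks like from the BPF
program side:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  __u64 stack[8];

  /* The arg:ctx decl tag tells the verifier (or, on older kernels,
   * libbpf's BTF-rewrite fallback) that `ctx` is the program's context
   * pointer, so helpers demanding PTR_TO_CTX can be called on it.
   */
  __weak int handle_ctx(void *ctx __arg_ctx)
  {
          return bpf_get_stack(ctx, stack, sizeof(stack), 0);
  }

  SEC("raw_tp")
  int prog(void *ctx)
  {
          return handle_ctx(ctx);
  }

  char _license[] SEC("license") = "GPL";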

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  libbpf: warn on unexpected __arg_ctx type when rewriting BTF
  selftests/bpf: add tests confirming type logic in kernel for __arg_ctx
  bpf: enforce types for __arg_ctx-tagged arguments in global subprogs
  bpf: extract bpf_ctx_convert_map logic and make it more reusable
  libbpf: feature-detect arg:ctx tag support in kernel
  selftests/bpf: Add test for alu on PTR_TO_FLOW_KEYS
  bpf: Reject variable offset alu on PTR_TO_FLOW_KEYS
  selftests/bpf: Test udp and tcp iter batching
  bpf: Avoid iter->offset making backward progress in bpf_iter_udp
  bpf: iter_udp: Retry with a larger batch size without going back to the previous bucket
====================

Link: https://lore.kernel.org/r/20240118153936.11769-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

@@ -512,7 +512,7 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id);
int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
struct module *owner);
struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id);
const struct btf_member *
const struct btf_type *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg);

@@ -5615,21 +5615,46 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
const struct btf_member *
static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type)
{
const struct btf_type *conv_struct;
const struct btf_member *ctx_type;
conv_struct = bpf_ctx_convert.t;
if (!conv_struct)
return NULL;
/* prog_type is valid bpf program type. No need for bounds check. */
ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
/* ctx_type is a pointer to prog_ctx_type in vmlinux.
* Like 'struct __sk_buff'
*/
return btf_type_by_id(btf_vmlinux, ctx_type->type);
}
static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
{
const struct btf_type *conv_struct;
const struct btf_member *ctx_type;
conv_struct = bpf_ctx_convert.t;
if (!conv_struct)
return -EFAULT;
/* prog_type is valid bpf program type. No need for bounds check. */
ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
/* ctx_type is a pointer to prog_ctx_type in vmlinux.
* Like 'struct sk_buff'
*/
return ctx_type->type;
}
const struct btf_type *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
{
const struct btf_type *conv_struct;
const struct btf_type *ctx_struct;
const struct btf_member *ctx_type;
const struct btf_type *ctx_type;
const char *tname, *ctx_tname;
conv_struct = bpf_ctx_convert.t;
if (!conv_struct) {
bpf_log(log, "btf_vmlinux is malformed\n");
return NULL;
}
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
@@ -5646,17 +5671,15 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
return NULL;
}
/* prog_type is valid bpf program type. No need for bounds check. */
ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
/* ctx_struct is a pointer to prog_ctx_type in vmlinux.
* Like 'struct __sk_buff'
*/
ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type);
if (!ctx_struct)
ctx_type = find_canonical_prog_ctx_type(prog_type);
if (!ctx_type) {
bpf_log(log, "btf_vmlinux is malformed\n");
/* should not happen */
return NULL;
}
again:
ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
if (!ctx_tname) {
/* should not happen */
bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
@@ -5677,28 +5700,167 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
/* bpf_user_pt_regs_t is a typedef, so resolve it to
* underlying struct and check name again
*/
if (!btf_type_is_modifier(ctx_struct))
if (!btf_type_is_modifier(ctx_type))
return NULL;
while (btf_type_is_modifier(ctx_struct))
ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type);
while (btf_type_is_modifier(ctx_type))
ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
goto again;
}
return ctx_type;
}
/* forward declarations for arch-specific underlying types of
* bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef
* compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still
* works correctly with __builtin_types_compatible_p() on respective
* architectures
*/
struct user_regs_struct;
struct user_pt_regs;
static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int arg,
enum bpf_prog_type prog_type,
enum bpf_attach_type attach_type)
{
const struct btf_type *ctx_type;
const char *tname, *ctx_tname;
if (!btf_is_ptr(t)) {
bpf_log(log, "arg#%d type isn't a pointer\n", arg);
return -EINVAL;
}
t = btf_type_by_id(btf, t->type);
/* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */
if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) {
while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
t = btf_type_by_id(btf, t->type);
if (btf_type_is_typedef(t)) {
tname = btf_name_by_offset(btf, t->name_off);
if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
return 0;
}
}
/* all other program types don't use typedefs for context type */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
/* `void *ctx __arg_ctx` is always valid */
if (btf_type_is_void(t))
return 0;
tname = btf_name_by_offset(btf, t->name_off);
if (str_is_empty(tname)) {
bpf_log(log, "arg#%d type doesn't have a name\n", arg);
return -EINVAL;
}
/* special cases */
switch (prog_type) {
case BPF_PROG_TYPE_KPROBE:
if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
return 0;
break;
case BPF_PROG_TYPE_PERF_EVENT:
if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) &&
__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
return 0;
if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) &&
__btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0)
return 0;
if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) &&
__btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0)
return 0;
break;
case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
/* allow u64* as ctx */
if (btf_is_int(t) && t->size == 8)
return 0;
break;
case BPF_PROG_TYPE_TRACING:
switch (attach_type) {
case BPF_TRACE_RAW_TP:
/* tp_btf program is TRACING, so need special case here */
if (__btf_type_is_struct(t) &&
strcmp(tname, "bpf_raw_tracepoint_args") == 0)
return 0;
/* allow u64* as ctx */
if (btf_is_int(t) && t->size == 8)
return 0;
break;
case BPF_TRACE_ITER:
/* allow struct bpf_iter__xxx types only */
if (__btf_type_is_struct(t) &&
strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0)
return 0;
break;
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
/* allow u64* as ctx */
if (btf_is_int(t) && t->size == 8)
return 0;
break;
default:
break;
}
break;
case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_STRUCT_OPS:
/* allow u64* as ctx */
if (btf_is_int(t) && t->size == 8)
return 0;
break;
case BPF_PROG_TYPE_TRACEPOINT:
case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
return 0; /* anything goes */
default:
break;
}
ctx_type = find_canonical_prog_ctx_type(prog_type);
if (!ctx_type) {
/* should not happen */
bpf_log(log, "btf_vmlinux is malformed\n");
return -EINVAL;
}
/* resolve typedefs and check that underlying structs are matching as well */
while (btf_type_is_modifier(ctx_type))
ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
/* if program type doesn't have distinctly named struct type for
* context, then __arg_ctx argument can only be `void *`, which we
* already checked above
*/
if (!__btf_type_is_struct(ctx_type)) {
bpf_log(log, "arg#%d should be void pointer\n", arg);
return -EINVAL;
}
ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) {
bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname);
return -EINVAL;
}
return 0;
}
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
enum bpf_prog_type prog_type,
int arg)
{
const struct btf_member *prog_ctx_type, *kern_ctx_type;
prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
if (!prog_ctx_type)
if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg))
return -ENOENT;
kern_ctx_type = prog_ctx_type + 1;
return kern_ctx_type->type;
return find_kern_ctx_type_id(prog_type);
}
int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
@@ -6934,6 +7096,23 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
return -EINVAL;
}
for (i = 0; i < nargs; i++) {
const char *tag;
if (sub->args[i].arg_type != ARG_PTR_TO_CTX)
continue;
/* check if arg has "arg:ctx" tag */
t = btf_type_by_id(btf, args[i].type);
tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0)
continue;
if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
prog->expected_attach_type))
return -EINVAL;
}
sub->arg_cnt = nargs;
sub->args_cached = true;

@@ -12826,6 +12826,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
switch (base_type(ptr_reg->type)) {
case PTR_TO_FLOW_KEYS:
if (known)
break;
fallthrough;
case CONST_PTR_TO_MAP:
/* smin_val represents the known value */
if (known && smin_val == 0 && opcode == BPF_ADD)
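
To make the verifier change concrete, a hedged sketch (hypothetical flow
dissector program, not taken from this series) of the now-rejected pattern,
i.e. ALU with a variable offset on a PTR_TO_FLOW_KEYS register:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  SEC("flow_dissector")
  int dissect(struct __sk_buff *skb)
  {
          struct bpf_flow_keys *keys = skb->flow_keys;
          /* value unknown at verification time -> variable offset */
          __u64 off = bpf_get_prandom_u32() & 8;

          /* with this fix the verifier rejects the load below with
           * "pointer arithmetic on flow_keys prohibited"
           */
          return *((__u8 *)keys + off);
  }

  char _license[] SEC("license") = "GPL";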

@@ -3137,16 +3137,18 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
struct bpf_udp_iter_state *iter = seq->private;
struct udp_iter_state *state = &iter->state;
struct net *net = seq_file_net(seq);
int resume_bucket, resume_offset;
struct udp_table *udptable;
unsigned int batch_sks = 0;
bool resized = false;
struct sock *sk;
resume_bucket = state->bucket;
resume_offset = iter->offset;
/* The current batch is done, so advance the bucket. */
if (iter->st_bucket_done) {
if (iter->st_bucket_done)
state->bucket++;
iter->offset = 0;
}
udptable = udp_get_table_seq(seq, net);
@@ -3166,19 +3168,19 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
if (hlist_empty(&hslot2->head)) {
iter->offset = 0;
if (hlist_empty(&hslot2->head))
continue;
}
iter->offset = 0;
spin_lock_bh(&hslot2->lock);
udp_portaddr_for_each_entry(sk, &hslot2->head) {
if (seq_sk_match(seq, sk)) {
/* Resume from the last iterated socket at the
* offset in the bucket before iterator was stopped.
*/
if (iter->offset) {
--iter->offset;
if (state->bucket == resume_bucket &&
iter->offset < resume_offset) {
++iter->offset;
continue;
}
if (iter->end_sk < iter->max_sk) {
@@ -3192,9 +3194,6 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
if (iter->end_sk)
break;
/* Reset the current bucket's offset before moving to the next bucket. */
iter->offset = 0;
}
/* All done: no batch made. */
@@ -3213,7 +3212,6 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
/* After allocating a larger batch, retry one more time to grab
* the whole bucket.
*/
state->bucket--;
goto again;
}
done:
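
For context, the user-space side that exercises this resume logic is just a
read() loop on a bpf_iter fd. A minimal sketch, assuming `link` is an
already-attached iter/udp program (the selftest below does the same with
single-int reads):

  #include <unistd.h>
  #include <bpf/bpf.h>
  #include <bpf/libbpf.h>

  /* Short reads stop and later restart the iterator; on restart,
   * bpf_iter_udp_batch() must pick up at (resume_bucket, resume_offset)
   * without revisiting or skipping sockets.
   */
  static int drain_udp_iter(struct bpf_link *link)
  {
          char buf[64];
          ssize_t n;
          int iter_fd = bpf_iter_create(bpf_link__fd(link));

          if (iter_fd < 0)
                  return -1;
          while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
                  ; /* small chunks force repeated batching */
          close(iter_fd);
          return n == 0 ? 0 : -1;
  }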

@@ -6695,6 +6695,67 @@ static struct {
/* all other program types don't have "named" context structs */
};
static bool need_func_arg_type_fixup(const struct btf *btf, const struct bpf_program *prog,
const char *subprog_name, int arg_idx,
int arg_type_id, const char *ctx_name)
{
const struct btf_type *t;
const char *tname;
/* check if existing parameter already matches verifier expectations */
t = skip_mods_and_typedefs(btf, arg_type_id, NULL);
if (!btf_is_ptr(t))
goto out_warn;
/* typedef bpf_user_pt_regs_t is a special PITA case, valid for kprobe
* and perf_event programs, so check this case early on and forget
* about it for subsequent checks
*/
while (btf_is_mod(t))
t = btf__type_by_id(btf, t->type);
if (btf_is_typedef(t) &&
(prog->type == BPF_PROG_TYPE_KPROBE || prog->type == BPF_PROG_TYPE_PERF_EVENT)) {
tname = btf__str_by_offset(btf, t->name_off) ?: "<anon>";
if (strcmp(tname, "bpf_user_pt_regs_t") == 0)
return false; /* canonical type for kprobe/perf_event */
}
/* now we can ignore typedefs moving forward */
t = skip_mods_and_typedefs(btf, t->type, NULL);
/* if it's `void *`, definitely fix up BTF info */
if (btf_is_void(t))
return true;
/* if it's already proper canonical type, no need to fix up */
tname = btf__str_by_offset(btf, t->name_off) ?: "<anon>";
if (btf_is_struct(t) && strcmp(tname, ctx_name) == 0)
return false;
/* special cases */
switch (prog->type) {
case BPF_PROG_TYPE_KPROBE:
case BPF_PROG_TYPE_PERF_EVENT:
/* `struct pt_regs *` is expected, but we need to fix up */
if (btf_is_struct(t) && strcmp(tname, "pt_regs") == 0)
return true;
break;
case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
/* allow u64* as ctx */
if (btf_is_int(t) && t->size == 8)
return true;
break;
default:
break;
}
out_warn:
pr_warn("prog '%s': subprog '%s' arg#%d is expected to be of `struct %s *` type\n",
prog->name, subprog_name, arg_idx, ctx_name);
return false;
}
static int clone_func_btf_info(struct btf *btf, int orig_fn_id, struct bpf_program *prog)
{
int fn_id, fn_proto_id, ret_type_id, orig_proto_id;
@@ -6757,6 +6818,69 @@ static int clone_func_btf_info(struct btf *btf, int orig_fn_id, struct bpf_progr
return fn_id;
}
static int probe_kern_arg_ctx_tag(void)
{
/* To minimize merge conflicts with BPF token series that refactors
* feature detection code a lot, we don't integrate
* probe_kern_arg_ctx_tag() into kernel_supports() feature-detection
* framework yet, doing our own caching internally.
* This will be cleaned up a bit later when bpf/bpf-next trees settle.
*/
static int cached_result = -1;
static const char strs[] = "\0a\0b\0arg:ctx\0";
const __u32 types[] = {
/* [1] INT */
BTF_TYPE_INT_ENC(1 /* "a" */, BTF_INT_SIGNED, 0, 32, 4),
/* [2] PTR -> VOID */
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0),
/* [3] FUNC_PROTO `int(void *a)` */
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1),
BTF_PARAM_ENC(1 /* "a" */, 2),
/* [4] FUNC 'a' -> FUNC_PROTO (main prog) */
BTF_TYPE_ENC(1 /* "a" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 3),
/* [5] FUNC_PROTO `int(void *b __arg_ctx)` */
BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1),
BTF_PARAM_ENC(3 /* "b" */, 2),
/* [6] FUNC 'b' -> FUNC_PROTO (subprog) */
BTF_TYPE_ENC(3 /* "b" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 5),
/* [7] DECL_TAG 'arg:ctx' -> func 'b' arg 'b' */
BTF_TYPE_DECL_TAG_ENC(5 /* "arg:ctx" */, 6, 0),
};
const struct bpf_insn insns[] = {
/* main prog */
BPF_CALL_REL(+1),
BPF_EXIT_INSN(),
/* global subprog */
BPF_EMIT_CALL(BPF_FUNC_get_func_ip), /* needs PTR_TO_CTX */
BPF_EXIT_INSN(),
};
const struct bpf_func_info_min func_infos[] = {
{ 0, 4 }, /* main prog -> FUNC 'a' */
{ 2, 6 }, /* subprog -> FUNC 'b' */
};
LIBBPF_OPTS(bpf_prog_load_opts, opts);
int prog_fd, btf_fd, insn_cnt = ARRAY_SIZE(insns);
if (cached_result >= 0)
return cached_result;
btf_fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs));
if (btf_fd < 0)
return 0;
opts.prog_btf_fd = btf_fd;
opts.func_info = &func_infos;
opts.func_info_cnt = ARRAY_SIZE(func_infos);
opts.func_info_rec_size = sizeof(func_infos[0]);
prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, "det_arg_ctx",
"GPL", insns, insn_cnt, &opts);
close(btf_fd);
cached_result = probe_fd(prog_fd);
return cached_result;
}
/* Check if main program or global subprog's function prototype has `arg:ctx`
* argument tags, and, if necessary, substitute correct type to match what BPF
* verifier would expect, taking into account specific program type. This
@@ -6766,7 +6890,7 @@ static int clone_func_btf_info(struct btf *btf, int orig_fn_id, struct bpf_progr
*/
static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_program *prog)
{
const char *ctx_name = NULL, *ctx_tag = "arg:ctx";
const char *ctx_name = NULL, *ctx_tag = "arg:ctx", *fn_name;
struct bpf_func_info_min *func_rec;
struct btf_type *fn_t, *fn_proto_t;
struct btf *btf = obj->btf;
@@ -6780,6 +6904,10 @@ static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_progra
if (!obj->btf_ext || !prog->func_info)
return 0;
/* don't do any fix ups if kernel natively supports __arg_ctx */
if (probe_kern_arg_ctx_tag() > 0)
return 0;
/* some BPF program types just don't have named context structs, so
* this fallback mechanism doesn't work for them
*/
@@ -6842,15 +6970,11 @@ static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_progra
if (arg_idx < 0 || arg_idx >= arg_cnt)
continue;
/* check if existing parameter already matches verifier expectations */
/* check if we should fix up argument type */
p = &btf_params(fn_proto_t)[arg_idx];
t = skip_mods_and_typedefs(btf, p->type, NULL);
if (btf_is_ptr(t) &&
(t = skip_mods_and_typedefs(btf, t->type, NULL)) &&
btf_is_struct(t) &&
strcmp(btf__str_by_offset(btf, t->name_off), ctx_name) == 0) {
continue; /* no need for fix up */
}
fn_name = btf__str_by_offset(btf, fn_t->name_off) ?: "<anon>";
if (!need_func_arg_type_fixup(btf, prog, fn_name, arg_idx, p->type, ctx_name))
continue;
/* clone fn/fn_proto, unless we already did it for another arg */
if (func_rec->type_id == orig_fn_id) {

@@ -0,0 +1,135 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2024 Meta
#include <test_progs.h>
#include "network_helpers.h"
#include "sock_iter_batch.skel.h"
#define TEST_NS "sock_iter_batch_netns"
static const int nr_soreuse = 4;
static void do_test(int sock_type, bool onebyone)
{
int err, i, nread, to_read, total_read, iter_fd = -1;
int first_idx, second_idx, indices[nr_soreuse];
struct bpf_link *link = NULL;
struct sock_iter_batch *skel;
int *fds[2] = {};
skel = sock_iter_batch__open();
if (!ASSERT_OK_PTR(skel, "sock_iter_batch__open"))
return;
/* Prepare 2 buckets of sockets in the kernel hashtable */
for (i = 0; i < ARRAY_SIZE(fds); i++) {
int local_port;
fds[i] = start_reuseport_server(AF_INET6, sock_type, "::1", 0, 0,
nr_soreuse);
if (!ASSERT_OK_PTR(fds[i], "start_reuseport_server"))
goto done;
local_port = get_socket_local_port(*fds[i]);
if (!ASSERT_GE(local_port, 0, "get_socket_local_port"))
goto done;
skel->rodata->ports[i] = ntohs(local_port);
}
err = sock_iter_batch__load(skel);
if (!ASSERT_OK(err, "sock_iter_batch__load"))
goto done;
link = bpf_program__attach_iter(sock_type == SOCK_STREAM ?
skel->progs.iter_tcp_soreuse :
skel->progs.iter_udp_soreuse,
NULL);
if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter"))
goto done;
iter_fd = bpf_iter_create(bpf_link__fd(link));
if (!ASSERT_GE(iter_fd, 0, "bpf_iter_create"))
goto done;
/* Test reading a bucket (either from fds[0] or fds[1]).
* Only read "nr_soreuse - 1" number of sockets
* from a bucket and leave one socket out from
* that bucket on purpose.
*/
to_read = (nr_soreuse - 1) * sizeof(*indices);
total_read = 0;
first_idx = -1;
do {
nread = read(iter_fd, indices, onebyone ? sizeof(*indices) : to_read);
if (nread <= 0 || nread % sizeof(*indices))
break;
total_read += nread;
if (first_idx == -1)
first_idx = indices[0];
for (i = 0; i < nread / sizeof(*indices); i++)
ASSERT_EQ(indices[i], first_idx, "first_idx");
} while (total_read < to_read);
ASSERT_EQ(nread, onebyone ? sizeof(*indices) : to_read, "nread");
ASSERT_EQ(total_read, to_read, "total_read");
free_fds(fds[first_idx], nr_soreuse);
fds[first_idx] = NULL;
/* Read the "whole" second bucket */
to_read = nr_soreuse * sizeof(*indices);
total_read = 0;
second_idx = !first_idx;
do {
nread = read(iter_fd, indices, onebyone ? sizeof(*indices) : to_read);
if (nread <= 0 || nread % sizeof(*indices))
break;
total_read += nread;
for (i = 0; i < nread / sizeof(*indices); i++)
ASSERT_EQ(indices[i], second_idx, "second_idx");
} while (total_read <= to_read);
ASSERT_EQ(nread, 0, "nread");
/* Both so_reuseport ports should land in different buckets, so
* total_read must equal the expected to_read.
*
* In the very unlikely case that both ports collide in the same
* bucket, the bucket offset (i.e. 3) will be skipped, and the full
* to_read number of bytes cannot be expected.
*/
if (skel->bss->bucket[0] != skel->bss->bucket[1])
ASSERT_EQ(total_read, to_read, "total_read");
done:
for (i = 0; i < ARRAY_SIZE(fds); i++)
free_fds(fds[i], nr_soreuse);
if (iter_fd >= 0)
close(iter_fd);
bpf_link__destroy(link);
sock_iter_batch__destroy(skel);
}
void test_sock_iter_batch(void)
{
struct nstoken *nstoken = NULL;
SYS_NOFAIL("ip netns del " TEST_NS " &> /dev/null");
SYS(done, "ip netns add %s", TEST_NS);
SYS(done, "ip -net %s link set dev lo up", TEST_NS);
nstoken = open_netns(TEST_NS);
if (!ASSERT_OK_PTR(nstoken, "open_netns"))
goto done;
if (test__start_subtest("tcp")) {
do_test(SOCK_STREAM, true);
do_test(SOCK_STREAM, false);
}
if (test__start_subtest("udp")) {
do_test(SOCK_DGRAM, true);
do_test(SOCK_DGRAM, false);
}
close_netns(nstoken);
done:
SYS_NOFAIL("ip netns del " TEST_NS " &> /dev/null");
}

@@ -47,6 +47,19 @@ static void subtest_ctx_arg_rewrite(void)
struct btf *btf = NULL;
__u32 info_len = sizeof(info);
int err, fd, i;
struct btf *kern_btf = NULL;
kern_btf = btf__load_vmlinux_btf();
if (!ASSERT_OK_PTR(kern_btf, "kern_btf_load"))
return;
/* simple detection of kernel native arg:ctx tag support */
if (btf__find_by_name_kind(kern_btf, "bpf_subprog_arg_info", BTF_KIND_STRUCT) > 0) {
test__skip();
btf__free(kern_btf);
return;
}
btf__free(kern_btf);
skel = test_global_func_ctx_args__open();
if (!ASSERT_OK_PTR(skel, "skel_open"))


@@ -72,6 +72,8 @@
#define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr
#define inet_dport sk.__sk_common.skc_dport
#define udp_portaddr_hash inet.sk.__sk_common.skc_u16hashes[1]
#define ir_loc_addr req.__req_common.skc_rcv_saddr
#define ir_num req.__req_common.skc_num
#define ir_rmt_addr req.__req_common.skc_daddr
@@ -85,6 +87,7 @@
#define sk_rmem_alloc sk_backlog.rmem_alloc
#define sk_refcnt __sk_common.skc_refcnt
#define sk_state __sk_common.skc_state
#define sk_net __sk_common.skc_net
#define sk_v6_daddr __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
#define sk_flags __sk_common.skc_flags

@@ -0,0 +1,91 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2024 Meta
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_endian.h>
#include "bpf_tracing_net.h"
#include "bpf_kfuncs.h"
#define ATTR __always_inline
#include "test_jhash.h"
static bool ipv6_addr_loopback(const struct in6_addr *a)
{
return (a->s6_addr32[0] | a->s6_addr32[1] |
a->s6_addr32[2] | (a->s6_addr32[3] ^ bpf_htonl(1))) == 0;
}
volatile const __u16 ports[2];
unsigned int bucket[2];
SEC("iter/tcp")
int iter_tcp_soreuse(struct bpf_iter__tcp *ctx)
{
struct sock *sk = (struct sock *)ctx->sk_common;
struct inet_hashinfo *hinfo;
unsigned int hash;
struct net *net;
int idx;
if (!sk)
return 0;
sk = bpf_rdonly_cast(sk, bpf_core_type_id_kernel(struct sock));
if (sk->sk_family != AF_INET6 ||
sk->sk_state != TCP_LISTEN ||
!ipv6_addr_loopback(&sk->sk_v6_rcv_saddr))
return 0;
if (sk->sk_num == ports[0])
idx = 0;
else if (sk->sk_num == ports[1])
idx = 1;
else
return 0;
/* bucket selection as in inet_lhash2_bucket_sk() */
net = sk->sk_net.net;
hash = jhash2(sk->sk_v6_rcv_saddr.s6_addr32, 4, net->hash_mix);
hash ^= sk->sk_num;
hinfo = net->ipv4.tcp_death_row.hashinfo;
bucket[idx] = hash & hinfo->lhash2_mask;
bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx));
return 0;
}
#define udp_sk(ptr) container_of(ptr, struct udp_sock, inet.sk)
SEC("iter/udp")
int iter_udp_soreuse(struct bpf_iter__udp *ctx)
{
struct sock *sk = (struct sock *)ctx->udp_sk;
struct udp_table *udptable;
int idx;
if (!sk)
return 0;
sk = bpf_rdonly_cast(sk, bpf_core_type_id_kernel(struct sock));
if (sk->sk_family != AF_INET6 ||
!ipv6_addr_loopback(&sk->sk_v6_rcv_saddr))
return 0;
if (sk->sk_num == ports[0])
idx = 0;
else if (sk->sk_num == ports[1])
idx = 1;
else
return 0;
/* bucket selection as in udp_hashslot2() */
udptable = sk->sk_net.net->ipv4.udp_table;
bucket[idx] = udp_sk(sk)->udp_portaddr_hash & udptable->mask;
bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx));
return 0;
}
char _license[] SEC("license") = "GPL";

@@ -69,3 +69,34 @@ u32 jhash(const void *key, u32 length, u32 initval)
return c;
}
static __always_inline u32 jhash2(const u32 *k, u32 length, u32 initval)
{
u32 a, b, c;
/* Set up the internal state */
a = b = c = JHASH_INITVAL + (length<<2) + initval;
/* Handle most of the key */
while (length > 3) {
a += k[0];
b += k[1];
c += k[2];
__jhash_mix(a, b, c);
length -= 3;
k += 3;
}
/* Handle the last 3 u32's */
switch (length) {
case 3: c += k[2];
case 2: b += k[1];
case 1: a += k[0];
__jhash_final(a, b, c);
break;
case 0: /* Nothing left to add */
break;
}
return c;
}

@@ -3,6 +3,7 @@
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_misc.h"
#include "xdp_metadata.h"
#include "bpf_kfuncs.h"
@@ -138,25 +139,182 @@ __weak int subprog_ctx_tag(void *ctx __arg_ctx)
return bpf_get_stack(ctx, stack, sizeof(stack), 0);
}
__weak int raw_tp_canonical(struct bpf_raw_tracepoint_args *ctx __arg_ctx)
{
return 0;
}
__weak int raw_tp_u64_array(u64 *ctx __arg_ctx)
{
return 0;
}
SEC("?raw_tp")
__success __log_level(2)
int arg_tag_ctx_raw_tp(void *ctx)
{
return subprog_ctx_tag(ctx);
return subprog_ctx_tag(ctx) + raw_tp_canonical(ctx) + raw_tp_u64_array(ctx);
}
SEC("?raw_tp.w")
__success __log_level(2)
int arg_tag_ctx_raw_tp_writable(void *ctx)
{
return subprog_ctx_tag(ctx) + raw_tp_canonical(ctx) + raw_tp_u64_array(ctx);
}
SEC("?tp_btf/sys_enter")
__success __log_level(2)
int arg_tag_ctx_raw_tp_btf(void *ctx)
{
return subprog_ctx_tag(ctx) + raw_tp_canonical(ctx) + raw_tp_u64_array(ctx);
}
struct whatever { };
__weak int tp_whatever(struct whatever *ctx __arg_ctx)
{
return 0;
}
SEC("?tp")
__success __log_level(2)
int arg_tag_ctx_tp(void *ctx)
{
return subprog_ctx_tag(ctx);
return subprog_ctx_tag(ctx) + tp_whatever(ctx);
}
__weak int kprobe_subprog_pt_regs(struct pt_regs *ctx __arg_ctx)
{
return 0;
}
__weak int kprobe_subprog_typedef(bpf_user_pt_regs_t *ctx __arg_ctx)
{
return 0;
}
SEC("?kprobe")
__success __log_level(2)
int arg_tag_ctx_kprobe(void *ctx)
{
return subprog_ctx_tag(ctx);
return subprog_ctx_tag(ctx) +
kprobe_subprog_pt_regs(ctx) +
kprobe_subprog_typedef(ctx);
}
__weak int perf_subprog_regs(
#if defined(bpf_target_riscv)
struct user_regs_struct *ctx __arg_ctx
#elif defined(bpf_target_s390)
/* user_pt_regs typedef is anonymous struct, so only `void *` works */
void *ctx __arg_ctx
#elif defined(bpf_target_loongarch) || defined(bpf_target_arm64) || defined(bpf_target_powerpc)
struct user_pt_regs *ctx __arg_ctx
#else
struct pt_regs *ctx __arg_ctx
#endif
)
{
return 0;
}
__weak int perf_subprog_typedef(bpf_user_pt_regs_t *ctx __arg_ctx)
{
return 0;
}
__weak int perf_subprog_canonical(struct bpf_perf_event_data *ctx __arg_ctx)
{
return 0;
}
SEC("?perf_event")
__success __log_level(2)
int arg_tag_ctx_perf(void *ctx)
{
return subprog_ctx_tag(ctx) +
perf_subprog_regs(ctx) +
perf_subprog_typedef(ctx) +
perf_subprog_canonical(ctx);
}
__weak int iter_subprog_void(void *ctx __arg_ctx)
{
return 0;
}
__weak int iter_subprog_typed(struct bpf_iter__task *ctx __arg_ctx)
{
return 0;
}
SEC("?iter/task")
__success __log_level(2)
int arg_tag_ctx_iter_task(struct bpf_iter__task *ctx)
{
return (iter_subprog_void(ctx) + iter_subprog_typed(ctx)) & 1;
}
__weak int tracing_subprog_void(void *ctx __arg_ctx)
{
return 0;
}
__weak int tracing_subprog_u64(u64 *ctx __arg_ctx)
{
return 0;
}
int acc;
SEC("?fentry/" SYS_PREFIX "sys_nanosleep")
__success __log_level(2)
int BPF_PROG(arg_tag_ctx_fentry)
{
acc += tracing_subprog_void(ctx) + tracing_subprog_u64(ctx);
return 0;
}
SEC("?fexit/" SYS_PREFIX "sys_nanosleep")
__success __log_level(2)
int BPF_PROG(arg_tag_ctx_fexit)
{
acc += tracing_subprog_void(ctx) + tracing_subprog_u64(ctx);
return 0;
}
SEC("?fmod_ret/" SYS_PREFIX "sys_nanosleep")
__success __log_level(2)
int BPF_PROG(arg_tag_ctx_fmod_ret)
{
return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx);
}
SEC("?lsm/bpf")
__success __log_level(2)
int BPF_PROG(arg_tag_ctx_lsm)
{
return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx);
}
SEC("?struct_ops/test_1")
__success __log_level(2)
int BPF_PROG(arg_tag_ctx_struct_ops)
{
return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx);
}
SEC(".struct_ops")
struct bpf_dummy_ops dummy_1 = {
.test_1 = (void *)arg_tag_ctx_struct_ops,
};
SEC("?syscall")
__success __log_level(2)
int arg_tag_ctx_syscall(void *ctx)
{
return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx) + tp_whatever(ctx);
}
__weak int subprog_dynptr(struct bpf_dynptr *dptr)

@@ -146,4 +146,23 @@ l0_%=: exit; \
: __clobber_all);
}
SEC("flow_dissector")
__description("flow_keys illegal alu op with variable offset")
__failure __msg("R7 pointer arithmetic on flow_keys prohibited")
__naked void flow_keys_illegal_variable_offset_alu(void)
{
asm volatile(" \
r6 = r1; \
r7 = *(u64*)(r6 + %[flow_keys_off]); \
r8 = 8; \
r8 /= 1; \
r8 &= 8; \
r7 += r8; \
r0 = *(u64*)(r7 + 0); \
exit; \
" :
: __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys))
: __clobber_all);
}
char _license[] SEC("license") = "GPL";