linux/net/core/sock_reuseport.c
Jakub Sitnicki 035ff358f2 net: Generate reuseport group ID on group creation
Commit 736b46027e ("net: Add ID (if needed) to sock_reuseport and expose
reuseport_lock") has introduced lazy generation of reuseport group IDs that
survive group resize.

By comparing the identifier, we check that a BPF reuseport program is not
trying to select a socket from a BPF map that belongs to a different
reuseport group than the one the packet is for.

Because SOCKARRAY used to be the only BPF map type that can be used with
reuseport BPF, it was possible to delay the generation of reuseport group
ID until a socket from the group was inserted into BPF map for the first
time.

Now that SOCK{MAP,HASH} can be used with reuseport BPF we have two options,
either generate the reuseport ID on map update, like SOCKARRAY does, or
allocate an ID from the start when reuseport group gets created.

This patch takes the latter approach to keep sockmap free of calls into
reuseport code. This streamlines the reuseport_id access as its lifetime
now matches the longevity of reuseport object.

The cost of this simplification, however, is that we allocate reuseport IDs
for all SO_REUSEPORT users, even those that don't use SOCKARRAY in their
setups. With the way identifiers are currently generated, we can have at
most S32_MAX reuseport groups, which hopefully is sufficient. If we ever
get close to the limit, we can switch to a u64 counter like sk_cookie.

Another change is that we now always call into SOCKARRAY logic to unlink
the socket from the map when unhashing or closing the socket. Previously we
did it only when at least one socket from the group was in a BPF map.

It is worth noting that this doesn't conflict with sockmap tear-down in
case a socket is in a SOCK{MAP,HASH} and belongs to a reuseport
group. sockmap tear-down happens first:

  prot->unhash
  `- tcp_bpf_unhash
     |- tcp_bpf_remove
     |  `- while (sk_psock_link_pop(psock))
     |     `- sk_psock_unlink
     |        `- sock_map_delete_from_link
     |           `- __sock_map_delete
     |              `- sock_map_unref
     |                 `- sk_psock_put
     |                    `- sk_psock_drop
     |                       `- rcu_assign_sk_user_data(sk, NULL)
     `- inet_unhash
        `- reuseport_detach_sock
           `- bpf_sk_reuseport_detach
              `- WRITE_ONCE(sk->sk_user_data, NULL)

Suggested-by: Martin Lau <kafai@fb.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20200218171023.844439-10-jakub@cloudflare.com
2020-02-21 22:29:45 +01:00

361 lines
9.1 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* To speed up listener socket lookup, create an array to store all sockets
* listening on the same port. This allows a decision to be made after finding
* the first socket. An optional BPF program can also be configured for
* selecting the socket index from the array of available sockets.
*/
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>
/* Number of socket slots a freshly created reuseport group starts with;
* passed to __reuseport_alloc() by reuseport_alloc(). reuseport_grow()
* doubles the capacity from there.
*/
#define INIT_SOCKS 128
/* Taken (with bottom halves disabled) around group creation, membership
* changes and BPF program attach/detach in this file. Defined without
* "static", so it has external linkage.
*/
DEFINE_SPINLOCK(reuseport_lock);
/* Source of the reuseport_id values handed out in reuseport_alloc() and
* returned in reuseport_free_rcu().
*/
static DEFINE_IDA(reuseport_ida);
/* Allocate a zero-filled reuseport group with room for @max_socks sockets.
 *
 * The allocation uses GFP_ATOMIC. On success the capacity is recorded in
 * ->max_socks and ->prog is initialised to NULL; every other field keeps
 * the zero value from kzalloc(). Returns NULL when the allocation fails.
 */
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
	struct sock_reuseport *group;
	unsigned int bytes;

	/* Header followed by the trailing array of socket pointers. */
	bytes = sizeof(struct sock_reuseport) +
		sizeof(struct sock *) * max_socks;

	group = kzalloc(bytes, GFP_ATOMIC);
	if (group) {
		group->max_socks = max_socks;
		RCU_INIT_POINTER(group->prog, NULL);
	}

	return group;
}
/* Create a reuseport group for @sk with @sk as its first member.
 *
 * If @sk already has a group, nothing is allocated; the existing group's
 * bind_inany flag is raised when @bind_inany is true and 0 is returned.
 *
 * Returns 0 on success, -ENOMEM when the group cannot be allocated, or the
 * negative value returned by ida_alloc() when no group ID is available.
 */
int reuseport_alloc(struct sock *sk, bool bind_inany)
{
	struct sock_reuseport *group;
	int err = 0;
	int new_id;

	/* bh lock used since this function call may precede hlist lock in
	 * soft irq of receive path or setsockopt from process context
	 */
	spin_lock_bh(&reuseport_lock);

	/* The setsockopt path and the bind/hash path can both try to
	 * allocate; whoever arrives second finds the group already present
	 * and has nothing left to create.
	 */
	group = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	if (group) {
		/* Only ever raise bind_inany here; writing false would
		 * clobber the value set by the bind/hash path.
		 */
		if (bind_inany)
			group->bind_inany = bind_inany;
		goto unlock;
	}

	group = __reuseport_alloc(INIT_SOCKS);
	if (!group) {
		err = -ENOMEM;
		goto unlock;
	}

	new_id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
	if (new_id < 0) {
		kfree(group);
		err = new_id;
		goto unlock;
	}

	/* Fill in the group completely before making it reachable from sk. */
	group->reuseport_id = new_id;
	group->bind_inany = bind_inany;
	group->socks[0] = sk;
	group->num_socks = 1;
	rcu_assign_pointer(sk->sk_reuseport_cb, group);

unlock:
	spin_unlock_bh(&reuseport_lock);
	return err;
}
EXPORT_SYMBOL(reuseport_alloc);
/* Replace a reuseport group with a copy that has twice the capacity.
*
* Returns the new group, or NULL when the doubled capacity would exceed
* U16_MAX or the allocation fails; in both failure cases the original
* group is left untouched. On success every member socket is re-pointed
* at the new group and the old group is released with kfree_rcu().
*
* The only caller in this file, reuseport_add_sock(), invokes this with
* reuseport_lock held.
*/
static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
struct sock_reuseport *more_reuse;
u32 more_socks_size, i;
more_socks_size = reuse->max_socks * 2U;
if (more_socks_size > U16_MAX)
return NULL;
more_reuse = __reuseport_alloc(more_socks_size);
if (!more_reuse)
return NULL;
/* Carry the group state over: member count, BPF program, group ID,
* bind_inany flag, the socket array and the synq overflow timestamp.
*/
more_reuse->num_socks = reuse->num_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
/* Switch every member over to the fully populated copy. */
for (i = 0; i < reuse->num_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
more_reuse);
/* Note: we use kfree_rcu here instead of reuseport_free_rcu so
* that reuse and more_reuse can temporarily share a reference
* to prog.
*/
kfree_rcu(reuse, rcu);
return more_reuse;
}
/* RCU callback that disposes of a reuseport group.
 *
 * @head is the rcu member embedded in the group. Hands the attached BPF
 * program (possibly NULL) to sk_reuseport_prog_free(), gives the group ID
 * back to reuseport_ida and frees the group memory.
 */
static void reuseport_free_rcu(struct rcu_head *head)
{
	struct sock_reuseport *group =
		container_of(head, struct sock_reuseport, rcu);
	struct bpf_prog *prog = rcu_dereference_protected(group->prog, 1);

	sk_reuseport_prog_free(prog);
	ida_free(&reuseport_ida, group->reuseport_id);
	kfree(group);
}
/**
* reuseport_add_sock - Add a socket to the reuseport group of another.
* @sk: New socket to add to the group.
* @sk2: Socket belonging to the existing reuseport group.
* @bind_inany: Whether or not the group is bound to a local INANY address.
*
* If @sk2 has no group yet, one is created for it first via
* reuseport_alloc(). If @sk already has a group of its own containing only
* one socket, that group is released after @sk has joined @sk2's group.
*
* Returns 0 on success, the error from reuseport_alloc() if @sk2's group
* could not be created, -EBUSY if @sk already belongs to a group whose
* socket count is not 1, or -ENOMEM if the group is full and could not be
* grown. On error @sk is not added to the group.
*/
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
struct sock_reuseport *old_reuse, *reuse;
/* Unlocked peek; reuseport_alloc() re-checks under reuseport_lock. */
if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
int err = reuseport_alloc(sk2, bind_inany);
if (err)
return err;
}
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
/* Refuse to move sk out of a group it shares with other sockets. */
if (old_reuse && old_reuse->num_socks != 1) {
spin_unlock_bh(&reuseport_lock);
return -EBUSY;
}
/* No free slot left: switch to a group with twice the capacity. */
if (reuse->num_socks == reuse->max_socks) {
reuse = reuseport_grow(reuse);
if (!reuse) {
spin_unlock_bh(&reuseport_lock);
return -ENOMEM;
}
}
/* Fill the slot before bumping the count so that a reader which
* observes the new num_socks also observes the stored socket.
*/
reuse->socks[reuse->num_socks] = sk;
/* paired with smp_rmb() in reuseport_select_sock() */
smp_wmb();
reuse->num_socks++;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
spin_unlock_bh(&reuseport_lock);
/* sk's previous single-socket group is no longer referenced by sk. */
if (old_reuse)
call_rcu(&old_reuse->rcu, reuseport_free_rcu);
return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);
/* Remove @sk from its reuseport group.
*
* Under reuseport_lock: notifies the BPF side, clears sk->sk_reuseport_cb
* and takes @sk out of the group's socket array. When the last socket
* leaves, the group is queued for release via reuseport_free_rcu().
*
* NOTE(review): reuse is dereferenced below without a NULL check, so this
* presumably must only be called for a socket that currently has a group -
* confirm against callers.
*/
void reuseport_detach_sock(struct sock *sk)
{
struct sock_reuseport *reuse;
int i;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
/* Notify the bpf side. The sk may be added to a sockarray
* map. If so, sockarray logic will remove it from the map.
*
* Other bpf map types that work with reuseport, like sockmap,
* don't need an explicit callback from here. They override sk
* unhash/close ops to remove the sk from the map before we
* get to this point.
*/
bpf_sk_reuseport_detach(sk);
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
/* Find sk's slot and overwrite it with the last entry, keeping the
* used part of the array contiguous; order is not preserved.
*/
for (i = 0; i < reuse->num_socks; i++) {
if (reuse->socks[i] == sk) {
reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
reuse->num_socks--;
/* Group is now empty: release it after a grace period. */
if (reuse->num_socks == 0)
call_rcu(&reuse->rcu, reuseport_free_rcu);
break;
}
}
spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);
/* Run a BPF program on @skb and use its result as an index into the group.
 *
 * @reuse:   group whose socks[] array is indexed by the program result.
 * @socks:   number of valid entries the caller observed in @reuse->socks.
 * @prog:    program to run.
 * @skb:     packet handed to the program.
 * @hdr_len: number of bytes to pull before the program runs; pushed back
 *           afterwards.
 *
 * When @skb is shared the program runs on a clone, which is released before
 * returning. Returns the selected socket, or NULL if the clone or the pull
 * fails or the program result is not below @socks.
 */
static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
				   struct bpf_prog *prog, struct sk_buff *skb,
				   int hdr_len)
{
	struct sk_buff *clone = NULL;	/* stays NULL when no clone is made */
	struct sk_buff *work = skb;
	u32 pick;

	if (skb_shared(skb)) {
		clone = skb_clone(skb, GFP_ATOMIC);
		if (!clone)
			return NULL;
		work = clone;
	}

	/* temporarily advance data past protocol header */
	if (!pskb_pull(work, hdr_len)) {
		kfree_skb(clone);
		return NULL;
	}

	pick = bpf_prog_run_save_cb(prog, work);
	__skb_push(work, hdr_len);
	consume_skb(clone);

	return pick < socks ? reuse->socks[pick] : NULL;
}
/**
 *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: First socket in the group.
 *  @hash: When no BPF filter is available, use this hash to select.
 *  @skb: skb to run through BPF filter.
 *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
 *    the skb does not yet point at the payload, this parameter represents
 *    how far the pointer needs to advance to reach the payload.
 *
 *  The group's socket count is sampled once and that snapshot bounds every
 *  access to the socket array made here, including the wrap-around of the
 *  hash fallback scan.
 *
 *  Returns a socket that should receive the packet (or NULL on error).
 *  NULL is also returned when @sk has no group, when the group is empty,
 *  or when the hash fallback finds every socket in TCP_ESTABLISHED state.
 */
struct sock *reuseport_select_sock(struct sock *sk,
				   u32 hash,
				   struct sk_buff *skb,
				   int hdr_len)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *prog;
	struct sock *sk2 = NULL;
	u16 socks;

	rcu_read_lock();
	reuse = rcu_dereference(sk->sk_reuseport_cb);

	/* if memory allocation failed or add call is not yet complete */
	if (!reuse)
		goto out;

	prog = rcu_dereference(reuse->prog);
	socks = READ_ONCE(reuse->num_socks);
	if (likely(socks)) {
		/* paired with smp_wmb() in reuseport_add_sock() */
		smp_rmb();

		if (!prog || !skb)
			goto select_by_hash;

		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
		else
			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);

select_by_hash:
		/* no bpf or invalid bpf result: fall back to hash usage */
		if (!sk2) {
			int i, j;

			i = j = reciprocal_scale(hash, socks);
			while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
				i++;
				/* Wrap on the snapshot, not on a fresh read
				 * of reuse->num_socks: the count can change
				 * concurrently. A larger value would let us
				 * index a slot whose store is not ordered by
				 * the smp_rmb() above; a smaller one could
				 * make the scan skip j and never terminate.
				 */
				if (i >= socks)
					i = 0;
				if (i == j)
					goto out;
			}
			sk2 = reuse->socks[i];
		}
	}

out:
	rcu_read_unlock();
	return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
if (sk_unhashed(sk) && sk->sk_reuseport) {
int err = reuseport_alloc(sk, false);
if (err)
return err;
} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
/* The socket wasn't bound with SO_REUSEPORT */
return -EINVAL;
}
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
old_prog = rcu_dereference_protected(reuse->prog,
lockdep_is_held(&reuseport_lock));
rcu_assign_pointer(reuse->prog, prog);
spin_unlock_bh(&reuseport_lock);
sk_reuseport_prog_free(old_prog);
return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
/* Detach the BPF program from the reuseport group of @sk.
 *
 * Returns 0 after a program was removed and passed to
 * sk_reuseport_prog_free(). Returns -ENOENT when the group has no program,
 * or when @sk has no group but has sk_reuseport set. Returns -EINVAL when
 * @sk has neither a group nor sk_reuseport set.
 */
int reuseport_detach_prog(struct sock *sk)
{
	struct bpf_prog *detached = NULL;
	struct sock_reuseport *group;

	if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		if (sk->sk_reuseport)
			return -ENOENT;
		return -EINVAL;
	}

	spin_lock_bh(&reuseport_lock);
	group = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	/* Swap NULL in and take whatever program was attached. */
	detached = rcu_replace_pointer(group->prog, detached,
				       lockdep_is_held(&reuseport_lock));
	spin_unlock_bh(&reuseport_lock);

	if (detached) {
		sk_reuseport_prog_free(detached);
		return 0;
	}

	return -ENOENT;
}
EXPORT_SYMBOL(reuseport_detach_prog);