Merge branch 'Add FOU support for externally controlled ipip devices'

Christian Ehrig says:

====================

This patch set adds support for using FOU or GUE encapsulation with
an ipip device operating in collect-metadata mode and a set of kfuncs
for controlling encap parameters exposed to a BPF tc-hook.

BPF tc-hooks allow us to read tunnel metadata (like remote IP addresses)
in the ingress path of an externally controlled tunnel interface via
the bpf_skb_get_tunnel_{key,opt} bpf-helpers. Packets can then be
redirected to the same or a different externally controlled tunnel
interface by overwriting metadata via the bpf_skb_set_tunnel_{key,opt}
helpers and a call to bpf_redirect. This enables us to redirect packets
between tunnel interfaces - and potentially change the encapsulation
type - using only a single BPF program.

Today this approach works fine for a couple of tunnel combinations.
For example: redirecting packets between Geneve and GRE interfaces or
GRE and plain ipip interfaces. However, redirecting using FOU or GUE is
not supported today. The ip_tunnel module does not allow us to egress
packets using additional UDP encapsulation from an ipip device in
collect-metadata mode.

Patch 1 lifts this restriction by adding a struct ip_tunnel_encap to
the tunnel metadata. It can be filled by a new BPF kfunc introduced
in Patch 2 and evaluated by the ip_tunnel egress path. This will allow
us to use FOU and GUE encap with externally controlled ipip devices.

Patch 2 introduces two new BPF kfuncs: bpf_skb_{set,get}_fou_encap.
These helpers can be used to set and get UDP encap parameters from the
BPF tc-hook doing the packet redirect.

Patch 3 adds BPF tunnel selftests using the two kfuncs.
---
v3:
 - Integrate selftest into test_progs (Alexei)
v2:
 - Fixes for checkpatch.pl
 - Fixes for kernel test robot
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Alexei Starovoitov 2023-04-12 16:40:39 -07:00
commit bbc73e6855
10 changed files with 432 additions and 19 deletions

View file

@ -17,4 +17,6 @@ int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
u8 *protocol, __be16 *sport, int type);
int register_fou_bpf(void);
#endif

View file

@ -57,6 +57,13 @@ struct ip_tunnel_key {
__u8 flow_flags;
};
/* Parameters of an additional encapsulation layer (e.g. FOU or GUE over UDP)
 * that is applied on top of an IP tunnel.
 */
struct ip_tunnel_encap {
/* encapsulation type (TUNNEL_ENCAP_*); TUNNEL_ENCAP_NONE means no extra encap */
u16 type;
/* TUNNEL_ENCAP_FLAG_* bits, e.g. TUNNEL_ENCAP_FLAG_CSUM */
u16 flags;
/* UDP source port, big-endian */
__be16 sport;
/* UDP destination port, big-endian */
__be16 dport;
};
/* Flags for ip_tunnel_info mode. */
#define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */
#define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */
@ -66,9 +73,9 @@ struct ip_tunnel_key {
#define IP_TUNNEL_OPTS_MAX \
GENMASK((sizeof_field(struct ip_tunnel_info, \
options_len) * BITS_PER_BYTE) - 1, 0)
struct ip_tunnel_info {
struct ip_tunnel_key key;
struct ip_tunnel_encap encap;
#ifdef CONFIG_DST_CACHE
struct dst_cache dst_cache;
#endif
@ -86,13 +93,6 @@ struct ip_tunnel_6rd_parm {
};
#endif
struct ip_tunnel_encap {
u16 type;
u16 flags;
__be16 sport;
__be16 dport;
};
struct ip_tunnel_prl_entry {
struct ip_tunnel_prl_entry __rcu *next;
__be32 addr;
@ -293,6 +293,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
__be32 remote, __be32 local,
__be32 key);
void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info);
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
bool log_ecn_error);
@ -371,22 +372,23 @@ static inline int ip_encap_hlen(struct ip_tunnel_encap *e)
return hlen;
}
static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
static inline int ip_tunnel_encap(struct sk_buff *skb,
struct ip_tunnel_encap *e,
u8 *protocol, struct flowi4 *fl4)
{
const struct ip_tunnel_encap_ops *ops;
int ret = -EINVAL;
if (t->encap.type == TUNNEL_ENCAP_NONE)
if (e->type == TUNNEL_ENCAP_NONE)
return 0;
if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
if (e->type >= MAX_IPTUN_ENCAP_OPS)
return -EINVAL;
rcu_read_lock();
ops = rcu_dereference(iptun_encaps[t->encap.type]);
ops = rcu_dereference(iptun_encaps[e->type]);
if (likely(ops && ops->build_header))
ret = ops->build_header(skb, &t->encap, protocol, fl4);
ret = ops->build_header(skb, e, protocol, fl4);
rcu_read_unlock();
return ret;

View file

@ -26,7 +26,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o
obj-$(CONFIG_NET_IPIP) += ipip.o
gre-y := gre_demux.o
fou-y := fou_core.o fou_nl.o
fou-y := fou_core.o fou_nl.o fou_bpf.o
obj-$(CONFIG_NET_FOU) += fou.o
obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o

119
net/ipv4/fou_bpf.c Normal file
View file

@ -0,0 +1,119 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Unstable FOU helpers for the TC-BPF hook
 *
 * These are called from SCHED_CLS BPF programs. Compatibility of these
 * functions may be broken at any time, because the interface through which
 * they are exposed to BPF programs is explicitly unstable.
 */
#include <linux/bpf.h>
#include <linux/btf_ids.h>
#include <net/dst_metadata.h>
#include <net/fou.h>
/* UDP port pair exchanged between a BPF program and the FOU encap kfuncs
 * (bpf_skb_set_fou_encap / bpf_skb_get_fou_encap). Both fields are big-endian.
 */
struct bpf_fou_encap {
/* UDP source port */
__be16 sport;
/* UDP destination port */
__be16 dport;
};
/* Encapsulation selector accepted by bpf_skb_set_fou_encap(). */
enum bpf_fou_encap_type {
FOU_BPF_ENCAP_FOU, /* translated to TUNNEL_ENCAP_FOU */
FOU_BPF_ENCAP_GUE, /* translated to TUNNEL_ENCAP_GUE */
};
__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
"Global functions as their definitions will be in BTF");
/* bpf_skb_set_fou_encap - Set FOU encap parameters
*
* This function allows for using GUE or FOU encapsulation together with an
* ipip device in collect-metadata mode.
*
* It is meant to be used in BPF tc-hooks and after a call to the
* bpf_skb_set_tunnel_key helper, responsible for setting IP addresses.
*
* Parameters:
* @skb_ctx Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
* @encap Pointer to a `struct bpf_fou_encap` storing UDP src and
* dst ports. If sport is set to 0 the kernel will auto-assign a
* port. This is similar to using `encap-sport auto`.
* Cannot be NULL
* @type Encapsulation type for the packet. Their definitions are
* specified in `enum bpf_fou_encap_type`
*/
__bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap, int type)
{
struct sk_buff *skb = (struct sk_buff *)skb_ctx;
struct ip_tunnel_info *info = skb_tunnel_info(skb);
if (unlikely(!encap))
return -EINVAL;
if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX)))
return -EINVAL;
switch (type) {
case FOU_BPF_ENCAP_FOU:
info->encap.type = TUNNEL_ENCAP_FOU;
break;
case FOU_BPF_ENCAP_GUE:
info->encap.type = TUNNEL_ENCAP_GUE;
break;
default:
info->encap.type = TUNNEL_ENCAP_NONE;
}
if (info->key.tun_flags & TUNNEL_CSUM)
info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM;
info->encap.sport = encap->sport;
info->encap.dport = encap->dport;
return 0;
}
/* bpf_skb_get_fou_encap - Get FOU encap parameters
 *
 * This function allows for reading encap metadata from a packet received
 * on an ipip device in collect-metadata mode.
 *
 * Parameters:
 * @skb_ctx	Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
 * @encap	Pointer to a struct bpf_fou_encap that receives the UDP source
 *		and destination port. Cannot be NULL
 *
 * Returns 0 on success, or -EINVAL if @encap is NULL or the skb carries no
 * tunnel metadata.
 */
__bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
				      struct bpf_fou_encap *encap)
{
	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
	struct ip_tunnel_info *info = skb_tunnel_info(skb);

	/* same guard as bpf_skb_set_fou_encap(): never write through NULL */
	if (unlikely(!encap))
		return -EINVAL;

	if (unlikely(!info))
		return -EINVAL;

	encap->sport = info->encap.sport;
	encap->dport = info->encap.dport;

	return 0;
}
__diag_pop()
/* BTF ID set "fou_kfunc_set": lists the two FOU encap kfuncs defined above */
BTF_SET8_START(fou_kfunc_set)
BTF_ID_FLAGS(func, bpf_skb_set_fou_encap)
BTF_ID_FLAGS(func, bpf_skb_get_fou_encap)
BTF_SET8_END(fou_kfunc_set)
/* kfunc set descriptor owned by this module; registered in register_fou_bpf() */
static const struct btf_kfunc_id_set fou_bpf_kfunc_set = {
.owner = THIS_MODULE,
.set = &fou_kfunc_set,
};
/* register_fou_bpf - make the FOU encap kfuncs available to SCHED_CLS programs
 *
 * Returns whatever register_btf_kfunc_id_set() returns.
 */
int register_fou_bpf(void)
{
	const struct btf_kfunc_id_set *kset = &fou_bpf_kfunc_set;

	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, kset);
}

View file

@ -1236,10 +1236,15 @@ static int __init fou_init(void)
if (ret < 0)
goto unregister;
ret = register_fou_bpf();
if (ret < 0)
goto kfunc_failed;
ret = ip_tunnel_encap_add_fou_ops();
if (ret == 0)
return 0;
kfunc_failed:
genl_unregister_family(&fou_nl_family);
unregister:
unregister_pernet_device(&fou_net_ops);

View file

@ -359,6 +359,20 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
return ERR_PTR(err);
}
/* ip_tunnel_md_udp_encap - record the UDP ports of a received packet
 * @skb:  packet whose IP header (as returned by ip_hdr()) is inspected
 * @info: tunnel metadata whose encap.sport / encap.dport are filled in
 *
 * Does nothing unless the IP header's protocol is UDP. Otherwise the UDP
 * header is located directly behind the IP header (ihl 32-bit words) and its
 * source and destination ports are copied into @info.
 */
void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *ip4h = ip_hdr(skb);

	if (ip4h->protocol == IPPROTO_UDP) {
		/* ihl counts 32-bit words, so the UDP header is ihl * 4 bytes in */
		const struct udphdr *uh =
			(struct udphdr *)((__u8 *)ip4h + (ip4h->ihl << 2));

		info->encap.sport = uh->source;
		info->encap.dport = uh->dest;
	}
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
bool log_ecn_error)
@ -572,7 +586,11 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
dev_net(dev), 0, skb->mark, skb_get_hash(skb),
key->flow_flags);
if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
if (!tunnel_hlen)
tunnel_hlen = ip_encap_hlen(&tun_info->encap);
if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
goto tx_error;
use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
@ -732,7 +750,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
dev_net(dev), tunnel->parms.link,
tunnel->fwmark, skb_get_hash(skb), 0);
if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
goto tx_error;
if (connected && md) {

View file

@ -241,6 +241,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
if (!tun_dst)
return 0;
ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info);
}
skb_reset_mac_header(skb);

View file

@ -1024,7 +1024,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
ttl = iph6->hop_limit;
tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) {
if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) {
ip_rt_put(rt);
goto tx_error;
}

View file

@ -89,6 +89,9 @@
#define IP6VXLAN_TUNL_DEV0 "ip6vxlan00"
#define IP6VXLAN_TUNL_DEV1 "ip6vxlan11"
#define IPIP_TUNL_DEV0 "ipip00"
#define IPIP_TUNL_DEV1 "ipip11"
#define PING_ARGS "-i 0.01 -c 3 -w 10 -q"
static int config_device(void)
@ -188,6 +191,79 @@ static void delete_ip6vxlan_tunnel(void)
SYS_NOFAIL("ip link delete dev %s", IP6VXLAN_TUNL_DEV1);
}
enum ipip_encap {
NONE = 0,
FOU = 1,
GUE = 2,
};
/* Configure FOU/GUE encapsulation for IPIP_TUNL_DEV0 inside namespace at_ns0.
 *
 * @ipproto: argument appended to "ip fou add port 5555"
 * @type:    value passed to the ipip device's "encap" option
 *
 * Returns 0 on success, -1 if a command fails.
 * NOTE(review): SYS() is defined outside this view; it presumably runs the
 * formatted shell command and jumps to the given label on failure - confirm.
 */
static int set_ipip_encap(const char *ipproto, const char *type)
{
/* open the receiving FOU/GUE port in the namespace */
SYS(fail, "ip -n at_ns0 fou add port 5555 %s", ipproto);
/* switch the tunnel device to the requested encapsulation ... */
SYS(fail, "ip -n at_ns0 link set dev %s type ipip encap %s",
IPIP_TUNL_DEV0, type);
/* ... and send to the same port number that was opened above */
SYS(fail, "ip -n at_ns0 link set dev %s type ipip encap-dport 5555",
IPIP_TUNL_DEV0);
return 0;
fail:
return -1;
}
/* Create the pair of ipip devices used by the ipip tunnel test.
 *
 * In namespace at_ns0 a classic ipip device (IPIP_TUNL_DEV0) is created and,
 * for FOU/GUE, configured for that encapsulation. In the root namespace an
 * "external" ipip device (IPIP_TUNL_DEV1) is created, plus a FOU/GUE port on
 * 5555 when encapsulation is requested.
 *
 * @encap: which encapsulation variant to configure
 *
 * Returns 0 on success, -1 on the first failing step. Devices created before
 * the failure are not removed here.
 */
static int add_ipip_tunnel(enum ipip_encap encap)
{
	const char *ipproto = NULL;
	const char *type = NULL;
	int err;

	/* pick the "ip fou" argument and the ipip "encap" value, if any */
	if (encap == FOU) {
		ipproto = "ipproto 4";
		type = "fou";
	} else if (encap == GUE) {
		ipproto = "gue";
		type = ipproto;
	}

	/* at_ns0 namespace */
	SYS(fail, "ip -n at_ns0 link add dev %s type ipip local %s remote %s",
	    IPIP_TUNL_DEV0, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);

	if (type && ipproto) {
		err = set_ipip_encap(ipproto, type);
		if (!ASSERT_OK(err, "set_ipip_encap"))
			goto fail;
	}

	SYS(fail, "ip -n at_ns0 link set dev %s up", IPIP_TUNL_DEV0);
	SYS(fail, "ip -n at_ns0 addr add dev %s %s/24",
	    IPIP_TUNL_DEV0, IP4_ADDR_TUNL_DEV0);

	/* root namespace */
	if (type && ipproto)
		SYS(fail, "ip fou add port 5555 %s", ipproto);

	SYS(fail, "ip link add dev %s type ipip external", IPIP_TUNL_DEV1);
	SYS(fail, "ip link set dev %s up", IPIP_TUNL_DEV1);
	SYS(fail, "ip addr add dev %s %s/24", IPIP_TUNL_DEV1,
	    IP4_ADDR_TUNL_DEV1);

	return 0;

fail:
	return -1;
}
/* Remove both ipip devices and the FOU port 5555 in at_ns0 and in the root
 * namespace.
 * NOTE(review): SYS_NOFAIL() is defined outside this view; it presumably runs
 * the command without failing the test when the command errors - confirm.
 */
static void delete_ipip_tunnel(void)
{
SYS_NOFAIL("ip -n at_ns0 link delete dev %s", IPIP_TUNL_DEV0);
/* the port only exists for FOU/GUE runs, hence stderr is discarded */
SYS_NOFAIL("ip -n at_ns0 fou del port 5555 2> /dev/null");
SYS_NOFAIL("ip link delete dev %s", IPIP_TUNL_DEV1);
SYS_NOFAIL("ip fou del port 5555 2> /dev/null");
}
static int test_ping(int family, const char *addr)
{
SYS(fail, "%s %s %s > /dev/null", ping_command(family), PING_ARGS, addr);
@ -386,10 +462,80 @@ static void test_ip6vxlan_tunnel(void)
test_tunnel_kern__destroy(skel);
}
#define RUN_TEST(name) \
/* Exercise an externally controlled ipip device, optionally with FOU or GUE
 * encapsulation.
 *
 * Steps: create the tunnel devices, load the BPF skeleton, attach the
 * get/set tunnel programs that match @encap to IPIP_TUNL_DEV1, then ping
 * across the tunnel once from the root namespace and once from at_ns0.
 * The devices and the skeleton are torn down on every exit path.
 *
 * @encap: encapsulation variant under test (NONE, FOU or GUE)
 */
static void test_ipip_tunnel(enum ipip_encap encap)
{
	struct test_tunnel_kern *skel = NULL;
	struct nstoken *nstoken;
	int set_src_prog_fd, get_src_prog_fd;
	int ifindex = -1;
	int err;
	DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook,
			    .attach_point = BPF_TC_INGRESS);

	/* add ipip tunnel */
	err = add_ipip_tunnel(encap);
	if (!ASSERT_OK(err, "add_ipip_tunnel"))
		goto done;

	/* load and attach bpf prog to tunnel dev tc hook point */
	skel = test_tunnel_kern__open_and_load();
	if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load"))
		goto done;
	ifindex = if_nametoindex(IPIP_TUNL_DEV1);
	if (!ASSERT_NEQ(ifindex, 0, "ipip11 ifindex"))
		goto done;
	tc_hook.ifindex = ifindex;

	/* select the program pair that matches the encapsulation under test */
	switch (encap) {
	case FOU:
		get_src_prog_fd = bpf_program__fd(
			skel->progs.ipip_encap_get_tunnel);
		set_src_prog_fd = bpf_program__fd(
			skel->progs.ipip_fou_set_tunnel);
		break;
	case GUE:
		get_src_prog_fd = bpf_program__fd(
			skel->progs.ipip_encap_get_tunnel);
		set_src_prog_fd = bpf_program__fd(
			skel->progs.ipip_gue_set_tunnel);
		break;
	default:
		get_src_prog_fd = bpf_program__fd(
			skel->progs.ipip_get_tunnel);
		set_src_prog_fd = bpf_program__fd(
			skel->progs.ipip_set_tunnel);
	}

	if (!ASSERT_GE(set_src_prog_fd, 0, "bpf_program__fd"))
		goto done;
	if (!ASSERT_GE(get_src_prog_fd, 0, "bpf_program__fd"))
		goto done;
	if (attach_tc_prog(&tc_hook, get_src_prog_fd, set_src_prog_fd))
		goto done;

	/* ping from root namespace test */
	err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV0);
	if (!ASSERT_OK(err, "test_ping"))
		goto done;

	/* ping from at_ns0 namespace test */
	nstoken = open_netns("at_ns0");
	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
		goto done;
	err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
	/* Leave at_ns0 before evaluating the result: jumping to cleanup while
	 * still inside the namespace would leak the token and run the teardown
	 * (and every later subtest) in the wrong namespace.
	 */
	close_netns(nstoken);
	if (!ASSERT_OK(err, "test_ping"))
		goto done;

done:
	/* delete ipip tunnel */
	delete_ipip_tunnel();
	if (skel)
		test_tunnel_kern__destroy(skel);
}
#define RUN_TEST(name, ...) \
({ \
if (test__start_subtest(#name)) { \
test_ ## name(); \
test_ ## name(__VA_ARGS__); \
} \
})
@ -400,6 +546,9 @@ static void *test_tunnel_run_tests(void *arg)
RUN_TEST(vxlan_tunnel);
RUN_TEST(ip6vxlan_tunnel);
RUN_TEST(ipip_tunnel, NONE);
RUN_TEST(ipip_tunnel, FOU);
RUN_TEST(ipip_tunnel, GUE);
cleanup();

View file

@ -52,6 +52,21 @@ struct vxlan_metadata {
__u32 gbp;
};
/* UDP port pair passed to the bpf_skb_{set,get}_fou_encap kfuncs declared
 * below. Both fields are big-endian.
 */
struct bpf_fou_encap {
/* UDP source port */
__be16 sport;
/* UDP destination port */
__be16 dport;
};
/* Encapsulation selector passed as the `type` argument of
 * bpf_skb_set_fou_encap().
 */
enum bpf_fou_encap_type {
FOU_BPF_ENCAP_FOU,
FOU_BPF_ENCAP_GUE,
};
/* FOU encap kfuncs, declared as __ksym symbols for use by the programs in
 * this file.
 */
int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap, int type) __ksym;
int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap) __ksym;
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
@ -749,6 +764,108 @@ int ipip_get_tunnel(struct __sk_buff *skb)
return TC_ACT_OK;
}
/* tc program: attach tunnel metadata and request GUE encapsulation.
 *
 * Sets a tunnel key with TTL 64 (and remote 172.16.1.100 for ICMP packets),
 * then asks for GUE encapsulation towards UDP port 5555 with an
 * auto-assigned source port. The packet is dropped if it is too short to
 * hold an IPv4 header or if either helper call fails.
 */
SEC("tc")
int ipip_gue_set_tunnel(struct __sk_buff *skb)
{
	void *pkt_end = (void *)(long)skb->data_end;
	void *pkt = (void *)(long)skb->data;
	struct bpf_tunnel_key tun_key = {};
	struct bpf_fou_encap fou = {};
	struct iphdr *ip4 = pkt;
	int err;

	/* the IPv4 header must lie fully inside the packet before it is read */
	if (pkt + sizeof(*ip4) > pkt_end) {
		log_err(1);
		return TC_ACT_SHOT;
	}

	tun_key.tunnel_ttl = 64;
	if (ip4->protocol == IPPROTO_ICMP)
		tun_key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */

	err = bpf_skb_set_tunnel_key(skb, &tun_key, sizeof(tun_key), 0);
	if (err < 0) {
		log_err(err);
		return TC_ACT_SHOT;
	}

	/* sport 0 lets the kernel choose the UDP source port */
	fou.sport = 0;
	fou.dport = bpf_htons(5555);

	err = bpf_skb_set_fou_encap(skb, &fou, FOU_BPF_ENCAP_GUE);
	if (err < 0) {
		log_err(err);
		return TC_ACT_SHOT;
	}

	return TC_ACT_OK;
}
/* tc program: attach tunnel metadata and request FOU encapsulation.
 *
 * Sets a tunnel key with TTL 64 (and remote 172.16.1.100 for ICMP packets),
 * then asks for FOU encapsulation towards UDP port 5555 with an
 * auto-assigned source port. The packet is dropped if it is too short to
 * hold an IPv4 header or if either helper call fails.
 */
SEC("tc")
int ipip_fou_set_tunnel(struct __sk_buff *skb)
{
	void *pkt_end = (void *)(long)skb->data_end;
	void *pkt = (void *)(long)skb->data;
	struct bpf_tunnel_key tun_key = {};
	struct bpf_fou_encap fou = {};
	struct iphdr *ip4 = pkt;
	int err;

	/* the IPv4 header must lie fully inside the packet before it is read */
	if (pkt + sizeof(*ip4) > pkt_end) {
		log_err(1);
		return TC_ACT_SHOT;
	}

	tun_key.tunnel_ttl = 64;
	if (ip4->protocol == IPPROTO_ICMP)
		tun_key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */

	err = bpf_skb_set_tunnel_key(skb, &tun_key, sizeof(tun_key), 0);
	if (err < 0) {
		log_err(err);
		return TC_ACT_SHOT;
	}

	/* sport 0 lets the kernel choose the UDP source port */
	fou.sport = 0;
	fou.dport = bpf_htons(5555);

	err = bpf_skb_set_fou_encap(skb, &fou, FOU_BPF_ENCAP_FOU);
	if (err < 0) {
		log_err(err);
		return TC_ACT_SHOT;
	}

	return TC_ACT_OK;
}
/* tc program: read back tunnel key and FOU/GUE encap metadata.
 *
 * Drops the packet if either lookup fails or if the recorded UDP destination
 * port is not 5555; otherwise prints the remote address and both ports and
 * lets the packet pass.
 */
SEC("tc")
int ipip_encap_get_tunnel(struct __sk_buff *skb)
{
	struct bpf_fou_encap fou = {};
	struct bpf_tunnel_key tun_key = {};
	int rc;

	rc = bpf_skb_get_tunnel_key(skb, &tun_key, sizeof(tun_key), 0);
	if (rc < 0) {
		log_err(rc);
		return TC_ACT_SHOT;
	}

	rc = bpf_skb_get_fou_encap(skb, &fou);
	if (rc < 0) {
		log_err(rc);
		return TC_ACT_SHOT;
	}

	/* only traffic that arrived on the test's FOU/GUE port is accepted */
	if (bpf_ntohs(fou.dport) != 5555)
		return TC_ACT_SHOT;

	bpf_printk("%d remote ip 0x%x, sport %d, dport %d\n", rc,
		   tun_key.remote_ipv4, bpf_ntohs(fou.sport),
		   bpf_ntohs(fou.dport));
	return TC_ACT_OK;
}
SEC("tc")
int ipip6_set_tunnel(struct __sk_buff *skb)
{