linux/net/8021q/vlan_core.c
Richard Gobert 5ef31ea5d0 net: gro: fix udp bad offset in socket lookup by adding {inner_}network_offset to napi_gro_cb
Commits a602456 ("udp: Add GRO functions to UDP socket") and 57c67ff ("udp:
additional GRO support") introduce incorrect usage of {ip,ipv6}_hdr in the
complete phase of gro. The functions always return skb->network_header,
which in the case of encapsulated packets at the gro complete phase, is
always set to the innermost L3 of the packet. That means that calling
{ip,ipv6}_hdr for skbs which completed the GRO receive phase (both in
gro_list and *_gro_complete) when parsing an encapsulated packet's _outer_
L3/L4 may return an unexpected value.

This incorrect usage leads to a bug in GRO's UDP socket lookup.
udp{4,6}_lib_lookup_skb functions use ip_hdr/ipv6_hdr respectively. These
*_hdr functions return network_header which will point to the innermost L3,
resulting in the wrong offset being used in __udp{4,6}_lib_lookup with
encapsulated packets.

This patch adds network_offset and inner_network_offset to napi_gro_cb, and
makes sure both are set correctly.

To fix the issue, network_offsets union is used inside napi_gro_cb, in
which both the outer and the inner network offsets are saved.

Reproduction example:

Endpoint configuration example (fou + local address bind)

    # ip fou add port 6666 ipproto 4
    # ip link add name tun1 type ipip remote 2.2.2.1 local 2.2.2.2 encap fou encap-dport 5555 encap-sport 6666 mode ipip
    # ip link set tun1 up
    # ip a add 1.1.1.2/24 dev tun1

Netperf TCP_STREAM result on net-next before patch is applied:

net-next main, GRO enabled:
    $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
    Recv   Send    Send
    Socket Socket  Message  Elapsed
    Size   Size    Size     Time     Throughput
    bytes  bytes   bytes    secs.    10^6bits/sec

    131072  16384  16384    5.28        2.37

net-next main, GRO disabled:
    $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
    Recv   Send    Send
    Socket Socket  Message  Elapsed
    Size   Size    Size     Time     Throughput
    bytes  bytes   bytes    secs.    10^6bits/sec

    131072  16384  16384    5.01     2745.06

patch applied, GRO enabled:
    $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
    Recv   Send    Send
    Socket Socket  Message  Elapsed
    Size   Size    Size     Time     Throughput
    bytes  bytes   bytes    secs.    10^6bits/sec

    131072  16384  16384    5.01     2877.38

Fixes: a6024562ff ("udp: Add GRO functions to UDP socket")
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-05-02 11:02:48 +02:00

561 lines
13 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
#include <linux/netpoll.h>
#include <linux/export.h>
#include <net/gro.h>
#include "vlan.h"
bool vlan_do_receive(struct sk_buff **skbp)
{
struct sk_buff *skb = *skbp;
__be16 vlan_proto = skb->vlan_proto;
u16 vlan_id = skb_vlan_tag_get_id(skb);
struct net_device *vlan_dev;
struct vlan_pcpu_stats *rx_stats;
vlan_dev = vlan_find_dev(skb->dev, vlan_proto, vlan_id);
if (!vlan_dev)
return false;
skb = *skbp = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(!skb))
return false;
if (unlikely(!(vlan_dev->flags & IFF_UP))) {
kfree_skb(skb);
*skbp = NULL;
return false;
}
skb->dev = vlan_dev;
if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) {
/* Our lower layer thinks this is not local, let's make sure.
* This allows the VLAN to have a different MAC than the
* underlying device, and still route correctly. */
if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, vlan_dev->dev_addr))
skb->pkt_type = PACKET_HOST;
}
if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) &&
!netif_is_macvlan_port(vlan_dev) &&
!netif_is_bridge_port(vlan_dev)) {
unsigned int offset = skb->data - skb_mac_header(skb);
/*
* vlan_insert_tag expect skb->data pointing to mac header.
* So change skb->data before calling it and change back to
* original position later
*/
skb_push(skb, offset);
skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto,
skb->vlan_tci, skb->mac_len);
if (!skb)
return false;
skb_pull(skb, offset + VLAN_HLEN);
skb_reset_mac_len(skb);
}
skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
__vlan_hwaccel_clear_tag(skb);
rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats);
u64_stats_update_begin(&rx_stats->syncp);
u64_stats_inc(&rx_stats->rx_packets);
u64_stats_add(&rx_stats->rx_bytes, skb->len);
if (skb->pkt_type == PACKET_MULTICAST)
u64_stats_inc(&rx_stats->rx_multicast);
u64_stats_update_end(&rx_stats->syncp);
return true;
}
/* Must be invoked with rcu_read_lock. */
struct net_device *__vlan_find_dev_deep_rcu(struct net_device *dev,
__be16 vlan_proto, u16 vlan_id)
{
struct vlan_info *vlan_info = rcu_dereference(dev->vlan_info);
if (vlan_info) {
return vlan_group_get_device(&vlan_info->grp,
vlan_proto, vlan_id);
} else {
/*
* Lower devices of master uppers (bonding, team) do not have
* grp assigned to themselves. Grp is assigned to upper device
* instead.
*/
struct net_device *upper_dev;
upper_dev = netdev_master_upper_dev_get_rcu(dev);
if (upper_dev)
return __vlan_find_dev_deep_rcu(upper_dev,
vlan_proto, vlan_id);
}
return NULL;
}
EXPORT_SYMBOL(__vlan_find_dev_deep_rcu);
struct net_device *vlan_dev_real_dev(const struct net_device *dev)
{
struct net_device *ret = vlan_dev_priv(dev)->real_dev;
while (is_vlan_dev(ret))
ret = vlan_dev_priv(ret)->real_dev;
return ret;
}
EXPORT_SYMBOL(vlan_dev_real_dev);
u16 vlan_dev_vlan_id(const struct net_device *dev)
{
return vlan_dev_priv(dev)->vlan_id;
}
EXPORT_SYMBOL(vlan_dev_vlan_id);
__be16 vlan_dev_vlan_proto(const struct net_device *dev)
{
return vlan_dev_priv(dev)->vlan_proto;
}
EXPORT_SYMBOL(vlan_dev_vlan_proto);
/*
* vlan info and vid list
*/
static void vlan_group_free(struct vlan_group *grp)
{
int i, j;
for (i = 0; i < VLAN_PROTO_NUM; i++)
for (j = 0; j < VLAN_GROUP_ARRAY_SPLIT_PARTS; j++)
kfree(grp->vlan_devices_arrays[i][j]);
}
static void vlan_info_free(struct vlan_info *vlan_info)
{
vlan_group_free(&vlan_info->grp);
kfree(vlan_info);
}
static void vlan_info_rcu_free(struct rcu_head *rcu)
{
vlan_info_free(container_of(rcu, struct vlan_info, rcu));
}
static struct vlan_info *vlan_info_alloc(struct net_device *dev)
{
struct vlan_info *vlan_info;
vlan_info = kzalloc(sizeof(struct vlan_info), GFP_KERNEL);
if (!vlan_info)
return NULL;
vlan_info->real_dev = dev;
INIT_LIST_HEAD(&vlan_info->vid_list);
return vlan_info;
}
struct vlan_vid_info {
struct list_head list;
__be16 proto;
u16 vid;
int refcount;
};
static bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto)
{
if (proto == htons(ETH_P_8021Q) &&
dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)
return true;
if (proto == htons(ETH_P_8021AD) &&
dev->features & NETIF_F_HW_VLAN_STAG_FILTER)
return true;
return false;
}
static struct vlan_vid_info *vlan_vid_info_get(struct vlan_info *vlan_info,
__be16 proto, u16 vid)
{
struct vlan_vid_info *vid_info;
list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
if (vid_info->proto == proto && vid_info->vid == vid)
return vid_info;
}
return NULL;
}
static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid)
{
struct vlan_vid_info *vid_info;
vid_info = kzalloc(sizeof(struct vlan_vid_info), GFP_KERNEL);
if (!vid_info)
return NULL;
vid_info->proto = proto;
vid_info->vid = vid;
return vid_info;
}
static int vlan_add_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid)
{
if (!vlan_hw_filter_capable(dev, proto))
return 0;
if (netif_device_present(dev))
return dev->netdev_ops->ndo_vlan_rx_add_vid(dev, proto, vid);
else
return -ENODEV;
}
static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid)
{
if (!vlan_hw_filter_capable(dev, proto))
return 0;
if (netif_device_present(dev))
return dev->netdev_ops->ndo_vlan_rx_kill_vid(dev, proto, vid);
else
return -ENODEV;
}
int vlan_for_each(struct net_device *dev,
int (*action)(struct net_device *dev, int vid, void *arg),
void *arg)
{
struct vlan_vid_info *vid_info;
struct vlan_info *vlan_info;
struct net_device *vdev;
int ret;
ASSERT_RTNL();
vlan_info = rtnl_dereference(dev->vlan_info);
if (!vlan_info)
return 0;
list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
vdev = vlan_group_get_device(&vlan_info->grp, vid_info->proto,
vid_info->vid);
ret = action(vdev, vid_info->vid, arg);
if (ret)
return ret;
}
return 0;
}
EXPORT_SYMBOL(vlan_for_each);
int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto)
{
struct net_device *real_dev = vlan_info->real_dev;
struct vlan_vid_info *vlan_vid_info;
int err;
list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) {
if (vlan_vid_info->proto == proto) {
err = vlan_add_rx_filter_info(real_dev, proto,
vlan_vid_info->vid);
if (err)
goto unwind;
}
}
return 0;
unwind:
list_for_each_entry_continue_reverse(vlan_vid_info,
&vlan_info->vid_list, list) {
if (vlan_vid_info->proto == proto)
vlan_kill_rx_filter_info(real_dev, proto,
vlan_vid_info->vid);
}
return err;
}
EXPORT_SYMBOL(vlan_filter_push_vids);
void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto)
{
struct vlan_vid_info *vlan_vid_info;
list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list)
if (vlan_vid_info->proto == proto)
vlan_kill_rx_filter_info(vlan_info->real_dev,
vlan_vid_info->proto,
vlan_vid_info->vid);
}
EXPORT_SYMBOL(vlan_filter_drop_vids);
static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid,
struct vlan_vid_info **pvid_info)
{
struct net_device *dev = vlan_info->real_dev;
struct vlan_vid_info *vid_info;
int err;
vid_info = vlan_vid_info_alloc(proto, vid);
if (!vid_info)
return -ENOMEM;
err = vlan_add_rx_filter_info(dev, proto, vid);
if (err) {
kfree(vid_info);
return err;
}
list_add(&vid_info->list, &vlan_info->vid_list);
vlan_info->nr_vids++;
*pvid_info = vid_info;
return 0;
}
int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
{
struct vlan_info *vlan_info;
struct vlan_vid_info *vid_info;
bool vlan_info_created = false;
int err;
ASSERT_RTNL();
vlan_info = rtnl_dereference(dev->vlan_info);
if (!vlan_info) {
vlan_info = vlan_info_alloc(dev);
if (!vlan_info)
return -ENOMEM;
vlan_info_created = true;
}
vid_info = vlan_vid_info_get(vlan_info, proto, vid);
if (!vid_info) {
err = __vlan_vid_add(vlan_info, proto, vid, &vid_info);
if (err)
goto out_free_vlan_info;
}
vid_info->refcount++;
if (vlan_info_created)
rcu_assign_pointer(dev->vlan_info, vlan_info);
return 0;
out_free_vlan_info:
if (vlan_info_created)
kfree(vlan_info);
return err;
}
EXPORT_SYMBOL(vlan_vid_add);
static void __vlan_vid_del(struct vlan_info *vlan_info,
struct vlan_vid_info *vid_info)
{
struct net_device *dev = vlan_info->real_dev;
__be16 proto = vid_info->proto;
u16 vid = vid_info->vid;
int err;
err = vlan_kill_rx_filter_info(dev, proto, vid);
if (err && dev->reg_state != NETREG_UNREGISTERING)
netdev_warn(dev, "failed to kill vid %04x/%d\n", proto, vid);
list_del(&vid_info->list);
kfree(vid_info);
vlan_info->nr_vids--;
}
void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid)
{
struct vlan_info *vlan_info;
struct vlan_vid_info *vid_info;
ASSERT_RTNL();
vlan_info = rtnl_dereference(dev->vlan_info);
if (!vlan_info)
return;
vid_info = vlan_vid_info_get(vlan_info, proto, vid);
if (!vid_info)
return;
vid_info->refcount--;
if (vid_info->refcount == 0) {
__vlan_vid_del(vlan_info, vid_info);
if (vlan_info->nr_vids == 0) {
RCU_INIT_POINTER(dev->vlan_info, NULL);
call_rcu(&vlan_info->rcu, vlan_info_rcu_free);
}
}
}
EXPORT_SYMBOL(vlan_vid_del);
int vlan_vids_add_by_dev(struct net_device *dev,
const struct net_device *by_dev)
{
struct vlan_vid_info *vid_info;
struct vlan_info *vlan_info;
int err;
ASSERT_RTNL();
vlan_info = rtnl_dereference(by_dev->vlan_info);
if (!vlan_info)
return 0;
list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
if (!vlan_hw_filter_capable(by_dev, vid_info->proto))
continue;
err = vlan_vid_add(dev, vid_info->proto, vid_info->vid);
if (err)
goto unwind;
}
return 0;
unwind:
list_for_each_entry_continue_reverse(vid_info,
&vlan_info->vid_list,
list) {
if (!vlan_hw_filter_capable(by_dev, vid_info->proto))
continue;
vlan_vid_del(dev, vid_info->proto, vid_info->vid);
}
return err;
}
EXPORT_SYMBOL(vlan_vids_add_by_dev);
void vlan_vids_del_by_dev(struct net_device *dev,
const struct net_device *by_dev)
{
struct vlan_vid_info *vid_info;
struct vlan_info *vlan_info;
ASSERT_RTNL();
vlan_info = rtnl_dereference(by_dev->vlan_info);
if (!vlan_info)
return;
list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
if (!vlan_hw_filter_capable(by_dev, vid_info->proto))
continue;
vlan_vid_del(dev, vid_info->proto, vid_info->vid);
}
}
EXPORT_SYMBOL(vlan_vids_del_by_dev);
bool vlan_uses_dev(const struct net_device *dev)
{
struct vlan_info *vlan_info;
ASSERT_RTNL();
vlan_info = rtnl_dereference(dev->vlan_info);
if (!vlan_info)
return false;
return vlan_info->grp.nr_vlan_devs ? true : false;
}
EXPORT_SYMBOL(vlan_uses_dev);
static struct sk_buff *vlan_gro_receive(struct list_head *head,
struct sk_buff *skb)
{
const struct packet_offload *ptype;
unsigned int hlen, off_vlan;
struct sk_buff *pp = NULL;
struct vlan_hdr *vhdr;
struct sk_buff *p;
__be16 type;
int flush = 1;
off_vlan = skb_gro_offset(skb);
hlen = off_vlan + sizeof(*vhdr);
vhdr = skb_gro_header(skb, hlen, off_vlan);
if (unlikely(!vhdr))
goto out;
NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen;
type = vhdr->h_vlan_encapsulated_proto;
ptype = gro_find_receive_by_type(type);
if (!ptype)
goto out;
flush = 0;
list_for_each_entry(p, head, list) {
struct vlan_hdr *vhdr2;
if (!NAPI_GRO_CB(p)->same_flow)
continue;
vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
if (compare_vlan_header(vhdr, vhdr2))
NAPI_GRO_CB(p)->same_flow = 0;
}
skb_gro_pull(skb, sizeof(*vhdr));
skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
ipv6_gro_receive, inet_gro_receive,
head, skb);
out:
skb_gro_flush_final(skb, pp, flush);
return pp;
}
static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
{
struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
__be16 type = vhdr->h_vlan_encapsulated_proto;
struct packet_offload *ptype;
int err = -ENOENT;
ptype = gro_find_complete_by_type(type);
if (ptype)
err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
ipv6_gro_complete, inet_gro_complete,
skb, nhoff + sizeof(*vhdr));
return err;
}
static struct packet_offload vlan_packet_offloads[] __read_mostly = {
{
.type = cpu_to_be16(ETH_P_8021Q),
.priority = 10,
.callbacks = {
.gro_receive = vlan_gro_receive,
.gro_complete = vlan_gro_complete,
},
},
{
.type = cpu_to_be16(ETH_P_8021AD),
.priority = 10,
.callbacks = {
.gro_receive = vlan_gro_receive,
.gro_complete = vlan_gro_complete,
},
},
};
static int __init vlan_offload_init(void)
{
unsigned int i;
for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
dev_add_offload(&vlan_packet_offloads[i]);
return 0;
}
fs_initcall(vlan_offload_init);