linux/drivers/net/veth.c
Vijay Pandurangan ce8c839b74 veth: don’t modify ip_summed; doing so treats packets with bad checksums as good.
Packets that arrive from real hardware devices have ip_summed ==
CHECKSUM_UNNECESSARY if the hardware verified the checksums, or
CHECKSUM_NONE if the packet is bad or it was unable to verify it. The
current version of veth will replace CHECKSUM_NONE with
CHECKSUM_UNNECESSARY, which causes corrupt packets routed from hardware to
a veth device to be delivered to the application. This caused applications
at Twitter to receive corrupt data when network hardware was corrupting
packets.

We believe this was added as an optimization to skip computing and
verifying checksums for communication between containers. However, locally
generated packets have ip_summed == CHECKSUM_PARTIAL, so the code as
written does nothing for them. As far as we can tell, after removing this
code, these packets are transmitted from one stack to another unmodified
(tcpdump shows invalid checksums on both sides, as expected), and they are
delivered correctly to applications. We didn’t test every possible network
configuration, but we tried a few common ones such as bridging containers,
using NAT between the host and a container, and routing from hardware
devices to containers. We have effectively deployed this in production at
Twitter (by disabling RX checksum offloading on veth devices).

This code dates back to the first version of the driver, commit
<e314dbdc1c0dc6a548ecf> ("[NET]: Virtual ethernet device driver"), so I
suspect this bug occurred mostly because the driver API has evolved
significantly since then. Commit <0b7967503dc97864f283a> ("net/veth: Fix
packet checksumming") (in December 2010) fixed this for packets that get
created locally and sent to hardware devices, by not changing
CHECKSUM_PARTIAL. However, the same issue still occurs for packets coming
in from hardware devices.

Co-authored-by: Evan Jones <ej@evanjones.ca>
Signed-off-by: Evan Jones <ej@evanjones.ca>
Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Cc: Phil Sutter <phil@nwl.cc>
Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Vijay Pandurangan <vijayp@vijayp.ca>
Acked-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-22 15:15:34 -05:00

523 lines
12 KiB
C

/*
* drivers/net/veth.c
*
* Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
*
* Author: Pavel Emelianov <xemul@openvz.org>
* Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
*
*/
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>
#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <linux/veth.h>
#include <linux/module.h>
#define DRV_NAME "veth"
#define DRV_VERSION "1.0"
#define MIN_MTU 68 /* Min L3 MTU */
#define MAX_MTU 65535 /* Max L3 MTU (arbitrary) */
struct pcpu_vstats {
u64 packets;
u64 bytes;
struct u64_stats_sync syncp;
};
struct veth_priv {
struct net_device __rcu *peer;
atomic64_t dropped;
};
/*
* ethtool interface
*/
static struct {
const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
{ "peer_ifindex" },
};
static int veth_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
cmd->supported = 0;
cmd->advertising = 0;
ethtool_cmd_speed_set(cmd, SPEED_10000);
cmd->duplex = DUPLEX_FULL;
cmd->port = PORT_TP;
cmd->phy_address = 0;
cmd->transceiver = XCVR_INTERNAL;
cmd->autoneg = AUTONEG_DISABLE;
cmd->maxtxpkt = 0;
cmd->maxrxpkt = 0;
return 0;
}
static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
strlcpy(info->version, DRV_VERSION, sizeof(info->version));
}
static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
{
switch(stringset) {
case ETH_SS_STATS:
memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
break;
}
}
static int veth_get_sset_count(struct net_device *dev, int sset)
{
switch (sset) {
case ETH_SS_STATS:
return ARRAY_SIZE(ethtool_stats_keys);
default:
return -EOPNOTSUPP;
}
}
static void veth_get_ethtool_stats(struct net_device *dev,
struct ethtool_stats *stats, u64 *data)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer = rtnl_dereference(priv->peer);
data[0] = peer ? peer->ifindex : 0;
}
static const struct ethtool_ops veth_ethtool_ops = {
.get_settings = veth_get_settings,
.get_drvinfo = veth_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_strings = veth_get_strings,
.get_sset_count = veth_get_sset_count,
.get_ethtool_stats = veth_get_ethtool_stats,
};
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *rcv;
int length = skb->len;
rcu_read_lock();
rcv = rcu_dereference(priv->peer);
if (unlikely(!rcv)) {
kfree_skb(skb);
goto drop;
}
if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
u64_stats_update_begin(&stats->syncp);
stats->bytes += length;
stats->packets++;
u64_stats_update_end(&stats->syncp);
} else {
drop:
atomic64_inc(&priv->dropped);
}
rcu_read_unlock();
return NETDEV_TX_OK;
}
/*
* general routines
*/
static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
int cpu;
result->packets = 0;
result->bytes = 0;
for_each_possible_cpu(cpu) {
struct pcpu_vstats *stats = per_cpu_ptr(dev->vstats, cpu);
u64 packets, bytes;
unsigned int start;
do {
start = u64_stats_fetch_begin_irq(&stats->syncp);
packets = stats->packets;
bytes = stats->bytes;
} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
result->packets += packets;
result->bytes += bytes;
}
return atomic64_read(&priv->dropped);
}
static struct rtnl_link_stats64 *veth_get_stats64(struct net_device *dev,
struct rtnl_link_stats64 *tot)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer;
struct pcpu_vstats one;
tot->tx_dropped = veth_stats_one(&one, dev);
tot->tx_bytes = one.bytes;
tot->tx_packets = one.packets;
rcu_read_lock();
peer = rcu_dereference(priv->peer);
if (peer) {
tot->rx_dropped = veth_stats_one(&one, peer);
tot->rx_bytes = one.bytes;
tot->rx_packets = one.packets;
}
rcu_read_unlock();
return tot;
}
/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}
static int veth_open(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer = rtnl_dereference(priv->peer);
if (!peer)
return -ENOTCONN;
if (peer->flags & IFF_UP) {
netif_carrier_on(dev);
netif_carrier_on(peer);
}
return 0;
}
static int veth_close(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer = rtnl_dereference(priv->peer);
netif_carrier_off(dev);
if (peer)
netif_carrier_off(peer);
return 0;
}
static int is_valid_veth_mtu(int new_mtu)
{
return new_mtu >= MIN_MTU && new_mtu <= MAX_MTU;
}
static int veth_change_mtu(struct net_device *dev, int new_mtu)
{
if (!is_valid_veth_mtu(new_mtu))
return -EINVAL;
dev->mtu = new_mtu;
return 0;
}
static int veth_dev_init(struct net_device *dev)
{
dev->vstats = netdev_alloc_pcpu_stats(struct pcpu_vstats);
if (!dev->vstats)
return -ENOMEM;
return 0;
}
static void veth_dev_free(struct net_device *dev)
{
free_percpu(dev->vstats);
free_netdev(dev);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
/* veth only receives frames when its peer sends one
* Since it's a synchronous operation, we are guaranteed
* never to have pending data when we poll for it so
* there is nothing to do here.
*
* We need this though so netpoll recognizes us as an interface that
* supports polling, which enables bridge devices in virt setups to
* still use netconsole
*/
}
#endif /* CONFIG_NET_POLL_CONTROLLER */
static int veth_get_iflink(const struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer;
int iflink;
rcu_read_lock();
peer = rcu_dereference(priv->peer);
iflink = peer ? peer->ifindex : 0;
rcu_read_unlock();
return iflink;
}
static const struct net_device_ops veth_netdev_ops = {
.ndo_init = veth_dev_init,
.ndo_open = veth_open,
.ndo_stop = veth_close,
.ndo_start_xmit = veth_xmit,
.ndo_change_mtu = veth_change_mtu,
.ndo_get_stats64 = veth_get_stats64,
.ndo_set_rx_mode = veth_set_multicast_list,
.ndo_set_mac_address = eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = veth_poll_controller,
#endif
.ndo_get_iflink = veth_get_iflink,
.ndo_features_check = passthru_features_check,
};
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
NETIF_F_HW_CSUM | NETIF_F_RXCSUM | NETIF_F_HIGHDMA | \
NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL | \
NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT | NETIF_F_UFO | \
NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX )
static void veth_setup(struct net_device *dev)
{
ether_setup(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
dev->priv_flags |= IFF_NO_QUEUE;
dev->netdev_ops = &veth_netdev_ops;
dev->ethtool_ops = &veth_ethtool_ops;
dev->features |= NETIF_F_LLTX;
dev->features |= VETH_FEATURES;
dev->vlan_features = dev->features &
~(NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX |
NETIF_F_HW_VLAN_CTAG_RX |
NETIF_F_HW_VLAN_STAG_RX);
dev->destructor = veth_dev_free;
dev->hw_features = VETH_FEATURES;
dev->hw_enc_features = VETH_FEATURES;
}
/*
* netlink interface
*/
static int veth_validate(struct nlattr *tb[], struct nlattr *data[])
{
if (tb[IFLA_ADDRESS]) {
if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
return -EINVAL;
if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
return -EADDRNOTAVAIL;
}
if (tb[IFLA_MTU]) {
if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
return -EINVAL;
}
return 0;
}
static struct rtnl_link_ops veth_link_ops;
static int veth_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
int err;
struct net_device *peer;
struct veth_priv *priv;
char ifname[IFNAMSIZ];
struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
unsigned char name_assign_type;
struct ifinfomsg *ifmp;
struct net *net;
/*
* create and register peer first
*/
if (data != NULL && data[VETH_INFO_PEER] != NULL) {
struct nlattr *nla_peer;
nla_peer = data[VETH_INFO_PEER];
ifmp = nla_data(nla_peer);
err = rtnl_nla_parse_ifla(peer_tb,
nla_data(nla_peer) + sizeof(struct ifinfomsg),
nla_len(nla_peer) - sizeof(struct ifinfomsg));
if (err < 0)
return err;
err = veth_validate(peer_tb, NULL);
if (err < 0)
return err;
tbp = peer_tb;
} else {
ifmp = NULL;
tbp = tb;
}
if (tbp[IFLA_IFNAME]) {
nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
name_assign_type = NET_NAME_USER;
} else {
snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
name_assign_type = NET_NAME_ENUM;
}
net = rtnl_link_get_net(src_net, tbp);
if (IS_ERR(net))
return PTR_ERR(net);
peer = rtnl_create_link(net, ifname, name_assign_type,
&veth_link_ops, tbp);
if (IS_ERR(peer)) {
put_net(net);
return PTR_ERR(peer);
}
if (tbp[IFLA_ADDRESS] == NULL)
eth_hw_addr_random(peer);
if (ifmp && (dev->ifindex != 0))
peer->ifindex = ifmp->ifi_index;
err = register_netdevice(peer);
put_net(net);
net = NULL;
if (err < 0)
goto err_register_peer;
netif_carrier_off(peer);
err = rtnl_configure_link(peer, ifmp);
if (err < 0)
goto err_configure_peer;
/*
* register dev last
*
* note, that since we've registered new device the dev's name
* should be re-allocated
*/
if (tb[IFLA_ADDRESS] == NULL)
eth_hw_addr_random(dev);
if (tb[IFLA_IFNAME])
nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
else
snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");
err = register_netdevice(dev);
if (err < 0)
goto err_register_dev;
netif_carrier_off(dev);
/*
* tie the deviced together
*/
priv = netdev_priv(dev);
rcu_assign_pointer(priv->peer, peer);
priv = netdev_priv(peer);
rcu_assign_pointer(priv->peer, dev);
return 0;
err_register_dev:
/* nothing to do */
err_configure_peer:
unregister_netdevice(peer);
return err;
err_register_peer:
free_netdev(peer);
return err;
}
static void veth_dellink(struct net_device *dev, struct list_head *head)
{
struct veth_priv *priv;
struct net_device *peer;
priv = netdev_priv(dev);
peer = rtnl_dereference(priv->peer);
/* Note : dellink() is called from default_device_exit_batch(),
* before a rcu_synchronize() point. The devices are guaranteed
* not being freed before one RCU grace period.
*/
RCU_INIT_POINTER(priv->peer, NULL);
unregister_netdevice_queue(dev, head);
if (peer) {
priv = netdev_priv(peer);
RCU_INIT_POINTER(priv->peer, NULL);
unregister_netdevice_queue(peer, head);
}
}
static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
[VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) },
};
static struct net *veth_get_link_net(const struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer = rtnl_dereference(priv->peer);
return peer ? dev_net(peer) : dev_net(dev);
}
static struct rtnl_link_ops veth_link_ops = {
.kind = DRV_NAME,
.priv_size = sizeof(struct veth_priv),
.setup = veth_setup,
.validate = veth_validate,
.newlink = veth_newlink,
.dellink = veth_dellink,
.policy = veth_policy,
.maxtype = VETH_INFO_MAX,
.get_link_net = veth_get_link_net,
};
/*
* init/fini
*/
static __init int veth_init(void)
{
return rtnl_link_register(&veth_link_ops);
}
static __exit void veth_exit(void)
{
rtnl_link_unregister(&veth_link_ops);
}
module_init(veth_init);
module_exit(veth_exit);
MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);