net: do not create fallback tunnels for non-default namespaces

fallback tunnels (like tunl0, gre0, gretap0, erspan0, sit0,
ip6tnl0, ip6gre0) are automatically created when the corresponding
module is loaded.

These tunnels are also automatically created when a new network
namespace is created, at a great cost.

In many cases, netns are used for isolation purposes, and these
extra network devices are a waste of resources. We are using
thousands of netns per host, and hit the netns creation/delete
bottleneck a lot. (Many thanks to Kirill for recent work on this)

Add a new sysctl so that we can opt-out from this automatic creation.

Note that these tunnels are still created for the initial namespace,
to be the least intrusive for typical setups.

Tested:
lpk43:~# cat add_del_unshare.sh
for i in `seq 1 40`
do
 (for j in `seq 1 100` ; do  unshare -n /bin/true >/dev/null ; done) &
done
wait

lpk43:~# echo 0 >/proc/sys/net/core/fb_tunnels_only_for_init_net
lpk43:~# time ./add_del_unshare.sh

real	0m37.521s
user	0m0.886s
sys	7m7.084s
lpk43:~# echo 1 >/proc/sys/net/core/fb_tunnels_only_for_init_net
lpk43:~# time ./add_del_unshare.sh

real	0m4.761s
user	0m0.851s
sys	1m8.343s
lpk43:~#

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet 2018-03-08 12:51:41 -08:00 committed by David S. Miller
parent 2b3905de8b
commit 79134e6ce2
8 changed files with 54 additions and 10 deletions

View file

@ -270,6 +270,18 @@ optmem_max
Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
of struct cmsghdr structures with appended data.
fb_tunnels_only_for_init_net
----------------------------
Controls if fallback tunnels (like tunl0, gre0, gretap0, erspan0,
sit0, ip6tnl0, ip6gre0) are automatically created when a new
network namespace is created, if corresponding tunnel is present
in initial network namespace.
If set to 1, these devices are not automatically created, and
user space is responsible for creating them if needed.
Default : 0 (for compatibility reasons)
2. /proc/sys/net/unix - Parameters for Unix domain sockets
-------------------------------------------------------

View file

@ -585,6 +585,13 @@ struct netdev_queue {
#endif
} ____cacheline_aligned_in_smp;
extern int sysctl_fb_tunnels_only_for_init_net;
static inline bool net_has_fallback_tunnels(const struct net *net)
{
return net == &init_net || !sysctl_fb_tunnels_only_for_init_net;
}
static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
{
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)

View file

@ -180,8 +180,10 @@ struct tnl_ptk_info {
struct ip_tunnel_net {
struct net_device *fb_tunnel_dev;
struct rtnl_link_ops *rtnl_link_ops;
struct hlist_head tunnels[IP_TNL_HASH_SIZE];
struct ip_tunnel __rcu *collect_md_tun;
int type;
};
static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,

View file

@ -32,6 +32,9 @@ static int max_skb_frags = MAX_SKB_FRAGS;
static int net_msg_warn; /* Unused, but still a sysctl */
int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@ -513,6 +516,15 @@ static struct ctl_table net_core_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
{
.procname = "fb_tunnels_only_for_init_net",
.data = &sysctl_fb_tunnels_only_for_init_net,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
{ }
};

View file

@ -347,8 +347,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
struct net_device *dev;
int t_hlen;
BUG_ON(!itn->fb_tunnel_dev);
dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
if (IS_ERR(dev))
return ERR_CAST(dev);
@ -822,7 +821,6 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
struct net *net = t->net;
struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
BUG_ON(!itn->fb_tunnel_dev);
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == itn->fb_tunnel_dev) {
@ -847,7 +845,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
p->o_key = 0;
}
t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
t = ip_tunnel_find(itn, p, itn->type);
if (cmd == SIOCADDTUNNEL) {
if (!t) {
@ -991,10 +989,15 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
struct ip_tunnel_parm parms;
unsigned int i;
itn->rtnl_link_ops = ops;
for (i = 0; i < IP_TNL_HASH_SIZE; i++)
INIT_HLIST_HEAD(&itn->tunnels[i]);
if (!ops) {
if (!ops || !net_has_fallback_tunnels(net)) {
struct ip_tunnel_net *it_init_net;
it_init_net = net_generic(&init_net, ip_tnl_net_id);
itn->type = it_init_net->type;
itn->fb_tunnel_dev = NULL;
return 0;
}
@ -1012,6 +1015,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
itn->type = itn->fb_tunnel_dev->type;
}
rtnl_unlock();
@ -1019,10 +1023,10 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
struct list_head *head,
struct rtnl_link_ops *ops)
{
struct net *net = dev_net(itn->fb_tunnel_dev);
struct net_device *dev, *aux;
int h;
@ -1054,7 +1058,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
itn = net_generic(net, id);
ip_tunnel_destroy(itn, &list, ops);
ip_tunnel_destroy(net, itn, &list, ops);
}
unregister_netdevice_many(&list);
rtnl_unlock();

View file

@ -236,7 +236,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
return t;
dev = ign->fb_tunnel_dev;
if (dev->flags & IFF_UP)
if (dev && dev->flags & IFF_UP)
return netdev_priv(dev);
return NULL;
@ -1472,6 +1472,8 @@ static int __net_init ip6gre_init_net(struct net *net)
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
int err;
if (!net_has_fallback_tunnels(net))
return 0;
ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
NET_NAME_UNKNOWN,
ip6gre_tunnel_setup);

View file

@ -2205,6 +2205,8 @@ static int __net_init ip6_tnl_init_net(struct net *net)
ip6n->tnls[0] = ip6n->tnls_wc;
ip6n->tnls[1] = ip6n->tnls_r_l;
if (!net_has_fallback_tunnels(net))
return 0;
err = -ENOMEM;
ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
NET_NAME_UNKNOWN, ip6_tnl_dev_setup);

View file

@ -182,7 +182,7 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
#ifdef CONFIG_IPV6_SIT_6RD
struct ip_tunnel *t = netdev_priv(dev);
if (dev == sitn->fb_tunnel_dev) {
if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) {
ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0);
t->ip6rd.relay_prefix = 0;
t->ip6rd.prefixlen = 16;
@ -1835,6 +1835,9 @@ static int __net_init sit_init_net(struct net *net)
sitn->tunnels[2] = sitn->tunnels_r;
sitn->tunnels[3] = sitn->tunnels_r_l;
if (!net_has_fallback_tunnels(net))
return 0;
sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0",
NET_NAME_UNKNOWN,
ipip6_tunnel_setup);