network/netdev: add support to create IPoIB subinterface

This commit is contained in:
Yu Watanabe 2021-11-08 06:34:43 +09:00
parent a2bf1a61bc
commit b90d0f83b2
13 changed files with 248 additions and 14 deletions

View file

@ -189,6 +189,9 @@
<row><entry><varname>batadv</varname></entry>
<entry><ulink url="https://www.open-mesh.org/projects/open-mesh/wiki">B.A.T.M.A.N. Advanced</ulink> is a routing protocol for multi-hop mobile ad-hoc networks which operates on layer 2.</entry></row>
<row><entry><varname>ipoib</varname></entry>
<entry>An IP over Infiniband subinterface.</entry></row>
</tbody>
</tgroup>
</table>
@ -2125,6 +2128,49 @@
</variablelist>
</refsect1>
<refsect1>
<title>[IPoIB] Section Options</title>
<para>The [IPoIB] section only applies for netdevs of kind <literal>ipoib</literal> and accepts the
following keys:</para>
<variablelist class='network-directives'>
<varlistentry>
<term><varname>PartitionKey=</varname></term>
<listitem>
<para>Takes an integer in the range 1…0xffff, except for 0x8000. Defaults to unset, and the
kernel's default is used.</para>
</listitem>
</varlistentry>
<varlistentry id='ipoib_mode'>
<term><varname>Mode=</varname></term>
<listitem>
<para>Takes one of the special values <literal>datagram</literal> or
<literal>connected</literal>. Defaults to unset, and the kernel's default is used.</para>
<para>When <literal>datagram</literal>, the Infiniband unreliable datagram (UD) transport is
used, and so the interface MTU is equal to the IB L2 MTU minus the IPoIB encapsulation
header (4 bytes). For example, in a typical IB fabric with a 2K MTU, the IPoIB MTU will be
2048 - 4 = 2044 bytes.</para>
<para>When <literal>connected</literal>, the Infiniband reliable connected (RC) transport is
used. Connected mode takes advantage of the connected nature of the IB transport and allows
an MTU up to the maximal IP packet size of 64K, which reduces the number of IP packets needed
for handling large UDP datagrams, TCP segments, etc., and increases the performance for large
messages.</para>
</listitem>
</varlistentry>
<varlistentry id='ipoib_umcast'>
<term><varname>IgnoreUserspaceMulticastGroups=</varname></term>
<listitem>
<para>Takes a boolean value. When true, the kernel ignores multicast groups handled by
userspace. Defaults to unset, and the kernel's default is used.</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1>
<title>Examples</title>
<example>

View file

@ -902,6 +902,7 @@ Table=1234</programlisting></para>
</listitem>
</varlistentry>
<varlistentry>
<term><varname>IPoIB=</varname></term>
<term><varname>IPVLAN=</varname></term>
<term><varname>IPVTAP=</varname></term>
<term><varname>L2TP=</varname></term>
@ -913,8 +914,8 @@ Table=1234</programlisting></para>
<term><varname>VXLAN=</varname></term>
<term><varname>Xfrm=</varname></term>
<listitem>
<para>The name of an IPVLAN, IPVTAP, L2TP, MACsec, MACVLAN, MACVTAP, tunnel, VLAN, VXLAN, or
Xfrm to be created on the link. See
<para>The name of an IPoIB, IPVLAN, IPVTAP, L2TP, MACsec, MACVLAN, MACVTAP, tunnel, VLAN,
VXLAN, or Xfrm to be created on the link. See
<citerefentry><refentrytitle>systemd.netdev</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
This option may be specified more than once.</para>
</listitem>

View file

@ -13,6 +13,8 @@ sources = files('''
netdev/dummy.h
netdev/ifb.c
netdev/ifb.h
netdev/ipoib.c
netdev/ipoib.h
netdev/ipvlan.c
netdev/ipvlan.h
netdev/macvlan.c

119
src/network/netdev/ipoib.c Normal file
View file

@ -0,0 +1,119 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <linux/if_arp.h>
#include <linux/if_link.h>
#include "ipoib.h"
#include "parse-util.h"
#include "string-table.h"
/* IPoIBMode values are appended to the netlink message as-is (IFLA_IPOIB_MODE), so verify at compile
 * time that they are numerically identical to the corresponding IPOIB_MODE_* constants. */
assert_cc((int) IP_OVER_INFINIBAND_MODE_DATAGRAM == (int) IPOIB_MODE_DATAGRAM);
assert_cc((int) IP_OVER_INFINIBAND_MODE_CONNECTED == (int) IPOIB_MODE_CONNECTED);
/* .init hook of ipoib_vtable: marks the IPoIB mode and the umcast tristate as "unset".
 *
 * The partition key is not touched here; 0 is its "unset" value (presumably the object starts out
 * zeroed, verify against the netdev allocation code). */
static void netdev_ipoib_init(NetDev *netdev) {
        IPoIB *cfg;

        assert(netdev);

        cfg = IPOIB(netdev);
        assert(cfg);

        /* Negative values are skipped when the creation message is filled in, so the kernel defaults
         * apply unless the user configures something else. */
        cfg->umcast = -1;
        cfg->mode = _IP_OVER_INFINIBAND_MODE_INVALID;
}
/* .fill_message_create hook of ipoib_vtable: appends the IPoIB-specific attributes to the netlink
 * message 'm'.
 *
 * Only explicitly configured settings are appended: a partition key of 0 and negative mode/umcast
 * values are treated as "unset" and skipped. Returns 0 on success; on a failed append the error is
 * logged and the value of log_netdev_error_errno() is returned (presumably the negative errno). */
static int netdev_ipoib_fill_message_create(NetDev *netdev, Link *link, sd_netlink_message *m) {
        IPoIB *cfg;
        int r;

        assert(netdev);
        assert(link);
        assert(m);

        cfg = IPOIB(netdev);
        assert(cfg);

        /* Partition key: 0 means "not configured". */
        if (cfg->pkey != 0) {
                r = sd_netlink_message_append_u16(m, IFLA_IPOIB_PKEY, cfg->pkey);
                if (r < 0)
                        return log_netdev_error_errno(netdev, r, "Could not append IFLA_IPOIB_PKEY attribute: %m");
        }

        /* Transport mode: the invalid/unset marker is negative. */
        if (cfg->mode >= 0) {
                r = sd_netlink_message_append_u16(m, IFLA_IPOIB_MODE, cfg->mode);
                if (r < 0)
                        return log_netdev_error_errno(netdev, r, "Could not append IFLA_IPOIB_MODE attribute: %m");
        }

        /* Tristate: negative means "not configured". */
        if (cfg->umcast >= 0) {
                r = sd_netlink_message_append_u16(m, IFLA_IPOIB_UMCAST, cfg->umcast);
                if (r < 0)
                        return log_netdev_error_errno(netdev, r, "Could not append IFLA_IPOIB_UMCAST attribute: %m");
        }

        return 0;
}
/* Names accepted for Mode= in the [IPoIB] section, indexed by IPoIBMode. */
static const char * const ipoib_mode_table[_IP_OVER_INFINIBAND_MODE_MAX] = {
[IP_OVER_INFINIBAND_MODE_DATAGRAM] = "datagram",
[IP_OVER_INFINIBAND_MODE_CONNECTED] = "connected",
};
/* Generates the file-local string -> IPoIBMode lookup from the table above, and on top of it the
 * config_parse_ipoib_mode() parser that is declared in ipoib.h. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(ipoib_mode, IPoIBMode);
DEFINE_CONFIG_PARSE_ENUM(config_parse_ipoib_mode, ipoib_mode, IPoIBMode, "Failed to parse IPoIB mode");
/* Config parser for an IPoIB partition key; 'data' points to the uint16_t that receives it.
 *
 * An empty value resets the key to 0 ("unset"). Values that fail to parse as a 16-bit unsigned
 * integer, as well as 0 and 0x8000, are logged as a warning and ignored, leaving the previous value
 * untouched. Always returns 0. */
int config_parse_ipoib_pkey(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        uint16_t *target = data;
        uint16_t parsed;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);
        assert(data);

        if (isempty(rvalue)) {
                /* 0 means unset. */
                *target = 0;
                return 0;
        }

        r = safe_atou16(rvalue, &parsed);
        if (r < 0) {
                log_syntax(unit, LOG_WARNING, filename, line, r,
                           "Failed to parse IPoIB pkey '%s', ignoring assignment: %m",
                           rvalue);
                return 0;
        }

        /* Anything but the two rejected values is stored as-is. */
        if (parsed != 0 && parsed != 0x8000) {
                *target = parsed;
                return 0;
        }

        log_syntax(unit, LOG_WARNING, filename, line, 0,
                   "IPoIB pkey cannot be 0 nor 0x8000, ignoring assignment: %s",
                   rvalue);
        return 0;
}
/* NetDev vtable for Kind=ipoib. */
const NetDevVTable ipoib_vtable = {
        /* Per-object storage and the config sections parsed for this kind. */
        .object_size = sizeof(IPoIB),
        .sections = NETDEV_COMMON_SECTIONS "IPoIB\0",

        /* An IPoIB subinterface is created as a stacked netdev. */
        .create_type = NETDEV_CREATE_STACKED,
        .iftype = ARPHRD_INFINIBAND,
        .generate_mac = true,

        /* Hooks implemented in this file. */
        .init = netdev_ipoib_init,
        .fill_message_create = netdev_ipoib_fill_message_create,
};

View file

@ -0,0 +1,28 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include <errno.h>
#include "conf-parser.h"
#include "netdev.h"
/* Transport mode of an IPoIB interface, as configured with Mode= in the [IPoIB] section.
 * ipoib.c asserts at compile time that the two real values equal the matching IPOIB_MODE_*
 * constants, so do not reorder or renumber them. */
typedef enum IPoIBMode {
IP_OVER_INFINIBAND_MODE_DATAGRAM,
IP_OVER_INFINIBAND_MODE_CONNECTED,
_IP_OVER_INFINIBAND_MODE_MAX, /* number of real modes; sizes the name table in ipoib.c */
_IP_OVER_INFINIBAND_MODE_INVALID = -EINVAL, /* negative marker, also used for "unset" */
} IPoIBMode;
/* Settings of an IPoIB subinterface netdev (the [IPoIB] section of a .netdev file). */
typedef struct IPoIB {
NetDev meta;
uint16_t pkey; /* PartitionKey=; 0 means unset */
IPoIBMode mode; /* Mode=; negative (_IP_OVER_INFINIBAND_MODE_INVALID) means unset */
int umcast; /* IgnoreUserspaceMulticastGroups= tristate; negative means unset */
} IPoIB;
/* Defines IPOIB(), used by ipoib.c to obtain the IPoIB object from a NetDev pointer. */
DEFINE_NETDEV_CAST(IPOIB, IPoIB);
/* Vtable for Kind=ipoib; defined in ipoib.c. */
extern const NetDevVTable ipoib_vtable;
/* Parsers for PartitionKey= and Mode= in the [IPoIB] section; defined in ipoib.c. */
CONFIG_PARSER_PROTOTYPE(config_parse_ipoib_pkey);
CONFIG_PARSER_PROTOTYPE(config_parse_ipoib_mode);

View file

@ -11,6 +11,7 @@ _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
#include "conf-parser.h"
#include "fou-tunnel.h"
#include "geneve.h"
#include "ipoib.h"
#include "ipvlan.h"
#include "l2tp-tunnel.h"
#include "macsec.h"
@ -253,3 +254,6 @@ BatmanAdvanced.GatewayBandwidthUp, config_parse_badadv_bandwidth,
BatmanAdvanced.HopPenalty, config_parse_uint8, 0, offsetof(BatmanAdvanced, hop_penalty)
BatmanAdvanced.OriginatorIntervalSec, config_parse_sec, 0, offsetof(BatmanAdvanced, originator_interval)
BatmanAdvanced.RoutingAlgorithm, config_parse_batadv_routing_algorithm, 0, offsetof(BatmanAdvanced, routing_algorithm)
IPoIB.PartitionKey, config_parse_ipoib_pkey, 0, offsetof(IPoIB, pkey)
IPoIB.Mode, config_parse_ipoib_mode, 0, offsetof(IPoIB, mode)
IPoIB.IgnoreUserspaceMulticastGroups, config_parse_tristate, 0, offsetof(IPoIB, umcast)

View file

@ -18,6 +18,7 @@
#include "fou-tunnel.h"
#include "geneve.h"
#include "ifb.h"
#include "ipoib.h"
#include "ipvlan.h"
#include "l2tp-tunnel.h"
#include "list.h"
@ -64,6 +65,7 @@ const NetDevVTable * const netdev_vtable[_NETDEV_KIND_MAX] = {
[NETDEV_KIND_IP6GRETAP] = &ip6gretap_vtable,
[NETDEV_KIND_IP6TNL] = &ip6tnl_vtable,
[NETDEV_KIND_IPIP] = &ipip_vtable,
[NETDEV_KIND_IPOIB] = &ipoib_vtable,
[NETDEV_KIND_IPVLAN] = &ipvlan_vtable,
[NETDEV_KIND_IPVTAP] = &ipvtap_vtable,
[NETDEV_KIND_L2TP] = &l2tptnl_vtable,
@ -103,6 +105,7 @@ static const char* const netdev_kind_table[_NETDEV_KIND_MAX] = {
[NETDEV_KIND_IP6GRETAP] = "ip6gretap",
[NETDEV_KIND_IP6TNL] = "ip6tnl",
[NETDEV_KIND_IPIP] = "ipip",
[NETDEV_KIND_IPOIB] = "ipoib",
[NETDEV_KIND_IPVLAN] = "ipvlan",
[NETDEV_KIND_IPVTAP] = "ipvtap",
[NETDEV_KIND_L2TP] = "l2tp",
@ -393,6 +396,7 @@ int netdev_set_ifindex(NetDev *netdev, sd_netlink_message *message) {
int netdev_generate_hw_addr(
NetDev *netdev,
Link *parent,
const char *name,
const struct hw_addr_data *hw_addr,
struct hw_addr_data *ret) {
@ -419,7 +423,7 @@ int netdev_generate_hw_addr(
if (!NETDEV_VTABLE(netdev)->generate_mac)
goto finalize;
if (NETDEV_VTABLE(netdev)->iftype != ARPHRD_ETHER)
if (!IN_SET(NETDEV_VTABLE(netdev)->iftype, ARPHRD_ETHER, ARPHRD_INFINIBAND))
goto finalize;
r = net_get_unique_predictable_data_from_name(name, &HASH_KEY, &result);
@ -430,21 +434,42 @@ int netdev_generate_hw_addr(
}
a.length = arphrd_to_hw_addr_len(NETDEV_VTABLE(netdev)->iftype);
assert(a.length <= sizeof(result));
memcpy(a.bytes, &result, a.length);
if (ether_addr_is_null(&a.ether) || ether_addr_is_broadcast(&a.ether)) {
log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL),
"Failed to generate persistent MAC address, ignoring: %m");
a = HW_ADDR_NULL;
goto finalize;
switch (NETDEV_VTABLE(netdev)->iftype) {
case ARPHRD_ETHER:
assert(a.length <= sizeof(result));
memcpy(a.bytes, &result, a.length);
if (ether_addr_is_null(&a.ether) || ether_addr_is_broadcast(&a.ether)) {
log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL),
"Failed to generate persistent MAC address, ignoring: %m");
a = HW_ADDR_NULL;
goto finalize;
}
break;
case ARPHRD_INFINIBAND:
if (result == 0) {
log_netdev_warning_errno(netdev, SYNTHETIC_ERRNO(EINVAL),
"Failed to generate persistent MAC address: %m");
goto finalize;
}
assert(a.length >= sizeof(result));
memzero(a.bytes, a.length - sizeof(result));
memcpy(a.bytes + a.length - sizeof(result), &result, sizeof(result));
break;
default:
assert_not_reached();
}
} else {
a = *hw_addr;
warn_invalid = true;
}
r = net_verify_hardware_address(name, warn_invalid, NETDEV_VTABLE(netdev)->iftype, NULL, &a);
r = net_verify_hardware_address(name, warn_invalid, NETDEV_VTABLE(netdev)->iftype,
parent ? &parent->hw_addr : NULL, &a);
if (r < 0)
return r;
@ -481,7 +506,7 @@ static int netdev_create(NetDev *netdev, Link *link, link_netlink_message_handle
if (r < 0)
return log_netdev_error_errno(netdev, r, "Could not append IFLA_IFNAME, attribute: %m");
r = netdev_generate_hw_addr(netdev, netdev->ifname, &netdev->hw_addr, &hw_addr);
r = netdev_generate_hw_addr(netdev, link, netdev->ifname, &netdev->hw_addr, &hw_addr);
if (r < 0)
return r;

View file

@ -22,6 +22,7 @@
"-Bridge\0" \
"-FooOverUDP\0" \
"-GENEVE\0" \
"-IPoIB\0" \
"-IPVLAN\0" \
"-IPVTAP\0" \
"-L2TP\0" \
@ -60,6 +61,7 @@ typedef enum NetDevKind {
NETDEV_KIND_IP6GRETAP,
NETDEV_KIND_IP6TNL,
NETDEV_KIND_IPIP,
NETDEV_KIND_IPOIB,
NETDEV_KIND_IPVLAN,
NETDEV_KIND_IPVTAP,
NETDEV_KIND_L2TP,
@ -201,7 +203,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(NetDev*, netdev_unref);
bool netdev_is_managed(NetDev *netdev);
int netdev_get(Manager *manager, const char *name, NetDev **ret);
int netdev_set_ifindex(NetDev *netdev, sd_netlink_message *newlink);
int netdev_generate_hw_addr(NetDev *netdev, const char *name,
int netdev_generate_hw_addr(NetDev *netdev, Link *link, const char *name,
const struct hw_addr_data *hw_addr, struct hw_addr_data *ret);
int netdev_join(NetDev *netdev, Link *link, link_netlink_message_handler_t cb);

View file

@ -32,7 +32,7 @@ static int netdev_veth_fill_message_create(NetDev *netdev, Link *link, sd_netlin
return log_netdev_error_errno(netdev, r, "Failed to add netlink interface name: %m");
}
r = netdev_generate_hw_addr(netdev, v->ifname_peer, &v->hw_addr_peer, &hw_addr);
r = netdev_generate_hw_addr(netdev, NULL, v->ifname_peer, &v->hw_addr_peer, &hw_addr);
if (r < 0)
return r;

View file

@ -87,6 +87,7 @@ Network.BatmanAdvanced, config_parse_ifname,
Network.Bond, config_parse_ifname, 0, offsetof(Network, bond_name)
Network.Bridge, config_parse_ifname, 0, offsetof(Network, bridge_name)
Network.VRF, config_parse_ifname, 0, offsetof(Network, vrf_name)
Network.IPoIB, config_parse_stacked_netdev, NETDEV_KIND_IPOIB, offsetof(Network, stacked_netdev_names)
Network.IPVLAN, config_parse_stacked_netdev, NETDEV_KIND_IPVLAN, offsetof(Network, stacked_netdev_names)
Network.IPVTAP, config_parse_stacked_netdev, NETDEV_KIND_IPVTAP, offsetof(Network, stacked_netdev_names)
Network.L2TP, config_parse_stacked_netdev, NETDEV_KIND_L2TP, offsetof(Network, stacked_netdev_names)

View file

@ -852,6 +852,7 @@ int config_parse_stacked_netdev(
assert(rvalue);
assert(data);
assert(IN_SET(kind,
NETDEV_KIND_IPOIB,
NETDEV_KIND_IPVLAN,
NETDEV_KIND_IPVTAP,
NETDEV_KIND_L2TP,

View file

@ -241,3 +241,7 @@ GatewayBandwithUp=
GatewayBandwidthDown=
GatewayBandwidthUp=
RoutingAlgorithm=
[IPoIB]
PartitionKey=
Mode=
IgnoreUserspaceMulticastGroups=

View file

@ -242,6 +242,7 @@ IgnoreCarrierLoss=
KeepConfiguration=
DHCPv6PrefixDelegation=
BatmanAdvanced=
IPoIB=
[IPv6Prefix]
Prefix=
OnLink=