diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 8c94b07e654a..8e30685affeb 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -47,7 +47,6 @@ enum { NETIF_F_GSO_SIT_BIT, /* ... SIT tunnel with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */ - NETIF_F_GSO_MPLS_BIT, /* ... MPLS segmentation */ NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ NETIF_F_GSO_TUNNEL_REMCSUM_BIT, @@ -119,7 +118,6 @@ enum { #define NETIF_F_GSO_SIT __NETIF_F(GSO_SIT) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM) -#define NETIF_F_GSO_MPLS __NETIF_F(GSO_MPLS) #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) @@ -183,7 +181,6 @@ enum { NETIF_F_GSO_IPIP | \ NETIF_F_GSO_SIT | \ NETIF_F_GSO_UDP_TUNNEL | \ - NETIF_F_GSO_UDP_TUNNEL_CSUM | \ - NETIF_F_GSO_MPLS) + NETIF_F_GSO_UDP_TUNNEL_CSUM) #endif /* _LINUX_NETDEV_FEATURES_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4767f546d7c0..90ac95900a11 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3583,7 +3583,6 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); - BUILD_BUG_ON(SKB_GSO_MPLS != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 39ec7530ae27..53f4f6c93356 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -372,9 +372,7 @@ enum { SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, - SKB_GSO_MPLS = 1 << 12, - - SKB_GSO_TUNNEL_REMCSUM = 1 << 13, + SKB_GSO_TUNNEL_REMCSUM = 1 << 12, }; #if BITS_PER_LONG > 32 diff --git a/include/net/mpls.h b/include/net/mpls.h new file mode 100644 index 000000000000..5b3b5addfb08 --- /dev/null +++ b/include/net/mpls.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _NET_MPLS_H +#define _NET_MPLS_H 1 + +#include <linux/if_ether.h> +#include <linux/netdevice.h> + +#define MPLS_HLEN 4 + +static inline bool eth_p_mpls(__be16 eth_type) +{ + return eth_type == htons(ETH_P_MPLS_UC) || + eth_type == htons(ETH_P_MPLS_MC); +} + +/* + * For non-MPLS skbs this will correspond to the network header. + * For MPLS skbs it will be before the network_header as the MPLS + * label stack lies between the end of the mac header and the network + * header. That is, for MPLS skbs the end of the mac header + * is the top of the MPLS label stack.
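+ * + * For example, on a frame carrying two label stack entries, mac_len + * covers only the Ethernet header, so skb_mpls_header() below points + * at the outermost label stack entry and the IP header begins + * 2 * MPLS_HLEN bytes beyond it.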
+ */ +static inline unsigned char *skb_mpls_header(struct sk_buff *skb) +{ + return skb_mac_header(skb) + skb->mac_len; +} +#endif diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 435eabc5ffaa..26c36c4cf7e2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -293,6 +293,9 @@ enum ovs_key_attr { OVS_KEY_ATTR_DP_HASH, /* u32 hash value. Value 0 indicates the hash is not computed by the datapath. */ OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ + OVS_KEY_ATTR_MPLS, /* array of struct ovs_key_mpls. + * The implementation may restrict + * the accepted length of the array. */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ovs_tunnel_info */ @@ -340,6 +343,10 @@ struct ovs_key_ethernet { __u8 eth_dst[ETH_ALEN]; }; +struct ovs_key_mpls { + __be32 mpls_lse; +}; + struct ovs_key_ipv4 { __be32 ipv4_src; __be32 ipv4_dst; @@ -393,9 +400,9 @@ struct ovs_key_arp { }; struct ovs_key_nd { - __u32 nd_target[4]; - __u8 nd_sll[ETH_ALEN]; - __u8 nd_tll[ETH_ALEN]; + __be32 nd_target[4]; + __u8 nd_sll[ETH_ALEN]; + __u8 nd_tll[ETH_ALEN]; }; /** @@ -483,6 +490,19 @@ enum ovs_userspace_attr { #define OVS_USERSPACE_ATTR_MAX (__OVS_USERSPACE_ATTR_MAX - 1) +/** + * struct ovs_action_push_mpls - %OVS_ACTION_ATTR_PUSH_MPLS action argument. + * @mpls_lse: MPLS label stack entry to push. + * @mpls_ethertype: Ethertype to set in the encapsulating ethernet frame. + * + * The only values @mpls_ethertype should ever be given are %ETH_P_MPLS_UC and + * %ETH_P_MPLS_MC, indicating MPLS unicast or multicast. Others are rejected. + */ +struct ovs_action_push_mpls { + __be32 mpls_lse; + __be16 mpls_ethertype; /* Either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC */ +}; + /** * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument. * @vlan_tpid: Tag protocol identifier (TPID) to push. @@ -534,6 +554,15 @@ struct ovs_action_hash { * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet. * @OVS_ACTION_ATTR_SAMPLE: Probabilistically executes actions, as specified in * the nested %OVS_SAMPLE_ATTR_* attributes. + * @OVS_ACTION_ATTR_PUSH_MPLS: Push a new MPLS label stack entry onto the + * top of the packet's MPLS label stack. Set the ethertype of the + * encapsulating frame to either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC to + * indicate the new packet contents. + * @OVS_ACTION_ATTR_POP_MPLS: Pop an MPLS label stack entry off of the + * packet's MPLS label stack. Set the encapsulating frame's ethertype to + * indicate the new packet contents. This could potentially still be + * %ETH_P_MPLS if the resulting MPLS label stack is not empty. If there + * is no MPLS label stack, as determined by ethertype, no action is taken. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -550,6 +579,9 @@ enum ovs_action_attr { OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ OVS_ACTION_ATTR_RECIRC, /* u32 recirc_id. */ OVS_ACTION_ATTR_HASH, /* struct ovs_action_hash. */ + OVS_ACTION_ATTR_PUSH_MPLS, /* struct ovs_action_push_mpls. */ + OVS_ACTION_ATTR_POP_MPLS, /* __be16 ethertype.
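For a bottom-of-stack pop this names the encapsulated protocol, e.g. %ETH_P_IP; a non-terminal pop leaves it at an MPLS ethertype.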
*/ + __OVS_ACTION_ATTR_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 40be481268de..70bb609c283d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -118,6 +118,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> +#include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> @@ -2530,7 +2531,7 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { - if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 06dfb293e5aa..b0f84f5ddda8 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -84,7 +84,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ed2c672c5b01..3a096bb2d596 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1223,7 +1223,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM | - SKB_GSO_MPLS | 0))) goto out; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index a1b2a5624f91..9d7930ba8e0f 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -94,7 +94,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | - SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM | diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 0a5a70d0e84c..d3e537ef6b7f 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -207,8 +207,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_MPLS) || + SKB_GSO_GRE | SKB_GSO_GRE_CSUM) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index e9767079a360..fd76ce938c32 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -79,7 +79,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM | - SKB_GSO_MPLS | SKB_GSO_TCPV6 | 0))) goto out; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 637ba2e438b7..b6aa8ed18257 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -46,8 +46,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, SKB_GSO_GRE | SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | - SKB_GSO_SIT | - SKB_GSO_MPLS) || + SKB_GSO_SIT) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index e3545f21a099..ca27837974fe 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -34,8 +34,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_MPLS))) + SKB_GSO_IPIP))) goto out; /* Setup inner SKB.
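Segmentation is then delegated to the handler for skb->inner_protocol, recorded when the topmost label was pushed.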
*/ diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 2a9673e39ca1..454ce12efbbf 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -30,6 +30,7 @@ config OPENVSWITCH config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" + select NET_MPLS_GSO depends on INET depends on OPENVSWITCH depends on NET_IPGRE_DEMUX diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 922c133b1933..f7e589159e4a 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -28,10 +28,12 @@ #include <linux/in6.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> + #include <net/ip.h> #include <net/ipv6.h> #include <net/checksum.h> #include <net/dsfield.h> +#include <net/mpls.h> #include <net/sctp/checksum.h> #include "datapath.h" @@ -118,6 +120,92 @@ static int make_writable(struct sk_buff *skb, int write_len) return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } +static int push_mpls(struct sk_buff *skb, + const struct ovs_action_push_mpls *mpls) +{ + __be32 *new_mpls_lse; + struct ethhdr *hdr; + + /* The networking stack does not allow simultaneous tunnel and MPLS GSO. */ + if (skb->encapsulation) + return -ENOTSUPP; + + if (skb_cow_head(skb, MPLS_HLEN) < 0) + return -ENOMEM; + + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + skb_reset_mac_header(skb); + + new_mpls_lse = (__be32 *)skb_mpls_header(skb); + *new_mpls_lse = mpls->mpls_lse; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, + MPLS_HLEN, 0)); + + hdr = eth_hdr(skb); + hdr->h_proto = mpls->mpls_ethertype; + + skb_set_inner_protocol(skb, skb->protocol); + skb->protocol = mpls->mpls_ethertype; + + return 0; +} + +static int pop_mpls(struct sk_buff *skb, const __be16 ethertype) +{ + struct ethhdr *hdr; + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_sub(skb->csum, + csum_partial(skb_mpls_header(skb), + MPLS_HLEN, 0)); + + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + + __skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); + + /* skb_mpls_header() is used to locate the ethertype + * field correctly in the presence of VLAN tags. + */ + hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); + hdr->h_proto = ethertype; + if (eth_p_mpls(skb->protocol)) + skb->protocol = ethertype; + return 0; +} + +static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse) +{ + __be32 *stack; + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + stack = (__be32 *)skb_mpls_header(skb); + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be32 diff[] = { ~(*stack), *mpls_lse }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } + + *stack = *mpls_lse; + + return 0; +} + /* remove VLAN header from packet and update csum accordingly.
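On success, *current_tci returns the removed tag's TCI in network byte order.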
*/ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) { @@ -140,10 +228,12 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) vlan_set_encap_proto(skb, vhdr); skb->mac_header += VLAN_HLEN; + if (skb_network_offset(skb) < ETH_HLEN) skb_set_network_header(skb, ETH_HLEN); - skb_reset_mac_len(skb); + /* Update mac_len for subsequent MPLS actions */ + skb_reset_mac_len(skb); return 0; } @@ -186,6 +276,8 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) return -ENOMEM; + /* Update mac_len for subsequent MPLS actions */ + skb->mac_len += VLAN_HLEN; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(skb->csum, csum_partial(skb->data @@ -459,21 +551,14 @@ static int set_sctp(struct sk_buff *skb, return 0; } -static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) { - struct vport *vport; + struct vport *vport = ovs_vport_rcu(dp, out_port); - if (unlikely(!skb)) - return -ENOMEM; - - vport = ovs_vport_rcu(dp, out_port); - if (unlikely(!vport)) { + if (likely(vport)) + ovs_vport_send(vport, skb); + else kfree_skb(skb); - return -ENODEV; - } - - ovs_vport_send(vport, skb); - return 0; } static int output_userspace(struct datapath *dp, struct sk_buff *skb, @@ -612,6 +697,10 @@ static int execute_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_SCTP: err = set_sctp(skb, nla_data(nested_attr)); break; + + case OVS_KEY_ATTR_MPLS: + err = set_mpls(skb, nla_data(nested_attr)); + break; } return err; @@ -672,8 +761,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { int err = 0; - if (prev_port != -1) { - do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port); + if (unlikely(prev_port != -1)) { + struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); + + if (out_skb) + do_output(dp, out_skb, prev_port); + prev_port = -1; } @@ -690,6 +783,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, execute_hash(skb, key, a); break; + case OVS_ACTION_ATTR_PUSH_MPLS: + err = push_mpls(skb, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_MPLS: + err = pop_mpls(skb, nla_get_be16(a)); + break; + case OVS_ACTION_ATTR_PUSH_VLAN: err = push_vlan(skb, nla_data(a)); if (unlikely(err)) /* skb already freed. */ @@ -764,14 +865,11 @@ static void process_deferred_actions(struct datapath *dp) /* Execute a list of actions against 'skb'. */ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key) + struct sw_flow_actions *acts, struct sw_flow_key *key) { int level = this_cpu_read(exec_actions_level); - struct sw_flow_actions *acts; int err; - acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); - this_cpu_inc(exec_actions_level); OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f18302f32049..014485ec4b0d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -140,19 +140,30 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *, static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *); -/* Must be called with rcu_read_lock or ovs_mutex. */ -static struct datapath *get_dp(struct net *net, int dp_ifindex) +/* Must be called with rcu_read_lock. 
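The returned datapath is only valid while the caller remains in the RCU read-side critical section.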
*/ +static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) { - struct datapath *dp = NULL; - struct net_device *dev; + struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, dp_ifindex); if (dev) { struct vport *vport = ovs_internal_dev_get_vport(dev); if (vport) - dp = vport->dp; + return vport->dp; } + + return NULL; +} + +/* The caller must hold either ovs_mutex or rcu_read_lock to keep the + * returned dp pointer valid. + */ +static inline struct datapath *get_dp(struct net *net, int dp_ifindex) +{ + struct datapath *dp; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + rcu_read_lock(); + dp = get_dp_rcu(net, dp_ifindex); rcu_read_unlock(); return dp; @@ -187,6 +198,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); + ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -245,6 +257,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; + struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; @@ -270,10 +283,10 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) goto out; } - OVS_CB(skb)->flow = flow; + ovs_flow_stats_update(flow, key->tp.flags, skb); + sf_acts = rcu_dereference(flow->sf_acts); + ovs_execute_actions(dp, skb, sf_acts, key); - ovs_flow_stats_update(OVS_CB(skb)->flow, key->tp.flags, skb); - ovs_execute_actions(dp, skb, key); stats_counter = &stats->n_hit; out: @@ -362,37 +375,12 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, return err; } -static size_t key_attr_size(void) -{ - return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ - + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ - + nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ - + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ - + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ - + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ - + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ - + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ - + nla_total_size(4) /* OVS_KEY_ATTR_8021Q */ - + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ - + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ - + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ - + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ - + nla_total_size(28); /* OVS_KEY_ATTR_ND */ -} - static size_t upcall_msg_size(const struct nlattr *userdata, unsigned int hdrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ - + nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */ /* OVS_PACKET_ATTR_USERDATA */ if (userdata) @@ -512,6 +500,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct sw_flow_actions *acts; struct sk_buff *packet; struct sw_flow *flow; + struct sw_flow_actions *sf_acts; struct datapath *dp; 
struct ethhdr *eth; struct vport *input_vport; @@ -554,25 +543,18 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (err) goto err_flow_free; - acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); - err = PTR_ERR(acts); - if (IS_ERR(acts)) - goto err_flow_free; - err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], - &flow->key, 0, &acts); + &flow->key, &acts); if (err) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); - OVS_CB(packet)->egress_tun_info = NULL; - OVS_CB(packet)->flow = flow; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; @@ -585,9 +567,10 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_unlock; OVS_CB(packet)->input_vport = input_vport; + sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); - err = ovs_execute_actions(dp, packet, &flow->key); + err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); local_bh_enable(); rcu_read_unlock(); @@ -664,8 +647,8 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats, static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) { return NLMSG_ALIGN(sizeof(struct ovs_header)) - + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */ - + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */ + + nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_KEY */ + + nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_MASK */ + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + nla_total_size(8) /* OVS_FLOW_ATTR_USED */ @@ -673,58 +656,67 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) } /* Called with ovs_mutex or RCU read lock. */ -static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, - struct sk_buff *skb, u32 portid, - u32 seq, u32 flags, u8 cmd) +static int ovs_flow_cmd_fill_match(const struct sw_flow *flow, + struct sk_buff *skb) { - const int skb_orig_len = skb->len; - struct nlattr *start; - struct ovs_flow_stats stats; - __be16 tcp_flags; - unsigned long used; - struct ovs_header *ovs_header; struct nlattr *nla; int err; - ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); - if (!ovs_header) - return -EMSGSIZE; - - ovs_header->dp_ifindex = dp_ifindex; - /* Fill flow key. */ nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); if (!nla) - goto nla_put_failure; + return -EMSGSIZE; err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb); if (err) - goto error; + return err; + nla_nest_end(skb, nla); + /* Fill flow mask. */ nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK); if (!nla) - goto nla_put_failure; + return -EMSGSIZE; err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb); if (err) - goto error; + return err; nla_nest_end(skb, nla); + return 0; +} + +/* Called with ovs_mutex or RCU read lock. 
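Emits OVS_FLOW_ATTR_USED, OVS_FLOW_ATTR_STATS and OVS_FLOW_ATTR_TCP_FLAGS, each only when its value is non-zero.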
*/ +static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, + struct sk_buff *skb) +{ + struct ovs_flow_stats stats; + __be16 tcp_flags; + unsigned long used; ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); if (used && nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) - goto nla_put_failure; + return -EMSGSIZE; if (stats.n_packets && nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats)) - goto nla_put_failure; + return -EMSGSIZE; if ((u8)ntohs(tcp_flags) && nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) - goto nla_put_failure; + return -EMSGSIZE; + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, + struct sk_buff *skb, int skb_orig_len) +{ + struct nlattr *start; + int err; /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if * this is the first flow to be dumped into 'skb'. This is unusual for @@ -748,17 +740,47 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, nla_nest_end(skb, start); else { if (skb_orig_len) - goto error; + return err; nla_nest_cancel(skb, start); } - } else if (skb_orig_len) - goto nla_put_failure; + } else if (skb_orig_len) { + return -EMSGSIZE; + } + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, + struct sk_buff *skb, u32 portid, + u32 seq, u32 flags, u8 cmd) +{ + const int skb_orig_len = skb->len; + struct ovs_header *ovs_header; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, + flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = dp_ifindex; + + err = ovs_flow_cmd_fill_match(flow, skb); + if (err) + goto error; + + err = ovs_flow_cmd_fill_stats(flow, skb); + if (err) + goto error; + + err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); + if (err) + goto error; return genlmsg_end(skb, ovs_header); -nla_put_failure: - err = -EMSGSIZE; error: genlmsg_cancel(skb, ovs_header); return err; @@ -816,10 +838,14 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) /* Must have key and actions. */ error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR("Flow key attribute not present in new flow.\n"); goto error; - if (!a[OVS_FLOW_ATTR_ACTIONS]) + } + if (!a[OVS_FLOW_ATTR_ACTIONS]) { + OVS_NLERR("Flow actions attribute not present in new flow.\n"); goto error; + } /* Most of the time we need to allocate a new flow, do it before * locking. @@ -840,16 +866,11 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask); /* Validate actions. */ - acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); - error = PTR_ERR(acts); - if (IS_ERR(acts)) - goto err_kfree_flow; - error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - 0, &acts); + &acts); if (error) { OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); - goto err_kfree_acts; + goto err_kfree_flow; } reply = ovs_flow_cmd_alloc_info(acts, info, false); @@ -940,6 +961,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) return error; } +/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. 
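The large on-stack masked key lives in this helper's frame instead of ovs_flow_cmd_set()'s.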
*/ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, const struct sw_flow_key *key, const struct sw_flow_mask *mask) @@ -948,15 +970,10 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, struct sw_flow_key masked_key; int error; - acts = ovs_nla_alloc_flow_actions(nla_len(a)); - if (IS_ERR(acts)) - return acts; - ovs_flow_mask_key(&masked_key, key, mask); - error = ovs_nla_copy_actions(a, &masked_key, 0, &acts); + error = ovs_nla_copy_actions(a, &masked_key, &acts); if (error) { - OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); - kfree(acts); + OVS_NLERR("Actions may not be safe on all matching packets.\n"); return ERR_PTR(error); } @@ -978,8 +995,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) /* Extract key. */ error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR("Flow key attribute not present in set flow.\n"); goto error; + } ovs_match_init(&match, &key, &mask); error = ovs_nla_get_match(&match, @@ -994,10 +1013,8 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) error = PTR_ERR(acts); goto error; } - } - /* Can allocate before locking if have acts. */ - if (acts) { + /* Can allocate before locking if have acts. */ reply = ovs_flow_cmd_alloc_info(acts, info, false); if (IS_ERR(reply)) { error = PTR_ERR(reply); @@ -1181,7 +1198,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) struct datapath *dp; rcu_read_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; @@ -1444,7 +1461,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(&dp->table, false); + ovs_flow_tbl_destroy(&dp->table); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); @@ -1476,8 +1493,6 @@ static void __dp_destroy(struct datapath *dp) ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); /* RCU destroy the flow table */ - ovs_flow_tbl_destroy(&dp->table, true); - call_rcu(&dp->rcu, destroy_dp_rcu); } @@ -1945,7 +1960,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) int i, j = 0; rcu_read_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 974135439c5c..1c56a80d6677 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -94,14 +94,12 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB - * @flow: The flow associated with this packet. May be %NULL if no flow. * @egress_tun_key: Tunnel information about this packet on egress path. * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. 
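* * The structure is stored in the skb control buffer (skb->cb, 48 bytes), so additions must stay within that limit.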
*/ struct ovs_skb_cb { - struct sw_flow *flow; struct ovs_tunnel_info *egress_tun_info; struct vport *input_vport; }; @@ -194,7 +192,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *); + struct sw_flow_actions *acts, struct sw_flow_key *); void ovs_dp_notify_wq(struct work_struct *work); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2b78789ea7c5..90a21010fc8f 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -32,6 +32,7 @@ #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/mpls.h> #include <linux/sctp.h> #include <linux/smp.h> #include <linux/tcp.h> @@ -42,6 +43,7 @@ #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> +#include <net/mpls.h> #include <net/ndisc.h> #include "datapath.h" @@ -480,6 +482,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return -ENOMEM; skb_reset_network_header(skb); + skb_reset_mac_len(skb); __skb_push(skb, skb->data - skb_mac_header(skb)); /* Network layer. */ @@ -584,6 +587,33 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ip, 0, sizeof(key->ip)); memset(&key->ipv4, 0, sizeof(key->ipv4)); } + } else if (eth_p_mpls(key->eth.type)) { + size_t stack_len = MPLS_HLEN; + + /* In the presence of an MPLS label stack the end of the L2 + * header and the beginning of the L3 header differ. + * + * Advance network_header to the beginning of the L3 + * header. mac_len corresponds to the end of the L2 header. + */ + while (1) { + __be32 lse; + + error = check_header(skb, skb->mac_len + stack_len); + if (unlikely(error)) + return 0; + + memcpy(&lse, skb_network_header(skb), MPLS_HLEN); + + if (stack_len == MPLS_HLEN) + memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + + skb_set_network_header(skb, skb->mac_len + stack_len); + if (lse & htonl(MPLS_LS_S_MASK)) + break; + + stack_len += MPLS_HLEN; + } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 71813318c8c7..4962bee81a11 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -102,12 +102,17 @@ struct sw_flow_key { __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ __be16 type; /* Ethernet frame type. */ } eth; - struct { - u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ - u8 tos; /* IP ToS. */ - u8 ttl; /* IP TTL/hop limit. */ - u8 frag; /* One of OVS_FRAG_TYPE_*. */ - } ip; + union { + struct { + __be32 top_lse; /* top label stack entry */ + } mpls; + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + }; struct { __be16 src; /* TCP/UDP/SCTP source port. */ __be16 dst; /* TCP/UDP/SCTP destination port.
*/ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 939bcb32100f..ed3109761827 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -46,24 +46,22 @@ #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> +#include <net/mpls.h> #include "flow_netlink.h" -static void update_range__(struct sw_flow_match *match, - size_t offset, size_t size, bool is_mask) +static void update_range(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) { - struct sw_flow_key_range *range = NULL; + struct sw_flow_key_range *range; size_t start = rounddown(offset, sizeof(long)); size_t end = roundup(offset + size, sizeof(long)); if (!is_mask) range = &match->range; - else if (match->mask) + else range = &match->mask->range; - if (!range) - return; - if (range->start == range->end) { range->start = start; range->end = end; @@ -79,22 +77,20 @@ static void update_range__(struct sw_flow_match *match, #define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - sizeof((match)->key->field), is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - (match)->mask->key.field = value; \ - } else { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + (match)->mask->key.field = value; \ + else \ (match)->key->field = value; \ - } \ } while (0) #define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ do { \ - update_range__(match, offset, len, is_mask); \ + update_range(match, offset, len, is_mask); \ if (is_mask) \ memcpy((u8 *)&(match)->mask->key + offset, value_p, \ - len); \ + len); \ else \ memcpy((u8 *)(match)->key + offset, value_p, len); \ } while (0) @@ -103,18 +99,16 @@ static void update_range__(struct sw_flow_match *match, SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ value_p, len, is_mask) -#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - sizeof((match)->key->field), is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - memset((u8 *)&(match)->mask->key.field, value,\ - sizeof((match)->mask->key.field)); \ - } else { \ +#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ + do { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + memset((u8 *)&(match)->mask->key.field, value, \ + sizeof((match)->mask->key.field)); \ + else \ memset((u8 *)&(match)->key->field, value, \ sizeof((match)->key->field)); \ - } \ } while (0) static bool match_validate(const struct sw_flow_match *match, @@ -134,7 +128,8 @@ static bool match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_ICMP) | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) - | (1 << OVS_KEY_ATTR_ND)); + | (1 << OVS_KEY_ATTR_ND) + | (1 << OVS_KEY_ATTR_MPLS)); /* Always allowed mask fields.
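These may be wildcarded regardless of the frame contents.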
*/ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) @@ -149,6 +144,12 @@ static bool match_validate(const struct sw_flow_match *match, mask_allowed |= 1 << OVS_KEY_ATTR_ARP; } + if (eth_p_mpls(match->key->eth.type)) { + key_expected |= 1 << OVS_KEY_ATTR_MPLS; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_MPLS; + } + if (match->key->eth.type == htons(ETH_P_IP)) { key_expected |= 1 << OVS_KEY_ATTR_IPV4; if (match->mask && (match->mask->key.eth.type == htons(0xffff))) @@ -244,6 +245,38 @@ static bool match_validate(const struct sw_flow_match *match, return true; } +size_t ovs_key_attr_size(void) +{ + /* Whenever adding new OVS_KEY_ FIELDS, we should consider + * updating this function. + */ + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22); + + return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ + + nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ + + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ + + nla_total_size(28); /* OVS_KEY_ATTR_ND */ +} + /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ENCAP] = -1, @@ -266,6 +299,7 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), [OVS_KEY_ATTR_TUNNEL] = -1, + [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls), }; static bool is_all_zero(const u8 *fp, size_t size) @@ -572,10 +606,13 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - if (is_mask) + if (is_mask) { in_port = 0xffffffff; /* Always exact match in_port. 
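Any mask supplied by userspace is overridden.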
*/ - else if (in_port >= DP_MAX_PORTS) + } else if (in_port >= DP_MAX_PORTS) { + OVS_NLERR("Port (%d) exceeds maximum allowable (%d).\n", + in_port, DP_MAX_PORTS); return -EINVAL; + } SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); @@ -602,7 +639,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, const struct nlattr **a, bool is_mask) { int err; - u64 orig_attrs = attrs; err = metadata_from_nlattrs(match, &attrs, a, is_mask); if (err) @@ -634,8 +670,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_VLAN); - } else if (!is_mask) - SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + } if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { __be16 eth_type; @@ -735,6 +770,16 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_ARP); } + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { + const struct ovs_key_mpls *mpls_key; + + mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); + SW_FLOW_KEY_PUT(match, mpls.top_lse, + mpls_key->mpls_lse, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_MPLS); + } + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { const struct ovs_key_tcp *tcp_key; @@ -745,15 +790,9 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, } if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) { - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, tp.flags, - nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), - is_mask); - } else { - SW_FLOW_KEY_PUT(match, tp.flags, - nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), - is_mask); - } + SW_FLOW_KEY_PUT(match, tp.flags, + nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), + is_mask); attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS); } @@ -812,8 +851,11 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_ND); } - if (attrs != 0) + if (attrs != 0) { + OVS_NLERR("Unknown key attributes (%llx).\n", + (unsigned long long)attrs); return -EINVAL; + } return 0; } @@ -853,8 +895,8 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * attribute specifies the mask field of the wildcarded flow. */ int ovs_nla_get_match(struct sw_flow_match *match, - const struct nlattr *key, - const struct nlattr *mask) + const struct nlattr *nla_key, + const struct nlattr *nla_mask) { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; const struct nlattr *encap; @@ -864,7 +906,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, bool encap_valid = false; int err; - err = parse_flow_nlattrs(key, a, &key_attrs); + err = parse_flow_nlattrs(nla_key, a, &key_attrs); if (err) return err; @@ -905,36 +947,43 @@ int ovs_nla_get_match(struct sw_flow_match *match, if (err) return err; - if (match->mask && !mask) { - /* Create an exact match mask. We need to set to 0xff all the - * 'match->mask' fields that have been touched in 'match->key'. - * We cannot simply memset 'match->mask', because padding bytes - * and fields not specified in 'match->key' should be left to 0. - * Instead, we use a stream of netlink attributes, copied from - * 'key' and set to 0xff: ovs_key_from_nlattrs() will take care - * of filling 'match->mask' appropriately. - */ - newmask = kmemdup(key, nla_total_size(nla_len(key)), - GFP_KERNEL); - if (!newmask) - return -ENOMEM; + if (match->mask) { + if (!nla_mask) { + /* Create an exact match mask. We need to set to 0xff + * all the 'match->mask' fields that have been touched + * in 'match->key'. 
We cannot simply memset + * 'match->mask', because padding bytes and fields not + * specified in 'match->key' should be left to 0. + * Instead, we use a stream of netlink attributes, + * copied from 'key' and set to 0xff. + * ovs_key_from_nlattrs() will take care of filling + * 'match->mask' appropriately. + */ + newmask = kmemdup(nla_key, + nla_total_size(nla_len(nla_key)), + GFP_KERNEL); + if (!newmask) + return -ENOMEM; - mask_set_nlattr(newmask, 0xff); + mask_set_nlattr(newmask, 0xff); - /* The userspace does not send tunnel attributes that are 0, - * but we should not wildcard them nonetheless. - */ - if (match->key->tun_key.ipv4_dst) - SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); + /* The userspace does not send tunnel attributes that + * are 0, but we should not wildcard them nonetheless. + */ + if (match->key->tun_key.ipv4_dst) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, + 0xff, true); - mask = newmask; - } + nla_mask = newmask; + } - if (mask) { - err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); + err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs); if (err) goto free_newmask; + /* Always match on tci. */ + SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { __be16 eth_type = 0; __be16 tci = 0; @@ -1140,6 +1189,14 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, arp_key->arp_op = htons(output->ip.proto); ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); + } else if (eth_p_mpls(swkey->eth.type)) { + struct ovs_key_mpls *mpls_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + if (!nla) + goto nla_put_failure; + mpls_key = nla_data(nla); + mpls_key->mpls_lse = output->mpls.top_lse; } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -1226,12 +1283,14 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, #define MAX_ACTIONS_BUFSIZE (32 * 1024) -struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size) +static struct sw_flow_actions *nla_alloc_flow_actions(int size) { struct sw_flow_actions *sfa; - if (size > MAX_ACTIONS_BUFSIZE) + if (size > MAX_ACTIONS_BUFSIZE) { + OVS_NLERR("Flow action size (%u bytes) exceeds maximum", size); return ERR_PTR(-EINVAL); + } sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); if (!sfa) @@ -1269,7 +1328,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = MAX_ACTIONS_BUFSIZE; } - acts = ovs_nla_alloc_flow_actions(new_acts_size); + acts = nla_alloc_flow_actions(new_acts_size); if (IS_ERR(acts)) return (void *)acts; @@ -1336,9 +1395,15 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, a->nla_len = sfa->actions_len - st_offset; } +static int __ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci); + static int validate_and_copy_sample(const struct nlattr *attr, const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -1375,7 +1440,8 @@ static int validate_and_copy_sample(const struct nlattr *attr, if (st_acts < 0) return st_acts; - err = ovs_nla_copy_actions(actions, key, depth + 1, sfa); + err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa, + eth_type, vlan_tci); if (err) return err; @@ -1385,10 +1451,10 @@ static int validate_and_copy_sample(const 
struct nlattr *attr, return 0; } -static int validate_tp_port(const struct sw_flow_key *flow_key) +static int validate_tp_port(const struct sw_flow_key *flow_key, + __be16 eth_type) { - if ((flow_key->eth.type == htons(ETH_P_IP) || - flow_key->eth.type == htons(ETH_P_IPV6)) && + if ((eth_type == htons(ETH_P_IP) || eth_type == htons(ETH_P_IPV6)) && (flow_key->tp.src || flow_key->tp.dst)) return 0; @@ -1483,7 +1549,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, static int validate_set(const struct nlattr *a, const struct sw_flow_key *flow_key, struct sw_flow_actions **sfa, - bool *set_tun) + bool *set_tun, __be16 eth_type) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); @@ -1508,6 +1574,9 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_TUNNEL: + if (eth_p_mpls(eth_type)) + return -EINVAL; + *set_tun = true; err = validate_and_copy_set_tun(a, sfa); if (err) @@ -1515,7 +1584,7 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_IPV4: - if (flow_key->eth.type != htons(ETH_P_IP)) + if (eth_type != htons(ETH_P_IP)) return -EINVAL; if (!flow_key->ip.proto) @@ -1531,7 +1600,7 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_IPV6: - if (flow_key->eth.type != htons(ETH_P_IPV6)) + if (eth_type != htons(ETH_P_IPV6)) return -EINVAL; if (!flow_key->ip.proto) @@ -1553,19 +1622,24 @@ static int validate_set(const struct nlattr *a, if (flow_key->ip.proto != IPPROTO_TCP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); case OVS_KEY_ATTR_UDP: if (flow_key->ip.proto != IPPROTO_UDP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); + + case OVS_KEY_ATTR_MPLS: + if (!eth_p_mpls(eth_type)) + return -EINVAL; + break; case OVS_KEY_ATTR_SCTP: if (flow_key->ip.proto != IPPROTO_SCTP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); default: return -EINVAL; @@ -1609,12 +1683,13 @@ static int copy_action(const struct nlattr *from, return 0; } -int ovs_nla_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, - int depth, - struct sw_flow_actions **sfa) +static int __ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci) { const struct nlattr *a; + bool out_tnl_port = false; int rem, err; if (depth >= SAMPLE_ACTION_DEPTH) @@ -1626,6 +1701,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls), + [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, @@ -1655,6 +1732,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_OUTPUT: if (nla_get_u32(a) >= DP_MAX_PORTS) return -EINVAL; + out_tnl_port = false; + break; case OVS_ACTION_ATTR_HASH: { @@ -1671,6 +1750,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, } case OVS_ACTION_ATTR_POP_VLAN: + vlan_tci = htons(0); break; case OVS_ACTION_ATTR_PUSH_VLAN: @@ -1679,25 +1759,73 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) return -EINVAL; + vlan_tci = vlan->vlan_tci; 
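+ /* Remember the outermost TCI: the MPLS push case below + * is rejected while a VLAN tag is present. + */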
break; case OVS_ACTION_ATTR_RECIRC: break; + case OVS_ACTION_ATTR_PUSH_MPLS: { + const struct ovs_action_push_mpls *mpls = nla_data(a); + + /* The networking stack does not allow simultaneous tunnel + * and MPLS GSO. + */ + if (out_tnl_port) + return -EINVAL; + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + /* Only allow an MPLS push onto a white list of ethertypes + * for which the tag order is known. + */ + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + eth_type = mpls->mpls_ethertype; + break; + } + + case OVS_ACTION_ATTR_POP_MPLS: + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + !eth_p_mpls(eth_type)) + return -EINVAL; + + /* Disallow subsequent L2.5+ set and mpls_pop actions + * as there is no check here to ensure that the new + * eth_type is valid and thus set actions could + * write off the end of the packet or otherwise + * corrupt it. + * + * Support for these actions is planned using packet + * recirculation. + */ + eth_type = htons(0); + break; + case OVS_ACTION_ATTR_SET: - err = validate_set(a, key, sfa, &skip_copy); + err = validate_set(a, key, sfa, + &out_tnl_port, eth_type); if (err) return err; + + skip_copy = out_tnl_port; break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa); + err = validate_and_copy_sample(a, key, depth, sfa, + eth_type, vlan_tci); if (err) return err; skip_copy = true; break; default: + OVS_NLERR("Unknown action attribute (%d).\n", type); return -EINVAL; } if (!skip_copy) { @@ -1713,6 +1841,24 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return 0; } +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa) +{ + int err; + + *sfa = nla_alloc_flow_actions(nla_len(attr)); + if (IS_ERR(*sfa)) + return PTR_ERR(*sfa); + + err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, + key->eth.tci); + if (err) + kfree(*sfa); + + return err; +} + static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) { const struct nlattr *a; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 206e45add888..eb0b177300ad 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -37,6 +37,8 @@ #include "flow.h" +size_t ovs_key_attr_size(void); + void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, struct sw_flow_mask *mask); @@ -49,12 +51,11 @@ int ovs_nla_get_match(struct sw_flow_match *match, const struct nlattr *); int ovs_nla_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, + const struct sw_flow_key *key, struct sw_flow_actions **sfa); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); -struct sw_flow_actions *ovs_nla_alloc_flow_actions(int actions_len); void ovs_nla_free_flow_actions(struct sw_flow_actions *); #endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index cf2d853646f0..90f8b40a350b 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc.
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -250,11 +250,14 @@ static void table_instance_destroy(struct table_instance *ti, bool deferred) __table_instance_destroy(ti); } -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) +/* No need for locking; this function is called from an RCU callback + * or the error path. + */ +void ovs_flow_tbl_destroy(struct flow_table *table) { - struct table_instance *ti = ovsl_dereference(table->ti); + struct table_instance *ti = rcu_dereference_raw(table->ti); - table_instance_destroy(ti, deferred); + table_instance_destroy(ti, false); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 5918bff7f3f6..f682c8c07f44 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -62,7 +62,7 @@ void ovs_flow_free(struct sw_flow *, bool deferred); int ovs_flow_tbl_init(struct flow_table *); int ovs_flow_tbl_count(struct flow_table *table); -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); +void ovs_flow_tbl_destroy(struct flow_table *table); int ovs_flow_tbl_flush(struct flow_table *flow_table); int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 10dc07e1678b..6a55f7105505 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -224,6 +224,11 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) struct net_device *netdev = netdev_vport_priv(vport)->dev; int len; + if (unlikely(!(netdev->flags & IFF_UP))) { + kfree_skb(skb); + return 0; + } + len = skb->len; skb_dst_drop(skb);
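As a usage illustration, not part of the patch: userspace composing the mpls_lse field of OVS_ACTION_ATTR_PUSH_MPLS or OVS_KEY_ATTR_MPLS must lay the label stack entry out as on the wire (label:20 | TC:3 | S:1 | TTL:8, RFC 3032) in network byte order. A minimal sketch of such a helper; the function name is illustrative only:

#include <arpa/inet.h>
#include <stdint.h>

/* Compose an MPLS label stack entry: label (20 bits), traffic class
 * (3 bits), bottom-of-stack flag (1 bit) and TTL (8 bits), returned
 * in network byte order as expected by struct ovs_action_push_mpls
 * and struct ovs_key_mpls.
 */
static uint32_t build_mpls_lse(uint32_t label, uint8_t tc, int bottom, uint8_t ttl)
{
	return htonl((label << 12) | ((uint32_t)tc << 9) |
		     ((bottom ? 1u : 0u) << 8) | ttl);
}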