From 8bd67ebb50c0145fd2ca8681ab65eb7e8cde1afc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 13 May 2024 13:34:19 +0300 Subject: [PATCH 01/30] net: bridge: xmit: make sure we have at least eth header len bytes syzbot triggered an uninit value[1] error in bridge device's xmit path by sending a short (less than ETH_HLEN bytes) skb. To fix it check if we can actually pull that amount instead of assuming. Tested with dropwatch: drop at: br_dev_xmit+0xb93/0x12d0 [bridge] (0xffffffffc06739b3) origin: software timestamp: Mon May 13 11:31:53 2024 778214037 nsec protocol: 0x88a8 length: 2 original length: 2 drop reason: PKT_TOO_SMALL [1] BUG: KMSAN: uninit-value in br_dev_xmit+0x61d/0x1cb0 net/bridge/br_device.c:65 br_dev_xmit+0x61d/0x1cb0 net/bridge/br_device.c:65 __netdev_start_xmit include/linux/netdevice.h:4903 [inline] netdev_start_xmit include/linux/netdevice.h:4917 [inline] xmit_one net/core/dev.c:3531 [inline] dev_hard_start_xmit+0x247/0xa20 net/core/dev.c:3547 __dev_queue_xmit+0x34db/0x5350 net/core/dev.c:4341 dev_queue_xmit include/linux/netdevice.h:3091 [inline] __bpf_tx_skb net/core/filter.c:2136 [inline] __bpf_redirect_common net/core/filter.c:2180 [inline] __bpf_redirect+0x14a6/0x1620 net/core/filter.c:2187 ____bpf_clone_redirect net/core/filter.c:2460 [inline] bpf_clone_redirect+0x328/0x470 net/core/filter.c:2432 ___bpf_prog_run+0x13fe/0xe0f0 kernel/bpf/core.c:1997 __bpf_prog_run512+0xb5/0xe0 kernel/bpf/core.c:2238 bpf_dispatcher_nop_func include/linux/bpf.h:1234 [inline] __bpf_prog_run include/linux/filter.h:657 [inline] bpf_prog_run include/linux/filter.h:664 [inline] bpf_test_run+0x499/0xc30 net/bpf/test_run.c:425 bpf_prog_test_run_skb+0x14ea/0x1f20 net/bpf/test_run.c:1058 bpf_prog_test_run+0x6b7/0xad0 kernel/bpf/syscall.c:4269 __sys_bpf+0x6aa/0xd90 kernel/bpf/syscall.c:5678 __do_sys_bpf kernel/bpf/syscall.c:5767 [inline] __se_sys_bpf kernel/bpf/syscall.c:5765 [inline] __x64_sys_bpf+0xa0/0xe0 kernel/bpf/syscall.c:5765 x64_sys_call+0x96b/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:322 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+a63a1f6a062033cf0f40@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a63a1f6a062033cf0f40 Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_device.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index c366ccc8b3db..ecac7886988b 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -27,6 +27,7 @@ EXPORT_SYMBOL_GPL(nf_br_ops); /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { + enum skb_drop_reason reason = pskb_may_pull_reason(skb, ETH_HLEN); struct net_bridge_mcast_port *pmctx_null = NULL; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mcast *brmctx = &br->multicast_ctx; @@ -38,6 +39,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) const unsigned char *dest; u16 vid = 0; + if (unlikely(reason != SKB_NOT_DROPPED_YET)) { + kfree_skb_reason(skb, reason); + return NETDEV_TX_OK; + } + memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); br_tc_skb_miss_set(skb, false); From 06080ea23095afe04a2cb7a8d05fab4311782623 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 13 May 2024 13:52:57 +0300 Subject: [PATCH 02/30] selftests: net: bridge: increase IGMP/MLD exclude timeout membership interval When running the bridge IGMP/MLD selftests on debug kernels we can get spurious errors when setting up the IGMP/MLD exclude timeout tests because the membership interval is just 3 seconds and the setup has 2 seconds of sleep plus various validations, the one second that is left is not enough. Increase the membership interval from 3 to 5 seconds to make room for the setup validation and 2 seconds of sleep. Fixes: 34d7ecb3d4f7 ("selftests: net: bridge: update IGMP/MLD membership interval value") Reported-by: Jakub Kicinski Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/bridge_igmp.sh | 6 +++--- tools/testing/selftests/net/forwarding/bridge_mld.sh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_igmp.sh b/tools/testing/selftests/net/forwarding/bridge_igmp.sh index 2aa66d2a1702..e6a3e04fd83f 100755 --- a/tools/testing/selftests/net/forwarding/bridge_igmp.sh +++ b/tools/testing/selftests/net/forwarding/bridge_igmp.sh @@ -478,10 +478,10 @@ v3exc_timeout_test() RET=0 local X=("192.0.2.20" "192.0.2.30") - # GMI should be 3 seconds + # GMI should be 5 seconds ip link set dev br0 type bridge mcast_query_interval 100 \ mcast_query_response_interval 100 \ - mcast_membership_interval 300 + mcast_membership_interval 500 v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP ip link set dev br0 type bridge mcast_query_interval 500 \ @@ -489,7 +489,7 @@ v3exc_timeout_test() mcast_membership_interval 1500 $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_ALLOW2" -q - sleep 3 + sleep 5 bridge -j -d -s mdb show dev br0 \ | jq -e ".[].mdb[] | \ select(.grp == \"$TEST_GROUP\" and \ diff --git a/tools/testing/selftests/net/forwarding/bridge_mld.sh b/tools/testing/selftests/net/forwarding/bridge_mld.sh index e2b9ff773c6b..f84ab2e65754 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mld.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mld.sh @@ -478,10 +478,10 @@ mldv2exc_timeout_test() RET=0 local X=("2001:db8:1::20" "2001:db8:1::30") - # GMI should be 3 seconds + # GMI should be 5 seconds ip link set dev br0 type bridge mcast_query_interval 100 \ mcast_query_response_interval 100 \ - mcast_membership_interval 300 + mcast_membership_interval 500 mldv2exclude_prepare $h1 ip link set dev br0 type bridge mcast_query_interval 500 \ @@ -489,7 +489,7 @@ mldv2exc_timeout_test() mcast_membership_interval 1500 $MZ $h1 -c 1 $MZPKT_ALLOW2 -q - sleep 3 + sleep 5 bridge -j -d -s mdb show dev br0 \ | jq -e ".[].mdb[] | \ select(.grp == \"$TEST_GROUP\" and \ From 3a7c1661ae1383364cd6092d851f5e5da64d476b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 13 May 2024 14:06:27 +0300 Subject: [PATCH 03/30] net: bridge: mst: fix vlan use-after-free syzbot reported a suspicious rcu usage[1] in bridge's mst code. While fixing it I noticed that nothing prevents a vlan to be freed while walking the list from the same path (br forward delay timer). Fix the rcu usage and also make sure we are not accessing freed memory by making br_mst_vlan_set_state use rcu read lock. [1] WARNING: suspicious RCU usage 6.9.0-rc6-syzkaller #0 Not tainted ----------------------------- net/bridge/br_private.h:1599 suspicious rcu_dereference_protected() usage! ... stack backtrace: CPU: 1 PID: 8017 Comm: syz-executor.1 Not tainted 6.9.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 lockdep_rcu_suspicious+0x221/0x340 kernel/locking/lockdep.c:6712 nbp_vlan_group net/bridge/br_private.h:1599 [inline] br_mst_set_state+0x1ea/0x650 net/bridge/br_mst.c:105 br_set_state+0x28a/0x7b0 net/bridge/br_stp.c:47 br_forward_delay_timer_expired+0x176/0x440 net/bridge/br_stp_timer.c:88 call_timer_fn+0x18e/0x650 kernel/time/timer.c:1793 expire_timers kernel/time/timer.c:1844 [inline] __run_timers kernel/time/timer.c:2418 [inline] __run_timer_base+0x66a/0x8e0 kernel/time/timer.c:2429 run_timer_base kernel/time/timer.c:2438 [inline] run_timer_softirq+0xb7/0x170 kernel/time/timer.c:2448 __do_softirq+0x2c6/0x980 kernel/softirq.c:554 invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu+0xf2/0x1c0 kernel/softirq.c:633 irq_exit_rcu+0x9/0x30 kernel/softirq.c:645 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1043 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1043 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702 RIP: 0010:lock_acquire+0x264/0x550 kernel/locking/lockdep.c:5758 Code: 2b 00 74 08 4c 89 f7 e8 ba d1 84 00 f6 44 24 61 02 0f 85 85 01 00 00 41 f7 c7 00 02 00 00 74 01 fb 48 c7 44 24 40 0e 36 e0 45 <4b> c7 44 25 00 00 00 00 00 43 c7 44 25 09 00 00 00 00 43 c7 44 25 RSP: 0018:ffffc90013657100 EFLAGS: 00000206 RAX: 0000000000000001 RBX: 1ffff920026cae2c RCX: 0000000000000001 RDX: dffffc0000000000 RSI: ffffffff8bcaca00 RDI: ffffffff8c1eaa60 RBP: ffffc90013657260 R08: ffffffff92efe507 R09: 1ffffffff25dfca0 R10: dffffc0000000000 R11: fffffbfff25dfca1 R12: 1ffff920026cae28 R13: dffffc0000000000 R14: ffffc90013657160 R15: 0000000000000246 Fixes: ec7328b59176 ("net: bridge: mst: Multiple Spanning Tree (MST) mode") Reported-by: syzbot+fa04eb8a56fd923fc5d8@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=fa04eb8a56fd923fc5d8 Signed-off-by: Nikolay Aleksandrov Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/bridge/br_mst.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index ee680adcee17..3c66141d34d6 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -78,7 +78,7 @@ static void br_mst_vlan_set_state(struct net_bridge_port *p, struct net_bridge_v { struct net_bridge_vlan_group *vg = nbp_vlan_group(p); - if (v->state == state) + if (br_vlan_get_state(v) == state) return; br_vlan_set_state(v, state); @@ -100,11 +100,12 @@ int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, }; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; - int err; + int err = 0; + rcu_read_lock(); vg = nbp_vlan_group(p); if (!vg) - return 0; + goto out; /* MSTI 0 (CST) state changes are notified via the regular * SWITCHDEV_ATTR_ID_PORT_STP_STATE. @@ -112,17 +113,20 @@ int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, if (msti) { err = switchdev_port_attr_set(p->dev, &attr, extack); if (err && err != -EOPNOTSUPP) - return err; + goto out; } - list_for_each_entry(v, &vg->vlan_list, vlist) { + err = 0; + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (v->brvlan->msti != msti) continue; br_mst_vlan_set_state(p, v, state); } - return 0; +out: + rcu_read_unlock(); + return err; } static void br_mst_vlan_sync_state(struct net_bridge_vlan *pv, u16 msti) From 317a215d493230da361028ea8a4675de334bfa1a Mon Sep 17 00:00:00 2001 From: Ronald Wahl Date: Mon, 13 May 2024 16:39:22 +0200 Subject: [PATCH 04/30] net: ks8851: Fix another TX stall caused by wrong ISR flag handling Under some circumstances it may happen that the ks8851 Ethernet driver stops sending data. Currently the interrupt handler resets the interrupt status flags in the hardware after handling TX. With this approach we may lose interrupts in the time window between handling the TX interrupt and resetting the TX interrupt status bit. When all of the three following conditions are true then transmitting data stops: - TX queue is stopped to wait for room in the hardware TX buffer - no queued SKBs in the driver (txq) that wait for being written to hw - hardware TX buffer is empty and the last TX interrupt was lost This is because reenabling the TX queue happens when handling the TX interrupt status but if the TX status bit has already been cleared then this interrupt will never come. With this commit the interrupt status flags will be cleared before they are handled. That way we stop losing interrupts. The wrong handling of the ISR flags was there from the beginning but with commit 3dc5d4454545 ("net: ks8851: Fix TX stall caused by TX buffer overrun") the issue becomes apparent. Fixes: 3dc5d4454545 ("net: ks8851: Fix TX stall caused by TX buffer overrun") Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: netdev@vger.kernel.org Cc: stable@vger.kernel.org # 5.10+ Signed-off-by: Ronald Wahl Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851_common.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 502518cdb461..6453c92f0fa7 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -328,7 +328,6 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) { struct ks8851_net *ks = _ks; struct sk_buff_head rxq; - unsigned handled = 0; unsigned long flags; unsigned int status; struct sk_buff *skb; @@ -336,24 +335,17 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) ks8851_lock(ks, &flags); status = ks8851_rdreg16(ks, KS_ISR); + ks8851_wrreg16(ks, KS_ISR, status); netif_dbg(ks, intr, ks->netdev, "%s: status 0x%04x\n", __func__, status); - if (status & IRQ_LCI) - handled |= IRQ_LCI; - if (status & IRQ_LDI) { u16 pmecr = ks8851_rdreg16(ks, KS_PMECR); pmecr &= ~PMECR_WKEVT_MASK; ks8851_wrreg16(ks, KS_PMECR, pmecr | PMECR_WKEVT_LINK); - - handled |= IRQ_LDI; } - if (status & IRQ_RXPSI) - handled |= IRQ_RXPSI; - if (status & IRQ_TXI) { unsigned short tx_space = ks8851_rdreg16(ks, KS_TXMIR); @@ -365,20 +357,12 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) if (netif_queue_stopped(ks->netdev)) netif_wake_queue(ks->netdev); spin_unlock(&ks->statelock); - - handled |= IRQ_TXI; } - if (status & IRQ_RXI) - handled |= IRQ_RXI; - if (status & IRQ_SPIBEI) { netdev_err(ks->netdev, "%s: spi bus error\n", __func__); - handled |= IRQ_SPIBEI; } - ks8851_wrreg16(ks, KS_ISR, handled); - if (status & IRQ_RXI) { /* the datasheet says to disable the rx interrupt during * packet read-out, however we're masking the interrupt From 1de27bba6d50a909647f304eadc0f7c59a842a50 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 13 May 2024 11:08:03 -0700 Subject: [PATCH 05/30] libbpf: fix feature detectors when using token_fd Adjust `union bpf_attr` size passed to kernel in two feature-detecting functions to take into account prog_token_fd field. Libbpf is avoiding memset()'ing entire `union bpf_attr` by only using minimal set of bpf_attr's fields. Two places have been missed when wiring BPF token support in libbpf's feature detection logic. Fix them trivially. Fixes: f3dcee938f48 ("libbpf: Wire up token_fd into feature probing logic") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240513180804.403775-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 2 +- tools/lib/bpf/features.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 466a29d80124..2a4c71501a17 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -105,7 +105,7 @@ int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) */ int probe_memcg_account(int token_fd) { - const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); struct bpf_insn insns[] = { BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns), BPF_EXIT_INSN(), diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index 4e783cc7fc4b..a336786a22a3 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -22,7 +22,7 @@ int probe_fd(int fd) static int probe_kern_prog_name(int token_fd) { - const size_t attr_sz = offsetofend(union bpf_attr, prog_name); + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); struct bpf_insn insns[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), From 7a8030057f6791d35dd20987f9ff15855c01c1bb Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 14 May 2024 13:03:03 +0000 Subject: [PATCH 06/30] bpf, docs: Fix the description of 'src' in ALU instructions An ALU instruction's source operand can be the value in the source register or the 32-bit immediate value encoded in the instruction. This is controlled by the 's' bit of the 'opcode'. The current description explicitly uses the phrase 'value of the source register' when defining the meaning of 'src'. Change the description to use 'source operand' in place of 'value of the source register'. Signed-off-by: Puranjay Mohan Acked-by: Dave Thaler Link: https://lore.kernel.org/r/20240514130303.113607-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/standardization/instruction-set.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 997560abadab..00c93eb42613 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -301,8 +301,9 @@ Arithmetic instructions ``ALU`` uses 32-bit wide operands while ``ALU64`` uses 64-bit wide operands for otherwise identical operations. ``ALU64`` instructions belong to the base64 conformance group unless noted otherwise. -The 'code' field encodes the operation as below, where 'src' and 'dst' refer -to the values of the source and destination registers, respectively. +The 'code' field encodes the operation as below, where 'src' refers to the +the source operand and 'dst' refers to the value of the destination +register. ===== ===== ======= ========================================================== name code offset description From 325423cafc12031a69692363ddcabc63113bb3d6 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 14 May 2024 18:39:14 +0000 Subject: [PATCH 07/30] MAINTAINERS: Update ARM64 BPF JIT maintainer Zi Shen Lim is not actively doing kernel development and has decided to tranfer the responsibility of maintaining the JIT to me. Add myself as the maintainer for BPF JIT for ARM64 and remove Zi Shen Lim. Signed-off-by: Puranjay Mohan Acked-by: Zi Shen Lim Link: https://lore.kernel.org/r/20240514183914.27737-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 94fddfcec2fb..be7f72766dc6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3777,7 +3777,7 @@ F: arch/arm/net/ BPF JIT for ARM64 M: Daniel Borkmann M: Alexei Starovoitov -M: Zi Shen Lim +M: Puranjay Mohan L: bpf@vger.kernel.org S: Supported F: arch/arm64/net/ From 9ee98229083186837199912a7debb666146b8c17 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 14 May 2024 23:24:39 -0700 Subject: [PATCH 08/30] bpf: save extended inner map info for percpu array maps as well ARRAY_OF_MAPS and HASH_OF_MAPS map types have special logic to save a few extra fields required for correct operations of ARRAY maps, when they are used as inner maps. PERCPU_ARRAY maps have similar requirements as they now support generating inline element lookup logic. So make sure that both classes of maps are handled correctly. Reported-by: Jakub Kicinski Fixes: db69718b8efa ("bpf: inline bpf_map_lookup_elem() for PERCPU_ARRAY maps") Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240515062440.846086-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_in_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 8ef269e66ba5..b4f18c85d7bc 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -32,7 +32,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ - if (inner_map->ops == &array_map_ops) + if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) inner_map_meta_size = sizeof(struct bpf_array); inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); @@ -68,7 +68,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; - if (inner_map->ops == &array_map_ops) { + if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) { struct bpf_array *inner_array_meta = container_of(inner_map_meta, struct bpf_array, map); struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map); From 2322113ac9d0c5653017adbab504fb307b0e92e2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 14 May 2024 23:24:40 -0700 Subject: [PATCH 09/30] selftests/bpf: add more variations of map-in-map situations Add test cases validating usage of PERCPU_ARRAY and PERCPU_HASH maps as inner maps. Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240515062440.846086-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/map_kptr.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index da30f0d59364..ab0ce1d01a4a 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -110,10 +110,14 @@ DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_map, array_of_array_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, hash_map, array_of_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, hash_malloc_map, array_of_hash_malloc_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, lru_hash_map, array_of_lru_hash_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, pcpu_array_map, array_of_pcpu_array_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, pcpu_hash_map, array_of_pcpu_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, array_map, hash_of_array_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, hash_map, hash_of_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, hash_malloc_map, hash_of_hash_malloc_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, lru_hash_map, hash_of_lru_hash_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, pcpu_array_map, hash_of_pcpu_array_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, pcpu_hash_map, hash_of_pcpu_hash_maps); #define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) @@ -204,6 +208,8 @@ int test_map_kptr(struct __sk_buff *ctx) TEST(hash_map); TEST(hash_malloc_map); TEST(lru_hash_map); + TEST(pcpu_array_map); + TEST(pcpu_hash_map); #undef TEST return 0; @@ -281,10 +287,14 @@ int test_map_in_map_kptr(struct __sk_buff *ctx) TEST(array_of_hash_maps); TEST(array_of_hash_malloc_maps); TEST(array_of_lru_hash_maps); + TEST(array_of_pcpu_array_maps); + TEST(array_of_pcpu_hash_maps); TEST(hash_of_array_maps); TEST(hash_of_hash_maps); TEST(hash_of_hash_malloc_maps); TEST(hash_of_lru_hash_maps); + TEST(hash_of_pcpu_array_maps); + TEST(hash_of_pcpu_hash_maps); #undef TEST return 0; From 99975ad644c7836414183fa7be4f883a4fb2bf64 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 13 May 2024 13:18:53 +0200 Subject: [PATCH 10/30] net: lan966x: remove debugfs directory in probe() error path A debugfs directory entry is create early during probe(). This entry is not removed on error path leading to some "already present" issues in case of EPROBE_DEFER. Create this entry later in the probe() code to avoid the need to change many 'return' in 'goto' and add the removal in the already present error path. Fixes: 942814840127 ("net: lan966x: Add VCAP debugFS support") Cc: Signed-off-by: Herve Codina Reviewed-by: Andrew Lunn Reviewed-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/lan966x/lan966x_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index b12d3b8a64fd..a3b919a3ce07 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -1087,8 +1087,6 @@ static int lan966x_probe(struct platform_device *pdev) platform_set_drvdata(pdev, lan966x); lan966x->dev = &pdev->dev; - lan966x->debugfs_root = debugfs_create_dir("lan966x", NULL); - if (!device_get_mac_address(&pdev->dev, mac_addr)) { ether_addr_copy(lan966x->base_mac, mac_addr); } else { @@ -1179,6 +1177,8 @@ static int lan966x_probe(struct platform_device *pdev) return dev_err_probe(&pdev->dev, -ENODEV, "no ethernet-ports child found\n"); + lan966x->debugfs_root = debugfs_create_dir("lan966x", NULL); + /* init switch */ lan966x_init(lan966x); lan966x_stats_init(lan966x); @@ -1257,6 +1257,8 @@ static int lan966x_probe(struct platform_device *pdev) destroy_workqueue(lan966x->stats_queue); mutex_destroy(&lan966x->stats_lock); + debugfs_remove_recursive(lan966x->debugfs_root); + return err; } From fd76e5ccc48f9f54eb44909dd7c0b924005f1582 Mon Sep 17 00:00:00 2001 From: Chris Lew Date: Mon, 13 May 2024 10:31:46 -0700 Subject: [PATCH 11/30] net: qrtr: ns: Fix module refcnt The qrtr protocol core logic and the qrtr nameservice are combined into a single module. Neither the core logic or nameservice provide much functionality by themselves; combining the two into a single module also prevents any possible issues that may stem from client modules loading inbetween qrtr and the ns. Creating a socket takes two references to the module that owns the socket protocol. Since the ns needs to create the control socket, this creates a scenario where there are always two references to the qrtr module. This prevents the execution of 'rmmod' for qrtr. To resolve this, forcefully put the module refcount for the socket opened by the nameservice. Fixes: a365023a76f2 ("net: qrtr: combine nameservice into main module") Reported-by: Jeffrey Hugo Tested-by: Jeffrey Hugo Signed-off-by: Chris Lew Reviewed-by: Manivannan Sadhasivam Reviewed-by: Jeffrey Hugo Signed-off-by: David S. Miller --- net/qrtr/ns.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index abb0c70ffc8b..654a3cc0d347 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -725,6 +725,24 @@ int qrtr_ns_init(void) if (ret < 0) goto err_wq; + /* As the qrtr ns socket owner and creator is the same module, we have + * to decrease the qrtr module reference count to guarantee that it + * remains zero after the ns socket is created, otherwise, executing + * "rmmod" command is unable to make the qrtr module deleted after the + * qrtr module is inserted successfully. + * + * However, the reference count is increased twice in + * sock_create_kern(): one is to increase the reference count of owner + * of qrtr socket's proto_ops struct; another is to increment the + * reference count of owner of qrtr proto struct. Therefore, we must + * decrement the module reference count twice to ensure that it keeps + * zero after server's listening socket is created. Of course, we + * must bump the module reference count twice as well before the socket + * is closed. + */ + module_put(qrtr_ns.sock->ops->owner); + module_put(qrtr_ns.sock->sk->sk_prot_creator->owner); + return 0; err_wq: @@ -739,6 +757,15 @@ void qrtr_ns_remove(void) { cancel_work_sync(&qrtr_ns.work); destroy_workqueue(qrtr_ns.workqueue); + + /* sock_release() expects the two references that were put during + * qrtr_ns_init(). This function is only called during module remove, + * so try_stop_module() has already set the refcnt to 0. Use + * __module_get() instead of try_module_get() to successfully take two + * references. + */ + __module_get(qrtr_ns.sock->ops->owner); + __module_get(qrtr_ns.sock->sk->sk_prot_creator->owner); sock_release(qrtr_ns.sock); } EXPORT_SYMBOL_GPL(qrtr_ns_remove); From 83e93942796db58652288f0391ac00072401816f Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 14 May 2024 10:33:59 +0800 Subject: [PATCH 12/30] selftests/net/lib: no need to record ns name if it already exist There is no need to add the name to ns_list again if the netns already recoreded. Fixes: 25ae948b4478 ("selftests/net: add lib.sh") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/net/lib.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index c868c0aec121..72b191e4e064 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -127,15 +127,17 @@ setup_ns() local ns="" local ns_name="" local ns_list="" + local ns_exist= for ns_name in "$@"; do # Some test may setup/remove same netns multi times if unset ${ns_name} 2> /dev/null; then ns="${ns_name,,}-$(mktemp -u XXXXXX)" eval readonly ${ns_name}="$ns" + ns_exist=false else eval ns='$'${ns_name} cleanup_ns "$ns" - + ns_exist=true fi if ! ip netns add "$ns"; then @@ -144,7 +146,7 @@ setup_ns() return $ksft_skip fi ip -n "$ns" link set lo up - ns_list="$ns_list $ns" + ! $ns_exist && ns_list="$ns_list $ns" done NS_LIST="$NS_LIST $ns_list" } From 5405807edd4168c2dc2f307f3c6b70e9579bf7be Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 16 May 2024 10:01:40 -0700 Subject: [PATCH 13/30] selftests/bpf: Adjust test_access_variable_array after a kernel function name change After commit 4c3e509ea9f2 ("sched/balancing: Rename load_balance() => sched_balance_rq()"), the load_balance kernel function is renamed to sched_balance_rq. This patch adjusts the fentry program in test_access_variable_array.c to reflect this kernel function name change. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240516170140.2689430-1-martin.lau@linux.dev --- tools/testing/selftests/bpf/progs/test_access_variable_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_access_variable_array.c b/tools/testing/selftests/bpf/progs/test_access_variable_array.c index 808c49b79889..326b7d1f496a 100644 --- a/tools/testing/selftests/bpf/progs/test_access_variable_array.c +++ b/tools/testing/selftests/bpf/progs/test_access_variable_array.c @@ -7,7 +7,7 @@ unsigned long span = 0; -SEC("fentry/load_balance") +SEC("fentry/sched_balance_rq") int BPF_PROG(fentry_fentry, int this_cpu, struct rq *this_rq, struct sched_domain *sd) { From 51e2b8d33199df9675d2a36ec6aad0c27e91c6fe Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 16 May 2024 09:43:10 -0700 Subject: [PATCH 14/30] selftests/bpf: Adjust btf_dump test to reflect recent change in file_operations The btf_dump test fails: test_btf_dump_struct_data:FAIL:file_operations unexpected file_operations: actual '(struct file_operations){ .owner = (struct module *)0xffffffffffffffff, .fop_flags = (fop_flags_t)4294967295, .llseek = (loff_t (*)(struct f' != expected '(struct file_operations){ .owner = (struct module *)0xffffffffffffffff, .llseek = (loff_t (*)(struct file *, loff_t, int))0xffffffffffffffff,' The "fop_flags" is a recent addition to the struct file_operations in commit 210a03c9d51a ("fs: claw back a few FMODE_* bits") This patch changes the test_btf_dump_struct_data() to reflect this change. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/20240516164310.2481460-1-martin.lau@linux.dev --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index e9ea38aa8248..09a8e6f9b379 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -653,7 +653,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, cmpstr = "(struct file_operations){\n" " .owner = (struct module *)0xffffffffffffffff,\n" -" .llseek = (loff_t (*)(struct file *, loff_t, int))0xffffffffffffffff,"; +" .fop_flags = (fop_flags_t)4294967295,"; ASSERT_STRNEQ(str, cmpstr, strlen(cmpstr), "file_operations"); } From 988af276360bc555c7685d5a607d4da0588616d9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 14 May 2024 17:52:27 +0800 Subject: [PATCH 15/30] selftests/net: reduce xfrm_policy test time The check_random_order test add/get plenty of xfrm rules, which consume a lot time on debug kernel and always TIMEOUT. Let's reduce the test loop and see if it works. Signed-off-by: Hangbin Liu Link: https://lore.kernel.org/r/20240514095227.2597730-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/xfrm_policy.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh index 457789530645..3eeeeffb4005 100755 --- a/tools/testing/selftests/net/xfrm_policy.sh +++ b/tools/testing/selftests/net/xfrm_policy.sh @@ -293,7 +293,7 @@ check_random_order() local ns=$1 local log=$2 - for i in $(seq 100); do + for i in $(seq 50); do ip -net $ns xfrm policy flush for j in $(seq 0 16 255 | sort -R); do ip -net $ns xfrm policy add dst $j.0.0.0/24 dir out priority 10 action allow @@ -306,7 +306,7 @@ check_random_order() done done - for i in $(seq 100); do + for i in $(seq 50); do ip -net $ns xfrm policy flush for j in $(seq 0 16 255 | sort -R); do local addr=$(printf "e000:0000:%02x00::/56" $j) From bb487272380d120295e955ad8acfcbb281b57642 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 14 May 2024 20:11:02 +0800 Subject: [PATCH 16/30] net/ipv6: Fix route deleting failure when metric equals 0 Problem ========= After commit 67f695134703 ("ipv6: Move setting default metric for routes"), we noticed that the logic of assigning the default value of fc_metirc changed in the ioctl process. That is, when users use ioctl(fd, SIOCADDRT, rt) with a non-zero metric to add a route, then they may fail to delete a route with passing in a metric value of 0 to the kernel by ioctl(fd, SIOCDELRT, rt). But iproute can succeed in deleting it. As a reference, when using iproute tools by netlink to delete routes with a metric parameter equals 0, like the command as follows: ip -6 route del fe80::/64 via fe81::5054:ff:fe11:3451 dev eth0 metric 0 the user can still succeed in deleting the route entry with the smallest metric. Root Reason =========== After commit 67f695134703 ("ipv6: Move setting default metric for routes"), When ioctl() pass in SIOCDELRT with a zero metric, rtmsg_to_fib6_config() will set a defalut value (1024) to cfg->fc_metric in kernel, and in ip6_route_del() and the line 4074 at net/ipv3/route.c, it will check by if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) continue; and the condition is true and skip the later procedure (deleting route) because cfg->fc_metric != rt->fib6_metric. But before that commit, cfg->fc_metric is still zero there, so the condition is false and it will do the following procedure (deleting). Solution ======== In order to keep a consistent behaviour across netlink() and ioctl(), we should allow to delete a route with a metric value of 0. So we only do the default setting of fc_metric in route adding. CC: stable@vger.kernel.org # 5.4+ Fixes: 67f695134703 ("ipv6: Move setting default metric for routes") Co-developed-by: Fan Yu Signed-off-by: Fan Yu Signed-off-by: xu xin Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240514201102055dD2Ba45qKbLlUMxu_DTHP@zte.com.cn Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c43b0616742e..bbc2a0dd9314 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4445,7 +4445,7 @@ static void rtmsg_to_fib6_config(struct net *net, .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? : RT6_TABLE_MAIN, .fc_ifindex = rtmsg->rtmsg_ifindex, - .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER, + .fc_metric = rtmsg->rtmsg_metric, .fc_expires = rtmsg->rtmsg_info, .fc_dst_len = rtmsg->rtmsg_dst_len, .fc_src_len = rtmsg->rtmsg_src_len, @@ -4475,6 +4475,9 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) rtnl_lock(); switch (cmd) { case SIOCADDRT: + /* Only do the default setting of fc_metric in route adding */ + if (cfg.fc_metric == 0) + cfg.fc_metric = IP6_RT_PRIO_USER; err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: From a45835a0bb6ef7d5ddbc0714dd760de979cb6ece Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Tue, 14 May 2024 15:57:29 -0400 Subject: [PATCH 17/30] bonding: fix oops during rmmod "rmmod bonding" causes an oops ever since commit cc317ea3d927 ("bonding: remove redundant NULL check in debugfs function"). Here are the relevant functions being called: bonding_exit() bond_destroy_debugfs() debugfs_remove_recursive(bonding_debug_root); bonding_debug_root = NULL; <--------- SET TO NULL HERE bond_netlink_fini() rtnl_link_unregister() __rtnl_link_unregister() unregister_netdevice_many_notify() bond_uninit() bond_debug_unregister() (commit removed check for bonding_debug_root == NULL) debugfs_remove() simple_recursive_removal() down_write() -> OOPS However, reverting the bad commit does not solve the problem completely because the original code contains a race that could cause the same oops, although it was much less likely to be triggered unintentionally: CPU1 rmmod bonding bonding_exit() bond_destroy_debugfs() debugfs_remove_recursive(bonding_debug_root); CPU2 echo -bond0 > /sys/class/net/bonding_masters bond_uninit() bond_debug_unregister() if (!bonding_debug_root) CPU1 bonding_debug_root = NULL; So do NOT revert the bad commit (since the removed checks were racy anyway), and instead change the order of actions taken during module removal. The same oops can also happen if there is an error during module init, so apply the same fix there. Fixes: cc317ea3d927 ("bonding: remove redundant NULL check in debugfs function") Cc: stable@vger.kernel.org Signed-off-by: Tony Battersby Reviewed-by: Simon Horman Acked-by: Jay Vosburgh Link: https://lore.kernel.org/r/641f914f-3216-4eeb-87dd-91b78aa97773@cybernetics.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3b0c7758ea4f..3c3fcce4acd4 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -6477,16 +6477,16 @@ static int __init bonding_init(void) if (res) goto out; + bond_create_debugfs(); + res = register_pernet_subsys(&bond_net_ops); if (res) - goto out; + goto err_net_ops; res = bond_netlink_init(); if (res) goto err_link; - bond_create_debugfs(); - for (i = 0; i < max_bonds; i++) { res = bond_create(&init_net, NULL); if (res) @@ -6501,10 +6501,11 @@ static int __init bonding_init(void) out: return res; err: - bond_destroy_debugfs(); bond_netlink_fini(); err_link: unregister_pernet_subsys(&bond_net_ops); +err_net_ops: + bond_destroy_debugfs(); goto out; } @@ -6513,11 +6514,11 @@ static void __exit bonding_exit(void) { unregister_netdevice_notifier(&bond_netdev_notifier); - bond_destroy_debugfs(); - bond_netlink_fini(); unregister_pernet_subsys(&bond_net_ops); + bond_destroy_debugfs(); + #ifdef CONFIG_NET_POLL_CONTROLLER /* Make sure we don't have an imbalance on our netpoll blocking */ WARN_ON(atomic_read(&netpoll_block_tx)); From fe32622763d8bc864231381f34f7521f8694748b Mon Sep 17 00:00:00 2001 From: Sagar Cheluvegowda Date: Tue, 14 May 2024 17:06:52 -0700 Subject: [PATCH 18/30] dt-bindings: net: qcom: ethernet: Allow dma-coherent On SA8775P, Ethernet DMA controller is coherent with the CPU. allow specifying that. Signed-off-by: Sagar Cheluvegowda Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240514-mark_ethernet_devices_dma_coherent-v4-2-04e1198858c5@quicinc.com Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/qcom,ethqos.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/net/qcom,ethqos.yaml b/Documentation/devicetree/bindings/net/qcom,ethqos.yaml index 69a337c7e345..6672327358bc 100644 --- a/Documentation/devicetree/bindings/net/qcom,ethqos.yaml +++ b/Documentation/devicetree/bindings/net/qcom,ethqos.yaml @@ -61,6 +61,8 @@ properties: iommus: maxItems: 1 + dma-coherent: true + phys: true phy-names: From 67708158e732bf03d076fba1e3d4453fbf8292a2 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Wed, 15 May 2024 11:24:14 +0200 Subject: [PATCH 19/30] idpf: don't skip over ethtool tcp-data-split setting Disabling tcp-data-split on idpf silently fails: # ethtool -G $NETDEV tcp-data-split off # ethtool -g $NETDEV | grep 'TCP data split' TCP data split: on But it works if you also change 'tx' or 'rx': # ethtool -G $NETDEV tcp-data-split off tx 256 # ethtool -g $NETDEV | grep 'TCP data split' TCP data split: off The bug is in idpf_set_ringparam, where it takes a shortcut out if the TX and RX sizes are not changing. Fix it by checking also if the tcp-data-split setting remains unchanged. Only then can the soft reset be skipped. Fixes: 9b1aa3ef2328 ("idpf: add get/set for Ethtool's header split ringparam") Reported-by: Xu Du Closes: https://issues.redhat.com/browse/RHEL-36182 Signed-off-by: Michal Schmidt Reviewed-by: Alexander Lobakin Link: https://lore.kernel.org/r/20240515092414.158079-1-mschmidt@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_ethtool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_ethtool.c b/drivers/net/ethernet/intel/idpf/idpf_ethtool.c index 986d429d1175..6972d728431c 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_ethtool.c +++ b/drivers/net/ethernet/intel/idpf/idpf_ethtool.c @@ -376,7 +376,8 @@ static int idpf_set_ringparam(struct net_device *netdev, new_tx_count); if (new_tx_count == vport->txq_desc_count && - new_rx_count == vport->rxq_desc_count) + new_rx_count == vport->rxq_desc_count && + kring->tcp_data_split == idpf_vport_get_hsplit(vport)) goto unlock_mutex; if (!idpf_vport_set_hsplit(vport, kring->tcp_data_split)) { From e03e7f20ebf7e1611d40d1fdc1bde900fd3335f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 May 2024 14:29:34 +0000 Subject: [PATCH 20/30] netrom: fix possible dead-lock in nr_rt_ioctl() syzbot loves netrom, and found a possible deadlock in nr_rt_ioctl [1] Make sure we always acquire nr_node_list_lock before nr_node_lock(nr_node) [1] WARNING: possible circular locking dependency detected 6.9.0-rc7-syzkaller-02147-g654de42f3fc6 #0 Not tainted ------------------------------------------------------ syz-executor350/5129 is trying to acquire lock: ffff8880186e2070 (&nr_node->node_lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline] ffff8880186e2070 (&nr_node->node_lock){+...}-{2:2}, at: nr_node_lock include/net/netrom.h:152 [inline] ffff8880186e2070 (&nr_node->node_lock){+...}-{2:2}, at: nr_dec_obs net/netrom/nr_route.c:464 [inline] ffff8880186e2070 (&nr_node->node_lock){+...}-{2:2}, at: nr_rt_ioctl+0x1bb/0x1090 net/netrom/nr_route.c:697 but task is already holding lock: ffffffff8f7053b8 (nr_node_list_lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline] ffffffff8f7053b8 (nr_node_list_lock){+...}-{2:2}, at: nr_dec_obs net/netrom/nr_route.c:462 [inline] ffffffff8f7053b8 (nr_node_list_lock){+...}-{2:2}, at: nr_rt_ioctl+0x10a/0x1090 net/netrom/nr_route.c:697 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (nr_node_list_lock){+...}-{2:2}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:126 [inline] _raw_spin_lock_bh+0x35/0x50 kernel/locking/spinlock.c:178 spin_lock_bh include/linux/spinlock.h:356 [inline] nr_remove_node net/netrom/nr_route.c:299 [inline] nr_del_node+0x4b4/0x820 net/netrom/nr_route.c:355 nr_rt_ioctl+0xa95/0x1090 net/netrom/nr_route.c:683 sock_do_ioctl+0x158/0x460 net/socket.c:1222 sock_ioctl+0x629/0x8e0 net/socket.c:1341 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:904 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:890 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (&nr_node->node_lock){+...}-{2:2}: check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869 __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:126 [inline] _raw_spin_lock_bh+0x35/0x50 kernel/locking/spinlock.c:178 spin_lock_bh include/linux/spinlock.h:356 [inline] nr_node_lock include/net/netrom.h:152 [inline] nr_dec_obs net/netrom/nr_route.c:464 [inline] nr_rt_ioctl+0x1bb/0x1090 net/netrom/nr_route.c:697 sock_do_ioctl+0x158/0x460 net/socket.c:1222 sock_ioctl+0x629/0x8e0 net/socket.c:1341 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:904 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:890 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(nr_node_list_lock); lock(&nr_node->node_lock); lock(nr_node_list_lock); lock(&nr_node->node_lock); *** DEADLOCK *** 1 lock held by syz-executor350/5129: #0: ffffffff8f7053b8 (nr_node_list_lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline] #0: ffffffff8f7053b8 (nr_node_list_lock){+...}-{2:2}, at: nr_dec_obs net/netrom/nr_route.c:462 [inline] #0: ffffffff8f7053b8 (nr_node_list_lock){+...}-{2:2}, at: nr_rt_ioctl+0x10a/0x1090 net/netrom/nr_route.c:697 stack backtrace: CPU: 0 PID: 5129 Comm: syz-executor350 Not tainted 6.9.0-rc7-syzkaller-02147-g654de42f3fc6 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2187 check_prev_add kernel/locking/lockdep.c:3134 [inline] check_prevs_add kernel/locking/lockdep.c:3253 [inline] validate_chain+0x18cb/0x58e0 kernel/locking/lockdep.c:3869 __lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5754 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:126 [inline] _raw_spin_lock_bh+0x35/0x50 kernel/locking/spinlock.c:178 spin_lock_bh include/linux/spinlock.h:356 [inline] nr_node_lock include/net/netrom.h:152 [inline] nr_dec_obs net/netrom/nr_route.c:464 [inline] nr_rt_ioctl+0x1bb/0x1090 net/netrom/nr_route.c:697 sock_do_ioctl+0x158/0x460 net/socket.c:1222 sock_ioctl+0x629/0x8e0 net/socket.c:1341 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:904 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:890 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240515142934.3708038-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/netrom/nr_route.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 70480869ad1c..bd2b17b219ae 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -285,22 +285,14 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, return 0; } -static inline void __nr_remove_node(struct nr_node *nr_node) +static void nr_remove_node_locked(struct nr_node *nr_node) { + lockdep_assert_held(&nr_node_list_lock); + hlist_del_init(&nr_node->node_node); nr_node_put(nr_node); } -#define nr_remove_node_locked(__node) \ - __nr_remove_node(__node) - -static void nr_remove_node(struct nr_node *nr_node) -{ - spin_lock_bh(&nr_node_list_lock); - __nr_remove_node(nr_node); - spin_unlock_bh(&nr_node_list_lock); -} - static inline void __nr_remove_neigh(struct nr_neigh *nr_neigh) { hlist_del_init(&nr_neigh->neigh_node); @@ -339,6 +331,7 @@ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct n return -EINVAL; } + spin_lock_bh(&nr_node_list_lock); nr_node_lock(nr_node); for (i = 0; i < nr_node->count; i++) { if (nr_node->routes[i].neighbour == nr_neigh) { @@ -352,7 +345,7 @@ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct n nr_node->count--; if (nr_node->count == 0) { - nr_remove_node(nr_node); + nr_remove_node_locked(nr_node); } else { switch (i) { case 0: @@ -367,12 +360,14 @@ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct n nr_node_put(nr_node); } nr_node_unlock(nr_node); + spin_unlock_bh(&nr_node_list_lock); return 0; } } nr_neigh_put(nr_neigh); nr_node_unlock(nr_node); + spin_unlock_bh(&nr_node_list_lock); nr_node_put(nr_node); return -EINVAL; From fa033def4171d2e4e29d5e3714fb2a5b1fc077e8 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Wed, 15 May 2024 11:31:25 -0500 Subject: [PATCH 21/30] virtio_net: Fix missed rtnl_unlock The rtnl_lock would stay locked if allocating promisc_allmulti failed. Also changed the allocation to GFP_KERNEL. Fixes: ff7c7d9f5261 ("virtio_net: Remove command data from control_buf") Reported-by: Eric Dumazet Link: https://lore.kernel.org/netdev/CANn89iLazVaUCvhPm6RPJJ0owra_oFnx7Fhc8d60gV-65ad3WQ@mail.gmail.com/ Signed-off-by: Daniel Jurgens Reviewed-by: Brett Creeley Reviewed-by: Eric Dumazet Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://lore.kernel.org/r/20240515163125.569743-1-danielj@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 19a9b50646c7..4e1a0fc0d555 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2902,14 +2902,14 @@ static void virtnet_rx_mode_work(struct work_struct *work) if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) return; - rtnl_lock(); - - promisc_allmulti = kzalloc(sizeof(*promisc_allmulti), GFP_ATOMIC); + promisc_allmulti = kzalloc(sizeof(*promisc_allmulti), GFP_KERNEL); if (!promisc_allmulti) { dev_warn(&dev->dev, "Failed to set RX mode, no memory.\n"); return; } + rtnl_lock(); + *promisc_allmulti = !!(dev->flags & IFF_PROMISC); sg_init_one(sg, promisc_allmulti, sizeof(*promisc_allmulti)); From 581073f626e387d3e7eed55c48c8495584ead7ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 May 2024 16:33:58 +0000 Subject: [PATCH 22/30] af_packet: do not call packet_read_pending() from tpacket_destruct_skb() trafgen performance considerably sank on hosts with many cores after the blamed commit. packet_read_pending() is very expensive, and calling it in af_packet fast path defeats Daniel intent in commit b013840810c2 ("packet: use percpu mmap tx frame pending refcount") tpacket_destruct_skb() makes room for one packet, we can immediately wakeup a producer, no need to completely drain the tx ring. Fixes: 89ed5b519004 ("af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET") Signed-off-by: Eric Dumazet Cc: Neil Horman Cc: Daniel Borkmann Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240515163358.4105915-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8c6d3fbb4ed8..ea3ebc160e25 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2522,8 +2522,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); - if (!packet_read_pending(&po->tx_ring)) - complete(&po->skb_completion); + complete(&po->skb_completion); } sock_wfree(skb); From 68067f065ee730c7c67b361c3c81808d25d5a90b Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Fri, 17 May 2024 14:51:38 +0800 Subject: [PATCH 23/30] net: wangxun: fix to change Rx features Fix the issue where some Rx features cannot be changed. When using ethtool -K to turn off rx offload, it returns error and displays "Could not change any device features". And netdev->features is not assigned a new value to actually configure the hardware. Fixes: 6dbedcffcf54 ("net: libwx: Implement xx_set_features ops") Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 6fae161cbcb8..667a5675998c 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -2690,12 +2690,14 @@ int wx_set_features(struct net_device *netdev, netdev_features_t features) wx->rss_enabled = false; } + netdev->features = features; + if (changed & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX)) wx_set_rx_mode(netdev); - return 1; + return 0; } EXPORT_SYMBOL(wx_set_features); From ac71ab7816b675f1c9614015bd87bfccb456c394 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Fri, 17 May 2024 14:51:39 +0800 Subject: [PATCH 24/30] net: wangxun: match VLAN CTAG and STAG features Hardware requires VLAN CTAG and STAG configuration always matches. And whether VLAN CTAG or STAG changes, the configuration needs to be changed as well. Fixes: 6670f1ece2c8 ("net: txgbe: Add netdev features support") Signed-off-by: Jiawen Wu Reviewed-by: Sai Krishna Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 46 +++++++++++++++++++ drivers/net/ethernet/wangxun/libwx/wx_lib.h | 2 + drivers/net/ethernet/wangxun/ngbe/ngbe_main.c | 1 + .../net/ethernet/wangxun/txgbe/txgbe_main.c | 1 + 4 files changed, 50 insertions(+) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 667a5675998c..d0cb09a4bd3d 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -2701,6 +2701,52 @@ int wx_set_features(struct net_device *netdev, netdev_features_t features) } EXPORT_SYMBOL(wx_set_features); +#define NETIF_VLAN_STRIPPING_FEATURES (NETIF_F_HW_VLAN_CTAG_RX | \ + NETIF_F_HW_VLAN_STAG_RX) + +#define NETIF_VLAN_INSERTION_FEATURES (NETIF_F_HW_VLAN_CTAG_TX | \ + NETIF_F_HW_VLAN_STAG_TX) + +#define NETIF_VLAN_FILTERING_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \ + NETIF_F_HW_VLAN_STAG_FILTER) + +netdev_features_t wx_fix_features(struct net_device *netdev, + netdev_features_t features) +{ + netdev_features_t changed = netdev->features ^ features; + struct wx *wx = netdev_priv(netdev); + + if (changed & NETIF_VLAN_STRIPPING_FEATURES) { + if ((features & NETIF_VLAN_STRIPPING_FEATURES) != NETIF_VLAN_STRIPPING_FEATURES && + (features & NETIF_VLAN_STRIPPING_FEATURES) != 0) { + features &= ~NETIF_VLAN_STRIPPING_FEATURES; + features |= netdev->features & NETIF_VLAN_STRIPPING_FEATURES; + wx_err(wx, "802.1Q and 802.1ad VLAN stripping must be either both on or both off."); + } + } + + if (changed & NETIF_VLAN_INSERTION_FEATURES) { + if ((features & NETIF_VLAN_INSERTION_FEATURES) != NETIF_VLAN_INSERTION_FEATURES && + (features & NETIF_VLAN_INSERTION_FEATURES) != 0) { + features &= ~NETIF_VLAN_INSERTION_FEATURES; + features |= netdev->features & NETIF_VLAN_INSERTION_FEATURES; + wx_err(wx, "802.1Q and 802.1ad VLAN insertion must be either both on or both off."); + } + } + + if (changed & NETIF_VLAN_FILTERING_FEATURES) { + if ((features & NETIF_VLAN_FILTERING_FEATURES) != NETIF_VLAN_FILTERING_FEATURES && + (features & NETIF_VLAN_FILTERING_FEATURES) != 0) { + features &= ~NETIF_VLAN_FILTERING_FEATURES; + features |= netdev->features & NETIF_VLAN_FILTERING_FEATURES; + wx_err(wx, "802.1Q and 802.1ad VLAN filtering must be either both on or both off."); + } + } + + return features; +} +EXPORT_SYMBOL(wx_fix_features); + void wx_set_ring(struct wx *wx, u32 new_tx_count, u32 new_rx_count, struct wx_ring *temp_ring) { diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.h b/drivers/net/ethernet/wangxun/libwx/wx_lib.h index ec909e876720..c41b29ea812f 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.h @@ -30,6 +30,8 @@ int wx_setup_resources(struct wx *wx); void wx_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats); int wx_set_features(struct net_device *netdev, netdev_features_t features); +netdev_features_t wx_fix_features(struct net_device *netdev, + netdev_features_t features); void wx_set_ring(struct wx *wx, u32 new_tx_count, u32 new_rx_count, struct wx_ring *temp_ring); diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c index fdd6b4f70b7a..e894e01d030d 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c @@ -499,6 +499,7 @@ static const struct net_device_ops ngbe_netdev_ops = { .ndo_start_xmit = wx_xmit_frame, .ndo_set_rx_mode = wx_set_rx_mode, .ndo_set_features = wx_set_features, + .ndo_fix_features = wx_fix_features, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = wx_set_mac, .ndo_get_stats64 = wx_get_stats64, diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index bd4624d14ca0..b3c0058b045d 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -428,6 +428,7 @@ static const struct net_device_ops txgbe_netdev_ops = { .ndo_start_xmit = wx_xmit_frame, .ndo_set_rx_mode = wx_set_rx_mode, .ndo_set_features = wx_set_features, + .ndo_fix_features = wx_fix_features, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = wx_set_mac, .ndo_get_stats64 = wx_get_stats64, From 1d3c6414950badaa38002af3b5857e01a21f01e9 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Fri, 17 May 2024 14:51:40 +0800 Subject: [PATCH 25/30] net: txgbe: fix to control VLAN strip When VLAN tag strip is changed to enable or disable, the hardware requires the Rx ring to be in a disabled state, otherwise the feature cannot be changed. Fixes: f3b03c655f67 ("net: wangxun: Implement vlan add and kill functions") Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 2 ++ drivers/net/ethernet/wangxun/libwx/wx_lib.c | 6 ++-- drivers/net/ethernet/wangxun/libwx/wx_type.h | 22 ++++++++++++++ .../net/ethernet/wangxun/ngbe/ngbe_ethtool.c | 18 +++++++---- .../ethernet/wangxun/txgbe/txgbe_ethtool.c | 18 +++++++---- .../net/ethernet/wangxun/txgbe/txgbe_main.c | 30 +++++++++++++++++++ .../net/ethernet/wangxun/txgbe/txgbe_type.h | 1 + 7 files changed, 84 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index 3662483bfe2e..7c4b6881a93f 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -1958,6 +1958,8 @@ int wx_sw_init(struct wx *wx) return -ENOMEM; } + bitmap_zero(wx->state, WX_STATE_NBITS); + return 0; } EXPORT_SYMBOL(wx_sw_init); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index d0cb09a4bd3d..07ba3a270a14 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -2692,9 +2692,9 @@ int wx_set_features(struct net_device *netdev, netdev_features_t features) netdev->features = features; - if (changed & - (NETIF_F_HW_VLAN_CTAG_RX | - NETIF_F_HW_VLAN_STAG_RX)) + if (wx->mac.type == wx_mac_sp && changed & NETIF_F_HW_VLAN_CTAG_RX) + wx->do_reset(netdev); + else if (changed & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER)) wx_set_rx_mode(netdev); return 0; diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index 1fdeb464d5f4..5aaf7b1fa2db 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -982,8 +982,13 @@ struct wx_hw_stats { u64 qmprc; }; +enum wx_state { + WX_STATE_RESETTING, + WX_STATE_NBITS, /* must be last */ +}; struct wx { unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; + DECLARE_BITMAP(state, WX_STATE_NBITS); void *priv; u8 __iomem *hw_addr; @@ -1071,6 +1076,8 @@ struct wx { u64 hw_csum_rx_good; u64 hw_csum_rx_error; u64 alloc_rx_buff_failed; + + void (*do_reset)(struct net_device *netdev); }; #define WX_INTR_ALL (~0ULL) @@ -1131,4 +1138,19 @@ static inline struct wx *phylink_to_wx(struct phylink_config *config) return container_of(config, struct wx, phylink_config); } +static inline int wx_set_state_reset(struct wx *wx) +{ + u8 timeout = 50; + + while (test_and_set_bit(WX_STATE_RESETTING, wx->state)) { + timeout--; + if (!timeout) + return -EBUSY; + + usleep_range(1000, 2000); + } + + return 0; +} + #endif /* _WX_TYPE_H_ */ diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_ethtool.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_ethtool.c index 786a652ae64f..46a5a3e95202 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_ethtool.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_ethtool.c @@ -52,7 +52,7 @@ static int ngbe_set_ringparam(struct net_device *netdev, struct wx *wx = netdev_priv(netdev); u32 new_rx_count, new_tx_count; struct wx_ring *temp_ring; - int i; + int i, err = 0; new_tx_count = clamp_t(u32, ring->tx_pending, WX_MIN_TXD, WX_MAX_TXD); new_tx_count = ALIGN(new_tx_count, WX_REQ_TX_DESCRIPTOR_MULTIPLE); @@ -64,6 +64,10 @@ static int ngbe_set_ringparam(struct net_device *netdev, new_rx_count == wx->rx_ring_count) return 0; + err = wx_set_state_reset(wx); + if (err) + return err; + if (!netif_running(wx->netdev)) { for (i = 0; i < wx->num_tx_queues; i++) wx->tx_ring[i]->count = new_tx_count; @@ -72,14 +76,16 @@ static int ngbe_set_ringparam(struct net_device *netdev, wx->tx_ring_count = new_tx_count; wx->rx_ring_count = new_rx_count; - return 0; + goto clear_reset; } /* allocate temporary buffer to store rings in */ i = max_t(int, wx->num_tx_queues, wx->num_rx_queues); temp_ring = kvmalloc_array(i, sizeof(struct wx_ring), GFP_KERNEL); - if (!temp_ring) - return -ENOMEM; + if (!temp_ring) { + err = -ENOMEM; + goto clear_reset; + } ngbe_down(wx); @@ -89,7 +95,9 @@ static int ngbe_set_ringparam(struct net_device *netdev, wx_configure(wx); ngbe_up(wx); - return 0; +clear_reset: + clear_bit(WX_STATE_RESETTING, wx->state); + return err; } static int ngbe_set_channels(struct net_device *dev, diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_ethtool.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_ethtool.c index db675512ce4d..31fde3fa7c6b 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_ethtool.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_ethtool.c @@ -19,7 +19,7 @@ static int txgbe_set_ringparam(struct net_device *netdev, struct wx *wx = netdev_priv(netdev); u32 new_rx_count, new_tx_count; struct wx_ring *temp_ring; - int i; + int i, err = 0; new_tx_count = clamp_t(u32, ring->tx_pending, WX_MIN_TXD, WX_MAX_TXD); new_tx_count = ALIGN(new_tx_count, WX_REQ_TX_DESCRIPTOR_MULTIPLE); @@ -31,6 +31,10 @@ static int txgbe_set_ringparam(struct net_device *netdev, new_rx_count == wx->rx_ring_count) return 0; + err = wx_set_state_reset(wx); + if (err) + return err; + if (!netif_running(wx->netdev)) { for (i = 0; i < wx->num_tx_queues; i++) wx->tx_ring[i]->count = new_tx_count; @@ -39,14 +43,16 @@ static int txgbe_set_ringparam(struct net_device *netdev, wx->tx_ring_count = new_tx_count; wx->rx_ring_count = new_rx_count; - return 0; + goto clear_reset; } /* allocate temporary buffer to store rings in */ i = max_t(int, wx->num_tx_queues, wx->num_rx_queues); temp_ring = kvmalloc_array(i, sizeof(struct wx_ring), GFP_KERNEL); - if (!temp_ring) - return -ENOMEM; + if (!temp_ring) { + err = -ENOMEM; + goto clear_reset; + } txgbe_down(wx); @@ -55,7 +61,9 @@ static int txgbe_set_ringparam(struct net_device *netdev, txgbe_up(wx); - return 0; +clear_reset: + clear_bit(WX_STATE_RESETTING, wx->state); + return err; } static int txgbe_set_channels(struct net_device *dev, diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index b3c0058b045d..8c7a74981b90 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -269,6 +269,8 @@ static int txgbe_sw_init(struct wx *wx) wx->tx_work_limit = TXGBE_DEFAULT_TX_WORK; wx->rx_work_limit = TXGBE_DEFAULT_RX_WORK; + wx->do_reset = txgbe_do_reset; + return 0; } @@ -421,6 +423,34 @@ int txgbe_setup_tc(struct net_device *dev, u8 tc) return 0; } +static void txgbe_reinit_locked(struct wx *wx) +{ + int err = 0; + + netif_trans_update(wx->netdev); + + err = wx_set_state_reset(wx); + if (err) { + wx_err(wx, "wait device reset timeout\n"); + return; + } + + txgbe_down(wx); + txgbe_up(wx); + + clear_bit(WX_STATE_RESETTING, wx->state); +} + +void txgbe_do_reset(struct net_device *netdev) +{ + struct wx *wx = netdev_priv(netdev); + + if (netif_running(netdev)) + txgbe_reinit_locked(wx); + else + txgbe_reset(wx); +} + static const struct net_device_ops txgbe_netdev_ops = { .ndo_open = txgbe_open, .ndo_stop = txgbe_close, diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h index 1b4ff50d5857..f434a7865cb7 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h @@ -134,6 +134,7 @@ extern char txgbe_driver_name[]; void txgbe_down(struct wx *wx); void txgbe_up(struct wx *wx); int txgbe_setup_tc(struct net_device *dev, u8 tc); +void txgbe_do_reset(struct net_device *netdev); #define NODE_PROP(_NAME, _PROP) \ (const struct software_node) { \ From 6e828dc60e509b79ef09882264952f341cb58425 Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Mon, 13 May 2024 18:22:47 +0100 Subject: [PATCH 26/30] l2tp: fix ICMP error handling for UDP-encap sockets Since commit a36e185e8c85 ("udp: Handle ICMP errors for tunnels with same destination port on both endpoints") UDP's handling of ICMP errors has allowed for UDP-encap tunnels to determine socket associations in scenarios where the UDP hash lookup could not. Subsequently, commit d26796ae58940 ("udp: check udp sock encap_type in __udp_lib_err") subtly tweaked the approach such that UDP ICMP error handling would be skipped for any UDP socket which has encapsulation enabled. In the case of L2TP tunnel sockets using UDP-encap, this latter modification effectively broke ICMP error reporting for the L2TP control plane. To a degree this isn't catastrophic inasmuch as the L2TP control protocol defines a reliable transport on top of the underlying packet switching network which will eventually detect errors and time out. However, paying attention to the ICMP error reporting allows for more timely detection of errors in L2TP userspace, and aids in debugging connectivity issues. Reinstate ICMP error handling for UDP encap L2TP tunnels: * implement struct udp_tunnel_sock_cfg .encap_err_rcv in order to allow the L2TP code to handle ICMP errors; * only implement error-handling for tunnels which have a managed socket: unmanaged tunnels using a kernel socket have no userspace to report errors back to; * flag the error on the socket, which allows for userspace to get an error such as -ECONNREFUSED back from sendmsg/recvmsg; * pass the error into ip[v6]_icmp_error() which allows for userspace to get extended error information via. MSG_ERRQUEUE. Fixes: d26796ae5894 ("udp: check udp sock encap_type in __udp_lib_err") Signed-off-by: Tom Parkin Link: https://lore.kernel.org/r/20240513172248.623261-1-tparkin@katalix.com Signed-off-by: Jakub Kicinski --- net/l2tp/l2tp_core.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 7d519a46a844..88a34db265d8 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -910,22 +910,20 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) return 1; } -/* UDP encapsulation receive handler. See net/ipv4/udp.c. - * Return codes: - * 0 : success. - * <0: error - * >0: skb should be passed up to userspace as UDP. +/* UDP encapsulation receive and error receive handlers. + * See net/ipv4/udp.c for details. + * + * Note that these functions are called from inside an + * RCU-protected region, but without the socket being locked. + * + * Hence we use rcu_dereference_sk_user_data to access the + * tunnel data structure rather the usual l2tp_sk_to_tunnel + * accessor function. */ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct l2tp_tunnel *tunnel; - /* Note that this is called from the encap_rcv hook inside an - * RCU-protected region, but without the socket being locked. - * Hence we use rcu_dereference_sk_user_data to access the - * tunnel data structure rather the usual l2tp_sk_to_tunnel - * accessor function. - */ tunnel = rcu_dereference_sk_user_data(sk); if (!tunnel) goto pass_up; @@ -942,6 +940,29 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(l2tp_udp_encap_recv); +static void l2tp_udp_encap_err_recv(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) +{ + struct l2tp_tunnel *tunnel; + + tunnel = rcu_dereference_sk_user_data(sk); + if (!tunnel || tunnel->fd < 0) + return; + + sk->sk_err = err; + sk_error_report(sk); + + if (ip_hdr(skb)->version == IPVERSION) { + if (inet_test_bit(RECVERR, sk)) + return ip_icmp_error(sk, skb, err, port, info, payload); +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (inet6_test_bit(RECVERR6, sk)) + return ipv6_icmp_error(sk, skb, err, port, info, payload); +#endif + } +} + /************************************************************************ * Transmit handling ***********************************************************************/ @@ -1516,6 +1537,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, .sk_user_data = tunnel, .encap_type = UDP_ENCAP_L2TPINUDP, .encap_rcv = l2tp_udp_encap_recv, + .encap_err_rcv = l2tp_udp_encap_err_recv, .encap_destroy = l2tp_udp_encap_destroy, }; From ce08eeb59df090ca74ab6c035d8636ca75680cb4 Mon Sep 17 00:00:00 2001 From: Ravi Gunasekaran Date: Thu, 16 May 2024 11:19:32 +0530 Subject: [PATCH 27/30] dt-bindings: net: ti: Update maintainers list Update the list with the current maintainers of TI's CPSW ethernet peripheral. Signed-off-by: Ravi Gunasekaran Acked-by: Conor Dooley Acked-by: Roger Quadros Link: https://lore.kernel.org/r/20240516054932.27597-1-r-gunasekaran@ti.com Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml | 1 - Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml | 1 - Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml | 1 - 3 files changed, 3 deletions(-) diff --git a/Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml b/Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml index d5bd93ee4dbb..d14ca81f70e0 100644 --- a/Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml +++ b/Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml @@ -8,7 +8,6 @@ title: TI SoC Ethernet Switch Controller (CPSW) maintainers: - Siddharth Vadapalli - - Ravi Gunasekaran - Roger Quadros description: diff --git a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml index 73ed5951d296..02b6d32003cc 100644 --- a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml +++ b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml @@ -8,7 +8,6 @@ title: The TI AM654x/J721E/AM642x SoC Gigabit Ethernet MAC (Media Access Control maintainers: - Siddharth Vadapalli - - Ravi Gunasekaran - Roger Quadros description: diff --git a/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml b/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml index b1c875325776..3888692275ad 100644 --- a/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml +++ b/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml @@ -8,7 +8,6 @@ title: The TI AM654x/J721E Common Platform Time Sync (CPTS) module maintainers: - Siddharth Vadapalli - - Ravi Gunasekaran - Roger Quadros description: |+ From 31279b0cb45f2f6be1f0e7324aeec26578a6e65e Mon Sep 17 00:00:00 2001 From: Ravi Gunasekaran Date: Thu, 16 May 2024 13:55:45 +0530 Subject: [PATCH 28/30] MAINTAINERS: net: Update reviewers for TI's Ethernet drivers Remove myself as reviewer for TI's ethernet drivers Signed-off-by: Ravi Gunasekaran Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20240516082545.6412-1-r-gunasekaran@ti.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index be7f72766dc6..b76072a35a29 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22275,7 +22275,6 @@ F: drivers/counter/ti-eqep.c TI ETHERNET SWITCH DRIVER (CPSW) R: Siddharth Vadapalli -R: Ravi Gunasekaran R: Roger Quadros L: linux-omap@vger.kernel.org L: netdev@vger.kernel.org From f0fa84116434b50a8d249d0da8852f410a21ba98 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 17 May 2024 07:01:21 +0200 Subject: [PATCH 29/30] net: dsa: microchip: Correct initialization order for KSZ88x3 ports Adjust the initialization sequence of KSZ88x3 switches to enable 802.1p priority control on Port 2 before configuring Port 1. This change ensures the apptrust functionality on Port 1 operates correctly, as it depends on the priority settings of Port 2. The prior initialization sequence incorrectly configured Port 1 first, which could lead to functional discrepancies. Fixes: a1ea57710c9d ("net: dsa: microchip: dcb: add special handling for KSZ88X3 family") Signed-off-by: Oleksij Rempel Reviewed-by: Hariprasad Kelam Acked-by: Arun Ramadoss Link: https://lore.kernel.org/r/20240517050121.2174412-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_dcb.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz_dcb.c b/drivers/net/dsa/microchip/ksz_dcb.c index a97106327562..086bc9b3cf53 100644 --- a/drivers/net/dsa/microchip/ksz_dcb.c +++ b/drivers/net/dsa/microchip/ksz_dcb.c @@ -805,5 +805,15 @@ int ksz_dcb_init(struct ksz_device *dev) if (ret) return ret; + /* Enable 802.1p priority control on Port 2 during switch initialization. + * This setup is critical for the apptrust functionality on Port 1, which + * relies on the priority settings of Port 2. Note: Port 1 is naturally + * configured before Port 2, necessitating this configuration order. + */ + if (ksz_is_ksz88x3(dev)) + return ksz_prmw8(dev, KSZ_PORT_2, KSZ8_REG_PORT_1_CTRL_0, + KSZ8_PORT_802_1P_ENABLE, + KSZ8_PORT_802_1P_ENABLE); + return 0; } From fe56d6e4a99a40f50e64d5a8043f1fa838b1f7a1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 16 May 2024 08:25:13 -0700 Subject: [PATCH 30/30] selftests: net: local_termination: annotate the expected failures Vladimir said when adding this test: The bridge driver fares particularly badly [...] mainly because it does not implement IFF_UNICAST_FLT. See commit 90b9566aa5cd ("selftests: forwarding: add a test for local_termination.sh"). We don't want to hide the known gaps, but having a test which always fails prevents us from catching regressions. Report the cases we know may fail as XFAIL. Reviewed-by: Simon Horman Reviewed-by: Hangbin Liu Reviewed-by: Petr Machata Link: https://lore.kernel.org/r/20240516152513.1115270-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../net/forwarding/local_termination.sh | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index c5b0cbc85b3e..4b364cdf3ef0 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -155,25 +155,30 @@ run_test() "$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ - "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ + "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \ "$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, allmulti" \ - "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name \ + "Unicast IPv4 to unknown MAC address, allmulti" \ + "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Multicast IPv4 to joined group" \ "$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Multicast IPv4 to unknown group" \ - "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name \ + "Multicast IPv4 to unknown group" \ + "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \ @@ -187,9 +192,10 @@ run_test() "$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \ true - check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ - "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ + "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ + false check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \