From 91a46c6d1b4fcbfa4773df9421b8ad3e58088101 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Fri, 4 Sep 2020 08:49:55 +0200 Subject: xfrm: clone XFRMA_REPLAY_ESN_VAL in xfrm_do_migrate XFRMA_REPLAY_ESN_VAL was not cloned completely from the old to the new. Migrate this attribute during XFRMA_MSG_MIGRATE v1->v2: - move curleft cloning to a separate patch Fixes: af2f464e326e ("xfrm: Assign esn pointers when cloning a state") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2737d24ec244..9e806c781025 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1773,21 +1773,17 @@ static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_es static inline int xfrm_replay_clone(struct xfrm_state *x, struct xfrm_state *orig) { - x->replay_esn = kzalloc(xfrm_replay_state_esn_len(orig->replay_esn), + + x->replay_esn = kmemdup(orig->replay_esn, + xfrm_replay_state_esn_len(orig->replay_esn), GFP_KERNEL); if (!x->replay_esn) return -ENOMEM; - - x->replay_esn->bmp_len = orig->replay_esn->bmp_len; - x->replay_esn->replay_window = orig->replay_esn->replay_window; - - x->preplay_esn = kmemdup(x->replay_esn, - xfrm_replay_state_esn_len(x->replay_esn), + x->preplay_esn = kmemdup(orig->preplay_esn, + xfrm_replay_state_esn_len(orig->preplay_esn), GFP_KERNEL); - if (!x->preplay_esn) { - kfree(x->replay_esn); + if (!x->preplay_esn) return -ENOMEM; - } return 0; } -- cgit v1.2.3 From 65c204398928f9c79f1a29912b410439f7052635 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Sat, 19 Sep 2020 22:01:34 -0700 Subject: bpf: Prevent .BTF section elimination Systems with memory or disk constraints often reduce the kernel footprint by configuring LD_DEAD_CODE_DATA_ELIMINATION. However, this can result in removal of any BTF information. Use the KEEP() macro to preserve the BTF data as done with other important sections, while still allowing for smaller kernels. Fixes: 90ceddcb4950 ("bpf: Support llvm-objcopy for vmlinux BTF") Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/a635b5d3e2da044e7b51ec1315e8910fbce0083f.1600417359.git.Tony.Ambardar@gmail.com --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5430febd34be..7636bc71c71f 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -661,7 +661,7 @@ #define BTF \ .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \ __start_BTF = .; \ - *(.BTF) \ + KEEP(*(.BTF)) \ __stop_BTF = .; \ } \ . = ALIGN(4); \ -- cgit v1.2.3 From e49d8c22f1261c43a986a7fdbf677ac309682a07 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 22 Sep 2020 20:56:23 -0700 Subject: net_sched: defer tcf_idr_insert() in tcf_action_init_1() All TC actions call tcf_idr_insert() for new action at the end of their ->init(), so we can actually move it to a central place in tcf_action_init_1(). And once the action is inserted into the global IDR, other parallel process could free it immediately as its refcnt is still 1, so we can not fail after this, we need to move it after the goto action validation to avoid handling the failure case after insertion. This is found during code review, is not directly triggered by syzbot. And this prepares for the next patch. Cc: Vlad Buslov Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/act_api.h | 2 -- net/sched/act_api.c | 38 ++++++++++++++++++++------------------ net/sched/act_bpf.c | 4 +--- net/sched/act_connmark.c | 1 - net/sched/act_csum.c | 3 --- net/sched/act_ct.c | 2 -- net/sched/act_ctinfo.c | 3 --- net/sched/act_gact.c | 2 -- net/sched/act_gate.c | 3 --- net/sched/act_ife.c | 3 --- net/sched/act_ipt.c | 2 -- net/sched/act_mirred.c | 2 -- net/sched/act_mpls.c | 2 -- net/sched/act_nat.c | 3 --- net/sched/act_pedit.c | 2 -- net/sched/act_police.c | 2 -- net/sched/act_sample.c | 2 -- net/sched/act_simple.c | 2 -- net/sched/act_skbedit.c | 2 -- net/sched/act_skbmod.c | 2 -- net/sched/act_tunnel_key.c | 3 --- net/sched/act_vlan.c | 2 -- 22 files changed, 21 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index cb382a89ea58..87214927314a 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -166,8 +166,6 @@ int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); -void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); - void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 063d8aaf2900..0030f00234ee 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -467,17 +467,6 @@ int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, } EXPORT_SYMBOL(tcf_idr_create_from_flags); -void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) -{ - struct tcf_idrinfo *idrinfo = tn->idrinfo; - - mutex_lock(&idrinfo->lock); - /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ - WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); - mutex_unlock(&idrinfo->lock); -} -EXPORT_SYMBOL(tcf_idr_insert); - /* Cleanup idr index that was allocated but not initialized. */ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) @@ -902,6 +891,16 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; +static void tcf_idr_insert(struct tc_action *a) +{ + struct tcf_idrinfo *idrinfo = a->idrinfo; + + mutex_lock(&idrinfo->lock); + /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); + mutex_unlock(&idrinfo->lock); +} + struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, @@ -989,6 +988,16 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err < 0) goto err_mod; + if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN) && + !rcu_access_pointer(a->goto_chain)) { + tcf_action_destroy_1(a, bind); + NL_SET_ERR_MSG(extack, "can't use goto chain with NULL chain"); + return ERR_PTR(-EINVAL); + } + + if (err == ACT_P_CREATED) + tcf_idr_insert(a); + if (!name && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); @@ -1002,13 +1011,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err != ACT_P_CREATED) module_put(a_o->owner); - if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN) && - !rcu_access_pointer(a->goto_chain)) { - tcf_action_destroy_1(a, bind); - NL_SET_ERR_MSG(extack, "can't use goto chain with NULL chain"); - return ERR_PTR(-EINVAL); - } - return a; err_mod: diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 54d5652cfe6c..a4c7ba35a343 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -365,9 +365,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (res == ACT_P_CREATED) { - tcf_idr_insert(tn, *act); - } else { + if (res != ACT_P_CREATED) { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); tcf_bpf_cfg_cleanup(&old); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index f901421b0634..e19885d7fe2c 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -139,7 +139,6 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci->net = net; ci->zone = parm->zone; - tcf_idr_insert(tn, *a); ret = ACT_P_CREATED; } else if (ret > 0) { ci = to_connmark(*a); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index f5826e457679..4fa4fcb842ba 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -110,9 +110,6 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, if (params_new) kfree_rcu(params_new, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 2c3619165680..a780afdf570d 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1297,8 +1297,6 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, tcf_chain_put_by_act(goto_ch); if (params) call_rcu(¶ms->rcu, tcf_ct_params_free); - if (res == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return res; diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index b5042f3ea079..6084300e51ad 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -269,9 +269,6 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, if (cp_new) kfree_rcu(cp_new, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; put_chain: diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 410e3bbfb9ca..73c3926358a0 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -140,8 +140,6 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; release_idr: tcf_idr_release(*a, bind); diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 1fb8d428d2c1..7c0771dd77a3 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -437,9 +437,6 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; chain_put: diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 5c568757643b..a2ddea04183a 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -627,9 +627,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (p) kfree_rcu(p, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; metadata_parse_err: if (goto_ch) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 400a2cfe8452..8dc3bec0d325 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -189,8 +189,6 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, ipt->tcfi_t = t; ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; err3: diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index b2705318993b..e24b7e2331cd 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -194,8 +194,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_lock(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock(&mirred_list_lock); - - tcf_idr_insert(tn, *a); } return ret; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 8118e2640979..e298ec3b3c9e 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -273,8 +273,6 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, if (p) kfree_rcu(p, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 855a6fa16a62..1ebd2a86d980 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -93,9 +93,6 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; release_idr: tcf_idr_release(*a, bind); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index c158bfed86d5..b45304446e13 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -238,8 +238,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&p->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0b431d493768..8d8452b1cdd4 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -201,8 +201,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (new) kfree_rcu(new, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; failure: diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 5e2df590bb58..3ebf9ede3cf1 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -116,8 +116,6 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 9813ca4006dd..a4f3d0f0daa9 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -157,8 +157,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, goto release_idr; } - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index d0652386c6e2..e5f3fb8b00e3 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -225,8 +225,6 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 39e6d94cfafb..81a1c67335be 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -190,8 +190,6 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 37f1e10f35e0..a229751ee8c4 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -537,9 +537,6 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); - return ret; put_chain: diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index a5ff9f68ab02..163b0385fd4c 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -229,8 +229,6 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (p) kfree_rcu(p, rcu); - if (ret == ACT_P_CREATED) - tcf_idr_insert(tn, *a); return ret; put_chain: if (goto_ch) -- cgit v1.2.3 From 02a1b175b0e92d9e0fa5df3957ade8d733ceb6a0 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Wed, 23 Sep 2020 13:18:15 -0700 Subject: net/ipv4: always honour route mtu during forwarding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documentation/networking/ip-sysctl.txt:46 says: ip_forward_use_pmtu - BOOLEAN By default we don't trust protocol path MTUs while forwarding because they could be easily forged and can lead to unwanted fragmentation by the router. You only need to enable this if you have user-space software which tries to discover path mtus by itself and depends on the kernel honoring this information. This is normally not the case. Default: 0 (disabled) Possible values: 0 - disabled 1 - enabled Which makes it pretty clear that setting it to 1 is a potential security/safety/DoS issue, and yet it is entirely reasonable to want forwarded traffic to honour explicitly administrator configured route mtus (instead of defaulting to device mtu). Indeed, I can't think of a single reason why you wouldn't want to. Since you configured a route mtu you probably know better... It is pretty common to have a higher device mtu to allow receiving large (jumbo) frames, while having some routes via that interface (potentially including the default route to the internet) specify a lower mtu. Note that ipv6 forwarding uses device mtu unless the route is locked (in which case it will use the route mtu). This approach is not usable for IPv4 where an 'mtu lock' on a route also has the side effect of disabling TCP path mtu discovery via disabling the IPv4 DF (don't frag) bit on all outgoing frames. I'm not aware of a way to lock a route from an IPv6 RA, so that also potentially seems wrong. Signed-off-by: Maciej Żenczykowski Cc: Eric Dumazet Cc: Willem de Bruijn Cc: Lorenzo Colitti Cc: Sunmeet Gill (Sunny) Cc: Vinay Paradkar Cc: Tyler Wear Cc: David Ahern Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index b09c48d862cc..2a52787db64a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -436,12 +436,18 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { struct net *net = dev_net(dst->dev); + unsigned int mtu; if (net->ipv4.sysctl_ip_fwd_use_pmtu || ip_mtu_locked(dst) || !forwarding) return dst_mtu(dst); + /* 'forwarding = true' case should always honour route mtu */ + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU); } -- cgit v1.2.3 From 4ab810a4e04ab6c851007033d39c13e6d3f55110 Mon Sep 17 00:00:00 2001 From: Xiaoliang Yang Date: Thu, 24 Sep 2020 10:11:13 +0800 Subject: net: mscc: ocelot: fix fields offset in SG_CONFIG_REG_3 INIT_IPS and GATE_ENABLE fields have a wrong offset in SG_CONFIG_REG_3. This register is used by stream gate control of PSFP, and it has not been used before, because PSFP is not implemented in ocelot driver. Signed-off-by: Xiaoliang Yang Signed-off-by: David S. Miller --- include/soc/mscc/ocelot_ana.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot_ana.h b/include/soc/mscc/ocelot_ana.h index 841c6ec22b64..1669481d9779 100644 --- a/include/soc/mscc/ocelot_ana.h +++ b/include/soc/mscc/ocelot_ana.h @@ -252,10 +252,10 @@ #define ANA_SG_CONFIG_REG_3_LIST_LENGTH_M GENMASK(18, 16) #define ANA_SG_CONFIG_REG_3_LIST_LENGTH_X(x) (((x) & GENMASK(18, 16)) >> 16) #define ANA_SG_CONFIG_REG_3_GATE_ENABLE BIT(20) -#define ANA_SG_CONFIG_REG_3_INIT_IPS(x) (((x) << 24) & GENMASK(27, 24)) -#define ANA_SG_CONFIG_REG_3_INIT_IPS_M GENMASK(27, 24) -#define ANA_SG_CONFIG_REG_3_INIT_IPS_X(x) (((x) & GENMASK(27, 24)) >> 24) -#define ANA_SG_CONFIG_REG_3_INIT_GATE_STATE BIT(28) +#define ANA_SG_CONFIG_REG_3_INIT_IPS(x) (((x) << 21) & GENMASK(24, 21)) +#define ANA_SG_CONFIG_REG_3_INIT_IPS_M GENMASK(24, 21) +#define ANA_SG_CONFIG_REG_3_INIT_IPS_X(x) (((x) & GENMASK(24, 21)) >> 21) +#define ANA_SG_CONFIG_REG_3_INIT_GATE_STATE BIT(25) #define ANA_SG_GCL_GS_CONFIG_RSZ 0x4 -- cgit v1.2.3 From ad2b9b0f8d0107602bdc1f3b1ab90719842ace11 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Thu, 24 Sep 2020 15:23:14 -0700 Subject: tcp: skip DSACKs with dubious sequence ranges Currently, we use length of DSACKed range to compute number of delivered packets. And if sequence range in DSACK is corrupted, we can get bogus dsacked/acked count, and bogus cwnd. This patch put bounds on DSACKed range to skip update of data delivery and spurious retransmission information, if the DSACK is unlikely caused by sender's action: - DSACKed range shouldn't be greater than maximum advertised rwnd. - Total no. of DSACKed segments shouldn't be greater than total no. of retransmitted segs. Unlike spurious retransmits, network duplicates or corrupted DSACKs shouldn't be counted as delivery. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 32 +++++++++++++++++++++++++------- 3 files changed, 27 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index cee9f8e6fce3..f84e7bcad6de 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -288,6 +288,7 @@ enum LINUX_MIB_TCPTIMEOUTREHASH, /* TCPTimeoutRehash */ LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ LINUX_MIB_TCPDSACKRECVSEGS, /* TCPDSACKRecvSegs */ + LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 1074df726ec0..8d5e1695b9aa 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -293,6 +293,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), + SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 184ea556f50e..b1ce2054291d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -885,21 +885,34 @@ struct tcp_sacktag_state { struct rate_sample *rate; }; -/* Take a notice that peer is sending D-SACKs */ +/* Take a notice that peer is sending D-SACKs. Skip update of data delivery + * and spurious retransmission information if this DSACK is unlikely caused by + * sender's action: + * - DSACKed sequence range is larger than maximum receiver's window. + * - Total no. of DSACKed segments exceed the total no. of retransmitted segs. + */ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, u32 end_seq, struct tcp_sacktag_state *state) { u32 seq_len, dup_segs = 1; - if (before(start_seq, end_seq)) { - seq_len = end_seq - start_seq; - if (seq_len > tp->mss_cache) - dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); - } + if (!before(start_seq, end_seq)) + return 0; + + seq_len = end_seq - start_seq; + /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */ + if (seq_len > tp->max_window) + return 0; + if (seq_len > tp->mss_cache) + dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); + + tp->dsack_dups += dup_segs; + /* Skip the DSACK if dup segs weren't retransmitted by sender */ + if (tp->dsack_dups > tp->total_retrans) + return 0; tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; tp->rack.dsack_seen = 1; - tp->dsack_dups += dup_segs; state->flag |= FLAG_DSACKING_ACK; /* A spurious retransmission is delivered */ @@ -1153,6 +1166,11 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); + if (!dup_segs) { /* Skip dubious DSACK */ + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS); + return false; + } + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); /* D-SACK for already forgotten data... Do dumb counting. */ -- cgit v1.2.3 From eff7423365a6938d2d34dbce989febed2ae1f957 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 25 Sep 2020 18:13:12 +0000 Subject: net: core: introduce struct netdev_nested_priv for nested interface infrastructure Functions related to nested interface infrastructure such as netdev_walk_all_{ upper | lower }_dev() pass both private functions and "data" pointer to handle their own things. At this point, the data pointer type is void *. In order to make it easier to expand common variables and functions, this new netdev_nested_priv structure is added. In the following patch, a new member variable will be added into this struct to fix the lockdep issue. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/infiniband/core/cache.c | 10 ++-- drivers/infiniband/core/cma.c | 9 ++-- drivers/infiniband/core/roce_gid_mgmt.c | 9 ++-- drivers/infiniband/ulp/ipoib/ipoib_main.c | 9 ++-- drivers/net/bonding/bond_alb.c | 9 ++-- drivers/net/bonding/bond_main.c | 10 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 37 ++++++++++---- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 24 +++++---- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 11 ++-- .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 10 ++-- drivers/net/ethernet/rocker/rocker_main.c | 9 ++-- drivers/net/wireless/quantenna/qtnfmac/core.c | 10 ++-- include/linux/netdevice.h | 16 +++--- net/bridge/br_arp_nd_proxy.c | 26 +++++++--- net/bridge/br_vlan.c | 20 +++++--- net/core/dev.c | 59 ++++++++++++++-------- 16 files changed, 183 insertions(+), 95 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index ffad73bb40ff..5a76611e684a 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1320,9 +1320,10 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) } EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); -static int get_lower_dev_vlan(struct net_device *lower_dev, void *data) +static int get_lower_dev_vlan(struct net_device *lower_dev, + struct netdev_nested_priv *priv) { - u16 *vlan_id = data; + u16 *vlan_id = (u16 *)priv->data; if (is_vlan_dev(lower_dev)) *vlan_id = vlan_dev_vlan_id(lower_dev); @@ -1348,6 +1349,9 @@ static int get_lower_dev_vlan(struct net_device *lower_dev, void *data) int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, u16 *vlan_id, u8 *smac) { + struct netdev_nested_priv priv = { + .data = (void *)vlan_id, + }; struct net_device *ndev; rcu_read_lock(); @@ -1368,7 +1372,7 @@ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, * the lower vlan device for this gid entry. */ netdev_walk_all_lower_dev_rcu(attr->ndev, - get_lower_dev_vlan, vlan_id); + get_lower_dev_vlan, &priv); } } rcu_read_unlock(); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 7f0e91e92968..5888311b2119 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2865,9 +2865,10 @@ struct iboe_prio_tc_map { bool found; }; -static int get_lower_vlan_dev_tc(struct net_device *dev, void *data) +static int get_lower_vlan_dev_tc(struct net_device *dev, + struct netdev_nested_priv *priv) { - struct iboe_prio_tc_map *map = data; + struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; if (is_vlan_dev(dev)) map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); @@ -2886,16 +2887,18 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) { struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); + struct netdev_nested_priv priv; /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio); prio_tc_map.input_prio = prio; + priv.data = (void *)&prio_tc_map; rcu_read_lock(); netdev_walk_all_lower_dev_rcu(ndev, get_lower_vlan_dev_tc, - &prio_tc_map); + &priv); rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map. diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 2860def84f4d..6b8364bb032d 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -531,10 +531,11 @@ struct upper_list { struct net_device *upper; }; -static int netdev_upper_walk(struct net_device *upper, void *data) +static int netdev_upper_walk(struct net_device *upper, + struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - struct list_head *upper_list = data; + struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; @@ -553,12 +554,14 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, struct net_device *ndev)) { struct net_device *ndev = cookie; + struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); + priv.data = &upper_list; rcu_read_lock(); - netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &upper_list); + netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ab75b7f745d4..f772fe8c5b66 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -342,9 +342,10 @@ struct ipoib_walk_data { struct net_device *result; }; -static int ipoib_upper_walk(struct net_device *upper, void *_data) +static int ipoib_upper_walk(struct net_device *upper, + struct netdev_nested_priv *priv) { - struct ipoib_walk_data *data = _data; + struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data; int ret = 0; if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { @@ -368,10 +369,12 @@ static int ipoib_upper_walk(struct net_device *upper, void *_data) static struct net_device *ipoib_get_net_dev_match_addr( const struct sockaddr *addr, struct net_device *dev) { + struct netdev_nested_priv priv; struct ipoib_walk_data data = { .addr = addr, }; + priv.data = (void *)&data; rcu_read_lock(); if (ipoib_is_dev_match_addr_rcu(addr, dev)) { dev_hold(dev); @@ -379,7 +382,7 @@ static struct net_device *ipoib_get_net_dev_match_addr( goto out; } - netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data); + netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv); out: rcu_read_unlock(); return data.result; diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 4e1b7deb724b..c3091e00dd5f 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -942,9 +942,10 @@ struct alb_walk_data { bool strict_match; }; -static int alb_upper_dev_walk(struct net_device *upper, void *_data) +static int alb_upper_dev_walk(struct net_device *upper, + struct netdev_nested_priv *priv) { - struct alb_walk_data *data = _data; + struct alb_walk_data *data = (struct alb_walk_data *)priv->data; bool strict_match = data->strict_match; struct bonding *bond = data->bond; struct slave *slave = data->slave; @@ -983,6 +984,7 @@ static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[], bool strict_match) { struct bonding *bond = bond_get_bond_by_slave(slave); + struct netdev_nested_priv priv; struct alb_walk_data data = { .strict_match = strict_match, .mac_addr = mac_addr, @@ -990,6 +992,7 @@ static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[], .bond = bond, }; + priv.data = (void *)&data; /* send untagged */ alb_send_lp_vid(slave, mac_addr, 0, 0); @@ -997,7 +1000,7 @@ static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[], * for that device. */ rcu_read_lock(); - netdev_walk_all_upper_dev_rcu(bond->dev, alb_upper_dev_walk, &data); + netdev_walk_all_upper_dev_rcu(bond->dev, alb_upper_dev_walk, &priv); rcu_read_unlock(); } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 14740d3053b8..84ecbc6fa0ff 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2511,22 +2511,26 @@ re_arm: } } -static int bond_upper_dev_walk(struct net_device *upper, void *data) +static int bond_upper_dev_walk(struct net_device *upper, + struct netdev_nested_priv *priv) { - __be32 ip = *((__be32 *)data); + __be32 ip = *(__be32 *)priv->data; return ip == bond_confirm_addr(upper, 0, ip); } static bool bond_has_this_ip(struct bonding *bond, __be32 ip) { + struct netdev_nested_priv priv = { + .data = (void *)&ip, + }; bool ret = false; if (ip == bond_confirm_addr(bond->dev, 0, ip)) return true; rcu_read_lock(); - if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &ip)) + if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &priv)) ret = true; rcu_read_unlock(); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2f8a4cfc5fa1..86ca8b9ea1b8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -5396,9 +5396,10 @@ static int ixgbe_fwd_ring_up(struct ixgbe_adapter *adapter, return err; } -static int ixgbe_macvlan_up(struct net_device *vdev, void *data) +static int ixgbe_macvlan_up(struct net_device *vdev, + struct netdev_nested_priv *priv) { - struct ixgbe_adapter *adapter = data; + struct ixgbe_adapter *adapter = (struct ixgbe_adapter *)priv->data; struct ixgbe_fwd_adapter *accel; if (!netif_is_macvlan(vdev)) @@ -5415,8 +5416,12 @@ static int ixgbe_macvlan_up(struct net_device *vdev, void *data) static void ixgbe_configure_dfwd(struct ixgbe_adapter *adapter) { + struct netdev_nested_priv priv = { + .data = (void *)adapter, + }; + netdev_walk_all_upper_dev_rcu(adapter->netdev, - ixgbe_macvlan_up, adapter); + ixgbe_macvlan_up, &priv); } static void ixgbe_configure(struct ixgbe_adapter *adapter) @@ -9023,9 +9028,10 @@ static void ixgbe_set_prio_tc_map(struct ixgbe_adapter *adapter) } #endif /* CONFIG_IXGBE_DCB */ -static int ixgbe_reassign_macvlan_pool(struct net_device *vdev, void *data) +static int ixgbe_reassign_macvlan_pool(struct net_device *vdev, + struct netdev_nested_priv *priv) { - struct ixgbe_adapter *adapter = data; + struct ixgbe_adapter *adapter = (struct ixgbe_adapter *)priv->data; struct ixgbe_fwd_adapter *accel; int pool; @@ -9062,13 +9068,16 @@ static int ixgbe_reassign_macvlan_pool(struct net_device *vdev, void *data) static void ixgbe_defrag_macvlan_pools(struct net_device *dev) { struct ixgbe_adapter *adapter = netdev_priv(dev); + struct netdev_nested_priv priv = { + .data = (void *)adapter, + }; /* flush any stale bits out of the fwd bitmask */ bitmap_clear(adapter->fwd_bitmask, 1, 63); /* walk through upper devices reassigning pools */ netdev_walk_all_upper_dev_rcu(dev, ixgbe_reassign_macvlan_pool, - adapter); + &priv); } /** @@ -9242,14 +9251,18 @@ struct upper_walk_data { u8 queue; }; -static int get_macvlan_queue(struct net_device *upper, void *_data) +static int get_macvlan_queue(struct net_device *upper, + struct netdev_nested_priv *priv) { if (netif_is_macvlan(upper)) { struct ixgbe_fwd_adapter *vadapter = macvlan_accel_priv(upper); - struct upper_walk_data *data = _data; - struct ixgbe_adapter *adapter = data->adapter; - int ifindex = data->ifindex; + struct ixgbe_adapter *adapter; + struct upper_walk_data *data; + int ifindex; + data = (struct upper_walk_data *)priv->data; + ifindex = data->ifindex; + adapter = data->adapter; if (vadapter && upper->ifindex == ifindex) { data->queue = adapter->rx_ring[vadapter->rx_base_queue]->reg_idx; data->action = data->queue; @@ -9265,6 +9278,7 @@ static int handle_redirect_action(struct ixgbe_adapter *adapter, int ifindex, { struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ]; unsigned int num_vfs = adapter->num_vfs, vf; + struct netdev_nested_priv priv; struct upper_walk_data data; struct net_device *upper; @@ -9284,8 +9298,9 @@ static int handle_redirect_action(struct ixgbe_adapter *adapter, int ifindex, data.ifindex = ifindex; data.action = 0; data.queue = 0; + priv.data = (void *)&data; if (netdev_walk_all_upper_dev_rcu(adapter->netdev, - get_macvlan_queue, &data)) { + get_macvlan_queue, &priv)) { *action = data.action; *queue = data.queue; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 4186e29119c2..f3c0e241e1b4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3690,13 +3690,13 @@ bool mlxsw_sp_port_dev_check(const struct net_device *dev) return dev->netdev_ops == &mlxsw_sp_port_netdev_ops; } -static int mlxsw_sp_lower_dev_walk(struct net_device *lower_dev, void *data) +static int mlxsw_sp_lower_dev_walk(struct net_device *lower_dev, + struct netdev_nested_priv *priv) { - struct mlxsw_sp_port **p_mlxsw_sp_port = data; int ret = 0; if (mlxsw_sp_port_dev_check(lower_dev)) { - *p_mlxsw_sp_port = netdev_priv(lower_dev); + priv->data = (void *)netdev_priv(lower_dev); ret = 1; } @@ -3705,15 +3705,16 @@ static int mlxsw_sp_lower_dev_walk(struct net_device *lower_dev, void *data) struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find(struct net_device *dev) { - struct mlxsw_sp_port *mlxsw_sp_port; + struct netdev_nested_priv priv = { + .data = NULL, + }; if (mlxsw_sp_port_dev_check(dev)) return netdev_priv(dev); - mlxsw_sp_port = NULL; - netdev_walk_all_lower_dev(dev, mlxsw_sp_lower_dev_walk, &mlxsw_sp_port); + netdev_walk_all_lower_dev(dev, mlxsw_sp_lower_dev_walk, &priv); - return mlxsw_sp_port; + return (struct mlxsw_sp_port *)priv.data; } struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev) @@ -3726,16 +3727,17 @@ struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev) struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find_rcu(struct net_device *dev) { - struct mlxsw_sp_port *mlxsw_sp_port; + struct netdev_nested_priv priv = { + .data = NULL, + }; if (mlxsw_sp_port_dev_check(dev)) return netdev_priv(dev); - mlxsw_sp_port = NULL; netdev_walk_all_lower_dev_rcu(dev, mlxsw_sp_lower_dev_walk, - &mlxsw_sp_port); + &priv); - return mlxsw_sp_port; + return (struct mlxsw_sp_port *)priv.data; } struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 24f1fd1f8d56..460cb523312f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -7351,9 +7351,10 @@ int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event, return err; } -static int __mlxsw_sp_rif_macvlan_flush(struct net_device *dev, void *data) +static int __mlxsw_sp_rif_macvlan_flush(struct net_device *dev, + struct netdev_nested_priv *priv) { - struct mlxsw_sp_rif *rif = data; + struct mlxsw_sp_rif *rif = (struct mlxsw_sp_rif *)priv->data; if (!netif_is_macvlan(dev)) return 0; @@ -7364,12 +7365,16 @@ static int __mlxsw_sp_rif_macvlan_flush(struct net_device *dev, void *data) static int mlxsw_sp_rif_macvlan_flush(struct mlxsw_sp_rif *rif) { + struct netdev_nested_priv priv = { + .data = (void *)rif, + }; + if (!netif_is_macvlan_port(rif->dev)) return 0; netdev_warn(rif->dev, "Router interface is deleted. Upper macvlans will not work\n"); return netdev_walk_all_upper_dev_rcu(rif->dev, - __mlxsw_sp_rif_macvlan_flush, rif); + __mlxsw_sp_rif_macvlan_flush, &priv); } static void mlxsw_sp_rif_subport_setup(struct mlxsw_sp_rif *rif, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 72912afa6f72..6501ce94ace5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -136,9 +136,9 @@ bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp, } static int mlxsw_sp_bridge_device_upper_rif_destroy(struct net_device *dev, - void *data) + struct netdev_nested_priv *priv) { - struct mlxsw_sp *mlxsw_sp = data; + struct mlxsw_sp *mlxsw_sp = priv->data; mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, dev); return 0; @@ -147,10 +147,14 @@ static int mlxsw_sp_bridge_device_upper_rif_destroy(struct net_device *dev, static void mlxsw_sp_bridge_device_rifs_destroy(struct mlxsw_sp *mlxsw_sp, struct net_device *dev) { + struct netdev_nested_priv priv = { + .data = (void *)mlxsw_sp, + }; + mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, dev); netdev_walk_all_upper_dev_rcu(dev, mlxsw_sp_bridge_device_upper_rif_destroy, - mlxsw_sp); + &priv); } static int mlxsw_sp_bridge_device_vxlan_init(struct mlxsw_sp_bridge *bridge, diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 42458a46ffaf..9cc31f7e0df1 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -3099,9 +3099,10 @@ struct rocker_walk_data { struct rocker_port *port; }; -static int rocker_lower_dev_walk(struct net_device *lower_dev, void *_data) +static int rocker_lower_dev_walk(struct net_device *lower_dev, + struct netdev_nested_priv *priv) { - struct rocker_walk_data *data = _data; + struct rocker_walk_data *data = (struct rocker_walk_data *)priv->data; int ret = 0; if (rocker_port_dev_check_under(lower_dev, data->rocker)) { @@ -3115,6 +3116,7 @@ static int rocker_lower_dev_walk(struct net_device *lower_dev, void *_data) struct rocker_port *rocker_port_dev_lower_find(struct net_device *dev, struct rocker *rocker) { + struct netdev_nested_priv priv; struct rocker_walk_data data; if (rocker_port_dev_check_under(dev, rocker)) @@ -3122,7 +3124,8 @@ struct rocker_port *rocker_port_dev_lower_find(struct net_device *dev, data.rocker = rocker; data.port = NULL; - netdev_walk_all_lower_dev(dev, rocker_lower_dev_walk, &data); + priv.data = (void *)&data; + netdev_walk_all_lower_dev(dev, rocker_lower_dev_walk, &priv); return data.port; } diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c index 6aafff9d4231..e013ebe3079c 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/core.c +++ b/drivers/net/wireless/quantenna/qtnfmac/core.c @@ -671,9 +671,10 @@ bool qtnf_netdev_is_qtn(const struct net_device *ndev) return ndev->netdev_ops == &qtnf_netdev_ops; } -static int qtnf_check_br_ports(struct net_device *dev, void *data) +static int qtnf_check_br_ports(struct net_device *dev, + struct netdev_nested_priv *priv) { - struct net_device *ndev = data; + struct net_device *ndev = (struct net_device *)priv->data; if (dev != ndev && netdev_port_same_parent_id(dev, ndev)) return -ENOTSUPP; @@ -686,6 +687,9 @@ static int qtnf_core_netdevice_event(struct notifier_block *nb, { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); const struct netdev_notifier_changeupper_info *info; + struct netdev_nested_priv priv = { + .data = (void *)ndev, + }; struct net_device *brdev; struct qtnf_vif *vif; struct qtnf_bus *bus; @@ -725,7 +729,7 @@ static int qtnf_core_netdevice_event(struct notifier_block *nb, } else { ret = netdev_walk_all_lower_dev(brdev, qtnf_check_br_ports, - ndev); + &priv); } break; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7bd4fcdd0738..313803d6c781 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4455,6 +4455,10 @@ extern int dev_rx_weight; extern int dev_tx_weight; extern int gro_normal_batch; +struct netdev_nested_priv { + void *data; +}; + bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); @@ -4470,8 +4474,8 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *upper_dev, - void *data), - void *data); + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv); bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev); @@ -4508,12 +4512,12 @@ struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter); int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *lower_dev, - void *data), - void *data); + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv); int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *lower_dev, - void *data), - void *data); + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv); void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index b18cdf03edb3..dfec65eca8a6 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -88,9 +88,10 @@ static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p, } } -static int br_chk_addr_ip(struct net_device *dev, void *data) +static int br_chk_addr_ip(struct net_device *dev, + struct netdev_nested_priv *priv) { - __be32 ip = *(__be32 *)data; + __be32 ip = *(__be32 *)priv->data; struct in_device *in_dev; __be32 addr = 0; @@ -107,11 +108,15 @@ static int br_chk_addr_ip(struct net_device *dev, void *data) static bool br_is_local_ip(struct net_device *dev, __be32 ip) { - if (br_chk_addr_ip(dev, &ip)) + struct netdev_nested_priv priv = { + .data = (void *)&ip, + }; + + if (br_chk_addr_ip(dev, &priv)) return true; /* check if ip is configured on upper dev */ - if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &ip)) + if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &priv)) return true; return false; @@ -361,9 +366,10 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, } } -static int br_chk_addr_ip6(struct net_device *dev, void *data) +static int br_chk_addr_ip6(struct net_device *dev, + struct netdev_nested_priv *priv) { - struct in6_addr *addr = (struct in6_addr *)data; + struct in6_addr *addr = (struct in6_addr *)priv->data; if (ipv6_chk_addr(dev_net(dev), addr, dev, 0)) return 1; @@ -374,11 +380,15 @@ static int br_chk_addr_ip6(struct net_device *dev, void *data) static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr) { - if (br_chk_addr_ip6(dev, addr)) + struct netdev_nested_priv priv = { + .data = (void *)addr, + }; + + if (br_chk_addr_ip6(dev, &priv)) return true; /* check if ip is configured on upper dev */ - if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, addr)) + if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, &priv)) return true; return false; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 61c94cefa843..ee8780080be5 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1360,7 +1360,7 @@ static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) } static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, - __always_unused void *data) + __always_unused struct netdev_nested_priv *priv) { return br_vlan_is_bind_vlan_dev(dev); } @@ -1383,9 +1383,9 @@ struct br_vlan_bind_walk_data { }; static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, - void *data_in) + struct netdev_nested_priv *priv) { - struct br_vlan_bind_walk_data *data = data_in; + struct br_vlan_bind_walk_data *data = priv->data; int found = 0; if (br_vlan_is_bind_vlan_dev(dev) && @@ -1403,10 +1403,13 @@ br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) struct br_vlan_bind_walk_data data = { .vid = vid, }; + struct netdev_nested_priv priv = { + .data = (void *)&data, + }; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, - &data); + &priv); rcu_read_unlock(); return data.result; @@ -1487,9 +1490,9 @@ struct br_vlan_link_state_walk_data { }; static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, - void *data_in) + struct netdev_nested_priv *priv) { - struct br_vlan_link_state_walk_data *data = data_in; + struct br_vlan_link_state_walk_data *data = priv->data; if (br_vlan_is_bind_vlan_dev(vlan_dev)) br_vlan_set_vlan_dev_state(data->br, vlan_dev); @@ -1503,10 +1506,13 @@ static void br_vlan_link_state_change(struct net_device *dev, struct br_vlan_link_state_walk_data data = { .br = br }; + struct netdev_nested_priv priv = { + .data = (void *)&data, + }; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, - &data); + &priv); rcu_read_unlock(); } diff --git a/net/core/dev.c b/net/core/dev.c index b7b3d6e15cda..a4a1fa806c5c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6812,9 +6812,10 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, return NULL; } -static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data) +static int ____netdev_has_upper_dev(struct net_device *upper_dev, + struct netdev_nested_priv *priv) { - struct net_device *dev = data; + struct net_device *dev = (struct net_device *)priv->data; return upper_dev == dev; } @@ -6831,10 +6832,14 @@ static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data) bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + ASSERT_RTNL(); return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, - upper_dev); + &priv); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -6851,8 +6856,12 @@ EXPORT_SYMBOL(netdev_has_upper_dev); bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, - upper_dev); + &priv); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); @@ -6997,8 +7006,8 @@ static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, static int __netdev_walk_all_upper_dev(struct net_device *dev, int (*fn)(struct net_device *dev, - void *data), - void *data) + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; @@ -7010,7 +7019,7 @@ static int __netdev_walk_all_upper_dev(struct net_device *dev, while (1) { if (now != dev) { - ret = fn(now, data); + ret = fn(now, priv); if (ret) return ret; } @@ -7046,8 +7055,8 @@ static int __netdev_walk_all_upper_dev(struct net_device *dev, int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, - void *data), - void *data) + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; @@ -7058,7 +7067,7 @@ int netdev_walk_all_upper_dev_rcu(struct net_device *dev, while (1) { if (now != dev) { - ret = fn(now, data); + ret = fn(now, priv); if (ret) return ret; } @@ -7094,10 +7103,14 @@ EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); static bool __netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + ASSERT_RTNL(); return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, - upper_dev); + &priv); } /** @@ -7215,8 +7228,8 @@ static struct net_device *__netdev_next_lower_dev(struct net_device *dev, int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, - void *data), - void *data) + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; @@ -7227,7 +7240,7 @@ int netdev_walk_all_lower_dev(struct net_device *dev, while (1) { if (now != dev) { - ret = fn(now, data); + ret = fn(now, priv); if (ret) return ret; } @@ -7262,8 +7275,8 @@ EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); static int __netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, - void *data), - void *data) + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; @@ -7275,7 +7288,7 @@ static int __netdev_walk_all_lower_dev(struct net_device *dev, while (1) { if (now != dev) { - ret = fn(now, data); + ret = fn(now, priv); if (ret) return ret; } @@ -7364,13 +7377,15 @@ static u8 __netdev_lower_depth(struct net_device *dev) return max_depth; } -static int __netdev_update_upper_level(struct net_device *dev, void *data) +static int __netdev_update_upper_level(struct net_device *dev, + struct netdev_nested_priv *__unused) { dev->upper_level = __netdev_upper_depth(dev) + 1; return 0; } -static int __netdev_update_lower_level(struct net_device *dev, void *data) +static int __netdev_update_lower_level(struct net_device *dev, + struct netdev_nested_priv *__unused) { dev->lower_level = __netdev_lower_depth(dev) + 1; return 0; @@ -7378,8 +7393,8 @@ static int __netdev_update_lower_level(struct net_device *dev, void *data) int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, - void *data), - void *data) + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; @@ -7390,7 +7405,7 @@ int netdev_walk_all_lower_dev_rcu(struct net_device *dev, while (1) { if (now != dev) { - ret = fn(now, data); + ret = fn(now, priv); if (ret) return ret; } -- cgit v1.2.3 From 1fc70edb7d7b5ce1ae32b0cf90183f4879ad421a Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 25 Sep 2020 18:13:29 +0000 Subject: net: core: add nested_level variable in net_device This patch is to add a new variable 'nested_level' into the net_device structure. This variable will be used as a parameter of spin_lock_nested() of dev->addr_list_lock. netif_addr_lock() can be called recursively so spin_lock_nested() is used instead of spin_lock() and dev->lower_level is used as a parameter of spin_lock_nested(). But, dev->lower_level value can be updated while it is being used. So, lockdep would warn a possible deadlock scenario. When a stacked interface is deleted, netif_{uc | mc}_sync() is called recursively. So, spin_lock_nested() is called recursively too. At this moment, the dev->lower_level variable is used as a parameter of it. dev->lower_level value is updated when interfaces are being unlinked/linked immediately. Thus, After unlinking, dev->lower_level shouldn't be a parameter of spin_lock_nested(). A (macvlan) | B (vlan) | C (bridge) | D (macvlan) | E (vlan) | F (bridge) A->lower_level : 6 B->lower_level : 5 C->lower_level : 4 D->lower_level : 3 E->lower_level : 2 F->lower_level : 1 When an interface 'A' is removed, it releases resources. At this moment, netif_addr_lock() would be called. Then, netdev_upper_dev_unlink() is called recursively. Then dev->lower_level is updated. There is no problem. But, when the bridge module is removed, 'C' and 'F' interfaces are removed at once. If 'F' is removed first, a lower_level value is like below. A->lower_level : 5 B->lower_level : 4 C->lower_level : 3 D->lower_level : 2 E->lower_level : 1 F->lower_level : 1 Then, 'C' is removed. at this moment, netif_addr_lock() is called recursively. The ordering is like this. C(3)->D(2)->E(1)->F(1) At this moment, the lower_level value of 'E' and 'F' are the same. So, lockdep warns a possible deadlock scenario. In order to avoid this problem, a new variable 'nested_level' is added. This value is the same as dev->lower_level - 1. But this value is updated in rtnl_unlock(). So, this variable can be used as a parameter of spin_lock_nested() safely in the rtnl context. Test commands: ip link add br0 type bridge vlan_filtering 1 ip link add vlan1 link br0 type vlan id 10 ip link add macvlan2 link vlan1 type macvlan ip link add br3 type bridge vlan_filtering 1 ip link set macvlan2 master br3 ip link add vlan4 link br3 type vlan id 10 ip link add macvlan5 link vlan4 type macvlan ip link add br6 type bridge vlan_filtering 1 ip link set macvlan5 master br6 ip link add vlan7 link br6 type vlan id 10 ip link add macvlan8 link vlan7 type macvlan ip link set br0 up ip link set vlan1 up ip link set macvlan2 up ip link set br3 up ip link set vlan4 up ip link set macvlan5 up ip link set br6 up ip link set vlan7 up ip link set macvlan8 up modprobe -rv bridge Splat looks like: [ 36.057436][ T744] WARNING: possible recursive locking detected [ 36.058848][ T744] 5.9.0-rc6+ #728 Not tainted [ 36.059959][ T744] -------------------------------------------- [ 36.061391][ T744] ip/744 is trying to acquire lock: [ 36.062590][ T744] ffff8c4767509280 (&vlan_netdev_addr_lock_key){+...}-{2:2}, at: dev_set_rx_mode+0x19/0x30 [ 36.064922][ T744] [ 36.064922][ T744] but task is already holding lock: [ 36.066626][ T744] ffff8c4767769280 (&vlan_netdev_addr_lock_key){+...}-{2:2}, at: dev_uc_add+0x1e/0x60 [ 36.068851][ T744] [ 36.068851][ T744] other info that might help us debug this: [ 36.070731][ T744] Possible unsafe locking scenario: [ 36.070731][ T744] [ 36.072497][ T744] CPU0 [ 36.073238][ T744] ---- [ 36.074007][ T744] lock(&vlan_netdev_addr_lock_key); [ 36.075290][ T744] lock(&vlan_netdev_addr_lock_key); [ 36.076590][ T744] [ 36.076590][ T744] *** DEADLOCK *** [ 36.076590][ T744] [ 36.078515][ T744] May be due to missing lock nesting notation [ 36.078515][ T744] [ 36.080491][ T744] 3 locks held by ip/744: [ 36.081471][ T744] #0: ffffffff98571df0 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x236/0x490 [ 36.083614][ T744] #1: ffff8c4767769280 (&vlan_netdev_addr_lock_key){+...}-{2:2}, at: dev_uc_add+0x1e/0x60 [ 36.085942][ T744] #2: ffff8c476c8da280 (&bridge_netdev_addr_lock_key/4){+...}-{2:2}, at: dev_uc_sync+0x39/0x80 [ 36.088400][ T744] [ 36.088400][ T744] stack backtrace: [ 36.089772][ T744] CPU: 6 PID: 744 Comm: ip Not tainted 5.9.0-rc6+ #728 [ 36.091364][ T744] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 36.093630][ T744] Call Trace: [ 36.094416][ T744] dump_stack+0x77/0x9b [ 36.095385][ T744] __lock_acquire+0xbc3/0x1f40 [ 36.096522][ T744] lock_acquire+0xb4/0x3b0 [ 36.097540][ T744] ? dev_set_rx_mode+0x19/0x30 [ 36.098657][ T744] ? rtmsg_ifinfo+0x1f/0x30 [ 36.099711][ T744] ? __dev_notify_flags+0xa5/0xf0 [ 36.100874][ T744] ? rtnl_is_locked+0x11/0x20 [ 36.101967][ T744] ? __dev_set_promiscuity+0x7b/0x1a0 [ 36.103230][ T744] _raw_spin_lock_bh+0x38/0x70 [ 36.104348][ T744] ? dev_set_rx_mode+0x19/0x30 [ 36.105461][ T744] dev_set_rx_mode+0x19/0x30 [ 36.106532][ T744] dev_set_promiscuity+0x36/0x50 [ 36.107692][ T744] __dev_set_promiscuity+0x123/0x1a0 [ 36.108929][ T744] dev_set_promiscuity+0x1e/0x50 [ 36.110093][ T744] br_port_set_promisc+0x1f/0x40 [bridge] [ 36.111415][ T744] br_manage_promisc+0x8b/0xe0 [bridge] [ 36.112728][ T744] __dev_set_promiscuity+0x123/0x1a0 [ 36.113967][ T744] ? __hw_addr_sync_one+0x23/0x50 [ 36.115135][ T744] __dev_set_rx_mode+0x68/0x90 [ 36.116249][ T744] dev_uc_sync+0x70/0x80 [ 36.117244][ T744] dev_uc_add+0x50/0x60 [ 36.118223][ T744] macvlan_open+0x18e/0x1f0 [macvlan] [ 36.119470][ T744] __dev_open+0xd6/0x170 [ 36.120470][ T744] __dev_change_flags+0x181/0x1d0 [ 36.121644][ T744] dev_change_flags+0x23/0x60 [ 36.122741][ T744] do_setlink+0x30a/0x11e0 [ 36.123778][ T744] ? __lock_acquire+0x92c/0x1f40 [ 36.124929][ T744] ? __nla_validate_parse.part.6+0x45/0x8e0 [ 36.126309][ T744] ? __lock_acquire+0x92c/0x1f40 [ 36.127457][ T744] __rtnl_newlink+0x546/0x8e0 [ 36.128560][ T744] ? lock_acquire+0xb4/0x3b0 [ 36.129623][ T744] ? deactivate_slab.isra.85+0x6a1/0x850 [ 36.130946][ T744] ? __lock_acquire+0x92c/0x1f40 [ 36.132102][ T744] ? lock_acquire+0xb4/0x3b0 [ 36.133176][ T744] ? is_bpf_text_address+0x5/0xe0 [ 36.134364][ T744] ? rtnl_newlink+0x2e/0x70 [ 36.135445][ T744] ? rcu_read_lock_sched_held+0x32/0x60 [ 36.136771][ T744] ? kmem_cache_alloc_trace+0x2d8/0x380 [ 36.138070][ T744] ? rtnl_newlink+0x2e/0x70 [ 36.139164][ T744] rtnl_newlink+0x47/0x70 [ ... ] Fixes: 845e0ebb4408 ("net: change addr_list_lock back to static key") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/netdevice.h | 52 ++++++++++++++++++++++++----- net/core/dev.c | 85 +++++++++++++++++++++++++++++++++++++++-------- net/core/dev_addr_lists.c | 12 +++---- 3 files changed, 122 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 313803d6c781..9fdb3ebef306 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1955,6 +1955,7 @@ struct net_device { unsigned short type; unsigned short hard_header_len; unsigned char min_header_len; + unsigned char name_assign_type; unsigned short needed_headroom; unsigned short needed_tailroom; @@ -1965,21 +1966,28 @@ struct net_device { unsigned char addr_len; unsigned char upper_level; unsigned char lower_level; + unsigned short neigh_priv_len; unsigned short dev_id; unsigned short dev_port; spinlock_t addr_list_lock; - unsigned char name_assign_type; - bool uc_promisc; + struct netdev_hw_addr_list uc; struct netdev_hw_addr_list mc; struct netdev_hw_addr_list dev_addrs; #ifdef CONFIG_SYSFS struct kset *queues_kset; +#endif +#ifdef CONFIG_LOCKDEP + struct list_head unlink_list; #endif unsigned int promiscuity; unsigned int allmulti; + bool uc_promisc; +#ifdef CONFIG_LOCKDEP + unsigned char nested_level; +#endif /* Protocol-specific pointers */ @@ -4260,17 +4268,23 @@ static inline void netif_tx_disable(struct net_device *dev) static inline void netif_addr_lock(struct net_device *dev) { - spin_lock(&dev->addr_list_lock); -} + unsigned char nest_level = 0; -static inline void netif_addr_lock_nested(struct net_device *dev) -{ - spin_lock_nested(&dev->addr_list_lock, dev->lower_level); +#ifdef CONFIG_LOCKDEP + nest_level = dev->nested_level; +#endif + spin_lock_nested(&dev->addr_list_lock, nest_level); } static inline void netif_addr_lock_bh(struct net_device *dev) { - spin_lock_bh(&dev->addr_list_lock); + unsigned char nest_level = 0; + +#ifdef CONFIG_LOCKDEP + nest_level = dev->nested_level; +#endif + local_bh_disable(); + spin_lock_nested(&dev->addr_list_lock, nest_level); } static inline void netif_addr_unlock(struct net_device *dev) @@ -4455,7 +4469,19 @@ extern int dev_rx_weight; extern int dev_tx_weight; extern int gro_normal_batch; +enum { + NESTED_SYNC_IMM_BIT, + NESTED_SYNC_TODO_BIT, +}; + +#define __NESTED_SYNC_BIT(bit) ((u32)1 << (bit)) +#define __NESTED_SYNC(name) __NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT) + +#define NESTED_SYNC_IMM __NESTED_SYNC(IMM) +#define NESTED_SYNC_TODO __NESTED_SYNC(TODO) + struct netdev_nested_priv { + unsigned char flags; void *data; }; @@ -4465,6 +4491,16 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); +#ifdef CONFIG_LOCKDEP +static LIST_HEAD(net_unlink_list); + +static inline void net_unlink_todo(struct net_device *dev) +{ + if (list_empty(&dev->unlink_list)) + list_add_tail(&dev->unlink_list, &net_unlink_list); +} +#endif + /* iterate through upper list, must be called under RCU read lock */ #define netdev_for_each_upper_dev_rcu(dev, updev, iter) \ for (iter = &(dev)->adj_list.upper, \ diff --git a/net/core/dev.c b/net/core/dev.c index a4a1fa806c5c..4906b44af850 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7104,6 +7104,7 @@ static bool __netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { + .flags = 0, .data = (void *)upper_dev, }; @@ -7385,9 +7386,19 @@ static int __netdev_update_upper_level(struct net_device *dev, } static int __netdev_update_lower_level(struct net_device *dev, - struct netdev_nested_priv *__unused) + struct netdev_nested_priv *priv) { dev->lower_level = __netdev_lower_depth(dev) + 1; + +#ifdef CONFIG_LOCKDEP + if (!priv) + return 0; + + if (priv->flags & NESTED_SYNC_IMM) + dev->nested_level = dev->lower_level - 1; + if (priv->flags & NESTED_SYNC_TODO) + net_unlink_todo(dev); +#endif return 0; } @@ -7665,6 +7676,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info, + struct netdev_nested_priv *priv, struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { @@ -7721,9 +7733,9 @@ static int __netdev_upper_dev_link(struct net_device *dev, __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); - __netdev_update_lower_level(upper_dev, NULL); + __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, - NULL); + priv); return 0; @@ -7748,8 +7760,13 @@ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack) { + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + return __netdev_upper_dev_link(dev, upper_dev, false, - NULL, NULL, extack); + NULL, NULL, &priv, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -7772,13 +7789,19 @@ int netdev_master_upper_dev_link(struct net_device *dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack) { + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + return __netdev_upper_dev_link(dev, upper_dev, true, - upper_priv, upper_info, extack); + upper_priv, upper_info, &priv, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); static void __netdev_upper_dev_unlink(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + struct netdev_nested_priv *priv) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { @@ -7803,9 +7826,9 @@ static void __netdev_upper_dev_unlink(struct net_device *dev, __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); - __netdev_update_lower_level(upper_dev, NULL); + __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, - NULL); + priv); } /** @@ -7819,7 +7842,12 @@ static void __netdev_upper_dev_unlink(struct net_device *dev, void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - __netdev_upper_dev_unlink(dev, upper_dev); + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_TODO, + .data = NULL, + }; + + __netdev_upper_dev_unlink(dev, upper_dev, &priv); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -7855,6 +7883,10 @@ int netdev_adjacent_change_prepare(struct net_device *old_dev, struct net_device *dev, struct netlink_ext_ack *extack) { + struct netdev_nested_priv priv = { + .flags = 0, + .data = NULL, + }; int err; if (!new_dev) @@ -7862,8 +7894,8 @@ int netdev_adjacent_change_prepare(struct net_device *old_dev, if (old_dev && new_dev != old_dev) netdev_adjacent_dev_disable(dev, old_dev); - - err = netdev_upper_dev_link(new_dev, dev, extack); + err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, + extack); if (err) { if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); @@ -7878,6 +7910,11 @@ void netdev_adjacent_change_commit(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + if (!new_dev || !old_dev) return; @@ -7885,7 +7922,7 @@ void netdev_adjacent_change_commit(struct net_device *old_dev, return; netdev_adjacent_dev_enable(dev, old_dev); - netdev_upper_dev_unlink(old_dev, dev); + __netdev_upper_dev_unlink(old_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_commit); @@ -7893,13 +7930,18 @@ void netdev_adjacent_change_abort(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { + struct netdev_nested_priv priv = { + .flags = 0, + .data = NULL, + }; + if (!new_dev) return; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); - netdev_upper_dev_unlink(new_dev, dev); + __netdev_upper_dev_unlink(new_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_abort); @@ -10083,6 +10125,19 @@ static void netdev_wait_allrefs(struct net_device *dev) void netdev_run_todo(void) { struct list_head list; +#ifdef CONFIG_LOCKDEP + struct list_head unlink_list; + + list_replace_init(&net_unlink_list, &unlink_list); + + while (!list_empty(&unlink_list)) { + struct net_device *dev = list_first_entry(&unlink_list, + struct net_device, + unlink_list); + list_del(&dev->unlink_list); + dev->nested_level = dev->lower_level - 1; + } +#endif /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); @@ -10295,6 +10350,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_segs = GSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; +#ifdef CONFIG_LOCKDEP + dev->nested_level = 0; + INIT_LIST_HEAD(&dev->unlink_list); +#endif INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 54cd568e7c2f..fa1c37ec40c9 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -637,7 +637,7 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -667,7 +667,7 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -700,7 +700,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) * larger. */ netif_addr_lock_bh(from); - netif_addr_lock_nested(to); + netif_addr_lock(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -867,7 +867,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -897,7 +897,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -922,7 +922,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) /* See the above comments inside dev_uc_unsync(). */ netif_addr_lock_bh(from); - netif_addr_lock_nested(to); + netif_addr_lock(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); -- cgit v1.2.3 From 3ddf9b431b931544dc2f94e8f6a055ff501436fa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 28 Sep 2020 17:53:29 -0700 Subject: genetlink: add missing kdoc for validation flags Validation flags are missing kdoc, add it. Fixes: ef6243acb478 ("genetlink: optionally validate strictly/dumps") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/genetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 6e5f1e1aa822..8899d7429ccb 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -138,6 +138,7 @@ genl_dumpit_info(struct netlink_callback *cb) * @cmd: command identifier * @internal_flags: flags used by the family * @flags: flags + * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @start: start callback for dumps * @dumpit: callback for dumpers -- cgit v1.2.3 From 50b2412b7e7862c5af0cbf4b10d93bc5c712d021 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 4 Aug 2020 10:40:21 +0300 Subject: net/mlx5: Avoid possible free of command entry while timeout comp handler Upon command completion timeout, driver simulates a forced command completion. In a rare case where real interrupt for that command arrives simultaneously, it might release the command entry while the forced handler might still access it. Fix that by adding an entry refcount, to track current amount of allowed handlers. Command entry to be released only when this refcount is decremented to zero. Command refcount is always initialized to one. For callback commands, command completion handler is the symmetric flow to decrement it. For non-callback commands, it is wait_func(). Before ringing the doorbell, increment the refcount for the real completion handler. Once the real completion handler is called, it will decrement it. For callback commands, once the delayed work is scheduled, increment the refcount. Upon callback command completion handler, we will try to cancel the timeout callback. In case of success, we need to decrement the callback refcount as it will never run. In addition, gather the entry index free and the entry free into a one flow for all command types release. Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 109 +++++++++++++++++--------- include/linux/mlx5/driver.h | 2 + 2 files changed, 73 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 1d91a0d0ab1d..c0055f5479ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -69,12 +69,10 @@ enum { MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR = 0x10, }; -static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, - struct mlx5_cmd_msg *in, - struct mlx5_cmd_msg *out, - void *uout, int uout_size, - mlx5_cmd_cbk_t cbk, - void *context, int page_queue) +static struct mlx5_cmd_work_ent * +cmd_alloc_ent(struct mlx5_cmd *cmd, struct mlx5_cmd_msg *in, + struct mlx5_cmd_msg *out, void *uout, int uout_size, + mlx5_cmd_cbk_t cbk, void *context, int page_queue) { gfp_t alloc_flags = cbk ? GFP_ATOMIC : GFP_KERNEL; struct mlx5_cmd_work_ent *ent; @@ -83,6 +81,7 @@ static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, if (!ent) return ERR_PTR(-ENOMEM); + ent->idx = -EINVAL; ent->in = in; ent->out = out; ent->uout = uout; @@ -91,10 +90,16 @@ static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, ent->context = context; ent->cmd = cmd; ent->page_queue = page_queue; + refcount_set(&ent->refcnt, 1); return ent; } +static void cmd_free_ent(struct mlx5_cmd_work_ent *ent) +{ + kfree(ent); +} + static u8 alloc_token(struct mlx5_cmd *cmd) { u8 token; @@ -109,7 +114,7 @@ static u8 alloc_token(struct mlx5_cmd *cmd) return token; } -static int alloc_ent(struct mlx5_cmd *cmd) +static int cmd_alloc_index(struct mlx5_cmd *cmd) { unsigned long flags; int ret; @@ -123,7 +128,7 @@ static int alloc_ent(struct mlx5_cmd *cmd) return ret < cmd->max_reg_cmds ? ret : -ENOMEM; } -static void free_ent(struct mlx5_cmd *cmd, int idx) +static void cmd_free_index(struct mlx5_cmd *cmd, int idx) { unsigned long flags; @@ -132,6 +137,22 @@ static void free_ent(struct mlx5_cmd *cmd, int idx) spin_unlock_irqrestore(&cmd->alloc_lock, flags); } +static void cmd_ent_get(struct mlx5_cmd_work_ent *ent) +{ + refcount_inc(&ent->refcnt); +} + +static void cmd_ent_put(struct mlx5_cmd_work_ent *ent) +{ + if (!refcount_dec_and_test(&ent->refcnt)) + return; + + if (ent->idx >= 0) + cmd_free_index(ent->cmd, ent->idx); + + cmd_free_ent(ent); +} + static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx) { return cmd->cmd_buf + (idx << cmd->log_stride); @@ -219,11 +240,6 @@ static void poll_timeout(struct mlx5_cmd_work_ent *ent) ent->ret = -ETIMEDOUT; } -static void free_cmd(struct mlx5_cmd_work_ent *ent) -{ - kfree(ent); -} - static int verify_signature(struct mlx5_cmd_work_ent *ent) { struct mlx5_cmd_mailbox *next = ent->out->next; @@ -842,6 +858,7 @@ static void cb_timeout_handler(struct work_struct *work) mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + cmd_ent_put(ent); /* for the cmd_ent_get() took on schedule delayed work */ } static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); @@ -873,14 +890,14 @@ static void cmd_work_handler(struct work_struct *work) sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem; down(sem); if (!ent->page_queue) { - alloc_ret = alloc_ent(cmd); + alloc_ret = cmd_alloc_index(cmd); if (alloc_ret < 0) { mlx5_core_err_rl(dev, "failed to allocate command entry\n"); if (ent->callback) { ent->callback(-EAGAIN, ent->context); mlx5_free_cmd_msg(dev, ent->out); free_msg(dev, ent->in); - free_cmd(ent); + cmd_ent_put(ent); } else { ent->ret = -EAGAIN; complete(&ent->done); @@ -916,8 +933,8 @@ static void cmd_work_handler(struct work_struct *work) ent->ts1 = ktime_get_ns(); cmd_mode = cmd->mode; - if (ent->callback) - schedule_delayed_work(&ent->cb_timeout_work, cb_timeout); + if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, cb_timeout)) + cmd_ent_get(ent); set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); /* Skip sending command to fw if internal error */ @@ -933,13 +950,10 @@ static void cmd_work_handler(struct work_struct *work) MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); - /* no doorbell, no need to keep the entry */ - free_ent(cmd, ent->idx); - if (ent->callback) - free_cmd(ent); return; } + cmd_ent_get(ent); /* for the _real_ FW event on completion */ /* ring doorbell after the descriptor is valid */ mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); wmb(); @@ -1039,11 +1053,16 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, if (callback && page_queue) return -EINVAL; - ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context, - page_queue); + ent = cmd_alloc_ent(cmd, in, out, uout, uout_size, + callback, context, page_queue); if (IS_ERR(ent)) return PTR_ERR(ent); + /* put for this ent is when consumed, depending on the use case + * 1) (!callback) blocking flow: by caller after wait_func completes + * 2) (callback) flow: by mlx5_cmd_comp_handler() when ent is handled + */ + ent->token = token; ent->polling = force_polling; @@ -1062,12 +1081,10 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, } if (callback) - goto out; + goto out; /* mlx5_cmd_comp_handler() will put(ent) */ err = wait_func(dev, ent); - if (err == -ETIMEDOUT) - goto out; - if (err == -ECANCELED) + if (err == -ETIMEDOUT || err == -ECANCELED) goto out_free; ds = ent->ts2 - ent->ts1; @@ -1085,7 +1102,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, *status = ent->status; out_free: - free_cmd(ent); + cmd_ent_put(ent); out: return err; } @@ -1516,14 +1533,19 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force if (!forced) { mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", ent->idx); - free_ent(cmd, ent->idx); - free_cmd(ent); + cmd_ent_put(ent); } continue; } - if (ent->callback) - cancel_delayed_work(&ent->cb_timeout_work); + if (ent->callback && cancel_delayed_work(&ent->cb_timeout_work)) + cmd_ent_put(ent); /* timeout work was canceled */ + + if (!forced || /* Real FW completion */ + pci_channel_offline(dev->pdev) || /* FW is inaccessible */ + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + cmd_ent_put(ent); + if (ent->page_queue) sem = &cmd->pages_sem; else @@ -1545,10 +1567,6 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force ent->ret, deliv_status_to_str(ent->status), ent->status); } - /* only real completion will free the entry slot */ - if (!forced) - free_ent(cmd, ent->idx); - if (ent->callback) { ds = ent->ts2 - ent->ts1; if (ent->op < MLX5_CMD_OP_MAX) { @@ -1576,10 +1594,13 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force free_msg(dev, ent->in); err = err ? err : ent->status; - if (!forced) - free_cmd(ent); + /* final consumer is done, release ent */ + cmd_ent_put(ent); callback(err, context); } else { + /* release wait_func() so mlx5_cmd_invoke() + * can make the final ent_put() + */ complete(&ent->done); } up(sem); @@ -1589,8 +1610,11 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev) { + struct mlx5_cmd *cmd = &dev->cmd; + unsigned long bitmask; unsigned long flags; u64 vector; + int i; /* wait for pending handlers to complete */ mlx5_eq_synchronize_cmd_irq(dev); @@ -1599,11 +1623,20 @@ void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev) if (!vector) goto no_trig; + bitmask = vector; + /* we must increment the allocated entries refcount before triggering the completions + * to guarantee pending commands will not get freed in the meanwhile. + * For that reason, it also has to be done inside the alloc_lock. + */ + for_each_set_bit(i, &bitmask, (1 << cmd->log_sz)) + cmd_ent_get(cmd->ent_arr[i]); vector |= MLX5_TRIGGERED_CMD_COMP; spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); mlx5_core_dbg(dev, "vector 0x%llx\n", vector); mlx5_cmd_comp_handler(dev, vector, true); + for_each_set_bit(i, &bitmask, (1 << cmd->log_sz)) + cmd_ent_put(cmd->ent_arr[i]); return; no_trig: diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c145de0473bc..897156822f0d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -767,6 +767,8 @@ struct mlx5_cmd_work_ent { u64 ts2; u16 op; bool polling; + /* Track the max comp handlers */ + refcount_t refcnt; }; struct mlx5_pas { -- cgit v1.2.3 From b898ce7bccf13087719c021d829dab607c175246 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 11 Sep 2020 11:48:55 -0700 Subject: net/mlx5: cmdif, Avoid skipping reclaim pages if FW is not accessible In case of pci is offline reclaim_pages_cmd() will still try to call the FW to release FW pages, cmd_exec() in this case will return a silent success without actually calling the FW. This is wrong and will cause page leaks, what we should do is to detect pci offline or command interface un-available before tying to access the FW and manually release the FW pages in the driver. In this patch we share the code to check for FW command interface availability and we call it in sensitive places e.g. reclaim_pages_cmd(). Alternative fix: 1. Remove MLX5_CMD_OP_MANAGE_PAGES form mlx5_internal_err_ret_value, command success simulation list. 2. Always Release FW pages even if cmd_exec fails in reclaim_pages_cmd(). Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 17 +++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 2 +- include/linux/mlx5/driver.h | 1 + 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 2b597ac365f8..2d1f4b3be9bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -902,6 +902,13 @@ retry: return idx; } +bool mlx5_cmd_is_down(struct mlx5_core_dev *dev) +{ + return pci_channel_offline(dev->pdev) || + dev->cmd.state != MLX5_CMDIF_STATE_UP || + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR; +} + static void cmd_work_handler(struct work_struct *work) { struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work); @@ -967,10 +974,7 @@ static void cmd_work_handler(struct work_struct *work) set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); /* Skip sending command to fw if internal error */ - if (pci_channel_offline(dev->pdev) || - dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || - cmd->state != MLX5_CMDIF_STATE_UP || - !opcode_allowed(&dev->cmd, ent->op)) { + if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, ent->op)) { u8 status = 0; u32 drv_synd; @@ -1800,10 +1804,7 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, u8 token; opcode = MLX5_GET(mbox_in, in, opcode); - if (pci_channel_offline(dev->pdev) || - dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || - dev->cmd.state != MLX5_CMDIF_STATE_UP || - !opcode_allowed(&dev->cmd, opcode)) { + if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, opcode)) { err = mlx5_internal_err_ret_value(dev, opcode, &drv_synd, &status); MLX5_SET(mbox_out, out, status, status); MLX5_SET(mbox_out, out, syndrome, drv_synd); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index f9b798af6335..c0e18f2ade99 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -432,7 +432,7 @@ static int reclaim_pages_cmd(struct mlx5_core_dev *dev, u32 npages; u32 i = 0; - if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) + if (!mlx5_cmd_is_down(dev)) return mlx5_cmd_exec(dev, in, in_size, out, out_size); /* No hard feelings, we want our pages back! */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 897156822f0d..372100c755e7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -935,6 +935,7 @@ int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); +bool mlx5_cmd_is_down(struct mlx5_core_dev *dev); int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); -- cgit v1.2.3 From a95bc734e60449e7b073ff7ff70c35083b290ae9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Oct 2020 09:46:04 +0200 Subject: netlink: fix policy dump leak If userspace doesn't complete the policy dump, we leak the allocated state. Fix this. Fixes: d07dcf9aadd6 ("netlink: add infrastructure to expose policies to userspace") Signed-off-by: Johannes Berg Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netlink.h | 3 ++- net/netlink/genetlink.c | 9 ++++++++- net/netlink/policy.c | 24 ++++++++++-------------- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 8e0eb2c9c528..271620f6bc7f 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1934,7 +1934,8 @@ void nla_get_range_signed(const struct nla_policy *pt, int netlink_policy_dump_start(const struct nla_policy *policy, unsigned int maxtype, unsigned long *state); -bool netlink_policy_dump_loop(unsigned long *state); +bool netlink_policy_dump_loop(unsigned long state); int netlink_policy_dump_write(struct sk_buff *skb, unsigned long state); +void netlink_policy_dump_free(unsigned long state); #endif diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 1eb65a7a27fd..c4b4d3376227 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1079,7 +1079,7 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; - while (netlink_policy_dump_loop(&cb->args[1])) { + while (netlink_policy_dump_loop(cb->args[1])) { void *hdr; struct nlattr *nest; @@ -1113,6 +1113,12 @@ nla_put_failure: return skb->len; } +static int ctrl_dumppolicy_done(struct netlink_callback *cb) +{ + netlink_policy_dump_free(cb->args[1]); + return 0; +} + static const struct genl_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, @@ -1123,6 +1129,7 @@ static const struct genl_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETPOLICY, .dumpit = ctrl_dumppolicy, + .done = ctrl_dumppolicy_done, }, }; diff --git a/net/netlink/policy.c b/net/netlink/policy.c index 641ffbdd977a..0176b59ce530 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -84,7 +84,6 @@ int netlink_policy_dump_start(const struct nla_policy *policy, unsigned int policy_idx; int err; - /* also returns 0 if "*_state" is our ERR_PTR() end marker */ if (*_state) return 0; @@ -140,21 +139,11 @@ static bool netlink_policy_dump_finished(struct nl_policy_dump *state) !state->policies[state->policy_idx].policy; } -bool netlink_policy_dump_loop(unsigned long *_state) +bool netlink_policy_dump_loop(unsigned long _state) { - struct nl_policy_dump *state = (void *)*_state; - - if (IS_ERR(state)) - return false; - - if (netlink_policy_dump_finished(state)) { - kfree(state); - /* store end marker instead of freed state */ - *_state = (unsigned long)ERR_PTR(-ENOENT); - return false; - } + struct nl_policy_dump *state = (void *)_state; - return true; + return !netlink_policy_dump_finished(state); } int netlink_policy_dump_write(struct sk_buff *skb, unsigned long _state) @@ -309,3 +298,10 @@ nla_put_failure: nla_nest_cancel(skb, policy); return -ENOBUFS; } + +void netlink_policy_dump_free(unsigned long _state) +{ + struct nl_policy_dump *state = (void *)_state; + + kfree(state); +} -- cgit v1.2.3 From a93bdcb94a0b3ca72046151412c2389dca681d2a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 2 Oct 2020 07:49:45 +0200 Subject: net: core: document two new elements of struct net_device As warned by "make htmldocs", there are two new struct elements that aren't documented: ../include/linux/netdevice.h:2159: warning: Function parameter or member 'unlink_list' not described in 'net_device' ../include/linux/netdevice.h:2159: warning: Function parameter or member 'nested_level' not described in 'net_device' Fixes: 1fc70edb7d7b ("net: core: add nested_level variable in net_device") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9fdb3ebef306..18dec08439f9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1851,6 +1851,11 @@ enum netdev_priv_flags { * @udp_tunnel_nic: UDP tunnel offload state * @xdp_state: stores info on attached XDP BPF programs * + * @nested_level: Used as as a parameter of spin_lock_nested() of + * dev->addr_list_lock. + * @unlink_list: As netif_addr_lock() can be called recursively, + * keep a list of interfaces to be deleted. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ -- cgit v1.2.3 From c381b07941adc2274ce552daf86c94701c5e265a Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 2 Oct 2020 16:27:28 +0800 Subject: net: introduce helper sendpage_ok() in include/linux/net.h The original problem was from nvme-over-tcp code, who mistakenly uses kernel_sendpage() to send pages allocated by __get_free_pages() without __GFP_COMP flag. Such pages don't have refcount (page_count is 0) on tail pages, sending them by kernel_sendpage() may trigger a kernel panic from a corrupted kernel heap, because these pages are incorrectly freed in network stack as page_count 0 pages. This patch introduces a helper sendpage_ok(), it returns true if the checking page, - is not slab page: PageSlab(page) is false. - has page refcount: page_count(page) is not zero All drivers who want to send page to remote end by kernel_sendpage() may use this helper to check whether the page is OK. If the helper does not return true, the driver should try other non sendpage method (e.g. sock_no_sendpage()) to handle the page. Signed-off-by: Coly Li Cc: Chaitanya Kulkarni Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Jan Kara Cc: Jens Axboe Cc: Mikhail Skorzhinskii Cc: Philipp Reisner Cc: Sagi Grimberg Cc: Vlastimil Babka Cc: stable@vger.kernel.org Signed-off-by: David S. Miller --- include/linux/net.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index d48ff1180879..ae713c851342 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -286,6 +287,21 @@ do { \ #define net_get_random_once_wait(buf, nbytes) \ get_random_once_wait((buf), (nbytes)) +/* + * E.g. XFS meta- & log-data is in slab pages, or bcache meta + * data pages, or other high order pages allocated by + * __get_free_pages() without __GFP_COMP, which have a page_count + * of 0 and/or have PageSlab() set. We cannot use send_page for + * those, as that does get_page(); put_page(); and would cause + * either a VM_BUG directly, or __page_cache_release a page that + * would actually still be referenced by someone, leading to some + * obscure delayed Oops somewhere else. + */ +static inline bool sendpage_ok(struct page *page) +{ + return !PageSlab(page) && page_count(page) >= 1; +} + int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, -- cgit v1.2.3