From ed1182dc004dbcc7cfe64fb0e8ac520b25431715 Mon Sep 17 00:00:00 2001
From: Björn Töpel
Date: Fri, 27 Nov 2020 18:17:26 +0100
Subject: xdp: Handle MEM_TYPE_XSK_BUFF_POOL correctly in xdp_return_buff()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It turns out that a path does exist where xdp_return_buff() is passed
an XDP buffer of type MEM_TYPE_XSK_BUFF_POOL: it is taken when AF_XDP
zero-copy mode is enabled and a buffer is redirected to a DEVMAP whose
attached XDP program drops the buffer.

This change simply puts the handling of MEM_TYPE_XSK_BUFF_POOL back
into xdp_return_buff().

Fixes: 82c41671ca4f ("xdp: Simplify xdp_return_{frame, frame_rx_napi, buff}")
Reported-by: Maxim Mikityanskiy
Signed-off-by: Björn Töpel
Signed-off-by: Daniel Borkmann
Acked-by: Maxim Mikityanskiy
Link: https://lore.kernel.org/bpf/20201127171726.123627-1-bjorn.topel@gmail.com
---
 net/core/xdp.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/core/xdp.c b/net/core/xdp.c
index 48aba933a5a8..491ad569a79c 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -335,11 +335,10 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
  * scenarios (e.g. queue full), it is possible to return the xdp_frame
  * while still leveraging this protection. The @napi_direct boolean
  * is used for those calls sites. Thus, allowing for faster recycling
- * of xdp_frames/pages in those cases. This path is never used by the
- * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of
- * the switch-statement.
+ * of xdp_frames/pages in those cases.
  */
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+                         struct xdp_buff *xdp)
 {
         struct xdp_mem_allocator *xa;
         struct page *page;
@@ -361,6 +360,10 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
                 page = virt_to_page(data); /* Assumes order0 page*/
                 put_page(page);
                 break;
+        case MEM_TYPE_XSK_BUFF_POOL:
+                /* NB! Only valid from an xdp_buff! */
+                xsk_buff_free(xdp);
+                break;
         default:
                 /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
                 WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
@@ -370,19 +373,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
-        __xdp_return(xdpf->data, &xdpf->mem, false);
+        __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
 {
-        __xdp_return(xdpf->data, &xdpf->mem, true);
+        __xdp_return(xdpf->data, &xdpf->mem, true, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
 
 void xdp_return_buff(struct xdp_buff *xdp)
 {
-        __xdp_return(xdp->data, &xdp->rxq->mem, true);
+        __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
 }
 
 /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
--
cgit v1.2.3


From f5da54187e33dce9bea63170667dbb0ca8d98194 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo
Date: Tue, 1 Dec 2020 21:56:57 +0800
Subject: xsk: Replace datagram_poll by sock_poll_wait

datagram_poll() derives the current socket status (EPOLLIN, EPOLLOUT)
from generic socket state such as sk_wmem_alloc, which does not apply
to xsk. This patch therefore uses sock_poll_wait() instead of
datagram_poll() and lets xsk_poll() calculate the mask itself.
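As a rough illustration (an editor's sketch, not part of the patch; the
helper name is made up), a user-space poll() on an AF_XDP socket now
gets a readiness mask derived entirely from the state of the rings by
xsk_poll(), rather than from generic socket accounting:

#include <poll.h>

/* Block until the AF_XDP socket is readable or writable.
 * POLLIN means the rx ring has completed descriptors to consume,
 * POLLOUT means the tx ring can accept more descriptors.
 */
static int wait_for_xsk(int xsk_fd)
{
        struct pollfd pfd = {
                .fd = xsk_fd,
                .events = POLLIN | POLLOUT,
        };

        return poll(&pfd, 1, -1); /* revents is filled in via xsk_poll() */
}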
Fixes: c497176cb2e4 ("xsk: add Rx receive functions and poll support")
Signed-off-by: Xuan Zhuo
Signed-off-by: Daniel Borkmann
Acked-by: Magnus Karlsson
Link: https://lore.kernel.org/bpf/e82f4697438cd63edbf271ebe1918db8261b7c09.1606555939.git.xuanzhuo@linux.alibaba.com
---
 net/xdp/xsk.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b7b039bd9d03..9bbfd8adbb73 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -471,11 +471,13 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 
 static __poll_t xsk_poll(struct file *file, struct socket *sock,
                          struct poll_table_struct *wait)
 {
-        __poll_t mask = datagram_poll(file, sock, wait);
+        __poll_t mask = 0;
         struct sock *sk = sock->sk;
         struct xdp_sock *xs = xdp_sk(sk);
         struct xsk_buff_pool *pool;
 
+        sock_poll_wait(file, sock, wait);
+
         if (unlikely(!xsk_is_bound(xs)))
                 return mask;
--
cgit v1.2.3


From 3413f04141aa440c71da187755e8e22f5093ce83 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo
Date: Tue, 1 Dec 2020 21:56:58 +0800
Subject: xsk: Change the tx writeable condition

Change the tx writeable condition from "the tx queue is not full" to
"the number of present tx entries is less than half of the total
number of entries". The tx queue being not-quite-full holds only for a
very short time, so the old condition generated a large number of
EPOLLOUT events and, with them, a large number of process wakeups.

Fixes: 35fcde7f8deb ("xsk: support for Tx")
Signed-off-by: Xuan Zhuo
Signed-off-by: Daniel Borkmann
Acked-by: Magnus Karlsson
Link: https://lore.kernel.org/bpf/508fef55188d4e1160747ead64c6dcda36735880.1606555939.git.xuanzhuo@linux.alibaba.com
---
 net/xdp/xsk.c       | 16 +++++++++++++---
 net/xdp/xsk_queue.h |  6 ++++++
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 9bbfd8adbb73..62504471fd20 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -211,6 +211,14 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
         return 0;
 }
 
+static bool xsk_tx_writeable(struct xdp_sock *xs)
+{
+        if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
+                return false;
+
+        return true;
+}
+
 static bool xsk_is_bound(struct xdp_sock *xs)
 {
         if (READ_ONCE(xs->state) == XSK_BOUND) {
@@ -296,7 +304,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool)
         rcu_read_lock();
         list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
                 __xskq_cons_release(xs->tx);
-                xs->sk.sk_write_space(&xs->sk);
+                if (xsk_tx_writeable(xs))
+                        xs->sk.sk_write_space(&xs->sk);
         }
         rcu_read_unlock();
 }
@@ -436,7 +445,8 @@ static int xsk_generic_xmit(struct sock *sk)
 
 out:
         if (sent_frame)
-                sk->sk_write_space(sk);
+                if (xsk_tx_writeable(xs))
+                        sk->sk_write_space(sk);
 
         mutex_unlock(&xs->mutex);
         return err;
@@ -493,7 +503,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,
 
         if (xs->rx && !xskq_prod_is_empty(xs->rx))
                 mask |= EPOLLIN | EPOLLRDNORM;
-        if (xs->tx && !xskq_cons_is_full(xs->tx))
+        if (xs->tx && xsk_tx_writeable(xs))
                 mask |= EPOLLOUT | EPOLLWRNORM;
 
         return mask;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index cdb9cf3cd136..9e71b9f27679 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -264,6 +264,12 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q)
                 q->nentries;
 }
 
+static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
+{
+        /* No barriers needed since data is not accessed */
+        return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer);
+}
+
 /* Functions for producers */
 
 static inline bool xskq_prod_is_full(struct xsk_queue *q)
--
cgit v1.2.3


From 12c8a8ca117f3d734babc3fba131fdaa329d2163 Mon Sep 17 00:00:00 2001
From: Zhang Changzhong
Date: Fri, 4 Dec 2020 18:21:16 +0800
Subject: xsk: Return error code if force_zc is set

If force_zc is set, we should return an error rather than silently
fall back to copy mode.

Fixes: 921b68692abb ("xsk: Enable sharing of dma mappings")
Reported-by: Hulk Robot
Signed-off-by: Zhang Changzhong
Signed-off-by: Daniel Borkmann
Acked-by: Magnus Karlsson
Link: https://lore.kernel.org/bpf/1607077277-41995-1-git-send-email-zhangchangzhong@huawei.com
---
 net/xdp/xsk_buff_pool.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 9287eddec52c..d5adeee9d5d9 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -175,6 +175,7 @@ static int __xp_assign_dev(struct xsk_buff_pool *pool,
 
         if (!pool->dma_pages) {
                 WARN(1, "Driver did not DMA map zero-copy buffers");
+                err = -EINVAL;
                 goto err_unreg_xsk;
         }
         pool->umem->zc = true;
--
cgit v1.2.3


From d9054a1ff585ba01029584ab730efc794603d68f Mon Sep 17 00:00:00 2001
From: Dongdong Wang
Date: Fri, 4 Dec 2020 23:59:45 -0800
Subject: lwt: Disable BH too in run_lwt_bpf()

The per-cpu bpf_redirect_info is shared between skb_do_redirect() and
the BPF redirect helpers. Callers on the RX path all run in BH
context, so disabling preemption alone is not sufficient to prevent
the xmit path from being interrupted by BH. In production we observed
strange packet drops caused by the race between LWT xmit and TC
ingress, and verified that the issue is gone once BH is disabled.

Although this bug has technically existed from the beginning, that is
commit 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel
infrastructure"), at that time call_rcu() had to be call_rcu_bh() to
match the RCU context, so this patch may not work well for kernels
before the RCU flavor consolidation completed around v5.0.

Update the comments above the code too, as call_rcu() is now BH
friendly.

Signed-off-by: Dongdong Wang
Signed-off-by: Alexei Starovoitov
Reviewed-by: Cong Wang
Link: https://lore.kernel.org/bpf/20201205075946.497763-1-xiyou.wangcong@gmail.com
---
 net/core/lwt_bpf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 7d3438215f32..4f3cb7c15ddf 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -39,12 +39,11 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 {
         int ret;
 
-        /* Preempt disable is needed to protect per-cpu redirect_info between
-         * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
-         * access to maps strictly require a rcu_read_lock() for protection,
-         * mixing with BH RCU lock doesn't work.
+        /* Preempt disable and BH disable are needed to protect per-cpu
+         * redirect_info between BPF prog and skb_do_redirect().
          */
         preempt_disable();
+        local_bh_disable();
         bpf_compute_data_pointers(skb);
         ret = bpf_prog_run_save_cb(lwt->prog, skb);
 
@@ -78,6 +77,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
                 break;
         }
 
+        local_bh_enable();
         preempt_enable();
 
         return ret;
--
cgit v1.2.3


From e3366884b383073a7edc1bad9634412ae0a22d4e Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Fri, 4 Dec 2020 23:59:46 -0800
Subject: lwt_bpf: Replace preempt_disable() with migrate_disable()

migrate_disable() is just a wrapper around preempt_disable() on non-RT
kernels, so it is safe to replace it, and RT kernels will benefit.
Note that migrate_disable() has only been available since Feb 2020.
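For context, a rough sketch (an editor's illustration, not part of the
patch, and simplified with respect to the real definitions in the
preempt/scheduler headers) of why the substitution preserves behaviour
on non-RT kernels while still protecting the per-cpu redirect_info on
PREEMPT_RT:

#ifndef CONFIG_PREEMPT_RT
/* On non-RT kernels, disabling migration boils down to disabling
 * preemption, so run_lwt_bpf() behaves exactly as before.
 */
static inline void migrate_disable(void) { preempt_disable(); }
static inline void migrate_enable(void)  { preempt_enable(); }
#else
/* On PREEMPT_RT, only migration is disabled: the task stays on its
 * CPU (so per-cpu data such as redirect_info stays consistent) but
 * can still be preempted, which keeps latencies low.
 */
#endif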
Suggested-by: Alexei Starovoitov
Signed-off-by: Cong Wang
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20201205075946.497763-2-xiyou.wangcong@gmail.com
---
 net/core/lwt_bpf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 4f3cb7c15ddf..2f7940bcf715 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -39,10 +39,10 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 {
         int ret;
 
-        /* Preempt disable and BH disable are needed to protect per-cpu
+        /* Migration disable and BH disable are needed to protect per-cpu
          * redirect_info between BPF prog and skb_do_redirect().
          */
-        preempt_disable();
+        migrate_disable();
         local_bh_disable();
         bpf_compute_data_pointers(skb);
         ret = bpf_prog_run_save_cb(lwt->prog, skb);
@@ -78,7 +78,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
         }
 
         local_bh_enable();
-        preempt_enable();
+        migrate_enable();
 
         return ret;
 }
--
cgit v1.2.3


From 998f17296234aa8d3676b4a13962eb39f4ad24e0 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen
Date: Wed, 9 Dec 2020 14:57:37 +0100
Subject: xdp: Remove the xdp_attachment_flags_ok() callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP
BPF programs in net_device"), the XDP program attachment info is now
maintained in the core code. This interacts badly with the
xdp_attachment_flags_ok() check that prevents unloading an XDP program
with different load flags than it was loaded with. In practice, two
kinds of failures are seen:

- An XDP program loaded without specifying a mode (and which then ends
  up in driver mode) cannot be unloaded if the program mode is
  specified on unload.

- The dev_xdp_uninstall() hook always calls the driver callback with
  the mode set to the type of the program but an empty flags argument,
  which means the flags_ok() check prevents the program from being
  removed, leading to bpf prog reference leaks.

The original reason this check was added was to avoid ambiguity when
multiple programs were loaded. With the way the checks are done in the
core now, this is quite simple to enforce in the core code, so let's
add a check there and get rid of the xdp_attachment_flags_ok()
callback entirely.
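As a rough illustration of the first failure mode (an editor's sketch,
not part of the patch; it uses libbpf's bpf_set_link_xdp_fd(), with
error handling trimmed), attaching with no mode flag and then detaching
with an explicit mode used to bounce off the driver's flags_ok() check
with -EBUSY:

#include <bpf/bpf.h>
#include <linux/if_link.h>

static int xdp_attach_then_detach(int ifindex, int prog_fd)
{
        int err;

        /* No mode flag: on a driver with native XDP support the
         * program ends up in driver mode.
         */
        err = bpf_set_link_xdp_fd(ifindex, prog_fd, 0);
        if (err)
                return err;

        /* Detach while naming the mode explicitly; before this change
         * the driver-side flags check rejected this with -EBUSY.
         */
        return bpf_set_link_xdp_fd(ifindex, -1, XDP_FLAGS_DRV_MODE);
}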
Fixes: 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device")
Signed-off-by: Toke Høiland-Jørgensen
Signed-off-by: Daniel Borkmann
Acked-by: Jakub Kicinski
Link: https://lore.kernel.org/bpf/160752225751.110217.10267659521308669050.stgit@toke.dk
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c |  6 ------
 drivers/net/ethernet/ti/cpsw_priv.c             |  3 ---
 drivers/net/netdevsim/bpf.c                     |  3 ---
 include/net/xdp.h                               |  2 --
 net/core/dev.c                                  | 22 ++++++++++++++++++++--
 net/core/xdp.c                                  | 12 ------------
 6 files changed, 20 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index b150da43adb2..437226866ce8 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3562,9 +3562,6 @@ static int nfp_net_xdp_setup_drv(struct nfp_net *nn, struct netdev_bpf *bpf)
         struct nfp_net_dp *dp;
         int err;
 
-        if (!xdp_attachment_flags_ok(&nn->xdp, bpf))
-                return -EBUSY;
-
         if (!prog == !nn->dp.xdp_prog) {
                 WRITE_ONCE(nn->dp.xdp_prog, prog);
                 xdp_attachment_setup(&nn->xdp, bpf);
@@ -3593,9 +3590,6 @@ static int nfp_net_xdp_setup_hw(struct nfp_net *nn, struct netdev_bpf *bpf)
 {
         int err;
 
-        if (!xdp_attachment_flags_ok(&nn->xdp_hw, bpf))
-                return -EBUSY;
-
         err = nfp_app_xdp_offload(nn->app, nn, bpf->prog, bpf->extack);
         if (err)
                 return err;
diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c
index 31c5e36ff706..424e644724e4 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.c
+++ b/drivers/net/ethernet/ti/cpsw_priv.c
@@ -1265,9 +1265,6 @@ static int cpsw_xdp_prog_setup(struct cpsw_priv *priv, struct netdev_bpf *bpf)
         if (!priv->xdpi.prog && !prog)
                 return 0;
 
-        if (!xdp_attachment_flags_ok(&priv->xdpi, bpf))
-                return -EBUSY;
-
         WRITE_ONCE(priv->xdp_prog, prog);
         xdp_attachment_setup(&priv->xdpi, bpf);
 
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 2e90512f3bbe..85546664bdd5 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -190,9 +190,6 @@ nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf,
 {
         int err;
 
-        if (!xdp_attachment_flags_ok(xdp, bpf))
-                return -EBUSY;
-
         if (bpf->command == XDP_SETUP_PROG && !ns->bpf_xdpdrv_accept) {
                 NSIM_EA(bpf->extack, "driver XDP disabled in DebugFS");
                 return -EOPNOTSUPP;
         }
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 3814fb631d52..9dab2bc6f187 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -240,8 +240,6 @@ struct xdp_attachment_info {
 };
 
 struct netdev_bpf;
-bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
-                             struct netdev_bpf *bpf);
 void xdp_attachment_setup(struct xdp_attachment_info *info,
                           struct netdev_bpf *bpf);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 8588ade790cb..38412e70f761 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8917,6 +8917,17 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
         return dev->xdp_state[mode].prog;
 }
 
+static u8 dev_xdp_prog_count(struct net_device *dev)
+{
+        u8 count = 0;
+        int i;
+
+        for (i = 0; i < __MAX_XDP_MODE; i++)
+                if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
+                        count++;
+        return count;
+}
+
 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 {
         struct bpf_prog *prog = dev_xdp_prog(dev, mode);
@@ -9007,6 +9018,7 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
                           struct bpf_xdp_link *link, struct bpf_prog *new_prog,
                           struct bpf_prog *old_prog, u32 flags)
 {
+        unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
         struct bpf_prog *cur_prog;
         enum bpf_xdp_mode mode;
         bpf_op_t bpf_op;
@@ -9022,11 +9034,17 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
                 NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
                 return -EINVAL;
         }
-        /* just one XDP mode bit should be set, zero defaults to SKB mode */
-        if (hweight32(flags & XDP_FLAGS_MODES) > 1) {
+        /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
+        if (num_modes > 1) {
                 NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
                 return -EINVAL;
         }
+        /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
+        if (!num_modes && dev_xdp_prog_count(dev) > 1) {
+                NL_SET_ERR_MSG(extack,
+                               "More than one program loaded, unset mode is ambiguous");
+                return -EINVAL;
+        }
         /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
         if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
                 NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 491ad569a79c..d900cebc0acd 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -403,18 +403,6 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
 }
 EXPORT_SYMBOL_GPL(__xdp_release_frame);
 
-bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
-                             struct netdev_bpf *bpf)
-{
-        if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) {
-                NL_SET_ERR_MSG(bpf->extack,
-                               "program loaded with different flags");
-                return false;
-        }
-        return true;
-}
-EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok);
-
 void xdp_attachment_setup(struct xdp_attachment_info *info,
                           struct netdev_bpf *bpf)
 {
--
cgit v1.2.3
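With the flags_ok() callback gone, a driver's XDP setup path no longer
needs any flags validation of its own; the mode checks happen once in
dev_xdp_attach(). A rough sketch of what remains in a driver (an
editor's illustration modelled on the cpsw and netdevsim hunks above;
the struct and field names are made up):

static int example_xdp_setup(struct example_priv *priv,
                             struct netdev_bpf *bpf)
{
        /* no xdp_attachment_flags_ok() call needed anymore */
        WRITE_ONCE(priv->xdp_prog, bpf->prog);
        xdp_attachment_setup(&priv->xdpi, bpf);
        return 0;
}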