From 271bfcfb83a9f77cbae3d6e1a16e3c14132922f0 Mon Sep 17 00:00:00 2001
From: Daniil Dulov
Date: Mon, 27 Feb 2023 01:17:51 -0800
Subject: RDMA/siw: Fix potential page_array out of range access

When seg is equal to MAX_ARRAY, the loop should break, otherwise it
will result in an out-of-range access.

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Fixes: b9be6f18cf9e ("rdma/siw: transmit path")
Signed-off-by: Daniil Dulov
Link: https://lore.kernel.org/r/20230227091751.589612-1-d.dulov@aladdin.ru
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/sw/siw/siw_qp_tx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/infiniband/sw')

diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
index 05052b49107f..6bb9e9e81ff4 100644
--- a/drivers/infiniband/sw/siw/siw_qp_tx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -558,7 +558,7 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
 			data_len -= plen;
 			fp_off = 0;
 
-			if (++seg > (int)MAX_ARRAY) {
+			if (++seg >= (int)MAX_ARRAY) {
 				siw_dbg_qp(tx_qp(c_tx), "to many fragments\n");
 				siw_unmap_pages(iov, kmap_mask, seg-1);
 				wqe->processed -= c_tx->bytes_unsent;
-- 
cgit v1.2.3

From f2f6e1661d38fbce6163fc321395e9de24a9529b Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Sat, 18 Feb 2023 10:57:05 -0800
Subject: IB/rdmavt: Fix target union member for rvt_post_one_wr()

The "cplen" result used by the memcpy() into struct rvt_swqe "wqe" may
be sized to 80 for struct rvt_ud_wr (which is member "ud_wr", not "wr"
which is only 40 bytes in size). Change the destination union member so
the compiler can use the correct bounds check.

  struct rvt_swqe {
          union {
                  struct ib_send_wr wr;   /* don't use wr.sg_list */
                  struct rvt_ud_wr ud_wr;
                  ...
          };
          ...
  };

Silences false positive memcpy() run-time warning:

  memcpy: detected field-spanning write (size 80) of single field "&wqe->wr" at drivers/infiniband/sw/rdmavt/qp.c:2043 (size 40)

Reported-by: Zhang Yi
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216561
Cc: Dennis Dalessandro
Cc: Jason Gunthorpe
Cc: Leon Romanovsky
Cc: linux-rdma@vger.kernel.org
Signed-off-by: Kees Cook
Link: https://lore.kernel.org/r/20230218185701.never.779-kees@kernel.org
Reviewed-by: Dennis Dalessandro
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/sw/rdmavt/qp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/infiniband/sw')

diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 3acab569fbb9..3f707e1fa517 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -2040,7 +2040,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
 	wqe = rvt_get_swqe_ptr(qp, qp->s_head);
 
 	/* cplen has length from above */
-	memcpy(&wqe->wr, wr, cplen);
+	memcpy(&wqe->ud_wr, wr, cplen);
 
 	wqe->length = 0;
 	j = 0;
-- 
cgit v1.2.3

From b73a0b80c69de77d8d4942abb37066531c0169b2 Mon Sep 17 00:00:00 2001
From: Natalia Petrova
Date: Fri, 3 Mar 2023 15:44:08 +0300
Subject: RDMA/rdmavt: Delete unnecessary NULL check

There is no need to check 'rdi->qp_dev' for NULL. The field 'qp_dev'
is created in rvt_register_device(), which will fail if the 'qp_dev'
allocation fails in rvt_driver_qp_init(). Otherwise this pointer
doesn't change and is passed to rvt_qp_exit() in the next step.

Found by Linux Verification Center (linuxtesting.org) with SVACE.
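
For illustration, the deleted check is dead code because of an invariant
established at registration time. A minimal, hypothetical userspace model
of that invariant (not the rdmavt code; all names here are invented for
the example):

	#include <stdlib.h>

	struct qp_dev { int *qp_table; };
	struct dev_info { struct qp_dev *qp_dev; };

	/* models rvt_driver_qp_init(): the only place qp_dev is created */
	static int driver_qp_init(struct dev_info *rdi)
	{
		rdi->qp_dev = calloc(1, sizeof(*rdi->qp_dev));
		if (!rdi->qp_dev)
			return -1;
		rdi->qp_dev->qp_table = calloc(16, sizeof(int));
		if (!rdi->qp_dev->qp_table) {
			free(rdi->qp_dev);
			rdi->qp_dev = NULL;
			return -1;
		}
		return 0;
	}

	/* models rvt_register_device(): fails if qp init fails, so a
	 * registered device always has a non-NULL qp_dev */
	static int register_device(struct dev_info *rdi)
	{
		return driver_qp_init(rdi);
	}

	/* models rvt_qp_exit(): only ever called for registered devices,
	 * so a NULL check on qp_dev here would be unreachable */
	static void qp_exit(struct dev_info *rdi)
	{
		free(rdi->qp_dev->qp_table);
		free(rdi->qp_dev);
	}

	int main(void)
	{
		struct dev_info rdi = { 0 };

		if (register_device(&rdi))
			return 1; /* qp_exit() is never reached on this path */
		qp_exit(&rdi);
		return 0;
	}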
Fixes: 0acb0cc7ecc1 ("IB/rdmavt: Initialize and teardown of qpn table")
Signed-off-by: Natalia Petrova
Link: https://lore.kernel.org/r/20230303124408.16685-1-n.petrova@fintech.ru
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/sw/rdmavt/qp.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'drivers/infiniband/sw')

diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 3f707e1fa517..2d143a9d2027 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -464,8 +464,6 @@ void rvt_qp_exit(struct rvt_dev_info *rdi)
 	if (qps_inuse)
 		rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
 			   qps_inuse);
-	if (!rdi->qp_dev)
-		return;
 
 	kfree(rdi->qp_dev->qp_table);
 	free_qpn_table(&rdi->qp_dev->qpn_table);
-- 
cgit v1.2.3

From 9168d125ea032ad199275193493c13cb077da5cc Mon Sep 17 00:00:00 2001
From: Bob Pearson
Date: Fri, 3 Mar 2023 16:16:21 -0600
Subject: RDMA/rxe: Replace exists by rxe in rxe.c

'exists' looks like a boolean. This patch replaces it with the normal
name used for the rxe device, 'rxe', which should be a little less
confusing.

The second rxe_dbg() message is incorrect since rxe is known to be NULL
and would cause a seg fault if this message were ever printed. Replace
it with pr_debug for the moment.

Fixes: c6aba5ea0055 ("RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe.c")
Link: https://lore.kernel.org/r/20230303221623.8053-2-rpearsonhpe@gmail.com
Signed-off-by: Bob Pearson
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/sw/rxe/rxe.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'drivers/infiniband/sw')

diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 136c2efe3466..a3f05fdd9fac 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -175,7 +175,7 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
 
 static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
 {
-	struct rxe_dev *exists;
+	struct rxe_dev *rxe;
 	int err = 0;
 
 	if (is_vlan_dev(ndev)) {
@@ -184,17 +184,17 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
 		goto err;
 	}
 
-	exists = rxe_get_dev_from_net(ndev);
-	if (exists) {
-		ib_device_put(&exists->ib_dev);
-		rxe_dbg(exists, "already configured on %s\n", ndev->name);
+	rxe = rxe_get_dev_from_net(ndev);
+	if (rxe) {
+		ib_device_put(&rxe->ib_dev);
+		rxe_dbg(rxe, "already configured on %s\n", ndev->name);
 		err = -EEXIST;
 		goto err;
 	}
 
 	err = rxe_net_add(ibdev_name, ndev);
 	if (err) {
-		rxe_dbg(exists, "failed to add %s\n", ndev->name);
+		pr_debug("failed to add %s\n", ndev->name);
 		goto err;
 	}
 err:
-- 
cgit v1.2.3

From a9fb3287211e64b94ceb2b6b4791cc2b829d0d56 Mon Sep 17 00:00:00 2001
From: Bob Pearson
Date: Fri, 3 Mar 2023 16:16:22 -0600
Subject: RDMA/rxe: Change rxe_dbg to rxe_dbg_dev

Replace the name rxe_dbg with rxe_dbg_dev, which better matches the
remaining rxe_dbg_xxx macros for debug messages with a rxe device
parameter. Reuse the name rxe_dbg for debug messages which do not have
a rxe device parameter.
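
Condensed from the rxe.h hunk in the diff below, the two flavors end up
as (the call sites shown are illustrative, not part of this patch):

	/* no device available, prints via pr_debug() and adds its own \n */
	#define rxe_dbg(fmt, ...) pr_debug("%s: " fmt "\n", __func__, ##__VA_ARGS__)

	/* device-qualified variant, matching rxe_dbg_qp(), rxe_dbg_mr(), ... */
	#define rxe_dbg_dev(rxe, fmt, ...) ibdev_dbg(&(rxe)->ib_dev, \
			"%s: " fmt, __func__, ##__VA_ARGS__)

	/* illustrative call sites */
	rxe_dbg("failed to add %s", ndev->name);	/* no rxe device yet */
	rxe_dbg_dev(rxe, "already configured on %s\n", ndev->name);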
Link: https://lore.kernel.org/r/20230303221623.8053-3-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 2 +- drivers/infiniband/sw/rxe/rxe.h | 3 ++- drivers/infiniband/sw/rxe/rxe_cq.c | 6 +++--- drivers/infiniband/sw/rxe/rxe_icrc.c | 4 ++-- drivers/infiniband/sw/rxe/rxe_mmap.c | 6 +++--- drivers/infiniband/sw/rxe/rxe_net.c | 4 ++-- drivers/infiniband/sw/rxe/rxe_qp.c | 16 ++++++++-------- drivers/infiniband/sw/rxe/rxe_srq.c | 6 +++--- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- 9 files changed, 25 insertions(+), 24 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index a3f05fdd9fac..d57ba7a5964b 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -187,7 +187,7 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) rxe = rxe_get_dev_from_net(ndev); if (rxe) { ib_device_put(&rxe->ib_dev); - rxe_dbg(rxe, "already configured on %s\n", ndev->name); + rxe_dbg_dev(rxe, "already configured on %s\n", ndev->name); err = -EEXIST; goto err; } diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 2415f3704f57..0757acc38103 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -38,7 +38,8 @@ #define RXE_ROCE_V2_SPORT (0xc000) -#define rxe_dbg(rxe, fmt, ...) ibdev_dbg(&(rxe)->ib_dev, \ +#define rxe_dbg(fmt, ...) pr_debug("%s: " fmt "\n", __func__, ##__VA_ARGS__) +#define rxe_dbg_dev(rxe, fmt, ...) ibdev_dbg(&(rxe)->ib_dev, \ "%s: " fmt, __func__, ##__VA_ARGS__) #define rxe_dbg_uc(uc, fmt, ...) ibdev_dbg((uc)->ibuc.device, \ "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__) diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 1df186534639..22fbc198e5d1 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -14,12 +14,12 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, int count; if (cqe <= 0) { - rxe_dbg(rxe, "cqe(%d) <= 0\n", cqe); + rxe_dbg_dev(rxe, "cqe(%d) <= 0\n", cqe); goto err1; } if (cqe > rxe->attr.max_cqe) { - rxe_dbg(rxe, "cqe(%d) > max_cqe(%d)\n", + rxe_dbg_dev(rxe, "cqe(%d) > max_cqe(%d)\n", cqe, rxe->attr.max_cqe); goto err1; } @@ -65,7 +65,7 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, cq->queue = rxe_queue_init(rxe, &cqe, sizeof(struct rxe_cqe), type); if (!cq->queue) { - rxe_dbg(rxe, "unable to create cq\n"); + rxe_dbg_dev(rxe, "unable to create cq\n"); return -ENOMEM; } diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c index 71bc2c189588..fdf5f08cd8f1 100644 --- a/drivers/infiniband/sw/rxe/rxe_icrc.c +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -21,7 +21,7 @@ int rxe_icrc_init(struct rxe_dev *rxe) tfm = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(tfm)) { - rxe_dbg(rxe, "failed to init crc32 algorithm err: %ld\n", + rxe_dbg_dev(rxe, "failed to init crc32 algorithm err: %ld\n", PTR_ERR(tfm)); return PTR_ERR(tfm); } @@ -51,7 +51,7 @@ static __be32 rxe_crc32(struct rxe_dev *rxe, __be32 crc, void *next, size_t len) *(__be32 *)shash_desc_ctx(shash) = crc; err = crypto_shash_update(shash, next, len); if (unlikely(err)) { - rxe_dbg(rxe, "failed crc calculation, err: %d\n", err); + rxe_dbg_dev(rxe, "failed crc calculation, err: %d\n", err); return (__force __be32)crc32_le((__force u32)crc, next, len); } diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c 
b/drivers/infiniband/sw/rxe/rxe_mmap.c index a47d72dbc537..6b7f2bd69879 100644 --- a/drivers/infiniband/sw/rxe/rxe_mmap.c +++ b/drivers/infiniband/sw/rxe/rxe_mmap.c @@ -79,7 +79,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) /* Don't allow a mmap larger than the object. */ if (size > ip->info.size) { - rxe_dbg(rxe, "mmap region is larger than the object!\n"); + rxe_dbg_dev(rxe, "mmap region is larger than the object!\n"); spin_unlock_bh(&rxe->pending_lock); ret = -EINVAL; goto done; @@ -87,7 +87,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) goto found_it; } - rxe_dbg(rxe, "unable to find pending mmap info\n"); + rxe_dbg_dev(rxe, "unable to find pending mmap info\n"); spin_unlock_bh(&rxe->pending_lock); ret = -EINVAL; goto done; @@ -98,7 +98,7 @@ found_it: ret = remap_vmalloc_range(vma, ip->obj, 0); if (ret) { - rxe_dbg(rxe, "err %d from remap_vmalloc_range\n", ret); + rxe_dbg_dev(rxe, "err %d from remap_vmalloc_range\n", ret); goto done; } diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index e02e1624bcf4..a2ace42e9536 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -596,7 +596,7 @@ static int rxe_notify(struct notifier_block *not_blk, rxe_port_down(rxe); break; case NETDEV_CHANGEMTU: - rxe_dbg(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); + rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); rxe_set_mtu(rxe, ndev->mtu); break; case NETDEV_CHANGE: @@ -608,7 +608,7 @@ static int rxe_notify(struct notifier_block *not_blk, case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: default: - rxe_dbg(rxe, "ignoring netdev event = %ld for %s\n", + rxe_dbg_dev(rxe, "ignoring netdev event = %ld for %s\n", event, ndev->name); break; } diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index ab72db68b58f..c954dd9394ba 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -19,33 +19,33 @@ static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, int has_srq) { if (cap->max_send_wr > rxe->attr.max_qp_wr) { - rxe_dbg(rxe, "invalid send wr = %u > %d\n", + rxe_dbg_dev(rxe, "invalid send wr = %u > %d\n", cap->max_send_wr, rxe->attr.max_qp_wr); goto err1; } if (cap->max_send_sge > rxe->attr.max_send_sge) { - rxe_dbg(rxe, "invalid send sge = %u > %d\n", + rxe_dbg_dev(rxe, "invalid send sge = %u > %d\n", cap->max_send_sge, rxe->attr.max_send_sge); goto err1; } if (!has_srq) { if (cap->max_recv_wr > rxe->attr.max_qp_wr) { - rxe_dbg(rxe, "invalid recv wr = %u > %d\n", + rxe_dbg_dev(rxe, "invalid recv wr = %u > %d\n", cap->max_recv_wr, rxe->attr.max_qp_wr); goto err1; } if (cap->max_recv_sge > rxe->attr.max_recv_sge) { - rxe_dbg(rxe, "invalid recv sge = %u > %d\n", + rxe_dbg_dev(rxe, "invalid recv sge = %u > %d\n", cap->max_recv_sge, rxe->attr.max_recv_sge); goto err1; } } if (cap->max_inline_data > rxe->max_inline_data) { - rxe_dbg(rxe, "invalid max inline data = %u > %d\n", + rxe_dbg_dev(rxe, "invalid max inline data = %u > %d\n", cap->max_inline_data, rxe->max_inline_data); goto err1; } @@ -73,7 +73,7 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) } if (!init->recv_cq || !init->send_cq) { - rxe_dbg(rxe, "missing cq\n"); + rxe_dbg_dev(rxe, "missing cq\n"); goto err1; } @@ -82,14 +82,14 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) if (init->qp_type == IB_QPT_GSI) { if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) { - 
rxe_dbg(rxe, "invalid port = %d\n", port_num); + rxe_dbg_dev(rxe, "invalid port = %d\n", port_num); goto err1; } port = &rxe->port; if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) { - rxe_dbg(rxe, "GSI QP exists for port %d\n", port_num); + rxe_dbg_dev(rxe, "GSI QP exists for port %d\n", port_num); goto err1; } } diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 82e37a41ced4..27ca82ec0826 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -13,13 +13,13 @@ int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init) struct ib_srq_attr *attr = &init->attr; if (attr->max_wr > rxe->attr.max_srq_wr) { - rxe_dbg(rxe, "max_wr(%d) > max_srq_wr(%d)\n", + rxe_dbg_dev(rxe, "max_wr(%d) > max_srq_wr(%d)\n", attr->max_wr, rxe->attr.max_srq_wr); goto err1; } if (attr->max_wr <= 0) { - rxe_dbg(rxe, "max_wr(%d) <= 0\n", attr->max_wr); + rxe_dbg_dev(rxe, "max_wr(%d) <= 0\n", attr->max_wr); goto err1; } @@ -27,7 +27,7 @@ int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init) attr->max_wr = RXE_MIN_SRQ_WR; if (attr->max_sge > rxe->attr.max_srq_sge) { - rxe_dbg(rxe, "max_sge(%d) > max_srq_sge(%d)\n", + rxe_dbg_dev(rxe, "max_sge(%d) > max_srq_sge(%d)\n", attr->max_sge, rxe->attr.max_srq_sge); goto err1; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index e14050a69276..f178d0773ff2 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1095,7 +1095,7 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) err = ib_register_device(dev, ibdev_name, NULL); if (err) - rxe_dbg(rxe, "failed with error %d\n", err); + rxe_dbg_dev(rxe, "failed with error %d\n", err); /* * Note that rxe may be invalid at this point if another thread -- cgit v1.2.3 From 9ac01f434a1eb56ea94611bd75cf62fa276b41f4 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 3 Mar 2023 16:16:23 -0600 Subject: RDMA/rxe: Extend dbg log messages to err and info Extend the dbg log messages (e.g. rxe_dbg_xxx) to include err and info types. rxe.c is modified to use these new log messages as examples. Link: https://lore.kernel.org/r/20230303221623.8053-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 8 +++++--- drivers/infiniband/sw/rxe/rxe.h | 42 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index d57ba7a5964b..7a7e713de52d 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -160,6 +160,8 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) port->attr.active_mtu = mtu; port->mtu_cap = ib_mtu_enum_to_int(mtu); + + rxe_info_dev(rxe, "Set mtu to %d", port->mtu_cap); } /* called by ifc layer to create new rxe device. 
@@ -179,7 +181,7 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) int err = 0; if (is_vlan_dev(ndev)) { - pr_err("rxe creation allowed on top of a real device only\n"); + rxe_err("rxe creation allowed on top of a real device only"); err = -EPERM; goto err; } @@ -187,14 +189,14 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) rxe = rxe_get_dev_from_net(ndev); if (rxe) { ib_device_put(&rxe->ib_dev); - rxe_dbg_dev(rxe, "already configured on %s\n", ndev->name); + rxe_err_dev(rxe, "already configured on %s", ndev->name); err = -EEXIST; goto err; } err = rxe_net_add(ibdev_name, ndev); if (err) { - pr_debug("failed to add %s\n", ndev->name); + rxe_err("failed to add %s\n", ndev->name); goto err; } err: diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 0757acc38103..bd8a8ea4ea8f 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -58,6 +58,48 @@ #define rxe_dbg_mw(mw, fmt, ...) ibdev_dbg((mw)->ibmw.device, \ "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err(fmt, ...) pr_err_ratelimited("%s: " fmt "\n", __func__, \ + ##__VA_ARGS__) +#define rxe_err_dev(rxe, fmt, ...) ibdev_err_ratelimited(&(rxe)->ib_dev, \ + "%s: " fmt, __func__, ##__VA_ARGS__) +#define rxe_err_uc(uc, fmt, ...) ibdev_err_ratelimited((uc)->ibuc.device, \ + "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_pd(pd, fmt, ...) ibdev_err_ratelimited((pd)->ibpd.device, \ + "pd#%d %s: " fmt, (pd)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_ah(ah, fmt, ...) ibdev_err_ratelimited((ah)->ibah.device, \ + "ah#%d %s: " fmt, (ah)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_srq(srq, fmt, ...) ibdev_err_ratelimited((srq)->ibsrq.device, \ + "srq#%d %s: " fmt, (srq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_qp(qp, fmt, ...) ibdev_err_ratelimited((qp)->ibqp.device, \ + "qp#%d %s: " fmt, (qp)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_cq(cq, fmt, ...) ibdev_err_ratelimited((cq)->ibcq.device, \ + "cq#%d %s: " fmt, (cq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_mr(mr, fmt, ...) ibdev_err_ratelimited((mr)->ibmr.device, \ + "mr#%d %s: " fmt, (mr)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_mw(mw, fmt, ...) ibdev_err_ratelimited((mw)->ibmw.device, \ + "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) + +#define rxe_info(fmt, ...) pr_info_ratelimited("%s: " fmt "\n", __func__, \ + ##__VA_ARGS__) +#define rxe_info_dev(rxe, fmt, ...) ibdev_info_ratelimited(&(rxe)->ib_dev, \ + "%s: " fmt, __func__, ##__VA_ARGS__) +#define rxe_info_uc(uc, fmt, ...) ibdev_info_ratelimited((uc)->ibuc.device, \ + "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_pd(pd, fmt, ...) ibdev_info_ratelimited((pd)->ibpd.device, \ + "pd#%d %s: " fmt, (pd)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_ah(ah, fmt, ...) ibdev_info_ratelimited((ah)->ibah.device, \ + "ah#%d %s: " fmt, (ah)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_srq(srq, fmt, ...) ibdev_info_ratelimited((srq)->ibsrq.device, \ + "srq#%d %s: " fmt, (srq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_qp(qp, fmt, ...) ibdev_info_ratelimited((qp)->ibqp.device, \ + "qp#%d %s: " fmt, (qp)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_cq(cq, fmt, ...) ibdev_info_ratelimited((cq)->ibcq.device, \ + "cq#%d %s: " fmt, (cq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_mr(mr, fmt, ...) 
ibdev_info_ratelimited((mr)->ibmr.device, \ + "mr#%d %s: " fmt, (mr)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_mw(mw, fmt, ...) ibdev_info_ratelimited((mw)->ibmw.device, \ + "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) + /* responder states */ enum resp_states { RESPST_NONE, -- cgit v1.2.3 From 5bf944f24129cbc4b5828348bdce2db94ca9fbd6 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 3 Mar 2023 16:16:24 -0600 Subject: RDMA/rxe: Add error messages This patch adds error and debug messages so that every interaction with rdma-core through a verbs API call or a completion error return will generate at least one error message backed up by debug messages with more detail. With dynamic debugging one can follow up after seeing an error message by turning on the appropriate debug messages. Link: https://lore.kernel.org/r/20230303221623.8053-5-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 4 + drivers/infiniband/sw/rxe/rxe_loc.h | 1 - drivers/infiniband/sw/rxe/rxe_mr.c | 13 - drivers/infiniband/sw/rxe/rxe_resp.c | 4 + drivers/infiniband/sw/rxe/rxe_verbs.c | 828 ++++++++++++++++++++++++---------- 5 files changed, 609 insertions(+), 241 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 20737fec392b..876057e3ee3c 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -428,6 +428,10 @@ static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe, uwc->wc_flags = IB_WC_WITH_IMM; uwc->byte_len = wqe->dma.length; } + } else { + if (wqe->status != IB_WC_WR_FLUSH_ERR) + rxe_err_qp(qp, "non-flush error status = %d", + wqe->status); } } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 1bb0cb479eb1..839de34cf4c9 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -80,7 +80,6 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length); int advance_dma_data(struct rxe_dma_info *dma, unsigned int length); int rxe_invalidate_mr(struct rxe_qp *qp, u32 key); int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe); -int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); void rxe_mr_cleanup(struct rxe_pool_elem *elem); /* rxe_mw.c */ diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index b10aa1580a64..1e17f8086d59 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -722,19 +722,6 @@ int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) return 0; } -int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) -{ - struct rxe_mr *mr = to_rmr(ibmr); - - /* See IBA 10.6.7.2.6 */ - if (atomic_read(&mr->num_mw) > 0) - return -EINVAL; - - rxe_cleanup(mr); - kfree_rcu(mr); - return 0; -} - void rxe_mr_cleanup(struct rxe_pool_elem *elem) { struct rxe_mr *mr = container_of(elem, typeof(*mr), elem); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 0cc1ba91d48c..4217eec03a94 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1151,6 +1151,10 @@ static enum resp_states do_complete(struct rxe_qp *qp, wc->port_num = qp->attr.port_num; } + } else { + if (wc->status != IB_WC_WR_FLUSH_ERR) + rxe_err_qp(qp, "non-flush error status = %d", + wc->status); } /* have copy for srq and reference for !srq */ diff 
--git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f178d0773ff2..84b53c070fc5 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -12,31 +12,48 @@ #include "rxe_queue.h" #include "rxe_hw_counters.h" -static int rxe_query_device(struct ib_device *dev, +static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr); + +/* dev */ +static int rxe_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, - struct ib_udata *uhw) + struct ib_udata *udata) { - struct rxe_dev *rxe = to_rdev(dev); + struct rxe_dev *rxe = to_rdev(ibdev); + int err; + + if (udata->inlen || udata->outlen) { + rxe_dbg_dev(rxe, "malformed udata"); + err = -EINVAL; + goto err_out; + } - if (uhw->inlen || uhw->outlen) - return -EINVAL; + memcpy(attr, &rxe->attr, sizeof(*attr)); - *attr = rxe->attr; return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; } -static int rxe_query_port(struct ib_device *dev, +static int rxe_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *attr) { - struct rxe_dev *rxe = to_rdev(dev); - int rc; + struct rxe_dev *rxe = to_rdev(ibdev); + int err, ret; - /* *attr being zeroed by the caller, avoid zeroing it here */ - *attr = rxe->port.attr; + if (port_num != 1) { + err = -EINVAL; + rxe_dbg_dev(rxe, "bad port_num = %d", port_num); + goto err_out; + } + + memcpy(attr, &rxe->port.attr, sizeof(*attr)); mutex_lock(&rxe->usdev_lock); - rc = ib_get_eth_speed(dev, port_num, &attr->active_speed, - &attr->active_width); + ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed, + &attr->active_width); if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; @@ -47,27 +64,45 @@ static int rxe_query_port(struct ib_device *dev, mutex_unlock(&rxe->usdev_lock); - return rc; + return ret; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; } -static int rxe_query_pkey(struct ib_device *device, +static int rxe_query_pkey(struct ib_device *ibdev, u32 port_num, u16 index, u16 *pkey) { - if (index > 0) - return -EINVAL; + struct rxe_dev *rxe = to_rdev(ibdev); + int err; + + if (index != 0) { + err = -EINVAL; + rxe_dbg_dev(rxe, "bad pkey index = %d", index); + goto err_out; + } *pkey = IB_DEFAULT_PKEY_FULL; return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; } -static int rxe_modify_device(struct ib_device *dev, +static int rxe_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *attr) { - struct rxe_dev *rxe = to_rdev(dev); + struct rxe_dev *rxe = to_rdev(ibdev); + int err; if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | - IB_DEVICE_MODIFY_NODE_DESC)) - return -EOPNOTSUPP; + IB_DEVICE_MODIFY_NODE_DESC)) { + err = -EOPNOTSUPP; + rxe_dbg_dev(rxe, "unsupported mask = 0x%x", mask); + goto err_out; + } if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid); @@ -78,16 +113,33 @@ static int rxe_modify_device(struct ib_device *dev, } return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; } -static int rxe_modify_port(struct ib_device *dev, - u32 port_num, int mask, struct ib_port_modify *attr) +static int rxe_modify_port(struct ib_device *ibdev, u32 port_num, + int mask, struct ib_port_modify *attr) { - struct rxe_dev *rxe = to_rdev(dev); + struct rxe_dev *rxe = to_rdev(ibdev); struct rxe_port *port; + int err; - port = &rxe->port; + if (port_num != 1) { + err = -EINVAL; + rxe_dbg_dev(rxe, "bad 
port_num = %d", port_num); + goto err_out; + } + //TODO is shutdown useful + if (mask & ~(IB_PORT_RESET_QKEY_CNTR)) { + err = -EOPNOTSUPP; + rxe_dbg_dev(rxe, "unsupported mask = 0x%x", mask); + goto err_out; + } + + port = &rxe->port; port->attr.port_cap_flags |= attr->set_port_cap_mask; port->attr.port_cap_flags &= ~attr->clr_port_cap_mask; @@ -95,73 +147,125 @@ static int rxe_modify_port(struct ib_device *dev, port->attr.qkey_viol_cntr = 0; return 0; -} -static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev, - u32 port_num) -{ - return IB_LINK_LAYER_ETHERNET; +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; } -static int rxe_alloc_ucontext(struct ib_ucontext *ibuc, struct ib_udata *udata) +static enum rdma_link_layer rxe_get_link_layer(struct ib_device *ibdev, + u32 port_num) { - struct rxe_dev *rxe = to_rdev(ibuc->device); - struct rxe_ucontext *uc = to_ruc(ibuc); + struct rxe_dev *rxe = to_rdev(ibdev); + int err; - return rxe_add_to_pool(&rxe->uc_pool, uc); -} + if (port_num != 1) { + err = -EINVAL; + rxe_dbg_dev(rxe, "bad port_num = %d", port_num); + goto err_out; + } -static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) -{ - struct rxe_ucontext *uc = to_ruc(ibuc); + return IB_LINK_LAYER_ETHERNET; - rxe_cleanup(uc); +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; } -static int rxe_port_immutable(struct ib_device *dev, u32 port_num, +static int rxe_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable) { + struct rxe_dev *rxe = to_rdev(ibdev); + struct ib_port_attr attr = {}; int err; - struct ib_port_attr attr; - immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + if (port_num != 1) { + err = -EINVAL; + rxe_dbg_dev(rxe, "bad port_num = %d", port_num); + goto err_out; + } - err = ib_query_port(dev, port_num, &attr); + err = ib_query_port(ibdev, port_num, &attr); if (err) - return err; + goto err_out; + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; } +/* uc */ +static int rxe_alloc_ucontext(struct ib_ucontext *ibuc, struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(ibuc->device); + struct rxe_ucontext *uc = to_ruc(ibuc); + int err; + + err = rxe_add_to_pool(&rxe->uc_pool, uc); + if (err) + rxe_err_dev(rxe, "unable to create uc"); + + return err; +} + +static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) +{ + struct rxe_ucontext *uc = to_ruc(ibuc); + int err; + + err = rxe_cleanup(uc); + if (err) + rxe_err_uc(uc, "cleanup failed, err = %d", err); +} + +/* pd */ static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); + int err; + + err = rxe_add_to_pool(&rxe->pd_pool, pd); + if (err) { + rxe_dbg_dev(rxe, "unable to alloc pd"); + goto err_out; + } - return rxe_add_to_pool(&rxe->pd_pool, pd); + return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; } static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_pd *pd = to_rpd(ibpd); + int err; + + err = rxe_cleanup(pd); + if (err) + rxe_err_pd(pd, "cleanup failed, err = %d", err); - rxe_cleanup(pd); return 0; } +/* ah */ static int rxe_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata) - { struct rxe_dev *rxe 
= to_rdev(ibah->device); struct rxe_ah *ah = to_rah(ibah); struct rxe_create_ah_resp __user *uresp = NULL; - int err; + int err, cleanup_err; if (udata) { /* test if new user provider */ @@ -174,16 +278,18 @@ static int rxe_create_ah(struct ib_ah *ibah, err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); - if (err) - return err; + if (err) { + rxe_dbg_dev(rxe, "unable to create ah"); + goto err_out; + } /* create index > 0 */ ah->ah_num = ah->elem.index; err = rxe_ah_chk_attr(ah, init_attr->ah_attr); if (err) { - rxe_cleanup(ah); - return err; + rxe_dbg_ah(ah, "bad attr"); + goto err_cleanup; } if (uresp) { @@ -191,8 +297,9 @@ static int rxe_create_ah(struct ib_ah *ibah, err = copy_to_user(&uresp->ah_num, &ah->ah_num, sizeof(uresp->ah_num)); if (err) { - rxe_cleanup(ah); - return -EFAULT; + err = -EFAULT; + rxe_dbg_ah(ah, "unable to copy to user"); + goto err_cleanup; } } else if (ah->is_user) { /* only if old user provider */ @@ -203,19 +310,34 @@ static int rxe_create_ah(struct ib_ah *ibah, rxe_finalize(ah); return 0; + +err_cleanup: + cleanup_err = rxe_cleanup(ah); + if (cleanup_err) + rxe_err_ah(ah, "cleanup failed, err = %d", cleanup_err); +err_out: + rxe_err_ah(ah, "returned err = %d", err); + return err; } static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { - int err; struct rxe_ah *ah = to_rah(ibah); + int err; err = rxe_ah_chk_attr(ah, attr); - if (err) - return err; + if (err) { + rxe_dbg_ah(ah, "bad attr"); + goto err_out; + } rxe_init_av(attr, &ah->av); + return 0; + +err_out: + rxe_err_ah(ah, "returned err = %d", err); + return err; } static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) @@ -225,92 +347,77 @@ static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) memset(attr, 0, sizeof(*attr)); attr->type = ibah->type; rxe_av_to_attr(&ah->av, attr); + return 0; } static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); + int err; - rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE); - - return 0; -} - -static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) -{ - int i; - u32 length; - struct rxe_recv_wqe *recv_wqe; - int num_sge = ibwr->num_sge; - int full; - - full = queue_full(rq->queue, QUEUE_TYPE_FROM_ULP); - if (unlikely(full)) - return -ENOMEM; - - if (unlikely(num_sge > rq->max_sge)) - return -EINVAL; - - length = 0; - for (i = 0; i < num_sge; i++) - length += ibwr->sg_list[i].length; - - recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_FROM_ULP); - recv_wqe->wr_id = ibwr->wr_id; - - memcpy(recv_wqe->dma.sge, ibwr->sg_list, - num_sge * sizeof(struct ib_sge)); - - recv_wqe->dma.length = length; - recv_wqe->dma.resid = length; - recv_wqe->dma.num_sge = num_sge; - recv_wqe->dma.cur_sge = 0; - recv_wqe->dma.sge_offset = 0; - - queue_advance_producer(rq->queue, QUEUE_TYPE_FROM_ULP); + err = rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE); + if (err) + rxe_err_ah(ah, "cleanup failed, err = %d", err); return 0; } +/* srq */ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, struct ib_udata *udata) { - int err; struct rxe_dev *rxe = to_rdev(ibsrq->device); struct rxe_pd *pd = to_rpd(ibsrq->pd); struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_create_srq_resp __user *uresp = NULL; + int err, cleanup_err; if (udata) { - if (udata->outlen < sizeof(*uresp)) - return -EINVAL; + if (udata->outlen < sizeof(*uresp)) { + err = -EINVAL; + rxe_err_dev(rxe, "malformed udata"); + goto err_out; + } uresp = 
udata->outbuf; } - if (init->srq_type != IB_SRQT_BASIC) - return -EOPNOTSUPP; + if (init->srq_type != IB_SRQT_BASIC) { + err = -EOPNOTSUPP; + rxe_dbg_dev(rxe, "srq type = %d, not supported", + init->srq_type); + goto err_out; + } err = rxe_srq_chk_init(rxe, init); - if (err) - return err; + if (err) { + rxe_dbg_dev(rxe, "invalid init attributes"); + goto err_out; + } err = rxe_add_to_pool(&rxe->srq_pool, srq); - if (err) - return err; + if (err) { + rxe_dbg_dev(rxe, "unable to create srq, err = %d", err); + goto err_out; + } rxe_get(pd); srq->pd = pd; err = rxe_srq_from_init(rxe, srq, init, udata, uresp); - if (err) + if (err) { + rxe_dbg_srq(srq, "create srq failed, err = %d", err); goto err_cleanup; + } return 0; err_cleanup: - rxe_cleanup(srq); - + cleanup_err = rxe_cleanup(srq); + if (cleanup_err) + rxe_err_srq(srq, "cleanup failed, err = %d", cleanup_err); +err_out: + rxe_err_dev(rxe, "returned err = %d", err); return err; } @@ -318,46 +425,64 @@ static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, struct ib_udata *udata) { - int err; struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_dev *rxe = to_rdev(ibsrq->device); - struct rxe_modify_srq_cmd ucmd = {}; + struct rxe_modify_srq_cmd cmd = {}; + int err; if (udata) { - if (udata->inlen < sizeof(ucmd)) - return -EINVAL; + if (udata->inlen < sizeof(cmd)) { + err = -EINVAL; + rxe_dbg_srq(srq, "malformed udata"); + goto err_out; + } - err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); - if (err) - return err; + err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (err) { + err = -EFAULT; + rxe_dbg_srq(srq, "unable to read udata"); + goto err_out; + } } err = rxe_srq_chk_attr(rxe, srq, attr, mask); - if (err) - return err; + if (err) { + rxe_dbg_srq(srq, "bad init attributes"); + goto err_out; + } + + err = rxe_srq_from_attr(rxe, srq, attr, mask, &cmd, udata); + if (err) { + rxe_dbg_srq(srq, "bad attr"); + goto err_out; + } + + return 0; - return rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd, udata); +err_out: + rxe_err_srq(srq, "returned err = %d", err); + return err; } static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) { struct rxe_srq *srq = to_rsrq(ibsrq); + int err; - if (srq->error) - return -EINVAL; + if (srq->error) { + err = -EINVAL; + rxe_dbg_srq(srq, "srq in error state"); + goto err_out; + } attr->max_wr = srq->rq.queue->buf->index_mask; attr->max_sge = srq->rq.max_sge; attr->srq_limit = srq->limit; return 0; -} - -static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) -{ - struct rxe_srq *srq = to_rsrq(ibsrq); - rxe_cleanup(srq); - return 0; +err_out: + rxe_err_srq(srq, "returned err = %d", err); + return err; } static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, @@ -378,76 +503,116 @@ static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, spin_unlock_irqrestore(&srq->rq.producer_lock, flags); - if (err) + if (err) { *bad_wr = wr; + rxe_err_srq(srq, "returned err = %d", err); + } return err; } +static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + int err; + + err = rxe_cleanup(srq); + if (err) + rxe_err_srq(srq, "cleanup failed, err = %d", err); + + return 0; +} + +/* qp */ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, struct ib_udata *udata) { - int err; struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_pd *pd = to_rpd(ibqp->pd); struct rxe_qp *qp = to_rqp(ibqp); struct 
rxe_create_qp_resp __user *uresp = NULL; + int err, cleanup_err; if (udata) { - if (udata->outlen < sizeof(*uresp)) - return -EINVAL; - uresp = udata->outbuf; - } - - if (init->create_flags) - return -EOPNOTSUPP; - - err = rxe_qp_chk_init(rxe, init); - if (err) - return err; + if (udata->inlen) { + err = -EINVAL; + rxe_dbg_dev(rxe, "malformed udata, err = %d", err); + goto err_out; + } - if (udata) { - if (udata->inlen) - return -EINVAL; + if (udata->outlen < sizeof(*uresp)) { + err = -EINVAL; + rxe_dbg_dev(rxe, "malformed udata, err = %d", err); + goto err_out; + } qp->is_user = true; + uresp = udata->outbuf; } else { qp->is_user = false; } + if (init->create_flags) { + err = -EOPNOTSUPP; + rxe_dbg_dev(rxe, "unsupported create_flags, err = %d", err); + goto err_out; + } + + err = rxe_qp_chk_init(rxe, init); + if (err) { + rxe_dbg_dev(rxe, "bad init attr, err = %d", err); + goto err_out; + } + err = rxe_add_to_pool(&rxe->qp_pool, qp); - if (err) - return err; + if (err) { + rxe_dbg_dev(rxe, "unable to create qp, err = %d", err); + goto err_out; + } err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibqp->pd, udata); - if (err) - goto qp_init; + if (err) { + rxe_dbg_qp(qp, "create qp failed, err = %d", err); + goto err_cleanup; + } rxe_finalize(qp); return 0; -qp_init: - rxe_cleanup(qp); +err_cleanup: + cleanup_err = rxe_cleanup(qp); + if (cleanup_err) + rxe_err_qp(qp, "cleanup failed, err = %d", cleanup_err); +err_out: + rxe_err_dev(rxe, "returned err = %d", err); return err; } static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata) { - int err; struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_qp *qp = to_rqp(ibqp); + int err; - if (mask & ~IB_QP_ATTR_STANDARD_BITS) - return -EOPNOTSUPP; + if (mask & ~IB_QP_ATTR_STANDARD_BITS) { + err = -EOPNOTSUPP; + rxe_dbg_qp(qp, "unsupported mask = 0x%x, err = %d", + mask, err); + goto err_out; + } err = rxe_qp_chk_attr(rxe, qp, attr, mask); - if (err) - return err; + if (err) { + rxe_dbg_qp(qp, "bad mask/attr, err = %d", err); + goto err_out; + } err = rxe_qp_from_attr(qp, attr, mask, udata); - if (err) - return err; + if (err) { + rxe_dbg_qp(qp, "modify qp failed, err = %d", err); + goto err_out; + } if ((mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH)) qp->src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, @@ -455,6 +620,10 @@ static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp->attr.dest_qp_num); return 0; + +err_out: + rxe_err_qp(qp, "returned err = %d", err); + return err; } static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, @@ -471,38 +640,59 @@ static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct rxe_qp *qp = to_rqp(ibqp); - int ret; + int err; + + err = rxe_qp_chk_destroy(qp); + if (err) { + rxe_dbg_qp(qp, "unable to destroy qp, err = %d", err); + goto err_out; + } - ret = rxe_qp_chk_destroy(qp); - if (ret) - return ret; + err = rxe_cleanup(qp); + if (err) + rxe_err_qp(qp, "cleanup failed, err = %d", err); - rxe_cleanup(qp); return 0; + +err_out: + rxe_err_qp(qp, "returned err = %d", err); + return err; } +/* send wr */ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int mask, unsigned int length) { int num_sge = ibwr->num_sge; struct rxe_sq *sq = &qp->sq; - if (unlikely(num_sge > sq->max_sge)) - return -EINVAL; + if (unlikely(num_sge > sq->max_sge)) { + rxe_dbg_qp(qp, "num_sge > max_sge"); + goto 
err_out; + } if (unlikely(mask & WR_ATOMIC_MASK)) { - if (length < 8) - return -EINVAL; + if (length != 8) { + rxe_dbg_qp(qp, "atomic length != 8"); + goto err_out; + } - if (atomic_wr(ibwr)->remote_addr & 0x7) - return -EINVAL; + if (atomic_wr(ibwr)->remote_addr & 0x7) { + rxe_dbg_qp(qp, "misaligned atomic address"); + goto err_out; + } } if (unlikely((ibwr->send_flags & IB_SEND_INLINE) && - (length > sq->max_inline))) - return -EINVAL; + (length > sq->max_inline))) { + rxe_dbg_qp(qp, "inline length too big"); + goto err_out; + } return 0; + +err_out: + return -EINVAL; } static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, @@ -550,12 +740,12 @@ static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, break; case IB_WR_LOCAL_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; - break; + break; case IB_WR_REG_MR: wr->wr.reg.mr = reg_wr(ibwr)->mr; wr->wr.reg.key = reg_wr(ibwr)->key; wr->wr.reg.access = reg_wr(ibwr)->access; - break; + break; default: break; } @@ -624,9 +814,9 @@ static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr, spin_lock_irqsave(&qp->sq.sq_lock, flags); full = queue_full(sq->queue, QUEUE_TYPE_FROM_ULP); - if (unlikely(full)) { spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + rxe_dbg_qp(qp, "queue full"); return -ENOMEM; } @@ -652,6 +842,7 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *wr, while (wr) { mask = wr_opcode_mask(wr->opcode, qp); if (unlikely(!mask)) { + rxe_dbg_qp(qp, "bad wr opcode for qp"); err = -EINVAL; *bad_wr = wr; break; @@ -659,6 +850,7 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *wr, if (unlikely((wr->send_flags & IB_SEND_INLINE) && !(mask & WR_INLINE_MASK))) { + rxe_dbg_qp(qp, "opcode doesn't support inline data"); err = -EINVAL; *bad_wr = wr; break; @@ -669,17 +861,26 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *wr, length = 0; for (i = 0; i < wr->num_sge; i++) length += wr->sg_list[i].length; + if (length > 1<<31) { + err = -EINVAL; + rxe_dbg_qp(qp, "message length too long"); + *bad_wr = wr; + break; + } err = post_one_send(qp, wr, mask, length); - if (err) { *bad_wr = wr; break; } + wr = next; } - rxe_sched_task(&qp->req.task); + /* if we didn't post anything there's nothing to do */ + if (!err) + rxe_sched_task(&qp->req.task); + if (unlikely(qp->req.state == QP_STATE_ERROR)) rxe_sched_task(&qp->comp.task); @@ -690,23 +891,90 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { struct rxe_qp *qp = to_rqp(ibqp); + int err; if (unlikely(!qp->valid)) { *bad_wr = wr; - return -EINVAL; + err = -EINVAL; + rxe_dbg_qp(qp, "qp destroyed"); + goto err_out; } if (unlikely(qp->req.state < QP_STATE_READY)) { *bad_wr = wr; - return -EINVAL; + err = -EINVAL; + rxe_dbg_qp(qp, "qp not ready to send"); + goto err_out; } if (qp->is_user) { /* Utilize process context to do protocol processing */ rxe_run_task(&qp->req.task); - return 0; - } else - return rxe_post_send_kernel(qp, wr, bad_wr); + } else { + err = rxe_post_send_kernel(qp, wr, bad_wr); + if (err) + goto err_out; + } + + return 0; + +err_out: + rxe_err_qp(qp, "returned err = %d", err); + return err; +} + +/* recv wr */ +static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) +{ + int i; + unsigned long length; + struct rxe_recv_wqe *recv_wqe; + int num_sge = ibwr->num_sge; + int full; + int err; + + full = queue_full(rq->queue, QUEUE_TYPE_FROM_ULP); + if (unlikely(full)) { + err 
= -ENOMEM; + rxe_dbg("queue full"); + goto err_out; + } + + if (unlikely(num_sge > rq->max_sge)) { + err = -EINVAL; + rxe_dbg("bad num_sge > max_sge"); + goto err_out; + } + + length = 0; + for (i = 0; i < num_sge; i++) + length += ibwr->sg_list[i].length; + + /* IBA max message size is 2^31 */ + if (length >= (1UL<<31)) { + err = -EINVAL; + rxe_dbg("message length too long"); + goto err_out; + } + + recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_FROM_ULP); + + recv_wqe->wr_id = ibwr->wr_id; + recv_wqe->dma.length = length; + recv_wqe->dma.resid = length; + recv_wqe->dma.num_sge = num_sge; + recv_wqe->dma.cur_sge = 0; + recv_wqe->dma.sge_offset = 0; + memcpy(recv_wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + queue_advance_producer(rq->queue, QUEUE_TYPE_FROM_ULP); + + return 0; + +err_out: + rxe_dbg("returned err = %d", err); + return err; } static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, @@ -719,12 +987,16 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid)) { *bad_wr = wr; - return -EINVAL; + err = -EINVAL; + rxe_dbg_qp(qp, "qp destroyed or not ready to post recv"); + goto err_out; } if (unlikely(qp->srq)) { *bad_wr = wr; - return -EINVAL; + err = -EINVAL; + rxe_dbg_qp(qp, "use post_srq_recv instead"); + goto err_out; } spin_lock_irqsave(&rq->producer_lock, flags); @@ -743,73 +1015,101 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, if (qp->resp.state == QP_STATE_ERROR) rxe_sched_task(&qp->resp.task); +err_out: + if (err) + rxe_err_qp(qp, "returned err = %d", err); + return err; } +/* cq */ static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata) { - int err; struct ib_device *dev = ibcq->device; struct rxe_dev *rxe = to_rdev(dev); struct rxe_cq *cq = to_rcq(ibcq); struct rxe_create_cq_resp __user *uresp = NULL; + int err, cleanup_err; if (udata) { - if (udata->outlen < sizeof(*uresp)) - return -EINVAL; + if (udata->outlen < sizeof(*uresp)) { + err = -EINVAL; + rxe_dbg_dev(rxe, "malformed udata, err = %d", err); + goto err_out; + } uresp = udata->outbuf; } - if (attr->flags) - return -EOPNOTSUPP; + if (attr->flags) { + err = -EOPNOTSUPP; + rxe_dbg_dev(rxe, "bad attr->flags, err = %d", err); + goto err_out; + } err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); - if (err) - return err; + if (err) { + rxe_dbg_dev(rxe, "bad init attributes, err = %d", err); + goto err_out; + } + + err = rxe_add_to_pool(&rxe->cq_pool, cq); + if (err) { + rxe_dbg_dev(rxe, "unable to create cq, err = %d", err); + goto err_out; + } err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata, uresp); - if (err) - return err; - - return rxe_add_to_pool(&rxe->cq_pool, cq); -} - -static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) -{ - struct rxe_cq *cq = to_rcq(ibcq); - - /* See IBA C11-17: The CI shall return an error if this Verb is - * invoked while a Work Queue is still associated with the CQ. 
- */ - if (atomic_read(&cq->num_wq)) - return -EINVAL; - - rxe_cq_disable(cq); + if (err) { + rxe_dbg_cq(cq, "create cq failed, err = %d", err); + goto err_cleanup; + } - rxe_cleanup(cq); return 0; + +err_cleanup: + cleanup_err = rxe_cleanup(cq); + if (cleanup_err) + rxe_err_cq(cq, "cleanup failed, err = %d", cleanup_err); +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; } static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { - int err; struct rxe_cq *cq = to_rcq(ibcq); struct rxe_dev *rxe = to_rdev(ibcq->device); struct rxe_resize_cq_resp __user *uresp = NULL; + int err; if (udata) { - if (udata->outlen < sizeof(*uresp)) - return -EINVAL; + if (udata->outlen < sizeof(*uresp)) { + err = -EINVAL; + rxe_dbg_cq(cq, "malformed udata"); + goto err_out; + } uresp = udata->outbuf; } err = rxe_cq_chk_attr(rxe, cq, cqe, 0); - if (err) - return err; + if (err) { + rxe_dbg_cq(cq, "bad attr, err = %d", err); + goto err_out; + } - return rxe_cq_resize_queue(cq, cqe, uresp, udata); + err = rxe_cq_resize_queue(cq, cqe, uresp, udata); + if (err) { + rxe_dbg_cq(cq, "resize cq failed, err = %d", err); + goto err_out; + } + + return 0; + +err_out: + rxe_err_cq(cq, "returned err = %d", err); + return err; } static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) @@ -823,7 +1123,7 @@ static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) for (i = 0; i < num_entries; i++) { cqe = queue_head(cq->queue, QUEUE_TYPE_TO_ULP); if (!cqe) - break; + break; /* queue empty */ memcpy(wc++, &cqe->ibwc, sizeof(*wc)); queue_advance_consumer(cq->queue, QUEUE_TYPE_TO_ULP); @@ -864,6 +1164,34 @@ static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) return ret; } +static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct rxe_cq *cq = to_rcq(ibcq); + int err; + + /* See IBA C11-17: The CI shall return an error if this Verb is + * invoked while a Work Queue is still associated with the CQ. 
+ */ + if (atomic_read(&cq->num_wq)) { + err = -EINVAL; + rxe_dbg_cq(cq, "still in use"); + goto err_out; + } + + rxe_cq_disable(cq); + + err = rxe_cleanup(cq); + if (err) + rxe_err_cq(cq, "cleanup failed, err = %d", err); + + return 0; + +err_out: + rxe_err_cq(cq, "returned err = %d", err); + return err; +} + +/* mr */ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) { struct rxe_dev *rxe = to_rdev(ibpd->device); @@ -874,12 +1202,15 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { err = -ENOMEM; + rxe_dbg_dev(rxe, "no memory for mr"); goto err_out; } err = rxe_add_to_pool(&rxe->mr_pool, mr); - if (err) + if (err) { + rxe_dbg_dev(rxe, "unable to create mr"); goto err_free; + } rxe_get(pd); mr->ibmr.pd = ibpd; @@ -892,46 +1223,53 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) err_free: kfree(mr); err_out: + rxe_err_pd(pd, "returned err = %d", err); return ERR_PTR(err); } -static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, - u64 start, - u64 length, - u64 iova, - int access, struct ib_udata *udata) +static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, + u64 length, u64 iova, int access, + struct ib_udata *udata) { - int err; struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; + int err, cleanup_err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { err = -ENOMEM; + rxe_dbg_pd(pd, "no memory for mr"); goto err_out; } err = rxe_add_to_pool(&rxe->mr_pool, mr); - if (err) + if (err) { + rxe_dbg_pd(pd, "unable to create mr"); goto err_free; + } rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; err = rxe_mr_init_user(rxe, start, length, iova, access, mr); - if (err) + if (err) { + rxe_dbg_mr(mr, "reg_user_mr failed, err = %d", err); goto err_cleanup; + } rxe_finalize(mr); return &mr->ibmr; err_cleanup: - rxe_cleanup(mr); + cleanup_err = rxe_cleanup(mr); + if (cleanup_err) + rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err); err_free: kfree(mr); err_out: + rxe_err_pd(pd, "returned err = %d", err); return ERR_PTR(err); } @@ -941,40 +1279,76 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; - int err; + int err, cleanup_err; - if (mr_type != IB_MR_TYPE_MEM_REG) - return ERR_PTR(-EINVAL); + if (mr_type != IB_MR_TYPE_MEM_REG) { + err = -EINVAL; + rxe_dbg_pd(pd, "mr type %d not supported, err = %d", + mr_type, err); + goto err_out; + } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { err = -ENOMEM; + rxe_dbg_mr(mr, "no memory for mr"); goto err_out; } err = rxe_add_to_pool(&rxe->mr_pool, mr); - if (err) + if (err) { + rxe_dbg_mr(mr, "unable to create mr, err = %d", err); goto err_free; + } rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; err = rxe_mr_init_fast(max_num_sg, mr); - if (err) + if (err) { + rxe_dbg_mr(mr, "alloc_mr failed, err = %d", err); goto err_cleanup; + } rxe_finalize(mr); return &mr->ibmr; err_cleanup: - rxe_cleanup(mr); + cleanup_err = rxe_cleanup(mr); + if (cleanup_err) + rxe_err_mr(mr, "cleanup failed, err = %d", err); err_free: kfree(mr); err_out: + rxe_err_pd(pd, "returned err = %d", err); return ERR_PTR(err); } +static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct rxe_mr *mr = to_rmr(ibmr); + int err, cleanup_err; + + /* See IBA 10.6.7.2.6 */ + if (atomic_read(&mr->num_mw) > 0) { + err = -EINVAL; + 
rxe_dbg_mr(mr, "mr has mw's bound");
+		goto err_out;
+	}
+
+	cleanup_err = rxe_cleanup(mr);
+	if (cleanup_err)
+		rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err);
+
+	kfree_rcu(mr);
+	return 0;
+
+err_out:
+	rxe_err_mr(mr, "returned err = %d", err);
+	return err;
+}
+
 static ssize_t parent_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
-- 
cgit v1.2.3

From 3946fc2a42b18cf0b675121158a2625825ce27b5 Mon Sep 17 00:00:00 2001
From: Bob Pearson
Date: Sat, 4 Mar 2023 11:45:27 -0600
Subject: RDMA/rxe: Convert tasklet args to queue pairs

Originally it was thought that the tasklet machinery in rxe_task.c
would be used in other applications but that has not happened for
years. This patch replaces the 'void *arg' with 'struct rxe_qp *qp' in
the parameters to the tasklet calls. This change will have no effect on
performance but may make the code a little clearer.

Link: https://lore.kernel.org/r/20230304174533.11296-2-rpearsonhpe@gmail.com
Signed-off-by: Bob Pearson
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/sw/rxe/rxe_comp.c |  3 +--
 drivers/infiniband/sw/rxe/rxe_loc.h  |  6 +++---
 drivers/infiniband/sw/rxe/rxe_req.c  |  3 +--
 drivers/infiniband/sw/rxe/rxe_resp.c |  3 +--
 drivers/infiniband/sw/rxe/rxe_task.c | 11 ++++++-----
 drivers/infiniband/sw/rxe/rxe_task.h |  9 +++++----
 6 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'drivers/infiniband/sw')

diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index 876057e3ee3c..cbfa16b3a490 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -575,9 +575,8 @@ static void free_pkt(struct rxe_pkt_info *pkt)
 	ib_device_put(dev);
 }
 
-int rxe_completer(void *arg)
+int rxe_completer(struct rxe_qp *qp)
 {
-	struct rxe_qp *qp = (struct rxe_qp *)arg;
 	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 	struct rxe_send_wqe *wqe = NULL;
 	struct sk_buff *skb = NULL;
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 839de34cf4c9..804b15e929dd 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -170,9 +170,9 @@ void rxe_srq_cleanup(struct rxe_pool_elem *elem);
 
 void rxe_dealloc(struct ib_device *ib_dev);
 
-int rxe_completer(void *arg);
-int rxe_requester(void *arg);
-int rxe_responder(void *arg);
+int rxe_completer(struct rxe_qp *qp);
+int rxe_requester(struct rxe_qp *qp);
+int rxe_responder(struct rxe_qp *qp);
 
 /* rxe_icrc.c */
 int rxe_icrc_init(struct rxe_dev *rxe);
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index 899c8779f800..f2dc2d191e16 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -635,9 +635,8 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
 	return 0;
 }
 
-int rxe_requester(void *arg)
+int rxe_requester(struct rxe_qp *qp)
 {
-	struct rxe_qp *qp = (struct rxe_qp *)arg;
 	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 	struct rxe_pkt_info pkt;
 	struct sk_buff *skb;
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 4217eec03a94..7cb1b962d665 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -1443,9 +1443,8 @@ static void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify)
 		queue_advance_consumer(q, q->type);
 }
 
-int rxe_responder(void *arg)
+int rxe_responder(struct rxe_qp *qp)
 {
-	struct rxe_qp *qp = (struct rxe_qp *)arg;
 	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 	enum
resp_states state; struct rxe_pkt_info *pkt = NULL; diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 60b90e33a884..959cc6229a34 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -11,7 +11,7 @@ int __rxe_do_task(struct rxe_task *task) { int ret; - while ((ret = task->func(task->arg)) == 0) + while ((ret = task->func(task->qp)) == 0) ; task->ret = ret; @@ -29,7 +29,7 @@ static void do_task(struct tasklet_struct *t) int cont; int ret; struct rxe_task *task = from_tasklet(task, t, tasklet); - struct rxe_qp *qp = (struct rxe_qp *)task->arg; + struct rxe_qp *qp = (struct rxe_qp *)task->qp; unsigned int iterations = RXE_MAX_ITERATIONS; spin_lock_bh(&task->lock); @@ -54,7 +54,7 @@ static void do_task(struct tasklet_struct *t) do { cont = 0; - ret = task->func(task->arg); + ret = task->func(task->qp); spin_lock_bh(&task->lock); switch (task->state) { @@ -91,9 +91,10 @@ static void do_task(struct tasklet_struct *t) task->ret = ret; } -int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)) +int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, + int (*func)(struct rxe_qp *)) { - task->arg = arg; + task->qp = qp; task->func = func; task->destroyed = false; diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index 7b88129702ac..41efd5fd49b0 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -22,18 +22,19 @@ struct rxe_task { struct tasklet_struct tasklet; int state; spinlock_t lock; - void *arg; - int (*func)(void *arg); + struct rxe_qp *qp; + int (*func)(struct rxe_qp *qp); int ret; bool destroyed; }; /* * init rxe_task structure - * arg => parameter to pass to fcn + * qp => parameter to pass to func * func => function to call until it returns != 0 */ -int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)); +int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, + int (*func)(struct rxe_qp *)); /* cleanup task */ void rxe_cleanup_task(struct rxe_task *task); -- cgit v1.2.3 From 49dc9c1f0c7e396654a31a480328fffd902fa494 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Sat, 4 Mar 2023 11:45:29 -0600 Subject: RDMA/rxe: Cleanup reset state handling in rxe_resp.c Cleanup the handling of qp in the error state, reset state and during rxe_qp_do_cleanup. The error state does about the same thing as the others but has code spread all over. This patch combines them in a cleaner way. 
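Condensed from the rxe_resp.c hunk below, the combined handling reduces to a single test at the top of rxe_responder():

	if (!qp->valid || qp->resp.state == QP_STATE_ERROR ||
	    qp->resp.state == QP_STATE_RESET) {
		bool notify = qp->valid &&
				(qp->resp.state == QP_STATE_ERROR);
		rxe_drain_req_pkts(qp);
		rxe_drain_recv_queue(qp, notify);
		goto exit;
	}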
Link: https://lore.kernel.org/r/20230304174533.11296-4-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.h | 1 - drivers/infiniband/sw/rxe/rxe_resp.c | 107 +++++++++++++++++++---------------- 2 files changed, 57 insertions(+), 51 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index bd8a8ea4ea8f..d33dd6cf83d3 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -133,7 +133,6 @@ enum resp_states { RESPST_ERR_LENGTH, RESPST_ERR_CQ_OVERFLOW, RESPST_ERROR, - RESPST_RESET, RESPST_DONE, RESPST_EXIT, }; diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 7cb1b962d665..8f9bbb14fa7a 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -42,7 +42,6 @@ static char *resp_state_name[] = { [RESPST_ERR_LENGTH] = "ERR_LENGTH", [RESPST_ERR_CQ_OVERFLOW] = "ERR_CQ_OVERFLOW", [RESPST_ERROR] = "ERROR", - [RESPST_RESET] = "RESET", [RESPST_DONE] = "DONE", [RESPST_EXIT] = "EXIT", }; @@ -69,17 +68,6 @@ static inline enum resp_states get_req(struct rxe_qp *qp, { struct sk_buff *skb; - if (qp->resp.state == QP_STATE_ERROR) { - while ((skb = skb_dequeue(&qp->req_pkts))) { - rxe_put(qp); - kfree_skb(skb); - ib_device_put(qp->ibqp.device); - } - - /* go drain recv wr queue */ - return RESPST_CHK_RESOURCE; - } - skb = skb_peek(&qp->req_pkts); if (!skb) return RESPST_EXIT; @@ -334,24 +322,6 @@ static enum resp_states check_resource(struct rxe_qp *qp, { struct rxe_srq *srq = qp->srq; - if (qp->resp.state == QP_STATE_ERROR) { - if (qp->resp.wqe) { - qp->resp.status = IB_WC_WR_FLUSH_ERR; - return RESPST_COMPLETE; - } else if (!srq) { - qp->resp.wqe = queue_head(qp->rq.queue, - QUEUE_TYPE_FROM_CLIENT); - if (qp->resp.wqe) { - qp->resp.status = IB_WC_WR_FLUSH_ERR; - return RESPST_COMPLETE; - } else { - return RESPST_EXIT; - } - } else { - return RESPST_EXIT; - } - } - if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) { /* it is the requesters job to not send * too many read/atomic ops, we just @@ -1425,22 +1395,66 @@ static enum resp_states do_class_d1e_error(struct rxe_qp *qp) } } -static void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify) +/* drain incoming request packet queue */ +static void rxe_drain_req_pkts(struct rxe_qp *qp) { struct sk_buff *skb; - struct rxe_queue *q = qp->rq.queue; while ((skb = skb_dequeue(&qp->req_pkts))) { rxe_put(qp); kfree_skb(skb); ib_device_put(qp->ibqp.device); } +} + +/* complete receive wqe with flush error */ +static int complete_flush(struct rxe_qp *qp, struct rxe_recv_wqe *wqe) +{ + struct rxe_cqe cqe = {}; + struct ib_wc *wc = &cqe.ibwc; + struct ib_uverbs_wc *uwc = &cqe.uibwc; + + if (qp->rcq->is_user) { + uwc->status = IB_WC_WR_FLUSH_ERR; + uwc->qp_num = qp_num(qp); + uwc->wr_id = wqe->wr_id; + } else { + wc->status = IB_WC_WR_FLUSH_ERR; + wc->qp = &qp->ibqp; + wc->wr_id = wqe->wr_id; + } + + if (rxe_cq_post(qp->rcq, &cqe, 0)) + return -ENOMEM; + + return 0; +} + +/* drain and optionally complete the recive queue + * if unable to complete a wqe stop completing and + * just flush the remaining wqes + */ +static void rxe_drain_recv_queue(struct rxe_qp *qp, bool notify) +{ + struct rxe_queue *q = qp->rq.queue; + struct rxe_recv_wqe *wqe; + int err; - if (notify) + if (qp->srq) return; - while (!qp->srq && q && queue_head(q, q->type)) + while ((wqe = queue_head(q, q->type))) { + if 
(notify) { + err = complete_flush(qp, wqe); + if (err) { + rxe_dbg_qp(qp, "complete failed for recv wqe"); + notify = 0; + } + } queue_advance_consumer(q, q->type); + } + + qp->resp.wqe = NULL; } int rxe_responder(struct rxe_qp *qp) @@ -1453,20 +1467,18 @@ int rxe_responder(struct rxe_qp *qp) if (!rxe_get(qp)) return -EAGAIN; - qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; - - if (!qp->valid) + if (!qp->valid || qp->resp.state == QP_STATE_ERROR || + qp->resp.state == QP_STATE_RESET) { + bool notify = qp->valid && + (qp->resp.state == QP_STATE_ERROR); + rxe_drain_req_pkts(qp); + rxe_drain_recv_queue(qp, notify); goto exit; + } - switch (qp->resp.state) { - case QP_STATE_RESET: - state = RESPST_RESET; - break; + qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; - default: - state = RESPST_GET_REQ; - break; - } + state = RESPST_GET_REQ; while (1) { rxe_dbg_qp(qp, "state = %s\n", resp_state_name[state]); @@ -1625,11 +1637,6 @@ int rxe_responder(struct rxe_qp *qp) goto exit; - case RESPST_RESET: - rxe_drain_req_pkts(qp, false); - qp->resp.wqe = NULL; - goto exit; - case RESPST_ERROR: qp->resp.goto_error = 0; rxe_dbg_qp(qp, "moved to error state\n"); -- cgit v1.2.3 From fbdeb828a21ff8de045a27ddcfc6ee66201b9a94 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Sat, 4 Mar 2023 11:45:30 -0600 Subject: RDMA/rxe: Cleanup error state handling in rxe_comp.c Cleanup the handling of qp in the error state, reset state and during rxe_qp_do_cleanup. Make the same as rxe_resp.c Link: https://lore.kernel.org/r/20230304174533.11296-5-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 55 ++++++++++++++++++++++++++++++------ drivers/infiniband/sw/rxe/rxe_cq.c | 1 + drivers/infiniband/sw/rxe/rxe_resp.c | 28 +++++++++--------- 3 files changed, 61 insertions(+), 23 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index cbfa16b3a490..f7ab0dfe1034 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -542,25 +542,60 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp, return COMPST_GET_WQE; } -static void rxe_drain_resp_pkts(struct rxe_qp *qp, bool notify) +/* drain incoming response packet queue */ +static void drain_resp_pkts(struct rxe_qp *qp) { struct sk_buff *skb; - struct rxe_send_wqe *wqe; - struct rxe_queue *q = qp->sq.queue; while ((skb = skb_dequeue(&qp->resp_pkts))) { rxe_put(qp); kfree_skb(skb); ib_device_put(qp->ibqp.device); } +} + +/* complete send wqe with flush error */ +static int flush_send_wqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + struct rxe_cqe cqe = {}; + struct ib_wc *wc = &cqe.ibwc; + struct ib_uverbs_wc *uwc = &cqe.uibwc; + int err; + + if (qp->is_user) { + uwc->wr_id = wqe->wr.wr_id; + uwc->status = IB_WC_WR_FLUSH_ERR; + uwc->qp_num = qp->ibqp.qp_num; + } else { + wc->wr_id = wqe->wr.wr_id; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->qp = &qp->ibqp; + } + + err = rxe_cq_post(qp->scq, &cqe, 0); + if (err) + rxe_dbg_cq(qp->scq, "post cq failed, err = %d", err); + + return err; +} + +/* drain and optionally complete the send queue + * if unable to complete a wqe, i.e. 
cq is full, stop + * completing and flush the remaining wqes + */ +static void flush_send_queue(struct rxe_qp *qp, bool notify) +{ + struct rxe_send_wqe *wqe; + struct rxe_queue *q = qp->sq.queue; + int err; while ((wqe = queue_head(q, q->type))) { if (notify) { - wqe->status = IB_WC_WR_FLUSH_ERR; - do_complete(qp, wqe); - } else { - queue_advance_consumer(q, q->type); + err = flush_send_wqe(qp, wqe); + if (err) + notify = 0; } + queue_advance_consumer(q, q->type); } } @@ -589,8 +624,10 @@ int rxe_completer(struct rxe_qp *qp) if (!qp->valid || qp->comp.state == QP_STATE_ERROR || qp->comp.state == QP_STATE_RESET) { - rxe_drain_resp_pkts(qp, qp->valid && - qp->comp.state == QP_STATE_ERROR); + bool notify = qp->valid && + (qp->comp.state == QP_STATE_ERROR); + drain_resp_pkts(qp); + flush_send_queue(qp, notify); goto exit; } diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 22fbc198e5d1..66a13c935d50 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -114,6 +114,7 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) full = queue_full(cq->queue, QUEUE_TYPE_TO_CLIENT); if (unlikely(full)) { + rxe_err_cq(cq, "queue full"); spin_unlock_irqrestore(&cq->cq_lock, flags); if (cq->ibcq.event_handler) { ev.device = cq->ibcq.device; diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 8f9bbb14fa7a..2f71183449f9 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1396,7 +1396,7 @@ static enum resp_states do_class_d1e_error(struct rxe_qp *qp) } /* drain incoming request packet queue */ -static void rxe_drain_req_pkts(struct rxe_qp *qp) +static void drain_req_pkts(struct rxe_qp *qp) { struct sk_buff *skb; @@ -1408,33 +1408,35 @@ static void rxe_drain_req_pkts(struct rxe_qp *qp) } /* complete receive wqe with flush error */ -static int complete_flush(struct rxe_qp *qp, struct rxe_recv_wqe *wqe) +static int flush_recv_wqe(struct rxe_qp *qp, struct rxe_recv_wqe *wqe) { struct rxe_cqe cqe = {}; struct ib_wc *wc = &cqe.ibwc; struct ib_uverbs_wc *uwc = &cqe.uibwc; + int err; if (qp->rcq->is_user) { + uwc->wr_id = wqe->wr_id; uwc->status = IB_WC_WR_FLUSH_ERR; uwc->qp_num = qp_num(qp); - uwc->wr_id = wqe->wr_id; } else { + wc->wr_id = wqe->wr_id; wc->status = IB_WC_WR_FLUSH_ERR; wc->qp = &qp->ibqp; - wc->wr_id = wqe->wr_id; } - if (rxe_cq_post(qp->rcq, &cqe, 0)) - return -ENOMEM; + err = rxe_cq_post(qp->rcq, &cqe, 0); + if (err) + rxe_dbg_cq(qp->rcq, "post cq failed err = %d", err); - return 0; + return err; } /* drain and optionally complete the recive queue * if unable to complete a wqe stop completing and * just flush the remaining wqes */ -static void rxe_drain_recv_queue(struct rxe_qp *qp, bool notify) +static void flush_recv_queue(struct rxe_qp *qp, bool notify) { struct rxe_queue *q = qp->rq.queue; struct rxe_recv_wqe *wqe; @@ -1445,11 +1447,9 @@ static void rxe_drain_recv_queue(struct rxe_qp *qp, bool notify) while ((wqe = queue_head(q, q->type))) { if (notify) { - err = complete_flush(qp, wqe); - if (err) { - rxe_dbg_qp(qp, "complete failed for recv wqe"); + err = flush_recv_wqe(qp, wqe); + if (err) notify = 0; - } } queue_advance_consumer(q, q->type); } @@ -1471,8 +1471,8 @@ int rxe_responder(struct rxe_qp *qp) qp->resp.state == QP_STATE_RESET) { bool notify = qp->valid && (qp->resp.state == QP_STATE_ERROR); - rxe_drain_req_pkts(qp); - rxe_drain_recv_queue(qp, notify); + drain_req_pkts(qp); + flush_recv_queue(qp, 
notify); goto exit; } -- cgit v1.2.3 From a246aa2e8a6d8919462b7ffe550cbe51d2894152 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Sat, 4 Mar 2023 11:45:31 -0600 Subject: RDMA/rxe: Remove qp reference counting in tasks Currently each of the three tasklets requester, completer and responder in the rxe driver take and release a reference to the qp argument at the beginning and end of the subroutines. The caller passing in the qp argument should be responsible for holding a reference to qp so these are not required. Further doing so breaks the qp cleanup code in rxe_qp_do_cleanup which calls these routines after all the references have been dropped so they cannot drain the packet and work request queues as intended. In fact if these routines are deferred by calling tasklet_schedule there is no guarantee that the calling code does have a qp reference. That is a bug in rxe_task.c which will be fixed later in this series. Link: https://lore.kernel.org/r/20230304174533.11296-6-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 5 ----- drivers/infiniband/sw/rxe/rxe_req.c | 5 ----- drivers/infiniband/sw/rxe/rxe_resp.c | 4 ---- 3 files changed, 14 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index f7ab0dfe1034..7aa8e90bdfe4 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -619,9 +619,6 @@ int rxe_completer(struct rxe_qp *qp) enum comp_state state; int ret; - if (!rxe_get(qp)) - return -EAGAIN; - if (!qp->valid || qp->comp.state == QP_STATE_ERROR || qp->comp.state == QP_STATE_RESET) { bool notify = qp->valid && @@ -824,7 +821,5 @@ exit: out: if (pkt) free_pkt(pkt); - rxe_put(qp); - return ret; } diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index f2dc2d191e16..abc65c54bfd6 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -653,9 +653,6 @@ int rxe_requester(struct rxe_qp *qp) struct rxe_ah *ah; struct rxe_av *av; - if (!rxe_get(qp)) - return -EAGAIN; - if (unlikely(!qp->valid)) goto exit; @@ -844,7 +841,5 @@ err: exit: ret = -EAGAIN; out: - rxe_put(qp); - return ret; } diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 2f71183449f9..01e3cbea8445 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1464,9 +1464,6 @@ int rxe_responder(struct rxe_qp *qp) struct rxe_pkt_info *pkt = NULL; int ret; - if (!rxe_get(qp)) - return -EAGAIN; - if (!qp->valid || qp->resp.state == QP_STATE_ERROR || qp->resp.state == QP_STATE_RESET) { bool notify = qp->valid && @@ -1658,6 +1655,5 @@ done: exit: ret = -EAGAIN; out: - rxe_put(qp); return ret; } -- cgit v1.2.3 From 960ebe97e5238565d15063c8f4d1b2108efe2e65 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Sat, 4 Mar 2023 11:45:32 -0600 Subject: RDMA/rxe: Remove __rxe_do_task() The subroutine __rxe_do_task is not thread safe and it has no way to guarantee that the tasks, which are designed with the assumption that they are non-reentrant, are not reentered. All of its uses are non-performance critical. This patch replaces calls to __rxe_do_task with calls to rxe_sched_task. It also removes irrelevant or unneeded if tests. Instead of calling the task machinery a single call to the tasklet function (rxe_requester, etc.) 
is sufficient to drain the queues if task execution has been disabled or stopped. Together these changes allow the removal of __rxe_do_task. Link: https://lore.kernel.org/r/20230304174533.11296-7-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 56 +++++++++++------------------------- drivers/infiniband/sw/rxe/rxe_task.c | 13 --------- drivers/infiniband/sw/rxe/rxe_task.h | 6 ---- 3 files changed, 17 insertions(+), 58 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index c954dd9394ba..49891f8ed4e6 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -473,29 +473,23 @@ static void rxe_qp_reset(struct rxe_qp *qp) { /* stop tasks from running */ rxe_disable_task(&qp->resp.task); - - /* stop request/comp */ - if (qp->sq.queue) { - if (qp_type(qp) == IB_QPT_RC) - rxe_disable_task(&qp->comp.task); - rxe_disable_task(&qp->req.task); - } + rxe_disable_task(&qp->comp.task); + rxe_disable_task(&qp->req.task); /* move qp to the reset state */ qp->req.state = QP_STATE_RESET; qp->comp.state = QP_STATE_RESET; qp->resp.state = QP_STATE_RESET; - /* let state machines reset themselves drain work and packet queues - * etc. - */ - __rxe_do_task(&qp->resp.task); + /* drain work and packet queues */ + rxe_requester(qp); + rxe_completer(qp); + rxe_responder(qp); - if (qp->sq.queue) { - __rxe_do_task(&qp->comp.task); - __rxe_do_task(&qp->req.task); + if (qp->rq.queue) + rxe_queue_reset(qp->rq.queue); + if (qp->sq.queue) rxe_queue_reset(qp->sq.queue); - } /* cleanup attributes */ atomic_set(&qp->ssn, 0); @@ -518,13 +512,8 @@ static void rxe_qp_reset(struct rxe_qp *qp) /* reenable tasks */ rxe_enable_task(&qp->resp.task); - - if (qp->sq.queue) { - if (qp_type(qp) == IB_QPT_RC) - rxe_enable_task(&qp->comp.task); - - rxe_enable_task(&qp->req.task); - } + rxe_enable_task(&qp->comp.task); + rxe_enable_task(&qp->req.task); } /* drain the send queue */ @@ -533,10 +522,7 @@ static void rxe_qp_drain(struct rxe_qp *qp) { if (qp->sq.queue) { if (qp->req.state != QP_STATE_DRAINED) { qp->req.state = QP_STATE_DRAIN; - if (qp_type(qp) == IB_QPT_RC) - rxe_sched_task(&qp->comp.task); - else - __rxe_do_task(&qp->comp.task); + rxe_sched_task(&qp->comp.task); rxe_sched_task(&qp->req.task); } } @@ -552,11 +538,7 @@ void rxe_qp_error(struct rxe_qp *qp) /* drain work and packet queues */ rxe_sched_task(&qp->resp.task); - - if (qp_type(qp) == IB_QPT_RC) - rxe_sched_task(&qp->comp.task); - else - __rxe_do_task(&qp->comp.task); + rxe_sched_task(&qp->comp.task); rxe_sched_task(&qp->req.task); } @@ -773,24 +755,20 @@ static void rxe_qp_do_cleanup(struct work_struct *work) qp->valid = 0; qp->qp_timeout_jiffies = 0; - rxe_cleanup_task(&qp->resp.task); if (qp_type(qp) == IB_QPT_RC) { del_timer_sync(&qp->retrans_timer); del_timer_sync(&qp->rnr_nak_timer); } + rxe_cleanup_task(&qp->resp.task); rxe_cleanup_task(&qp->req.task); rxe_cleanup_task(&qp->comp.task); /* flush out any receive wr's or pending requests */ - if (qp->req.task.func) - __rxe_do_task(&qp->req.task); - - if (qp->sq.queue) { - __rxe_do_task(&qp->comp.task); - __rxe_do_task(&qp->req.task); - } + rxe_requester(qp); + rxe_completer(qp); + rxe_responder(qp); if (qp->sq.queue) rxe_queue_cleanup(qp->sq.queue); diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 959cc6229a34..a67f48545443 100644 ---
a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -6,19 +6,6 @@ #include "rxe.h" -int __rxe_do_task(struct rxe_task *task) - -{ - int ret; - - while ((ret = task->func(task->qp)) == 0) - ; - - task->ret = ret; - - return ret; -} - /* * this locking is due to a potential race where * a second caller finds the task already running diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index 41efd5fd49b0..99585e40cef9 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -39,12 +39,6 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, /* cleanup task */ void rxe_cleanup_task(struct rxe_task *task); -/* - * raw call to func in loop without any checking - * can call when tasklets are disabled - */ -int __rxe_do_task(struct rxe_task *task); - void rxe_run_task(struct rxe_task *task); void rxe_sched_task(struct rxe_task *task); -- cgit v1.2.3 From f455a1bc972cefc4bf2dbf1bf37a36bb51a5f7e7 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Sat, 4 Mar 2023 11:45:33 -0600 Subject: RDMA/rxe: Make tasks schedule each other Replace rxe_run_task() by rxe_sched_task() when tasks call each other. These are not performance critical and mainly involve error paths but they run the risk of causing deadlocks. Link: https://lore.kernel.org/r/20230304174533.11296-8-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 8 ++++---- drivers/infiniband/sw/rxe/rxe_req.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 7aa8e90bdfe4..2c70cdcd55dc 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -322,7 +322,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, qp->comp.psn = pkt->psn; if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_run_task(&qp->req.task); + rxe_sched_task(&qp->req.task); } } return COMPST_ERROR_RETRY; @@ -473,7 +473,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) */ if (qp->req.wait_fence) { qp->req.wait_fence = 0; - rxe_run_task(&qp->req.task); + rxe_sched_task(&qp->req.task); } } @@ -487,7 +487,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, if (qp->req.need_rd_atomic) { qp->comp.timeout_retry = 0; qp->req.need_rd_atomic = 0; - rxe_run_task(&qp->req.task); + rxe_sched_task(&qp->req.task); } } @@ -767,7 +767,7 @@ int rxe_completer(struct rxe_qp *qp) RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; qp->comp.started_retry = 1; - rxe_run_task(&qp->req.task); + rxe_sched_task(&qp->req.task); } goto done; diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index abc65c54bfd6..745731140a54 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -753,7 +753,7 @@ int rxe_requester(struct rxe_qp *qp) qp->req.wqe_index); wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; - rxe_run_task(&qp->comp.task); + rxe_sched_task(&qp->comp.task); goto done; } payload = mtu; @@ -837,7 +837,7 @@ err: qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); wqe->state = wqe_state_error; qp->req.state = QP_STATE_ERROR; - rxe_run_task(&qp->comp.task); + rxe_sched_task(&qp->comp.task); exit: ret = -EAGAIN; out: -- cgit v1.2.3 From d94671632572813e90bcf475bb4c7d51fbf20173 Mon Sep 17 
00:00:00 2001 From: Bob Pearson Date: Sat, 4 Mar 2023 11:45:34 -0600 Subject: RDMA/rxe: Rewrite rxe_task.c This patch is a major rewrite of the tasklet routines in rxe_task.c. The main motivation for this is the realization that the code violates the safety of the qp pointer because it lacks correct reference counting. When a tasklet is scheduled from a verbs API the calling thread has a valid reference to the qp and schedules the tasklet to run at a later time carrying a pointer to the qp. Once the calling code returns however the qp can be destroyed at any time. In order to correct this a reference to the qp must be taken when the task is scheduled and held until it finishes running. This is complicated by the tasklet library not always running a task that is scheduled depending on whether someone else has scheduled it. This patch moves the logic for deciding whether to run or schedule a task outside of do_task() and guarantees that there is only one copy of the task scheduled or running at a time. Secondly the separate flags controlling teardown and draining of the task are included in the task state machine and all references to the state are protected by spinlocks to avoid consistency and memory barrier issues. Link: https://lore.kernel.org/r/20230304174533.11296-9-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_task.c | 266 ++++++++++++++++++++++++++++------- drivers/infiniband/sw/rxe/rxe_task.h | 8 +- 2 files changed, 218 insertions(+), 56 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index a67f48545443..fea9a517c8d9 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -6,56 +6,128 @@ #include "rxe.h" -/* - * this locking is due to a potential race where - * a second caller finds the task already running - * but looks just after the last call to func +/* Check if task is idle i.e. not running, not scheduled in + * tasklet queue and not draining. If so move to busy to + * reserve a slot in do_task() by setting to busy and taking + * a qp reference to cover the gap from now until the task finishes. + * state will move out of busy if task returns a non zero value + * in do_task(). If state is already busy it is raised to armed + * to indicate to do_task that additional pass should be made + * over the task. + * Context: caller should hold task->lock. + * Returns: true if state transitioned from idle to busy else false. + */ +static bool __reserve_if_idle(struct rxe_task *task) +{ + WARN_ON(rxe_read(task->qp) <= 0); + + if (task->tasklet.state & TASKLET_STATE_SCHED) + return false; + + if (task->state == TASK_STATE_IDLE) { + rxe_get(task->qp); + task->state = TASK_STATE_BUSY; + task->num_sched++; + return true; + } + + if (task->state == TASK_STATE_BUSY) + task->state = TASK_STATE_ARMED; + + return false; +} + +/* check if task is idle or drained and not currently + * scheduled in the tasklet queue. This routine is + * called by rxe_cleanup_task or rxe_disable_task to + * see if the queue is empty. + * Context: caller should hold task->lock. + * Returns true if done else false.
+ */ +static bool __is_done(struct rxe_task *task) +{ + if (task->tasklet.state & TASKLET_STATE_SCHED) + return false; + + if (task->state == TASK_STATE_IDLE || + task->state == TASK_STATE_DRAINED) { + return true; + } + + return false; +} + +/* a locked version of __is_done */ +static bool is_done(struct rxe_task *task) +{ + unsigned long flags; + int done; + + spin_lock_irqsave(&task->lock, flags); + done = __is_done(task); + spin_unlock_irqrestore(&task->lock, flags); + + return done; +} + +/* do_task is a wrapper for the three tasks (requester, + * completer, responder) and calls them in a loop until + * they return a non-zero value. It is called either + * directly by rxe_run_task or indirectly if rxe_sched_task + * schedules the task. They must call __reserve_if_idle to + * move the task to busy before calling or scheduling. + * The task can also be moved to drained or invalid + * by calls to rxe-cleanup_task or rxe_disable_task. + * In that case tasks which get here are not executed but + * just flushed. The tasks are designed to look to see if + * there is work to do and do part of it before returning + * here with a return value of zero until all the work + * has been consumed then it retuens a non-zero value. + * The number of times the task can be run is limited by + * max iterations so one task cannot hold the cpu forever. */ static void do_task(struct tasklet_struct *t) { int cont; int ret; struct rxe_task *task = from_tasklet(task, t, tasklet); - struct rxe_qp *qp = (struct rxe_qp *)task->qp; - unsigned int iterations = RXE_MAX_ITERATIONS; + unsigned int iterations; + unsigned long flags; + int resched = 0; - spin_lock_bh(&task->lock); - switch (task->state) { - case TASK_STATE_START: - task->state = TASK_STATE_BUSY; - spin_unlock_bh(&task->lock); - break; - - case TASK_STATE_BUSY: - task->state = TASK_STATE_ARMED; - fallthrough; - case TASK_STATE_ARMED: - spin_unlock_bh(&task->lock); - return; + WARN_ON(rxe_read(task->qp) <= 0); - default: - spin_unlock_bh(&task->lock); - rxe_dbg_qp(qp, "failed with bad state %d\n", task->state); + spin_lock_irqsave(&task->lock, flags); + if (task->state >= TASK_STATE_DRAINED) { + rxe_put(task->qp); + task->num_done++; + spin_unlock_irqrestore(&task->lock, flags); return; } + spin_unlock_irqrestore(&task->lock, flags); do { + iterations = RXE_MAX_ITERATIONS; cont = 0; - ret = task->func(task->qp); - spin_lock_bh(&task->lock); + do { + ret = task->func(task->qp); + } while (ret == 0 && iterations-- > 0); + + spin_lock_irqsave(&task->lock, flags); switch (task->state) { case TASK_STATE_BUSY: if (ret) { - task->state = TASK_STATE_START; - } else if (iterations--) { - cont = 1; + task->state = TASK_STATE_IDLE; } else { - /* reschedule the tasklet and exit + /* This can happen if the client + * can add work faster than the + * tasklet can finish it. 
+ * Reschedule the tasklet and exit * the loop to give up the cpu */ - tasklet_schedule(&task->tasklet); - task->state = TASK_STATE_START; + task->state = TASK_STATE_IDLE; + resched = 1; } break; @@ -68,72 +140,158 @@ static void do_task(struct tasklet_struct *t) cont = 1; break; + case TASK_STATE_DRAINING: + if (ret) + task->state = TASK_STATE_DRAINED; + else + cont = 1; + break; + default: - rxe_dbg_qp(qp, "failed with bad state %d\n", - task->state); + WARN_ON(1); + rxe_info_qp(task->qp, "unexpected task state = %d", task->state); + } + + if (!cont) { + task->num_done++; + if (WARN_ON(task->num_done != task->num_sched)) + rxe_err_qp(task->qp, "%ld tasks scheduled, %ld tasks done", + task->num_sched, task->num_done); } - spin_unlock_bh(&task->lock); + spin_unlock_irqrestore(&task->lock, flags); } while (cont); task->ret = ret; + + if (resched) + rxe_sched_task(task); + + rxe_put(task->qp); } int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, int (*func)(struct rxe_qp *)) { - task->qp = qp; - task->func = func; - task->destroyed = false; + WARN_ON(rxe_read(qp) <= 0); + + task->qp = qp; + task->func = func; tasklet_setup(&task->tasklet, do_task); - task->state = TASK_STATE_START; + task->state = TASK_STATE_IDLE; spin_lock_init(&task->lock); return 0; } +/* rxe_cleanup_task is only called from rxe_do_qp_cleanup in + * process context. The qp is already completed with no + * remaining references. Once the queue is drained the + * task is moved to invalid and returns. The qp cleanup + * code then calls the task functions directly without + * using the task struct to drain any late arriving packets + * or work requests. + */ void rxe_cleanup_task(struct rxe_task *task) { - bool idle; + unsigned long flags; - /* - * Mark the task, then wait for it to finish. It might be - * running in a non-tasklet (direct call) context. - */ - task->destroyed = true; + spin_lock_irqsave(&task->lock, flags); + if (!__is_done(task) && task->state < TASK_STATE_DRAINED) { + task->state = TASK_STATE_DRAINING; + } else { + task->state = TASK_STATE_INVALID; + spin_unlock_irqrestore(&task->lock, flags); + return; + } + spin_unlock_irqrestore(&task->lock, flags); - do { - spin_lock_bh(&task->lock); - idle = (task->state == TASK_STATE_START); - spin_unlock_bh(&task->lock); - } while (!idle); + /* now the task cannot be scheduled or run just wait + * for the previously scheduled tasks to finish. + */ + while (!is_done(task)) + cond_resched(); tasklet_kill(&task->tasklet); + + spin_lock_irqsave(&task->lock, flags); + task->state = TASK_STATE_INVALID; + spin_unlock_irqrestore(&task->lock, flags); } +/* run the task inline if it is currently idle + * cannot call do_task holding the lock + */ void rxe_run_task(struct rxe_task *task) { - if (task->destroyed) - return; + unsigned long flags; + int run; + + WARN_ON(rxe_read(task->qp) <= 0); - do_task(&task->tasklet); + spin_lock_irqsave(&task->lock, flags); + run = __reserve_if_idle(task); + spin_unlock_irqrestore(&task->lock, flags); + + if (run) + do_task(&task->tasklet); } +/* schedule the task to run later as a tasklet. + * the tasklet)schedule call can be called holding + * the lock. 
+ */ void rxe_sched_task(struct rxe_task *task) { - if (task->destroyed) - return; + unsigned long flags; + + WARN_ON(rxe_read(task->qp) <= 0); - tasklet_schedule(&task->tasklet); + spin_lock_irqsave(&task->lock, flags); + if (__reserve_if_idle(task)) + tasklet_schedule(&task->tasklet); + spin_unlock_irqrestore(&task->lock, flags); } +/* rxe_disable/enable_task are only called from + * rxe_modify_qp in process context. Task is moved + * to the drained state by do_task. + */ void rxe_disable_task(struct rxe_task *task) { + unsigned long flags; + + WARN_ON(rxe_read(task->qp) <= 0); + + spin_lock_irqsave(&task->lock, flags); + if (!__is_done(task) && task->state < TASK_STATE_DRAINED) { + task->state = TASK_STATE_DRAINING; + } else { + task->state = TASK_STATE_DRAINED; + spin_unlock_irqrestore(&task->lock, flags); + return; + } + spin_unlock_irqrestore(&task->lock, flags); + + while (!is_done(task)) + cond_resched(); + tasklet_disable(&task->tasklet); } void rxe_enable_task(struct rxe_task *task) { + unsigned long flags; + + WARN_ON(rxe_read(task->qp) <= 0); + + spin_lock_irqsave(&task->lock, flags); + if (task->state == TASK_STATE_INVALID) { + spin_unlock_irqrestore(&task->lock, flags); + return; + } + task->state = TASK_STATE_IDLE; tasklet_enable(&task->tasklet); + spin_unlock_irqrestore(&task->lock, flags); } diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index 99585e40cef9..facb7c8e3729 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -8,9 +8,12 @@ #define RXE_TASK_H enum { - TASK_STATE_START = 0, + TASK_STATE_IDLE = 0, TASK_STATE_BUSY = 1, TASK_STATE_ARMED = 2, + TASK_STATE_DRAINING = 3, + TASK_STATE_DRAINED = 4, + TASK_STATE_INVALID = 5, }; /* @@ -25,7 +28,8 @@ struct rxe_task { struct rxe_qp *qp; int (*func)(struct rxe_qp *qp); int ret; - bool destroyed; + long num_sched; + long num_done; }; /* -- cgit v1.2.3 From 78b26a335310a097d6b22581b706050db42f196c Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Mon, 27 Mar 2023 16:56:44 -0500 Subject: RDMA/rxe: Remove tasklet call from rxe_cq.c Remove the tasklet call in rxe_cq.c and also the is_dying in the cq struct. There is no reason for the rxe driver to defer the call to the cq completion handler by scheduling a tasklet. rxe_cq_post() is not called in a hard irq context. The rxe driver is currently incorrect because the tasklet call is made without holding a reference on the cq, so the underlying memory can be freed before the deferred routine is called. Executing the comp_handler inline fixes this problem.
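Condensed from the rxe_cq.c hunk below, rxe_cq_post() now invokes the handler directly while the caller still holds its cq reference:

	if ((cq->notify == IB_CQ_NEXT_COMP) ||
	    (cq->notify == IB_CQ_SOLICITED && solicited)) {
		cq->notify = 0;
		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
	}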
Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Bob Pearson Link: https://lore.kernel.org/r/20230327215643.10410-1-rpearsonhpe@gmail.com Acked-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_cq.c | 32 +++----------------------------- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 -- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 -- 3 files changed, 3 insertions(+), 33 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 66a13c935d50..20ff0c0c4605 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -39,21 +39,6 @@ err1: return -EINVAL; } -static void rxe_send_complete(struct tasklet_struct *t) -{ - struct rxe_cq *cq = from_tasklet(cq, t, comp_task); - unsigned long flags; - - spin_lock_irqsave(&cq->cq_lock, flags); - if (cq->is_dying) { - spin_unlock_irqrestore(&cq->cq_lock, flags); - return; - } - spin_unlock_irqrestore(&cq->cq_lock, flags); - - cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); -} - int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, int comp_vector, struct ib_udata *udata, struct rxe_create_cq_resp __user *uresp) @@ -79,10 +64,6 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, cq->is_user = uresp; - cq->is_dying = false; - - tasklet_setup(&cq->comp_task, rxe_send_complete); - spin_lock_init(&cq->cq_lock); cq->ibcq.cqe = cqe; return 0; @@ -103,6 +84,7 @@ int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, return err; } +/* caller holds reference to cq */ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) { struct ib_event ev; @@ -136,21 +118,13 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) if ((cq->notify == IB_CQ_NEXT_COMP) || (cq->notify == IB_CQ_SOLICITED && solicited)) { cq->notify = 0; - tasklet_schedule(&cq->comp_task); + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } return 0; } -void rxe_cq_disable(struct rxe_cq *cq) -{ - unsigned long flags; - - spin_lock_irqsave(&cq->cq_lock, flags); - cq->is_dying = true; - spin_unlock_irqrestore(&cq->cq_lock, flags); -} - void rxe_cq_cleanup(struct rxe_pool_elem *elem) { struct rxe_cq *cq = container_of(elem, typeof(*cq), elem); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 84b53c070fc5..090d5bfb1e18 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1178,8 +1178,6 @@ static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) goto err_out; } - rxe_cq_disable(cq); - err = rxe_cleanup(cq); if (err) rxe_err_cq(cq, "cleanup failed, err = %d", err); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index c269ae2a3224..d812093a3916 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -63,9 +63,7 @@ struct rxe_cq { struct rxe_queue *queue; spinlock_t cq_lock; u8 notify; - bool is_dying; bool is_user; - struct tasklet_struct comp_task; atomic_t num_wq; }; -- cgit v1.2.3 From b6ba68555d75fd99f7daa9c5a5e476f8635cb155 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 29 Mar 2023 21:14:01 +0300 Subject: RDMA/rxe: Clean kzalloc failure paths There is no need to print any debug messages after failure to allocate memory, because kernel will print OOM dumps anyway. Together with removal of these messages, remove useless goto jumps. 
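The resulting pattern, repeated in each allocation path below, is simply:

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);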
Fixes: 5bf944f24129 ("RDMA/rxe: Add error messages") Reported-by: Dan Carpenter Link: https://lore.kernel.org/all/ea43486f-43dd-4054-b1d5-3a0d202be621@kili.mountain Link: https://lore.kernel.org/r/d3cedf723b84e73e8062a67b7489d33802bafba2.1680113597.git.leon@kernel.org Reviewed-by: Bob Pearson Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_queue.c | 5 ++--- drivers/infiniband/sw/rxe/rxe_verbs.c | 27 +++++++-------------------- 2 files changed, 9 insertions(+), 23 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c index d6dbf5a0058d..9611ee191a46 100644 --- a/drivers/infiniband/sw/rxe/rxe_queue.c +++ b/drivers/infiniband/sw/rxe/rxe_queue.c @@ -61,11 +61,11 @@ struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, int *num_elem, /* num_elem == 0 is allowed, but uninteresting */ if (*num_elem < 0) - goto err1; + return NULL; q = kzalloc(sizeof(*q), GFP_KERNEL); if (!q) - goto err1; + return NULL; q->rxe = rxe; q->type = type; @@ -100,7 +100,6 @@ struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, int *num_elem, err2: kfree(q); -err1: return NULL; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 090d5bfb1e18..4e2db7c2e4ed 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1198,11 +1198,8 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) { - err = -ENOMEM; - rxe_dbg_dev(rxe, "no memory for mr"); - goto err_out; - } + if (!mr) + return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) { @@ -1220,7 +1217,6 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) err_free: kfree(mr); -err_out: rxe_err_pd(pd, "returned err = %d", err); return ERR_PTR(err); } @@ -1235,11 +1231,8 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, int err, cleanup_err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) { - err = -ENOMEM; - rxe_dbg_pd(pd, "no memory for mr"); - goto err_out; - } + if (!mr) + return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) { @@ -1266,7 +1259,6 @@ err_cleanup: rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err); err_free: kfree(mr); -err_out: rxe_err_pd(pd, "returned err = %d", err); return ERR_PTR(err); } @@ -1287,17 +1279,12 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, } mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) { - err = -ENOMEM; - rxe_dbg_mr(mr, "no memory for mr"); - goto err_out; - } + if (!mr) + return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); - if (err) { - rxe_dbg_mr(mr, "unable to create mr, err = %d", err); + if (err) goto err_free; - } rxe_get(pd); mr->ibmr.pd = ibpd; -- cgit v1.2.3 From 266e9b3475ba82212062771fdbc40be0e3c06ec8 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 2 Apr 2023 14:10:13 +0900 Subject: RDMA/siw: Remove namespace check from siw_netdev_event() syzbot is reporting that siw_netdev_event(NETDEV_UNREGISTER) cannot destroy siw_device created after unshare(CLONE_NEWNET) due to net namespace check. It seems that this check was there by error and should be removed.
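For reference, the check deleted from siw_netdev_event() by the diff below:

	if (dev_net(netdev) != &init_net)
		return NOTIFY_OK;

Without it, unregister events are handled for netdevs in any net namespace.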
Reported-by: syzbot Link: https://syzkaller.appspot.com/bug?extid=5e70d01ee8985ae62a3b Suggested-by: Jason Gunthorpe Suggested-by: Leon Romanovsky Fixes: bdcf26bf9b3a ("rdma/siw: network and RDMA core interface") Signed-off-by: Tetsuo Handa Link: https://lore.kernel.org/r/a44e9ac5-44e2-d575-9e30-02483cc7ffd1@I-love.SAKURA.ne.jp Reviewed-by: Bernard Metzler Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/siw_main.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index dacc174604bf..65b5cda5457b 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -437,9 +437,6 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, dev_dbg(&netdev->dev, "siw: event %lu\n", event); - if (dev_net(netdev) != &init_net) - return NOTIFY_OK; - base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; -- cgit v1.2.3 From 67a00d29c360f4d7b2ca7b1ccf24b145f60d14b8 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 29 Mar 2023 14:33:09 -0500 Subject: RDMA/rxe: Fix incorrect TASKLET_STATE_SCHED check in rxe_task.c In a previous patch TASKLET_STATE_SCHED was used as a mask but it is a bit position instead. Add the missing shift. Link: https://lore.kernel.org/r/20230329193308.7489-1-rpearsonhpe@gmail.com Reported-by: Dan Carpenter Link: https://lore.kernel.org/linux-rdma/8a054b78-6d50-4bc6-8d8a-83f85fbdb82f@kili.mountain/ Fixes: d94671632572 ("RDMA/rxe: Rewrite rxe_task.c") Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index fea9a517c8d9..fb9a6bc8e620 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -21,7 +21,7 @@ static bool __reserve_if_idle(struct rxe_task *task) { WARN_ON(rxe_read(task->qp) <= 0); - if (task->tasklet.state & TASKLET_STATE_SCHED) + if (task->tasklet.state & BIT(TASKLET_STATE_SCHED)) return false; if (task->state == TASK_STATE_IDLE) { @@ -46,7 +46,7 @@ */ static bool __is_done(struct rxe_task *task) { - if (task->tasklet.state & TASKLET_STATE_SCHED) + if (task->tasklet.state & BIT(TASKLET_STATE_SCHED)) return false; if (task->state == TASK_STATE_IDLE || -- cgit v1.2.3 From b2b1ddc457458fecd1c6f385baa9fbda5f0c63ad Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 13 Apr 2023 18:11:15 +0800 Subject: RDMA/rxe: Fix the error "trying to register non-static key in rxe_cleanup_task" In the function rxe_create_qp(), rxe_qp_from_init() is called to initialize the qp; internally, things like rxe_init_task() are not set up until rxe_qp_init_req(). If an error occurred before this point then the unwind will call rxe_cleanup() and eventually rxe_qp_do_cleanup()/rxe_cleanup_task(), which will oops when trying to access the uninitialized spinlock. If rxe_init_task is not executed, rxe_cleanup_task will not be called.
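The fix guards each rxe_cleanup_task() call on task->func, which rxe_init_task() assigns at the same time it initializes task->lock, e.g.:

	if (qp->resp.task.func)
		rxe_cleanup_task(&qp->resp.task);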
Reported-by: syzbot+cfcc1a3c85be15a40cba@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?id=fd85757b74b3eb59f904138486f755f71e090df8 Fixes: 8700e3e7c485 ("Soft RoCE driver") Fixes: 2d4b21e0a291 ("IB/rxe: Prevent from completer to operate on non valid QP") Signed-off-by: Zhu Yanjun Link: https://lore.kernel.org/r/20230413101115.1366068-1-yanjun.zhu@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_qp.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 49891f8ed4e6..d5de5ba6940f 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -761,9 +761,14 @@ static void rxe_qp_do_cleanup(struct work_struct *work) del_timer_sync(&qp->rnr_nak_timer); } - rxe_cleanup_task(&qp->resp.task); - rxe_cleanup_task(&qp->req.task); - rxe_cleanup_task(&qp->comp.task); + if (qp->resp.task.func) + rxe_cleanup_task(&qp->resp.task); + + if (qp->req.task.func) + rxe_cleanup_task(&qp->req.task); + + if (qp->comp.task.func) + rxe_cleanup_task(&qp->comp.task); /* flush out any receive wr's or pending requests */ rxe_requester(qp); -- cgit v1.2.3 From 8d7c7c0eeb74281c846ef9231ce20536c79a99b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 14 Apr 2023 10:58:29 -0300 Subject: RDMA: Add ib_virt_dma_to_page() Make it clearer what is going on by adding a function to go back from the "virtual" dma_addr to a kva and another to a struct page. This is used in the ib_uses_virt_dma() style drivers (siw, rxe, hfi, qib). Call them instead of naked casts and virt_to_page() when working with dma_addr values encoded by the various ib_map functions. This also fixes the virt_to_page() casting problem Linus Walleij has been chasing.
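Callers change from an open-coded cast to the helper, for example in rxe_mr_do_atomic_op():

	/* before */
	page = virt_to_page(iova & PAGE_MASK);
	/* after */
	page = ib_virt_dma_to_page(iova);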
Cc: Linus Walleij Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v2-05ea785520ed+10-ib_virt_page_jgg@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_mr.c | 16 ++++++++-------- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- drivers/infiniband/sw/siw/siw_qp_rx.c | 6 +++--- drivers/infiniband/sw/siw/siw_qp_tx.c | 19 ++++++------------- drivers/infiniband/sw/siw/siw_verbs.c | 4 ++-- include/rdma/ib_verbs.h | 25 +++++++++++++++++++++++++ 6 files changed, 45 insertions(+), 27 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 1e17f8086d59..0e538fafcc20 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -210,10 +210,10 @@ err1: return err; } -static int rxe_set_page(struct ib_mr *ibmr, u64 iova) +static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr) { struct rxe_mr *mr = to_rmr(ibmr); - struct page *page = virt_to_page(iova & mr->page_mask); + struct page *page = ib_virt_dma_to_page(dma_addr); bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); int err; @@ -279,16 +279,16 @@ static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr, return 0; } -static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 iova, void *addr, +static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 dma_addr, void *addr, unsigned int length, enum rxe_mr_copy_dir dir) { - unsigned int page_offset = iova & (PAGE_SIZE - 1); + unsigned int page_offset = dma_addr & (PAGE_SIZE - 1); unsigned int bytes; struct page *page; u8 *va; while (length) { - page = virt_to_page(iova & mr->page_mask); + page = ib_virt_dma_to_page(dma_addr); bytes = min_t(unsigned int, length, PAGE_SIZE - page_offset); va = kmap_local_page(page); @@ -300,7 +300,7 @@ static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 iova, void *addr, kunmap_local(va); page_offset = 0; - iova += bytes; + dma_addr += bytes; addr += bytes; length -= bytes; } @@ -488,7 +488,7 @@ int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, if (mr->ibmr.type == IB_MR_TYPE_DMA) { page_offset = iova & (PAGE_SIZE - 1); - page = virt_to_page(iova & PAGE_MASK); + page = ib_virt_dma_to_page(iova); } else { unsigned long index; int err; @@ -545,7 +545,7 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) if (mr->ibmr.type == IB_MR_TYPE_DMA) { page_offset = iova & (PAGE_SIZE - 1); - page = virt_to_page(iova & PAGE_MASK); + page = ib_virt_dma_to_page(iova); } else { unsigned long index; int err; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 4e2db7c2e4ed..12819153bda7 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -760,7 +760,7 @@ static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, int i; for (i = 0; i < ibwr->num_sge; i++, sge++) { - memcpy(p, (void *)(uintptr_t)sge->addr, sge->length); + memcpy(p, ib_virt_dma_to_page(sge->addr), sge->length); p += sge->length; } } diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index fd721cc19682..58bbf738e4e5 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -139,7 +139,7 @@ static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx, break; bytes = min(bytes, len); - if (siw_rx_kva(srx, (void *)(uintptr_t)buf_addr, bytes) == + if (siw_rx_kva(srx, ib_virt_dma_to_ptr(buf_addr), bytes) == bytes) { copied += bytes; offset += bytes; @@ -487,7 
+487,7 @@ int siw_proc_send(struct siw_qp *qp) mem_p = *mem; if (mem_p->mem_obj == NULL) rv = siw_rx_kva(srx, - (void *)(uintptr_t)(sge->laddr + frx->sge_off), + ib_virt_dma_to_ptr(sge->laddr + frx->sge_off), sge_bytes); else if (!mem_p->is_pbl) rv = siw_rx_umem(srx, mem_p->umem, @@ -852,7 +852,7 @@ int siw_proc_rresp(struct siw_qp *qp) if (mem_p->mem_obj == NULL) rv = siw_rx_kva(srx, - (void *)(uintptr_t)(sge->laddr + wqe->processed), + ib_virt_dma_to_ptr(sge->laddr + wqe->processed), bytes); else if (!mem_p->is_pbl) rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed, diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index 6bb9e9e81ff4..4b292e0504f1 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -29,7 +29,7 @@ static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx); if (paddr) - return virt_to_page((void *)(uintptr_t)paddr); + return ib_virt_dma_to_page(paddr); return NULL; } @@ -56,8 +56,7 @@ static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr) if (!mem->mem_obj) { /* Kernel client using kva */ - memcpy(paddr, - (const void *)(uintptr_t)sge->laddr, bytes); + memcpy(paddr, ib_virt_dma_to_ptr(sge->laddr), bytes); } else if (c_tx->in_syscall) { if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr), bytes)) @@ -477,7 +476,7 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) * or memory region with assigned kernel buffer */ iov[seg].iov_base = - (void *)(uintptr_t)(sge->laddr + sge_off); + ib_virt_dma_to_ptr(sge->laddr + sge_off); iov[seg].iov_len = sge_len; if (do_crc) @@ -537,19 +536,13 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) * Cast to an uintptr_t to preserve all 64 bits * in sge->laddr. */ - uintptr_t va = (uintptr_t)(sge->laddr + sge_off); + u64 va = sge->laddr + sge_off; - /* - * virt_to_page() takes a (void *) pointer - * so cast to a (void *) meaning it will be 64 - * bits on a 64 bit platform and 32 bits on a - * 32 bit platform. 
- */ - page_array[seg] = virt_to_page((void *)(va & PAGE_MASK)); + page_array[seg] = ib_virt_dma_to_page(va); if (do_crc) crypto_shash_update( c_tx->mpa_crc_hd, - (void *)va, + ib_virt_dma_to_ptr(va), plen); } diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 906fde1a2a0d..398ec13db624 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -660,7 +660,7 @@ static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, bytes = -EINVAL; break; } - memcpy(kbuf, (void *)(uintptr_t)core_sge->addr, + memcpy(kbuf, ib_virt_dma_to_ptr(core_sge->addr), core_sge->length); kbuf += core_sge->length; @@ -1523,7 +1523,7 @@ int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, } siw_dbg_mem(mem, "sge[%d], size %u, addr 0x%p, total %lu\n", - i, pble->size, (void *)(uintptr_t)pble->addr, + i, pble->size, ib_virt_dma_to_ptr(pble->addr), pbl_size); } rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 949cf4ffc536..1e7774ac808f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4035,6 +4035,31 @@ static inline bool ib_dma_pci_p2p_dma_supported(struct ib_device *dev) return dma_pci_p2pdma_supported(dev->dma_device); } +/** + * ib_virt_dma_to_ptr - Convert a dma_addr to a kernel pointer + * @dma_addr: The DMA address + * + * Used by ib_uses_virt_dma() devices to get back to the kernel pointer after + * going through the dma_addr marshalling. + */ +static inline void *ib_virt_dma_to_ptr(u64 dma_addr) +{ + /* virt_dma mode maps the kvs's directly into the dma addr */ + return (void *)(uintptr_t)dma_addr; +} + +/** + * ib_virt_dma_to_page - Convert a dma_addr to a struct page + * @dma_addr: The DMA address + * + * Used by ib_uses_virt_dma() device to get back to the struct page after going + * through the dma_addr marshalling. + */ +static inline struct page *ib_virt_dma_to_page(u64 dma_addr) +{ + return virt_to_page(ib_virt_dma_to_ptr(dma_addr)); +} + /** * ib_dma_mapping_error - check a DMA addr for error * @dev: The device for which the dma_addr was created -- cgit v1.2.3 From a588429a66e92960b195b8dceb3fc5477a2b2f80 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 4 Apr 2023 23:26:07 -0500 Subject: RDMA/rxe: Remove qp->resp.state The rxe driver has four different QP state variables, qp->attr.qp_state, qp->req.state, qp->comp.state, and qp->resp.state. All of these basically carry the same information. This patch replaces uses of qp->resp.state by qp->attr.qp_state. This is the first of three patches which will remove all but the qp->attr.qp_state variable. This will bring the driver closer to the IBA description. 
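Tests of qp->resp.state become tests of the single qp state, for example in rxe_xmit_packet():

	/* before */
	if (!is_request && (qp->resp.state != QP_STATE_READY))
	/* after */
	if (!is_request && (qp_state(qp) <= IB_QPS_RTR))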
Link: https://lore.kernel.org/r/20230405042611.6467-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 2 +- drivers/infiniband/sw/rxe/rxe_qp.c | 5 ----- drivers/infiniband/sw/rxe/rxe_recv.c | 2 +- drivers/infiniband/sw/rxe/rxe_resp.c | 10 +++++----- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 - 6 files changed, 8 insertions(+), 14 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index a2ace42e9536..2be2425083ce 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -414,7 +414,7 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct rxe_dev *rxe = to_rdev(qp->ibqp.device); if ((is_request && (qp->req.state != QP_STATE_READY)) || - (!is_request && (qp->resp.state != QP_STATE_READY))) { + (!is_request && (qp_state(qp) <= IB_QPS_RTR))) { rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index d5de5ba6940f..fcbcca39876b 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -287,7 +287,6 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, qp->resp.opcode = OPCODE_NONE; qp->resp.msn = 0; - qp->resp.state = QP_STATE_RESET; return 0; } @@ -479,7 +478,6 @@ static void rxe_qp_reset(struct rxe_qp *qp) /* move qp to the reset state */ qp->req.state = QP_STATE_RESET; qp->comp.state = QP_STATE_RESET; - qp->resp.state = QP_STATE_RESET; /* drain work and packet queuesc */ rxe_requester(qp); @@ -532,7 +530,6 @@ static void rxe_qp_drain(struct rxe_qp *qp) void rxe_qp_error(struct rxe_qp *qp) { qp->req.state = QP_STATE_ERROR; - qp->resp.state = QP_STATE_ERROR; qp->comp.state = QP_STATE_ERROR; qp->attr.qp_state = IB_QPS_ERR; @@ -663,13 +660,11 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, case IB_QPS_INIT: rxe_dbg_qp(qp, "state -> INIT\n"); qp->req.state = QP_STATE_INIT; - qp->resp.state = QP_STATE_INIT; qp->comp.state = QP_STATE_INIT; break; case IB_QPS_RTR: rxe_dbg_qp(qp, "state -> RTR\n"); - qp->resp.state = QP_STATE_READY; break; case IB_QPS_RTS: diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index 434a693cd4a5..ac42ceccf71f 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -39,7 +39,7 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, } if (pkt->mask & RXE_REQ_MASK) { - if (unlikely(qp->resp.state != QP_STATE_READY)) + if (unlikely(qp_state(qp) <= IB_QPS_RTR)) return -EINVAL; } else if (unlikely(qp->req.state < QP_STATE_READY || qp->req.state > QP_STATE_DRAINED)) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 01e3cbea8445..67eac616235c 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1137,7 +1137,7 @@ static enum resp_states do_complete(struct rxe_qp *qp, return RESPST_ERR_CQ_OVERFLOW; finish: - if (unlikely(qp->resp.state == QP_STATE_ERROR)) + if (unlikely(qp_state(qp) == IB_QPS_ERR)) return RESPST_CHK_RESOURCE; if (unlikely(!pkt)) return RESPST_DONE; @@ -1464,10 +1464,10 @@ int rxe_responder(struct rxe_qp *qp) struct rxe_pkt_info *pkt = NULL; int ret; - if (!qp->valid || qp->resp.state == QP_STATE_ERROR || - qp->resp.state == QP_STATE_RESET) 
{ - bool notify = qp->valid && - (qp->resp.state == QP_STATE_ERROR); + if (!qp->valid || qp_state(qp) == IB_QPS_ERR || + qp_state(qp) == IB_QPS_RESET) { + bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); + drain_req_pkts(qp); flush_recv_queue(qp, notify); goto exit; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 12819153bda7..36cad3665ee4 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1012,7 +1012,7 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, spin_unlock_irqrestore(&rq->producer_lock, flags); - if (qp->resp.state == QP_STATE_ERROR) + if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->resp.task); err_out: diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index d812093a3916..12594cb2a9cf 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -173,7 +173,6 @@ struct resp_res { }; struct rxe_resp_info { - enum rxe_qp_state state; u32 msn; u32 psn; u32 ack_psn; -- cgit v1.2.3 From f55efc2ed206eedfed08e1c8f8552f64fc3a11dd Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 4 Apr 2023 23:26:08 -0500 Subject: RDMA/rxe: Remove qp->comp.state The rxe driver has four different QP state variables, qp->attr.qp_state, qp->req.state, qp->comp.state, and qp->resp.state. All of these basically carry the same information. This patch replaces uses of qp->comp.state by qp->attr.qp_state. This is the second of three patches which will remove all but the qp->attr.qp_state variable. This will bring the driver closer to the IBA description. Link: https://lore.kernel.org/r/20230405042611.6467-2-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 8 ++++---- drivers/infiniband/sw/rxe/rxe_qp.c | 5 ----- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 - 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 2c70cdcd55dc..173ebfe784e6 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -619,10 +619,10 @@ int rxe_completer(struct rxe_qp *qp) enum comp_state state; int ret; - if (!qp->valid || qp->comp.state == QP_STATE_ERROR || - qp->comp.state == QP_STATE_RESET) { - bool notify = qp->valid && - (qp->comp.state == QP_STATE_ERROR); + if (!qp->valid || qp_state(qp) == IB_QPS_ERR || + qp_state(qp) == IB_QPS_RESET) { + bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); + drain_resp_pkts(qp); flush_send_queue(qp, notify); goto exit; diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index fcbcca39876b..36e4a00e5d12 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -232,7 +232,6 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, QUEUE_TYPE_FROM_CLIENT); qp->req.state = QP_STATE_RESET; - qp->comp.state = QP_STATE_RESET; qp->req.opcode = -1; qp->comp.opcode = -1; @@ -477,7 +476,6 @@ static void rxe_qp_reset(struct rxe_qp *qp) /* move qp to the reset state */ qp->req.state = QP_STATE_RESET; - qp->comp.state = QP_STATE_RESET; /* drain work and packet queuesc */ rxe_requester(qp); @@ -530,7 +528,6 @@ static void rxe_qp_drain(struct rxe_qp *qp) void rxe_qp_error(struct rxe_qp *qp) { qp->req.state = QP_STATE_ERROR; - qp->comp.state = QP_STATE_ERROR; qp->attr.qp_state = 
IB_QPS_ERR; /* drain work and packet queues */ @@ -660,7 +657,6 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, case IB_QPS_INIT: rxe_dbg_qp(qp, "state -> INIT\n"); qp->req.state = QP_STATE_INIT; - qp->comp.state = QP_STATE_INIT; break; case IB_QPS_RTR: @@ -670,7 +666,6 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, case IB_QPS_RTS: rxe_dbg_qp(qp, "state -> RTS\n"); qp->req.state = QP_STATE_READY; - qp->comp.state = QP_STATE_READY; break; case IB_QPS_SQD: diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 12594cb2a9cf..1ae8dfd0ce7b 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -127,7 +127,6 @@ struct rxe_req_info { }; struct rxe_comp_info { - enum rxe_qp_state state; u32 psn; int opcode; int timeout; -- cgit v1.2.3 From 98e891b5e4d94ceb0844de3355c9218027426e72 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 4 Apr 2023 23:26:09 -0500 Subject: RDMA/rxe: Remove qp->req.state The rxe driver has four different QP state variables, qp->attr.qp_state, qp->req.state, qp->comp.state, and qp->resp.state. All of these basically carry the same information. This patch replaces uses of qp->req.state by qp->attr.qp_state and enum rxe_qp_state. This is the third of three patches which will remove all but the qp->attr.qp_state variable. This will bring the driver closer to the IBA description. Link: https://lore.kernel.org/r/20230405042611.6467-3-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 9 +++---- drivers/infiniband/sw/rxe/rxe_net.c | 4 +-- drivers/infiniband/sw/rxe/rxe_qp.c | 49 ++++++++++------------------------- drivers/infiniband/sw/rxe/rxe_recv.c | 9 ++++--- drivers/infiniband/sw/rxe/rxe_req.c | 15 +++++------ drivers/infiniband/sw/rxe/rxe_verbs.c | 4 +-- drivers/infiniband/sw/rxe/rxe_verbs.h | 10 ------- 7 files changed, 34 insertions(+), 66 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 173ebfe784e6..979990734e0c 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -491,12 +491,11 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, } } - if (unlikely(qp->req.state == QP_STATE_DRAIN)) { + if (unlikely(qp_state(qp) == IB_QPS_SQD)) { /* state_lock used by requester & completer */ spin_lock_bh(&qp->state_lock); - if ((qp->req.state == QP_STATE_DRAIN) && - (qp->comp.psn == qp->req.psn)) { - qp->req.state = QP_STATE_DRAINED; + if (qp->attr.sq_draining && qp->comp.psn == qp->req.psn) { + qp->attr.sq_draining = 0; spin_unlock_bh(&qp->state_lock); if (qp->ibqp.event_handler) { @@ -723,7 +722,7 @@ int rxe_completer(struct rxe_qp *qp) * (4) the timeout parameter is set */ if ((qp_type(qp) == IB_QPT_RC) && - (qp->req.state == QP_STATE_READY) && + (qp_state(qp) >= IB_QPS_RTS) && (psn_compare(qp->req.psn, qp->comp.psn) > 0) && qp->qp_timeout_jiffies) mod_timer(&qp->retrans_timer, diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 2be2425083ce..9ed81d0bd25c 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -413,8 +413,8 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, int is_request = pkt->mask & RXE_REQ_MASK; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); - if ((is_request && (qp->req.state != QP_STATE_READY)) || - 
(!is_request && (qp_state(qp) <= IB_QPS_RTR))) { + if ((is_request && (qp_state(qp) < IB_QPS_RTS)) || + (!is_request && (qp_state(qp) < IB_QPS_RTR))) { rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 36e4a00e5d12..78c7c13e614b 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -231,7 +231,6 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, qp->req.wqe_index = queue_get_producer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT); - qp->req.state = QP_STATE_RESET; qp->req.opcode = -1; qp->comp.opcode = -1; @@ -394,12 +393,9 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, goto err1; } - if (mask & IB_QP_STATE) { - if (cur_state == IB_QPS_SQD) { - if (qp->req.state == QP_STATE_DRAIN && - new_state != IB_QPS_ERR) - goto err1; - } + if (mask & IB_QP_STATE && cur_state == IB_QPS_SQD) { + if (qp->attr.sq_draining && new_state != IB_QPS_ERR) + goto err1; } if (mask & IB_QP_PORT) { @@ -474,9 +470,6 @@ static void rxe_qp_reset(struct rxe_qp *qp) rxe_disable_task(&qp->comp.task); rxe_disable_task(&qp->req.task); - /* move qp to the reset state */ - qp->req.state = QP_STATE_RESET; - /* drain work and packet queuesc */ rxe_requester(qp); rxe_completer(qp); @@ -512,22 +505,9 @@ static void rxe_qp_reset(struct rxe_qp *qp) rxe_enable_task(&qp->req.task); } -/* drain the send queue */ -static void rxe_qp_drain(struct rxe_qp *qp) -{ - if (qp->sq.queue) { - if (qp->req.state != QP_STATE_DRAINED) { - qp->req.state = QP_STATE_DRAIN; - rxe_sched_task(&qp->comp.task); - rxe_sched_task(&qp->req.task); - } - } -} - /* move the qp to the error state */ void rxe_qp_error(struct rxe_qp *qp) { - qp->req.state = QP_STATE_ERROR; qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ @@ -540,6 +520,8 @@ void rxe_qp_error(struct rxe_qp *qp) int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata) { + enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : qp->attr.qp_state; int err; if (mask & IB_QP_MAX_QP_RD_ATOMIC) { @@ -656,7 +638,6 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, case IB_QPS_INIT: rxe_dbg_qp(qp, "state -> INIT\n"); - qp->req.state = QP_STATE_INIT; break; case IB_QPS_RTR: @@ -665,12 +646,15 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, case IB_QPS_RTS: rxe_dbg_qp(qp, "state -> RTS\n"); - qp->req.state = QP_STATE_READY; break; case IB_QPS_SQD: rxe_dbg_qp(qp, "state -> SQD\n"); - rxe_qp_drain(qp); + if (cur_state != IB_QPS_SQD) { + qp->attr.sq_draining = 1; + rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->req.task); + } break; case IB_QPS_SQE: @@ -708,16 +692,11 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) rxe_av_to_attr(&qp->pri_av, &attr->ah_attr); rxe_av_to_attr(&qp->alt_av, &attr->alt_ah_attr); - if (qp->req.state == QP_STATE_DRAIN) { - attr->sq_draining = 1; - /* applications that get this state - * typically spin on it. yield the - * processor - */ + /* Applications that get this state typically spin on it. 
+ * Yield the processor + */ + if (qp->attr.sq_draining) cond_resched(); - } else { - attr->sq_draining = 0; - } rxe_dbg_qp(qp, "attr->sq_draining = %d\n", attr->sq_draining); diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index ac42ceccf71f..ca17ac6c5878 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -39,11 +39,12 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, } if (pkt->mask & RXE_REQ_MASK) { - if (unlikely(qp_state(qp) <= IB_QPS_RTR)) + if (unlikely(qp_state(qp) < IB_QPS_RTR)) return -EINVAL; - } else if (unlikely(qp->req.state < QP_STATE_READY || - qp->req.state > QP_STATE_DRAINED)) - return -EINVAL; + } else { + if (unlikely(qp_state(qp) < IB_QPS_RTS)) + return -EINVAL; + } return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 745731140a54..8a8242512f2a 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -120,13 +120,13 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT); prod = queue_get_producer(q, QUEUE_TYPE_FROM_CLIENT); - if (unlikely(qp->req.state == QP_STATE_DRAIN)) { + if (unlikely(qp_state(qp) == IB_QPS_SQD)) { /* check to see if we are drained; * state_lock used by requester and completer */ spin_lock_bh(&qp->state_lock); do { - if (qp->req.state != QP_STATE_DRAIN) { + if (!qp->attr.sq_draining) { /* comp just finished */ spin_unlock_bh(&qp->state_lock); break; @@ -139,7 +139,7 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) break; } - qp->req.state = QP_STATE_DRAINED; + qp->attr.sq_draining = 0; spin_unlock_bh(&qp->state_lock); if (qp->ibqp.event_handler) { @@ -159,8 +159,7 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) wqe = queue_addr_from_index(q, index); - if (unlikely((qp->req.state == QP_STATE_DRAIN || - qp->req.state == QP_STATE_DRAINED) && + if (unlikely((qp_state(qp) == IB_QPS_SQD) && (wqe->state != wqe_state_processing))) return NULL; @@ -656,7 +655,7 @@ int rxe_requester(struct rxe_qp *qp) if (unlikely(!qp->valid)) goto exit; - if (unlikely(qp->req.state == QP_STATE_ERROR)) { + if (unlikely(qp_state(qp) == IB_QPS_ERR)) { wqe = req_next_wqe(qp); if (wqe) /* @@ -667,7 +666,7 @@ int rxe_requester(struct rxe_qp *qp) goto exit; } - if (unlikely(qp->req.state == QP_STATE_RESET)) { + if (unlikely(qp_state(qp) == IB_QPS_RESET)) { qp->req.wqe_index = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT); qp->req.opcode = -1; @@ -836,7 +835,7 @@ err: /* update wqe_index for each wqe completion */ qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); wqe->state = wqe_state_error; - qp->req.state = QP_STATE_ERROR; + qp->attr.qp_state = IB_QPS_ERR; rxe_sched_task(&qp->comp.task); exit: ret = -EAGAIN; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 36cad3665ee4..6d97c7093ae6 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -881,7 +881,7 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *wr, if (!err) rxe_sched_task(&qp->req.task); - if (unlikely(qp->req.state == QP_STATE_ERROR)) + if (unlikely(qp_state(qp) == IB_QPS_ERR)) rxe_sched_task(&qp->comp.task); return err; @@ -900,7 +900,7 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, goto err_out; } - if (unlikely(qp->req.state < QP_STATE_READY)) { + if 
(unlikely(qp_state(qp) < IB_QPS_RTS)) { *bad_wr = wr; err = -EINVAL; rxe_dbg_qp(qp, "qp not ready to send"); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 1ae8dfd0ce7b..26a20f088692 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -102,17 +102,7 @@ struct rxe_srq { int error; }; -enum rxe_qp_state { - QP_STATE_RESET, - QP_STATE_INIT, - QP_STATE_READY, - QP_STATE_DRAIN, /* req only */ - QP_STATE_DRAINED, /* req only */ - QP_STATE_ERROR -}; - struct rxe_req_info { - enum rxe_qp_state state; int wqe_index; u32 psn; int opcode; -- cgit v1.2.3 From 7b560b89a08d35c23dfc95dc44aee10651c8b9a0 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 4 Apr 2023 23:26:10 -0500 Subject: RDMA/rxe: Move code to check if drained to subroutine Move two blocks of code in rxe_comp.c and rxe_req.c to subroutines that check if draining is complete in the SQD state and, if so, generate a SQ_DRAINED event. Link: https://lore.kernel.org/r/20230405042611.6467-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 35 ++++++++++++++++++++--------------- drivers/infiniband/sw/rxe/rxe_req.c | 32 ++++++++++++++++++-------------- 2 files changed, 38 insertions(+), 29 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 979990734e0c..1ccae8cff359 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -477,20 +477,8 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) } } -static inline enum comp_state complete_ack(struct rxe_qp *qp, - struct rxe_pkt_info *pkt, - struct rxe_send_wqe *wqe) +static void comp_check_sq_drain_done(struct rxe_qp *qp) { - if (wqe->has_rd_atomic) { - wqe->has_rd_atomic = 0; - atomic_inc(&qp->req.rd_atomic); - if (qp->req.need_rd_atomic) { - qp->comp.timeout_retry = 0; - qp->req.need_rd_atomic = 0; - rxe_sched_task(&qp->req.task); - } - } - if (unlikely(qp_state(qp) == IB_QPS_SQD)) { /* state_lock used by requester & completer */ spin_lock_bh(&qp->state_lock); @@ -507,10 +495,27 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); } - } else { - spin_unlock_bh(&qp->state_lock); + return; } + spin_unlock_bh(&qp->state_lock); } +} + +static inline enum comp_state complete_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + if (wqe->has_rd_atomic) { + wqe->has_rd_atomic = 0; + atomic_inc(&qp->req.rd_atomic); + if (qp->req.need_rd_atomic) { + qp->comp.timeout_retry = 0; + qp->req.need_rd_atomic = 0; + rxe_sched_task(&qp->req.task); + } + } + + comp_check_sq_drain_done(qp); do_complete(qp, wqe); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 8a8242512f2a..f329038efbc8 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -108,17 +108,12 @@ void rnr_nak_timer(struct timer_list *t) rxe_sched_task(&qp->req.task); } -static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) +static void req_check_sq_drain_done(struct rxe_qp *qp) { - struct rxe_send_wqe *wqe; struct rxe_queue *q = qp->sq.queue; unsigned int index = qp->req.wqe_index; - unsigned int cons; - unsigned int prod; - - wqe = queue_head(q, QUEUE_TYPE_FROM_CLIENT); - cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT); - prod = queue_get_producer(q, 
QUEUE_TYPE_FROM_CLIENT); + unsigned int cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT); + struct rxe_send_wqe *wqe = queue_addr_from_index(q, cons); if (unlikely(qp_state(qp) == IB_QPS_SQD)) { /* check to see if we are drained; @@ -126,18 +121,14 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) */ spin_lock_bh(&qp->state_lock); do { - if (!qp->attr.sq_draining) { + if (!qp->attr.sq_draining) /* comp just finished */ - spin_unlock_bh(&qp->state_lock); break; - } if (wqe && ((index != cons) || - (wqe->state != wqe_state_posted))) { + (wqe->state != wqe_state_posted))) /* comp not done yet */ - spin_unlock_bh(&qp->state_lock); break; - } qp->attr.sq_draining = 0; spin_unlock_bh(&qp->state_lock); @@ -151,9 +142,22 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); } + return; } while (0); + spin_unlock_bh(&qp->state_lock); } +} +static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) +{ + struct rxe_send_wqe *wqe; + struct rxe_queue *q = qp->sq.queue; + unsigned int index = qp->req.wqe_index; + unsigned int prod; + + req_check_sq_drain_done(qp); + + prod = queue_get_producer(q, QUEUE_TYPE_FROM_CLIENT); if (index == prod) return NULL; -- cgit v1.2.3 From f605f26ea196a3b49bea249330cbd18dba61a33e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 4 Apr 2023 23:26:11 -0500 Subject: RDMA/rxe: Protect QP state with qp->state_lock Currently the rxe driver makes little effort to make the changes to qp state (which includes qp->attr.qp_state, qp->attr.sq_draining and qp->valid) atomic between different client threads and IO threads. In particular a common template is for an RDMA application to call ib_modify_qp() to move a qp to ERR state and then wait until all the packet and work queues have drained before calling ib_destroy_qp(). None of these state changes are protected by locks to assure that the changes are executed atomically and that memory barriers are included. This has been observed to lead to incorrect behavior around qp cleanup. This patch continues the work of the previous patches in this series and adds locking code around qp state changes and lookups. 
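As a rough illustration of the resulting convention (a sketch only; the helper below is hypothetical and is not added by this patch), every inspection of QP state ends up bracketed by the state lock with bottom halves disabled:

    /* hypothetical helper showing the locking template; the patch
     * open-codes this pattern at each call site instead
     */
    static bool rxe_qp_ready_to_send(struct rxe_qp *qp)
    {
    	bool ready;

    	spin_lock_bh(&qp->state_lock);
    	ready = qp->valid && qp_state(qp) >= IB_QPS_RTS;
    	spin_unlock_bh(&qp->state_lock);

    	return ready;
    }
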
Link: https://lore.kernel.org/r/20230405042611.6467-5-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 48 ++++--- drivers/infiniband/sw/rxe/rxe_net.c | 3 + drivers/infiniband/sw/rxe/rxe_qp.c | 153 ++++++++++++---------- drivers/infiniband/sw/rxe/rxe_recv.c | 10 +- drivers/infiniband/sw/rxe/rxe_req.c | 71 ++++++---- drivers/infiniband/sw/rxe/rxe_resp.c | 12 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 238 +++++++++++++++++++--------------- 7 files changed, 317 insertions(+), 218 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 1ccae8cff359..db18ace74d2b 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -118,10 +118,12 @@ void retransmit_timer(struct timer_list *t) rxe_dbg_qp(qp, "retransmit timer fired\n"); + spin_lock_bh(&qp->state_lock); if (qp->valid) { qp->comp.timeout = 1; rxe_sched_task(&qp->comp.task); } + spin_unlock_bh(&qp->state_lock); } void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) @@ -479,9 +481,8 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) static void comp_check_sq_drain_done(struct rxe_qp *qp) { + spin_lock_bh(&qp->state_lock); if (unlikely(qp_state(qp) == IB_QPS_SQD)) { - /* state_lock used by requester & completer */ - spin_lock_bh(&qp->state_lock); if (qp->attr.sq_draining && qp->comp.psn == qp->req.psn) { qp->attr.sq_draining = 0; spin_unlock_bh(&qp->state_lock); @@ -497,8 +498,8 @@ static void comp_check_sq_drain_done(struct rxe_qp *qp) } return; } - spin_unlock_bh(&qp->state_lock); } + spin_unlock_bh(&qp->state_lock); } static inline enum comp_state complete_ack(struct rxe_qp *qp, @@ -614,6 +615,26 @@ static void free_pkt(struct rxe_pkt_info *pkt) ib_device_put(dev); } +/* reset the retry timer if + * - QP is type RC + * - there is a packet sent by the requester that + * might be acked (we still might get spurious + * timeouts but try to keep them as few as possible) + * - the timeout parameter is set + * - the QP is alive + */ +static void reset_retry_timer(struct rxe_qp *qp) +{ + if (qp_type(qp) == IB_QPT_RC && qp->qp_timeout_jiffies) { + spin_lock_bh(&qp->state_lock); + if (qp_state(qp) >= IB_QPS_RTS && + psn_compare(qp->req.psn, qp->comp.psn) > 0) + mod_timer(&qp->retrans_timer, + jiffies + qp->qp_timeout_jiffies); + spin_unlock_bh(&qp->state_lock); + } +} + int rxe_completer(struct rxe_qp *qp) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); @@ -623,14 +644,17 @@ int rxe_completer(struct rxe_qp *qp) enum comp_state state; int ret; + spin_lock_bh(&qp->state_lock); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || - qp_state(qp) == IB_QPS_RESET) { + qp_state(qp) == IB_QPS_RESET) { bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); drain_resp_pkts(qp); flush_send_queue(qp, notify); + spin_unlock_bh(&qp->state_lock); goto exit; } + spin_unlock_bh(&qp->state_lock); if (qp->comp.timeout) { qp->comp.timeout_retry = 1; @@ -718,20 +742,7 @@ int rxe_completer(struct rxe_qp *qp) break; } - /* re reset the timeout counter if - * (1) QP is type RC - * (2) the QP is alive - * (3) there is a packet sent by the requester that - * might be acked (we still might get spurious - * timeouts but try to keep them as few as possible) - * (4) the timeout parameter is set - */ - if ((qp_type(qp) == IB_QPT_RC) && - (qp_state(qp) >= IB_QPS_RTS) && - (psn_compare(qp->req.psn, qp->comp.psn) > 0) && - qp->qp_timeout_jiffies) - 
mod_timer(&qp->retrans_timer, - jiffies + qp->qp_timeout_jiffies); + reset_retry_timer(qp); goto exit; case COMPST_ERROR_RETRY: @@ -793,6 +804,7 @@ int rxe_completer(struct rxe_qp *qp) */ qp->req.wait_for_rnr_timer = 1; rxe_dbg_qp(qp, "set rnr nak timer\n"); + // TODO who protects from destroy_qp?? mod_timer(&qp->rnr_nak_timer, jiffies + rnrnak_jiffies(aeth_syn(pkt) & ~AETH_TYPE_MASK)); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 9ed81d0bd25c..2bc7361152ea 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -413,11 +413,14 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, int is_request = pkt->mask & RXE_REQ_MASK; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + spin_lock_bh(&qp->state_lock); if ((is_request && (qp_state(qp) < IB_QPS_RTS)) || (!is_request && (qp_state(qp) < IB_QPS_RTR))) { + spin_unlock_bh(&qp->state_lock); rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } + spin_unlock_bh(&qp->state_lock); rxe_icrc_generate(skb, pkt); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 78c7c13e614b..c5451a4488ca 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -325,8 +325,10 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, if (err) goto err2; + spin_lock_bh(&qp->state_lock); qp->attr.qp_state = IB_QPS_RESET; qp->valid = 1; + spin_unlock_bh(&qp->state_lock); return 0; @@ -377,27 +379,9 @@ int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init) return 0; } -/* called by the modify qp verb, this routine checks all the parameters before - * making any changes - */ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { - enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ? - attr->cur_qp_state : qp->attr.qp_state; - enum ib_qp_state new_state = (mask & IB_QP_STATE) ? - attr->qp_state : cur_state; - - if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) { - rxe_dbg_qp(qp, "invalid mask or state\n"); - goto err1; - } - - if (mask & IB_QP_STATE && cur_state == IB_QPS_SQD) { - if (qp->attr.sq_draining && new_state != IB_QPS_ERR) - goto err1; - } - if (mask & IB_QP_PORT) { if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) { rxe_dbg_qp(qp, "invalid port %d\n", attr->port_num); @@ -508,22 +492,96 @@ static void rxe_qp_reset(struct rxe_qp *qp) /* move the qp to the error state */ void rxe_qp_error(struct rxe_qp *qp) { + spin_lock_bh(&qp->state_lock); qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ rxe_sched_task(&qp->resp.task); rxe_sched_task(&qp->comp.task); rxe_sched_task(&qp->req.task); + spin_unlock_bh(&qp->state_lock); +} + +static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr, + int mask) +{ + spin_lock_bh(&qp->state_lock); + qp->attr.sq_draining = 1; + rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->req.task); + spin_unlock_bh(&qp->state_lock); +} + +/* caller should hold qp->state_lock */ +static int __qp_chk_state(struct rxe_qp *qp, struct ib_qp_attr *attr, + int mask) +{ + enum ib_qp_state cur_state; + enum ib_qp_state new_state; + + cur_state = (mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : qp->attr.qp_state; + new_state = (mask & IB_QP_STATE) ? 
+ attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) + return -EINVAL; + + if (mask & IB_QP_STATE && cur_state == IB_QPS_SQD) { + if (qp->attr.sq_draining && new_state != IB_QPS_ERR) + return -EINVAL; + } + + return 0; } +static const char *const qps2str[] = { + [IB_QPS_RESET] = "RESET", + [IB_QPS_INIT] = "INIT", + [IB_QPS_RTR] = "RTR", + [IB_QPS_RTS] = "RTS", + [IB_QPS_SQD] = "SQD", + [IB_QPS_SQE] = "SQE", + [IB_QPS_ERR] = "ERR", +}; + /* called by the modify qp verb */ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata) { - enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ? - attr->cur_qp_state : qp->attr.qp_state; int err; + if (mask & IB_QP_CUR_STATE) + qp->attr.cur_qp_state = attr->qp_state; + + if (mask & IB_QP_STATE) { + spin_lock_bh(&qp->state_lock); + err = __qp_chk_state(qp, attr, mask); + if (!err) { + qp->attr.qp_state = attr->qp_state; + rxe_dbg_qp(qp, "state -> %s\n", + qps2str[attr->qp_state]); + } + spin_unlock_bh(&qp->state_lock); + + if (err) + return err; + + switch (attr->qp_state) { + case IB_QPS_RESET: + rxe_qp_reset(qp); + break; + case IB_QPS_SQD: + rxe_qp_sqd(qp, attr, mask); + break; + case IB_QPS_ERR: + rxe_qp_error(qp); + break; + default: + break; + } + } + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { int max_rd_atomic = attr->max_rd_atomic ? roundup_pow_of_two(attr->max_rd_atomic) : 0; @@ -545,9 +603,6 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, return err; } - if (mask & IB_QP_CUR_STATE) - qp->attr.cur_qp_state = attr->qp_state; - if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY) qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify; @@ -627,48 +682,6 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, if (mask & IB_QP_DEST_QPN) qp->attr.dest_qp_num = attr->dest_qp_num; - if (mask & IB_QP_STATE) { - qp->attr.qp_state = attr->qp_state; - - switch (attr->qp_state) { - case IB_QPS_RESET: - rxe_dbg_qp(qp, "state -> RESET\n"); - rxe_qp_reset(qp); - break; - - case IB_QPS_INIT: - rxe_dbg_qp(qp, "state -> INIT\n"); - break; - - case IB_QPS_RTR: - rxe_dbg_qp(qp, "state -> RTR\n"); - break; - - case IB_QPS_RTS: - rxe_dbg_qp(qp, "state -> RTS\n"); - break; - - case IB_QPS_SQD: - rxe_dbg_qp(qp, "state -> SQD\n"); - if (cur_state != IB_QPS_SQD) { - qp->attr.sq_draining = 1; - rxe_sched_task(&qp->comp.task); - rxe_sched_task(&qp->req.task); - } - break; - - case IB_QPS_SQE: - rxe_dbg_qp(qp, "state -> SQE !!?\n"); - /* Not possible from modify_qp. */ - break; - - case IB_QPS_ERR: - rxe_dbg_qp(qp, "state -> ERR\n"); - rxe_qp_error(qp); - break; - } - } - return 0; } @@ -695,10 +708,12 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) /* Applications that get this state typically spin on it. 
* Yield the processor */ - if (qp->attr.sq_draining) + spin_lock_bh(&qp->state_lock); + if (qp->attr.sq_draining) { + spin_unlock_bh(&qp->state_lock); cond_resched(); - - rxe_dbg_qp(qp, "attr->sq_draining = %d\n", attr->sq_draining); + } + spin_unlock_bh(&qp->state_lock); return 0; } @@ -722,7 +737,9 @@ static void rxe_qp_do_cleanup(struct work_struct *work) { struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); + spin_lock_bh(&qp->state_lock); qp->valid = 0; + spin_unlock_bh(&qp->state_lock); qp->qp_timeout_jiffies = 0; if (qp_type(qp) == IB_QPT_RC) { diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index ca17ac6c5878..2f953cc74256 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -38,13 +38,19 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, return -EINVAL; } + spin_lock_bh(&qp->state_lock); if (pkt->mask & RXE_REQ_MASK) { - if (unlikely(qp_state(qp) < IB_QPS_RTR)) + if (unlikely(qp_state(qp) < IB_QPS_RTR)) { + spin_unlock_bh(&qp->state_lock); return -EINVAL; + } } else { - if (unlikely(qp_state(qp) < IB_QPS_RTS)) + if (unlikely(qp_state(qp) < IB_QPS_RTS)) { + spin_unlock_bh(&qp->state_lock); return -EINVAL; + } } + spin_unlock_bh(&qp->state_lock); return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index f329038efbc8..8e50d116d273 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -102,24 +102,33 @@ void rnr_nak_timer(struct timer_list *t) rxe_dbg_qp(qp, "nak timer fired\n"); - /* request a send queue retry */ - qp->req.need_retry = 1; - qp->req.wait_for_rnr_timer = 0; - rxe_sched_task(&qp->req.task); + spin_lock_bh(&qp->state_lock); + if (qp->valid) { + /* request a send queue retry */ + qp->req.need_retry = 1; + qp->req.wait_for_rnr_timer = 0; + rxe_sched_task(&qp->req.task); + } + spin_unlock_bh(&qp->state_lock); } static void req_check_sq_drain_done(struct rxe_qp *qp) { - struct rxe_queue *q = qp->sq.queue; - unsigned int index = qp->req.wqe_index; - unsigned int cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT); - struct rxe_send_wqe *wqe = queue_addr_from_index(q, cons); + struct rxe_queue *q; + unsigned int index; + unsigned int cons; + struct rxe_send_wqe *wqe; + + spin_lock_bh(&qp->state_lock); + if (qp_state(qp) == IB_QPS_SQD) { + q = qp->sq.queue; + index = qp->req.wqe_index; + cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT); + wqe = queue_addr_from_index(q, cons); - if (unlikely(qp_state(qp) == IB_QPS_SQD)) { /* check to see if we are drained; * state_lock used by requester and completer */ - spin_lock_bh(&qp->state_lock); do { if (!qp->attr.sq_draining) /* comp just finished */ @@ -144,28 +153,40 @@ static void req_check_sq_drain_done(struct rxe_qp *qp) } return; } while (0); - spin_unlock_bh(&qp->state_lock); } + spin_unlock_bh(&qp->state_lock); } -static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) +static struct rxe_send_wqe *__req_next_wqe(struct rxe_qp *qp) { - struct rxe_send_wqe *wqe; struct rxe_queue *q = qp->sq.queue; unsigned int index = qp->req.wqe_index; unsigned int prod; - req_check_sq_drain_done(qp); - prod = queue_get_producer(q, QUEUE_TYPE_FROM_CLIENT); if (index == prod) return NULL; + else + return queue_addr_from_index(q, index); +} - wqe = queue_addr_from_index(q, index); +static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) +{ + struct rxe_send_wqe *wqe; + + req_check_sq_drain_done(qp); + + wqe = 
__req_next_wqe(qp); + if (wqe == NULL) + return NULL; + spin_lock(&qp->state_lock); if (unlikely((qp_state(qp) == IB_QPS_SQD) && - (wqe->state != wqe_state_processing))) + (wqe->state != wqe_state_processing))) { + spin_unlock(&qp->state_lock); return NULL; + } + spin_unlock(&qp->state_lock); wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); return wqe; @@ -656,15 +677,16 @@ int rxe_requester(struct rxe_qp *qp) struct rxe_ah *ah; struct rxe_av *av; - if (unlikely(!qp->valid)) + spin_lock_bh(&qp->state_lock); + if (unlikely(!qp->valid)) { + spin_unlock_bh(&qp->state_lock); goto exit; + } if (unlikely(qp_state(qp) == IB_QPS_ERR)) { - wqe = req_next_wqe(qp); + wqe = __req_next_wqe(qp); + spin_unlock_bh(&qp->state_lock); if (wqe) - /* - * Generate an error completion for error qp state - */ goto err; else goto exit; @@ -678,8 +700,10 @@ int rxe_requester(struct rxe_qp *qp) qp->req.wait_psn = 0; qp->req.need_retry = 0; qp->req.wait_for_rnr_timer = 0; + spin_unlock_bh(&qp->state_lock); goto exit; } + spin_unlock_bh(&qp->state_lock); /* we come here if the retransmit timer has fired * or if the rnr timer has fired. If the retransmit @@ -839,8 +863,7 @@ err: /* update wqe_index for each wqe completion */ qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); wqe->state = wqe_state_error; - qp->attr.qp_state = IB_QPS_ERR; - rxe_sched_task(&qp->comp.task); + rxe_qp_error(qp); exit: ret = -EAGAIN; out: diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 67eac616235c..68f6cd188d8e 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1137,8 +1137,13 @@ static enum resp_states do_complete(struct rxe_qp *qp, return RESPST_ERR_CQ_OVERFLOW; finish: - if (unlikely(qp_state(qp) == IB_QPS_ERR)) + spin_lock_bh(&qp->state_lock); + if (unlikely(qp_state(qp) == IB_QPS_ERR)) { + spin_unlock_bh(&qp->state_lock); return RESPST_CHK_RESOURCE; + } + spin_unlock_bh(&qp->state_lock); + if (unlikely(!pkt)) return RESPST_DONE; if (qp_type(qp) == IB_QPT_RC) @@ -1464,14 +1469,17 @@ int rxe_responder(struct rxe_qp *qp) struct rxe_pkt_info *pkt = NULL; int ret; + spin_lock_bh(&qp->state_lock); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || - qp_state(qp) == IB_QPS_RESET) { + qp_state(qp) == IB_QPS_RESET) { bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); drain_req_pkts(qp); flush_recv_queue(qp, notify); + spin_unlock_bh(&qp->state_lock); goto exit; } + spin_unlock_bh(&qp->state_lock); qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 6d97c7093ae6..dea605b7f683 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -660,42 +660,70 @@ err_out: } /* send wr */ + +/* sanity check incoming send work request */ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, - unsigned int mask, unsigned int length) + unsigned int *maskp, unsigned int *lengthp) { int num_sge = ibwr->num_sge; struct rxe_sq *sq = &qp->sq; + unsigned int mask = 0; + unsigned long length = 0; + int err = -EINVAL; + int i; - if (unlikely(num_sge > sq->max_sge)) { - rxe_dbg_qp(qp, "num_sge > max_sge"); - goto err_out; - } + do { + mask = wr_opcode_mask(ibwr->opcode, qp); + if (!mask) { + rxe_err_qp(qp, "bad wr opcode for qp type"); + break; + } - if (unlikely(mask & WR_ATOMIC_MASK)) { - if (length != 8) { - rxe_dbg_qp(qp, "atomic length != 8"); - goto err_out; + if (num_sge > sq->max_sge) { + 
rxe_err_qp(qp, "num_sge > max_sge"); + break; } - if (atomic_wr(ibwr)->remote_addr & 0x7) { - rxe_dbg_qp(qp, "misaligned atomic address"); - goto err_out; + length = 0; + for (i = 0; i < ibwr->num_sge; i++) + length += ibwr->sg_list[i].length; + + if (length > (1UL << 31)) { + rxe_err_qp(qp, "message length too long"); + break; } - } - if (unlikely((ibwr->send_flags & IB_SEND_INLINE) && - (length > sq->max_inline))) { - rxe_dbg_qp(qp, "inline length too big"); - goto err_out; - } + if (mask & WR_ATOMIC_MASK) { + if (length != 8) { + rxe_err_qp(qp, "atomic length != 8"); + break; + } + if (atomic_wr(ibwr)->remote_addr & 0x7) { + rxe_err_qp(qp, "misaligned atomic address"); + break; + } + } + if (ibwr->send_flags & IB_SEND_INLINE) { + if (!(mask & WR_INLINE_MASK)) { + rxe_err_qp(qp, "opcode doesn't support inline data"); + break; + } + if (length > sq->max_inline) { + rxe_err_qp(qp, "inline length too big"); + break; + } + } - return 0; + err = 0; + } while (0); -err_out: - return -EINVAL; + *maskp = mask; + *lengthp = (int)length; + + return err; } -static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, +static int init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, const struct ib_send_wr *ibwr) { wr->wr_id = ibwr->wr_id; @@ -711,8 +739,18 @@ static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, wr->wr.ud.ah_num = to_rah(ibah)->ah_num; if (qp_type(qp) == IB_QPT_GSI) wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; - if (wr->opcode == IB_WR_SEND_WITH_IMM) + + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; + break; + case IB_WR_SEND: + break; + default: + rxe_err_qp(qp, "bad wr opcode %d for UD/GSI QP", + wr->opcode); + return -EINVAL; + } } else { switch (wr->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: @@ -729,6 +767,11 @@ static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, case IB_WR_SEND_WITH_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; break; + case IB_WR_RDMA_READ_WITH_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; + wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; + break; case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: wr->wr.atomic.remote_addr = @@ -746,10 +789,20 @@ static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, wr->wr.reg.key = reg_wr(ibwr)->key; wr->wr.reg.access = reg_wr(ibwr)->access; break; + case IB_WR_SEND: + case IB_WR_BIND_MW: + case IB_WR_FLUSH: + case IB_WR_ATOMIC_WRITE: + break; default: + rxe_err_qp(qp, "unsupported wr opcode %d", + wr->opcode); + return -EINVAL; break; } } + + return 0; } static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, @@ -765,19 +818,22 @@ static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, } } -static void init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, +static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int mask, unsigned int length, struct rxe_send_wqe *wqe) { int num_sge = ibwr->num_sge; + int err; - init_send_wr(qp, &wqe->wr, ibwr); + err = init_send_wr(qp, &wqe->wr, ibwr); + if (err) + return err; /* local operation */ if (unlikely(mask & WR_LOCAL_OP_MASK)) { wqe->mask = mask; wqe->state = wqe_state_posted; - return; + return 0; } if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) @@ -796,93 +852,62 @@ static void init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, wqe->dma.sge_offset = 0; wqe->state = wqe_state_posted; wqe->ssn = atomic_add_return(1, &qp->ssn); + + return 0; } 
-static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr, - unsigned int mask, u32 length) +static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr) { int err; struct rxe_sq *sq = &qp->sq; struct rxe_send_wqe *send_wqe; - unsigned long flags; + unsigned int mask; + unsigned int length; int full; - err = validate_send_wr(qp, ibwr, mask, length); + err = validate_send_wr(qp, ibwr, &mask, &length); if (err) return err; - spin_lock_irqsave(&qp->sq.sq_lock, flags); - full = queue_full(sq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { - spin_unlock_irqrestore(&qp->sq.sq_lock, flags); - rxe_dbg_qp(qp, "queue full"); + rxe_err_qp(qp, "send queue full"); return -ENOMEM; } send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_FROM_ULP); - init_send_wqe(qp, ibwr, mask, length, send_wqe); - - queue_advance_producer(sq->queue, QUEUE_TYPE_FROM_ULP); - - spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + err = init_send_wqe(qp, ibwr, mask, length, send_wqe); + if (!err) + queue_advance_producer(sq->queue, QUEUE_TYPE_FROM_ULP); - return 0; + return err; } -static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *wr, +static int rxe_post_send_kernel(struct rxe_qp *qp, + const struct ib_send_wr *ibwr, const struct ib_send_wr **bad_wr) { int err = 0; - unsigned int mask; - unsigned int length = 0; - int i; - struct ib_send_wr *next; - - while (wr) { - mask = wr_opcode_mask(wr->opcode, qp); - if (unlikely(!mask)) { - rxe_dbg_qp(qp, "bad wr opcode for qp"); - err = -EINVAL; - *bad_wr = wr; - break; - } - - if (unlikely((wr->send_flags & IB_SEND_INLINE) && - !(mask & WR_INLINE_MASK))) { - rxe_dbg_qp(qp, "opcode doesn't support inline data"); - err = -EINVAL; - *bad_wr = wr; - break; - } - - next = wr->next; - - length = 0; - for (i = 0; i < wr->num_sge; i++) - length += wr->sg_list[i].length; - if (length > 1<<31) { - err = -EINVAL; - rxe_dbg_qp(qp, "message length too long"); - *bad_wr = wr; - break; - } + unsigned long flags; - err = post_one_send(qp, wr, mask, length); + spin_lock_irqsave(&qp->sq.sq_lock, flags); + while (ibwr) { + err = post_one_send(qp, ibwr); if (err) { - *bad_wr = wr; + *bad_wr = ibwr; break; } - - wr = next; + ibwr = ibwr->next; } + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); - /* if we didn't post anything there's nothing to do */ if (!err) rxe_sched_task(&qp->req.task); - if (unlikely(qp_state(qp) == IB_QPS_ERR)) + spin_lock_bh(&qp->state_lock); + if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->comp.task); + spin_unlock_bh(&qp->state_lock); return err; } @@ -893,19 +918,21 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct rxe_qp *qp = to_rqp(ibqp); int err; - if (unlikely(!qp->valid)) { - *bad_wr = wr; - err = -EINVAL; - rxe_dbg_qp(qp, "qp destroyed"); - goto err_out; + spin_lock_bh(&qp->state_lock); + /* caller has already called destroy_qp */ + if (WARN_ON_ONCE(!qp->valid)) { + spin_unlock_bh(&qp->state_lock); + rxe_err_qp(qp, "qp has been destroyed"); + return -EINVAL; } if (unlikely(qp_state(qp) < IB_QPS_RTS)) { + spin_unlock_bh(&qp->state_lock); *bad_wr = wr; - err = -EINVAL; - rxe_dbg_qp(qp, "qp not ready to send"); - goto err_out; + rxe_err_qp(qp, "qp not ready to send"); + return -EINVAL; } + spin_unlock_bh(&qp->state_lock); if (qp->is_user) { /* Utilize process context to do protocol processing */ @@ -913,14 +940,10 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, } else { err = rxe_post_send_kernel(qp, wr, bad_wr); if (err) - goto err_out; + return 
err; } return 0; - -err_out: - rxe_err_qp(qp, "returned err = %d", err); - return err; } /* recv wr */ @@ -985,18 +1008,27 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, struct rxe_rq *rq = &qp->rq; unsigned long flags; - if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid)) { + spin_lock_bh(&qp->state_lock); + /* caller has already called destroy_qp */ + if (WARN_ON_ONCE(!qp->valid)) { + spin_unlock_bh(&qp->state_lock); + rxe_err_qp(qp, "qp has been destroyed"); + return -EINVAL; + } + + /* see C10-97.2.1 */ + if (unlikely((qp_state(qp) < IB_QPS_INIT))) { + spin_unlock_bh(&qp->state_lock); *bad_wr = wr; - err = -EINVAL; - rxe_dbg_qp(qp, "qp destroyed or not ready to post recv"); - goto err_out; + rxe_dbg_qp(qp, "qp not ready to post recv"); + return -EINVAL; } + spin_unlock_bh(&qp->state_lock); if (unlikely(qp->srq)) { *bad_wr = wr; - err = -EINVAL; - rxe_dbg_qp(qp, "use post_srq_recv instead"); - goto err_out; + rxe_dbg_qp(qp, "qp has srq, use post_srq_recv instead"); + return -EINVAL; } spin_lock_irqsave(&rq->producer_lock, flags); @@ -1012,12 +1044,10 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, spin_unlock_irqrestore(&rq->producer_lock, flags); + spin_lock_bh(&qp->state_lock); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->resp.task); - -err_out: - if (err) - rxe_err_qp(qp, "returned err = %d", err); + spin_unlock_bh(&qp->state_lock); return err; } -- cgit v1.2.3 From 10af303192bc5490bb39b29541ecb0ead2eff1ce Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Tue, 18 Apr 2023 18:06:42 +0900 Subject: RDMA/rxe: Fix spinlock recursion deadlock on requester The following deadlock is observed: Call Trace: _raw_spin_lock_bh+0x29/0x30 check_type_state.constprop.0+0x4e/0xc0 [rdma_rxe] rxe_rcv+0x173/0x3d0 [rdma_rxe] rxe_udp_encap_recv+0x69/0xd0 [rdma_rxe] ? __pfx_rxe_udp_encap_recv+0x10/0x10 [rdma_rxe] udp_queue_rcv_one_skb+0x258/0x520 udp_unicast_rcv_skb+0x75/0x90 __udp4_lib_rcv+0x364/0x5c0 ip_protocol_deliver_rcu+0xa7/0x160 ip_local_deliver_finish+0x73/0xa0 ip_sublist_rcv_finish+0x80/0x90 ip_sublist_rcv+0x191/0x220 ip_list_rcv+0x132/0x160 __netif_receive_skb_list_core+0x297/0x2c0 netif_receive_skb_list_internal+0x1c5/0x300 napi_complete_done+0x6f/0x1b0 virtnet_poll+0x1f4/0x2d0 [virtio_net] __napi_poll+0x2c/0x1b0 net_rx_action+0x293/0x350 ? __napi_schedule+0x79/0x90 __do_softirq+0xcb/0x2ab __irq_exit_rcu+0xb9/0xf0 common_interrupt+0x80/0xa0 asm_common_interrupt+0x22/0x40 RIP: 0010:_raw_spin_lock+0x17/0x30 rxe_requester+0xe4/0x8f0 [rdma_rxe] ? xas_load+0x9/0xa0 ? xa_load+0x70/0xb0 do_task+0x64/0x1f0 [rdma_rxe] rxe_post_send+0x54/0x110 [rdma_rxe] ib_uverbs_post_send+0x5f8/0x680 [ib_uverbs] ? netif_receive_skb_list_internal+0x1e3/0x300 ib_uverbs_write+0x3c8/0x500 [ib_uverbs] vfs_write+0xc5/0x3b0 ksys_write+0xab/0xe0 ? syscall_trace_enter.constprop.0+0x126/0x1a0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc The deadlock is easily reproducible with perftest. Fix it by disabling softirq when acquiring the lock in process context. 
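In sketch form (illustrative only, not part of the change): qp->state_lock is also taken from softirq context by the receive path, so process context must defer bottom halves while holding it:

    spin_lock(&qp->state_lock);    /* unsafe: rxe_rcv() running in softirq
                                    * on the same CPU can spin forever on
                                    * the lock we already hold */

    spin_lock_bh(&qp->state_lock); /* safe: softirqs are deferred on this
                                    * CPU while the lock is held */
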
Fixes: f605f26ea196 ("RDMA/rxe: Protect QP state with qp->state_lock") Link: https://lore.kernel.org/r/20230418090642.1849358-1-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Acked-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 8e50d116d273..65134a9aefe7 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -180,13 +180,13 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) if (wqe == NULL) return NULL; - spin_lock(&qp->state_lock); + spin_lock_bh(&qp->state_lock); if (unlikely((qp_state(qp) == IB_QPS_SQD) && (wqe->state != wqe_state_processing))) { - spin_unlock(&qp->state_lock); + spin_unlock_bh(&qp->state_lock); return NULL; } - spin_unlock(&qp->state_lock); + spin_unlock_bh(&qp->state_lock); wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); return wqe; -- cgit v1.2.3
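
Taken together, this series leaves qp->attr.qp_state (plus qp->attr.sq_draining) as the single source of truth for QP state in rxe. For context, a consumer-side sketch of the drain handshake the patches implement (illustrative only; error handling omitted, and polling could equally be replaced by waiting for the IB_EVENT_SQ_DRAINED async event that the driver raises):

    struct ib_qp_attr attr = { .qp_state = IB_QPS_SQD };
    struct ib_qp_init_attr init_attr;

    /* ask the driver to start draining the send queue ... */
    ib_modify_qp(qp, &attr, IB_QP_STATE);

    /* ... then poll until sq_draining clears; rxe_qp_to_attr()
     * calls cond_resched() while draining is still in progress
     */
    do {
    	ib_query_qp(qp, &attr, IB_QP_STATE, &init_attr);
    } while (attr.sq_draining);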