Diffstat (limited to 'fs/ksmbd/transport_rdma.c')
-rw-r--r-- | fs/ksmbd/transport_rdma.c | 363
1 file changed, 213 insertions(+), 150 deletions(-)
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index e646d79554b8..d035e060c2f0 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -80,9 +80,7 @@
 static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
 /* The maximum single-message size which can be received */
 static int smb_direct_max_receive_size = 8192;
-static int smb_direct_max_read_write_size = 524224;
-
-static int smb_direct_max_outstanding_rw_ops = 8;
+static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;
 
 static LIST_HEAD(smb_direct_device_list);
 static DEFINE_RWLOCK(smb_direct_device_lock);
@@ -147,18 +145,18 @@ struct smb_direct_transport {
 	atomic_t send_credits;
 	spinlock_t lock_new_recv_credits;
 	int new_recv_credits;
-	atomic_t rw_avail_ops;
+	int max_rw_credits;
+	int pages_per_rw_credit;
+	atomic_t rw_credits;
 
 	wait_queue_head_t wait_send_credits;
-	wait_queue_head_t wait_rw_avail_ops;
+	wait_queue_head_t wait_rw_credits;
 
 	mempool_t *sendmsg_mempool;
 	struct kmem_cache *sendmsg_cache;
 	mempool_t *recvmsg_mempool;
 	struct kmem_cache *recvmsg_cache;
 
-	wait_queue_head_t wait_send_payload_pending;
-	atomic_t send_payload_pending;
 	wait_queue_head_t wait_send_pending;
 	atomic_t send_pending;
 
@@ -208,12 +206,25 @@ struct smb_direct_recvmsg {
 struct smb_direct_rdma_rw_msg {
 	struct smb_direct_transport *t;
 	struct ib_cqe cqe;
+	int status;
 	struct completion *completion;
+	struct list_head list;
 	struct rdma_rw_ctx rw_ctx;
 	struct sg_table sgt;
 	struct scatterlist sg_list[];
 };
 
+void init_smbd_max_io_size(unsigned int sz)
+{
+	sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
+	smb_direct_max_read_write_size = sz;
+}
+
+unsigned int get_smbd_max_read_write_size(void)
+{
+	return smb_direct_max_read_write_size;
+}
+
 static inline int get_buf_page_count(void *buf, int size)
 {
 	return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) -
@@ -377,7 +388,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
 	t->reassembly_queue_length = 0;
 	init_waitqueue_head(&t->wait_reassembly_queue);
 	init_waitqueue_head(&t->wait_send_credits);
-	init_waitqueue_head(&t->wait_rw_avail_ops);
+	init_waitqueue_head(&t->wait_rw_credits);
 
 	spin_lock_init(&t->receive_credit_lock);
 	spin_lock_init(&t->recvmsg_queue_lock);
@@ -386,8 +397,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
 	spin_lock_init(&t->empty_recvmsg_queue_lock);
 	INIT_LIST_HEAD(&t->empty_recvmsg_queue);
 
-	init_waitqueue_head(&t->wait_send_payload_pending);
-	atomic_set(&t->send_payload_pending, 0);
 	init_waitqueue_head(&t->wait_send_pending);
 	atomic_set(&t->send_pending, 0);
 
@@ -417,8 +426,6 @@ static void free_transport(struct smb_direct_transport *t)
 	wake_up_interruptible(&t->wait_send_credits);
 
 	ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n");
-	wait_event(t->wait_send_payload_pending,
-		   atomic_read(&t->send_payload_pending) == 0);
 	wait_event(t->wait_send_pending,
 		   atomic_read(&t->send_pending) == 0);
 
@@ -569,6 +576,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
 		}
 		t->negotiation_requested = true;
 		t->full_packet_received = true;
+		t->status = SMB_DIRECT_CS_CONNECTED;
 		enqueue_reassembly(t, recvmsg, 0);
 		wake_up_interruptible(&t->wait_status);
 		break;
@@ -873,13 +881,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
 		smb_direct_disconnect_rdma_connection(t);
 	}
 
-	if (sendmsg->num_sge > 1) {
-		if (atomic_dec_and_test(&t->send_payload_pending))
-			wake_up(&t->wait_send_payload_pending);
-	} else {
-		if (atomic_dec_and_test(&t->send_pending))
-			wake_up(&t->wait_send_pending);
-	}
+	if (atomic_dec_and_test(&t->send_pending))
+		wake_up(&t->wait_send_pending);
 
 	/* iterate and free the list of messages in reverse. the list's head
 	 * is invalid.
@@ -911,21 +914,12 @@ static int smb_direct_post_send(struct smb_direct_transport *t,
 {
 	int ret;
 
-	if (wr->num_sge > 1)
-		atomic_inc(&t->send_payload_pending);
-	else
-		atomic_inc(&t->send_pending);
-
+	atomic_inc(&t->send_pending);
 	ret = ib_post_send(t->qp, wr, NULL);
 	if (ret) {
 		pr_err("failed to post send: %d\n", ret);
-		if (wr->num_sge > 1) {
-			if (atomic_dec_and_test(&t->send_payload_pending))
-				wake_up(&t->wait_send_payload_pending);
-		} else {
-			if (atomic_dec_and_test(&t->send_pending))
-				wake_up(&t->wait_send_pending);
-		}
+		if (atomic_dec_and_test(&t->send_pending))
+			wake_up(&t->wait_send_pending);
 		smb_direct_disconnect_rdma_connection(t);
 	}
 	return ret;
@@ -983,18 +977,19 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t,
 }
 
 static int wait_for_credits(struct smb_direct_transport *t,
-			    wait_queue_head_t *waitq, atomic_t *credits)
+			    wait_queue_head_t *waitq, atomic_t *total_credits,
+			    int needed)
 {
 	int ret;
 
 	do {
-		if (atomic_dec_return(credits) >= 0)
+		if (atomic_sub_return(needed, total_credits) >= 0)
 			return 0;
 
-		atomic_inc(credits);
+		atomic_add(needed, total_credits);
 		ret = wait_event_interruptible(*waitq,
-					       atomic_read(credits) > 0 ||
-					       t->status != SMB_DIRECT_CS_CONNECTED);
+					       atomic_read(total_credits) >= needed ||
+					       t->status != SMB_DIRECT_CS_CONNECTED);
 
 		if (t->status != SMB_DIRECT_CS_CONNECTED)
 			return -ENOTCONN;
@@ -1015,7 +1010,19 @@ static int wait_for_send_credits(struct smb_direct_transport *t,
 		return ret;
 	}
 
-	return wait_for_credits(t, &t->wait_send_credits, &t->send_credits);
+	return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1);
+}
+
+static int wait_for_rw_credits(struct smb_direct_transport *t, int credits)
+{
+	return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits);
+}
+
+static int calc_rw_credits(struct smb_direct_transport *t,
+			   char *buf, unsigned int len)
+{
+	return DIV_ROUND_UP(get_buf_page_count(buf, len),
+			    t->pages_per_rw_credit);
 }
 
 static int smb_direct_create_header(struct smb_direct_transport *t,
@@ -1086,7 +1093,7 @@ static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nen
 	int offset, len;
 	int i = 0;
 
-	if (nentries < get_buf_page_count(buf, size))
+	if (size <= 0 || nentries < get_buf_page_count(buf, size))
 		return -EINVAL;
 
 	offset = offset_in_page(buf);
@@ -1118,7 +1125,7 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size,
 	int npages;
 
 	npages = get_sg_list(buf, size, sg_list, nentries);
-	if (npages <= 0)
+	if (npages < 0)
 		return -EINVAL;
 	return ib_dma_map_sg(device, sg_list, npages, dir);
 }
@@ -1313,11 +1320,21 @@ done:
 	 * that means all the I/Os have been out and we are good to return
 	 */
 
-	wait_event(st->wait_send_payload_pending,
-		   atomic_read(&st->send_payload_pending) == 0);
+	wait_event(st->wait_send_pending,
+		   atomic_read(&st->send_pending) == 0);
 	return ret;
 }
 
+static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
+					struct smb_direct_rdma_rw_msg *msg,
+					enum dma_data_direction dir)
+{
+	rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
+			    msg->sgt.sgl, msg->sgt.nents, dir);
+	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+	kfree(msg);
+}
+
 static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
 			    enum dma_data_direction dir)
 {
@@ -1326,19 +1343,14 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
 	struct smb_direct_transport *t = msg->t;
 
 	if (wc->status != IB_WC_SUCCESS) {
+		msg->status = -EIO;
 		pr_err("read/write error. opcode = %d, status = %s(%d)\n",
 		       wc->opcode, ib_wc_status_msg(wc->status), wc->status);
-		smb_direct_disconnect_rdma_connection(t);
+		if (wc->status != IB_WC_WR_FLUSH_ERR)
+			smb_direct_disconnect_rdma_connection(t);
 	}
 
-	if (atomic_inc_return(&t->rw_avail_ops) > 0)
-		wake_up(&t->wait_rw_avail_ops);
-
-	rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
-			    msg->sg_list, msg->sgt.nents, dir);
-	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
 	complete(msg->completion);
-	kfree(msg);
 }
 
 static void read_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -1351,94 +1363,141 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc)
 	read_write_done(cq, wc, DMA_TO_DEVICE);
 }
 
-static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf,
-				int buf_len, u32 remote_key, u64 remote_offset,
-				u32 remote_len, bool is_read)
+static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
+				void *buf, int buf_len,
+				struct smb2_buffer_desc_v1 *desc,
+				unsigned int desc_len,
+				bool is_read)
 {
-	struct smb_direct_rdma_rw_msg *msg;
-	int ret;
+	struct smb_direct_rdma_rw_msg *msg, *next_msg;
+	int i, ret;
 	DECLARE_COMPLETION_ONSTACK(completion);
-	struct ib_send_wr *first_wr = NULL;
+	struct ib_send_wr *first_wr;
+	LIST_HEAD(msg_list);
+	char *desc_buf;
+	int credits_needed;
+	unsigned int desc_buf_len;
+	size_t total_length = 0;
+
+	if (t->status != SMB_DIRECT_CS_CONNECTED)
+		return -ENOTCONN;
 
-	ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops);
+	/* calculate needed credits */
+	credits_needed = 0;
+	desc_buf = buf;
+	for (i = 0; i < desc_len / sizeof(*desc); i++) {
+		desc_buf_len = le32_to_cpu(desc[i].length);
+
+		credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len);
+		desc_buf += desc_buf_len;
+		total_length += desc_buf_len;
+		if (desc_buf_len == 0 || total_length > buf_len ||
+		    total_length > t->max_rdma_rw_size)
+			return -EINVAL;
+	}
+
+	ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n",
+		    is_read ? "read" : "write", buf_len, credits_needed);
+
+	ret = wait_for_rw_credits(t, credits_needed);
 	if (ret < 0)
 		return ret;
 
-	/* TODO: mempool */
-	msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
-		      sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
-	if (!msg) {
-		atomic_inc(&t->rw_avail_ops);
-		return -ENOMEM;
-	}
+	/* build rdma_rw_ctx for each descriptor */
+	desc_buf = buf;
+	for (i = 0; i < desc_len / sizeof(*desc); i++) {
+		msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
+			      sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
+		if (!msg) {
+			ret = -ENOMEM;
+			goto out;
+		}
 
-	msg->sgt.sgl = &msg->sg_list[0];
-	ret = sg_alloc_table_chained(&msg->sgt,
-				     get_buf_page_count(buf, buf_len),
-				     msg->sg_list, SG_CHUNK_SIZE);
-	if (ret) {
-		atomic_inc(&t->rw_avail_ops);
-		kfree(msg);
-		return -ENOMEM;
-	}
+		desc_buf_len = le32_to_cpu(desc[i].length);
 
-	ret = get_sg_list(buf, buf_len, msg->sgt.sgl, msg->sgt.orig_nents);
-	if (ret <= 0) {
-		pr_err("failed to get pages\n");
-		goto err;
-	}
+		msg->t = t;
+		msg->cqe.done = is_read ? read_done : write_done;
+		msg->completion = &completion;
 
-	ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
-			       msg->sg_list, get_buf_page_count(buf, buf_len),
-			       0, remote_offset, remote_key,
-			       is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
-	if (ret < 0) {
-		pr_err("failed to init rdma_rw_ctx: %d\n", ret);
-		goto err;
+		msg->sgt.sgl = &msg->sg_list[0];
+		ret = sg_alloc_table_chained(&msg->sgt,
+					     get_buf_page_count(desc_buf, desc_buf_len),
+					     msg->sg_list, SG_CHUNK_SIZE);
+		if (ret) {
+			kfree(msg);
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = get_sg_list(desc_buf, desc_buf_len,
+				  msg->sgt.sgl, msg->sgt.orig_nents);
+		if (ret < 0) {
+			sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+			kfree(msg);
+			goto out;
+		}
+
+		ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
+				       msg->sgt.sgl,
+				       get_buf_page_count(desc_buf, desc_buf_len),
+				       0,
+				       le64_to_cpu(desc[i].offset),
+				       le32_to_cpu(desc[i].token),
+				       is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+		if (ret < 0) {
+			pr_err("failed to init rdma_rw_ctx: %d\n", ret);
+			sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+			kfree(msg);
+			goto out;
+		}
+
+		list_add_tail(&msg->list, &msg_list);
+		desc_buf += desc_buf_len;
 	}
 
-	msg->t = t;
-	msg->cqe.done = is_read ? read_done : write_done;
-	msg->completion = &completion;
-	first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
-				   &msg->cqe, NULL);
+	/* concatenate work requests of rdma_rw_ctxs */
+	first_wr = NULL;
+	list_for_each_entry_reverse(msg, &msg_list, list) {
+		first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
+					   &msg->cqe, first_wr);
+	}
 
 	ret = ib_post_send(t->qp, first_wr, NULL);
 	if (ret) {
-		pr_err("failed to post send wr: %d\n", ret);
-		goto err;
+		pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
+		goto out;
 	}
 
+	msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list);
 	wait_for_completion(&completion);
-	return 0;
-
-err:
-	atomic_inc(&t->rw_avail_ops);
-	if (first_wr)
-		rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
-				    msg->sg_list, msg->sgt.nents,
-				    is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
-	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
-	kfree(msg);
+	ret = msg->status;
+out:
+	list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
+		list_del(&msg->list);
+		smb_direct_free_rdma_rw_msg(t, msg,
+					    is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	}
+	atomic_add(credits_needed, &t->rw_credits);
+	wake_up(&t->wait_rw_credits);
 	return ret;
 }
 
-static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf,
-				 unsigned int buflen, u32 remote_key,
-				 u64 remote_offset, u32 remote_len)
+static int smb_direct_rdma_write(struct ksmbd_transport *t,
+				 void *buf, unsigned int buflen,
+				 struct smb2_buffer_desc_v1 *desc,
+				 unsigned int desc_len)
 {
 	return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
-				    remote_key, remote_offset,
-				    remote_len, false);
+				    desc, desc_len, false);
 }
 
-static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf,
-				unsigned int buflen, u32 remote_key,
-				u64 remote_offset, u32 remote_len)
+static int smb_direct_rdma_read(struct ksmbd_transport *t,
+				void *buf, unsigned int buflen,
+				struct smb2_buffer_desc_v1 *desc,
+				unsigned int desc_len)
 {
 	return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
-				    remote_key, remote_offset,
-				    remote_len, true);
+				    desc, desc_len, true);
 }
 
 static void smb_direct_disconnect(struct ksmbd_transport *t)
@@ -1638,41 +1697,57 @@ out_err:
 	return ret;
 }
 
+static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t)
+{
+	return min_t(unsigned int,
+		     t->cm_id->device->attrs.max_fast_reg_page_list_len,
+		     256);
+}
+
 static int smb_direct_init_params(struct smb_direct_transport *t,
 				  struct ib_qp_cap *cap)
 {
 	struct ib_device *device = t->cm_id->device;
-	int max_send_sges, max_pages, max_rw_wrs, max_send_wrs;
+	int max_send_sges, max_rw_wrs, max_send_wrs;
+	unsigned int max_sge_per_wr, wrs_per_credit;
 
-	/* need 2 more sge. because a SMB_DIRECT header will be mapped,
-	 * and maybe a send buffer could be not page aligned.
+	/* need 3 more sge. because a SMB_DIRECT header, SMB2 header,
+	 * SMB2 response could be mapped.
 	 */
 	t->max_send_size = smb_direct_max_send_size;
-	max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 2;
+	max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3;
 	if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) {
 		pr_err("max_send_size %d is too large\n", t->max_send_size);
 		return -EINVAL;
 	}
 
-	/*
-	 * allow smb_direct_max_outstanding_rw_ops of in-flight RDMA
-	 * read/writes. HCA guarantees at least max_send_sge of sges for
-	 * a RDMA read/write work request, and if memory registration is used,
-	 * we need reg_mr, local_inv wrs for each read/write.
+	/* Calculate the number of work requests for RDMA R/W.
+	 * The maximum number of pages which can be registered
+	 * with one Memory region can be transferred with one
+	 * R/W credit. And at least 4 work requests for each credit
+	 * are needed for MR registration, RDMA R/W, local & remote
+	 * MR invalidation.
 	 */
 	t->max_rdma_rw_size = smb_direct_max_read_write_size;
-	max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
-	max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES);
-	max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num,
-					max_pages) * 2;
-	max_rw_wrs *= smb_direct_max_outstanding_rw_ops;
+	t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t);
+	t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size,
+					 (t->pages_per_rw_credit - 1) *
+					 PAGE_SIZE);
+
+	max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge,
+			       device->attrs.max_sge_rd);
+	max_sge_per_wr = max_t(unsigned int, max_sge_per_wr,
+			       max_send_sges);
+	wrs_per_credit = max_t(unsigned int, 4,
+			       DIV_ROUND_UP(t->pages_per_rw_credit,
					    max_sge_per_wr) + 1);
+	max_rw_wrs = t->max_rw_credits * wrs_per_credit;
 
 	max_send_wrs = smb_direct_send_credit_target + max_rw_wrs;
 	if (max_send_wrs > device->attrs.max_cqe ||
 	    max_send_wrs > device->attrs.max_qp_wr) {
-		pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n",
-		       smb_direct_send_credit_target,
-		       smb_direct_max_outstanding_rw_ops);
+		pr_err("consider lowering send_credit_target = %d\n",
+		       smb_direct_send_credit_target);
 		pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
 		       device->attrs.max_cqe, device->attrs.max_qp_wr);
 		return -EINVAL;
@@ -1687,11 +1762,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
 		return -EINVAL;
 	}
 
-	if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) {
-		pr_err("warning: device max_send_sge = %d too small\n",
-		       device->attrs.max_send_sge);
-		return -EINVAL;
-	}
 	if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) {
 		pr_err("warning: device max_recv_sge = %d too small\n",
 		       device->attrs.max_recv_sge);
@@ -1707,7 +1777,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
 
 	t->send_credit_target = smb_direct_send_credit_target;
 	atomic_set(&t->send_credits, 0);
-	atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops);
+	atomic_set(&t->rw_credits, t->max_rw_credits);
 
 	t->max_send_size = smb_direct_max_send_size;
 	t->max_recv_size = smb_direct_max_receive_size;
@@ -1715,12 +1785,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
 
 	cap->max_send_wr = max_send_wrs;
 	cap->max_recv_wr = t->recv_credit_max;
-	cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES;
+	cap->max_send_sge = max_sge_per_wr;
 	cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
 	cap->max_inline_data = 0;
-	cap->max_rdma_ctxs =
-		rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) *
-		smb_direct_max_outstanding_rw_ops;
+	cap->max_rdma_ctxs = t->max_rw_credits;
 
 	return 0;
 }
@@ -1813,7 +1881,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
 	}
 
 	t->send_cq = ib_alloc_cq(t->cm_id->device, t,
-				 t->send_credit_target, 0, IB_POLL_WORKQUEUE);
+				 smb_direct_send_credit_target + cap->max_rdma_ctxs,
+				 0, IB_POLL_WORKQUEUE);
 	if (IS_ERR(t->send_cq)) {
 		pr_err("Can't create RDMA send CQ\n");
 		ret = PTR_ERR(t->send_cq);
@@ -1822,8 +1891,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
 	}
 
 	t->recv_cq = ib_alloc_cq(t->cm_id->device, t,
-				 cap->max_send_wr + cap->max_rdma_ctxs,
-				 0, IB_POLL_WORKQUEUE);
+				 t->recv_credit_max, 0, IB_POLL_WORKQUEUE);
 	if (IS_ERR(t->recv_cq)) {
 		pr_err("Can't create RDMA recv CQ\n");
 		ret = PTR_ERR(t->recv_cq);
@@ -1852,17 +1920,12 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
 
 	pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
 	if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) {
-		int pages_per_mr, mr_count;
-
-		pages_per_mr = min_t(int, pages_per_rw,
-				     t->cm_id->device->attrs.max_fast_reg_page_list_len);
-		mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) *
-			atomic_read(&t->rw_avail_ops);
-		ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count,
-				      IB_MR_TYPE_MEM_REG, pages_per_mr, 0);
+		ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs,
+				      t->max_rw_credits, IB_MR_TYPE_MEM_REG,
+				      t->pages_per_rw_credit, 0);
 		if (ret) {
 			pr_err("failed to init mr pool count %d pages %d\n",
-			       mr_count, pages_per_mr);
+			       t->max_rw_credits, t->pages_per_rw_credit);
			goto err;
		}
	}
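
A note on the page and credit arithmetic above: get_buf_page_count() counts the pages a possibly unaligned buffer touches, and the new calc_rw_credits() converts that page count into R/W credits, one credit per memory region of t->pages_per_rw_credit pages. The standalone userspace sketch below models that arithmetic; the 4096-byte page and the 256-page MR size are illustrative assumptions, not values read from a device.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Pages spanned by a buffer that may start mid-page (get_buf_page_count). */
static unsigned int buf_page_count(uintptr_t buf, unsigned int size)
{
	return DIV_ROUND_UP(buf + size, PAGE_SIZE) - buf / PAGE_SIZE;
}

/* One R/W credit covers one MR of pages_per_rw_credit pages (calc_rw_credits). */
static unsigned int rw_credits(unsigned int pages_per_rw_credit,
			       uintptr_t buf, unsigned int len)
{
	return DIV_ROUND_UP(buf_page_count(buf, len), pages_per_rw_credit);
}

int main(void)
{
	/* An 8KiB buffer starting mid-page touches 3 pages, not 2. */
	printf("%u\n", buf_page_count(0x1800, 8192));	/* 3 */
	printf("%u\n", rw_credits(256, 0x1800, 8192));	/* 1 */
	return 0;
}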
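wait_for_credits() now takes the whole credit count at once: it optimistically subtracts `needed`, and if the pool went negative it adds the credits back and sleeps until enough have been returned. Below is a rough userspace model of that pattern, with C11 atomics and a pthread condition variable standing in for the kernel's atomic_t and waitqueue; the disconnect check that the kernel folds into the same wait is omitted.

#include <pthread.h>
#include <stdatomic.h>

/* Assume this was initialized to the transport's max_rw_credits. */
static atomic_int rw_credits_pool;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;

static int wait_for_rw_credits(int needed)
{
	for (;;) {
		/* atomic_sub_return(needed, ...) >= 0: the claim succeeded. */
		if (atomic_fetch_sub(&rw_credits_pool, needed) - needed >= 0)
			return 0;
		atomic_fetch_add(&rw_credits_pool, needed);	/* roll back */

		pthread_mutex_lock(&lock);
		while (atomic_load(&rw_credits_pool) < needed)
			pthread_cond_wait(&waitq, &lock);
		pthread_mutex_unlock(&lock);
	}
}

/* Mirrors the atomic_add() + wake_up() done when a transfer finishes. */
static void put_rw_credits(int credits)
{
	atomic_fetch_add(&rw_credits_pool, credits);
	pthread_mutex_lock(&lock);
	pthread_cond_broadcast(&waitq);
	pthread_mutex_unlock(&lock);
}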
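smb_direct_rdma_xmit() now walks desc_len / sizeof(*desc) SMB2 Buffer Descriptor V1 entries and rejects zero-length descriptors as well as totals that overrun either the local buffer or t->max_rdma_rw_size. A simplified host-endian model of that validation loop (the kernel fields are __le64/__le32 and go through le64_to_cpu()/le32_to_cpu()):

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

/* Mirrors the layout of smb2_buffer_desc_v1: offset, token, length. */
struct desc_v1 {
	uint64_t offset;
	uint32_t token;
	uint32_t length;
};

static int check_descriptors(const struct desc_v1 *desc, size_t desc_num,
			     size_t buf_len, size_t max_rdma_rw_size)
{
	size_t total_length = 0;

	for (size_t i = 0; i < desc_num; i++) {
		total_length += desc[i].length;
		if (desc[i].length == 0 || total_length > buf_len ||
		    total_length > max_rdma_rw_size)
			return -EINVAL;
	}
	return 0;
}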
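The work-request chaining relies on rdma_rw_ctx_wrs() returning the head of the ctx's own WR chain with the WR passed in linked after it; walking the message list in reverse therefore leaves first_wr pointing at the chain for the first descriptor, and a single ib_post_send() submits everything in order. A minimal list model of that inversion:

#include <stdio.h>

struct wr {
	const char *name;
	struct wr *next;
};

/* Models rdma_rw_ctx_wrs(): put this ctx's WR in front of `chain`. */
static struct wr *ctx_wrs(struct wr *mine, struct wr *chain)
{
	mine->next = chain;
	return mine;
}

int main(void)
{
	struct wr a = { "desc[0]", NULL };
	struct wr b = { "desc[1]", NULL };
	struct wr c = { "desc[2]", NULL };
	struct wr *first_wr = NULL;

	/* Reverse walk, as in the list_for_each_entry_reverse() loop above. */
	first_wr = ctx_wrs(&c, first_wr);
	first_wr = ctx_wrs(&b, first_wr);
	first_wr = ctx_wrs(&a, first_wr);

	for (struct wr *w = first_wr; w; w = w->next)
		printf("%s\n", w->name);	/* desc[0] desc[1] desc[2] */
	return 0;
}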
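Finally, the queue-pair sizing in smb_direct_init_params() follows from the same model: a maximum-sized transfer may start mid-page, so each credit is assumed to carry only pages_per_rw_credit - 1 full pages of payload, and each credit needs at least four work requests (MR registration, the R/W itself, local and remote invalidation), or more if one WR cannot hold pages_per_rw_credit SGEs. A worked example with illustrative numbers (1MiB max R/W size, 256-page MRs, 30 SGEs per WR, send_credit_target 255; none of these are taken from a real device):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int page_size = 4096, max_rdma_rw_size = 1048576;
	unsigned int pages_per_rw_credit = 256;
	unsigned int max_sge_per_wr = 30, send_credit_target = 255;

	/* A page-unaligned transfer wastes one page per credit. */
	unsigned int max_rw_credits =
		DIV_ROUND_UP(max_rdma_rw_size,
			     (pages_per_rw_credit - 1) * page_size);

	/* At least 4 WRs per credit: MR reg, R/W, local + remote inval. */
	unsigned int wrs_per_credit = 4;
	unsigned int by_sges =
		DIV_ROUND_UP(pages_per_rw_credit, max_sge_per_wr) + 1;
	if (by_sges > wrs_per_credit)
		wrs_per_credit = by_sges;

	unsigned int max_send_wrs =
		send_credit_target + max_rw_credits * wrs_per_credit;

	/* Prints: rw credits 2, wrs/credit 10, send wrs 275. */
	printf("rw credits %u, wrs/credit %u, send wrs %u\n",
	       max_rw_credits, wrs_per_credit, max_send_wrs);
	return 0;
}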