From e73a625bc24880f1fe5abaa89bb63e0918fbd66c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Sep 2022 09:19:59 -0600 Subject: block: kill deprecated BUG_ON() in the flush handling We've never had any useful reports from this BUG_ON(), and in fact a number of the BUG_ON()'s in the flush handling need to be turned into more graceful handling. In preparation for allowing batched completions of the end_io handling, where we can enter the flush completion with queuelist having been reused for the batch, get rid of this BUG_ON(). Signed-off-by: Jens Axboe --- block/blk-flush.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index d20a0c6b2c66..27705fc584a0 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -205,7 +205,6 @@ static void blk_flush_complete_seq(struct request *rq, * flush data request completion path. Restore @rq for * normal completion and end it. */ - BUG_ON(!list_empty(&rq->queuelist)); list_del_init(&rq->flush.list); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); -- cgit v1.2.3 From 4b6a5d9cea911424e84107df8c4eb8317938d2cd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 08:22:09 -0600 Subject: block: enable batched allocation for blk_mq_alloc_request() The filesystem IO path can take advantage of allocating batches of requests, if the underlying submitter tells the block layer about it through the blk_plug. For passthrough IO, the exported API is the blk_mq_alloc_request() helper, and that one does not allow for request caching. Wire up request caching for blk_mq_alloc_request(), which is generally done without having a bio available upfront. Tested-by: Anuj Gupta Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 9 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 83492d942348..b32f70f38c6e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -510,25 +510,87 @@ retry: alloc_time_ns); } -struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, - blk_mq_req_flags_t flags) +static struct request *blk_mq_rq_cache_fill(struct request_queue *q, + struct blk_plug *plug, + blk_opf_t opf, + blk_mq_req_flags_t flags) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .cmd_flags = opf, - .nr_tags = 1, + .nr_tags = plug->nr_ios, + .cached_rq = &plug->cached_rq, }; struct request *rq; - int ret; - ret = blk_queue_enter(q, flags); - if (ret) - return ERR_PTR(ret); + if (blk_queue_enter(q, flags)) + return NULL; + + plug->nr_ios = 1; rq = __blk_mq_alloc_requests(&data); - if (!rq) - goto out_queue_exit; + if (unlikely(!rq)) + blk_queue_exit(q); + return rq; +} + +static struct request *blk_mq_alloc_cached_request(struct request_queue *q, + blk_opf_t opf, + blk_mq_req_flags_t flags) +{ + struct blk_plug *plug = current->plug; + struct request *rq; + + if (!plug) + return NULL; + if (rq_list_empty(plug->cached_rq)) { + if (plug->nr_ios == 1) + return NULL; + rq = blk_mq_rq_cache_fill(q, plug, opf, flags); + if (rq) + goto got_it; + return NULL; + } + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; + + if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) + return NULL; + + plug->cached_rq = rq_list_next(rq); +got_it: + rq->cmd_flags = opf; + INIT_LIST_HEAD(&rq->queuelist); + return rq; +} + +struct request *blk_mq_alloc_request(struct request_queue *q, 
blk_opf_t opf, + blk_mq_req_flags_t flags) +{ + struct request *rq; + + rq = blk_mq_alloc_cached_request(q, opf, flags); + if (!rq) { + struct blk_mq_alloc_data data = { + .q = q, + .flags = flags, + .cmd_flags = opf, + .nr_tags = 1, + }; + int ret; + + ret = blk_queue_enter(q, flags); + if (ret) + return ERR_PTR(ret); + + rq = __blk_mq_alloc_requests(&data); + if (!rq) + goto out_queue_exit; + } rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; -- cgit v1.2.3 From de671d6116b5210097cd6fbb877bac92536f265b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 15:19:54 -0600 Subject: block: change request end_io handler to pass back a return value Everything is just converted to returning RQ_END_IO_NONE, and there should be no functional changes with this patch. In preparation for allowing the end_io handler to pass ownership back to the block layer, rather than retain ownership of the request. Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-flush.c | 10 +++++++--- block/blk-mq.c | 14 +++++++++----- drivers/md/dm-rq.c | 4 +++- drivers/nvme/host/core.c | 6 ++++-- drivers/nvme/host/ioctl.c | 5 ++++- drivers/nvme/host/pci.c | 12 ++++++++---- drivers/nvme/target/passthru.c | 5 +++-- drivers/scsi/scsi_error.c | 4 +++- drivers/scsi/sg.c | 9 +++++---- drivers/scsi/st.c | 4 +++- drivers/target/target_core_pscsi.c | 6 ++++-- drivers/ufs/core/ufshpb.c | 8 ++++++-- include/linux/blk-mq.h | 7 ++++++- 13 files changed, 65 insertions(+), 29 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 27705fc584a0..53202eff545e 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -217,7 +217,8 @@ static void blk_flush_complete_seq(struct request *rq, blk_kick_flush(q, fq, cmd_flags); } -static void flush_end_io(struct request *flush_rq, blk_status_t error) +static enum rq_end_io_ret flush_end_io(struct request *flush_rq, + blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; @@ -231,7 +232,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); - return; + return RQ_END_IO_NONE; } blk_account_io_flush(flush_rq); @@ -268,6 +269,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) } spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + return RQ_END_IO_NONE; } bool is_flush_rq(struct request *rq) @@ -353,7 +355,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_flush_queue_rq(flush_rq, false); } -static void mq_flush_data_end_io(struct request *rq, blk_status_t error) +static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, + blk_status_t error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; @@ -375,6 +378,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) spin_unlock_irqrestore(&fq->mq_flush_lock, flags); blk_mq_sched_restart(hctx); + return RQ_END_IO_NONE; } /** diff --git a/block/blk-mq.c b/block/blk-mq.c index b32f70f38c6e..a21631de45b3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1001,7 +1001,8 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) if (rq->end_io) { rq_qos_done(rq->q, rq); - rq->end_io(rq, error); + if (rq->end_io(rq, error) == RQ_END_IO_FREE) + blk_mq_free_request(rq); } else { blk_mq_free_request(rq); } @@ -1295,12 +1296,13 @@ struct blk_rq_wait { blk_status_t ret; }; -static void 
blk_end_sync_rq(struct request *rq, blk_status_t ret) +static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) { struct blk_rq_wait *wait = rq->end_io_data; wait->ret = ret; complete(&wait->done); + return RQ_END_IO_NONE; } bool blk_rq_is_poll(struct request *rq) @@ -1534,10 +1536,12 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) void blk_mq_put_rq_ref(struct request *rq) { - if (is_flush_rq(rq)) - rq->end_io(rq, 0); - else if (req_ref_put_and_test(rq)) + if (is_flush_rq(rq)) { + if (rq->end_io(rq, 0) == RQ_END_IO_FREE) + blk_mq_free_request(rq); + } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); + } } static bool blk_mq_check_expired(struct request *rq, void *priv) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 4f49bbcce4f1..3001b10a3fbf 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -292,11 +292,13 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) dm_complete_request(rq, error); } -static void end_clone_request(struct request *clone, blk_status_t error) +static enum rq_end_io_ret end_clone_request(struct request *clone, + blk_status_t error) { struct dm_rq_target_io *tio = clone->end_io_data; dm_complete_request(tio->orig, error); + return RQ_END_IO_NONE; } static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0f05b61a30fe..965a4c3e9d44 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1172,7 +1172,8 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2); } -static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) +static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, + blk_status_t status) { struct nvme_ctrl *ctrl = rq->end_io_data; unsigned long flags; @@ -1184,7 +1185,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) dev_err(ctrl->device, "failed nvme_keep_alive_end_io error=%d\n", status); - return; + return RQ_END_IO_NONE; } ctrl->comp_seen = false; @@ -1195,6 +1196,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) spin_unlock_irqrestore(&ctrl->lock, flags); if (startka) nvme_queue_keep_alive_work(ctrl); + return RQ_END_IO_NONE; } static void nvme_keep_alive_work(struct work_struct *work) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 357791ff0623..2995789d5f9d 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -392,7 +392,8 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd) io_uring_cmd_done(ioucmd, status, result); } -static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err) +static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, + blk_status_t err) { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); @@ -411,6 +412,8 @@ static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err) nvme_uring_task_cb(ioucmd); else io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); + + return RQ_END_IO_NONE; } static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f9af99b7e672..7bbffd2a9beb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1268,7 +1268,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return 
adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static void abort_endio(struct request *req, blk_status_t error) +static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; @@ -1276,6 +1276,7 @@ static void abort_endio(struct request *req, blk_status_t error) "Abort status: 0x%x", nvme_req(req)->status); atomic_inc(&nvmeq->dev->ctrl.abort_limit); blk_mq_free_request(req); + return RQ_END_IO_NONE; } static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) @@ -2447,22 +2448,25 @@ out_unlock: return result; } -static void nvme_del_queue_end(struct request *req, blk_status_t error) +static enum rq_end_io_ret nvme_del_queue_end(struct request *req, + blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; blk_mq_free_request(req); complete(&nvmeq->delete_done); + return RQ_END_IO_NONE; } -static void nvme_del_cq_end(struct request *req, blk_status_t error) +static enum rq_end_io_ret nvme_del_cq_end(struct request *req, + blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; if (error) set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); - nvme_del_queue_end(req, error); + return nvme_del_queue_end(req, error); } static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 94d3153bae54..79af5140af8b 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -245,14 +245,15 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) nvme_passthru_end(ctrl, effects, req->cmd, status); } -static void nvmet_passthru_req_done(struct request *rq, - blk_status_t blk_status) +static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq, + blk_status_t blk_status) { struct nvmet_req *req = rq->end_io_data; req->cqe->result = nvme_req(rq)->result; nvmet_req_complete(req, nvme_req(rq)->status); blk_mq_free_request(rq); + return RQ_END_IO_NONE; } static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 448748e3fba5..786fb963cf3f 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2004,9 +2004,11 @@ maybe_retry: } } -static void eh_lock_door_done(struct request *req, blk_status_t status) +static enum rq_end_io_ret eh_lock_door_done(struct request *req, + blk_status_t status) { blk_mq_free_request(req); + return RQ_END_IO_NONE; } /** diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 340b050ad28d..94c5e9a9309c 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -177,7 +177,7 @@ typedef struct sg_device { /* holds the state of each scsi generic device */ } Sg_device; /* tasklet or soft irq callback */ -static void sg_rq_end_io(struct request *rq, blk_status_t status); +static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status); static int sg_start_req(Sg_request *srp, unsigned char *cmd); static int sg_finish_rem_req(Sg_request * srp); static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size); @@ -1311,7 +1311,7 @@ sg_rq_end_io_usercontext(struct work_struct *work) * This function is a "bottom half" handler that is called by the mid * level when a command is completed (or has failed). 
*/ -static void +static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); @@ -1324,11 +1324,11 @@ sg_rq_end_io(struct request *rq, blk_status_t status) int result, resid, done = 1; if (WARN_ON(srp->done != 0)) - return; + return RQ_END_IO_NONE; sfp = srp->parentfp; if (WARN_ON(sfp == NULL)) - return; + return RQ_END_IO_NONE; sdp = sfp->parentdp; if (unlikely(atomic_read(&sdp->detaching))) @@ -1406,6 +1406,7 @@ sg_rq_end_io(struct request *rq, blk_status_t status) INIT_WORK(&srp->ew.work, sg_rq_end_io_usercontext); schedule_work(&srp->ew.work); } + return RQ_END_IO_NONE; } static const struct file_operations sg_fops = { diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 850172a2b8f1..55e7c07ebe4c 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -512,7 +512,8 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req) atomic64_dec(&STp->stats->in_flight); } -static void st_scsi_execute_end(struct request *req, blk_status_t status) +static enum rq_end_io_ret st_scsi_execute_end(struct request *req, + blk_status_t status) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); struct st_request *SRpnt = req->end_io_data; @@ -532,6 +533,7 @@ static void st_scsi_execute_end(struct request *req, blk_status_t status) blk_rq_unmap_user(tmp); blk_mq_free_request(req); + return RQ_END_IO_NONE; } static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index e6a967ddc08c..8a7306e5e133 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -39,7 +39,7 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev) } static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd); -static void pscsi_req_done(struct request *, blk_status_t); +static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t); /* pscsi_attach_hba(): * @@ -1002,7 +1002,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev) return 0; } -static void pscsi_req_done(struct request *req, blk_status_t status) +static enum rq_end_io_ret pscsi_req_done(struct request *req, + blk_status_t status) { struct se_cmd *cmd = req->end_io_data; struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); @@ -1029,6 +1030,7 @@ static void pscsi_req_done(struct request *req, blk_status_t status) } blk_mq_free_request(req); + return RQ_END_IO_NONE; } static const struct target_backend_ops pscsi_ops = { diff --git a/drivers/ufs/core/ufshpb.c b/drivers/ufs/core/ufshpb.c index a1a7a1175a5a..3d69a81c5b17 100644 --- a/drivers/ufs/core/ufshpb.c +++ b/drivers/ufs/core/ufshpb.c @@ -613,14 +613,17 @@ static void ufshpb_activate_subregion(struct ufshpb_lu *hpb, srgn->srgn_state = HPB_SRGN_VALID; } -static void ufshpb_umap_req_compl_fn(struct request *req, blk_status_t error) +static enum rq_end_io_ret ufshpb_umap_req_compl_fn(struct request *req, + blk_status_t error) { struct ufshpb_req *umap_req = (struct ufshpb_req *)req->end_io_data; ufshpb_put_req(umap_req->hpb, umap_req); + return RQ_END_IO_NONE; } -static void ufshpb_map_req_compl_fn(struct request *req, blk_status_t error) +static enum rq_end_io_ret ufshpb_map_req_compl_fn(struct request *req, + blk_status_t error) { struct ufshpb_req *map_req = (struct ufshpb_req *) req->end_io_data; struct ufshpb_lu *hpb = map_req->hpb; @@ -636,6 +639,7 @@ static void ufshpb_map_req_compl_fn(struct request *req, blk_status_t error) 
spin_unlock_irqrestore(&hpb->rgn_state_lock, flags); ufshpb_put_map_req(map_req->hpb, map_req); + return RQ_END_IO_NONE; } static void ufshpb_set_unmap_cmd(unsigned char *cdb, struct ufshpb_region *rgn) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 00a15808c137..e6fa49dd6196 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -14,7 +14,12 @@ struct blk_flush_queue; #define BLKDEV_MIN_RQ 4 #define BLKDEV_DEFAULT_RQ 128 -typedef void (rq_end_io_fn)(struct request *, blk_status_t); +enum rq_end_io_ret { + RQ_END_IO_NONE, + RQ_END_IO_FREE, +}; + +typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); /* * request flags */ -- cgit v1.2.3 From ab3e1d3bbab9e973aeb4dd4603251578658a47ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 08:24:16 -0600 Subject: block: allow end_io based requests in the completion batch handling With end_io handlers now being able to potentially pass ownership of the request upon completion, we can allow requests with end_io handlers in the batch completion handling. Reviewed-by: Anuj Gupta Reviewed-by: Keith Busch Co-developed-by: Stefan Roesch Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 +++++++++++-- include/linux/blk-mq.h | 3 ++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a21631de45b3..8070b6c10e8d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -823,8 +823,10 @@ static void blk_complete_request(struct request *req) * can find how many bytes remain in the request * later. */ - req->bio = NULL; - req->__data_len = 0; + if (!req->end_io) { + req->bio = NULL; + req->__data_len = 0; + } } /** @@ -1055,6 +1057,13 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) rq_qos_done(rq->q, rq); + /* + * If end_io handler returns NONE, then it still has + * ownership of the request. + */ + if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) + continue; + WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (!req_ref_put_and_test(rq)) continue; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e6fa49dd6196..50811d0fb143 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -853,8 +853,9 @@ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, int ioerror, void (*complete)(struct io_comp_batch *)) { - if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror) + if (!iob || (req->rq_flags & RQF_ELV) || ioerror) return false; + if (!iob->complete) iob->complete = complete; else if (iob->complete != complete) -- cgit v1.2.3 From c0a7ba77e81b8440d10f38559a5e1d219ff7e87c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2022 08:26:26 -0600 Subject: nvme: split out metadata vs non metadata end_io uring_cmd completions By splitting up the metadata and non-metadata end_io handling, we can remove any request dependencies on the normal non-metadata IO path. This is in preparation for enabling the normal IO passthrough path to pass the ownership of the request back to the block layer. 
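Concretely, the ownership handoff works like this: an end_io handler that returns RQ_END_IO_FREE gives the request back to the block layer, which frees it (and can now do so as part of a completion batch), while RQ_END_IO_NONE keeps ownership with the handler as before. A minimal sketch of a driver-side handler under the new contract, with placeholder names for everything except the block-layer types:

    static enum rq_end_io_ret my_drv_end_io(struct request *req,
                                            blk_status_t err)
    {
            struct my_drv_cmd *cmd = req->end_io_data;   /* hypothetical */

            my_drv_complete(cmd, err);                   /* hypothetical */

            /* hand the request back; the block layer frees it */
            return RQ_END_IO_FREE;
    }

Handlers that still free or reuse the request themselves (as the flush machinery does) simply keep returning RQ_END_IO_NONE.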
Reviewed-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Sagi Grimberg Reviewed-by: Keith Busch Co-developed-by: Stefan Roesch Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 79 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 2995789d5f9d..f9d1f7e4d6d1 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -356,9 +356,15 @@ struct nvme_uring_cmd_pdu { struct bio *bio; struct request *req; }; - void *meta; /* kernel-resident buffer */ - void __user *meta_buffer; u32 meta_len; + u32 nvme_status; + union { + struct { + void *meta; /* kernel-resident buffer */ + void __user *meta_buffer; + }; + u64 result; + } u; }; static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( @@ -367,11 +373,10 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; } -static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd) +static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd) { struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); struct request *req = pdu->req; - struct bio *bio = req->bio; int status; u64 result; @@ -382,27 +387,39 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd) result = le64_to_cpu(nvme_req(req)->result.u64); - if (pdu->meta) - status = nvme_finish_user_metadata(req, pdu->meta_buffer, - pdu->meta, pdu->meta_len, status); - if (bio) - blk_rq_unmap_user(bio); + if (pdu->meta_len) + status = nvme_finish_user_metadata(req, pdu->u.meta_buffer, + pdu->u.meta, pdu->meta_len, status); + if (req->bio) + blk_rq_unmap_user(req->bio); blk_mq_free_request(req); io_uring_cmd_done(ioucmd, status, result); } +static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd) +{ + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + + if (pdu->bio) + blk_rq_unmap_user(pdu->bio); + + io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result); +} + static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, blk_status_t err) { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - /* extract bio before reusing the same field for request */ - struct bio *bio = pdu->bio; void *cookie = READ_ONCE(ioucmd->cookie); - pdu->req = req; - req->bio = bio; + req->bio = pdu->bio; + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + pdu->nvme_status = -EINTR; + else + pdu->nvme_status = nvme_req(req)->status; + pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64); /* * For iopoll, complete it directly. @@ -413,6 +430,29 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, else io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); + blk_mq_free_request(req); + return RQ_END_IO_NONE; +} + +static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, + blk_status_t err) +{ + struct io_uring_cmd *ioucmd = req->end_io_data; + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + void *cookie = READ_ONCE(ioucmd->cookie); + + req->bio = pdu->bio; + pdu->req = req; + + /* + * For iopoll, complete it directly. + * Otherwise, move the completion to task work. 
+ */ + if (cookie != NULL && blk_rq_is_poll(req)) + nvme_uring_task_meta_cb(ioucmd); + else + io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb); + return RQ_END_IO_NONE; } @@ -474,8 +514,6 @@ retry: blk_flags); if (IS_ERR(req)) return PTR_ERR(req); - req->end_io = nvme_uring_cmd_end_io; - req->end_io_data = ioucmd; if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) { if (unlikely(!req->bio)) { @@ -490,10 +528,15 @@ retry: } /* to free bio on completion, as req->bio will be null at that time */ pdu->bio = req->bio; - pdu->meta = meta; - pdu->meta_buffer = nvme_to_user_ptr(d.metadata); pdu->meta_len = d.metadata_len; - + req->end_io_data = ioucmd; + if (pdu->meta_len) { + pdu->u.meta = meta; + pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata); + req->end_io = nvme_uring_cmd_end_io_meta; + } else { + req->end_io = nvme_uring_cmd_end_io; + } blk_execute_rq_nowait(req, false); return -EIOCBQUEUED; } -- cgit v1.2.3 From 851eb780decb7180bcf09fad0035cba9aae669df Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Sep 2022 11:41:51 -0600 Subject: nvme: enable batched completions of passthrough IO Now that the normal passthrough end_io path doesn't need the request anymore, we can kill the explicit blk_mq_free_request() and just pass back RQ_END_IO_FREE instead. This enables the batched completion from freeing batches of requests at the time. This brings passthrough IO performance at least on par with bdev based O_DIRECT with io_uring. With this and batche allocations, peak performance goes from 110M IOPS to 122M IOPS. For IRQ based, passthrough is now also about 10% faster than previously, going from ~61M to ~67M IOPS. Reviewed-by: Anuj Gupta Reviewed-by: Sagi Grimberg Reviewed-by: Keith Busch Co-developed-by: Stefan Roesch Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index f9d1f7e4d6d1..914b142b6f2b 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -430,8 +430,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, else io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); - blk_mq_free_request(req); - return RQ_END_IO_NONE; + return RQ_END_IO_FREE; } static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, -- cgit v1.2.3 From a9216fac3ed8819cbbda5d39dd5fcaa43dfd35d8 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:38 +0530 Subject: io_uring: add io_uring_cmd_import_fixed This is a new helper that callers can use to obtain a bvec iterator for the previously mapped buffer. This is preparatory work to enable fixed-buffer support for io_uring_cmd. 
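As a rough usage sketch (not taken from a real driver), a ->uring_cmd handler can turn the pre-registered buffer into a bvec iterator with this helper and, combined with the blk_rq_map_user_iov() extension later in this series, map it straight onto a passthrough request; all names except the two helpers are placeholders:

    static int my_drv_map_fixed_buf(struct io_uring_cmd *ioucmd,
                                    struct request *req,
                                    u64 ubuf, unsigned long len)
    {
            struct iov_iter iter;
            int ret;

            ret = io_uring_cmd_import_fixed(ubuf, len, rq_data_dir(req),
                                            &iter, ioucmd);
            if (ret < 0)
                    return ret;

            /* iter is now an ITER_BVEC over the registered pages */
            return blk_rq_map_user_iov(req->q, req, NULL, &iter, GFP_KERNEL);
    }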
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220930062749.152261-2-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 8 ++++++++ io_uring/uring_cmd.c | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 58676c0a398f..1dbf51115c30 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -4,6 +4,7 @@ #include #include +#include enum io_uring_cmd_flags { IO_URING_F_COMPLETE_DEFER = 1, @@ -32,6 +33,8 @@ struct io_uring_cmd { }; #if defined(CONFIG_IO_URING) +int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd); void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2); void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *)); @@ -59,6 +62,11 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else +static int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd) +{ + return -EOPNOTSUPP; +} static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t ret2) { diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index f3ed61e9bd0f..6a6d69523d75 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -8,6 +8,7 @@ #include #include "io_uring.h" +#include "rsrc.h" #include "uring_cmd.h" static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) @@ -129,3 +130,12 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return IOU_ISSUE_SKIP_COMPLETE; } + +int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + return io_import_fixed(rw, iter, req->imu, ubuf, len); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); -- cgit v1.2.3 From 9cda70f622cdcf049521a9c2886e5fd8a90a0591 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:39 +0530 Subject: io_uring: introduce fixed buffer support for io_uring_cmd Add IORING_URING_CMD_FIXED flag that is to be used for sending io_uring command with previously registered buffers. User-space passes the buffer index in sqe->buf_index, same as done in read/write variants that uses fixed buffers. 
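From user space the flag is set on the SQE together with the index of a previously registered buffer. A minimal sketch using liburing, assuming an initialized ring, an open device fd, and a driver-defined command opcode (DRV_CMD_OP and the surrounding variables are placeholders):

    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct io_uring_sqe *sqe;

    io_uring_register_buffers(&ring, &iov, 1);        /* becomes buf_index 0 */

    sqe = io_uring_get_sqe(&ring);
    memset(sqe, 0, sizeof(*sqe));
    sqe->opcode = IORING_OP_URING_CMD;
    sqe->fd = dev_fd;
    sqe->cmd_op = DRV_CMD_OP;                         /* driver-defined */
    sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;    /* use registered buffer */
    sqe->buf_index = 0;                               /* which registered buffer */
    io_uring_submit(&ring);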
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220930062749.152261-3-anuj20.g@samsung.com [axboe: shuffle valid flags check before acting on it] Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 2 +- include/uapi/linux/io_uring.h | 9 +++++++++ io_uring/uring_cmd.c | 19 ++++++++++++++++++- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 1dbf51115c30..e10c5cc81082 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -28,7 +28,7 @@ struct io_uring_cmd { void *cookie; }; u32 cmd_op; - u32 pad; + u32 flags; u8 pdu[32]; /* available inline for free use */ }; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 92f29d9505a6..ab7458033ee3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -56,6 +56,7 @@ struct io_uring_sqe { __u32 hardlink_flags; __u32 xattr_flags; __u32 msg_ring_flags; + __u32 uring_cmd_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -219,6 +220,14 @@ enum io_uring_op { IORING_OP_LAST, }; +/* + * sqe->uring_cmd_flags + * IORING_URING_CMD_FIXED use registered buffer; pass thig flag + * along with setting sqe->buf_index. + */ +#define IORING_URING_CMD_FIXED (1U << 0) + + /* * sqe->fsync_flags */ diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 6a6d69523d75..e50de0b6b9f8 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -77,8 +78,24 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - if (sqe->rw_flags || sqe->__pad1) + if (sqe->__pad1) return -EINVAL; + + ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags); + if (ioucmd->flags & ~IORING_URING_CMD_FIXED) + return -EINVAL; + + if (ioucmd->flags & IORING_URING_CMD_FIXED) { + struct io_ring_ctx *ctx = req->ctx; + u16 index; + + req->buf_index = READ_ONCE(sqe->buf_index); + if (unlikely(req->buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); + req->imu = ctx->user_bufs[index]; + io_req_set_rsrc_node(req, ctx, 0); + } ioucmd->cmd = sqe->cmd; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); return 0; -- cgit v1.2.3 From 557654025df5706785d395558244890dc4b2c875 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:40 +0530 Subject: block: add blk_rq_map_user_io Create a helper blk_rq_map_user_io for mapping of vectored as well as non-vectored requests. This will help in saving dupilcation of code at few places in scsi and nvme. 
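The helper covers both calling conventions it replaces; a sketch, where everything except blk_rq_map_user_io() is caller context. For a flat buffer, pass vec = false; for a user iovec array, pass vec = true and iov_count, in which case buf_len caps the total bytes (the SG_IO "shorter of the two wins" rule) and check_iter_count rejects an iterator that truncates to nothing:

    /* non-vectored: map buf_len bytes at ubuf */
    ret = blk_rq_map_user_io(req, NULL, ubuf, buf_len, GFP_KERNEL,
                             false, 0, false, rq_data_dir(req));

    /* vectored: uvec points at iov_count struct iovec entries */
    ret = blk_rq_map_user_io(req, NULL, uvec, buf_len, GFP_KERNEL,
                             true, iov_count, true, rq_data_dir(req));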
Signed-off-by: Anuj Gupta Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-4-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-map.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/blk-mq.h | 2 ++ 2 files changed, 38 insertions(+) diff --git a/block/blk-map.c b/block/blk-map.c index 7693f8e3c454..0e37bbedd46c 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -611,6 +611,42 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_user); +int blk_rq_map_user_io(struct request *req, struct rq_map_data *map_data, + void __user *ubuf, unsigned long buf_len, gfp_t gfp_mask, + bool vec, int iov_count, bool check_iter_count, int rw) +{ + int ret = 0; + + if (vec) { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov = fast_iov; + struct iov_iter iter; + + ret = import_iovec(rw, ubuf, iov_count ? iov_count : buf_len, + UIO_FASTIOV, &iov, &iter); + if (ret < 0) + return ret; + + if (iov_count) { + /* SG_IO howto says that the shorter of the two wins */ + iov_iter_truncate(&iter, buf_len); + if (check_iter_count && !iov_iter_count(&iter)) { + kfree(iov); + return -EINVAL; + } + } + + ret = blk_rq_map_user_iov(req->q, req, map_data, &iter, + gfp_mask); + kfree(iov); + } else if (buf_len) { + ret = blk_rq_map_user(req->q, req, map_data, ubuf, buf_len, + gfp_mask); + } + return ret; +} +EXPORT_SYMBOL(blk_rq_map_user_io); + /** * blk_rq_unmap_user - unmap a request with user data * @bio: start of bio list diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50811d0fb143..ba18e9bdb799 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -985,6 +985,8 @@ struct rq_map_data { int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); +int blk_rq_map_user_io(struct request *, struct rq_map_data *, + void __user *, unsigned long, gfp_t, bool, int, bool, int); int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); int blk_rq_unmap_user(struct bio *); -- cgit v1.2.3 From 6732932c836a4313f471b92b4d90761f31d3fa81 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:41 +0530 Subject: scsi: Use blk_rq_map_user_io helper Use the new blk_rq_map_user_io helper instead of duplicating code at various places. Additionally this also takes advantage of the on-stack iov fast path. 
Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-5-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/scsi/scsi_ioctl.c | 22 +++------------------- drivers/scsi/sg.c | 22 ++-------------------- 2 files changed, 5 insertions(+), 39 deletions(-) diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index 729e309e6034..2d20da55fb64 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -449,25 +449,9 @@ static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, fmode_t mode) if (ret < 0) goto out_put_request; - ret = 0; - if (hdr->iovec_count && hdr->dxfer_len) { - struct iov_iter i; - struct iovec *iov = NULL; - - ret = import_iovec(rq_data_dir(rq), hdr->dxferp, - hdr->iovec_count, 0, &iov, &i); - if (ret < 0) - goto out_put_request; - - /* SG_IO howto says that the shorter of the two wins */ - iov_iter_truncate(&i, hdr->dxfer_len); - - ret = blk_rq_map_user_iov(rq->q, rq, NULL, &i, GFP_KERNEL); - kfree(iov); - } else if (hdr->dxfer_len) - ret = blk_rq_map_user(rq->q, rq, NULL, hdr->dxferp, - hdr->dxfer_len, GFP_KERNEL); - + ret = blk_rq_map_user_io(rq, NULL, hdr->dxferp, hdr->dxfer_len, + GFP_KERNEL, hdr->iovec_count && hdr->dxfer_len, + hdr->iovec_count, 0, rq_data_dir(rq)); if (ret) goto out_put_request; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 94c5e9a9309c..ce34a8ad53b4 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1804,26 +1804,8 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) md->from_user = 0; } - if (iov_count) { - struct iovec *iov = NULL; - struct iov_iter i; - - res = import_iovec(rw, hp->dxferp, iov_count, 0, &iov, &i); - if (res < 0) - return res; - - iov_iter_truncate(&i, hp->dxfer_len); - if (!iov_iter_count(&i)) { - kfree(iov); - return -EINVAL; - } - - res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC); - kfree(iov); - } else - res = blk_rq_map_user(q, rq, md, hp->dxferp, - hp->dxfer_len, GFP_ATOMIC); - + res = blk_rq_map_user_io(rq, md, hp->dxferp, hp->dxfer_len, + GFP_ATOMIC, iov_count, iov_count, 1, rw); if (!res) { srp->bio = rq->bio; -- cgit v1.2.3 From 7f05635764390d5f811971af9f4c89b794032c80 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:42 +0530 Subject: nvme: Use blk_rq_map_user_io helper User blk_rq_map_user_io instead of duplicating the same code at different places Signed-off-by: Anuj Gupta Link: https://lore.kernel.org/r/20220930062749.152261-6-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 914b142b6f2b..3746a02a88ef 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -88,22 +88,8 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, nvme_req(req)->flags |= NVME_REQ_USERCMD; if (ubuffer && bufflen) { - if (!vec) - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, - GFP_KERNEL); - else { - struct iovec fast_iov[UIO_FASTIOV]; - struct iovec *iov = fast_iov; - struct iov_iter iter; - - ret = import_iovec(rq_data_dir(req), ubuffer, bufflen, - UIO_FASTIOV, &iov, &iter); - if (ret < 0) - goto out; - ret = blk_rq_map_user_iov(q, req, NULL, &iter, - GFP_KERNEL); - kfree(iov); - } + ret = blk_rq_map_user_io(req, NULL, ubuffer, bufflen, + GFP_KERNEL, vec, 0, 0, rq_data_dir(req)); if (ret) goto out; bio = req->bio; -- cgit v1.2.3 From 38c0ddab7b93daa90c046d0b9064a34fb0e586e5 Mon Sep 17 
00:00:00 2001 From: Kanchan Joshi Date: Fri, 30 Sep 2022 11:57:43 +0530 Subject: nvme: refactor nvme_add_user_metadata Pass struct request rather than bio. It helps to kill a parameter, and some processing clean-up too. Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-7-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 3746a02a88ef..bcaa6b3f97ca 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -20,19 +20,20 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval) return (void __user *)ptrval; } -static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, - unsigned len, u32 seed, bool write) +static void *nvme_add_user_metadata(struct request *req, void __user *ubuf, + unsigned len, u32 seed) { struct bio_integrity_payload *bip; int ret = -ENOMEM; void *buf; + struct bio *bio = req->bio; buf = kmalloc(len, GFP_KERNEL); if (!buf) goto out; ret = -EFAULT; - if (write && copy_from_user(buf, ubuf, len)) + if ((req_op(req) == REQ_OP_DRV_OUT) && copy_from_user(buf, ubuf, len)) goto out_free_meta; bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); @@ -45,9 +46,13 @@ static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, bip->bip_iter.bi_sector = seed; ret = bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)); - if (ret == len) - return buf; - ret = -ENOMEM; + if (ret != len) { + ret = -ENOMEM; + goto out_free_meta; + } + + req->cmd_flags |= REQ_INTEGRITY; + return buf; out_free_meta: kfree(buf); out: @@ -70,7 +75,6 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, u32 meta_seed, void **metap, unsigned timeout, bool vec, blk_opf_t rq_flags, blk_mq_req_flags_t blk_flags) { - bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; struct block_device *bdev = ns ? ns->disk->part0 : NULL; struct request *req; @@ -96,13 +100,12 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, if (bdev) bio_set_dev(bio, bdev); if (bdev && meta_buffer && meta_len) { - meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, - meta_seed, write); + meta = nvme_add_user_metadata(req, meta_buffer, + meta_len, meta_seed); if (IS_ERR(meta)) { ret = PTR_ERR(meta); goto out_unmap; } - req->cmd_flags |= REQ_INTEGRITY; *metap = meta; } } -- cgit v1.2.3 From 470e900c8036ff1aafeb5f06f3cb7a375a081399 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 30 Sep 2022 11:57:44 +0530 Subject: nvme: refactor nvme_alloc_request nvme_alloc_request expects a large number of parameters. Split this out into two functions to reduce number of parameters. First one retains the name nvme_alloc_request, while second one is named nvme_map_user_request. 
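After the split, a submission path follows a two-step pattern, condensed here from the diff below with error unwinding and metadata handling trimmed:

    req = nvme_alloc_user_request(q, cmd, 0, 0);
    if (IS_ERR(req))
            return PTR_ERR(req);
    req->timeout = timeout;

    if (ubuffer && bufflen) {
            ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
                                        meta_len, meta_seed, &meta, vec);
            if (ret)
                    return ret;
    }
    /* then execute via blk_execute_rq() or blk_execute_rq_nowait() */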
Signed-off-by: Kanchan Joshi Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-8-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 90 ++++++++++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 37 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index bcaa6b3f97ca..3f1e7af19716 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -70,54 +70,57 @@ static int nvme_finish_user_metadata(struct request *req, void __user *ubuf, } static struct request *nvme_alloc_user_request(struct request_queue *q, - struct nvme_command *cmd, void __user *ubuffer, - unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, void **metap, unsigned timeout, bool vec, - blk_opf_t rq_flags, blk_mq_req_flags_t blk_flags) + struct nvme_command *cmd, blk_opf_t rq_flags, + blk_mq_req_flags_t blk_flags) { - struct nvme_ns *ns = q->queuedata; - struct block_device *bdev = ns ? ns->disk->part0 : NULL; struct request *req; - struct bio *bio = NULL; - void *meta = NULL; - int ret; req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags); if (IS_ERR(req)) return req; nvme_init_request(req, cmd); - - if (timeout) - req->timeout = timeout; nvme_req(req)->flags |= NVME_REQ_USERCMD; + return req; +} - if (ubuffer && bufflen) { - ret = blk_rq_map_user_io(req, NULL, ubuffer, bufflen, - GFP_KERNEL, vec, 0, 0, rq_data_dir(req)); - if (ret) - goto out; - bio = req->bio; - if (bdev) - bio_set_dev(bio, bdev); - if (bdev && meta_buffer && meta_len) { - meta = nvme_add_user_metadata(req, meta_buffer, - meta_len, meta_seed); - if (IS_ERR(meta)) { - ret = PTR_ERR(meta); - goto out_unmap; - } - *metap = meta; +static int nvme_map_user_request(struct request *req, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, void **metap, bool vec) +{ + struct request_queue *q = req->q; + struct nvme_ns *ns = q->queuedata; + struct block_device *bdev = ns ? 
ns->disk->part0 : NULL; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + + ret = blk_rq_map_user_io(req, NULL, ubuffer, bufflen, GFP_KERNEL, vec, + 0, 0, rq_data_dir(req)); + + if (ret) + goto out; + bio = req->bio; + if (bdev) + bio_set_dev(bio, bdev); + + if (bdev && meta_buffer && meta_len) { + meta = nvme_add_user_metadata(req, meta_buffer, meta_len, + meta_seed); + if (IS_ERR(meta)) { + ret = PTR_ERR(meta); + goto out_unmap; } + *metap = meta; } - return req; + return ret; out_unmap: if (bio) blk_rq_unmap_user(bio); out: blk_mq_free_request(req); - return ERR_PTR(ret); + return ret; } static int nvme_submit_user_cmd(struct request_queue *q, @@ -132,11 +135,18 @@ static int nvme_submit_user_cmd(struct request_queue *q, u32 effects; int ret; - req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer, - meta_len, meta_seed, &meta, timeout, vec, 0, 0); + req = nvme_alloc_user_request(q, cmd, 0, 0); if (IS_ERR(req)) return PTR_ERR(req); + req->timeout = timeout; + if (ubuffer && bufflen) { + ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, + meta_len, meta_seed, &meta, vec); + if (ret) + return ret; + } + bio = req->bio; ctrl = nvme_req(req)->ctrl; @@ -456,6 +466,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, blk_opf_t rq_flags = 0; blk_mq_req_flags_t blk_flags = 0; void *meta = NULL; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -495,13 +506,18 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, rq_flags |= REQ_POLLED; retry: - req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr), - d.data_len, nvme_to_user_ptr(d.metadata), - d.metadata_len, 0, &meta, d.timeout_ms ? - msecs_to_jiffies(d.timeout_ms) : 0, vec, rq_flags, - blk_flags); + req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags); if (IS_ERR(req)) return PTR_ERR(req); + req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; + + if (d.addr && d.data_len) { + ret = nvme_map_user_request(req, nvme_to_user_ptr(d.addr), + d.data_len, nvme_to_user_ptr(d.metadata), + d.metadata_len, 0, &meta, vec); + if (ret) + return ret; + } if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) { if (unlikely(!req->bio)) { -- cgit v1.2.3 From 32f1c71b15fc9cb8e964c3d0c15ca99a70cfe8a7 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:45 +0530 Subject: block: rename bio_map_put to blk_mq_map_bio_put This patch renames existing bio_map_put function to blk_mq_map_bio_put. 
Signed-off-by: Anuj Gupta Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-9-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 0e37bbedd46c..84b13a4158b7 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -231,7 +231,7 @@ out_bmd: return ret; } -static void bio_map_put(struct bio *bio) +static void blk_mq_map_bio_put(struct bio *bio) { if (bio->bi_opf & REQ_ALLOC_CACHE) { bio_put(bio); @@ -331,7 +331,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, out_unmap: bio_release_pages(bio, false); - bio_map_put(bio); + blk_mq_map_bio_put(bio); return ret; } @@ -672,7 +672,7 @@ int blk_rq_unmap_user(struct bio *bio) next_bio = bio; bio = bio->bi_next; - bio_map_put(next_bio); + blk_mq_map_bio_put(next_bio); } return ret; -- cgit v1.2.3 From ab89e8e7ca526ca04baaad2aa28172d336425d67 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 30 Sep 2022 11:57:46 +0530 Subject: block: factor out blk_rq_map_bio_alloc helper Move bio allocation logic from bio_map_user_iov to a new helper blk_rq_map_bio_alloc. It is named so because functionality is opposite of what is done inside blk_mq_map_bio_put. This is a prep patch. Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220930062749.152261-10-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-map.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 84b13a4158b7..d6ea377394a9 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -241,17 +241,10 @@ static void blk_mq_map_bio_put(struct bio *bio) } } -static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, - gfp_t gfp_mask) +static struct bio *blk_rq_map_bio_alloc(struct request *rq, + unsigned int nr_vecs, gfp_t gfp_mask) { - unsigned int max_sectors = queue_max_hw_sectors(rq->q); - unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; - int ret; - int j; - - if (!iov_iter_count(iter)) - return -EINVAL; if (rq->cmd_flags & REQ_POLLED) { blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE; @@ -259,13 +252,31 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask, &fs_bio_set); if (!bio) - return -ENOMEM; + return NULL; } else { bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) - return -ENOMEM; + return NULL; bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); } + return bio; +} + +static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, + gfp_t gfp_mask) +{ + unsigned int max_sectors = queue_max_hw_sectors(rq->q); + unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); + struct bio *bio; + int ret; + int j; + + if (!iov_iter_count(iter)) + return -EINVAL; + + bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask); + if (bio == NULL) + return -ENOMEM; while (iov_iter_count(iter)) { struct page **pages, *stack_pages[UIO_FASTIOV]; -- cgit v1.2.3 From 37987547932c89f15f9b76950040131ddb591a8b Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 30 Sep 2022 11:57:47 +0530 Subject: block: extend functionality to map bvec iterator Extend blk_rq_map_user_iov so that it can handle bvec iterator, using the new blk_rq_map_user_bvec function. It maps the pages from bvec iterator into a bio and place the bio into request. 
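A sketch of what this enables for a caller that already holds pinned pages as a bio_vec array (bvecs, nr_segs and total_len are placeholder driver state; on a queue-limits mismatch the helper falls back to copying internally):

    struct iov_iter iter;
    int ret;

    iov_iter_bvec(&iter, rq_data_dir(req), bvecs, nr_segs, total_len);
    ret = blk_rq_map_user_iov(req->q, req, NULL, &iter, GFP_KERNEL);
    if (ret)
            return ret;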
This helper will be used by nvme for uring-passthrough path when IO is done using pre-mapped buffers. Signed-off-by: Kanchan Joshi Signed-off-by: Anuj Gupta Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-11-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-map.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 4 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index d6ea377394a9..34735626b00f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -548,6 +548,62 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(blk_rq_append_bio); +/* Prepare bio for passthrough IO given ITER_BVEC iter */ +static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) +{ + struct request_queue *q = rq->q; + size_t nr_iter = iov_iter_count(iter); + size_t nr_segs = iter->nr_segs; + struct bio_vec *bvecs, *bvprvp = NULL; + struct queue_limits *lim = &q->limits; + unsigned int nsegs = 0, bytes = 0; + struct bio *bio; + size_t i; + + if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q)) + return -EINVAL; + if (nr_segs > queue_max_segments(q)) + return -EINVAL; + + /* no iovecs to alloc, as we already have a BVEC iterator */ + bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL); + if (bio == NULL) + return -ENOMEM; + + bio_iov_bvec_set(bio, (struct iov_iter *)iter); + blk_rq_bio_prep(rq, bio, nr_segs); + + /* loop to perform a bunch of sanity checks */ + bvecs = (struct bio_vec *)iter->bvec; + for (i = 0; i < nr_segs; i++) { + struct bio_vec *bv = &bvecs[i]; + + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, fallback to copy. + */ + if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) { + blk_mq_map_bio_put(bio); + return -EREMOTEIO; + } + /* check full condition */ + if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len) + goto put_bio; + if (bytes + bv->bv_len > nr_iter) + goto put_bio; + if (bv->bv_offset + bv->bv_len > PAGE_SIZE) + goto put_bio; + + nsegs++; + bytes += bv->bv_len; + bvprvp = bv; + } + return 0; +put_bio: + blk_mq_map_bio_put(bio); + return -EINVAL; +} + /** * blk_rq_map_user_iov - map user data to a request, for passthrough requests * @q: request queue where request should be inserted @@ -567,24 +623,35 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - bool copy = false; + bool copy = false, map_bvec = false; unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; int ret = -EINVAL; - if (!iter_is_iovec(iter)) - goto fail; - if (map_data) copy = true; else if (blk_queue_may_bounce(q)) copy = true; else if (iov_iter_alignment(iter) & align) copy = true; + else if (iov_iter_is_bvec(iter)) + map_bvec = true; + else if (!iter_is_iovec(iter)) + copy = true; else if (queue_virt_boundary(q)) copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); + if (map_bvec) { + ret = blk_rq_map_user_bvec(rq, iter); + if (!ret) + return 0; + if (ret != -EREMOTEIO) + goto fail; + /* fall back to copying the data on limits mismatches */ + copy = true; + } + i = *iter; do { if (copy) -- cgit v1.2.3 From 4d174486820e625fa85bac5d4235d4b4cb659866 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 30 Sep 2022 11:57:48 +0530 Subject: nvme: pass ubuffer as an integer This is a prep patch. 
Modify nvme_submit_user_cmd and nvme_map_user_request to take ubuffer as plain integer argument, and do away with nvme_to_user_ptr conversion in callers. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220930062749.152261-12-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 3f1e7af19716..7a41caa9bfd2 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -83,9 +83,10 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, return req; } -static int nvme_map_user_request(struct request *req, void __user *ubuffer, +static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, void **metap, bool vec) + u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd, + bool vec) { struct request_queue *q = req->q; struct nvme_ns *ns = q->queuedata; @@ -94,8 +95,8 @@ static int nvme_map_user_request(struct request *req, void __user *ubuffer, void *meta = NULL; int ret; - ret = blk_rq_map_user_io(req, NULL, ubuffer, bufflen, GFP_KERNEL, vec, - 0, 0, rq_data_dir(req)); + ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), bufflen, + GFP_KERNEL, vec, 0, 0, rq_data_dir(req)); if (ret) goto out; @@ -124,7 +125,7 @@ out: } static int nvme_submit_user_cmd(struct request_queue *q, - struct nvme_command *cmd, void __user *ubuffer, + struct nvme_command *cmd, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u64 *result, unsigned timeout, bool vec) { @@ -142,7 +143,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, req->timeout = timeout; if (ubuffer && bufflen) { ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, - meta_len, meta_seed, &meta, vec); + meta_len, meta_seed, &meta, NULL, vec); if (ret) return ret; } @@ -226,7 +227,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.appmask = cpu_to_le16(io.appmask); return nvme_submit_user_cmd(ns->queue, &c, - nvme_to_user_ptr(io.addr), length, + io.addr, length, metadata, meta_len, lower_32_bits(io.slba), NULL, 0, false); } @@ -280,7 +281,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, timeout = msecs_to_jiffies(cmd.timeout_ms); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - nvme_to_user_ptr(cmd.addr), cmd.data_len, + cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &result, timeout, false); @@ -326,7 +327,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, timeout = msecs_to_jiffies(cmd.timeout_ms); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - nvme_to_user_ptr(cmd.addr), cmd.data_len, + cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &cmd.result, timeout, vec); @@ -512,9 +513,9 @@ retry: req->timeout = d.timeout_ms ? 
msecs_to_jiffies(d.timeout_ms) : 0; if (d.addr && d.data_len) { - ret = nvme_map_user_request(req, nvme_to_user_ptr(d.addr), + ret = nvme_map_user_request(req, d.addr, d.data_len, nvme_to_user_ptr(d.metadata), - d.metadata_len, 0, &meta, vec); + d.metadata_len, 0, &meta, ioucmd, vec); if (ret) return ret; } -- cgit v1.2.3 From 23fd22e55b767be9c31fda57205afb2023cd6aad Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 30 Sep 2022 11:57:49 +0530 Subject: nvme: wire up fixed buffer support for nvme passthrough if io_uring sends passthrough command with IORING_URING_CMD_FIXED flag, use the pre-registered buffer for IO (non-vectored variant). Pass the buffer/length to io_uring and get the bvec iterator for the range. Next, pass this bvec to block-layer and obtain a bio/request for subsequent processing. Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220930062749.152261-13-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 7a41caa9bfd2..81f5550b670d 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -95,8 +95,22 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, void *meta = NULL; int ret; - ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), bufflen, - GFP_KERNEL, vec, 0, 0, rq_data_dir(req)); + if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { + struct iov_iter iter; + + /* fixedbufs is only for non-vectored io */ + if (WARN_ON_ONCE(vec)) + return -EINVAL; + ret = io_uring_cmd_import_fixed(ubuffer, bufflen, + rq_data_dir(req), &iter, ioucmd); + if (ret < 0) + goto out; + ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); + } else { + ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), + bufflen, GFP_KERNEL, vec, 0, 0, + rq_data_dir(req)); + } if (ret) goto out; -- cgit v1.2.3 From 0e0abad2a71bcd7ba0f30e7975f5b4199ade4e60 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 4 Oct 2022 14:39:10 +0200 Subject: io_uring: Add missing inline to io_uring_cmd_import_fixed() dummy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_IO_URING is not set: include/linux/io_uring.h:65:12: error: ‘io_uring_cmd_import_fixed’ defined but not used [-Werror=unused-function] 65 | static int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, | ^~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by adding the missing "inline" keyword. Fixes: a9216fac3ed8819c ("io_uring: add io_uring_cmd_import_fixed") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/7404b4a696f64e33e5ef3c5bd3754d4f26d13e50.1664887093.git.geert+renesas@glider.be Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e10c5cc81082..43bc8a2edccf 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -62,7 +62,7 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else -static int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, +static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) { return -EOPNOTSUPP; -- cgit v1.2.3
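For reference, the warning fixed by the last patch comes from the usual config-gated stub pattern; a standalone illustration (not kernel code) of why such a stub must be static inline:

    /* foo.h */
    #ifdef CONFIG_FOO
    int foo_do_thing(int x);
    #else
    /*
     * A plain "static" (non-inline) definition in a header triggers
     * -Wunused-function in any file that includes the header without
     * calling the function; "static inline" does not.
     */
    static inline int foo_do_thing(int x)
    {
            return -EOPNOTSUPP;
    }
    #endif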