author     Linus Torvalds <torvalds@linux-foundation.org>    2022-10-07 19:35:50 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>    2022-10-07 19:35:50 +0300
commit     7c989b1da3946e40bf71be00a0b401015235605a (patch)
tree       33de4ff984af6a301d6e80a05d40a893909388c9 /block
parent     513389809e138ae903b6ef43c1d5d2ffaf4dca17 (diff)
parent     0e0abad2a71bcd7ba0f30e7975f5b4199ade4e60 (diff)
download   linux-7c989b1da3946e40bf71be00a0b401015235605a.tar.xz
Merge tag 'for-6.1/passthrough-2022-10-04' of git://git.kernel.dk/linux
Pull passthrough updates from Jens Axboe:
 "With these changes, passthrough NVMe support over io_uring now
  performs at the same level as block device O_DIRECT, and in many
  cases 6-8% better.

  This contains:

   - Add support for fixed buffers for passthrough (Anuj, Kanchan)

   - Enable batched allocations and freeing on passthrough, similarly
     to what we support on the normal storage path (me)

   - Fix from Geert fixing an issue with !CONFIG_IO_URING"

* tag 'for-6.1/passthrough-2022-10-04' of git://git.kernel.dk/linux:
  io_uring: Add missing inline to io_uring_cmd_import_fixed() dummy
  nvme: wire up fixed buffer support for nvme passthrough
  nvme: pass ubuffer as an integer
  block: extend functionality to map bvec iterator
  block: factor out blk_rq_map_bio_alloc helper
  block: rename bio_map_put to blk_mq_map_bio_put
  nvme: refactor nvme_alloc_request
  nvme: refactor nvme_add_user_metadata
  nvme: Use blk_rq_map_user_io helper
  scsi: Use blk_rq_map_user_io helper
  block: add blk_rq_map_user_io
  io_uring: introduce fixed buffer support for io_uring_cmd
  io_uring: add io_uring_cmd_import_fixed
  nvme: enable batched completions of passthrough IO
  nvme: split out metadata vs non metadata end_io uring_cmd completions
  block: allow end_io based requests in the completion batch handling
  block: change request end_io handler to pass back a return value
  block: enable batched allocation for blk_mq_alloc_request()
  block: kill deprecated BUG_ON() in the flush handling
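The central API change in this series is that request end_io callbacks now return a value instead of void, which is what lets end_io based (passthrough) requests take part in batched completions. A minimal sketch of the new convention follows; the callback name, struct my_cmd, and its use of end_io_data are hypothetical and not part of this series:

    /*
     * Hedged sketch of the end_io convention introduced by
     * "block: change request end_io handler to pass back a return value".
     * RQ_END_IO_NONE: the callback keeps ownership of the request (as the
     * flush handlers in blk-flush.c below do).
     * RQ_END_IO_FREE: the block layer frees the request, which also allows
     * it to be completed in a batch.
     */
    struct my_cmd {                         /* hypothetical per-command context */
            struct completion done;
            blk_status_t status;
    };

    static enum rq_end_io_ret my_drv_end_io(struct request *rq,
                                            blk_status_t error)
    {
            struct my_cmd *cmd = rq->end_io_data;   /* hypothetical payload */

            cmd->status = error;            /* record the outcome */
            complete(&cmd->done);           /* wake up the submitter */
            return RQ_END_IO_FREE;          /* let the block layer free rq */
    }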
Diffstat (limited to 'block')
-rw-r--r--  block/blk-flush.c   11
-rw-r--r--  block/blk-map.c    150
-rw-r--r--  block/blk-mq.c     107
3 files changed, 230 insertions(+), 38 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index d20a0c6b2c66..53202eff545e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -205,7 +205,6 @@ static void blk_flush_complete_seq(struct request *rq,
* flush data request completion path. Restore @rq for
* normal completion and end it.
*/
- BUG_ON(!list_empty(&rq->queuelist));
list_del_init(&rq->flush.list);
blk_flush_restore_request(rq);
blk_mq_end_request(rq, error);
@@ -218,7 +217,8 @@ static void blk_flush_complete_seq(struct request *rq,
blk_kick_flush(q, fq, cmd_flags);
}
-static void flush_end_io(struct request *flush_rq, blk_status_t error)
+static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
+ blk_status_t error)
{
struct request_queue *q = flush_rq->q;
struct list_head *running;
@@ -232,7 +232,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
if (!req_ref_put_and_test(flush_rq)) {
fq->rq_status = error;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
- return;
+ return RQ_END_IO_NONE;
}
blk_account_io_flush(flush_rq);
@@ -269,6 +269,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
}
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+ return RQ_END_IO_NONE;
}
bool is_flush_rq(struct request *rq)
@@ -354,7 +355,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
blk_flush_queue_rq(flush_rq, false);
}
-static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
+static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
+ blk_status_t error)
{
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
@@ -376,6 +378,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
blk_mq_sched_restart(hctx);
+ return RQ_END_IO_NONE;
}
/**
diff --git a/block/blk-map.c b/block/blk-map.c
index 7693f8e3c454..34735626b00f 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -231,7 +231,7 @@ out_bmd:
return ret;
}
-static void bio_map_put(struct bio *bio)
+static void blk_mq_map_bio_put(struct bio *bio)
{
if (bio->bi_opf & REQ_ALLOC_CACHE) {
bio_put(bio);
@@ -241,17 +241,10 @@ static void bio_map_put(struct bio *bio)
}
}
-static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
- gfp_t gfp_mask)
+static struct bio *blk_rq_map_bio_alloc(struct request *rq,
+ unsigned int nr_vecs, gfp_t gfp_mask)
{
- unsigned int max_sectors = queue_max_hw_sectors(rq->q);
- unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
struct bio *bio;
- int ret;
- int j;
-
- if (!iov_iter_count(iter))
- return -EINVAL;
if (rq->cmd_flags & REQ_POLLED) {
blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE;
@@ -259,13 +252,31 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask,
&fs_bio_set);
if (!bio)
- return -ENOMEM;
+ return NULL;
} else {
bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
- return -ENOMEM;
+ return NULL;
bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq));
}
+ return bio;
+}
+
+static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
+ gfp_t gfp_mask)
+{
+ unsigned int max_sectors = queue_max_hw_sectors(rq->q);
+ unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
+ struct bio *bio;
+ int ret;
+ int j;
+
+ if (!iov_iter_count(iter))
+ return -EINVAL;
+
+ bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask);
+ if (bio == NULL)
+ return -ENOMEM;
while (iov_iter_count(iter)) {
struct page **pages, *stack_pages[UIO_FASTIOV];
@@ -331,7 +342,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
out_unmap:
bio_release_pages(bio, false);
- bio_map_put(bio);
+ blk_mq_map_bio_put(bio);
return ret;
}
@@ -537,6 +548,62 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(blk_rq_append_bio);
+/* Prepare bio for passthrough IO given ITER_BVEC iter */
+static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
+{
+ struct request_queue *q = rq->q;
+ size_t nr_iter = iov_iter_count(iter);
+ size_t nr_segs = iter->nr_segs;
+ struct bio_vec *bvecs, *bvprvp = NULL;
+ struct queue_limits *lim = &q->limits;
+ unsigned int nsegs = 0, bytes = 0;
+ struct bio *bio;
+ size_t i;
+
+ if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
+ return -EINVAL;
+ if (nr_segs > queue_max_segments(q))
+ return -EINVAL;
+
+ /* no iovecs to alloc, as we already have a BVEC iterator */
+ bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL);
+ if (bio == NULL)
+ return -ENOMEM;
+
+ bio_iov_bvec_set(bio, (struct iov_iter *)iter);
+ blk_rq_bio_prep(rq, bio, nr_segs);
+
+ /* loop to perform a bunch of sanity checks */
+ bvecs = (struct bio_vec *)iter->bvec;
+ for (i = 0; i < nr_segs; i++) {
+ struct bio_vec *bv = &bvecs[i];
+
+ /*
+ * If the queue doesn't support SG gaps and adding this
+ * offset would create a gap, fallback to copy.
+ */
+ if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) {
+ blk_mq_map_bio_put(bio);
+ return -EREMOTEIO;
+ }
+ /* check full condition */
+ if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
+ goto put_bio;
+ if (bytes + bv->bv_len > nr_iter)
+ goto put_bio;
+ if (bv->bv_offset + bv->bv_len > PAGE_SIZE)
+ goto put_bio;
+
+ nsegs++;
+ bytes += bv->bv_len;
+ bvprvp = bv;
+ }
+ return 0;
+put_bio:
+ blk_mq_map_bio_put(bio);
+ return -EINVAL;
+}
+
/**
* blk_rq_map_user_iov - map user data to a request, for passthrough requests
* @q: request queue where request should be inserted
@@ -556,24 +623,35 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data,
const struct iov_iter *iter, gfp_t gfp_mask)
{
- bool copy = false;
+ bool copy = false, map_bvec = false;
unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
struct bio *bio = NULL;
struct iov_iter i;
int ret = -EINVAL;
- if (!iter_is_iovec(iter))
- goto fail;
-
if (map_data)
copy = true;
else if (blk_queue_may_bounce(q))
copy = true;
else if (iov_iter_alignment(iter) & align)
copy = true;
+ else if (iov_iter_is_bvec(iter))
+ map_bvec = true;
+ else if (!iter_is_iovec(iter))
+ copy = true;
else if (queue_virt_boundary(q))
copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter);
+ if (map_bvec) {
+ ret = blk_rq_map_user_bvec(rq, iter);
+ if (!ret)
+ return 0;
+ if (ret != -EREMOTEIO)
+ goto fail;
+ /* fall back to copying the data on limits mismatches */
+ copy = true;
+ }
+
i = *iter;
do {
if (copy)
@@ -611,6 +689,42 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL(blk_rq_map_user);
+int blk_rq_map_user_io(struct request *req, struct rq_map_data *map_data,
+ void __user *ubuf, unsigned long buf_len, gfp_t gfp_mask,
+ bool vec, int iov_count, bool check_iter_count, int rw)
+{
+ int ret = 0;
+
+ if (vec) {
+ struct iovec fast_iov[UIO_FASTIOV];
+ struct iovec *iov = fast_iov;
+ struct iov_iter iter;
+
+ ret = import_iovec(rw, ubuf, iov_count ? iov_count : buf_len,
+ UIO_FASTIOV, &iov, &iter);
+ if (ret < 0)
+ return ret;
+
+ if (iov_count) {
+ /* SG_IO howto says that the shorter of the two wins */
+ iov_iter_truncate(&iter, buf_len);
+ if (check_iter_count && !iov_iter_count(&iter)) {
+ kfree(iov);
+ return -EINVAL;
+ }
+ }
+
+ ret = blk_rq_map_user_iov(req->q, req, map_data, &iter,
+ gfp_mask);
+ kfree(iov);
+ } else if (buf_len) {
+ ret = blk_rq_map_user(req->q, req, map_data, ubuf, buf_len,
+ gfp_mask);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(blk_rq_map_user_io);
+
/**
* blk_rq_unmap_user - unmap a request with user data
* @bio: start of bio list
@@ -636,7 +750,7 @@ int blk_rq_unmap_user(struct bio *bio)
next_bio = bio;
bio = bio->bi_next;
- bio_map_put(next_bio);
+ blk_mq_map_bio_put(next_bio);
}
return ret;
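The newly exported blk_rq_map_user_io() above folds the iovec and flat-buffer mapping paths (previously open-coded by callers such as nvme and sg) into one helper. A hedged sketch of a call site follows; req, ubuf, buf_len, vec and iov_count are assumed to be the caller's own variables, and the last argument follows import_iovec()'s READ/WRITE direction convention as used in the helper itself:

    /*
     * Sketch only: map a user buffer (flat or iovec array) for a
     * passthrough request.  Everything except blk_rq_map_user_io()
     * is assumed to exist in the caller.
     */
    ret = blk_rq_map_user_io(req, NULL,     /* no rq_map_data / bounce setup */
                             ubuf, buf_len, GFP_KERNEL,
                             vec,           /* true: ubuf points to a struct iovec array */
                             iov_count,     /* #iovecs; 0 means buf_len is the count */
                             false,         /* don't reject an empty iterator */
                             rq_data_dir(req));  /* READ or WRITE for import_iovec() */
    if (ret)
            return ret;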
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 83492d942348..8070b6c10e8d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -510,25 +510,87 @@ retry:
alloc_time_ns);
}
-struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
- blk_mq_req_flags_t flags)
+static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
+ struct blk_plug *plug,
+ blk_opf_t opf,
+ blk_mq_req_flags_t flags)
{
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = opf,
- .nr_tags = 1,
+ .nr_tags = plug->nr_ios,
+ .cached_rq = &plug->cached_rq,
};
struct request *rq;
- int ret;
- ret = blk_queue_enter(q, flags);
- if (ret)
- return ERR_PTR(ret);
+ if (blk_queue_enter(q, flags))
+ return NULL;
+
+ plug->nr_ios = 1;
rq = __blk_mq_alloc_requests(&data);
- if (!rq)
- goto out_queue_exit;
+ if (unlikely(!rq))
+ blk_queue_exit(q);
+ return rq;
+}
+
+static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
+ blk_opf_t opf,
+ blk_mq_req_flags_t flags)
+{
+ struct blk_plug *plug = current->plug;
+ struct request *rq;
+
+ if (!plug)
+ return NULL;
+ if (rq_list_empty(plug->cached_rq)) {
+ if (plug->nr_ios == 1)
+ return NULL;
+ rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
+ if (rq)
+ goto got_it;
+ return NULL;
+ }
+ rq = rq_list_peek(&plug->cached_rq);
+ if (!rq || rq->q != q)
+ return NULL;
+
+ if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
+ return NULL;
+ if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
+ return NULL;
+
+ plug->cached_rq = rq_list_next(rq);
+got_it:
+ rq->cmd_flags = opf;
+ INIT_LIST_HEAD(&rq->queuelist);
+ return rq;
+}
+
+struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
+ blk_mq_req_flags_t flags)
+{
+ struct request *rq;
+
+ rq = blk_mq_alloc_cached_request(q, opf, flags);
+ if (!rq) {
+ struct blk_mq_alloc_data data = {
+ .q = q,
+ .flags = flags,
+ .cmd_flags = opf,
+ .nr_tags = 1,
+ };
+ int ret;
+
+ ret = blk_queue_enter(q, flags);
+ if (ret)
+ return ERR_PTR(ret);
+
+ rq = __blk_mq_alloc_requests(&data);
+ if (!rq)
+ goto out_queue_exit;
+ }
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
@@ -761,8 +823,10 @@ static void blk_complete_request(struct request *req)
* can find how many bytes remain in the request
* later.
*/
- req->bio = NULL;
- req->__data_len = 0;
+ if (!req->end_io) {
+ req->bio = NULL;
+ req->__data_len = 0;
+ }
}
/**
@@ -939,7 +1003,8 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
if (rq->end_io) {
rq_qos_done(rq->q, rq);
- rq->end_io(rq, error);
+ if (rq->end_io(rq, error) == RQ_END_IO_FREE)
+ blk_mq_free_request(rq);
} else {
blk_mq_free_request(rq);
}
@@ -992,6 +1057,13 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
rq_qos_done(rq->q, rq);
+ /*
+ * If end_io handler returns NONE, then it still has
+ * ownership of the request.
+ */
+ if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
+ continue;
+
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
if (!req_ref_put_and_test(rq))
continue;
@@ -1233,12 +1305,13 @@ struct blk_rq_wait {
blk_status_t ret;
};
-static void blk_end_sync_rq(struct request *rq, blk_status_t ret)
+static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
{
struct blk_rq_wait *wait = rq->end_io_data;
wait->ret = ret;
complete(&wait->done);
+ return RQ_END_IO_NONE;
}
bool blk_rq_is_poll(struct request *rq)
@@ -1472,10 +1545,12 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
void blk_mq_put_rq_ref(struct request *rq)
{
- if (is_flush_rq(rq))
- rq->end_io(rq, 0);
- else if (req_ref_put_and_test(rq))
+ if (is_flush_rq(rq)) {
+ if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
+ blk_mq_free_request(rq);
+ } else if (req_ref_put_and_test(rq)) {
__blk_mq_free_request(rq);
+ }
}
static bool blk_mq_check_expired(struct request *rq, void *priv)
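Batched allocation in blk_mq_alloc_request() is driven by the caller's plug: once plug->nr_ios is raised above one, the first allocation fills plug->cached_rq via blk_mq_rq_cache_fill() and subsequent allocations are served from that cache. A minimal, hedged sketch of the submitter side, assuming q is the caller's request_queue and that plugging is permitted in this context (io_uring's passthrough path arranges this through its own submit state):

    /*
     * Sketch only: within a plug that advertises an expected batch size,
     * repeated passthrough allocations can be satisfied from the plug's
     * cached_rq list instead of taking one tag at a time.
     */
    struct blk_plug plug;
    int i;

    blk_start_plug_nr_ios(&plug, 8);        /* hint: roughly 8 requests coming */
    for (i = 0; i < 8; i++) {
            struct request *rq;

            rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
            if (IS_ERR(rq))
                    break;
            /* ... set up rq->end_io / payload and issue the request ... */
    }
    blk_finish_plug(&plug);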