From a492f075450f3ba87de36e5ffe92a9d0c7af9723 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Thu, 28 Aug 2014 08:15:21 -0600 Subject: block,scsi: fixup blk_get_request dead queue scenarios The blk_get_request function may fail in low-memory conditions or during device removal (even if __GFP_WAIT is set). To distinguish between these errors, modify the blk_get_request call stack to return the appropriate ERR_PTR. Verify that all callers check the return status and consider IS_ERR instead of a simple NULL pointer check. For consistency, make a similar change to the blk_mq_alloc_request leg of blk_get_request. It may fail if the queue is dead, or the caller was unwilling to wait. Signed-off-by: Joe Lawrence Acked-by: Jiri Kosina [for pktcdvd] Acked-by: Boaz Harrosh [for osd] Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1e478a..940aa8a34b70 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -218,9 +218,11 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_hw_ctx *hctx; struct request *rq; struct blk_mq_alloc_data alloc_data; + int ret; - if (blk_mq_queue_enter(q)) - return NULL; + ret = blk_mq_queue_enter(q); + if (ret) + return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); @@ -240,6 +242,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, ctx = alloc_data.ctx; } blk_mq_put_ctx(ctx); + if (!rq) + return ERR_PTR(-EWOULDBLOCK); return rq; } EXPORT_SYMBOL(blk_mq_alloc_request); -- cgit v1.2.3 From bf57229745f849e500ba69ff91e35bc8160a7373 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:08 -0700 Subject: blk-mq: remove REQ_END Pass an explicit parameter for the last request in a batch to ->queue_rq instead of using a request flag. Besides being a cleaner and non-stateful interface, this is also required for the next patch, which fixes the blk-mq I/O submission code to not start a timer too early. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 22 +++++----------------- drivers/block/mtip32xx/mtip32xx.c | 3 ++- drivers/block/null_blk.c | 3 ++- drivers/block/virtio_blk.c | 4 ++-- drivers/scsi/scsi_lib.c | 3 ++- include/linux/blk-mq.h | 2 +- include/linux/blk_types.h | 2 -- 7 files changed, 14 insertions(+), 25 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index e743d28620b2..32b4797f4186 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -384,7 +384,7 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -static void blk_mq_start_request(struct request *rq, bool last) +static void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; @@ -421,16 +421,6 @@ static void blk_mq_start_request(struct request *rq, bool last) */ rq->nr_phys_segments++; } - - /* - * Flag the last request in the series so that drivers know when IO - * should be kicked off, if they don't do it on a per-request basis. - * - * Note: the flag isn't the only condition drivers should do kick off. - * If drive is busy, the last request might not have the bit set.
- */ - if (last) - rq->cmd_flags |= REQ_END; } static void __blk_mq_requeue_request(struct request *rq) @@ -440,8 +430,6 @@ static void __blk_mq_requeue_request(struct request *rq) trace_block_rq_requeue(q, rq); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - rq->cmd_flags &= ~REQ_END; - if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -755,9 +743,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_start_request(rq, list_empty(&rq_list)); + blk_mq_start_request(rq); - ret = q->mq_ops->queue_rq(hctx, rq); + ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); switch (ret) { case BLK_MQ_RQ_QUEUE_OK: queued++; @@ -1198,14 +1186,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) int ret; blk_mq_bio_to_request(rq, bio); - blk_mq_start_request(rq, true); + blk_mq_start_request(rq); /* * For OK queue, we are done. For error, kill it. Any other * error (busy), just add it to our list as we previously * would have done */ - ret = q->mq_ops->queue_rq(data.hctx, rq); + ret = q->mq_ops->queue_rq(data.hctx, rq, true); if (ret == BLK_MQ_RQ_QUEUE_OK) goto done; else { diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 5c8e7fe07745..0e2084f37c67 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3775,7 +3775,8 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, return false; } -static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool last) { int ret; diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 00d469c7f9f7..c5b7315c2c13 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -313,7 +313,8 @@ static void null_request_fn(struct request_queue *q) } } -static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) +static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool last) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 0a581400de0f..13756e016797 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -164,14 +164,14 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } -static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) +static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, + bool last) { struct virtio_blk *vblk = hctx->queue->queuedata; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; int qid = hctx->queue_num; - const bool last = (req->cmd_flags & REQ_END) != 0; int err; bool notify = false; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 1f2bae475cb7..f1df41168391 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1855,7 +1855,8 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) +static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, + bool last) { struct request_queue *q = req->q; struct scsi_device *sdev = q->queuedata; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a1e31f274fcd..9c4e306a9217 100644 --- a/include/linux/blk-mq.h +++ 
b/include/linux/blk-mq.h @@ -77,7 +77,7 @@ struct blk_mq_tag_set { struct list_head tag_list; }; -typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); +typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 66c2167f04a9..bb7d66460e7a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -188,7 +188,6 @@ enum rq_flag_bits { __REQ_MIXED_MERGE, /* merge of different types, fail separately */ __REQ_KERNEL, /* direct IO to kernel pages */ __REQ_PM, /* runtime pm request */ - __REQ_END, /* last of chain of requests */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NR_BITS, /* stops here */ @@ -242,7 +241,6 @@ enum rq_flag_bits { #define REQ_SECURE (1ULL << __REQ_SECURE) #define REQ_KERNEL (1ULL << __REQ_KERNEL) #define REQ_PM (1ULL << __REQ_PM) -#define REQ_END (1ULL << __REQ_END) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) -- cgit v1.2.3 From e2490073cd7c3d6f6ef6e029a208edd4d38efac4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:09 -0700 Subject: blk-mq: call blk_mq_start_request from ->queue_rq When we call blk_mq_start_request from the core blk-mq code before calling into ->queue_rq there is a racy window where the timeout handler can hit before we've fully set up the driver specific part of the command. Move the call to blk_mq_start_request into the driver so the driver can start the request only once it is fully set up. 
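For illustration, the driver-side contract that results from this change can be sketched as follows (hypothetical driver: example_queue_rq(), example_cmd and example_submit() are made-up names, while the blk-mq calls are the real ones used by the conversions below):

static int example_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
                            bool last)
{
        struct example_cmd *cmd = blk_mq_rq_to_pdu(rq);

        /* Fully set up the driver-private command first... */
        cmd->rq = rq;

        /* ...and only then start the request, which arms the timeout. */
        blk_mq_start_request(rq);

        if (example_submit(cmd, last))
                return BLK_MQ_RQ_QUEUE_ERROR;

        return BLK_MQ_RQ_QUEUE_OK;
}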
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 ++++++------- drivers/block/mtip32xx/mtip32xx.c | 2 ++ drivers/block/null_blk.c | 2 ++ drivers/block/virtio_blk.c | 2 ++ drivers/scsi/scsi_lib.c | 1 + include/linux/blk-mq.h | 1 + 6 files changed, 14 insertions(+), 7 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 32b4797f4186..141f2e06803a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -384,7 +384,7 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -static void blk_mq_start_request(struct request *rq) +void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; @@ -422,16 +422,18 @@ static void blk_mq_start_request(struct request *rq) rq->nr_phys_segments++; } } +EXPORT_SYMBOL(blk_mq_start_request); static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); - clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (q->dma_drain_size && blk_rq_bytes(rq)) - rq->nr_phys_segments--; + if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + if (q->dma_drain_size && blk_rq_bytes(rq)) + rq->nr_phys_segments--; + } } void blk_mq_requeue_request(struct request *rq) @@ -743,8 +745,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_start_request(rq); - ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); switch (ret) { case BLK_MQ_RQ_QUEUE_OK: @@ -1186,7 +1186,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) int ret; blk_mq_bio_to_request(rq, bio); - blk_mq_start_request(rq); /* * For OK queue, we are done. For error, kill it. 
Any other diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 0e2084f37c67..4042440a0470 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3783,6 +3783,8 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, if (unlikely(mtip_check_unal_depth(hctx, rq))) return BLK_MQ_RQ_QUEUE_BUSY; + blk_mq_start_request(rq); + ret = mtip_submit_request(hctx, rq); if (likely(!ret)) return BLK_MQ_RQ_QUEUE_OK; diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index c5b7315c2c13..332ce20d45da 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -321,6 +321,8 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, cmd->rq = rq; cmd->nq = hctx->driver_data; + blk_mq_start_request(rq); + null_handle_cmd(cmd); return BLK_MQ_RQ_QUEUE_OK; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 13756e016797..83816bf6882f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -205,6 +205,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, } } + blk_mq_start_request(req); + num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg); if (num) { if (rq_data_dir(vbr->req) == WRITE) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index f1df41168391..2dcd9078de48 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1890,6 +1890,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, scsi_init_cmd_errh(cmd); cmd->scsi_done = scsi_mq_done; + blk_mq_start_request(req); reason = scsi_dispatch_cmd(cmd); if (reason) { scsi_set_blocked(cmd, reason); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9c4e306a9217..878b6f71da48 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -159,6 +159,7 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); +void blk_mq_start_request(struct request *rq); void blk_mq_end_io(struct request *rq, int error); void __blk_mq_end_io(struct request *rq, int error); -- cgit v1.2.3 From c8a446ad695ada43a885ec12b38411dbd190a11b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:10 -0700 Subject: blk-mq: rename blk_mq_end_io to blk_mq_end_request Now that we've changed the driver API on the submission side use the opportunity to fix up the name on the completion side to fit into the general scheme. 
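The renamed completion helper then mirrors blk_mq_start_request() on the submission side; a minimal driver completion path might read (hypothetical driver, analogous to the null_blk conversion below):

static void example_complete(struct example_cmd *cmd, int error)
{
        /* Finishes the request: updates byte counts, does the I/O
         * accounting and frees it (see __blk_mq_end_request below). */
        blk_mq_end_request(cmd->rq, error);
}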
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 4 ++-- block/blk-mq.c | 16 ++++++++-------- drivers/block/mtip32xx/mtip32xx.c | 4 ++-- drivers/block/null_blk.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/scsi/scsi_lib.c | 4 ++-- include/linux/blk-mq.h | 4 ++-- 7 files changed, 18 insertions(+), 18 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-flush.c b/block/blk-flush.c index 3cb5e9e7108a..698e6926388c 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -202,7 +202,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, list_del_init(&rq->flush.list); blk_flush_restore_request(rq); if (q->mq_ops) - blk_mq_end_io(rq, error); + blk_mq_end_request(rq, error); else __blk_end_request_all(rq, error); break; @@ -378,7 +378,7 @@ void blk_insert_flush(struct request *rq) */ if (!policy) { if (q->mq_ops) - blk_mq_end_io(rq, 0); + blk_mq_end_request(rq, 0); else __blk_end_bidi_request(rq, 0, 0, 0); return; diff --git a/block/blk-mq.c b/block/blk-mq.c index 141f2e06803a..1713686f5c2f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -300,7 +300,7 @@ void blk_mq_clone_flush_request(struct request *flush_rq, hctx->cmd_size); } -inline void __blk_mq_end_io(struct request *rq, int error) +inline void __blk_mq_end_request(struct request *rq, int error) { blk_account_io_done(rq); @@ -312,15 +312,15 @@ inline void __blk_mq_end_io(struct request *rq, int error) blk_mq_free_request(rq); } } -EXPORT_SYMBOL(__blk_mq_end_io); +EXPORT_SYMBOL(__blk_mq_end_request); -void blk_mq_end_io(struct request *rq, int error) +void blk_mq_end_request(struct request *rq, int error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); - __blk_mq_end_io(rq, error); + __blk_mq_end_request(rq, error); } -EXPORT_SYMBOL(blk_mq_end_io); +EXPORT_SYMBOL(blk_mq_end_request); static void __blk_mq_complete_request_remote(void *data) { @@ -360,7 +360,7 @@ void __blk_mq_complete_request(struct request *rq) struct request_queue *q = rq->q; if (!q->softirq_done_fn) - blk_mq_end_io(rq, rq->errors); + blk_mq_end_request(rq, rq->errors); else blk_mq_ipi_complete_request(rq); } @@ -758,7 +758,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) pr_err("blk-mq: bad return on queue: %d\n", ret); case BLK_MQ_RQ_QUEUE_ERROR: rq->errors = -EIO; - blk_mq_end_io(rq, rq->errors); + blk_mq_end_request(rq, rq->errors); break; } @@ -1200,7 +1200,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) if (ret == BLK_MQ_RQ_QUEUE_ERROR) { rq->errors = -EIO; - blk_mq_end_io(rq, rq->errors); + blk_mq_end_request(rq, rq->errors); goto done; } } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 4042440a0470..6b7e8d0fba99 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -247,7 +247,7 @@ static void mtip_async_complete(struct mtip_port *port, if (unlikely(cmd->unaligned)) up(&port->cmd_slot_unal); - blk_mq_end_io(rq, status ? -EIO : 0); + blk_mq_end_request(rq, status ? 
-EIO : 0); } /* @@ -3739,7 +3739,7 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) int err; err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); - blk_mq_end_io(rq, err); + blk_mq_end_request(rq, err); return 0; } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 332ce20d45da..ac50a2931044 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -177,7 +177,7 @@ static void end_cmd(struct nullb_cmd *cmd) { switch (queue_mode) { case NULL_Q_MQ: - blk_mq_end_io(cmd->rq, 0); + blk_mq_end_request(cmd->rq, 0); return; case NULL_Q_RQ: INIT_LIST_HEAD(&cmd->rq->queuelist); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 83816bf6882f..f751fc392ba9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -135,7 +135,7 @@ static inline void virtblk_request_done(struct request *req) req->errors = (error != 0); } - blk_mq_end_io(req, error); + blk_mq_end_request(req, error); } static void virtblk_done(struct virtqueue *vq) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 2dcd9078de48..73ce7d27f5c8 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -713,7 +713,7 @@ static bool scsi_end_request(struct request *req, int error, if (req->mq_ctx) { /* - * In the MQ case the command gets freed by __blk_mq_end_io, + * In the MQ case the command gets freed by __blk_mq_end_request, * so we have to do all cleanup that depends on it earlier. * * We also can't kick the queues from irq context, so we @@ -721,7 +721,7 @@ static bool scsi_end_request(struct request *req, int error, */ scsi_mq_uninit_cmd(cmd); - __blk_mq_end_io(req, error); + __blk_mq_end_request(req, error); if (scsi_target(sdev)->single_lun || !list_empty(&sdev->host->starved_list)) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 878b6f71da48..cb217c16990d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -160,8 +160,8 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_ind struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); void blk_mq_start_request(struct request *rq); -void blk_mq_end_io(struct request *rq, int error); -void __blk_mq_end_io(struct request *rq, int error); +void blk_mq_end_request(struct request *rq, int error); +void __blk_mq_end_request(struct request *rq, int error); void blk_mq_requeue_request(struct request *rq); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); -- cgit v1.2.3 From 81481eb423c295c5480a3fab9bb961cf286c91e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:11 -0700 Subject: blk-mq: fix and simplify tag iteration for the timeout handler Don't do a kmalloc from timer to handle timeouts, chances are we could be under heavy load or similar and thus just miss out on the timeouts. Fortunately it is very easy to just iterate over all in use tags, and doing this properly actually cleans up the blk_mq_busy_iter API as well, and prepares us for the next patch by passing a reserved argument to the iterator. 
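Under the reworked API a caller supplies a busy_iter_fn that is invoked once per in-use tag; a core-side sketch (made-up callback that counts started requests, using the core-private REQ_ATOM_STARTED flag the same way blk_mq_check_expired() below does):

static void example_busy_count(struct blk_mq_hw_ctx *hctx, struct request *rq,
                               void *priv, bool reserved)
{
        unsigned int *count = priv;

        /* 'reserved' distinguishes the reserved tag pool from the regular one. */
        if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
                (*count)++;
}

It would be driven by blk_mq_tag_busy_iter(hctx, example_busy_count, &count).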
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 44 +++++++++++-------------- block/blk-mq.c | 85 +++++++++++++++---------------------------------- include/linux/blk-mq.h | 6 +++- include/scsi/scsi_tcq.h | 2 +- 4 files changed, 49 insertions(+), 88 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c1b92426c95e..b08788086414 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -392,45 +392,37 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, __blk_mq_put_reserved_tag(tags, tag); } -static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, - unsigned long *free_map, unsigned int off) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, + struct blk_mq_bitmap_tags *bt, unsigned int off, + busy_iter_fn *fn, void *data, bool reserved) { - int i; + struct request *rq; + int bit, i; for (i = 0; i < bt->map_nr; i++) { struct blk_align_bitmap *bm = &bt->map[i]; - int bit = 0; - - do { - bit = find_next_zero_bit(&bm->word, bm->depth, bit); - if (bit >= bm->depth) - break; - __set_bit(bit + off, free_map); - bit++; - } while (1); + for (bit = find_first_bit(&bm->word, bm->depth); + bit < bm->depth; + bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { + rq = blk_mq_tag_to_rq(hctx->tags, off + bit); + if (rq->q == hctx->queue) + fn(hctx, rq, data, reserved); + } off += (1 << bt->bits_per_word); } } -void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, - void (*fn)(void *, unsigned long *), void *data) +void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, + void *priv) { - unsigned long *tag_map; - size_t map_size; - - map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; - tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); - if (!tag_map) - return; + struct blk_mq_tags *tags = hctx->tags; - bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags); if (tags->nr_reserved_tags) - bt_for_each_free(&tags->breserved_tags, tag_map, 0); - - fn(data, tag_map); - kfree(tag_map); + bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); } EXPORT_SYMBOL(blk_mq_tag_busy_iter); diff --git a/block/blk-mq.c b/block/blk-mq.c index 1713686f5c2f..3baebcaf36db 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,58 +525,6 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) } EXPORT_SYMBOL(blk_mq_tag_to_rq); -struct blk_mq_timeout_data { - struct blk_mq_hw_ctx *hctx; - unsigned long *next; - unsigned int *next_set; -}; - -static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) -{ - struct blk_mq_timeout_data *data = __data; - struct blk_mq_hw_ctx *hctx = data->hctx; - unsigned int tag; - - /* It may not be in flight yet (this is where - * the REQ_ATOMIC_STARTED flag comes in). The requests are - * statically allocated, so we know it's always safe to access the - * memory associated with a bit offset into ->rqs[]. 
- */ - tag = 0; - do { - struct request *rq; - - tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag); - if (tag >= hctx->tags->nr_tags) - break; - - rq = blk_mq_tag_to_rq(hctx->tags, tag++); - if (rq->q != hctx->queue) - continue; - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - continue; - - blk_rq_check_expired(rq, data->next, data->next_set); - } while (1); -} - -static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, - unsigned long *next, - unsigned int *next_set) -{ - struct blk_mq_timeout_data data = { - .hctx = hctx, - .next = next, - .next_set = next_set, - }; - - /* - * Ask the tagging code to iterate busy requests, so we can - * check them for timeout. - */ - blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); -} - static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) { struct request_queue *q = rq->q; @@ -598,13 +546,30 @@ static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) return q->mq_ops->timeout(rq); } + +struct blk_mq_timeout_data { + unsigned long next; + unsigned int next_set; +}; + +static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + struct blk_mq_timeout_data *data = priv; + + if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + blk_rq_check_expired(rq, &data->next, &data->next_set); +} -static void blk_mq_rq_timer(unsigned long data) +static void blk_mq_rq_timer(unsigned long priv) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = (struct request_queue *)priv; + struct blk_mq_timeout_data data = { + .next = 0, + .next_set = 0, + }; struct blk_mq_hw_ctx *hctx; - unsigned long next = 0; - int i, next_set = 0; + int i; queue_for_each_hw_ctx(q, hctx, i) { /* @@ -614,12 +579,12 @@ static void blk_mq_rq_timer(unsigned long data) if (!hctx->nr_ctx || !hctx->tags) continue; - blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); } - if (next_set) { - next = blk_rq_timeout(round_jiffies_up(next)); - mod_timer(&q->timeout, next); + if (data.next_set) { + data.next = blk_rq_timeout(round_jiffies_up(data.next)); + mod_timer(&q->timeout, data.next); } else { queue_for_each_hw_ctx(q, hctx, i) blk_mq_tag_idle(hctx); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cb217c16990d..0eb0f642be4b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -86,6 +86,9 @@ typedef int (init_request_fn)(void *, struct request *, unsigned int, typedef void (exit_request_fn)(void *, struct request *, unsigned int, unsigned int); +typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, + bool); + struct blk_mq_ops { /* * Queue request @@ -174,7 +177,8 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); +void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, + void *priv); /* * Driver command data is immediately after the request. 
So subtract request diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h index cdcc90b07ecb..e64583560701 100644 --- a/include/scsi/scsi_tcq.h +++ b/include/scsi/scsi_tcq.h @@ -68,7 +68,7 @@ static inline void scsi_activate_tcq(struct scsi_device *sdev, int depth) return; if (!shost_use_blk_mq(sdev->host) && - blk_queue_tagged(sdev->request_queue)) + !blk_queue_tagged(sdev->request_queue)) blk_queue_init_tags(sdev->request_queue, depth, sdev->host->bqt); -- cgit v1.2.3 From 46f92d42ee37e10970e33891b7b61a342bd97aeb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:12 -0700 Subject: blk-mq: unshared timeout handler Duplicate the (small) timeout handler in blk-mq so that we can pass arguments more easily to the driver timeout handler. This enables the next patch. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 53 +++++++++++++++++++++++++++++++++++++---------------- block/blk-timeout.c | 8 ++------ block/blk.h | 2 -- 3 files changed, 39 insertions(+), 24 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3baebcaf36db..298d6e360661 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,9 +525,15 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) } EXPORT_SYMBOL(blk_mq_tag_to_rq); -static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) +struct blk_mq_timeout_data { + unsigned long next; + unsigned int next_set; +}; + +static void blk_mq_rq_timed_out(struct request *req) { - struct request_queue *q = rq->q; + struct blk_mq_ops *ops = req->q->mq_ops; + enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; /* * We know that complete is set at this point. If STARTED isn't set @@ -538,27 +544,43 @@ static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) * we both flags will get cleared. So check here again, and ignore * a timeout event with a request that isn't active. 
*/ - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - return BLK_EH_NOT_HANDLED; - - if (!q->mq_ops->timeout) - return BLK_EH_RESET_TIMER; + if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) + return; - return q->mq_ops->timeout(rq); + if (ops->timeout) + ret = ops->timeout(req); + + switch (ret) { + case BLK_EH_HANDLED: + __blk_mq_complete_request(req); + break; + case BLK_EH_RESET_TIMER: + blk_add_timer(req); + blk_clear_rq_complete(req); + break; + case BLK_EH_NOT_HANDLED: + break; + default: + printk(KERN_ERR "block: bad eh return: %d\n", ret); + break; + } } -struct blk_mq_timeout_data { - unsigned long next; - unsigned int next_set; -}; - static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; - if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - blk_rq_check_expired(rq, &data->next, &data->next_set); + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + return; + + if (time_after_eq(jiffies, rq->deadline)) { + if (!blk_mark_rq_complete(rq)) + blk_mq_rq_timed_out(rq); + } else if (!data->next_set || time_after(data->next, rq->deadline)) { + data->next = rq->deadline; + data->next_set = 1; + } } static void blk_mq_rq_timer(unsigned long priv) @@ -1781,7 +1803,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) else blk_queue_make_request(q, blk_sq_make_request); - blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); if (set->timeout) blk_queue_rq_timeout(q, set->timeout); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 95a09590ccfd..4d448259e622 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -7,7 +7,6 @@ #include #include "blk.h" -#include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -90,10 +89,7 @@ static void blk_rq_timed_out(struct request *req) switch (ret) { case BLK_EH_HANDLED: /* Can we use req->errors here? */ - if (q->mq_ops) - __blk_mq_complete_request(req); - else - __blk_complete_request(req); + __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: blk_add_timer(req); @@ -113,7 +109,7 @@ static void blk_rq_timed_out(struct request *req) } } -void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, +static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set) { if (time_after_eq(jiffies, rq->deadline)) { diff --git a/block/blk.h b/block/blk.h index 6748c4f8d7a1..e515a285d4c9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -38,8 +38,6 @@ bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); void blk_rq_timed_out_timer(unsigned long data); -void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, - unsigned int *next_set); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); -- cgit v1.2.3 From 0152fb6b57c4fae769ee75ea2ae670f4ff39fba9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 13 Sep 2014 16:40:13 -0700 Subject: blk-mq: pass a reserved argument to the timeout handler Allow blk-mq to pass an argument to the timeout handler to indicate if we're timing out a reserved or regular command. For many drivers those need to be handled different. 
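A driver timeout handler under the new signature can follow the same shape as the scsi_timeout() conversion below; a hypothetical sketch (example_abort() is a made-up helper):

static enum blk_eh_timer_return example_timeout(struct request *req,
                                                bool reserved)
{
        /* Reserved commands are driver-internal; give them more time. */
        if (reserved)
                return BLK_EH_RESET_TIMER;

        return example_abort(req) ? BLK_EH_HANDLED : BLK_EH_NOT_HANDLED;
}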
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- drivers/scsi/scsi_lib.c | 10 +++++++++- include/linux/blk-mq.h | 3 ++- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 298d6e360661..d12f1983d493 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -530,7 +530,7 @@ struct blk_mq_timeout_data { unsigned int next_set; }; -static void blk_mq_rq_timed_out(struct request *req) +static void blk_mq_rq_timed_out(struct request *req, bool reserved) { struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; @@ -548,7 +548,7 @@ static void blk_mq_rq_timed_out(struct request *req) return; if (ops->timeout) - ret = ops->timeout(req); + ret = ops->timeout(req, reserved); switch (ret) { case BLK_EH_HANDLED: @@ -576,7 +576,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) - blk_mq_rq_timed_out(rq); + blk_mq_rq_timed_out(rq, reserved); } else if (!data->next_set || time_after(data->next, rq->deadline)) { data->next = rq->deadline; data->next_set = 1; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 73ce7d27f5c8..86b1156edb82 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1932,6 +1932,14 @@ out: return ret; } +static enum blk_eh_timer_return scsi_timeout(struct request *req, + bool reserved) +{ + if (reserved) + return BLK_EH_RESET_TIMER; + return scsi_times_out(req); +} + static int scsi_init_request(void *data, struct request *rq, unsigned int hctx_idx, unsigned int request_idx, unsigned int numa_node) @@ -2043,7 +2051,7 @@ static struct blk_mq_ops scsi_mq_ops = { .map_queue = blk_mq_map_queue, .queue_rq = scsi_queue_rq, .complete = scsi_softirq_done, - .timeout = scsi_times_out, + .timeout = scsi_timeout, .init_request = scsi_init_request, .exit_request = scsi_exit_request, }; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0eb0f642be4b..325349559fb0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -79,6 +79,7 @@ struct blk_mq_tag_set { typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); +typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); typedef int (init_request_fn)(void *, struct request *, unsigned int, @@ -103,7 +104,7 @@ struct blk_mq_ops { /* * Called on request timeout */ - rq_timed_out_fn *timeout; + timeout_fn *timeout; softirq_done_fn *complete; -- cgit v1.2.3 From 2edd2c740b2918eb0a9a1fe1b69678b903769ec2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 17 Sep 2014 17:47:58 +0800 Subject: blk-mq: remove unnecessary blk_clear_rq_complete() This patch removes two unnecessary blk_clear_rq_complete(), the REQ_ATOM_COMPLETE flag is cleared inside blk_mq_start_request(), so: - The blk_clear_rq_complete() in blk_flush_restore_request() needn't because the request will be freed later, and clearing it here may open a small race window with timeout. - The blk_clear_rq_complete() in blk_mq_requeue_request() isn't necessary too, even though REQ_ATOM_STARTED is cleared in __blk_mq_requeue_request(), in theory it still may cause a small race window with timeout since the two clear_bit() may be reordered. 
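For reference, the ordering this relies on can be sketched as follows (simplified: the real blk_mq_start_request() also sets the deadline, arms the timer and guards the bit operations with test_bit() checks):

void blk_mq_start_request(struct request *rq)
{
        /* ... deadline setup and blk_add_timer() elided ... */

        set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);

        /* COMPLETE may still be set if a requeue raced with a timeout,
         * so it is cleared again whenever a request is (re)started. */
        clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}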
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 -- block/blk-mq.c | 1 - 2 files changed, 3 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-flush.c b/block/blk-flush.c index 698e6926388c..c8e25768f2e1 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -126,8 +126,6 @@ static void blk_flush_restore_request(struct request *rq) /* make @rq a normal request */ rq->cmd_flags &= ~REQ_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; - - blk_clear_rq_complete(rq); } static bool blk_flush_queue_rq(struct request *rq, bool add_front) diff --git a/block/blk-mq.c b/block/blk-mq.c index d12f1983d493..3b277b4eaa95 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -439,7 +439,6 @@ static void __blk_mq_requeue_request(struct request *rq) void blk_mq_requeue_request(struct request *rq) { __blk_mq_requeue_request(rq); - blk_clear_rq_complete(rq); BUG_ON(blk_queued_rq(rq)); blk_mq_add_to_requeue_list(rq, true); -- cgit v1.2.3 From aedcd72f6c283dffefbb8b808ae67bdd2c6eb11a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Sep 2014 08:27:03 -0600 Subject: blk-mq: limit memory consumption if a crash dump is active It's not uncommon for crash dump kernels to be limited to 128MB or something low in that area. This is normally not a problem for devices as we don't use that much memory, but for some shared SCSI setups with huge queue depths, it can potentially fill most of memory with tons of request allocations. blk-mq does scale back when it fails to allocate memory, but it scales back just enough so that blk-mq succeeds. This could still leave the system with not enough memory to make any real progress. Check if we are in a kdump environment and limit the hardware queues and tag depth. Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3b277b4eaa95..c5345a951820 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -1742,6 +1743,16 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!ctx) return ERR_PTR(-ENOMEM); + /* + * If a crashdump is active, then we are potentially in a very + * memory constrained environment. Limit us to 1 queue and + * 64 tags to prevent using too much memory. + */ + if (is_kdump_kernel()) { + set->nr_hw_queues = 1; + set->queue_depth = min(64U, set->queue_depth); + } + hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, set->numa_node); -- cgit v1.2.3 From 90415837659fec54f33584b423dab250eb1e8432 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Sep 2014 10:21:48 -0600 Subject: block: fix blk_abort_request on blk-mq Signed-off-by: Christoph Hellwig Moved blk_mq_rq_timed_out() definition to the private blk-mq.h header. 
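Callers are unchanged by the fix; a driver or transport that knows a command is dead keeps using the same entry point and now gets correct behaviour on both queue types (illustrative snippet only):

        /* e.g. from a transport's terminate path */
        blk_abort_request(rq);  /* ends in blk_mq_rq_timed_out() on blk-mq */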
Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- block/blk-mq.h | 2 ++ block/blk-timeout.c | 6 +++++- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index c5345a951820..a3a80884ed95 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -530,7 +530,7 @@ struct blk_mq_timeout_data { unsigned int next_set; }; -static void blk_mq_rq_timed_out(struct request *req, bool reserved) +void blk_mq_rq_timed_out(struct request *req, bool reserved) { struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; diff --git a/block/blk-mq.h b/block/blk-mq.h index ca4964a6295d..a3c613a9df2d 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -60,6 +60,8 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); +extern void blk_mq_rq_timed_out(struct request *req, bool reserved); + /* * Basic implementation of sparser bitmap, allowing the user to spread * the bits over more cachelines. diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 8bae410b8a1b..56c025894cdf 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -7,6 +7,7 @@ #include #include "blk.h" +#include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -158,7 +159,10 @@ void blk_abort_request(struct request *req) if (blk_mark_rq_complete(req)) return; blk_delete_timer(req); - blk_rq_timed_out(req); + if (req->q->mq_ops) + blk_mq_rq_timed_out(req, false); + else + blk_rq_timed_out(req); } EXPORT_SYMBOL_GPL(blk_abort_request); -- cgit v1.2.3 From 08e98fc6016c890c2f4ffba6decc0ca9d2d5d7f8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:38 +0800 Subject: blk-mq: handle failure path for initializing hctx Failure of initializing one hctx isn't handled, so this patch introduces blk_mq_init_hctx() and its pair to handle it explicitly. Also this patch makes code cleaner. 
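The new helper follows the usual kernel unwind idiom, which the hunks below implement for hctx setup; in the abstract (all names made up):

static int example_init(struct example *e)
{
        if (acquire_a(e))
                goto fail;
        if (acquire_b(e))
                goto undo_a;
        if (acquire_c(e))
                goto undo_b;
        return 0;

 undo_b:
        release_b(e);
 undo_a:
        release_a(e);
 fail:
        return -1;
}

Each label releases exactly what the preceding steps acquired, in reverse order, so a failure at any step leaves nothing behind.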
Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 114 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 69 insertions(+), 45 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index a3a80884ed95..66ef1fb79326 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1509,6 +1509,20 @@ static int blk_mq_hctx_notify(void *data, unsigned long action, return NOTIFY_OK; } +static void blk_mq_exit_hctx(struct request_queue *q, + struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + blk_mq_tag_idle(hctx); + + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); + + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + kfree(hctx->ctxs); + blk_mq_free_bitmap(&hctx->ctx_map); +} + static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { @@ -1518,17 +1532,8 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; - - blk_mq_tag_idle(hctx); - - if (set->ops->exit_hctx) - set->ops->exit_hctx(hctx, i); - - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - kfree(hctx->ctxs); - blk_mq_free_bitmap(&hctx->ctx_map); + blk_mq_exit_hctx(q, set, hctx, i); } - } static void blk_mq_free_hw_queues(struct request_queue *q, @@ -1543,53 +1548,72 @@ static void blk_mq_free_hw_queues(struct request_queue *q, } } -static int blk_mq_init_hw_queues(struct request_queue *q, - struct blk_mq_tag_set *set) +static int blk_mq_init_hctx(struct request_queue *q, + struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; + int node; + + node = hctx->numa_node; + if (node == NUMA_NO_NODE) + node = hctx->numa_node = set->numa_node; + + INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); + spin_lock_init(&hctx->lock); + INIT_LIST_HEAD(&hctx->dispatch); + hctx->queue = q; + hctx->queue_num = hctx_idx; + hctx->flags = set->flags; + hctx->cmd_size = set->cmd_size; + + blk_mq_init_cpu_notifier(&hctx->cpu_notifier, + blk_mq_hctx_notify, hctx); + blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + + hctx->tags = set->tags[hctx_idx]; /* - * Initialize hardware queues + * Allocate space for all possible cpus to avoid allocation at + * runtime */ - queue_for_each_hw_ctx(q, hctx, i) { - int node; + hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), + GFP_KERNEL, node); + if (!hctx->ctxs) + goto unregister_cpu_notifier; - node = hctx->numa_node; - if (node == NUMA_NO_NODE) - node = hctx->numa_node = set->numa_node; + if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + goto free_ctxs; - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); - INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); - spin_lock_init(&hctx->lock); - INIT_LIST_HEAD(&hctx->dispatch); - hctx->queue = q; - hctx->queue_num = i; - hctx->flags = set->flags; - hctx->cmd_size = set->cmd_size; + hctx->nr_ctx = 0; - blk_mq_init_cpu_notifier(&hctx->cpu_notifier, - blk_mq_hctx_notify, hctx); - blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + if (set->ops->init_hctx && + set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) + goto free_bitmap; - hctx->tags = set->tags[i]; + return 0; - /* - * Allocate space for all possible cpus to avoid allocation at - * runtime - */ - hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), - GFP_KERNEL, node); - if (!hctx->ctxs) - 
break; + free_bitmap: + blk_mq_free_bitmap(&hctx->ctx_map); + free_ctxs: + kfree(hctx->ctxs); + unregister_cpu_notifier: + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) - break; + return -1; +} - hctx->nr_ctx = 0; +static int blk_mq_init_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; - if (set->ops->init_hctx && - set->ops->init_hctx(hctx, set->driver_data, i)) + /* + * Initialize hardware queues + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_init_hctx(q, set, hctx, i)) break; } -- cgit v1.2.3 From 1bcb1eada4f11a713cbe586d1b5a5d93a48277cb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:39 +0800 Subject: blk-mq: allocate flush_rq in blk_mq_init_flush() It is reasonable to allocate flush req in blk_mq_init_flush(). Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 11 ++++++++++- block/blk-mq.c | 16 ++++++---------- block/blk-mq.h | 2 +- 3 files changed, 17 insertions(+), 12 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-flush.c b/block/blk-flush.c index c8e25768f2e1..55028a707927 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -472,7 +472,16 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -void blk_mq_init_flush(struct request_queue *q) +int blk_mq_init_flush(struct request_queue *q) { + struct blk_mq_tag_set *set = q->tag_set; + spin_lock_init(&q->mq_flush_lock); + + q->flush_rq = kzalloc(round_up(sizeof(struct request) + + set->cmd_size, cache_line_size()), + GFP_KERNEL); + if (!q->flush_rq) + return -ENOMEM; + return 0; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 66ef1fb79326..78bcf8bfb22a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1848,17 +1848,10 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (set->ops->complete) blk_queue_softirq_done(q, set->ops->complete); - blk_mq_init_flush(q); blk_mq_init_cpu_queues(q, set->nr_hw_queues); - q->flush_rq = kzalloc(round_up(sizeof(struct request) + - set->cmd_size, cache_line_size()), - GFP_KERNEL); - if (!q->flush_rq) - goto err_hw; - if (blk_mq_init_hw_queues(q, set)) - goto err_flush_rq; + goto err_hw; mutex_lock(&all_q_mutex); list_add_tail(&q->all_q_node, &all_q_list); @@ -1866,12 +1859,15 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_add_queue_tag_set(set, q); + if (blk_mq_init_flush(q)) + goto err_hw_queues; + blk_mq_map_swqueue(q); return q; -err_flush_rq: - kfree(q->flush_rq); +err_hw_queues: + blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); err_hw: blk_cleanup_queue(q); err_hctxs: diff --git a/block/blk-mq.h b/block/blk-mq.h index a3c613a9df2d..ecac69c08937 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -27,7 +27,7 @@ struct blk_mq_ctx { void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -void blk_mq_init_flush(struct request_queue *q); +int blk_mq_init_flush(struct request_queue *q); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); void blk_mq_clone_flush_request(struct request *flush_rq, -- cgit v1.2.3 From f355265571440a7db16e784b6edf4e7d26971a03 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:40 +0800 Subject: block: introduce blk_init_flush and its pair These two temporary functions are introduced for holding flush 
initialization and de-initialization, so that we can introduce 'flush queue' easier in the following patch. And once 'flush queue' and its allocation/free functions are ready, they will be removed for sake of code readability. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++--- block/blk-flush.c | 19 ++++++++++++++++++- block/blk-mq.c | 2 +- block/blk-mq.h | 1 - block/blk-sysfs.c | 4 ++-- block/blk.h | 3 +++ 6 files changed, 26 insertions(+), 8 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-core.c b/block/blk-core.c index 6946a4275e6f..0a9d17269957 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -705,8 +705,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); - if (!q->flush_rq) + if (blk_init_flush(q)) return NULL; if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) @@ -742,7 +741,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, return q; fail: - kfree(q->flush_rq); + blk_exit_flush(q); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); diff --git a/block/blk-flush.c b/block/blk-flush.c index 55028a707927..c72ab32fd8eb 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -472,7 +472,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -int blk_mq_init_flush(struct request_queue *q) +static int blk_mq_init_flush(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; @@ -485,3 +485,20 @@ int blk_mq_init_flush(struct request_queue *q) return -ENOMEM; return 0; } + +int blk_init_flush(struct request_queue *q) +{ + if (q->mq_ops) + return blk_mq_init_flush(q); + + q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); + if (!q->flush_rq) + return -ENOMEM; + + return 0; +} + +void blk_exit_flush(struct request_queue *q) +{ + kfree(q->flush_rq); +} diff --git a/block/blk-mq.c b/block/blk-mq.c index 78bcf8bfb22a..2758cdf2de94 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1859,7 +1859,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_add_queue_tag_set(set, q); - if (blk_mq_init_flush(q)) + if (blk_init_flush(q)) goto err_hw_queues; blk_mq_map_swqueue(q); diff --git a/block/blk-mq.h b/block/blk-mq.h index ecac69c08937..d567d5283ffa 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -27,7 +27,6 @@ struct blk_mq_ctx { void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -int blk_mq_init_flush(struct request_queue *q); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); void blk_mq_clone_flush_request(struct request *flush_rq, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 17f5c84ce7bf..949075952119 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -517,11 +517,11 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); + blk_exit_flush(q); + if (q->mq_ops) blk_mq_free_queue(q); - kfree(q->flush_rq); - blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/block/blk.h b/block/blk.h index e515a285d4c9..c6fa3d4c6a89 100644 --- a/block/blk.h +++ b/block/blk.h @@ -22,6 +22,9 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } +int blk_init_flush(struct request_queue *q); +void blk_exit_flush(struct request_queue *q); + int blk_init_rl(struct request_list 
*rl, struct request_queue *q, gfp_t gfp_mask); void blk_exit_rl(struct request_list *rl); -- cgit v1.2.3 From 7c94e1c157a227837b04f02f5edeff8301410ba2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:43 +0800 Subject: block: introduce blk_flush_queue to drive flush machinery This patch introduces 'struct blk_flush_queue' and puts all flush machinery related fields into this structure, so that - flush implementation details aren't exposed to driver - it is easy to convert to per dispatch-queue flush machinery This patch is basically a mechanical replacement. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +- block/blk-flush.c | 109 ++++++++++++++++++++++++++++++------------------- block/blk-mq.c | 10 +++-- block/blk.h | 22 +++++++++- include/linux/blkdev.h | 10 +---- 5 files changed, 99 insertions(+), 56 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-core.c b/block/blk-core.c index 222fe84d6ac4..cfaca8ca6cc4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -390,11 +390,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) * be drained. Check all the queues and counters. */ if (drain_all) { + struct blk_flush_queue *fq = blk_get_flush_queue(q); drain |= !list_empty(&q->queue_head); for (i = 0; i < 2; i++) { drain |= q->nr_rqs[i]; drain |= q->in_flight[i]; - drain |= !list_empty(&q->flush_queue[i]); + if (fq) + drain |= !list_empty(&fq->flush_queue[i]); } } diff --git a/block/blk-flush.c b/block/blk-flush.c index caf44756d329..b01a86d6bf86 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -28,7 +28,7 @@ * * The actual execution of flush is double buffered. Whenever a request * needs to execute PRE or POSTFLUSH, it queues at - * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a + * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a * flush is issued and the pending_idx is toggled. When the flush * completes, all the requests which were pending are proceeded to the next * step. This allows arbitrary merging of different types of FLUSH/FUA @@ -155,7 +155,7 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) * completion and trigger the next step. * * CONTEXT: - * spin_lock_irq(q->queue_lock or q->mq_flush_lock) + * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) * * RETURNS: * %true if requests were added to the dispatch queue, %false otherwise. 
@@ -164,7 +164,8 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, int error) { struct request_queue *q = rq->q; - struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct blk_flush_queue *fq = blk_get_flush_queue(q); + struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; bool queued = false, kicked; BUG_ON(rq->flush.seq & seq); @@ -180,12 +181,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_POSTFLUSH: /* queue for flush */ if (list_empty(pending)) - q->flush_pending_since = jiffies; + fq->flush_pending_since = jiffies; list_move_tail(&rq->flush.list, pending); break; case REQ_FSEQ_DATA: - list_move_tail(&rq->flush.list, &q->flush_data_in_flight); + list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); queued = blk_flush_queue_rq(rq, true); break; @@ -220,17 +221,18 @@ static void flush_end_io(struct request *flush_rq, int error) bool queued = false; struct request *rq, *n; unsigned long flags = 0; + struct blk_flush_queue *fq = blk_get_flush_queue(q); if (q->mq_ops) { - spin_lock_irqsave(&q->mq_flush_lock, flags); + spin_lock_irqsave(&fq->mq_flush_lock, flags); flush_rq->tag = -1; } - running = &q->flush_queue[q->flush_running_idx]; - BUG_ON(q->flush_pending_idx == q->flush_running_idx); + running = &fq->flush_queue[fq->flush_running_idx]; + BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); /* account completion of the flush request */ - q->flush_running_idx ^= 1; + fq->flush_running_idx ^= 1; if (!q->mq_ops) elv_completed_request(q, flush_rq); @@ -254,13 +256,13 @@ static void flush_end_io(struct request *flush_rq, int error) * directly into request_fn may confuse the driver. Always use * kblockd. */ - if (queued || q->flush_queue_delayed) { + if (queued || fq->flush_queue_delayed) { WARN_ON(q->mq_ops); blk_run_queue_async(q); } - q->flush_queue_delayed = 0; + fq->flush_queue_delayed = 0; if (q->mq_ops) - spin_unlock_irqrestore(&q->mq_flush_lock, flags); + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); } /** @@ -271,33 +273,34 @@ static void flush_end_io(struct request *flush_rq, int error) * Please read the comment at the top of this file for more info. * * CONTEXT: - * spin_lock_irq(q->queue_lock or q->mq_flush_lock) + * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) * * RETURNS: * %true if flush was issued, %false otherwise. */ static bool blk_kick_flush(struct request_queue *q) { - struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct blk_flush_queue *fq = blk_get_flush_queue(q); + struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = list_first_entry(pending, struct request, flush.list); - struct request *flush_rq = q->flush_rq; + struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ - if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) + if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return false; /* C2 and C3 */ - if (!list_empty(&q->flush_data_in_flight) && + if (!list_empty(&fq->flush_data_in_flight) && time_before(jiffies, - q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) + fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return false; /* * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. 
*/ - q->flush_pending_idx ^= 1; + fq->flush_pending_idx ^= 1; blk_rq_init(q, flush_rq); if (q->mq_ops) @@ -329,6 +332,7 @@ static void mq_flush_data_end_io(struct request *rq, int error) struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; unsigned long flags; + struct blk_flush_queue *fq = blk_get_flush_queue(q); ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); @@ -337,10 +341,10 @@ static void mq_flush_data_end_io(struct request *rq, int error) * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ - spin_lock_irqsave(&q->mq_flush_lock, flags); + spin_lock_irqsave(&fq->mq_flush_lock, flags); if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) blk_mq_run_hw_queue(hctx, true); - spin_unlock_irqrestore(&q->mq_flush_lock, flags); + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); } /** @@ -408,11 +412,13 @@ void blk_insert_flush(struct request *rq) rq->cmd_flags |= REQ_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ if (q->mq_ops) { + struct blk_flush_queue *fq = blk_get_flush_queue(q); + rq->end_io = mq_flush_data_end_io; - spin_lock_irq(&q->mq_flush_lock); + spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); - spin_unlock_irq(&q->mq_flush_lock); + spin_unlock_irq(&fq->mq_flush_lock); return; } rq->end_io = flush_data_end_io; @@ -473,31 +479,52 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -static int blk_mq_init_flush(struct request_queue *q) +static struct blk_flush_queue *blk_alloc_flush_queue( + struct request_queue *q) { - struct blk_mq_tag_set *set = q->tag_set; + struct blk_flush_queue *fq; + int rq_sz = sizeof(struct request); - spin_lock_init(&q->mq_flush_lock); + fq = kzalloc(sizeof(*fq), GFP_KERNEL); + if (!fq) + goto fail; - q->flush_rq = kzalloc(round_up(sizeof(struct request) + - set->cmd_size, cache_line_size()), - GFP_KERNEL); - if (!q->flush_rq) - return -ENOMEM; - return 0; + if (q->mq_ops) { + spin_lock_init(&fq->mq_flush_lock); + rq_sz = round_up(rq_sz + q->tag_set->cmd_size, + cache_line_size()); + } + + fq->flush_rq = kzalloc(rq_sz, GFP_KERNEL); + if (!fq->flush_rq) + goto fail_rq; + + INIT_LIST_HEAD(&fq->flush_queue[0]); + INIT_LIST_HEAD(&fq->flush_queue[1]); + INIT_LIST_HEAD(&fq->flush_data_in_flight); + + return fq; + + fail_rq: + kfree(fq); + fail: + return NULL; } -int blk_init_flush(struct request_queue *q) +static void blk_free_flush_queue(struct blk_flush_queue *fq) { - INIT_LIST_HEAD(&q->flush_queue[0]); - INIT_LIST_HEAD(&q->flush_queue[1]); - INIT_LIST_HEAD(&q->flush_data_in_flight); + /* bio based request queue hasn't flush queue */ + if (!fq) + return; - if (q->mq_ops) - return blk_mq_init_flush(q); + kfree(fq->flush_rq); + kfree(fq); +} - q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); - if (!q->flush_rq) +int blk_init_flush(struct request_queue *q) +{ + q->fq = blk_alloc_flush_queue(q); + if (!q->fq) return -ENOMEM; return 0; @@ -505,5 +532,5 @@ int blk_init_flush(struct request_queue *q) void blk_exit_flush(struct request_queue *q) { - kfree(q->flush_rq); + blk_free_flush_queue(q->fq); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 2758cdf2de94..d39e8a5eaeaa 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -508,20 +508,22 @@ void blk_mq_kick_requeue_list(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_kick_requeue_list); -static inline bool is_flush_request(struct request *rq, unsigned int tag) +static inline bool is_flush_request(struct request *rq, + 
				   struct blk_flush_queue *fq, unsigned int tag)
 {
 	return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
-			rq->q->flush_rq->tag == tag);
+			fq->flush_rq->tag == tag);
 }
 
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	struct request *rq = tags->rqs[tag];
+	struct blk_flush_queue *fq = blk_get_flush_queue(rq->q);
 
-	if (!is_flush_request(rq, tag))
+	if (!is_flush_request(rq, fq, tag))
 		return rq;
 
-	return rq->q->flush_rq;
+	return fq->flush_rq;
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
diff --git a/block/blk.h b/block/blk.h
index c6fa3d4c6a89..833c4ac6c4eb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -12,11 +12,28 @@
 /* Max future timer expiry for timeouts */
 #define BLK_MAX_TIMEOUT		(5 * HZ)
 
+struct blk_flush_queue {
+	unsigned int		flush_queue_delayed:1;
+	unsigned int		flush_pending_idx:1;
+	unsigned int		flush_running_idx:1;
+	unsigned long		flush_pending_since;
+	struct list_head	flush_queue[2];
+	struct list_head	flush_data_in_flight;
+	struct request		*flush_rq;
+	spinlock_t		mq_flush_lock;
+};
+
 extern struct kmem_cache *blk_requestq_cachep;
 extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
+static inline struct blk_flush_queue *blk_get_flush_queue(
+	struct request_queue *q)
+{
+	return q->fq;
+}
+
 static inline void __blk_get_queue(struct request_queue *q)
 {
 	kobject_get(&q->kobj);
 }
@@ -89,6 +106,7 @@ void blk_insert_flush(struct request *rq);
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
+	struct blk_flush_queue *fq = blk_get_flush_queue(q);
 
 	while (1) {
 		if (!list_empty(&q->queue_head)) {
@@ -111,9 +129,9 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 		 * should be restarted later. Please see flush_end_io() for
 		 * details.
 		 */
-		if (q->flush_pending_idx != q->flush_running_idx &&
+		if (fq->flush_pending_idx != fq->flush_running_idx &&
 		    !queue_flush_queueable(q)) {
-			q->flush_queue_delayed = 1;
+			fq->flush_queue_delayed = 1;
 			return NULL;
 		}
 		if (unlikely(blk_queue_bypass(q)) ||
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e267bf0db559..49f3461e4272 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -36,6 +36,7 @@ struct request;
 struct sg_io_hdr;
 struct bsg_job;
 struct blkcg_gq;
+struct blk_flush_queue;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -455,14 +456,7 @@ struct request_queue {
 	 */
 	unsigned int		flush_flags;
 	unsigned int		flush_not_queueable:1;
-	unsigned int		flush_queue_delayed:1;
-	unsigned int		flush_pending_idx:1;
-	unsigned int		flush_running_idx:1;
-	unsigned long		flush_pending_since;
-	struct list_head	flush_queue[2];
-	struct list_head	flush_data_in_flight;
-	struct request		*flush_rq;
-	spinlock_t		mq_flush_lock;
+	struct blk_flush_queue	*fq;
 
 	struct list_head	requeue_list;
 	spinlock_t		requeue_lock;
-- cgit v1.2.3

From ba483388e3058b3e412632a84e6bf1f134beaf3d Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 25 Sep 2014 23:23:44 +0800
Subject: block: remove blk_init_flush() and its pair

The two helpers have served their purpose; just call
blk_alloc_flush_queue() and blk_free_flush_queue() directly.
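For illustration only (a sketch, not part of this patch; the real call
sites are blk_init_allocated_queue() and blk_mq_init_queue(), and the
'fail' label is hypothetical), the calling pattern left behind is:

	/* queue setup -- was blk_init_flush(q) */
	q->fq = blk_alloc_flush_queue(q);
	if (!q->fq)
		goto fail;	/* callers return NULL / -ENOMEM here */

	/* queue teardown -- was blk_exit_flush(q) */
	blk_free_flush_queue(q->fq);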
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- block/blk-flush.c | 19 ++----------------- block/blk-mq.c | 3 ++- block/blk-sysfs.c | 2 +- block/blk.h | 4 ++-- 5 files changed, 10 insertions(+), 23 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-core.c b/block/blk-core.c index cfaca8ca6cc4..dba0a8350807 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -704,7 +704,8 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - if (blk_init_flush(q)) + q->fq = blk_alloc_flush_queue(q); + if (!q->fq) return NULL; if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) @@ -740,7 +741,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, return q; fail: - blk_exit_flush(q); + blk_free_flush_queue(q->fq); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); diff --git a/block/blk-flush.c b/block/blk-flush.c index b01a86d6bf86..d66cbf2b2bc8 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -479,8 +479,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -static struct blk_flush_queue *blk_alloc_flush_queue( - struct request_queue *q) +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); @@ -511,7 +510,7 @@ static struct blk_flush_queue *blk_alloc_flush_queue( return NULL; } -static void blk_free_flush_queue(struct blk_flush_queue *fq) +void blk_free_flush_queue(struct blk_flush_queue *fq) { /* bio based request queue hasn't flush queue */ if (!fq) @@ -520,17 +519,3 @@ static void blk_free_flush_queue(struct blk_flush_queue *fq) kfree(fq->flush_rq); kfree(fq); } - -int blk_init_flush(struct request_queue *q) -{ - q->fq = blk_alloc_flush_queue(q); - if (!q->fq) - return -ENOMEM; - - return 0; -} - -void blk_exit_flush(struct request_queue *q) -{ - blk_free_flush_queue(q->fq); -} diff --git a/block/blk-mq.c b/block/blk-mq.c index d39e8a5eaeaa..59ca79634cb9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1861,7 +1861,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_add_queue_tag_set(set, q); - if (blk_init_flush(q)) + q->fq = blk_alloc_flush_queue(q); + if (!q->fq) goto err_hw_queues; blk_mq_map_swqueue(q); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 949075952119..718cffc4c678 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -517,7 +517,7 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - blk_exit_flush(q); + blk_free_flush_queue(q->fq); if (q->mq_ops) blk_mq_free_queue(q); diff --git a/block/blk.h b/block/blk.h index 833c4ac6c4eb..9eaa6e91b13f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -39,8 +39,8 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } -int blk_init_flush(struct request_queue *q); -void blk_exit_flush(struct request_queue *q); +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q); +void blk_free_flush_queue(struct blk_flush_queue *fq); int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); -- cgit v1.2.3 From e97c293cdf77263abdc021de280516e0017afc84 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:46 +0800 Subject: block: introduce 'blk_mq_ctx' parameter to blk_get_flush_queue This patch adds 'blk_mq_ctx' parameter to blk_get_flush_queue(), so that this function can find the corresponding blk_flush_queue bound with current mq 
context, since the flush queue will become per-hw-queue.

For a legacy queue the parameter can simply be 'NULL'. For the
multiqueue case, the parameter should be set to the context from which
the related request originated; with this context info, the hw queue
and its flush queue can be found easily.

Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-core.c  |  2 +-
 block/blk-flush.c | 11 +++++------
 block/blk-mq.c    |  3 ++-
 block/blk.h       |  4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index dba0a8350807..b1dd4e086740 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -390,7 +390,7 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
 	 * be drained.  Check all the queues and counters.
 	 */
 	if (drain_all) {
-		struct blk_flush_queue *fq = blk_get_flush_queue(q);
+		struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 		drain |= !list_empty(&q->queue_head);
 		for (i = 0; i < 2; i++) {
 			drain |= q->nr_rqs[i];
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 9bc5b4f35c23..004d95e4098e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -223,7 +223,7 @@ static void flush_end_io(struct request *flush_rq, int error)
 	bool queued = false;
 	struct request *rq, *n;
 	unsigned long flags = 0;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
 
 	if (q->mq_ops) {
 		spin_lock_irqsave(&fq->mq_flush_lock, flags);
@@ -319,7 +319,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 static void flush_data_end_io(struct request *rq, int error)
 {
 	struct request_queue *q = rq->q;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 
 	/*
 	 * After populating an empty queue, kick it to avoid stall.  Read
@@ -333,11 +333,10 @@ static void mq_flush_data_end_io(struct request *rq, int error)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	unsigned long flags;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
 
-	ctx = rq->mq_ctx;
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
 	/*
@@ -367,7 +366,7 @@ void blk_insert_flush(struct request *rq)
 	struct request_queue *q = rq->q;
 	unsigned int fflags = q->flush_flags;	/* may change, cache */
 	unsigned int policy = blk_flush_policy(fflags, rq);
-	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
 
 	/*
 	 * @policy now records what operations need to be done.  Adjust
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 59ca79634cb9..53b6def12fc4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -518,7 +518,8 @@ static inline bool is_flush_request(struct request *rq,
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	struct request *rq = tags->rqs[tag];
-	struct blk_flush_queue *fq = blk_get_flush_queue(rq->q);
+	/* mq_ctx of flush rq is always cloned from the corresponding req */
+	struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
 
 	if (!is_flush_request(rq, fq, tag))
 		return rq;
diff --git a/block/blk.h b/block/blk.h
index 9eaa6e91b13f..7ecdd8517e69 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -29,7 +29,7 @@ extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
 static inline struct blk_flush_queue *blk_get_flush_queue(
-	struct request_queue *q)
+	struct request_queue *q, struct blk_mq_ctx *ctx)
 {
 	return q->fq;
 }
@@ -106,7 +106,7 @@ void blk_insert_flush(struct request *rq);
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 
 	while (1) {
 		if (!list_empty(&q->queue_head)) {
-- cgit v1.2.3

From f70ced09170761acb69840cafaace4abc72cba4b Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 25 Sep 2014 23:23:47 +0800
Subject: blk-mq: support per-dispatch_queue flush machinery

This patch supports running a single flush machinery for each blk-mq
dispatch queue, so that:

- the current init_request and exit_request callbacks can cover flush
  requests too, fixing the buggy copying way of initializing the flush
  request's pdu

- flushing performance gets improved in the multi hw-queue case

In a fio sync write test over virtio-blk (4 hw queues, ioengine=sync,
iodepth=64, numjobs=4, bs=4K), throughput increases significantly in my
test environment:

- throughput: +70% in case of virtio-blk over null_blk
- throughput: +30% in case of virtio-blk over SSD image

The multi-virtqueue feature isn't merged into QEMU yet; patches for the
feature can be found in the tree below:

	git://kernel.ubuntu.com/ming/qemu.git v2.1.0-mq.4

Simply passing 'num_queues=4 vectors=5' should be enough to enable the
multi-queue (quad queue) feature for QEMU virtio-blk.
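As a hedged sketch of what this enables (hypothetical driver; the my_*
names are assumptions, not part of this patch), a driver's init_request
callback can now set up the flush request's pdu like any other request,
telling flush requests apart by the tag convention documented in the
blk-mq.h hunk below:

	static int my_init_request(void *data, struct request *rq,
				   unsigned int hctx_idx,
				   unsigned int request_idx,
				   unsigned int numa_node)
	{
		struct my_driver *drv = data;		/* assumed type */
		struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);

		/* request_idx >= queue_depth marks a per-hctx flush rq */
		cmd->is_flush = (request_idx >= drv->tag_set.queue_depth);
		return 0;
	}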
Suggested-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-flush.c | 21 ++++++++++++++------- block/blk-mq.c | 50 ++++++++++++++++++++++++-------------------------- block/blk-sysfs.c | 4 ++-- block/blk.h | 16 +++++++++++++--- include/linux/blk-mq.h | 6 ++++++ 6 files changed, 60 insertions(+), 39 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-core.c b/block/blk-core.c index b1dd4e086740..e1c2775c7597 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -704,7 +704,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - q->fq = blk_alloc_flush_queue(q); + q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0); if (!q->fq) return NULL; diff --git a/block/blk-flush.c b/block/blk-flush.c index 004d95e4098e..20badd7b9d1b 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -305,8 +305,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) fq->flush_pending_idx ^= 1; blk_rq_init(q, flush_rq); - if (q->mq_ops) - blk_mq_clone_flush_request(flush_rq, first_rq); + + /* + * Borrow tag from the first request since they can't + * be in flight at the same time. + */ + if (q->mq_ops) { + flush_rq->mq_ctx = first_rq->mq_ctx; + flush_rq->tag = first_rq->tag; + } flush_rq->cmd_type = REQ_TYPE_FS; flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; @@ -480,22 +487,22 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q) +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, int cmd_size) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); - fq = kzalloc(sizeof(*fq), GFP_KERNEL); + fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node); if (!fq) goto fail; if (q->mq_ops) { spin_lock_init(&fq->mq_flush_lock); - rq_sz = round_up(rq_sz + q->tag_set->cmd_size, - cache_line_size()); + rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); } - fq->flush_rq = kzalloc(rq_sz, GFP_KERNEL); + fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node); if (!fq->flush_rq) goto fail_rq; diff --git a/block/blk-mq.c b/block/blk-mq.c index 53b6def12fc4..4e7a31466139 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -281,26 +281,6 @@ void blk_mq_free_request(struct request *rq) __blk_mq_free_request(hctx, ctx, rq); } -/* - * Clone all relevant state from a request that has been put on hold in - * the flush state machine into the preallocated flush request that hangs - * off the request queue. - * - * For a driver the flush request should be invisible, that's why we are - * impersonating the original request here. 
- */ -void blk_mq_clone_flush_request(struct request *flush_rq, - struct request *orig_rq) -{ - struct blk_mq_hw_ctx *hctx = - orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu); - - flush_rq->mq_ctx = orig_rq->mq_ctx; - flush_rq->tag = orig_rq->tag; - memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq), - hctx->cmd_size); -} - inline void __blk_mq_end_request(struct request *rq, int error) { blk_account_io_done(rq); @@ -1516,12 +1496,20 @@ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + unsigned flush_start_tag = set->queue_depth; + blk_mq_tag_idle(hctx); + if (set->ops->exit_request) + set->ops->exit_request(set->driver_data, + hctx->fq->flush_rq, hctx_idx, + flush_start_tag + hctx_idx); + if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + blk_free_flush_queue(hctx->fq); kfree(hctx->ctxs); blk_mq_free_bitmap(&hctx->ctx_map); } @@ -1556,6 +1544,7 @@ static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { int node; + unsigned flush_start_tag = set->queue_depth; node = hctx->numa_node; if (node == NUMA_NO_NODE) @@ -1594,8 +1583,23 @@ static int blk_mq_init_hctx(struct request_queue *q, set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; + hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); + if (!hctx->fq) + goto exit_hctx; + + if (set->ops->init_request && + set->ops->init_request(set->driver_data, + hctx->fq->flush_rq, hctx_idx, + flush_start_tag + hctx_idx, node)) + goto free_fq; + return 0; + free_fq: + kfree(hctx->fq); + exit_hctx: + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: blk_mq_free_bitmap(&hctx->ctx_map); free_ctxs: @@ -1862,16 +1866,10 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_add_queue_tag_set(set, q); - q->fq = blk_alloc_flush_queue(q); - if (!q->fq) - goto err_hw_queues; - blk_mq_map_swqueue(q); return q; -err_hw_queues: - blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); err_hw: blk_cleanup_queue(q); err_hctxs: diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 718cffc4c678..e8f38a36c625 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -517,10 +517,10 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - blk_free_flush_queue(q->fq); - if (q->mq_ops) blk_mq_free_queue(q); + else + blk_free_flush_queue(q->fq); blk_trace_shutdown(q); diff --git a/block/blk.h b/block/blk.h index 7ecdd8517e69..43b036185712 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,6 +2,8 @@ #define BLK_INTERNAL_H #include +#include +#include "blk-mq.h" /* Amount of time in which a process may batch requests */ #define BLK_BATCH_TIME (HZ/50UL) @@ -31,7 +33,14 @@ extern struct ida blk_queue_ida; static inline struct blk_flush_queue *blk_get_flush_queue( struct request_queue *q, struct blk_mq_ctx *ctx) { - return q->fq; + struct blk_mq_hw_ctx *hctx; + + if (!q->mq_ops) + return q->fq; + + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + return hctx->fq; } static inline void __blk_get_queue(struct request_queue *q) @@ -39,8 +48,9 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q); -void blk_free_flush_queue(struct blk_flush_queue *fq); +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, 
int cmd_size);
+void blk_free_flush_queue(struct blk_flush_queue *q);
 
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 325349559fb0..02c5d950f444 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -4,6 +4,7 @@
 #include <linux/blkdev.h>
 
 struct blk_mq_tags;
+struct blk_flush_queue;
 
 struct blk_mq_cpu_notifier {
 	struct list_head list;
@@ -34,6 +35,7 @@ struct blk_mq_hw_ctx {
 
 	struct request_queue	*queue;
 	unsigned int		queue_num;
+	struct blk_flush_queue	*fq;
 
 	void			*driver_data;
 
@@ -119,6 +121,10 @@ struct blk_mq_ops {
 	/*
 	 * Called for every command allocated by the block layer to allow
 	 * the driver to set up driver specific data.
+	 *
+	 * Tag greater than or equal to queue_depth is for setting up
+	 * flush request.
+	 *
 	 * Ditto for exit/teardown.
 	 */
 	init_request_fn		*init_request;
-- cgit v1.2.3

From a86073e48ae85c9b50127facb0cc45bbd35972a1 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 13 Oct 2014 15:41:54 -0600
Subject: blk-mq: allocate cpumask on the home node

All other allocations are done on the specific node; somehow the cpumask
for hw queue runs was missed. Fix that by using zalloc_cpumask_var_node()
in blk_mq_init_queue().

Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e7a31466139..79aa11b3efa5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1802,7 +1802,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 		if (!hctxs[i])
 			goto err_hctxs;
 
-		if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
+		if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
+						node))
 			goto err_hctxs;
 
 		atomic_set(&hctxs[i]->nr_active, 0);
-- cgit v1.2.3
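As a closing illustration (a minimal sketch, not taken from any patch
above; 'node' stands for the hctx home node computed in
blk_mq_init_queue()), the NUMA-aware allocation pattern used here is:

	cpumask_var_t mask;

	/* zeroed cpumask placed on the hw queue's home node */
	if (!zalloc_cpumask_var_node(&mask, GFP_KERNEL, node))
		return -ENOMEM;	/* can only fail with CONFIG_CPUMASK_OFFSTACK */

	/* ... use mask ... */
	free_cpumask_var(mask);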