From 1661f2e21c8bbf922dcb76faf2126a33ffe4cddb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jan 2017 11:19:31 +0100 Subject: floppy: replace wrong kmalloc(GFP_USER) with GFP_KERNEL The raw_cmd_copyin() function does a kmalloc() with GFP_USER, although the allocated structure is obviously not mapped to userspace, just copied from/to. In this case GFP_KERNEL is more appropriate, so let's use it, although in the current implementation this does not manifest as any error. Reported-by: Matthew Wilcox Signed-off-by: Vlastimil Babka Signed-off-by: Jiri Kosina --- drivers/block/floppy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index a391a3cfb3fe..184887af4b9f 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3119,7 +3119,7 @@ static int raw_cmd_copyin(int cmd, void __user *param, *rcmd = NULL; loop: - ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER); + ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_KERNEL); if (!ptr) return -ENOMEM; *rcmd = ptr; -- cgit v1.2.3 From 729204ef49ec00b788ce23deb9eb922a5769f55d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 17 Dec 2016 18:49:09 +0800 Subject: block: relax check on sg gap If the last bvec of the 1st bio and the 1st bvec of the next bio are physically contigious, and the latter can be merged to last segment of the 1st bio, we should think they don't violate sg gap(or virt boundary) limit. Both Vitaly and Dexuan reported lots of unmergeable small bios are observed when running mkfs on Hyper-V virtual storage, and performance becomes quite low. This patch fixes that performance issue. The same issue should exist on NVMe, since it sets virt boundary too. Reported-by: Vitaly Kuznetsov Reported-by: Dexuan Cui Tested-by: Dexuan Cui Cc: Keith Busch Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83695641bd5e..b20da8dfa7ec 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1607,6 +1607,25 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } +/* + * Check if the two bvecs from two bios can be merged to one segment. + * If yes, no need to check gap between the two bios since the 1st bio + * and the 1st bvec in the 2nd bio can be handled in one segment. + */ +static inline bool bios_segs_mergeable(struct request_queue *q, + struct bio *prev, struct bio_vec *prev_last_bv, + struct bio_vec *next_first_bv) +{ + if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv)) + return false; + if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv)) + return false; + if (prev->bi_seg_back_size + next_first_bv->bv_len > + queue_max_segment_size(q)) + return false; + return true; +} + static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, struct bio *next) { @@ -1616,7 +1635,8 @@ static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, bio_get_last_bvec(prev, &pb); bio_get_first_bvec(next, &nb); - return __bvec_gap_to_prev(q, &pb, nb.bv_offset); + if (!bios_segs_mergeable(q, prev, &pb, &nb)) + return __bvec_gap_to_prev(q, &pb, nb.bv_offset); } return false; -- cgit v1.2.3 From f8a5b12247fe18f7fed801ad262a7ab190e1f848 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Dec 2016 09:24:51 -0700 Subject: blk-mq: make mq_ops a const pointer We never change it, make that clear. Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche --- block/blk-mq.c | 2 +- include/linux/blk-mq.h | 2 +- include/linux/blkdev.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a8e67a155d04..79e1cb0f7b15 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -639,7 +639,7 @@ struct blk_mq_timeout_data { void blk_mq_rq_timed_out(struct request *req, bool reserved) { - struct blk_mq_ops *ops = req->q->mq_ops; + const struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; /* diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4a2ab5d99ff7..afc81d77e471 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -60,7 +60,7 @@ struct blk_mq_hw_ctx { struct blk_mq_tag_set { unsigned int *mq_map; - struct blk_mq_ops *ops; + const struct blk_mq_ops *ops; unsigned int nr_hw_queues; unsigned int queue_depth; /* max hw supported */ unsigned int reserved_tags; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b20da8dfa7ec..2e99d659b0f1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -407,7 +407,7 @@ struct request_queue { dma_drain_needed_fn *dma_drain_needed; lld_busy_fn *lld_busy_fn; - struct blk_mq_ops *mq_ops; + const struct blk_mq_ops *mq_ops; unsigned int *mq_map; -- cgit v1.2.3 From 1e668f4e7921e8c82838cf5f95ff4a2d5c852efc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 Jan 2017 15:41:40 -0500 Subject: MAINTAINERS: Update maintainer entry for NBD The old maintainers email is bouncing and I've rewritten most of this driver in the recent months. Also add linux-block to the mailinglist and remove the old tree, I will send patches through the linux-block tree. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5f0420a0da5b..f481713fe44c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8599,10 +8599,10 @@ S: Maintained F: drivers/net/ethernet/netronome/ NETWORK BLOCK DEVICE (NBD) -M: Markus Pargmann +M: Josef Bacik S: Maintained +L: linux-block@vger.kernel.org L: nbd-general@lists.sourceforge.net -T: git git://git.pengutronix.de/git/mpa/linux-nbd.git F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c F: include/uapi/linux/nbd.h -- cgit v1.2.3 From c5082b70adfe8e1ea1cf4a8eff92c9f260e364d2 Mon Sep 17 00:00:00 2001 From: Alden Tondettar Date: Sun, 15 Jan 2017 15:31:56 -0700 Subject: partitions/efi: Fix integer overflow in GPT size calculation If a GUID Partition Table claims to have more than 2**25 entries, the calculation of the partition table size in alloc_read_gpt_entries() will overflow a 32-bit integer and not enough space will be allocated for the table. Nothing seems to get written out of bounds, but later efi_partition() will read up to 32768 bytes from a 128 byte buffer, possibly OOPSing or exposing information to /proc/partitions and uevents. The problem exists on both 64-bit and 32-bit platforms. Fix the overflow and also print a meaningful debug message if the table size is too large. Signed-off-by: Alden Tondettar Acked-by: Ard Biesheuvel Signed-off-by: Jens Axboe --- block/partitions/efi.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index bcd86e5cd546..39f70d968754 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -293,7 +293,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, if (!gpt) return NULL; - count = le32_to_cpu(gpt->num_partition_entries) * + count = (size_t)le32_to_cpu(gpt->num_partition_entries) * le32_to_cpu(gpt->sizeof_partition_entry); if (!count) return NULL; @@ -352,7 +352,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; - u64 lastlba; + u64 lastlba, pt_size; if (!ptes) return 0; @@ -434,13 +434,20 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, goto fail; } + /* Sanity check partition table size */ + pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) * + le32_to_cpu((*gpt)->sizeof_partition_entry); + if (pt_size > KMALLOC_MAX_SIZE) { + pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n", + (unsigned long long)pt_size, KMALLOC_MAX_SIZE); + goto fail; + } + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ - crc = efi_crc32((const unsigned char *) (*ptes), - le32_to_cpu((*gpt)->num_partition_entries) * - le32_to_cpu((*gpt)->sizeof_partition_entry)); + crc = efi_crc32((const unsigned char *) (*ptes), pt_size); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { pr_debug("GUID Partition Entry Array CRC check failed.\n"); -- cgit v1.2.3 From c51ca6cf545bc51ad38bd50816bde37c647d608d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 10 Dec 2016 15:13:59 -0700 Subject: block: move existing elevator ops to union Prep patch for adding MQ ops as well, since doing anon unions with named initializers doesn't work on older compilers. Signed-off-by: Jens Axboe Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- block/blk-ioc.c | 8 +++---- block/blk-merge.c | 4 ++-- block/blk.h | 10 ++++---- block/cfq-iosched.c | 2 +- block/deadline-iosched.c | 2 +- block/elevator.c | 60 ++++++++++++++++++++++++------------------------ block/noop-iosched.c | 2 +- include/linux/elevator.h | 4 +++- 8 files changed, 47 insertions(+), 45 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 381cb50a673c..ab372092a57d 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -43,8 +43,8 @@ static void ioc_exit_icq(struct io_cq *icq) if (icq->flags & ICQ_EXITED) return; - if (et->ops.elevator_exit_icq_fn) - et->ops.elevator_exit_icq_fn(icq); + if (et->ops.sq.elevator_exit_icq_fn) + et->ops.sq.elevator_exit_icq_fn(icq); icq->flags |= ICQ_EXITED; } @@ -383,8 +383,8 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); - if (et->ops.elevator_init_icq_fn) - et->ops.elevator_init_icq_fn(icq); + if (et->ops.sq.elevator_init_icq_fn) + et->ops.sq.elevator_init_icq_fn(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(ioc, q); diff --git a/block/blk-merge.c b/block/blk-merge.c index 182398cb1524..480570b691dc 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -763,8 +763,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_allow_rq_merge_fn) - if (!e->type->ops.elevator_allow_rq_merge_fn(q, rq, next)) + if (e->type->ops.sq.elevator_allow_rq_merge_fn) + if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next)) return 0; return attempt_merge(q, rq, next); diff --git a/block/blk.h b/block/blk.h index 041185e5f129..f46c0ac8ae3d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -167,7 +167,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) return NULL; } if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) + !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) return NULL; } } @@ -176,16 +176,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_activate_req_fn) - e->type->ops.elevator_activate_req_fn(q, rq); + if (e->type->ops.sq.elevator_activate_req_fn) + e->type->ops.sq.elevator_activate_req_fn(q, rq); } static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_deactivate_req_fn) - e->type->ops.elevator_deactivate_req_fn(q, rq); + if (e->type->ops.sq.elevator_deactivate_req_fn) + e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } #ifdef CONFIG_FAIL_IO_TIMEOUT diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c73a6fcaeb9d..37aeb20fa454 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4837,7 +4837,7 @@ static struct elv_fs_entry cfq_attrs[] = { }; static struct elevator_type iosched_cfq = { - .ops = { + .ops.sq = { .elevator_merge_fn = cfq_merge, .elevator_merged_fn = cfq_merged_request, .elevator_merge_req_fn = cfq_merged_requests, diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 55e0bb6d7da7..05fc0ea25a98 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -439,7 +439,7 @@ static struct elv_fs_entry deadline_attrs[] = { }; static struct elevator_type iosched_deadline = { - .ops = { + .ops.sq = { .elevator_merge_fn = deadline_merge, .elevator_merged_fn = deadline_merged_request, .elevator_merge_req_fn = deadline_merged_requests, diff --git a/block/elevator.c b/block/elevator.c index 40f0c04e5ad3..022a26830297 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -58,8 +58,8 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_allow_bio_merge_fn) - return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio); + if (e->type->ops.sq.elevator_allow_bio_merge_fn) + return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio); return 1; } @@ -224,7 +224,7 @@ int elevator_init(struct request_queue *q, char *name) } } - err = e->ops.elevator_init_fn(q, e); + err = e->ops.sq.elevator_init_fn(q, e); if (err) elevator_put(e); return err; @@ -234,8 +234,8 @@ EXPORT_SYMBOL(elevator_init); void elevator_exit(struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); - if (e->type->ops.elevator_exit_fn) - e->type->ops.elevator_exit_fn(e); + if (e->type->ops.sq.elevator_exit_fn) + e->type->ops.sq.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); @@ -443,8 +443,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) return ELEVATOR_BACK_MERGE; } - if (e->type->ops.elevator_merge_fn) - return e->type->ops.elevator_merge_fn(q, req, bio); + if (e->type->ops.sq.elevator_merge_fn) + return e->type->ops.sq.elevator_merge_fn(q, req, bio); return ELEVATOR_NO_MERGE; } @@ -495,8 +495,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_merged_fn) - e->type->ops.elevator_merged_fn(q, rq, type); + if (e->type->ops.sq.elevator_merged_fn) + e->type->ops.sq.elevator_merged_fn(q, rq, type); if (type == ELEVATOR_BACK_MERGE) elv_rqhash_reposition(q, rq); @@ -510,8 +510,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct elevator_queue *e = q->elevator; const int next_sorted = next->rq_flags & RQF_SORTED; - if (next_sorted && e->type->ops.elevator_merge_req_fn) - e->type->ops.elevator_merge_req_fn(q, rq, next); + if (next_sorted && e->type->ops.sq.elevator_merge_req_fn) + e->type->ops.sq.elevator_merge_req_fn(q, rq, next); elv_rqhash_reposition(q, rq); @@ -528,8 +528,8 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_bio_merged_fn) - e->type->ops.elevator_bio_merged_fn(q, rq, bio); + if (e->type->ops.sq.elevator_bio_merged_fn) + e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio); } #ifdef CONFIG_PM @@ -578,7 +578,7 @@ void elv_drain_elevator(struct request_queue *q) lockdep_assert_held(q->queue_lock); - while (q->elevator->type->ops.elevator_dispatch_fn(q, 1)) + while (q->elevator->type->ops.sq.elevator_dispatch_fn(q, 1)) ; if (q->nr_sorted && printed++ < 10) { printk(KERN_ERR "%s: forced dispatching is broken " @@ -653,7 +653,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) * rq cannot be accessed after calling * elevator_add_req_fn. */ - q->elevator->type->ops.elevator_add_req_fn(q, rq); + q->elevator->type->ops.sq.elevator_add_req_fn(q, rq); break; case ELEVATOR_INSERT_FLUSH: @@ -682,8 +682,8 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_latter_req_fn) - return e->type->ops.elevator_latter_req_fn(q, rq); + if (e->type->ops.sq.elevator_latter_req_fn) + return e->type->ops.sq.elevator_latter_req_fn(q, rq); return NULL; } @@ -691,8 +691,8 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_former_req_fn) - return e->type->ops.elevator_former_req_fn(q, rq); + if (e->type->ops.sq.elevator_former_req_fn) + return e->type->ops.sq.elevator_former_req_fn(q, rq); return NULL; } @@ -701,8 +701,8 @@ int elv_set_request(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_set_req_fn) - return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); + if (e->type->ops.sq.elevator_set_req_fn) + return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask); return 0; } @@ -710,16 +710,16 @@ void elv_put_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_put_req_fn) - e->type->ops.elevator_put_req_fn(rq); + if (e->type->ops.sq.elevator_put_req_fn) + e->type->ops.sq.elevator_put_req_fn(rq); } int elv_may_queue(struct request_queue *q, unsigned int op) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_may_queue_fn) - return e->type->ops.elevator_may_queue_fn(q, op); + if (e->type->ops.sq.elevator_may_queue_fn) + return e->type->ops.sq.elevator_may_queue_fn(q, op); return ELV_MQUEUE_MAY; } @@ -734,8 +734,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq) if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; if ((rq->rq_flags & RQF_SORTED) && - e->type->ops.elevator_completed_req_fn) - e->type->ops.elevator_completed_req_fn(q, rq); + e->type->ops.sq.elevator_completed_req_fn) + e->type->ops.sq.elevator_completed_req_fn(q, rq); } } @@ -803,8 +803,8 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; - if (e->type->ops.elevator_registered_fn) - e->type->ops.elevator_registered_fn(q); + if (e->type->ops.sq.elevator_registered_fn) + e->type->ops.sq.elevator_registered_fn(q); } return error; } @@ -912,7 +912,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) spin_unlock_irq(q->queue_lock); /* allocate, init and register new elevator */ - err = new_e->ops.elevator_init_fn(q, new_e); + err = new_e->ops.sq.elevator_init_fn(q, new_e); if (err) goto fail_init; diff --git a/block/noop-iosched.c b/block/noop-iosched.c index a163c487cf38..2d1b15d89b45 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -92,7 +92,7 @@ static void noop_exit_queue(struct elevator_queue *e) } static struct elevator_type elevator_noop = { - .ops = { + .ops.sq = { .elevator_merge_req_fn = noop_merged_requests, .elevator_dispatch_fn = noop_dispatch, .elevator_add_req_fn = noop_add_request, diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b276e9ef0e0b..2a9e966eed03 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -94,7 +94,9 @@ struct elevator_type struct kmem_cache *icq_cache; /* fields provided by elevator implementation */ - struct elevator_ops ops; + union { + struct elevator_ops sq; + } ops; size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; -- cgit v1.2.3 From c23ecb426084a98418ee29124c139e37c274ad04 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Dec 2016 14:23:43 -0700 Subject: block: move rq_ioc() to blk.h We want to use it outside of blk-core.c. Signed-off-by: Jens Axboe Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval --- block/blk-core.c | 16 ---------------- block/blk.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 61ba08c58b64..92baea07acbc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1039,22 +1039,6 @@ static bool blk_rq_should_init_elevator(struct bio *bio) return true; } -/** - * rq_ioc - determine io_context for request allocation - * @bio: request being allocated is for this bio (can be %NULL) - * - * Determine io_context to use for request allocation for @bio. May return - * %NULL if %current->io_context doesn't exist. - */ -static struct io_context *rq_ioc(struct bio *bio) -{ -#ifdef CONFIG_BLK_CGROUP - if (bio && bio->bi_ioc) - return bio->bi_ioc; -#endif - return current->io_context; -} - /** * __get_request - get a free request * @rl: request list to allocate from diff --git a/block/blk.h b/block/blk.h index f46c0ac8ae3d..9a716b5925a4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -263,6 +263,22 @@ void ioc_clear_queue(struct request_queue *q); int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); +/** + * rq_ioc - determine io_context for request allocation + * @bio: request being allocated is for this bio (can be %NULL) + * + * Determine io_context to use for request allocation for @bio. May return + * %NULL if %current->io_context doesn't exist. + */ +static inline struct io_context *rq_ioc(struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (bio && bio->bi_ioc) + return bio->bi_ioc; +#endif + return current->io_context; +} + /** * create_io_context - try to create task->io_context * @gfp_mask: allocation mask -- cgit v1.2.3 From 16a3c2a70cad5ccdc2dc0a4544bff82554807493 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Dec 2016 14:27:46 -0700 Subject: blk-mq: un-export blk_mq_free_hctx_request() It's only used in blk-mq, kill it from the main exported header and kill the symbol export as well. Signed-off-by: Jens Axboe Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval --- block/blk-mq.c | 5 ++--- include/linux/blk-mq.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 79e1cb0f7b15..f49f6325b332 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -337,15 +337,14 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, blk_queue_exit(q); } -void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +static void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, + struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; ctx->rq_completed[rq_is_sync(rq)]++; __blk_mq_free_request(hctx, ctx, rq); - } -EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); void blk_mq_free_request(struct request *rq) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index afc81d77e471..2686f9e7302a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -181,7 +181,6 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_free_request(struct request *rq); -void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { -- cgit v1.2.3 From 2c3ad667902ef6f4b60ef0a3c6f7d8c2b007769a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Dec 2016 14:34:47 -0700 Subject: blk-mq: export some helpers we need to the scheduling framework Signed-off-by: Jens Axboe Reviewed-by: Johannes Thumshirn Reviewed-by: Omar Sandoval --- block/blk-mq.c | 39 +++++++++++++++++++++------------------ block/blk-mq.h | 25 +++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f49f6325b332..9fc521755e22 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -167,8 +167,8 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); -static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, unsigned int op) +void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int op) { INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ @@ -213,9 +213,10 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, ctx->rq_dispatched[op_is_sync(op)]++; } +EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init); -static struct request * -__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op) +struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, + unsigned int op) { struct request *rq; unsigned int tag; @@ -236,6 +237,7 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op) return NULL; } +EXPORT_SYMBOL_GPL(__blk_mq_alloc_request); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, unsigned int flags) @@ -319,8 +321,8 @@ out_queue_exit: } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, struct request *rq) +void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct request *rq) { const int tag = rq->tag; struct request_queue *q = rq->q; @@ -802,7 +804,7 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) * Process software queues that have been marked busy, splicing them * to the for-dispatch */ -static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct flush_busy_ctx_data data = { .hctx = hctx, @@ -811,6 +813,7 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } +EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); static inline unsigned int queued_to_index(unsigned int queued) { @@ -921,7 +924,7 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) /* * Touch any software queue that has pending entries. */ - flush_busy_ctxs(hctx, &rq_list); + blk_mq_flush_busy_ctxs(hctx, &rq_list); /* * If we have previous entries on our dispatch list, grab them @@ -1135,8 +1138,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, list_add_tail(&rq->queuelist, &ctx->rq_list); } -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; @@ -1550,8 +1553,8 @@ run_queue: return cookie; } -static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, - struct blk_mq_tags *tags, unsigned int hctx_idx) +void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx) { struct page *page; @@ -1588,8 +1591,8 @@ static size_t order_to_size(unsigned int order) return (size_t)PAGE_SIZE << order; } -static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx) { struct blk_mq_tags *tags; unsigned int i, j, entries_per_page, max_order = 4; @@ -2279,10 +2282,10 @@ static int blk_mq_queue_reinit_dead(unsigned int cpu) * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list * and set bit0 in pending bitmap as ctx1->index_hw is still zero. * - * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in - * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list. - * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list - * is ignored. + * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set + * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list. + * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is + * ignored. */ static int blk_mq_queue_reinit_prepare(unsigned int cpu) { diff --git a/block/blk-mq.h b/block/blk-mq.h index 63e9116cddbd..e59f5ca520a2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -32,6 +32,21 @@ void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); + +/* + * Internal helpers for allocating/freeing the request map + */ +void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx); +struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx); + +/* + * Internal helpers for request insertion into sw queues + */ +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head); /* * CPU hotplug helpers @@ -103,6 +118,16 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, data->hctx = hctx; } +/* + * Internal helpers for request allocation/init/free + */ +void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int op); +void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct request *rq); +struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, + unsigned int op); + static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_STOPPED, &hctx->state); -- cgit v1.2.3 From 4941115bef2bc891aa00a2f0edeaf06dc982325a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jan 2017 08:09:05 -0700 Subject: blk-mq-tag: cleanup the normal/reserved tag allocation This is in preparation for having another tag set available. Cleanup the parameters, and allow passing in of tags for blk_mq_put_tag(). Signed-off-by: Jens Axboe [hch: even more cleanups] Signed-off-by: Christoph Hellwig Reviewed-by: Omar Sandoval --- block/blk-mq-tag.c | 94 +++++++++++++++++++++--------------------------------- block/blk-mq-tag.h | 4 +-- block/blk-mq.c | 2 +- block/blk-mq.h | 5 +++ 4 files changed, 44 insertions(+), 61 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index dcf5ce3ba4bf..ced752716878 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -90,32 +90,46 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) +static int __blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { if (!hctx_may_queue(hctx, bt)) return -1; return __sbitmap_queue_get(bt); } -static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, - struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags) +unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct sbitmap_queue *bt; struct sbq_wait_state *ws; DEFINE_WAIT(wait); + unsigned int tag_offset; int tag; - tag = __bt_get(hctx, bt); + if (data->flags & BLK_MQ_REQ_RESERVED) { + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return BLK_MQ_TAG_FAIL; + } + bt = &tags->breserved_tags; + tag_offset = 0; + } else { + bt = &tags->bitmap_tags; + tag_offset = tags->nr_reserved_tags; + } + + tag = __blk_mq_get_tag(data->hctx, bt); if (tag != -1) - return tag; + goto found_tag; if (data->flags & BLK_MQ_REQ_NOWAIT) - return -1; + return BLK_MQ_TAG_FAIL; - ws = bt_wait_ptr(bt, hctx); + ws = bt_wait_ptr(bt, data->hctx); do { prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt); + tag = __blk_mq_get_tag(data->hctx, bt); if (tag != -1) break; @@ -125,14 +139,14 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, * some to complete. Note that hctx can be NULL here for * reserved tag allocation. */ - if (hctx) - blk_mq_run_hw_queue(hctx, false); + if (data->hctx) + blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ - tag = __bt_get(hctx, bt); + tag = __blk_mq_get_tag(data->hctx, bt); if (tag != -1) break; @@ -142,61 +156,25 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); - if (data->flags & BLK_MQ_REQ_RESERVED) { - bt = &data->hctx->tags->breserved_tags; - } else { - hctx = data->hctx; - bt = &hctx->tags->bitmap_tags; - } + tags = blk_mq_tags_from_data(data); + if (data->flags & BLK_MQ_REQ_RESERVED) + bt = &tags->breserved_tags; + else + bt = &tags->bitmap_tags; + finish_wait(&ws->wait, &wait); - ws = bt_wait_ptr(bt, hctx); + ws = bt_wait_ptr(bt, data->hctx); } while (1); finish_wait(&ws->wait, &wait); - return tag; -} - -static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) -{ - int tag; - - tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - data->hctx->tags); - if (tag >= 0) - return tag + data->hctx->tags->nr_reserved_tags; - - return BLK_MQ_TAG_FAIL; -} - -static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) -{ - int tag; - if (unlikely(!data->hctx->tags->nr_reserved_tags)) { - WARN_ON_ONCE(1); - return BLK_MQ_TAG_FAIL; - } - - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, - data->hctx->tags); - if (tag < 0) - return BLK_MQ_TAG_FAIL; - - return tag; +found_tag: + return tag + tag_offset; } -unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, + struct blk_mq_ctx *ctx, unsigned int tag) { - if (data->flags & BLK_MQ_REQ_RESERVED) - return __blk_mq_get_reserved_tag(data); - return __blk_mq_get_tag(data); -} - -void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - unsigned int tag) -{ - struct blk_mq_tags *tags = hctx->tags; - if (tag >= tags->nr_reserved_tags) { const int real_tag = tag - tags->nr_reserved_tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index d1662734dc53..923602dd3bfb 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -24,8 +24,8 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - unsigned int tag); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, + struct blk_mq_ctx *ctx, unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); diff --git a/block/blk-mq.c b/block/blk-mq.c index 9fc521755e22..6fab8e9c724f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -335,7 +335,7 @@ void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); - blk_mq_put_tag(hctx, ctx, tag); + blk_mq_put_tag(hctx, hctx->tags, ctx, tag); blk_queue_exit(q); } diff --git a/block/blk-mq.h b/block/blk-mq.h index e59f5ca520a2..48b7771eb192 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -118,6 +118,11 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, data->hctx = hctx; } +static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) +{ + return data->hctx->tags; +} + /* * Internal helpers for request allocation/init/free */ -- cgit v1.2.3 From cc71a6f43886a8af57dbbce2a45b4b2aaf570fe6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Jan 2017 14:29:56 -0700 Subject: blk-mq: abstract out helpers for allocating/freeing tag maps Prep patch for adding an extra tag map for scheduler requests. Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- block/blk-mq.c | 117 ++++++++++++++++++++++++++++++++++++--------------------- block/blk-mq.h | 14 ++++--- 2 files changed, 83 insertions(+), 48 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 6fab8e9c724f..fcdeadc55753 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1553,8 +1553,8 @@ run_queue: return cookie; } -void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, - unsigned int hctx_idx) +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx) { struct page *page; @@ -1580,33 +1580,30 @@ void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, kmemleak_free(page_address(page)); __free_pages(page, page->private); } +} +void blk_mq_free_rq_map(struct blk_mq_tags *tags) +{ kfree(tags->rqs); + tags->rqs = NULL; blk_mq_free_tags(tags); } -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; -} - -struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags) { struct blk_mq_tags *tags; - unsigned int i, j, entries_per_page, max_order = 4; - size_t rq_size, left; - tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, + tags = blk_mq_init_tags(nr_tags, reserved_tags, set->numa_node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; - INIT_LIST_HEAD(&tags->page_list); - - tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), + tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, set->numa_node); if (!tags->rqs) { @@ -1614,15 +1611,31 @@ struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, return NULL; } + return tags; +} + +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) +{ + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + + INIT_LIST_HEAD(&tags->page_list); + /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); - left = rq_size * set->queue_depth; + left = rq_size * depth; - for (i = 0; i < set->queue_depth; ) { + for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; @@ -1656,7 +1669,7 @@ struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, */ kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; - to_do = min(entries_per_page, set->queue_depth - i); + to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { tags->rqs[i] = p; @@ -1673,11 +1686,11 @@ struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, i++; } } - return tags; + return 0; fail: - blk_mq_free_rq_map(set, tags, hctx_idx); - return NULL; + blk_mq_free_rqs(set, tags, hctx_idx); + return -ENOMEM; } /* @@ -1869,6 +1882,33 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } +static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) +{ + int ret = 0; + + set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, + set->queue_depth, set->reserved_tags); + if (!set->tags[hctx_idx]) + return false; + + ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, + set->queue_depth); + if (!ret) + return true; + + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; + return false; +} + +static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; +} + static void blk_mq_map_swqueue(struct request_queue *q, const struct cpumask *online_mask) { @@ -1897,17 +1937,15 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx_idx = q->mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ - if (!set->tags[hctx_idx]) { - set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx); - + if (!set->tags[hctx_idx] && + !__blk_mq_alloc_rq_map(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. In this * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ - if (!set->tags[hctx_idx]) - q->mq_map[i] = 0; + q->mq_map[i] = 0; } ctx = per_cpu_ptr(q->queue_ctx, i); @@ -1930,10 +1968,9 @@ static void blk_mq_map_swqueue(struct request_queue *q, * fallback in case of a new remap fails * allocation */ - if (i && set->tags[i]) { - blk_mq_free_rq_map(set, set->tags[i], i); - set->tags[i] = NULL; - } + if (i && set->tags[i]) + blk_mq_free_map_and_requests(set, i); + hctx->tags = NULL; continue; } @@ -2100,10 +2137,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - if (hctx->tags) { - blk_mq_free_rq_map(set, hctx->tags, j); - set->tags[j] = NULL; - } + if (hctx->tags) + blk_mq_free_map_and_requests(set, j); blk_mq_exit_hctx(q, set, hctx, j); free_cpumask_var(hctx->cpumask); kobject_put(&hctx->kobj); @@ -2299,17 +2334,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) { - set->tags[i] = blk_mq_init_rq_map(set, i); - if (!set->tags[i]) + for (i = 0; i < set->nr_hw_queues; i++) + if (!__blk_mq_alloc_rq_map(set, i)) goto out_unwind; - } return 0; out_unwind: while (--i >= 0) - blk_mq_free_rq_map(set, set->tags[i], i); + blk_mq_free_rq_map(set->tags[i]); return -ENOMEM; } @@ -2433,10 +2466,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < nr_cpu_ids; i++) { - if (set->tags[i]) - blk_mq_free_rq_map(set, set->tags[i], i); - } + for (i = 0; i < nr_cpu_ids; i++) + blk_mq_free_map_and_requests(set, i); kfree(set->mq_map); set->mq_map = NULL; diff --git a/block/blk-mq.h b/block/blk-mq.h index 48b7771eb192..1b279b02d0f6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -37,17 +37,21 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); /* * Internal helpers for allocating/freeing the request map */ -void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, - unsigned int hctx_idx); -struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx); +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx); +void blk_mq_free_rq_map(struct blk_mq_tags *tags); +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags); +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth); /* * Internal helpers for request insertion into sw queues */ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head); - /* * CPU hotplug helpers */ -- cgit v1.2.3 From fd2d332677c687ca90c12a47d6c377c547100b56 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Jan 2017 10:04:45 -0700 Subject: blk-mq: add support for carrying internal tag information in blk_qc_t No functional change in this patch, just in preparation for having two types of tags available to the block layer for a single request. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval --- block/blk-mq.c | 11 ++++++++--- include/linux/blk_types.h | 22 +++++++++++++++++----- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index fcdeadc55753..d40be641f3d5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1308,6 +1308,11 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } +static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); +} + static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) { int ret; @@ -1318,7 +1323,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) .list = NULL, .last = 1 }; - blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); + blk_qc_t new_cookie = request_to_qc_t(hctx, rq); if (blk_mq_hctx_stopped(hctx)) goto insert; @@ -1387,7 +1392,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) wbt_track(&rq->issue_stat, wb_acct); - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1496,7 +1501,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) wbt_track(&rq->issue_stat, wb_acct); - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 519ea2c9df61..0e5b1cd5113c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -232,22 +232,29 @@ static inline bool op_is_sync(unsigned int op) } typedef unsigned int blk_qc_t; -#define BLK_QC_T_NONE -1U -#define BLK_QC_T_SHIFT 16 +#define BLK_QC_T_NONE -1U +#define BLK_QC_T_SHIFT 16 +#define BLK_QC_T_INTERNAL (1U << 31) static inline bool blk_qc_t_valid(blk_qc_t cookie) { return cookie != BLK_QC_T_NONE; } -static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num) +static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num, + bool internal) { - return tag | (queue_num << BLK_QC_T_SHIFT); + blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT); + + if (internal) + ret |= BLK_QC_T_INTERNAL; + + return ret; } static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) { - return cookie >> BLK_QC_T_SHIFT; + return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT; } static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) @@ -255,6 +262,11 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) return cookie & ((1u << BLK_QC_T_SHIFT) - 1); } +static inline bool blk_qc_t_is_internal(blk_qc_t cookie) +{ + return (cookie & BLK_QC_T_INTERNAL) != 0; +} + struct blk_issue_stat { u64 time; }; -- cgit v1.2.3 From 2af8cbe30531eca73c8f3ba277f155fc0020b01a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jan 2017 14:39:30 -0700 Subject: blk-mq: split tag ->rqs[] into two This is in preparation for having two sets of tags available. For that we need a static index, and a dynamically assignable one. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval --- block/blk-mq-tag.c | 4 ++-- block/blk-mq-tag.h | 1 + block/blk-mq.c | 30 +++++++++++++++++++++++------- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ced752716878..9753747a34a2 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -290,11 +290,11 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) struct blk_mq_tags *tags = set->tags[i]; for (j = 0; j < tags->nr_tags; j++) { - if (!tags->rqs[j]) + if (!tags->static_rqs[j]) continue; ret = set->ops->reinit_request(set->driver_data, - tags->rqs[j]); + tags->static_rqs[j]); if (ret) goto out; } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 923602dd3bfb..41cd15fd1afd 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -16,6 +16,7 @@ struct blk_mq_tags { struct sbitmap_queue breserved_tags; struct request **rqs; + struct request **static_rqs; struct list_head page_list; }; diff --git a/block/blk-mq.c b/block/blk-mq.c index d40be641f3d5..89b81254201b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -223,7 +223,7 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, tag = blk_mq_get_tag(data); if (tag != BLK_MQ_TAG_FAIL) { - rq = data->hctx->tags->rqs[tag]; + rq = data->hctx->tags->static_rqs[tag]; if (blk_mq_tag_busy(data->hctx)) { rq->rq_flags = RQF_MQ_INFLIGHT; @@ -231,6 +231,7 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, } rq->tag = tag; + data->hctx->tags->rqs[tag] = rq; blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); return rq; } @@ -1567,11 +1568,13 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, int i; for (i = 0; i < tags->nr_tags; i++) { - if (!tags->rqs[i]) + struct request *rq = tags->static_rqs[i]; + + if (!rq) continue; - set->ops->exit_request(set->driver_data, tags->rqs[i], + set->ops->exit_request(set->driver_data, rq, hctx_idx, i); - tags->rqs[i] = NULL; + tags->static_rqs[i] = NULL; } } @@ -1591,6 +1594,8 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; + kfree(tags->static_rqs); + tags->static_rqs = NULL; blk_mq_free_tags(tags); } @@ -1616,6 +1621,15 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, return NULL; } + tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + set->numa_node); + if (!tags->static_rqs) { + kfree(tags->rqs); + blk_mq_free_tags(tags); + return NULL; + } + return tags; } @@ -1677,12 +1691,14 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { - tags->rqs[i] = p; + struct request *rq = p; + + tags->static_rqs[i] = rq; if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, - tags->rqs[i], hctx_idx, i, + rq, hctx_idx, i, set->numa_node)) { - tags->rqs[i] = NULL; + tags->static_rqs[i] = NULL; goto fail; } } -- cgit v1.2.3 From bd166ef183c263c5ced656d49ef19c7da4adc774 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Jan 2017 06:03:22 -0700 Subject: blk-mq-sched: add framework for MQ capable IO schedulers This adds a set of hooks that intercepts the blk-mq path of allocating/inserting/issuing/completing requests, allowing us to develop a scheduler within that framework. We reuse the existing elevator scheduler API on the registration side, but augment that with the scheduler flagging support for the blk-mq interfce, and with a separate set of ops hooks for MQ devices. We split driver and scheduler tags, so we can run the scheduling independently of device queue depth. Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- block/Makefile | 2 +- block/blk-cgroup.c | 24 +++- block/blk-core.c | 4 +- block/blk-exec.c | 3 +- block/blk-flush.c | 12 +- block/blk-ioc.c | 8 +- block/blk-merge.c | 2 +- block/blk-mq-sched.c | 368 +++++++++++++++++++++++++++++++++++++++++++++++ block/blk-mq-sched.h | 170 ++++++++++++++++++++++ block/blk-mq-sysfs.c | 13 ++ block/blk-mq.c | 318 +++++++++++++++++++++++----------------- block/blk-mq.h | 8 +- block/blk-tag.c | 1 + block/elevator.c | 204 ++++++++++++++++++++------ include/linux/blk-mq.h | 5 +- include/linux/blkdev.h | 4 +- include/linux/elevator.h | 32 +++++ 17 files changed, 984 insertions(+), 194 deletions(-) create mode 100644 block/blk-mq-sched.c create mode 100644 block/blk-mq-sched.h diff --git a/block/Makefile b/block/Makefile index a827f988c4e6..2eee9e1bb6db 100644 --- a/block/Makefile +++ b/block/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ - blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \ + blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8ba0af780e88..2630f64bed19 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1223,7 +1223,11 @@ int blkcg_activate_policy(struct request_queue *q, if (blkcg_policy_enabled(q, pol)) return 0; - blk_queue_bypass_start(q); + if (q->mq_ops) { + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + } else + blk_queue_bypass_start(q); pd_prealloc: if (!pd_prealloc) { pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); @@ -1261,7 +1265,10 @@ pd_prealloc: spin_unlock_irq(q->queue_lock); out_bypass_end: - blk_queue_bypass_end(q); + if (q->mq_ops) + blk_mq_unfreeze_queue(q); + else + blk_queue_bypass_end(q); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; @@ -1284,7 +1291,12 @@ void blkcg_deactivate_policy(struct request_queue *q, if (!blkcg_policy_enabled(q, pol)) return; - blk_queue_bypass_start(q); + if (q->mq_ops) { + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + } else + blk_queue_bypass_start(q); + spin_lock_irq(q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); @@ -1304,7 +1316,11 @@ void blkcg_deactivate_policy(struct request_queue *q, } spin_unlock_irq(q->queue_lock); - blk_queue_bypass_end(q); + + if (q->mq_ops) + blk_mq_unfreeze_queue(q); + else + blk_queue_bypass_end(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); diff --git a/block/blk-core.c b/block/blk-core.c index 92baea07acbc..a61f1407f4f6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-mq-sched.h" #include "blk-wbt.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); @@ -134,6 +135,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->cmd = rq->__cmd; rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; + rq->internal_tag = -1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; @@ -2127,7 +2129,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); - blk_mq_insert_request(rq, false, true, false); + blk_mq_sched_insert_request(rq, false, true, false); return 0; } diff --git a/block/blk-exec.c b/block/blk-exec.c index 3ecb00a6cf45..86656fdfa637 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -9,6 +9,7 @@ #include #include "blk.h" +#include "blk-mq-sched.h" /* * for max sense size @@ -65,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be reused after dying flag is set */ if (q->mq_ops) { - blk_mq_insert_request(rq, at_head, true, false); + blk_mq_sched_insert_request(rq, at_head, true, false); return; } diff --git a/block/blk-flush.c b/block/blk-flush.c index 20b7c7a02f1c..d7de34ee39c2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -74,6 +74,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-mq-sched.h" /* FLUSH/FUA sequences */ enum { @@ -391,9 +392,10 @@ static void mq_flush_data_end_io(struct request *rq, int error) * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) - blk_mq_run_hw_queue(hctx, true); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + + blk_mq_run_hw_queue(hctx, true); } /** @@ -453,9 +455,9 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - if (q->mq_ops) { - blk_mq_insert_request(rq, false, true, false); - } else + if (q->mq_ops) + blk_mq_sched_insert_request(rq, false, true, false); + else list_add_tail(&rq->queuelist, &q->queue_head); return; } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index ab372092a57d..fe186a9eade9 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -43,7 +43,9 @@ static void ioc_exit_icq(struct io_cq *icq) if (icq->flags & ICQ_EXITED) return; - if (et->ops.sq.elevator_exit_icq_fn) + if (et->uses_mq && et->ops.mq.exit_icq) + et->ops.mq.exit_icq(icq); + else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn) et->ops.sq.elevator_exit_icq_fn(icq); icq->flags |= ICQ_EXITED; @@ -383,7 +385,9 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); - if (et->ops.sq.elevator_init_icq_fn) + if (et->uses_mq && et->ops.mq.init_icq) + et->ops.mq.init_icq(icq); + else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn) et->ops.sq.elevator_init_icq_fn(icq); } else { kmem_cache_free(et->icq_cache, icq); diff --git a/block/blk-merge.c b/block/blk-merge.c index 480570b691dc..6aa43dec5af4 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -763,7 +763,7 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->type->ops.sq.elevator_allow_rq_merge_fn) + if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn) if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next)) return 0; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c new file mode 100644 index 000000000000..26759798a0b3 --- /dev/null +++ b/block/blk-mq-sched.c @@ -0,0 +1,368 @@ +/* + * blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include +#include + +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-sched.h" +#include "blk-mq-tag.h" +#include "blk-wbt.h" + +void blk_mq_sched_free_hctx_data(struct request_queue *q, + void (*exit)(struct blk_mq_hw_ctx *)) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (exit && hctx->sched_data) + exit(hctx); + kfree(hctx->sched_data); + hctx->sched_data = NULL; + } +} +EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); + +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, + int (*init)(struct blk_mq_hw_ctx *), + void (*exit)(struct blk_mq_hw_ctx *)) +{ + struct blk_mq_hw_ctx *hctx; + int ret; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node); + if (!hctx->sched_data) { + ret = -ENOMEM; + goto error; + } + + if (init) { + ret = init(hctx); + if (ret) { + /* + * We don't want to give exit() a partially + * initialized sched_data. init() must clean up + * if it fails. + */ + kfree(hctx->sched_data); + hctx->sched_data = NULL; + goto error; + } + } + } + + return 0; +error: + blk_mq_sched_free_hctx_data(q, exit); + return ret; +} +EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data); + +static void __blk_mq_sched_assign_ioc(struct request_queue *q, + struct request *rq, struct io_context *ioc) +{ + struct io_cq *icq; + + spin_lock_irq(q->queue_lock); + icq = ioc_lookup_icq(ioc, q); + spin_unlock_irq(q->queue_lock); + + if (!icq) { + icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + if (!icq) + return; + } + + rq->elv.icq = icq; + if (!blk_mq_sched_get_rq_priv(q, rq)) { + rq->rq_flags |= RQF_ELVPRIV; + get_io_context(icq->ioc); + return; + } + + rq->elv.icq = NULL; +} + +static void blk_mq_sched_assign_ioc(struct request_queue *q, + struct request *rq, struct bio *bio) +{ + struct io_context *ioc; + + ioc = rq_ioc(bio); + if (ioc) + __blk_mq_sched_assign_ioc(q, rq, ioc); +} + +struct request *blk_mq_sched_get_request(struct request_queue *q, + struct bio *bio, + unsigned int op, + struct blk_mq_alloc_data *data) +{ + struct elevator_queue *e = q->elevator; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + struct request *rq; + const bool is_flush = op & (REQ_PREFLUSH | REQ_FUA); + + blk_queue_enter_live(q); + ctx = blk_mq_get_ctx(q); + hctx = blk_mq_map_queue(q, ctx->cpu); + + blk_mq_set_alloc_data(data, q, 0, ctx, hctx); + + if (e) { + data->flags |= BLK_MQ_REQ_INTERNAL; + + /* + * Flush requests are special and go directly to the + * dispatch list. + */ + if (!is_flush && e->type->ops.mq.get_request) { + rq = e->type->ops.mq.get_request(q, op, data); + if (rq) + rq->rq_flags |= RQF_QUEUED; + } else + rq = __blk_mq_alloc_request(data, op); + } else { + rq = __blk_mq_alloc_request(data, op); + data->hctx->tags->rqs[rq->tag] = rq; + } + + if (rq) { + if (!is_flush) { + rq->elv.icq = NULL; + if (e && e->type->icq_cache) + blk_mq_sched_assign_ioc(q, rq, bio); + } + data->hctx->queued++; + return rq; + } + + blk_queue_exit(q); + return NULL; +} + +void blk_mq_sched_put_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (rq->rq_flags & RQF_ELVPRIV) { + blk_mq_sched_put_rq_priv(rq->q, rq); + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } + } + + if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request) + e->type->ops.mq.put_request(rq); + else + blk_mq_finish_request(rq); +} + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +{ + struct elevator_queue *e = hctx->queue->elevator; + LIST_HEAD(rq_list); + + if (unlikely(blk_mq_hctx_stopped(hctx))) + return; + + hctx->run++; + + /* + * If we have previous entries on our dispatch list, grab them first for + * more fair dispatch. + */ + if (!list_empty_careful(&hctx->dispatch)) { + spin_lock(&hctx->lock); + if (!list_empty(&hctx->dispatch)) + list_splice_init(&hctx->dispatch, &rq_list); + spin_unlock(&hctx->lock); + } + + /* + * Only ask the scheduler for requests, if we didn't have residual + * requests from the dispatch list. This is to avoid the case where + * we only ever dispatch a fraction of the requests available because + * of low device queue depth. Once we pull requests out of the IO + * scheduler, we can no longer merge or sort them. So it's best to + * leave them there for as long as we can. Mark the hw queue as + * needing a restart in that case. + */ + if (list_empty(&rq_list)) { + if (e && e->type->ops.mq.dispatch_requests) + e->type->ops.mq.dispatch_requests(hctx, &rq_list); + else + blk_mq_flush_busy_ctxs(hctx, &rq_list); + } else + blk_mq_sched_mark_restart(hctx); + + blk_mq_dispatch_rq_list(hctx, &rq_list); +} + +void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list, + struct request *(*get_rq)(struct blk_mq_hw_ctx *)) +{ + do { + struct request *rq; + + rq = get_rq(hctx); + if (!rq) + break; + + list_add_tail(&rq->queuelist, rq_list); + } while (1); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch); + +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio) +{ + struct request *rq; + int ret; + + ret = elv_merge(q, &rq, bio); + if (ret == ELEVATOR_BACK_MERGE) { + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (bio_attempt_back_merge(q, rq, bio)) { + if (!attempt_back_merge(q, rq)) + elv_merged_request(q, rq, ret); + return true; + } + } else if (ret == ELEVATOR_FRONT_MERGE) { + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (bio_attempt_front_merge(q, rq, bio)) { + if (!attempt_front_merge(q, rq)) + elv_merged_request(q, rq, ret); + return true; + } + } + + return false; +} +EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); + +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e->type->ops.mq.bio_merge) { + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + + blk_mq_put_ctx(ctx); + return e->type->ops.mq.bio_merge(hctx, bio); + } + + return false; +} + +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +{ + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); + +void blk_mq_sched_request_inserted(struct request *rq) +{ + trace_block_rq_insert(rq->q, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); + +bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + if (rq->tag == -1) { + rq->rq_flags |= RQF_SORTED; + return false; + } + + /* + * If we already have a real request tag, send directly to + * the dispatch list. + */ + spin_lock(&hctx->lock); + list_add(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + return true; +} +EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert); + +static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, + unsigned int hctx_idx) +{ + if (hctx->sched_tags) { + blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); + blk_mq_free_rq_map(hctx->sched_tags); + hctx->sched_tags = NULL; + } +} + +int blk_mq_sched_setup(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int ret, i; + + /* + * Default to 256, since we don't split into sync/async like the + * old code did. Additionally, this is a per-hw queue depth. + */ + q->nr_requests = 2 * BLKDEV_MAX_RQ; + + /* + * We're switching to using an IO scheduler, so setup the hctx + * scheduler tags and switch the request map from the regular + * tags to scheduler tags. First allocate what we need, so we + * can safely fail and fallback, if needed. + */ + ret = 0; + queue_for_each_hw_ctx(q, hctx, i) { + hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0); + if (!hctx->sched_tags) { + ret = -ENOMEM; + break; + } + ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests); + if (ret) + break; + } + + /* + * If we failed, free what we did allocate + */ + if (ret) { + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->sched_tags) + continue; + blk_mq_sched_free_tags(set, hctx, i); + } + + return ret; + } + + return 0; +} + +void blk_mq_sched_teardown(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_sched_free_tags(set, hctx, i); +} diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h new file mode 100644 index 000000000000..35c49e2e008a --- /dev/null +++ b/block/blk-mq-sched.h @@ -0,0 +1,170 @@ +#ifndef BLK_MQ_SCHED_H +#define BLK_MQ_SCHED_H + +#include "blk-mq.h" +#include "blk-mq-tag.h" + +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, + int (*init)(struct blk_mq_hw_ctx *), + void (*exit)(struct blk_mq_hw_ctx *)); + +void blk_mq_sched_free_hctx_data(struct request_queue *q, + void (*exit)(struct blk_mq_hw_ctx *)); + +struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data); +void blk_mq_sched_put_request(struct request *rq); + +void blk_mq_sched_request_inserted(struct request *rq); +bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq); +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio); +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list, + struct request *(*get_rq)(struct blk_mq_hw_ctx *)); + +int blk_mq_sched_setup(struct request_queue *q); +void blk_mq_sched_teardown(struct request_queue *q); + +static inline bool +blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio)) + return false; + + return __blk_mq_sched_bio_merge(q, bio); +} + +static inline int blk_mq_sched_get_rq_priv(struct request_queue *q, + struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.get_rq_priv) + return e->type->ops.mq.get_rq_priv(q, rq); + + return 0; +} + +static inline void blk_mq_sched_put_rq_priv(struct request_queue *q, + struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.put_rq_priv) + e->type->ops.mq.put_rq_priv(q, rq); +} + +static inline void +blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, + bool async) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + + if (e && e->type->ops.mq.insert_requests) { + LIST_HEAD(list); + + list_add(&rq->queuelist, &list); + e->type->ops.mq.insert_requests(hctx, &list, at_head); + } else { + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, at_head); + spin_unlock(&ctx->lock); + } + + if (run_queue) + blk_mq_run_hw_queue(hctx, async); +} + +static inline void +blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async) +{ + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.insert_requests) + e->type->ops.mq.insert_requests(hctx, list, false); + else + blk_mq_insert_requests(hctx, ctx, list); + + blk_mq_run_hw_queue(hctx, run_queue_async); +} + +static inline bool +blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.allow_merge) + return e->type->ops.mq.allow_merge(q, rq, bio); + + return true; +} + +static inline void +blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.completed_request) + e->type->ops.mq.completed_request(hctx, rq); + + BUG_ON(rq->internal_tag == -1); + + blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag); + + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + blk_mq_run_hw_queue(hctx, true); + } +} + +static inline void blk_mq_sched_started_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.started_request) + e->type->ops.mq.started_request(rq); +} + +static inline void blk_mq_sched_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.requeue_request) + e->type->ops.mq.requeue_request(rq); +} + +static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.has_work) + return e->type->ops.mq.has_work(hctx); + + return false; +} + +static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); +} + +static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) +{ + return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); +} + +#endif diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index eacd3af72099..2caecaa98e40 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -231,6 +231,14 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, return ret; } +static ssize_t blk_mq_hw_sysfs_sched_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + if (hctx->sched_tags) + return blk_mq_tag_sysfs_show(hctx->sched_tags, page); + + return 0; +} + static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { return blk_mq_tag_sysfs_show(hctx->tags, page); @@ -345,6 +353,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { .attr = {.name = "pending", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_rq_list_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_sched_tags = { + .attr = {.name = "sched_tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_sched_tags_show, +}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { .attr = {.name = "tags", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_tags_show, @@ -370,6 +382,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_dispatched.attr, &blk_mq_hw_sysfs_pending.attr, &blk_mq_hw_sysfs_tags.attr, + &blk_mq_hw_sysfs_sched_tags.attr, &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, &blk_mq_hw_sysfs_poll.attr, diff --git a/block/blk-mq.c b/block/blk-mq.c index 89b81254201b..45e1707a9f86 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -32,6 +32,7 @@ #include "blk-mq-tag.h" #include "blk-stat.h" #include "blk-wbt.h" +#include "blk-mq-sched.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -41,7 +42,9 @@ static LIST_HEAD(all_q_list); */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return sbitmap_any_bit_set(&hctx->ctx_map); + return sbitmap_any_bit_set(&hctx->ctx_map) || + !list_empty_careful(&hctx->dispatch) || + blk_mq_sched_has_work(hctx); } /* @@ -223,15 +226,23 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, tag = blk_mq_get_tag(data); if (tag != BLK_MQ_TAG_FAIL) { - rq = data->hctx->tags->static_rqs[tag]; + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + + rq = tags->static_rqs[tag]; if (blk_mq_tag_busy(data->hctx)) { rq->rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } - rq->tag = tag; - data->hctx->tags->rqs[tag] = rq; + if (data->flags & BLK_MQ_REQ_INTERNAL) { + rq->tag = -1; + rq->internal_tag = tag; + } else { + rq->tag = tag; + rq->internal_tag = -1; + } + blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); return rq; } @@ -243,26 +254,21 @@ EXPORT_SYMBOL_GPL(__blk_mq_alloc_request); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, unsigned int flags) { - struct blk_mq_ctx *ctx; - struct blk_mq_hw_ctx *hctx; - struct request *rq; struct blk_mq_alloc_data alloc_data; + struct request *rq; int ret; ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); - ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw); - blk_mq_put_ctx(ctx); + rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); - if (!rq) { - blk_queue_exit(q); + blk_mq_put_ctx(alloc_data.ctx); + blk_queue_exit(q); + + if (!rq) return ERR_PTR(-EWOULDBLOCK); - } rq->__data_len = 0; rq->__sector = (sector_t) -1; @@ -322,10 +328,10 @@ out_queue_exit: } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - struct request *rq) +void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct request *rq) { - const int tag = rq->tag; + const int sched_tag = rq->internal_tag; struct request_queue *q = rq->q; if (rq->rq_flags & RQF_MQ_INFLIGHT) @@ -336,22 +342,30 @@ void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); - blk_mq_put_tag(hctx, hctx->tags, ctx, tag); + if (rq->tag != -1) + blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); + if (sched_tag != -1) + blk_mq_sched_completed_request(hctx, rq); blk_queue_exit(q); } -static void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, +static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; ctx->rq_completed[rq_is_sync(rq)]++; - __blk_mq_free_request(hctx, ctx, rq); + __blk_mq_finish_request(hctx, ctx, rq); +} + +void blk_mq_finish_request(struct request *rq) +{ + blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); } void blk_mq_free_request(struct request *rq) { - blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); + blk_mq_sched_put_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -469,6 +483,8 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_sched_started_request(rq); + trace_block_rq_issue(q, rq); rq->resid_len = blk_rq_bytes(rq); @@ -517,6 +533,7 @@ static void __blk_mq_requeue_request(struct request *rq) trace_block_rq_requeue(q, rq); wbt_requeue(q->rq_wb, &rq->issue_stat); + blk_mq_sched_requeue_request(rq); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -551,13 +568,13 @@ static void blk_mq_requeue_work(struct work_struct *work) rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_insert_request(rq, true, false, false); + blk_mq_sched_insert_request(rq, true, false, false); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_insert_request(rq, false, false, false); + blk_mq_sched_insert_request(rq, false, false, false); } blk_mq_run_hw_queues(q, false); @@ -765,6 +782,12 @@ static bool blk_mq_attempt_merge(struct request_queue *q, continue; el_ret = blk_try_merge(rq, bio); + if (el_ret == ELEVATOR_NO_MERGE) + continue; + + if (!blk_mq_sched_allow_merge(q, rq, bio)) + break; + if (el_ret == ELEVATOR_BACK_MERGE) { if (bio_attempt_back_merge(q, rq, bio)) { ctx->rq_merged++; @@ -824,6 +847,59 @@ static inline unsigned int queued_to_index(unsigned int queued) return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } +static bool blk_mq_get_driver_tag(struct request *rq, + struct blk_mq_hw_ctx **hctx, bool wait) +{ + struct blk_mq_alloc_data data = { + .q = rq->q, + .ctx = rq->mq_ctx, + .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), + .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, + }; + + if (blk_mq_hctx_stopped(data.hctx)) + return false; + + if (rq->tag != -1) { +done: + if (hctx) + *hctx = data.hctx; + return true; + } + + rq->tag = blk_mq_get_tag(&data); + if (rq->tag >= 0) { + data.hctx->tags->rqs[rq->tag] = rq; + goto done; + } + + return false; +} + +/* + * If we fail getting a driver tag because all the driver tags are already + * assigned and on the dispatch list, BUT the first entry does not have a + * tag, then we could deadlock. For that case, move entries with assigned + * driver tags to the front, leaving the set of tagged requests in the + * same order, and the untagged set in the same order. + */ +static bool reorder_tags_to_front(struct list_head *list) +{ + struct request *rq, *tmp, *first = NULL; + + list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { + if (rq == first) + break; + if (rq->tag != -1) { + list_move(&rq->queuelist, list); + if (!first) + first = rq; + } + } + + return first != NULL; +} + bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct request_queue *q = hctx->queue; @@ -846,6 +922,12 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); + if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!queued && reorder_tags_to_front(list)) + continue; + blk_mq_sched_mark_restart(hctx); + break; + } list_del_init(&rq->queuelist); bd.rq = rq; @@ -899,48 +981,17 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) * the requests in rq_list might get lost. * * blk_mq_run_hw_queue() already checks the STOPPED bit - **/ - blk_mq_run_hw_queue(hctx, true); + * + * If RESTART is set, then let completion restart the queue + * instead of potentially looping here. + */ + if (!blk_mq_sched_needs_restart(hctx)) + blk_mq_run_hw_queue(hctx, true); } return ret != BLK_MQ_RQ_QUEUE_BUSY; } -/* - * Run this hardware queue, pulling any software queues mapped to it in. - * Note that this function currently has various problems around ordering - * of IO. In particular, we'd like FIFO behaviour on handling existing - * items on the hctx->dispatch list. Ignore that for now. - */ -static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) -{ - LIST_HEAD(rq_list); - LIST_HEAD(driver_list); - - if (unlikely(blk_mq_hctx_stopped(hctx))) - return; - - hctx->run++; - - /* - * Touch any software queue that has pending entries. - */ - blk_mq_flush_busy_ctxs(hctx, &rq_list); - - /* - * If we have previous entries on our dispatch list, grab them - * and stuff them at the front for more fair dispatch. - */ - if (!list_empty_careful(&hctx->dispatch)) { - spin_lock(&hctx->lock); - if (!list_empty(&hctx->dispatch)) - list_splice_init(&hctx->dispatch, &rq_list); - spin_unlock(&hctx->lock); - } - - blk_mq_dispatch_rq_list(hctx, &rq_list); -} - static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; @@ -950,11 +1001,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_process_rq_list(hctx); + blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); - blk_mq_process_rq_list(hctx); + blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); } } @@ -1010,8 +1061,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if ((!blk_mq_hctx_has_pending(hctx) && - list_empty_careful(&hctx->dispatch)) || + if (!blk_mq_hctx_has_pending(hctx) || blk_mq_hctx_stopped(hctx)) continue; @@ -1148,32 +1198,10 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_mq_hctx_mark_pending(hctx, ctx); } -void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, - bool async) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, at_head); - spin_unlock(&ctx->lock); - - if (run_queue) - blk_mq_run_hw_queue(hctx, async); -} - -static void blk_mq_insert_requests(struct request_queue *q, - struct blk_mq_ctx *ctx, - struct list_head *list, - int depth, - bool from_schedule) +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list) { - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - - trace_block_unplug(q, depth, !from_schedule); - /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now @@ -1189,8 +1217,6 @@ static void blk_mq_insert_requests(struct request_queue *q, } blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); - - blk_mq_run_hw_queue(hctx, from_schedule); } static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -1226,9 +1252,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(!rq->q); if (rq->mq_ctx != this_ctx) { if (this_ctx) { - blk_mq_insert_requests(this_q, this_ctx, - &ctx_list, depth, - from_schedule); + trace_block_unplug(this_q, depth, from_schedule); + blk_mq_sched_insert_requests(this_q, this_ctx, + &ctx_list, + from_schedule); } this_ctx = rq->mq_ctx; @@ -1245,8 +1272,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) * on 'ctx_list'. Do those. */ if (this_ctx) { - blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, - from_schedule); + trace_block_unplug(this_q, depth, from_schedule); + blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, + from_schedule); } } @@ -1284,51 +1312,39 @@ insert_rq: } spin_unlock(&ctx->lock); - __blk_mq_free_request(hctx, ctx, rq); + __blk_mq_finish_request(hctx, ctx, rq); return true; } } -static struct request *blk_mq_map_request(struct request_queue *q, - struct bio *bio, - struct blk_mq_alloc_data *data) -{ - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - struct request *rq; - - blk_queue_enter_live(q); - ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, ctx->cpu); - - trace_block_getrq(q, bio, bio->bi_opf); - blk_mq_set_alloc_data(data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(data, bio->bi_opf); - - data->hctx->queued++; - return rq; -} - static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) { - return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); + if (rq->tag != -1) + return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); + + return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) { - int ret; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, .last = 1 }; - blk_qc_t new_cookie = request_to_qc_t(hctx, rq); + struct blk_mq_hw_ctx *hctx; + blk_qc_t new_cookie; + int ret; - if (blk_mq_hctx_stopped(hctx)) + if (q->elevator) goto insert; + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + goto insert; + + new_cookie = request_to_qc_t(hctx, rq); + /* * For OK queue, we are done. For error, kill it. Any other * error (busy), just add it to our list as we previously @@ -1350,7 +1366,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) } insert: - blk_mq_insert_request(rq, false, true, true); + blk_mq_sched_insert_request(rq, false, true, true); } /* @@ -1383,9 +1399,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return BLK_QC_T_NONE; + if (blk_mq_sched_bio_merge(q, bio)) + return BLK_QC_T_NONE; + wb_acct = wbt_wait(q->rq_wb, bio, NULL); - rq = blk_mq_map_request(q, bio, &data); + trace_block_getrq(q, bio, bio->bi_opf); + + rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; @@ -1397,6 +1418,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); + blk_mq_get_driver_tag(rq, NULL, true); blk_insert_flush(rq); goto run_queue; } @@ -1447,6 +1469,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) goto done; } + if (q->elevator) { + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, true); + goto done; + } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { /* * For a SYNC request, send it to the hardware immediately. For @@ -1492,9 +1520,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); + if (blk_mq_sched_bio_merge(q, bio)) + return BLK_QC_T_NONE; + wb_acct = wbt_wait(q->rq_wb, bio, NULL); - rq = blk_mq_map_request(q, bio, &data); + trace_block_getrq(q, bio, bio->bi_opf); + + rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; @@ -1506,6 +1539,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); + blk_mq_get_driver_tag(rq, NULL, true); blk_insert_flush(rq); goto run_queue; } @@ -1544,6 +1578,12 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) return cookie; } + if (q->elevator) { + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, true); + goto done; + } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { /* * For a SYNC request, send it to the hardware immediately. For @@ -1556,6 +1596,7 @@ run_queue: } blk_mq_put_ctx(data.ctx); +done: return cookie; } @@ -1925,9 +1966,11 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, unsigned int hctx_idx) { - blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); - blk_mq_free_rq_map(set->tags[hctx_idx]); - set->tags[hctx_idx] = NULL; + if (set->tags[hctx_idx]) { + blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; + } } static void blk_mq_map_swqueue(struct request_queue *q, @@ -2084,6 +2127,8 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned int i; + blk_mq_sched_teardown(q); + /* hctx kobj stays in hctx */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx) @@ -2504,14 +2549,22 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) struct blk_mq_hw_ctx *hctx; int i, ret; - if (!set || nr > set->queue_depth) + if (!set) return -EINVAL; ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; - ret = blk_mq_tag_update_depth(hctx->tags, nr); + /* + * If we're using an MQ scheduler, just update the scheduler + * queue depth. This is similar to what the old code would do. + */ + if (!hctx->sched_tags) + ret = blk_mq_tag_update_depth(hctx->tags, + min(nr, set->queue_depth)); + else + ret = blk_mq_tag_update_depth(hctx->sched_tags, nr); if (ret) break; } @@ -2704,7 +2757,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) blk_flush_plug_list(plug, false); hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; - rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + if (!blk_qc_t_is_internal(cookie)) + rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + else + rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); return __blk_mq_poll(hctx, rq); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 1b279b02d0f6..0c7c034d9ddd 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -52,6 +52,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, */ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head); +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list); /* * CPU hotplug helpers */ @@ -124,6 +126,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { + if (data->flags & BLK_MQ_REQ_INTERNAL) + return data->hctx->sched_tags; + return data->hctx->tags; } @@ -132,8 +137,9 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data */ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, struct request *rq, unsigned int op); -void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, +void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq); +void blk_mq_finish_request(struct request *rq); struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op); diff --git a/block/blk-tag.c b/block/blk-tag.c index bae1decb6ec3..07cc329fa4b0 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) list_del_init(&rq->queuelist); rq->rq_flags &= ~RQF_QUEUED; rq->tag = -1; + rq->internal_tag = -1; if (unlikely(bqt->tag_index[tag] == NULL)) printk(KERN_ERR "%s: tag %d is missing\n", diff --git a/block/elevator.c b/block/elevator.c index 022a26830297..0e1ccddab8a2 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -40,6 +40,7 @@ #include #include "blk.h" +#include "blk-mq-sched.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -58,7 +59,9 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e->type->ops.sq.elevator_allow_bio_merge_fn) + if (e->uses_mq && e->type->ops.mq.allow_merge) + return e->type->ops.mq.allow_merge(q, rq, bio); + else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn) return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio); return 1; @@ -163,6 +166,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); + eq->uses_mq = e->uses_mq; return eq; } @@ -219,14 +223,26 @@ int elevator_init(struct request_queue *q, char *name) if (!e) { printk(KERN_ERR "Default I/O scheduler not found. " \ - "Using noop.\n"); + "Using noop/none.\n"); + if (q->mq_ops) { + elevator_put(e); + return 0; + } e = elevator_get("noop", false); } } - err = e->ops.sq.elevator_init_fn(q, e); - if (err) + if (e->uses_mq) { + err = blk_mq_sched_setup(q); + if (!err) + err = e->ops.mq.init_sched(q, e); + } else + err = e->ops.sq.elevator_init_fn(q, e); + if (err) { + if (e->uses_mq) + blk_mq_sched_teardown(q); elevator_put(e); + } return err; } EXPORT_SYMBOL(elevator_init); @@ -234,7 +250,9 @@ EXPORT_SYMBOL(elevator_init); void elevator_exit(struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); - if (e->type->ops.sq.elevator_exit_fn) + if (e->uses_mq && e->type->ops.mq.exit_sched) + e->type->ops.mq.exit_sched(e); + else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) e->type->ops.sq.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); @@ -253,6 +271,7 @@ void elv_rqhash_del(struct request_queue *q, struct request *rq) if (ELV_ON_HASH(rq)) __elv_rqhash_del(rq); } +EXPORT_SYMBOL_GPL(elv_rqhash_del); void elv_rqhash_add(struct request_queue *q, struct request *rq) { @@ -262,6 +281,7 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq) hash_add(e->hash, &rq->hash, rq_hash_key(rq)); rq->rq_flags |= RQF_HASHED; } +EXPORT_SYMBOL_GPL(elv_rqhash_add); void elv_rqhash_reposition(struct request_queue *q, struct request *rq) { @@ -443,7 +463,9 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) return ELEVATOR_BACK_MERGE; } - if (e->type->ops.sq.elevator_merge_fn) + if (e->uses_mq && e->type->ops.mq.request_merge) + return e->type->ops.mq.request_merge(q, req, bio); + else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn) return e->type->ops.sq.elevator_merge_fn(q, req, bio); return ELEVATOR_NO_MERGE; @@ -456,8 +478,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * * Returns true if we merged, false otherwise */ -static bool elv_attempt_insert_merge(struct request_queue *q, - struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) { struct request *__rq; bool ret; @@ -495,7 +516,9 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type) { struct elevator_queue *e = q->elevator; - if (e->type->ops.sq.elevator_merged_fn) + if (e->uses_mq && e->type->ops.mq.request_merged) + e->type->ops.mq.request_merged(q, rq, type); + else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn) e->type->ops.sq.elevator_merged_fn(q, rq, type); if (type == ELEVATOR_BACK_MERGE) @@ -508,10 +531,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; - const int next_sorted = next->rq_flags & RQF_SORTED; - - if (next_sorted && e->type->ops.sq.elevator_merge_req_fn) - e->type->ops.sq.elevator_merge_req_fn(q, rq, next); + bool next_sorted = false; + + if (e->uses_mq && e->type->ops.mq.requests_merged) + e->type->ops.mq.requests_merged(q, rq, next); + else if (e->type->ops.sq.elevator_merge_req_fn) { + next_sorted = next->rq_flags & RQF_SORTED; + if (next_sorted) + e->type->ops.sq.elevator_merge_req_fn(q, rq, next); + } elv_rqhash_reposition(q, rq); @@ -528,6 +556,9 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; + if (WARN_ON_ONCE(e->uses_mq)) + return; + if (e->type->ops.sq.elevator_bio_merged_fn) e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio); } @@ -574,11 +605,15 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) void elv_drain_elevator(struct request_queue *q) { + struct elevator_queue *e = q->elevator; static int printed; + if (WARN_ON_ONCE(e->uses_mq)) + return; + lockdep_assert_held(q->queue_lock); - while (q->elevator->type->ops.sq.elevator_dispatch_fn(q, 1)) + while (e->type->ops.sq.elevator_dispatch_fn(q, 1)) ; if (q->nr_sorted && printed++ < 10) { printk(KERN_ERR "%s: forced dispatching is broken " @@ -682,8 +717,11 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.sq.elevator_latter_req_fn) + if (e->uses_mq && e->type->ops.mq.next_request) + return e->type->ops.mq.next_request(q, rq); + else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn) return e->type->ops.sq.elevator_latter_req_fn(q, rq); + return NULL; } @@ -691,7 +729,9 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.sq.elevator_former_req_fn) + if (e->uses_mq && e->type->ops.mq.former_request) + return e->type->ops.mq.former_request(q, rq); + if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn) return e->type->ops.sq.elevator_former_req_fn(q, rq); return NULL; } @@ -701,6 +741,9 @@ int elv_set_request(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; + if (WARN_ON_ONCE(e->uses_mq)) + return 0; + if (e->type->ops.sq.elevator_set_req_fn) return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask); return 0; @@ -710,6 +753,9 @@ void elv_put_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + if (WARN_ON_ONCE(e->uses_mq)) + return; + if (e->type->ops.sq.elevator_put_req_fn) e->type->ops.sq.elevator_put_req_fn(rq); } @@ -718,6 +764,9 @@ int elv_may_queue(struct request_queue *q, unsigned int op) { struct elevator_queue *e = q->elevator; + if (WARN_ON_ONCE(e->uses_mq)) + return 0; + if (e->type->ops.sq.elevator_may_queue_fn) return e->type->ops.sq.elevator_may_queue_fn(q, op); @@ -728,6 +777,9 @@ void elv_completed_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + if (WARN_ON_ONCE(e->uses_mq)) + return; + /* * request is released from the driver, io must be done */ @@ -803,7 +855,7 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; - if (e->type->ops.sq.elevator_registered_fn) + if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn) e->type->ops.sq.elevator_registered_fn(q); } return error; @@ -891,9 +943,14 @@ EXPORT_SYMBOL_GPL(elv_unregister); static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old = q->elevator; - bool registered = old->registered; + bool old_registered = false; int err; + if (q->mq_ops) { + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + } + /* * Turn on BYPASS and drain all requests w/ elevator private data. * Block layer doesn't call into a quiesced elevator - all requests @@ -901,42 +958,76 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) * using INSERT_BACK. All requests have SOFTBARRIER set and no * merge happens either. */ - blk_queue_bypass_start(q); + if (old) { + old_registered = old->registered; + + if (old->uses_mq) + blk_mq_sched_teardown(q); - /* unregister and clear all auxiliary data of the old elevator */ - if (registered) - elv_unregister_queue(q); + if (!q->mq_ops) + blk_queue_bypass_start(q); - spin_lock_irq(q->queue_lock); - ioc_clear_queue(q); - spin_unlock_irq(q->queue_lock); + /* unregister and clear all auxiliary data of the old elevator */ + if (old_registered) + elv_unregister_queue(q); + + spin_lock_irq(q->queue_lock); + ioc_clear_queue(q); + spin_unlock_irq(q->queue_lock); + } /* allocate, init and register new elevator */ - err = new_e->ops.sq.elevator_init_fn(q, new_e); - if (err) - goto fail_init; + if (new_e) { + if (new_e->uses_mq) { + err = blk_mq_sched_setup(q); + if (!err) + err = new_e->ops.mq.init_sched(q, new_e); + } else + err = new_e->ops.sq.elevator_init_fn(q, new_e); + if (err) + goto fail_init; - if (registered) { err = elv_register_queue(q); if (err) goto fail_register; - } + } else + q->elevator = NULL; /* done, kill the old one and finish */ - elevator_exit(old); - blk_queue_bypass_end(q); + if (old) { + elevator_exit(old); + if (!q->mq_ops) + blk_queue_bypass_end(q); + } + + if (q->mq_ops) { + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + } - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + if (new_e) + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + else + blk_add_trace_msg(q, "elv switch: none"); return 0; fail_register: + if (q->mq_ops) + blk_mq_sched_teardown(q); elevator_exit(q->elevator); fail_init: /* switch failed, restore and re-register old elevator */ - q->elevator = old; - elv_register_queue(q); - blk_queue_bypass_end(q); + if (old) { + q->elevator = old; + elv_register_queue(q); + if (!q->mq_ops) + blk_queue_bypass_end(q); + } + if (q->mq_ops) { + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + } return err; } @@ -949,8 +1040,11 @@ static int __elevator_change(struct request_queue *q, const char *name) char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; - if (!q->elevator) - return -ENXIO; + /* + * Special case for mq, turn off scheduling + */ + if (q->mq_ops && !strncmp(name, "none", 4)) + return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); e = elevator_get(strstrip(elevator_name), true); @@ -959,11 +1053,21 @@ static int __elevator_change(struct request_queue *q, const char *name) return -EINVAL; } - if (!strcmp(elevator_name, q->elevator->type->elevator_name)) { + if (q->elevator && + !strcmp(elevator_name, q->elevator->type->elevator_name)) { elevator_put(e); return 0; } + if (!e->uses_mq && q->mq_ops) { + elevator_put(e); + return -EINVAL; + } + if (e->uses_mq && !q->mq_ops) { + elevator_put(e); + return -EINVAL; + } + return elevator_switch(q, e); } @@ -985,7 +1089,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, { int ret; - if (!q->elevator) + if (!(q->mq_ops || q->request_fn)) return count; ret = __elevator_change(q, name); @@ -999,24 +1103,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, ssize_t elv_iosched_show(struct request_queue *q, char *name) { struct elevator_queue *e = q->elevator; - struct elevator_type *elv; + struct elevator_type *elv = NULL; struct elevator_type *__e; int len = 0; - if (!q->elevator || !blk_queue_stackable(q)) + if (!blk_queue_stackable(q)) return sprintf(name, "none\n"); - elv = e->type; + if (!q->elevator) + len += sprintf(name+len, "[none] "); + else + elv = e->type; spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (!strcmp(elv->elevator_name, __e->elevator_name)) + if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); - else + continue; + } + if (__e->uses_mq && q->mq_ops) + len += sprintf(name+len, "%s ", __e->elevator_name); + else if (!__e->uses_mq && !q->mq_ops) len += sprintf(name+len, "%s ", __e->elevator_name); } spin_unlock(&elv_list_lock); + if (q->mq_ops && q->elevator) + len += sprintf(name+len, "none"); + len += sprintf(len+name, "\n"); return len; } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2686f9e7302a..63569eb46d15 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -22,6 +22,7 @@ struct blk_mq_hw_ctx { unsigned long flags; /* BLK_MQ_F_* flags */ + void *sched_data; struct request_queue *queue; struct blk_flush_queue *fq; @@ -35,6 +36,7 @@ struct blk_mq_hw_ctx { atomic_t wait_index; struct blk_mq_tags *tags; + struct blk_mq_tags *sched_tags; struct srcu_struct queue_rq_srcu; @@ -156,6 +158,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, + BLK_MQ_S_SCHED_RESTART = 2, BLK_MQ_MAX_DEPTH = 10240, @@ -179,13 +182,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ + BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ }; struct request *blk_mq_alloc_request(struct request_queue *q, int rw, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2e99d659b0f1..25564857f5f8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -154,6 +154,7 @@ struct request { /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ + int tag; sector_t __sector; /* sector cursor */ struct bio *bio; @@ -220,9 +221,10 @@ struct request { unsigned short ioprio; + int internal_tag; + void *special; /* opaque pointer available for LLD use */ - int tag; int errors; /* diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 2a9e966eed03..ecb96fd67c6d 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -77,6 +77,34 @@ struct elevator_ops elevator_registered_fn *elevator_registered_fn; }; +struct blk_mq_alloc_data; +struct blk_mq_hw_ctx; + +struct elevator_mq_ops { + int (*init_sched)(struct request_queue *, struct elevator_type *); + void (*exit_sched)(struct elevator_queue *); + + bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); + bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); + int (*request_merge)(struct request_queue *q, struct request **, struct bio *); + void (*request_merged)(struct request_queue *, struct request *, int); + void (*requests_merged)(struct request_queue *, struct request *, struct request *); + struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); + void (*put_request)(struct request *); + void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); + void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *); + bool (*has_work)(struct blk_mq_hw_ctx *); + void (*completed_request)(struct blk_mq_hw_ctx *, struct request *); + void (*started_request)(struct request *); + void (*requeue_request)(struct request *); + struct request *(*former_request)(struct request_queue *, struct request *); + struct request *(*next_request)(struct request_queue *, struct request *); + int (*get_rq_priv)(struct request_queue *, struct request *); + void (*put_rq_priv)(struct request_queue *, struct request *); + void (*init_icq)(struct io_cq *); + void (*exit_icq)(struct io_cq *); +}; + #define ELV_NAME_MAX (16) struct elv_fs_entry { @@ -96,12 +124,14 @@ struct elevator_type /* fields provided by elevator implementation */ union { struct elevator_ops sq; + struct elevator_mq_ops mq; } ops; size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; struct module *elevator_owner; + bool uses_mq; /* managed by elevator core */ char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */ @@ -125,6 +155,7 @@ struct elevator_queue struct kobject kobj; struct mutex sysfs_lock; unsigned int registered:1; + unsigned int uses_mq:1; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; @@ -141,6 +172,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *, extern void elv_merged_request(struct request_queue *, struct request *, int); extern void elv_bio_merged(struct request_queue *q, struct request *, struct bio *); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -- cgit v1.2.3 From 945ffb60c11dfb228130f3f2bdde961cecb76671 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 14 Jan 2017 17:11:11 -0700 Subject: mq-deadline: add blk-mq adaptation of the deadline IO scheduler This is basically identical to deadline-iosched, except it registers as a MQ capable scheduler. This is still a single queue design. Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- block/Kconfig.iosched | 6 + block/Makefile | 1 + block/mq-deadline.c | 553 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 560 insertions(+) create mode 100644 block/mq-deadline.c diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 421bef9c4c48..490ef2850fae 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -32,6 +32,12 @@ config IOSCHED_CFQ This is the default I/O scheduler. +config MQ_IOSCHED_DEADLINE + tristate "MQ deadline I/O scheduler" + default y + ---help--- + MQ version of the deadline IO scheduler. + config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" depends on IOSCHED_CFQ && BLK_CGROUP diff --git a/block/Makefile b/block/Makefile index 2eee9e1bb6db..3ee0abd7205a 100644 --- a/block/Makefile +++ b/block/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/mq-deadline.c b/block/mq-deadline.c new file mode 100644 index 000000000000..a01986d7b6fb --- /dev/null +++ b/block/mq-deadline.c @@ -0,0 +1,553 @@ +/* + * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, + * for the blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" + +/* + * See Documentation/block/deadline-iosched.txt + */ +static const int read_expire = HZ / 2; /* max time before a read is submitted. */ +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +static const int writes_starved = 2; /* max times reads can starve a write */ +static const int fifo_batch = 16; /* # of sequential requests treated as one + by the above parameters. For throughput. */ + +struct deadline_data { + /* + * run time data + */ + + /* + * requests (deadline_rq s) are present on both sort_list and fifo_list + */ + struct rb_root sort_list[2]; + struct list_head fifo_list[2]; + + /* + * next in sort order. read, write or both are NULL + */ + struct request *next_rq[2]; + unsigned int batching; /* number of sequential requests made */ + unsigned int starved; /* times reads have starved writes */ + + /* + * settings that change how the i/o scheduler behaves + */ + int fifo_expire[2]; + int fifo_batch; + int writes_starved; + int front_merges; + + spinlock_t lock; + struct list_head dispatch; +}; + +static inline struct rb_root * +deadline_rb_root(struct deadline_data *dd, struct request *rq) +{ + return &dd->sort_list[rq_data_dir(rq)]; +} + +/* + * get the request after `rq' in sector-sorted order + */ +static inline struct request * +deadline_latter_request(struct request *rq) +{ + struct rb_node *node = rb_next(&rq->rb_node); + + if (node) + return rb_entry_rq(node); + + return NULL; +} + +static void +deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) +{ + struct rb_root *root = deadline_rb_root(dd, rq); + + elv_rb_add(root, rq); +} + +static inline void +deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) +{ + const int data_dir = rq_data_dir(rq); + + if (dd->next_rq[data_dir] == rq) + dd->next_rq[data_dir] = deadline_latter_request(rq); + + elv_rb_del(deadline_rb_root(dd, rq), rq); +} + +/* + * remove rq from rbtree and fifo. + */ +static void deadline_remove_request(struct request_queue *q, struct request *rq) +{ + struct deadline_data *dd = q->elevator->elevator_data; + + list_del_init(&rq->queuelist); + + /* + * We might not be on the rbtree, if we are doing an insert merge + */ + if (!RB_EMPTY_NODE(&rq->rb_node)) + deadline_del_rq_rb(dd, rq); + + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; +} + +static void dd_request_merged(struct request_queue *q, struct request *req, + int type) +{ + struct deadline_data *dd = q->elevator->elevator_data; + + /* + * if the merge was a front merge, we need to reposition request + */ + if (type == ELEVATOR_FRONT_MERGE) { + elv_rb_del(deadline_rb_root(dd, req), req); + deadline_add_rq_rb(dd, req); + } +} + +static void dd_merged_requests(struct request_queue *q, struct request *req, + struct request *next) +{ + /* + * if next expires before rq, assign its expire time to rq + * and move into next position (next will be deleted) in fifo + */ + if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { + if (time_before((unsigned long)next->fifo_time, + (unsigned long)req->fifo_time)) { + list_move(&req->queuelist, &next->queuelist); + req->fifo_time = next->fifo_time; + } + } + + /* + * kill knowledge of next, this one is a goner + */ + deadline_remove_request(q, next); +} + +/* + * move an entry to dispatch queue + */ +static void +deadline_move_request(struct deadline_data *dd, struct request *rq) +{ + const int data_dir = rq_data_dir(rq); + + dd->next_rq[READ] = NULL; + dd->next_rq[WRITE] = NULL; + dd->next_rq[data_dir] = deadline_latter_request(rq); + + /* + * take it off the sort and fifo list + */ + deadline_remove_request(rq->q, rq); +} + +/* + * deadline_check_fifo returns 0 if there are no expired requests on the fifo, + * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) + */ +static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) +{ + struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); + + /* + * rq is expired! + */ + if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) + return 1; + + return 0; +} + +/* + * deadline_dispatch_requests selects the best request according to + * read/write expire, fifo_batch, etc + */ +static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + struct request *rq; + bool reads, writes; + int data_dir; + + if (!list_empty(&dd->dispatch)) { + rq = list_first_entry(&dd->dispatch, struct request, queuelist); + list_del_init(&rq->queuelist); + goto done; + } + + reads = !list_empty(&dd->fifo_list[READ]); + writes = !list_empty(&dd->fifo_list[WRITE]); + + /* + * batches are currently reads XOR writes + */ + if (dd->next_rq[WRITE]) + rq = dd->next_rq[WRITE]; + else + rq = dd->next_rq[READ]; + + if (rq && dd->batching < dd->fifo_batch) + /* we have a next request are still entitled to batch */ + goto dispatch_request; + + /* + * at this point we are not running a batch. select the appropriate + * data direction (read / write) + */ + + if (reads) { + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); + + if (writes && (dd->starved++ >= dd->writes_starved)) + goto dispatch_writes; + + data_dir = READ; + + goto dispatch_find_request; + } + + /* + * there are either no reads or writes have been starved + */ + + if (writes) { +dispatch_writes: + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); + + dd->starved = 0; + + data_dir = WRITE; + + goto dispatch_find_request; + } + + return NULL; + +dispatch_find_request: + /* + * we are not running a batch, find best request for selected data_dir + */ + if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { + /* + * A deadline has expired, the last request was in the other + * direction, or we have run out of higher-sectored requests. + * Start again from the request with the earliest expiry time. + */ + rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + } else { + /* + * The last req was the same dir and we have a next request in + * sort order. No expired requests so continue on from here. + */ + rq = dd->next_rq[data_dir]; + } + + dd->batching = 0; + +dispatch_request: + /* + * rq is the selected appropriate request. + */ + dd->batching++; + deadline_move_request(dd, rq); +done: + rq->rq_flags |= RQF_STARTED; + return rq; +} + +static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + + spin_lock(&dd->lock); + blk_mq_sched_move_to_dispatch(hctx, rq_list, __dd_dispatch_request); + spin_unlock(&dd->lock); +} + +static void dd_exit_queue(struct elevator_queue *e) +{ + struct deadline_data *dd = e->elevator_data; + + BUG_ON(!list_empty(&dd->fifo_list[READ])); + BUG_ON(!list_empty(&dd->fifo_list[WRITE])); + + kfree(dd); +} + +/* + * initialize elevator private data (deadline_data). + */ +static int dd_init_queue(struct request_queue *q, struct elevator_type *e) +{ + struct deadline_data *dd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + + dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); + if (!dd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = dd; + + INIT_LIST_HEAD(&dd->fifo_list[READ]); + INIT_LIST_HEAD(&dd->fifo_list[WRITE]); + dd->sort_list[READ] = RB_ROOT; + dd->sort_list[WRITE] = RB_ROOT; + dd->fifo_expire[READ] = read_expire; + dd->fifo_expire[WRITE] = write_expire; + dd->writes_starved = writes_starved; + dd->front_merges = 1; + dd->fifo_batch = fifo_batch; + spin_lock_init(&dd->lock); + INIT_LIST_HEAD(&dd->dispatch); + + q->elevator = eq; + return 0; +} + +static int dd_request_merge(struct request_queue *q, struct request **rq, + struct bio *bio) +{ + struct deadline_data *dd = q->elevator->elevator_data; + sector_t sector = bio_end_sector(bio); + struct request *__rq; + + if (!dd->front_merges) + return ELEVATOR_NO_MERGE; + + __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); + if (__rq) { + BUG_ON(sector != blk_rq_pos(__rq)); + + if (elv_bio_merge_ok(__rq, bio)) { + *rq = __rq; + return ELEVATOR_FRONT_MERGE; + } + } + + return ELEVATOR_NO_MERGE; +} + +static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + int ret; + + spin_lock(&dd->lock); + ret = blk_mq_sched_try_merge(q, bio); + spin_unlock(&dd->lock); + + return ret; +} + +/* + * add rq to rbtree and fifo + */ +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const int data_dir = rq_data_dir(rq); + + if (blk_mq_sched_try_insert_merge(q, rq)) + return; + + blk_mq_sched_request_inserted(rq); + + if (blk_mq_sched_bypass_insert(hctx, rq)) + return; + + if (at_head || rq->cmd_type != REQ_TYPE_FS) { + if (at_head) + list_add(&rq->queuelist, &dd->dispatch); + else + list_add_tail(&rq->queuelist, &dd->dispatch); + } else { + deadline_add_rq_rb(dd, rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + + /* + * set expire time and add to fifo list + */ + rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; + list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); + } +} + +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, bool at_head) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_lock(&dd->lock); + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_insert_request(hctx, rq, at_head); + } + spin_unlock(&dd->lock); +} + +static bool dd_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + + return !list_empty_careful(&dd->dispatch) || + !list_empty_careful(&dd->fifo_list[0]) || + !list_empty_careful(&dd->fifo_list[1]); +} + +/* + * sysfs parts below + */ +static ssize_t +deadline_var_show(int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t +deadline_var_store(int *var, const char *page, size_t count) +{ + char *p = (char *) page; + + *var = simple_strtol(p, &p, 10); + return count; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return deadline_var_show(__data, (page)); \ +} +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data; \ + int ret = deadline_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); +#undef STORE_FUNCTION + +#define DD_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \ + deadline_##name##_store) + +static struct elv_fs_entry deadline_attrs[] = { + DD_ATTR(read_expire), + DD_ATTR(write_expire), + DD_ATTR(writes_starved), + DD_ATTR(front_merges), + DD_ATTR(fifo_batch), + __ATTR_NULL +}; + +static struct elevator_type mq_deadline = { + .ops.mq = { + .insert_requests = dd_insert_requests, + .dispatch_requests = dd_dispatch_requests, + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .bio_merge = dd_bio_merge, + .request_merge = dd_request_merge, + .requests_merged = dd_merged_requests, + .request_merged = dd_request_merged, + .has_work = dd_has_work, + .init_sched = dd_init_queue, + .exit_sched = dd_exit_queue, + }, + + .uses_mq = true, + .elevator_attrs = deadline_attrs, + .elevator_name = "mq-deadline", + .elevator_owner = THIS_MODULE, +}; + +static int __init deadline_init(void) +{ + return elv_register(&mq_deadline); +} + +static void __exit deadline_exit(void) +{ + elv_unregister(&mq_deadline); +} + +module_init(deadline_init); +module_exit(deadline_exit); + +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MQ deadline IO scheduler"); -- cgit v1.2.3 From d34849913819a5e0cbfbe724dbe79df89278c524 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jan 2017 14:43:58 -0700 Subject: blk-mq-sched: allow setting of default IO scheduler Add Kconfig entries to manage what devices get assigned an MQ scheduler, and add a blk-mq flag for drivers to opt out of scheduling. The latter is useful for admin type queues that still allocate a blk-mq queue and tag set, but aren't use for normal IO. Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- block/Kconfig.iosched | 56 +++++++++++++++++++++++++++++++++++++++++++------ block/blk-mq-sched.c | 20 ++++++++++++++++++ block/blk-mq-sched.h | 2 ++ block/blk-mq.c | 8 +++++++ block/elevator.c | 8 ++++++- drivers/nvme/host/pci.c | 1 + include/linux/blk-mq.h | 1 + 7 files changed, 89 insertions(+), 7 deletions(-) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 490ef2850fae..0715ce93daef 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -32,12 +32,6 @@ config IOSCHED_CFQ This is the default I/O scheduler. -config MQ_IOSCHED_DEADLINE - tristate "MQ deadline I/O scheduler" - default y - ---help--- - MQ version of the deadline IO scheduler. - config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" depends on IOSCHED_CFQ && BLK_CGROUP @@ -69,6 +63,56 @@ config DEFAULT_IOSCHED default "cfq" if DEFAULT_CFQ default "noop" if DEFAULT_NOOP +config MQ_IOSCHED_DEADLINE + tristate "MQ deadline I/O scheduler" + default y + ---help--- + MQ version of the deadline IO scheduler. + +config MQ_IOSCHED_NONE + bool + default y + +choice + prompt "Default single-queue blk-mq I/O scheduler" + default DEFAULT_SQ_NONE + help + Select the I/O scheduler which will be used by default for blk-mq + managed block devices with a single queue. + + config DEFAULT_SQ_DEADLINE + bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y + + config DEFAULT_SQ_NONE + bool "None" + +endchoice + +config DEFAULT_SQ_IOSCHED + string + default "mq-deadline" if DEFAULT_SQ_DEADLINE + default "none" if DEFAULT_SQ_NONE + +choice + prompt "Default multi-queue blk-mq I/O scheduler" + default DEFAULT_MQ_NONE + help + Select the I/O scheduler which will be used by default for blk-mq + managed block devices with multiple queues. + + config DEFAULT_MQ_DEADLINE + bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y + + config DEFAULT_MQ_NONE + bool "None" + +endchoice + +config DEFAULT_MQ_IOSCHED + string + default "mq-deadline" if DEFAULT_MQ_DEADLINE + default "none" if DEFAULT_MQ_NONE + endmenu endif diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 26759798a0b3..d05061f27bb1 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -366,3 +366,23 @@ void blk_mq_sched_teardown(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) blk_mq_sched_free_tags(set, hctx, i); } + +int blk_mq_sched_init(struct request_queue *q) +{ + int ret; + +#if defined(CONFIG_DEFAULT_SQ_NONE) + if (q->nr_hw_queues == 1) + return 0; +#endif +#if defined(CONFIG_DEFAULT_MQ_NONE) + if (q->nr_hw_queues > 1) + return 0; +#endif + + mutex_lock(&q->sysfs_lock); + ret = elevator_init(q, NULL); + mutex_unlock(&q->sysfs_lock); + + return ret; +} diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 35c49e2e008a..6b465bc7014c 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -28,6 +28,8 @@ void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, int blk_mq_sched_setup(struct request_queue *q); void blk_mq_sched_teardown(struct request_queue *q); +int blk_mq_sched_init(struct request_queue *q); + static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 45e1707a9f86..fa1f8619bfe7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2285,6 +2285,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, mutex_unlock(&all_q_mutex); put_online_cpus(); + if (!(set->flags & BLK_MQ_F_NO_SCHED)) { + int ret; + + ret = blk_mq_sched_init(q); + if (ret) + return ERR_PTR(ret); + } + return q; err_hctxs: diff --git a/block/elevator.c b/block/elevator.c index 0e1ccddab8a2..bcba2dd5cb5c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -219,7 +219,13 @@ int elevator_init(struct request_queue *q, char *name) } if (!e) { - e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + if (q->mq_ops && q->nr_hw_queues == 1) + e = elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false); + else if (q->mq_ops) + e = elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false); + else + e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + if (!e) { printk(KERN_ERR "Default I/O scheduler not found. " \ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 19beeb7b2ac2..e1b4e603b1cf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1181,6 +1181,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); + dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; dev->admin_tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->admin_tagset)) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 63569eb46d15..8e4df3d6c8cd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -153,6 +153,7 @@ enum { BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_BLOCKING = 1 << 5, + BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, -- cgit v1.2.3 From 4d199c6f1c847151ea393d5b9946a17cc57cdf6d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Jan 2017 07:43:26 -0700 Subject: blk-cgroup: ensure that we clear the stop bit on quiesced queues If we call blk_mq_quiesce_queue() on a queue, we must remember to pair that with something that clears the stopped by on the queues later on. Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2630f64bed19..efb97ec37eee 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1265,9 +1265,10 @@ pd_prealloc: spin_unlock_irq(q->queue_lock); out_bypass_end: - if (q->mq_ops) + if (q->mq_ops) { blk_mq_unfreeze_queue(q); - else + blk_mq_start_stopped_hw_queues(q, true); + } else blk_queue_bypass_end(q); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); @@ -1317,9 +1318,10 @@ void blkcg_deactivate_policy(struct request_queue *q, spin_unlock_irq(q->queue_lock); - if (q->mq_ops) + if (q->mq_ops) { blk_mq_unfreeze_queue(q); - else + blk_mq_start_stopped_hw_queues(q, true); + } else blk_queue_bypass_end(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); -- cgit v1.2.3 From f66227de5924ed0fde1823f5cbc4d8b8f45faaa2 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 18 Jan 2017 11:55:21 -0800 Subject: sbitmap: use smp_mb__after_atomic() in sbq_wake_up() We always do an atomic clear_bit() right before we call sbq_wake_up(), so we can use smp_mb__after_atomic(). While we're here, comment the memory barriers in here a little more. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- lib/sbitmap.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 2cecf05c82fd..df4e472df8a3 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -299,8 +299,14 @@ static void sbq_wake_up(struct sbitmap_queue *sbq) struct sbq_wait_state *ws; int wait_cnt; - /* Ensure that the wait list checks occur after clear_bit(). */ - smp_mb(); + /* + * Pairs with the memory barrier in set_current_state() to ensure the + * proper ordering of clear_bit()/waitqueue_active() in the waker and + * test_and_set_bit()/prepare_to_wait()/finish_wait() in the waiter. See + * the comment on waitqueue_active(). This is __after_atomic because we + * just did clear_bit() in the caller. + */ + smp_mb__after_atomic(); ws = sbq_wake_ptr(sbq); if (!ws) @@ -331,7 +337,8 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) int i, wake_index; /* - * Make sure all changes prior to this are visible from other CPUs. + * Pairs with the memory barrier in set_current_state() like in + * sbq_wake_up(). */ smp_mb(); wake_index = atomic_read(&sbq->wake_index); -- cgit v1.2.3 From 6c0ca7ae292adea09b8bdd33a524bb9326c3e989 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 18 Jan 2017 11:55:22 -0800 Subject: sbitmap: fix wakeup hang after sbq resize When we resize a struct sbitmap_queue, we update the wakeup batch size, but we don't update the wait count in the struct sbq_wait_states. If we resized down from a size which could use a bigger batch size, these counts could be too large and cause us to miss necessary wakeups. To fix this, update the wait counts when we resize (ensuring some careful memory ordering so that it's safe w.r.t. concurrent clears). This also fixes a theoretical issue where two threads could end up bumping the wait count up by the batch size, which could also potentially lead to hangs. Reported-by: Martin Raiber Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs") Fixes: 2971c35f3588 ("blk-mq: bitmap tag: fix race on blk_mq_bitmap_tags::wake_cnt") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- lib/sbitmap.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index df4e472df8a3..8f5c3b268c77 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -239,7 +239,19 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) { - sbq->wake_batch = sbq_calc_wake_batch(depth); + unsigned int wake_batch = sbq_calc_wake_batch(depth); + int i; + + if (sbq->wake_batch != wake_batch) { + WRITE_ONCE(sbq->wake_batch, wake_batch); + /* + * Pairs with the memory barrier in sbq_wake_up() to ensure that + * the batch size is updated before the wait counts. + */ + smp_mb__before_atomic(); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) + atomic_set(&sbq->ws[i].wait_cnt, 1); + } sbitmap_resize(&sbq->sb, depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); @@ -297,6 +309,7 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) static void sbq_wake_up(struct sbitmap_queue *sbq) { struct sbq_wait_state *ws; + unsigned int wake_batch; int wait_cnt; /* @@ -313,10 +326,22 @@ static void sbq_wake_up(struct sbitmap_queue *sbq) return; wait_cnt = atomic_dec_return(&ws->wait_cnt); - if (unlikely(wait_cnt < 0)) - wait_cnt = atomic_inc_return(&ws->wait_cnt); - if (wait_cnt == 0) { - atomic_add(sbq->wake_batch, &ws->wait_cnt); + if (wait_cnt <= 0) { + wake_batch = READ_ONCE(sbq->wake_batch); + /* + * Pairs with the memory barrier in sbitmap_queue_resize() to + * ensure that we see the batch size update before the wait + * count is reset. + */ + smp_mb__before_atomic(); + /* + * If there are concurrent callers to sbq_wake_up(), the last + * one to decrement the wait count below zero will bump it back + * up. If there is a concurrent resize, the count reset will + * either cause the cmpxchg to fail or overwrite after the + * cmpxchg. + */ + atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch); sbq_index_atomic_inc(&sbq->wake_index); wake_up(&ws->wait); } -- cgit v1.2.3 From 38dbb7dd4db184da4d2673f4bb963f7006465c37 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Jan 2017 15:37:27 -0700 Subject: blk-cgroup: don't quiesce the queue on policy activate/deactivate There's no potential harm in quiescing the queue, but it also doesn't buy us anything. And we can't run the queue async for policy deactivate, since we could be in the path of tearing the queue down. If we schedule an async run of the queue at that time, we're racing with queue teardown AFTER having we've already torn most of it down. Reported-by: Omar Sandoval Fixes: 4d199c6f1c84 ("blk-cgroup: ensure that we clear the stop bit on quiesced queues") Tested-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index efb97ec37eee..fb59a3edc778 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1223,10 +1223,9 @@ int blkcg_activate_policy(struct request_queue *q, if (blkcg_policy_enabled(q, pol)) return 0; - if (q->mq_ops) { + if (q->mq_ops) blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - } else + else blk_queue_bypass_start(q); pd_prealloc: if (!pd_prealloc) { @@ -1265,10 +1264,9 @@ pd_prealloc: spin_unlock_irq(q->queue_lock); out_bypass_end: - if (q->mq_ops) { + if (q->mq_ops) blk_mq_unfreeze_queue(q); - blk_mq_start_stopped_hw_queues(q, true); - } else + else blk_queue_bypass_end(q); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); @@ -1292,10 +1290,9 @@ void blkcg_deactivate_policy(struct request_queue *q, if (!blkcg_policy_enabled(q, pol)) return; - if (q->mq_ops) { + if (q->mq_ops) blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - } else + else blk_queue_bypass_start(q); spin_lock_irq(q->queue_lock); @@ -1318,10 +1315,9 @@ void blkcg_deactivate_policy(struct request_queue *q, spin_unlock_irq(q->queue_lock); - if (q->mq_ops) { + if (q->mq_ops) blk_mq_unfreeze_queue(q); - blk_mq_start_stopped_hw_queues(q, true); - } else + else blk_queue_bypass_end(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); -- cgit v1.2.3 From 610d886c0c22fa7504e817b6d03c402de64b0264 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Jan 2017 07:10:13 -0700 Subject: elevator: fix unnecessary put of elevator in failure case We already checked that e is NULL, so no point in calling elevator_put() to free it. Reported-by: Dan Carpenter Fixes: dc877dbd088f ("blk-mq-sched: add framework for MQ capable IO schedulers") Signed-off-by: Jens Axboe --- block/elevator.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index bcba2dd5cb5c..ef7f59469acc 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -230,10 +230,6 @@ int elevator_init(struct request_queue *q, char *name) printk(KERN_ERR "Default I/O scheduler not found. " \ "Using noop/none.\n"); - if (q->mq_ops) { - elevator_put(e); - return 0; - } e = elevator_get("noop", false); } } -- cgit v1.2.3 From 8cecb07d70e761eb0112f921d939b9ab2ea2171f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Jan 2017 07:39:17 -0700 Subject: blk-mq-tag: remove redundant check for 'data->hctx' being non-NULL We used to pass in NULL for hctx for reserved tags, but we don't do that anymore. Hence the check for whether hctx is NULL or not is now redundant, kill it. Reported-by: Dan Carpenter Fixes: a642a158aec6 ("blk-mq-tag: cleanup the normal/reserved tag allocation") Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 9753747a34a2..5504eb7ed10b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -136,11 +136,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for - * some to complete. Note that hctx can be NULL here for - * reserved tag allocation. + * some to complete. */ - if (data->hctx) - blk_mq_run_hw_queue(data->hctx, false); + blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, -- cgit v1.2.3 From 7e79dadce222e06e0c30a77280f3426014bee185 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Jan 2017 07:58:59 -0700 Subject: blk-mq: stop hardware queue in blk_mq_delay_queue() The run handler we register for the delayed work requires that the queue be stopped, yet we leave that up to the caller. Let's move it into blk_mq_delay_queue() itself, so that the API is sane. This fixes a stall with SCSI, where it calls blk_mq_delay_queue() without having stopped the queue. Hence the queue is never run. Reported-by: Hannes Reinecke Fixes: 70f4db639c5b ("blk-mq: add blk_mq_delay_queue") Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index fa1f8619bfe7..b365cde4c909 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1170,6 +1170,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) if (unlikely(!blk_mq_hw_queue_mapped(hctx))) return; + blk_mq_stop_hw_queue(hctx); kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->delay_work, msecs_to_jiffies(msecs)); } -- cgit v1.2.3 From 70f36b6001bf596eb411c4b302e84c4824ae8730 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Jan 2017 10:59:07 -0700 Subject: blk-mq: allow resize of scheduler requests Add support for growing the tags associated with a hardware queue, for the scheduler tags. Currently we only support resizing within the limits of the original depth, change that so we can grow it as well by allocating and replacing the existing scheduler tag set. This is similar to how we could increase the software queue depth with the legacy IO stack and schedulers. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval --- block/blk-mq-tag.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- block/blk-mq-tag.h | 4 +++- block/blk-mq.c | 19 ++++++++++++++----- 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 5504eb7ed10b..a49ec77c415a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -387,19 +387,56 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) kfree(tags); } -int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) +int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tagsptr, unsigned int tdepth, + bool can_grow) { - tdepth -= tags->nr_reserved_tags; - if (tdepth > tags->nr_tags) + struct blk_mq_tags *tags = *tagsptr; + + if (tdepth <= tags->nr_reserved_tags) return -EINVAL; + tdepth -= tags->nr_reserved_tags; + /* - * Don't need (or can't) update reserved tags here, they remain - * static and should never need resizing. + * If we are allowed to grow beyond the original size, allocate + * a new set of tags before freeing the old one. */ - sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + if (tdepth > tags->nr_tags) { + struct blk_mq_tag_set *set = hctx->queue->tag_set; + struct blk_mq_tags *new; + bool ret; + + if (!can_grow) + return -EINVAL; + + /* + * We need some sort of upper limit, set it high enough that + * no valid use cases should require more. + */ + if (tdepth > 16 * BLKDEV_MAX_RQ) + return -EINVAL; + + new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0); + if (!new) + return -ENOMEM; + ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); + if (ret) { + blk_mq_free_rq_map(new); + return -ENOMEM; + } + + blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); + blk_mq_free_rq_map(*tagsptr); + *tagsptr = new; + } else { + /* + * Don't need (or can't) update reserved tags here, they + * remain static and should never need resizing. + */ + sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + } - blk_mq_tag_wakeup_all(tags, false); return 0; } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 41cd15fd1afd..ac22878462e7 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -29,7 +29,9 @@ extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); -extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); +extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tags, + unsigned int depth, bool can_grow); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); diff --git a/block/blk-mq.c b/block/blk-mq.c index b365cde4c909..ee69e5e89769 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2561,6 +2561,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!set) return -EINVAL; + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) @@ -2569,11 +2572,14 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * If we're using an MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. */ - if (!hctx->sched_tags) - ret = blk_mq_tag_update_depth(hctx->tags, - min(nr, set->queue_depth)); - else - ret = blk_mq_tag_update_depth(hctx->sched_tags, nr); + if (!hctx->sched_tags) { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, + min(nr, set->queue_depth), + false); + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, + nr, true); + } if (ret) break; } @@ -2581,6 +2587,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) q->nr_requests = nr; + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + return ret; } -- cgit v1.2.3 From 4d608baac5f4e72b033a122b2d6d9499532c3afc Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 23 Jan 2017 15:06:43 +0100 Subject: block: Initialize cfqq->ioprio_class in cfq_get_queue() KMSAN (KernelMemorySanitizer, a new error detection tool) reports use of uninitialized memory in cfq_init_cfqq(): ================================================================== BUG: KMSAN: use of unitialized memory ... Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x157/0x1d0 lib/dump_stack.c:51 [] kmsan_report+0x205/0x360 ??:? [] __msan_warning+0x5b/0xb0 ??:? [< inline >] cfq_init_cfqq block/cfq-iosched.c:3754 [] cfq_get_queue+0xc80/0x14d0 block/cfq-iosched.c:3857 ... origin: [] save_stack_trace+0x27/0x50 arch/x86/kernel/stacktrace.c:67 [] kmsan_internal_poison_shadow+0xab/0x150 ??:? [] kmsan_poison_slab+0xbb/0x120 ??:? [< inline >] allocate_slab mm/slub.c:1627 [] new_slab+0x3af/0x4b0 mm/slub.c:1641 [< inline >] new_slab_objects mm/slub.c:2407 [] ___slab_alloc+0x323/0x4a0 mm/slub.c:2564 [< inline >] __slab_alloc mm/slub.c:2606 [< inline >] slab_alloc_node mm/slub.c:2669 [] kmem_cache_alloc_node+0x1d2/0x1f0 mm/slub.c:2746 [] cfq_get_queue+0x47d/0x14d0 block/cfq-iosched.c:3850 ... ================================================================== (the line numbers are relative to 4.8-rc6, but the bug persists upstream) The uninitialized struct cfq_queue is created by kmem_cache_alloc_node() and then passed to cfq_init_cfqq(), which accesses cfqq->ioprio_class before it's initialized. Signed-off-by: Alexander Potapenko Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 37aeb20fa454..c472886c727f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3864,6 +3864,8 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } + /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */ + cfqq->ioprio_class = IOPRIO_CLASS_NONE; cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, cic); cfq_link_cfqq_cfqg(cfqq, cfqg); -- cgit v1.2.3 From d609af3a1397511a7a2c213f9f855fa82df771ee Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 21 Jan 2017 22:15:33 +0100 Subject: blk-throttle: Adjust two function calls together with a variable assignment The script "checkpatch.pl" pointed information out like the following. ERROR: do not use assignment in if condition Thus fix the affected source code places. Signed-off-by: Markus Elfring Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-throttle.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a6bb4fe326c3..82fd0cc394eb 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -866,10 +866,12 @@ static void tg_update_disptime(struct throtl_grp *tg) unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; struct bio *bio; - if ((bio = throtl_peek_queued(&sq->queued[READ]))) + bio = throtl_peek_queued(&sq->queued[READ]); + if (bio) tg_may_dispatch(tg, bio, &read_wait); - if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) + bio = throtl_peek_queued(&sq->queued[WRITE]); + if (bio) tg_may_dispatch(tg, bio, &write_wait); min_wait = min(read_wait, write_wait); -- cgit v1.2.3 From 1cf417530375b475d4a8a9f18dc0867f91e52d78 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 21 Jan 2017 22:44:07 +0100 Subject: cfq-iosched: Adjust one function call together with a variable assignment The script "checkpatch.pl" pointed information out like the following. ERROR: do not use assignment in if condition Thus fix the affected source code place. Signed-off-by: Markus Elfring Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c472886c727f..f0f29ee731e1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2749,9 +2749,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) if (!cfqg) return NULL; - for_each_cfqg_st(cfqg, i, j, st) - if ((cfqq = cfq_rb_first(st)) != NULL) + for_each_cfqg_st(cfqg, i, j, st) { + cfqq = cfq_rb_first(st); + if (cfqq) return cfqq; + } return NULL; } -- cgit v1.2.3 From 200e86b3372b51e136a382e007b6b904b1dac7e4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Jan 2017 08:11:38 -0700 Subject: blk-mq: only apply active queue tag throttling for driver tags If we have a scheduler attached, we have two sets of tags. We don't want to apply our active queue throttling for the scheduler side of tags, that only applies to driver tags since that's the resource we need to dispatch an IO. Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 12 +++++++----- block/blk-mq.c | 13 ++++++++----- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index a49ec77c415a..1b156ca79af6 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -90,9 +90,11 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -static int __blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) +static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, + struct sbitmap_queue *bt) { - if (!hctx_may_queue(hctx, bt)) + if (!(data->flags & BLK_MQ_REQ_INTERNAL) && + !hctx_may_queue(data->hctx, bt)) return -1; return __sbitmap_queue_get(bt); } @@ -118,7 +120,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) tag_offset = tags->nr_reserved_tags; } - tag = __blk_mq_get_tag(data->hctx, bt); + tag = __blk_mq_get_tag(data, bt); if (tag != -1) goto found_tag; @@ -129,7 +131,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) do { prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __blk_mq_get_tag(data->hctx, bt); + tag = __blk_mq_get_tag(data, bt); if (tag != -1) break; @@ -144,7 +146,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ - tag = __blk_mq_get_tag(data->hctx, bt); + tag = __blk_mq_get_tag(data, bt); if (tag != -1) break; diff --git a/block/blk-mq.c b/block/blk-mq.c index ee69e5e89769..dcb567642db7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -230,15 +230,14 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, rq = tags->static_rqs[tag]; - if (blk_mq_tag_busy(data->hctx)) { - rq->rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); - } - if (data->flags & BLK_MQ_REQ_INTERNAL) { rq->tag = -1; rq->internal_tag = tag; } else { + if (blk_mq_tag_busy(data->hctx)) { + rq->rq_flags = RQF_MQ_INFLIGHT; + atomic_inc(&data->hctx->nr_active); + } rq->tag = tag; rq->internal_tag = -1; } @@ -869,6 +868,10 @@ done: rq->tag = blk_mq_get_tag(&data); if (rq->tag >= 0) { + if (blk_mq_tag_busy(data.hctx)) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&data.hctx->nr_active); + } data.hctx->tags->rqs[rq->tag] = rq; goto done; } -- cgit v1.2.3 From 5a797e00dc93593c9915841779881b15d9856237 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 12:22:11 -0700 Subject: blk-mq: don't lose flags passed in to blk_mq_alloc_request() If we come in from blk_mq_alloc_requst() with NOWAIT set in flags, we must ensure that we don't later overwrite that in blk_mq_sched_get_request(). Initialize alloc_data->flags before passing it in. Reported-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 2 +- block/blk-mq.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index d05061f27bb1..56b92db944ae 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -117,7 +117,7 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(data, q, 0, ctx, hctx); + blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx); if (e) { data->flags |= BLK_MQ_REQ_INTERNAL; diff --git a/block/blk-mq.c b/block/blk-mq.c index dcb567642db7..84d13b5cafd0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -253,7 +253,7 @@ EXPORT_SYMBOL_GPL(__blk_mq_alloc_request); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, unsigned int flags) { - struct blk_mq_alloc_data alloc_data; + struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; int ret; @@ -1382,7 +1382,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); - struct blk_mq_alloc_data data; + struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; unsigned int request_count = 0, srcu_idx; struct blk_plug *plug; @@ -1504,7 +1504,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_plug *plug; unsigned int request_count = 0; - struct blk_mq_alloc_data data; + struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; blk_qc_t cookie; unsigned int wb_acct; -- cgit v1.2.3 From b48fda0976a802e0fe4fc0bedefb7cf380ec6426 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 14:52:20 -0700 Subject: blk-mq-sched: check for successful allocation before assigning tag We don't trigger this from the normal IO path, since we always use blocking allocations from there. But Bart saw it testing multipath dm, since that is a heavy user of atomic request allocations in the map and clone path. Reported-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 56b92db944ae..4cee060a292d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -134,7 +134,8 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, rq = __blk_mq_alloc_request(data, op); } else { rq = __blk_mq_alloc_request(data, op); - data->hctx->tags->rqs[rq->tag] = rq; + if (rq) + data->hctx->tags->rqs[rq->tag] = rq; } if (rq) { -- cgit v1.2.3 From 07e4fead45e6e1932f0b960655ab554b6aab6a08 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:40 -0800 Subject: blk-mq: create debugfs directory tree In preparation for putting blk-mq debugging information in debugfs, create a directory tree mirroring the one in sysfs: # tree -d /sys/kernel/debug/block /sys/kernel/debug/block |-- nvme0n1 | `-- mq | |-- 0 | | `-- cpu0 | |-- 1 | | `-- cpu1 | |-- 2 | | `-- cpu2 | `-- 3 | `-- cpu3 `-- vda `-- mq `-- 0 |-- cpu0 |-- cpu1 |-- cpu2 `-- cpu3 Also add the scaffolding for the actual files that will go in here, either under the hardware queue or software queue directories. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/Makefile | 1 + block/blk-mq-debugfs.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-mq-sysfs.c | 8 +++ block/blk-mq.c | 2 + block/blk-mq.h | 33 +++++++++++ include/linux/blkdev.h | 5 ++ 6 files changed, 201 insertions(+) create mode 100644 block/blk-mq-debugfs.c diff --git a/block/Makefile b/block/Makefile index 3ee0abd7205a..6cabe6bd2882 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,3 +26,4 @@ obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o +obj-$(CONFIG_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c new file mode 100644 index 000000000000..01711bbf5ade --- /dev/null +++ b/block/blk-mq-debugfs.c @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +#include +#include "blk-mq.h" + +struct blk_mq_debugfs_attr { + const char *name; + umode_t mode; + const struct file_operations *fops; +}; + +static struct dentry *block_debugfs_root; + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { +}; + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { +}; + +int blk_mq_debugfs_register(struct request_queue *q, const char *name) +{ + if (!block_debugfs_root) + return -ENOENT; + + q->debugfs_dir = debugfs_create_dir(name, block_debugfs_root); + if (!q->debugfs_dir) + goto err; + + if (blk_mq_debugfs_register_hctxs(q)) + goto err; + + return 0; + +err: + blk_mq_debugfs_unregister(q); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister(struct request_queue *q) +{ + debugfs_remove_recursive(q->debugfs_dir); + q->mq_debugfs_dir = NULL; + q->debugfs_dir = NULL; +} + +static int blk_mq_debugfs_register_ctx(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct dentry *hctx_dir) +{ + struct dentry *ctx_dir; + char name[20]; + int i; + + snprintf(name, sizeof(name), "cpu%u", ctx->cpu); + ctx_dir = debugfs_create_dir(name, hctx_dir); + if (!ctx_dir) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(blk_mq_debugfs_ctx_attrs); i++) { + const struct blk_mq_debugfs_attr *attr; + + attr = &blk_mq_debugfs_ctx_attrs[i]; + if (!debugfs_create_file(attr->name, attr->mode, ctx_dir, ctx, + attr->fops)) + return -ENOMEM; + } + + return 0; +} + +static int blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + struct dentry *hctx_dir; + char name[20]; + int i; + + snprintf(name, sizeof(name), "%u", hctx->queue_num); + hctx_dir = debugfs_create_dir(name, q->mq_debugfs_dir); + if (!hctx_dir) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(blk_mq_debugfs_hctx_attrs); i++) { + const struct blk_mq_debugfs_attr *attr; + + attr = &blk_mq_debugfs_hctx_attrs[i]; + if (!debugfs_create_file(attr->name, attr->mode, hctx_dir, hctx, + attr->fops)) + return -ENOMEM; + } + + hctx_for_each_ctx(hctx, ctx, i) { + if (blk_mq_debugfs_register_ctx(q, ctx, hctx_dir)) + return -ENOMEM; + } + + return 0; +} + +int blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + if (!q->debugfs_dir) + return -ENOENT; + + q->mq_debugfs_dir = debugfs_create_dir("mq", q->debugfs_dir); + if (!q->mq_debugfs_dir) + goto err; + + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_debugfs_register_hctx(q, hctx)) + goto err; + } + + return 0; + +err: + blk_mq_debugfs_unregister_hctxs(q); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +{ + debugfs_remove_recursive(q->mq_debugfs_dir); + q->mq_debugfs_dir = NULL; +} + +void blk_mq_debugfs_init(void) +{ + block_debugfs_root = debugfs_create_dir("block", NULL); +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 2caecaa98e40..91b93ca1c1e0 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -468,6 +468,8 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) kobject_put(&hctx->kobj); } + blk_mq_debugfs_unregister(q); + kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); kobject_del(&q->mq_kobj); kobject_put(&q->mq_kobj); @@ -517,6 +519,8 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q) kobject_uevent(&q->mq_kobj, KOBJ_ADD); + blk_mq_debugfs_register(q, kobject_name(&dev->kobj)); + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) @@ -542,6 +546,8 @@ void blk_mq_sysfs_unregister(struct request_queue *q) if (!q->mq_sysfs_init_done) return; + blk_mq_debugfs_unregister_hctxs(q); + queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); } @@ -554,6 +560,8 @@ int blk_mq_sysfs_register(struct request_queue *q) if (!q->mq_sysfs_init_done) return ret; + blk_mq_debugfs_register_hctxs(q); + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) diff --git a/block/blk-mq.c b/block/blk-mq.c index 84d13b5cafd0..ede835c2b08b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2799,6 +2799,8 @@ void blk_mq_enable_hotplug(void) static int __init blk_mq_init(void) { + blk_mq_debugfs_init(); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); diff --git a/block/blk-mq.h b/block/blk-mq.h index 0c7c034d9ddd..6c24b901acd7 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -78,6 +78,39 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); +/* + * debugfs helpers + */ +#ifdef CONFIG_DEBUG_FS +void blk_mq_debugfs_init(void); +int blk_mq_debugfs_register(struct request_queue *q, const char *name); +void blk_mq_debugfs_unregister(struct request_queue *q); +int blk_mq_debugfs_register_hctxs(struct request_queue *q); +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); +#else +static inline void blk_mq_debugfs_init(void) +{ +} + +int blk_mq_debugfs_register(struct request_queue *q, const char *name); +{ + return 0; +} + +void blk_mq_debugfs_unregister(struct request_queue *q) +{ +} + +int blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ + return 0; +} + +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +{ +} +#endif + extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25564857f5f8..0ee283f3cffe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,11 @@ struct request_queue { struct list_head tag_set_list; struct bio_set *bio_split; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dir; + struct dentry *mq_debugfs_dir; +#endif + bool mq_sysfs_init_done; }; -- cgit v1.2.3 From 9abb2ad21e8b9b7a2555411fbee225a508cf962d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:41 -0800 Subject: blk-mq: add hctx->{state,flags} to debugfs hctx->state could come in handy for bugs where the hardware queue gets stuck in the stopped state, and hctx->flags is just useful to know. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 01711bbf5ade..0c14511fa9e0 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -29,7 +29,49 @@ struct blk_mq_debugfs_attr { static struct dentry *block_debugfs_root; +static int hctx_state_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "0x%lx\n", hctx->state); + return 0; +} + +static int hctx_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_state_show, inode->i_private); +} + +static const struct file_operations hctx_state_fops = { + .open = hctx_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_flags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "0x%lx\n", hctx->flags); + return 0; +} + +static int hctx_flags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_flags_show, inode->i_private); +} + +static const struct file_operations hctx_flags_fops = { + .open = hctx_flags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { + {"state", 0400, &hctx_state_fops}, + {"flags", 0400, &hctx_flags_fops}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { -- cgit v1.2.3 From 950cd7e9ffdc44c340b8914126b39cc079f0c844 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:42 -0800 Subject: blk-mq: move hctx->dispatch and ctx->rq_list from sysfs to debugfs These lists are only useful for debugging; they definitely don't belong in sysfs. Putting them in debugfs also removes the limitation of a single page of output. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-mq-sysfs.c | 57 -------------------------- 2 files changed, 106 insertions(+), 57 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 0c14511fa9e0..1967c06d80e0 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -29,6 +29,20 @@ struct blk_mq_debugfs_attr { static struct dentry *block_debugfs_root; +static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, ops); + if (!ret) { + m = file->private_data; + m->private = inode->i_private; + } + return ret; +} + static int hctx_state_show(struct seq_file *m, void *v) { struct blk_mq_hw_ctx *hctx = m->private; @@ -69,12 +83,104 @@ static const struct file_operations hctx_flags_fops = { .release = single_release, }; +static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) +{ + struct request *rq = list_entry_rq(v); + + seq_printf(m, "%p\n", rq); + return 0; +} + +static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + spin_lock(&hctx->lock); + return seq_list_start(&hctx->dispatch, *pos); +} + +static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + return seq_list_next(v, &hctx->dispatch, pos); +} + +static void hctx_dispatch_stop(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + spin_unlock(&hctx->lock); +} + +static const struct seq_operations hctx_dispatch_seq_ops = { + .start = hctx_dispatch_start, + .next = hctx_dispatch_next, + .stop = hctx_dispatch_stop, + .show = blk_mq_debugfs_rq_show, +}; + +static int hctx_dispatch_open(struct inode *inode, struct file *file) +{ + return blk_mq_debugfs_seq_open(inode, file, &hctx_dispatch_seq_ops); +} + +static const struct file_operations hctx_dispatch_fops = { + .open = hctx_dispatch_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) +{ + struct blk_mq_ctx *ctx = m->private; + + spin_lock(&ctx->lock); + return seq_list_start(&ctx->rq_list, *pos); +} + +static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct blk_mq_ctx *ctx = m->private; + + return seq_list_next(v, &ctx->rq_list, pos); +} + +static void ctx_rq_list_stop(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + spin_unlock(&ctx->lock); +} + +static const struct seq_operations ctx_rq_list_seq_ops = { + .start = ctx_rq_list_start, + .next = ctx_rq_list_next, + .stop = ctx_rq_list_stop, + .show = blk_mq_debugfs_rq_show, +}; + +static int ctx_rq_list_open(struct inode *inode, struct file *file) +{ + return blk_mq_debugfs_seq_open(inode, file, &ctx_rq_list_seq_ops); +} + +static const struct file_operations ctx_rq_list_fops = { + .open = ctx_rq_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, &hctx_state_fops}, {"flags", 0400, &hctx_flags_fops}, + {"dispatch", 0400, &hctx_dispatch_fops}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { + {"rq_list", 0400, &ctx_rq_list_fops}, }; int blk_mq_debugfs_register(struct request_queue *q, const char *name) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 91b93ca1c1e0..ee3694d8c4ee 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -139,41 +139,6 @@ static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) ctx->rq_completed[0]); } -static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) -{ - struct request *rq; - int len = snprintf(page, PAGE_SIZE - 1, "%s:\n", msg); - - list_for_each_entry(rq, list, queuelist) { - const int rq_len = 2 * sizeof(rq) + 2; - - /* if the output will be truncated */ - if (PAGE_SIZE - 1 < len + rq_len) { - /* backspacing if it can't hold '\t...\n' */ - if (PAGE_SIZE - 1 < len + 5) - len -= rq_len; - len += snprintf(page + len, PAGE_SIZE - 1 - len, - "\t...\n"); - break; - } - len += snprintf(page + len, PAGE_SIZE - 1 - len, - "\t%p\n", rq); - } - - return len; -} - -static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) -{ - ssize_t ret; - - spin_lock(&ctx->lock); - ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); - spin_unlock(&ctx->lock); - - return ret; -} - static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) { return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n", @@ -219,18 +184,6 @@ static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, return page - start_page; } -static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, - char *page) -{ - ssize_t ret; - - spin_lock(&hctx->lock); - ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); - spin_unlock(&hctx->lock); - - return ret; -} - static ssize_t blk_mq_hw_sysfs_sched_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { if (hctx->sched_tags) @@ -320,16 +273,11 @@ static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { .attr = {.name = "completed", .mode = S_IRUGO }, .show = blk_mq_sysfs_completed_show, }; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { - .attr = {.name = "rq_list", .mode = S_IRUGO }, - .show = blk_mq_sysfs_rq_list_show, -}; static struct attribute *default_ctx_attrs[] = { &blk_mq_sysfs_dispatched.attr, &blk_mq_sysfs_merged.attr, &blk_mq_sysfs_completed.attr, - &blk_mq_sysfs_rq_list.attr, NULL, }; @@ -349,10 +297,6 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { .attr = {.name = "active", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_active_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { - .attr = {.name = "pending", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_rq_list_show, -}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_sched_tags = { .attr = {.name = "sched_tags", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_sched_tags_show, @@ -380,7 +324,6 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, &blk_mq_hw_sysfs_run.attr, &blk_mq_hw_sysfs_dispatched.attr, - &blk_mq_hw_sysfs_pending.attr, &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_sched_tags.attr, &blk_mq_hw_sysfs_cpus.attr, -- cgit v1.2.3 From 7b3938524cfbd7abd1814d879ecfcd2e2a9aa775 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:43 -0800 Subject: blk-mq: add extra request information to debugfs The request pointers by themselves aren't super useful. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 1967c06d80e0..ee947f2f605b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -87,7 +87,9 @@ static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) { struct request *rq = list_entry_rq(v); - seq_printf(m, "%p\n", rq); + seq_printf(m, "%p {.cmd_type=%u, .cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n", + rq, rq->cmd_type, rq->cmd_flags, (unsigned int)rq->rq_flags, + rq->tag, rq->internal_tag); return 0; } -- cgit v1.2.3 From 24af1ccfe12adddbe17d11801e1689791a4cc282 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 14:32:13 -0800 Subject: sbitmap: add helpers for dumping to a seq_file This is useful debugging information that will be used in the blk-mq debugfs directory. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Changed 'weight' to 'busy'. Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 30 ++++++++++++++++ lib/sbitmap.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index f017fd6e69c4..d4e0a204c118 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -258,6 +258,26 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) unsigned int sbitmap_weight(const struct sbitmap *sb); +/** + * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. + * @sb: Bitmap to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The format may change at any time. + */ +void sbitmap_show(struct sbitmap *sb, struct seq_file *m); + +/** + * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct + * seq_file. + * @sb: Bitmap to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The output isn't guaranteed to be internally + * consistent. + */ +void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m); + /** * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific * memory node. @@ -370,4 +390,14 @@ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, */ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); +/** + * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct + * seq_file. + * @sbq: Bitmap queue to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The format may change at any time. + */ +void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); + #endif /* __LINUX_SCALE_BITMAP_H */ diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 8f5c3b268c77..55e11c4b2f3b 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -17,6 +17,7 @@ #include #include +#include int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node) @@ -180,6 +181,62 @@ unsigned int sbitmap_weight(const struct sbitmap *sb) } EXPORT_SYMBOL_GPL(sbitmap_weight); +void sbitmap_show(struct sbitmap *sb, struct seq_file *m) +{ + seq_printf(m, "depth=%u\n", sb->depth); + seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); + seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); + seq_printf(m, "map_nr=%u\n", sb->map_nr); +} +EXPORT_SYMBOL_GPL(sbitmap_show); + +static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte) +{ + if ((offset & 0xf) == 0) { + if (offset != 0) + seq_putc(m, '\n'); + seq_printf(m, "%08x:", offset); + } + if ((offset & 0x1) == 0) + seq_putc(m, ' '); + seq_printf(m, "%02x", byte); +} + +void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) +{ + u8 byte = 0; + unsigned int byte_bits = 0; + unsigned int offset = 0; + int i; + + for (i = 0; i < sb->map_nr; i++) { + unsigned long word = READ_ONCE(sb->map[i].word); + unsigned int word_bits = READ_ONCE(sb->map[i].depth); + + while (word_bits > 0) { + unsigned int bits = min(8 - byte_bits, word_bits); + + byte |= (word & (BIT(bits) - 1)) << byte_bits; + byte_bits += bits; + if (byte_bits == 8) { + emit_byte(m, offset, byte); + byte = 0; + byte_bits = 0; + offset++; + } + word >>= bits; + word_bits -= bits; + } + } + if (byte_bits) { + emit_byte(m, offset, byte); + offset++; + } + if (offset) + seq_putc(m, '\n'); +} +EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); + static unsigned int sbq_calc_wake_batch(unsigned int depth) { unsigned int wake_batch; @@ -377,3 +434,37 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) } } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); + +void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) +{ + bool first; + int i; + + sbitmap_show(&sbq->sb, m); + + seq_puts(m, "alloc_hint={"); + first = true; + for_each_possible_cpu(i) { + if (!first) + seq_puts(m, ", "); + first = false; + seq_printf(m, "%u", *per_cpu_ptr(sbq->alloc_hint, i)); + } + seq_puts(m, "}\n"); + + seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); + seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); + + seq_puts(m, "ws={\n"); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[i]; + + seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n", + atomic_read(&ws->wait_cnt), + waitqueue_active(&ws->wait) ? "active" : "inactive"); + } + seq_puts(m, "}\n"); + + seq_printf(m, "round_robin=%d\n", sbq->round_robin); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_show); -- cgit v1.2.3 From 0bfa5288a78176be8a895ce65dbbf61f63f54b96 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:45 -0800 Subject: blk-mq: export software queue pending map to debugfs This is useful for debugging problems where we've gotten stuck with requests in the software queues. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index ee947f2f605b..857bdadc6b1f 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -134,6 +134,26 @@ static const struct file_operations hctx_dispatch_fops = { .release = seq_release, }; +static int hctx_ctx_map_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + sbitmap_bitmap_show(&hctx->ctx_map, m); + return 0; +} + +static int hctx_ctx_map_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_ctx_map_show, inode->i_private); +} + +static const struct file_operations hctx_ctx_map_fops = { + .open = hctx_ctx_map_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) { struct blk_mq_ctx *ctx = m->private; @@ -179,6 +199,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, &hctx_state_fops}, {"flags", 0400, &hctx_flags_fops}, {"dispatch", 0400, &hctx_dispatch_fops}, + {"ctx_map", 0400, &hctx_ctx_map_fops}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { -- cgit v1.2.3 From d96b37c0af3e4f42928a1361d5cd9f4f8921b4a8 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:46 -0800 Subject: blk-mq: move tags and sched_tags info from sysfs to debugfs These are very tied to the blk-mq tag implementation, so exposing them to sysfs isn't a great idea. Move the debugging information to debugfs and add basic entries for the number of tags and the number of reserved tags to sysfs. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-mq-sysfs.c | 33 ++++++++++++------------ block/blk-mq-tag.c | 27 ------------------- block/blk-mq-tag.h | 1 - 4 files changed, 86 insertions(+), 45 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 857bdadc6b1f..9f811007cf4f 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -20,6 +20,7 @@ #include #include "blk-mq.h" +#include "blk-mq-tag.h" struct blk_mq_debugfs_attr { const char *name; @@ -154,6 +155,73 @@ static const struct file_operations hctx_ctx_map_fops = { .release = single_release, }; +static void blk_mq_debugfs_tags_show(struct seq_file *m, + struct blk_mq_tags *tags) +{ + seq_printf(m, "nr_tags=%u\n", tags->nr_tags); + seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); + seq_printf(m, "active_queues=%d\n", + atomic_read(&tags->active_queues)); + + seq_puts(m, "\nbitmap_tags:\n"); + sbitmap_queue_show(&tags->bitmap_tags, m); + + if (tags->nr_reserved_tags) { + seq_puts(m, "\nbreserved_tags:\n"); + sbitmap_queue_show(&tags->breserved_tags, m); + } +} + +static int hctx_tags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + + mutex_lock(&q->sysfs_lock); + if (hctx->tags) + blk_mq_debugfs_tags_show(m, hctx->tags); + mutex_unlock(&q->sysfs_lock); + + return 0; +} + +static int hctx_tags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_tags_show, inode->i_private); +} + +static const struct file_operations hctx_tags_fops = { + .open = hctx_tags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_sched_tags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + + mutex_lock(&q->sysfs_lock); + if (hctx->sched_tags) + blk_mq_debugfs_tags_show(m, hctx->sched_tags); + mutex_unlock(&q->sysfs_lock); + + return 0; +} + +static int hctx_sched_tags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_sched_tags_show, inode->i_private); +} + +static const struct file_operations hctx_sched_tags_fops = { + .open = hctx_sched_tags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) { struct blk_mq_ctx *ctx = m->private; @@ -200,6 +268,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"flags", 0400, &hctx_flags_fops}, {"dispatch", 0400, &hctx_dispatch_fops}, {"ctx_map", 0400, &hctx_ctx_map_fops}, + {"tags", 0400, &hctx_tags_fops}, + {"sched_tags", 0400, &hctx_sched_tags_fops}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index ee3694d8c4ee..a0ae1f536ed0 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -184,17 +184,16 @@ static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, return page - start_page; } -static ssize_t blk_mq_hw_sysfs_sched_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, + char *page) { - if (hctx->sched_tags) - return blk_mq_tag_sysfs_show(hctx->sched_tags, page); - - return 0; + return sprintf(page, "%u\n", hctx->tags->nr_tags); } -static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, + char *page) { - return blk_mq_tag_sysfs_show(hctx->tags, page); + return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags); } static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) @@ -293,18 +292,18 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_dispatched_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { + .attr = {.name = "nr_tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_nr_tags_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { + .attr = {.name = "nr_reserved_tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_nr_reserved_tags_show, +}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { .attr = {.name = "active", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_active_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_sched_tags = { - .attr = {.name = "sched_tags", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_sched_tags_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { - .attr = {.name = "tags", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_tags_show, -}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, @@ -324,8 +323,8 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, &blk_mq_hw_sysfs_run.attr, &blk_mq_hw_sysfs_dispatched.attr, - &blk_mq_hw_sysfs_tags.attr, - &blk_mq_hw_sysfs_sched_tags.attr, + &blk_mq_hw_sysfs_nr_tags.attr, + &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, &blk_mq_hw_sysfs_poll.attr, diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 1b156ca79af6..f8de2dbbb29f 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -329,11 +329,6 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, } -static unsigned int bt_unused_tags(const struct sbitmap_queue *bt) -{ - return bt->sb.depth - sbitmap_weight(&bt->sb); -} - static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, bool round_robin, int node) { @@ -469,25 +464,3 @@ u32 blk_mq_unique_tag(struct request *rq) (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag); - -ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) -{ - char *orig_page = page; - unsigned int free, res; - - if (!tags) - return 0; - - page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " - "bits_per_word=%u\n", - tags->nr_tags, tags->nr_reserved_tags, - 1U << tags->bitmap_tags.sb.shift); - - free = bt_unused_tags(&tags->bitmap_tags); - res = bt_unused_tags(&tags->breserved_tags); - - page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); - page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); - - return page - orig_page; -} diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index ac22878462e7..63497423c5cd 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -28,7 +28,6 @@ extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); -extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); -- cgit v1.2.3 From d7e3621ad1e48f08dc14c34e353af69f7c35cb20 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:47 -0800 Subject: blk-mq: add tags and sched_tags bitmaps to debugfs These can be used to debug issues like tag leaks and stuck requests. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 9f811007cf4f..9b87b15619f1 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -197,6 +197,30 @@ static const struct file_operations hctx_tags_fops = { .release = single_release, }; +static int hctx_tags_bitmap_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + + mutex_lock(&q->sysfs_lock); + if (hctx->tags) + sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); + mutex_unlock(&q->sysfs_lock); + return 0; +} + +static int hctx_tags_bitmap_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_tags_bitmap_show, inode->i_private); +} + +static const struct file_operations hctx_tags_bitmap_fops = { + .open = hctx_tags_bitmap_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int hctx_sched_tags_show(struct seq_file *m, void *v) { struct blk_mq_hw_ctx *hctx = m->private; @@ -222,6 +246,30 @@ static const struct file_operations hctx_sched_tags_fops = { .release = single_release, }; +static int hctx_sched_tags_bitmap_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + + mutex_lock(&q->sysfs_lock); + if (hctx->sched_tags) + sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); + mutex_unlock(&q->sysfs_lock); + return 0; +} + +static int hctx_sched_tags_bitmap_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_sched_tags_bitmap_show, inode->i_private); +} + +static const struct file_operations hctx_sched_tags_bitmap_fops = { + .open = hctx_sched_tags_bitmap_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) { struct blk_mq_ctx *ctx = m->private; @@ -269,7 +317,9 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"dispatch", 0400, &hctx_dispatch_fops}, {"ctx_map", 0400, &hctx_ctx_map_fops}, {"tags", 0400, &hctx_tags_fops}, + {"tags_bitmap", 0400, &hctx_tags_bitmap_fops}, {"sched_tags", 0400, &hctx_sched_tags_fops}, + {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { -- cgit v1.2.3 From be21547318b2c7d9988237b106cc63767b86eae4 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:48 -0800 Subject: blk-mq: move hctx io_poll, stats, and dispatched from sysfs to debugfs These statistics _might_ be useful to userspace, but it's better not to commit to an ABI for these yet. Also, the dispatched file in sysfs couldn't be cleared, so make it clearable like the others in debugfs. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-mq-sysfs.c | 92 ---------------------------------- 2 files changed, 132 insertions(+), 92 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 9b87b15619f1..28ca56dd2c4e 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -270,6 +270,135 @@ static const struct file_operations hctx_sched_tags_bitmap_fops = { .release = single_release, }; +static int hctx_io_poll_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "considered=%lu\n", hctx->poll_considered); + seq_printf(m, "invoked=%lu\n", hctx->poll_invoked); + seq_printf(m, "success=%lu\n", hctx->poll_success); + return 0; +} + +static int hctx_io_poll_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_io_poll_show, inode->i_private); +} + +static ssize_t hctx_io_poll_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; + return count; +} + +static const struct file_operations hctx_io_poll_fops = { + .open = hctx_io_poll_open, + .read = seq_read, + .write = hctx_io_poll_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) +{ + seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", + stat->nr_samples, stat->mean, stat->min, stat->max); +} + +static int hctx_stats_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct blk_rq_stat stat[2]; + + blk_stat_init(&stat[BLK_STAT_READ]); + blk_stat_init(&stat[BLK_STAT_WRITE]); + + blk_hctx_stat_get(hctx, stat); + + seq_puts(m, "read: "); + print_stat(m, &stat[BLK_STAT_READ]); + seq_puts(m, "\n"); + + seq_puts(m, "write: "); + print_stat(m, &stat[BLK_STAT_WRITE]); + seq_puts(m, "\n"); + return 0; +} + +static int hctx_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_stats_show, inode->i_private); +} + +static ssize_t hctx_stats_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + struct blk_mq_ctx *ctx; + int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[BLK_STAT_READ]); + blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); + } + return count; +} + +static const struct file_operations hctx_stats_fops = { + .open = hctx_stats_open, + .read = seq_read, + .write = hctx_stats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_dispatched_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + int i; + + seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]); + + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { + unsigned int d = 1U << (i - 1); + + seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]); + } + + seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]); + return 0; +} + +static int hctx_dispatched_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_dispatched_show, inode->i_private); +} + +static ssize_t hctx_dispatched_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + int i; + + for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) + hctx->dispatched[i] = 0; + return count; +} + +static const struct file_operations hctx_dispatched_fops = { + .open = hctx_dispatched_open, + .read = seq_read, + .write = hctx_dispatched_write, + .llseek = seq_lseek, + .release = single_release, +}; + static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) { struct blk_mq_ctx *ctx = m->private; @@ -320,6 +449,9 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"tags_bitmap", 0400, &hctx_tags_bitmap_fops}, {"sched_tags", 0400, &hctx_sched_tags_fops}, {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, + {"io_poll", 0600, &hctx_io_poll_fops}, + {"stats", 0600, &hctx_stats_fops}, + {"dispatched", 0600, &hctx_dispatched_fops}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index a0ae1f536ed0..f505da2c542e 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -139,21 +139,6 @@ static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) ctx->rq_completed[0]); } -static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n", - hctx->poll_considered, hctx->poll_invoked, - hctx->poll_success); -} - -static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx, - const char *page, size_t size) -{ - hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; - - return size; -} - static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -165,25 +150,6 @@ static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) return sprintf(page, "%lu\n", hctx->run); } -static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, - char *page) -{ - char *start_page = page; - int i; - - page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { - unsigned int d = 1U << (i - 1); - - page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]); - } - - page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1), - hctx->dispatched[i]); - return page - start_page; -} - static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -219,47 +185,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } -static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx) -{ - struct blk_mq_ctx *ctx; - unsigned int i; - - hctx_for_each_ctx(hctx, ctx, i) { - blk_stat_init(&ctx->stat[BLK_STAT_READ]); - blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); - } -} - -static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx, - const char *page, size_t count) -{ - blk_mq_stat_clear(hctx); - return count; -} - -static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) -{ - return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", - pre, (long long) stat->nr_samples, - (long long) stat->mean, (long long) stat->min, - (long long) stat->max); -} - -static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - struct blk_rq_stat stat[2]; - ssize_t ret; - - blk_stat_init(&stat[BLK_STAT_READ]); - blk_stat_init(&stat[BLK_STAT_WRITE]); - - blk_hctx_stat_get(hctx, stat); - - ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); - ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); - return ret; -} - static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_sysfs_dispatched_show, @@ -288,10 +213,6 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { .attr = {.name = "run", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_run_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { - .attr = {.name = "dispatched", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_dispatched_show, -}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { .attr = {.name = "nr_tags", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_nr_tags_show, @@ -308,27 +229,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { - .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO }, - .show = blk_mq_hw_sysfs_poll_show, - .store = blk_mq_hw_sysfs_poll_store, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = { - .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR }, - .show = blk_mq_hw_sysfs_stat_show, - .store = blk_mq_hw_sysfs_stat_store, -}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, &blk_mq_hw_sysfs_run.attr, - &blk_mq_hw_sysfs_dispatched.attr, &blk_mq_hw_sysfs_nr_tags.attr, &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, - &blk_mq_hw_sysfs_poll.attr, - &blk_mq_hw_sysfs_stat.attr, NULL, }; -- cgit v1.2.3 From 4a46f05ebf9921dcba35770106e42574d323d6dd Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:49 -0800 Subject: blk-mq: move hctx and ctx counters from sysfs to debugfs These counters aren't as out-of-place in sysfs as the other stuff, but debugfs is a slightly better home for them. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 181 +++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-mq-sysfs.c | 64 ----------------- 2 files changed, 181 insertions(+), 64 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 28ca56dd2c4e..5cd2b435a9f5 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -399,6 +399,88 @@ static const struct file_operations hctx_dispatched_fops = { .release = single_release, }; +static int hctx_queued_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%lu\n", hctx->queued); + return 0; +} + +static int hctx_queued_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_queued_show, inode->i_private); +} + +static ssize_t hctx_queued_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->queued = 0; + return count; +} + +static const struct file_operations hctx_queued_fops = { + .open = hctx_queued_open, + .read = seq_read, + .write = hctx_queued_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_run_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%lu\n", hctx->run); + return 0; +} + +static int hctx_run_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_run_show, inode->i_private); +} + +static ssize_t hctx_run_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->run = 0; + return count; +} + +static const struct file_operations hctx_run_fops = { + .open = hctx_run_open, + .read = seq_read, + .write = hctx_run_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_active_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); + return 0; +} + +static int hctx_active_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_active_show, inode->i_private); +} + +static const struct file_operations hctx_active_fops = { + .open = hctx_active_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) { struct blk_mq_ctx *ctx = m->private; @@ -440,6 +522,99 @@ static const struct file_operations ctx_rq_list_fops = { .release = seq_release, }; +static int ctx_dispatched_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]); + return 0; +} + +static int ctx_dispatched_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_dispatched_show, inode->i_private); +} + +static ssize_t ctx_dispatched_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx = m->private; + + ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0; + return count; +} + +static const struct file_operations ctx_dispatched_fops = { + .open = ctx_dispatched_open, + .read = seq_read, + .write = ctx_dispatched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ctx_merged_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu\n", ctx->rq_merged); + return 0; +} + +static int ctx_merged_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_merged_show, inode->i_private); +} + +static ssize_t ctx_merged_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx = m->private; + + ctx->rq_merged = 0; + return count; +} + +static const struct file_operations ctx_merged_fops = { + .open = ctx_merged_open, + .read = seq_read, + .write = ctx_merged_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ctx_completed_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]); + return 0; +} + +static int ctx_completed_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_completed_show, inode->i_private); +} + +static ssize_t ctx_completed_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx = m->private; + + ctx->rq_completed[0] = ctx->rq_completed[1] = 0; + return count; +} + +static const struct file_operations ctx_completed_fops = { + .open = ctx_completed_open, + .read = seq_read, + .write = ctx_completed_write, + .llseek = seq_lseek, + .release = single_release, +}; + static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, &hctx_state_fops}, {"flags", 0400, &hctx_flags_fops}, @@ -452,10 +627,16 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"io_poll", 0600, &hctx_io_poll_fops}, {"stats", 0600, &hctx_stats_fops}, {"dispatched", 0600, &hctx_dispatched_fops}, + {"queued", 0600, &hctx_queued_fops}, + {"run", 0600, &hctx_run_fops}, + {"active", 0400, &hctx_active_fops}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {"rq_list", 0400, &ctx_rq_list_fops}, + {"dispatched", 0600, &ctx_dispatched_fops}, + {"merged", 0600, &ctx_merged_fops}, + {"completed", 0600, &ctx_completed_fops}, }; int blk_mq_debugfs_register(struct request_queue *q, const char *name) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index f505da2c542e..308b3f4fc310 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -122,34 +122,6 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, return res; } -static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], - ctx->rq_dispatched[0]); -} - -static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu\n", ctx->rq_merged); -} - -static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], - ctx->rq_completed[0]); -} - -static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, - char *page) -{ - return sprintf(page, "%lu\n", hctx->queued); -} - -static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return sprintf(page, "%lu\n", hctx->run); -} - static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -162,11 +134,6 @@ static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags); } -static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); -} - static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { unsigned int i, first = 1; @@ -185,34 +152,10 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { - .attr = {.name = "dispatched", .mode = S_IRUGO }, - .show = blk_mq_sysfs_dispatched_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { - .attr = {.name = "merged", .mode = S_IRUGO }, - .show = blk_mq_sysfs_merged_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { - .attr = {.name = "completed", .mode = S_IRUGO }, - .show = blk_mq_sysfs_completed_show, -}; - static struct attribute *default_ctx_attrs[] = { - &blk_mq_sysfs_dispatched.attr, - &blk_mq_sysfs_merged.attr, - &blk_mq_sysfs_completed.attr, NULL, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { - .attr = {.name = "queued", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_queued_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { - .attr = {.name = "run", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_run_show, -}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { .attr = {.name = "nr_tags", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_nr_tags_show, @@ -221,22 +164,15 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { .attr = {.name = "nr_reserved_tags", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_nr_reserved_tags_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { - .attr = {.name = "active", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_active_show, -}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, }; static struct attribute *default_hw_ctx_attrs[] = { - &blk_mq_hw_sysfs_queued.attr, - &blk_mq_hw_sysfs_run.attr, &blk_mq_hw_sysfs_nr_tags.attr, &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, - &blk_mq_hw_sysfs_active.attr, NULL, }; -- cgit v1.2.3 From 0abad774124351ba211b9053786ebd5a5a722d53 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 12:28:10 -0700 Subject: blk-mq: improve scheduler queue sync/async running We'll use the same criteria for whether we need to run the queue sync or async when we have a scheduler, as we do without one. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval Tested-by: Hannes Reinecke --- block/blk-mq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ede835c2b08b..1f948d5c2715 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1476,7 +1476,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (q->elevator) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, true); + blk_mq_sched_insert_request(rq, false, true, + !is_sync || is_flush_fua); goto done; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1585,7 +1586,8 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) if (q->elevator) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, true); + blk_mq_sched_insert_request(rq, false, true, + !is_sync || is_flush_fua); goto done; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { -- cgit v1.2.3 From 3c782d67c16889d753a79a8da5422c583d7cfd51 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 12:50:36 -0700 Subject: blk-mq: fix potential race in queue restart and driver tag allocation Once we mark the queue as needing a restart, re-check if we can get a driver tag. This fixes a theoretical issue where the needed IO completes _after_ blk_mq_get_driver_tag() fails, but before we manage to set the restart bit. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval Tested-by: Hannes Reinecke --- block/blk-mq.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1f948d5c2715..fd80101c7591 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -928,8 +928,16 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) if (!blk_mq_get_driver_tag(rq, &hctx, false)) { if (!queued && reorder_tags_to_front(list)) continue; + + /* + * We failed getting a driver tag. Mark the queue(s) + * as needing a restart. Retry getting a tag again, + * in case the needed IO completed right before we + * marked the queue as needing a restart. + */ blk_mq_sched_mark_restart(hctx); - break; + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + break; } list_del_init(&rq->queuelist); -- cgit v1.2.3 From 99cf1dc580f0766825395aae4f60ec1d8438f011 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 12:32:32 -0700 Subject: blk-mq: release driver tag on a requeue event We don't want to hold on to this resource when we have a scheduler attached. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval Tested-by: Hannes Reinecke --- block/blk-mq.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index fd80101c7591..711883384585 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -879,6 +879,21 @@ done: return false; } +static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = -1; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} + /* * If we fail getting a driver tag because all the driver tags are already * assigned and on the dispatch list, BUT the first entry does not have a @@ -951,6 +966,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) queued++; break; case BLK_MQ_RQ_QUEUE_BUSY: + blk_mq_put_driver_tag(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; -- cgit v1.2.3 From 50e1dab86aa2c10cbca2f754aae9542169403141 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 14:42:34 -0700 Subject: blk-mq-sched: fix starvation for multiple hardware queues and shared tags If we have both multiple hardware queues and shared tag map between devices, we need to ensure that we propagate the hardware queue restart bit higher up. This is because we can get into a situation where we don't have any IO pending on a hardware queue, yet we fail getting a tag to start new IO. If that happens, it's not enough to mark the hardware queue as needing a restart, we need to bubble that up to the higher level queue as well. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval Tested-by: Hannes Reinecke --- block/blk-mq-sched.c | 28 ++++++++++++++++++++++++++++ block/blk-mq-sched.h | 15 +++++++++------ block/blk-mq.c | 3 ++- block/blk-mq.h | 1 + include/linux/blkdev.h | 1 + 5 files changed, 41 insertions(+), 7 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4cee060a292d..fcc0e893d687 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -301,6 +301,34 @@ bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq) } EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert); +static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +{ + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (blk_mq_hctx_has_pending(hctx)) + blk_mq_run_hw_queue(hctx, true); + } +} + +void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) +{ + unsigned int i; + + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + blk_mq_sched_restart_hctx(hctx); + else { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) + return; + + clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags); + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_sched_restart_hctx(hctx); + } +} + static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 6b465bc7014c..becbc7840364 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -19,6 +19,7 @@ bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, @@ -123,11 +124,6 @@ blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq) BUG_ON(rq->internal_tag == -1); blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag); - - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - blk_mq_run_hw_queue(hctx, true); - } } static inline void blk_mq_sched_started_request(struct request *rq) @@ -160,8 +156,15 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) + set_bit(QUEUE_FLAG_RESTART, &q->queue_flags); + } + } } static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq.c b/block/blk-mq.c index 711883384585..21795c6575bc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -40,7 +40,7 @@ static LIST_HEAD(all_q_list); /* * Check if any of the ctx's have pending work in this hardware queue */ -static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { return sbitmap_any_bit_set(&hctx->ctx_map) || !list_empty_careful(&hctx->dispatch) || @@ -345,6 +345,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) blk_mq_sched_completed_request(hctx, rq); + blk_mq_sched_restart_queues(hctx); blk_queue_exit(q); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 6c24b901acd7..077a4003f1fd 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -33,6 +33,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); /* * Internal helpers for allocating/freeing the request map diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0ee283f3cffe..883b8abe4305 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -607,6 +607,7 @@ struct request_queue { #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ +#define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From c13660a08c8b3bb49def4374bfd414aaaa564662 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 12:40:07 -0700 Subject: blk-mq-sched: change ->dispatch_requests() to ->dispatch_request() When we invoke dispatch_requests(), the scheduler empties everything into the passed in list. This isn't always a good thing, since it means that we remove items that we could have potentially merged with. Change the function to dispatch single requests at the time. If we do that, we can backoff exactly at the point where the device can't consume more IO, and leave the rest with the scheduler for better merging and future dispatch decision making. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval Tested-by: Hannes Reinecke --- block/blk-mq-sched.c | 23 +++++++++++++++-------- block/blk-mq.c | 2 +- block/mq-deadline.c | 10 ++++++---- include/linux/elevator.h | 2 +- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index fcc0e893d687..c27613de80c5 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -201,15 +201,22 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. */ - if (list_empty(&rq_list)) { - if (e && e->type->ops.mq.dispatch_requests) - e->type->ops.mq.dispatch_requests(hctx, &rq_list); - else - blk_mq_flush_busy_ctxs(hctx, &rq_list); - } else + if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart(hctx); - - blk_mq_dispatch_rq_list(hctx, &rq_list); + blk_mq_dispatch_rq_list(hctx, &rq_list); + } else if (!e || !e->type->ops.mq.dispatch_request) { + blk_mq_flush_busy_ctxs(hctx, &rq_list); + blk_mq_dispatch_rq_list(hctx, &rq_list); + } else { + do { + struct request *rq; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) + break; + list_add(&rq->queuelist, &rq_list); + } while (blk_mq_dispatch_rq_list(hctx, &rq_list)); + } } void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-mq.c b/block/blk-mq.c index 21795c6575bc..301ae29fd229 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -998,7 +998,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) */ if (!list_empty(list)) { spin_lock(&hctx->lock); - list_splice(list, &hctx->dispatch); + list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* diff --git a/block/mq-deadline.c b/block/mq-deadline.c index a01986d7b6fb..d93ec713fa62 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -287,14 +287,16 @@ done: return rq; } -static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *rq_list) +static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; + struct request *rq; spin_lock(&dd->lock); - blk_mq_sched_move_to_dispatch(hctx, rq_list, __dd_dispatch_request); + rq = __dd_dispatch_request(hctx); spin_unlock(&dd->lock); + + return rq; } static void dd_exit_queue(struct elevator_queue *e) @@ -517,7 +519,7 @@ static struct elv_fs_entry deadline_attrs[] = { static struct elevator_type mq_deadline = { .ops.mq = { .insert_requests = dd_insert_requests, - .dispatch_requests = dd_dispatch_requests, + .dispatch_request = dd_dispatch_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ecb96fd67c6d..b5825c4f06f7 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -92,7 +92,7 @@ struct elevator_mq_ops { struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); void (*put_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); - void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *); + struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); void (*completed_request)(struct blk_mq_hw_ctx *, struct request *); void (*started_request)(struct request *); -- cgit v1.2.3 From f73f44eb00cb136990cfb7d40e436c13d7669ec8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Jan 2017 08:30:47 -0700 Subject: block: add a op_is_flush helper This centralizes the checks for bios that needs to be go into the flush state machine. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++---- block/blk-mq-sched.c | 5 ++--- block/blk-mq.c | 4 ++-- drivers/md/bcache/request.c | 2 +- drivers/md/dm-cache-target.c | 13 +++---------- drivers/md/dm-thin.c | 13 +++++-------- include/linux/blk_types.h | 9 +++++++++ 7 files changed, 26 insertions(+), 28 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index a61f1407f4f6..b830e14117dd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1035,7 +1035,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio) * Flush requests do not use the elevator so skip initialization. * This allows a request to share the flush and elevator data. */ - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) + if (op_is_flush(bio->bi_opf)) return false; return true; @@ -1641,7 +1641,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) { + if (op_is_flush(bio->bi_opf)) { spin_lock_irq(q->queue_lock); where = ELEVATOR_INSERT_FLUSH; goto get_rq; @@ -2145,7 +2145,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) */ BUG_ON(blk_queued_rq(rq)); - if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) + if (op_is_flush(rq->cmd_flags)) where = ELEVATOR_INSERT_FLUSH; add_acct_request(q, rq, where); @@ -3256,7 +3256,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) /* * rq is already accounted, so use raw insert */ - if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) + if (op_is_flush(rq->cmd_flags)) __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); else __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c27613de80c5..4139b07ab33b 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -111,7 +111,6 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct request *rq; - const bool is_flush = op & (REQ_PREFLUSH | REQ_FUA); blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); @@ -126,7 +125,7 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, * Flush requests are special and go directly to the * dispatch list. */ - if (!is_flush && e->type->ops.mq.get_request) { + if (!op_is_flush(op) && e->type->ops.mq.get_request) { rq = e->type->ops.mq.get_request(q, op, data); if (rq) rq->rq_flags |= RQF_QUEUED; @@ -139,7 +138,7 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, } if (rq) { - if (!is_flush) { + if (!op_is_flush(op)) { rq->elv.icq = NULL; if (e && e->type->icq_cache) blk_mq_sched_assign_ioc(q, rq, bio); diff --git a/block/blk-mq.c b/block/blk-mq.c index 301ae29fd229..da2123dd681e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1406,7 +1406,7 @@ insert: static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); + const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; unsigned int request_count = 0, srcu_idx; @@ -1527,7 +1527,7 @@ done: static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); + const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_plug *plug; unsigned int request_count = 0; struct blk_mq_alloc_data data = { .flags = 0 }; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 76d20875503c..01035e718c1c 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -666,7 +666,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.write_prio = 0; s->iop.error = 0; s->iop.flags = 0; - s->iop.flush_journal = (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0; + s->iop.flush_journal = op_is_flush(bio->bi_opf); s->iop.wq = bcache_wq; return s; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index e04c61e0839e..5b9cf56de8ef 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -787,8 +787,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); spin_lock_irqsave(&cache->lock, flags); - if (cache->need_tick_bio && - !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) && + if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && bio_op(bio) != REQ_OP_DISCARD) { pb->tick = true; cache->need_tick_bio = false; @@ -828,11 +827,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) return to_oblock(block_nr); } -static int bio_triggers_commit(struct cache *cache, struct bio *bio) -{ - return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); -} - /* * You must increment the deferred set whilst the prison cell is held. To * encourage this, we ask for 'cell' to be passed in. @@ -884,7 +878,7 @@ static void issue(struct cache *cache, struct bio *bio) { unsigned long flags; - if (!bio_triggers_commit(cache, bio)) { + if (!op_is_flush(bio->bi_opf)) { accounted_request(cache, bio); return; } @@ -1069,8 +1063,7 @@ static void dec_io_migrations(struct cache *cache) static bool discard_or_flush(struct bio *bio) { - return bio_op(bio) == REQ_OP_DISCARD || - bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); + return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); } static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index d1c05c12a9db..110982db4b48 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -699,7 +699,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio) static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) { - return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && + return op_is_flush(bio->bi_opf) && dm_thin_changed_this_transaction(tc->td); } @@ -870,8 +870,7 @@ static void __inc_remap_and_issue_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD) + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { inc_all_io_entry(info->tc->pool, bio); @@ -1716,9 +1715,8 @@ static void __remap_and_issue_shared_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if ((bio_data_dir(bio) == WRITE) || - (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD)) + if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) || + bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; @@ -2635,8 +2633,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD) { + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) { thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0e5b1cd5113c..37c9a43c5e78 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -220,6 +220,15 @@ static inline bool op_is_write(unsigned int op) return (op & 1); } +/* + * Check if the bio or request is one that needs special treatment in the + * flush state machine. + */ +static inline bool op_is_flush(unsigned int op) +{ + return op & (REQ_FUA | REQ_PREFLUSH); +} + /* * Reads are always treated as synchronous, as are requests with the FUA or * PREFLUSH flag. Other operations may be marked as synchronous using the -- cgit v1.2.3 From bd6737f1ae92e2f1c6e8362efe96dbe7f18fa07d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Jan 2017 01:00:47 -0700 Subject: blk-mq-sched: add flush insertion into blk_mq_sched_insert_request() Instead of letting the caller check this and handle the details of inserting a flush request, put the logic in the scheduler insertion function. This fixes direct flush insertion outside of the usual make_request_fn calls, like from dm via blk_insert_cloned_request(). Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-exec.c | 2 +- block/blk-flush.c | 2 +- block/blk-mq-sched.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-mq-sched.h | 45 ++++++---------------------------------- block/blk-mq-tag.c | 8 +++++++- block/blk-mq.c | 25 +++++++++++----------- block/blk-mq.h | 2 ++ 8 files changed, 89 insertions(+), 55 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index b830e14117dd..4bfd8674afd0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2129,7 +2129,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); - blk_mq_sched_insert_request(rq, false, true, false); + blk_mq_sched_insert_request(rq, false, true, false, false); return 0; } diff --git a/block/blk-exec.c b/block/blk-exec.c index 86656fdfa637..ed1f10165268 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -66,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be reused after dying flag is set */ if (q->mq_ops) { - blk_mq_sched_insert_request(rq, at_head, true, false); + blk_mq_sched_insert_request(rq, at_head, true, false, false); return; } diff --git a/block/blk-flush.c b/block/blk-flush.c index d7de34ee39c2..4427896641ac 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -456,7 +456,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) - blk_mq_sched_insert_request(rq, false, true, false); + blk_mq_sched_insert_request(rq, false, true, false, false); else list_add_tail(&rq->queuelist, &q->queue_head); return; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4139b07ab33b..1112752f888d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -335,6 +335,64 @@ void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) } } +/* + * Add flush/fua to the queue. If we fail getting a driver tag, then + * punt to the requeue list. Requeue will re-invoke us from a context + * that's safe to block from. + */ +static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool can_block) +{ + if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { + blk_insert_flush(rq); + blk_mq_run_hw_queue(hctx, true); + } else + blk_mq_add_to_requeue_list(rq, true, true); +} + +void blk_mq_sched_insert_request(struct request *rq, bool at_head, + bool run_queue, bool async, bool can_block) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + + if (rq->tag == -1 && (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))) { + blk_mq_sched_insert_flush(hctx, rq, can_block); + return; + } + + if (e && e->type->ops.mq.insert_requests) { + LIST_HEAD(list); + + list_add(&rq->queuelist, &list); + e->type->ops.mq.insert_requests(hctx, &list, at_head); + } else { + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, at_head); + spin_unlock(&ctx->lock); + } + + if (run_queue) + blk_mq_run_hw_queue(hctx, async); +} + +void blk_mq_sched_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async) +{ + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.insert_requests) + e->type->ops.mq.insert_requests(hctx, list, false); + else + blk_mq_insert_requests(hctx, ctx, list); + + blk_mq_run_hw_queue(hctx, run_queue_async); +} + static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index becbc7840364..9478aaeb48c5 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -21,6 +21,12 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_insert_request(struct request *rq, bool at_head, + bool run_queue, bool async, bool can_block); +void blk_mq_sched_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async); + void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, struct list_head *rq_list, @@ -62,45 +68,6 @@ static inline void blk_mq_sched_put_rq_priv(struct request_queue *q, e->type->ops.mq.put_rq_priv(q, rq); } -static inline void -blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, - bool async) -{ - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - - if (e && e->type->ops.mq.insert_requests) { - LIST_HEAD(list); - - list_add(&rq->queuelist, &list); - e->type->ops.mq.insert_requests(hctx, &list, at_head); - } else { - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, at_head); - spin_unlock(&ctx->lock); - } - - if (run_queue) - blk_mq_run_hw_queue(hctx, async); -} - -static inline void -blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, - struct list_head *list, bool run_queue_async) -{ - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - struct elevator_queue *e = hctx->queue->elevator; - - if (e && e->type->ops.mq.insert_requests) - e->type->ops.mq.insert_requests(hctx, list, false); - else - blk_mq_insert_requests(hctx, ctx, list); - - blk_mq_run_hw_queue(hctx, run_queue_async); -} - static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f8de2dbbb29f..54c84363c1b2 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -106,6 +106,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) struct sbq_wait_state *ws; DEFINE_WAIT(wait); unsigned int tag_offset; + bool drop_ctx; int tag; if (data->flags & BLK_MQ_REQ_RESERVED) { @@ -128,6 +129,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) return BLK_MQ_TAG_FAIL; ws = bt_wait_ptr(bt, data->hctx); + drop_ctx = data->ctx == NULL; do { prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); @@ -150,7 +152,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break; - blk_mq_put_ctx(data->ctx); + if (data->ctx) + blk_mq_put_ctx(data->ctx); io_schedule(); @@ -166,6 +169,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ws = bt_wait_ptr(bt, data->hctx); } while (1); + if (drop_ctx && data->ctx) + blk_mq_put_ctx(data->ctx); + finish_wait(&ws->wait, &wait); found_tag: diff --git a/block/blk-mq.c b/block/blk-mq.c index da2123dd681e..60dac10228fe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -568,13 +568,13 @@ static void blk_mq_requeue_work(struct work_struct *work) rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, true, false, false); + blk_mq_sched_insert_request(rq, true, false, false, true); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, false, false, false); + blk_mq_sched_insert_request(rq, false, false, false, true); } blk_mq_run_hw_queues(q, false); @@ -847,12 +847,11 @@ static inline unsigned int queued_to_index(unsigned int queued) return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } -static bool blk_mq_get_driver_tag(struct request *rq, - struct blk_mq_hw_ctx **hctx, bool wait) +bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, + bool wait) { struct blk_mq_alloc_data data = { .q = rq->q, - .ctx = rq->mq_ctx, .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, }; @@ -1395,7 +1394,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) } insert: - blk_mq_sched_insert_request(rq, false, true, true); + blk_mq_sched_insert_request(rq, false, true, true, false); } /* @@ -1446,10 +1445,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { + blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_get_driver_tag(rq, NULL, true); blk_insert_flush(rq); - goto run_queue; + blk_mq_run_hw_queue(data.hctx, true); + goto done; } plug = current->plug; @@ -1502,7 +1503,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_sched_insert_request(rq, false, true, - !is_sync || is_flush_fua); + !is_sync || is_flush_fua, true); goto done; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1512,7 +1513,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) * latter allows for merging opportunities and more efficient * dispatching. */ -run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } blk_mq_put_ctx(data.ctx); @@ -1568,10 +1568,12 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { + blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_get_driver_tag(rq, NULL, true); blk_insert_flush(rq); - goto run_queue; + blk_mq_run_hw_queue(data.hctx, true); + goto done; } /* @@ -1612,7 +1614,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_sched_insert_request(rq, false, true, - !is_sync || is_flush_fua); + !is_sync || is_flush_fua, true); goto done; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1622,7 +1624,6 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) * latter allows for merging opportunities and more efficient * dispatching. */ -run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 077a4003f1fd..57cdbf6c0cee 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -34,6 +34,8 @@ void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); +bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, + bool wait); /* * Internal helpers for allocating/freeing the request map -- cgit v1.2.3 From f3a8ab7d55bc49b44baa229723e0b5b6ebacac4a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Jan 2017 09:08:23 -0700 Subject: block: cleanup remaining manual checks for PREFLUSH|FUA Use op_is_flush() where applicable. Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq-sched.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 4bfd8674afd0..02833ce5664e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1880,7 +1880,7 @@ generic_make_request_checks(struct bio *bio) * drivers without flush support don't have to worry * about them. */ - if ((bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && + if (op_is_flush(bio->bi_opf) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!nr_sectors) { diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 1112752f888d..114814ec3d49 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -358,7 +358,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - if (rq->tag == -1 && (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))) { + if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { blk_mq_sched_insert_flush(hctx, rq, can_block); return; } -- cgit v1.2.3 From 400f73b23f457a82288814e21af57dbc9f3f2afd Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 27 Jan 2017 15:03:01 -0700 Subject: blk-mq: fix debugfs compilation issues This fixes a couple of problems: 1. In the !CONFIG_DEBUG_FS case, the stub definitions were bogus. 2. In the !CONFIG_BLOCK case, blk-mq-debugfs.c shouldn't be compiled at all. Fix the stub definitions and add a CONFIG_BLK_DEBUG_FS Kconfig option. Fixes: 07e4fead45e6 ("blk-mq: create debugfs directory tree") Signed-off-by: Omar Sandoval Augment Kconfig description. Signed-off-by: Jens Axboe --- block/Kconfig | 12 ++++++++++++ block/Makefile | 2 +- block/blk-mq.h | 11 ++++++----- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index 8bf114a3858a..7f659b23850c 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -147,6 +147,18 @@ config BLK_WBT_MQ Multiqueue currently doesn't have support for IO scheduling, enabling this option is recommended. +config BLK_DEBUG_FS + bool "Block layer debugging information in debugfs" + default y + depends on DEBUG_FS + ---help--- + Include block layer debugging information in debugfs. This information + is mostly useful for kernel developers, but it doesn't incur any cost + at runtime. + + Unless you are building a kernel for a tiny system, you should + say Y here. + menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 6cabe6bd2882..317165f8708c 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,4 +26,4 @@ obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o -obj-$(CONFIG_DEBUG_FS) += blk-mq-debugfs.o +obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/blk-mq.h b/block/blk-mq.h index 57cdbf6c0cee..b52abd62b1b0 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -84,7 +84,7 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); /* * debugfs helpers */ -#ifdef CONFIG_DEBUG_FS +#ifdef CONFIG_BLK_DEBUG_FS void blk_mq_debugfs_init(void); int blk_mq_debugfs_register(struct request_queue *q, const char *name); void blk_mq_debugfs_unregister(struct request_queue *q); @@ -95,21 +95,22 @@ static inline void blk_mq_debugfs_init(void) { } -int blk_mq_debugfs_register(struct request_queue *q, const char *name); +static inline int blk_mq_debugfs_register(struct request_queue *q, + const char *name) { return 0; } -void blk_mq_debugfs_unregister(struct request_queue *q) +static inline void blk_mq_debugfs_unregister(struct request_queue *q) { } -int blk_mq_debugfs_register_hctxs(struct request_queue *q) +static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q) { return 0; } -void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { } #endif -- cgit v1.2.3 From ade69e2432b795c76653e1dfa09c684549826a50 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 31 Jan 2017 13:17:09 +0100 Subject: lightnvm: merge gennvm with core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the first iteration of Open-Channel SSDs, it was anticipated that there could be various media managers on top of an open-channel SSD, such to allow vendors to plug in their own host-side FTLs, without the media manager in between. Now that an Open-Channel SSD is exposed as a traditional block device, there is no longer a need for this. Therefore lets merge the gennvm code with core and simplify the stack. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/Kconfig | 9 - drivers/lightnvm/Makefile | 3 +- drivers/lightnvm/core.c | 786 ++++++++++++++++++++++++++++++++----------- drivers/lightnvm/gennvm.c | 657 ------------------------------------ drivers/lightnvm/gennvm.h | 62 ---- drivers/lightnvm/sysblk.c | 733 ---------------------------------------- drivers/nvme/host/lightnvm.c | 7 +- include/linux/lightnvm.h | 97 +----- 8 files changed, 606 insertions(+), 1748 deletions(-) delete mode 100644 drivers/lightnvm/gennvm.c delete mode 100644 drivers/lightnvm/gennvm.h delete mode 100644 drivers/lightnvm/sysblk.c diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 2f5d5f4a4c75..052714106b7b 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -26,15 +26,6 @@ config NVM_DEBUG It is required to create/remove targets without IOCTLs. -config NVM_GENNVM - tristate "General Non-Volatile Memory Manager for Open-Channel SSDs" - ---help--- - Non-volatile memory media manager for Open-Channel SSDs that implements - physical media metadata management and block provisioning API. - - This is the standard media manager for using Open-Channel SSDs, and - required for targets to be instantiated. - config NVM_RRPC tristate "Round-robin Hybrid Open-Channel SSD target" ---help--- diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index a7a0a22cf1a5..b2a39e2d2895 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -2,6 +2,5 @@ # Makefile for Open-Channel SSDs. # -obj-$(CONFIG_NVM) := core.o sysblk.o -obj-$(CONFIG_NVM_GENNVM) += gennvm.o +obj-$(CONFIG_NVM) := core.o obj-$(CONFIG_NVM_RRPC) += rrpc.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 02240a0b39c9..e9a495650dd0 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -29,10 +29,492 @@ static LIST_HEAD(nvm_tgt_types); static DECLARE_RWSEM(nvm_tgtt_lock); -static LIST_HEAD(nvm_mgrs); static LIST_HEAD(nvm_devices); static DECLARE_RWSEM(nvm_lock); +/* Map between virtual and physical channel and lun */ +struct nvm_ch_map { + int ch_off; + int nr_luns; + int *lun_offs; +}; + +struct nvm_dev_map { + struct nvm_ch_map *chnls; + int nr_chnls; +}; + +struct nvm_area { + struct list_head list; + sector_t begin; + sector_t end; /* end is excluded */ +}; + +enum { + TRANS_TGT_TO_DEV = 0x0, + TRANS_DEV_TO_TGT = 0x1, +}; + +static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) +{ + struct nvm_target *tgt; + + list_for_each_entry(tgt, &dev->targets, list) + if (!strcmp(name, tgt->disk->disk_name)) + return tgt; + + return NULL; +} + +static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) +{ + int i; + + for (i = lun_begin; i <= lun_end; i++) { + if (test_and_set_bit(i, dev->lun_map)) { + pr_err("nvm: lun %d already allocated\n", i); + goto err; + } + } + + return 0; +err: + while (--i > lun_begin) + clear_bit(i, dev->lun_map); + + return -EBUSY; +} + +static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin, + int lun_end) +{ + int i; + + for (i = lun_begin; i <= lun_end; i++) + WARN_ON(!test_and_clear_bit(i, dev->lun_map)); +} + +static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_dev_map *dev_map = tgt_dev->map; + int i, j; + + for (i = 0; i < dev_map->nr_chnls; i++) { + struct nvm_ch_map *ch_map = &dev_map->chnls[i]; + int *lun_offs = ch_map->lun_offs; + int ch = i + ch_map->ch_off; + + for (j = 0; j < ch_map->nr_luns; j++) { + int lun = j + lun_offs[j]; + int lunid = (ch * dev->geo.luns_per_chnl) + lun; + + WARN_ON(!test_and_clear_bit(lunid, dev->lun_map)); + } + + kfree(ch_map->lun_offs); + } + + kfree(dev_map->chnls); + kfree(dev_map); + + kfree(tgt_dev->luns); + kfree(tgt_dev); +} + +static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, + int lun_begin, int lun_end) +{ + struct nvm_tgt_dev *tgt_dev = NULL; + struct nvm_dev_map *dev_rmap = dev->rmap; + struct nvm_dev_map *dev_map; + struct ppa_addr *luns; + int nr_luns = lun_end - lun_begin + 1; + int luns_left = nr_luns; + int nr_chnls = nr_luns / dev->geo.luns_per_chnl; + int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl; + int bch = lun_begin / dev->geo.luns_per_chnl; + int blun = lun_begin % dev->geo.luns_per_chnl; + int lunid = 0; + int lun_balanced = 1; + int prev_nr_luns; + int i, j; + + nr_chnls = nr_luns / dev->geo.luns_per_chnl; + nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1; + + dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); + if (!dev_map) + goto err_dev; + + dev_map->chnls = kcalloc(nr_chnls, sizeof(struct nvm_ch_map), + GFP_KERNEL); + if (!dev_map->chnls) + goto err_chnls; + + luns = kcalloc(nr_luns, sizeof(struct ppa_addr), GFP_KERNEL); + if (!luns) + goto err_luns; + + prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ? + dev->geo.luns_per_chnl : luns_left; + for (i = 0; i < nr_chnls; i++) { + struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; + int *lun_roffs = ch_rmap->lun_offs; + struct nvm_ch_map *ch_map = &dev_map->chnls[i]; + int *lun_offs; + int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ? + dev->geo.luns_per_chnl : luns_left; + + if (lun_balanced && prev_nr_luns != luns_in_chnl) + lun_balanced = 0; + + ch_map->ch_off = ch_rmap->ch_off = bch; + ch_map->nr_luns = luns_in_chnl; + + lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); + if (!lun_offs) + goto err_ch; + + for (j = 0; j < luns_in_chnl; j++) { + luns[lunid].ppa = 0; + luns[lunid].g.ch = i; + luns[lunid++].g.lun = j; + + lun_offs[j] = blun; + lun_roffs[j + blun] = blun; + } + + ch_map->lun_offs = lun_offs; + + /* when starting a new channel, lun offset is reset */ + blun = 0; + luns_left -= luns_in_chnl; + } + + dev_map->nr_chnls = nr_chnls; + + tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL); + if (!tgt_dev) + goto err_ch; + + memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); + /* Target device only owns a portion of the physical device */ + tgt_dev->geo.nr_chnls = nr_chnls; + tgt_dev->geo.nr_luns = nr_luns; + tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1; + tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; + tgt_dev->q = dev->q; + tgt_dev->map = dev_map; + tgt_dev->luns = luns; + memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id)); + + tgt_dev->parent = dev; + + return tgt_dev; +err_ch: + while (--i > 0) + kfree(dev_map->chnls[i].lun_offs); + kfree(luns); +err_luns: + kfree(dev_map->chnls); +err_chnls: + kfree(dev_map); +err_dev: + return tgt_dev; +} + +static const struct block_device_operations nvm_fops = { + .owner = THIS_MODULE, +}; + +static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) +{ + struct nvm_ioctl_create_simple *s = &create->conf.s; + struct request_queue *tqueue; + struct gendisk *tdisk; + struct nvm_tgt_type *tt; + struct nvm_target *t; + struct nvm_tgt_dev *tgt_dev; + void *targetdata; + + tt = nvm_find_target_type(create->tgttype, 1); + if (!tt) { + pr_err("nvm: target type %s not found\n", create->tgttype); + return -EINVAL; + } + + mutex_lock(&dev->mlock); + t = nvm_find_target(dev, create->tgtname); + if (t) { + pr_err("nvm: target name already exists.\n"); + mutex_unlock(&dev->mlock); + return -EINVAL; + } + mutex_unlock(&dev->mlock); + + if (nvm_reserve_luns(dev, s->lun_begin, s->lun_end)) + return -ENOMEM; + + t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); + if (!t) + goto err_reserve; + + tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end); + if (!tgt_dev) { + pr_err("nvm: could not create target device\n"); + goto err_t; + } + + tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); + if (!tqueue) + goto err_dev; + blk_queue_make_request(tqueue, tt->make_rq); + + tdisk = alloc_disk(0); + if (!tdisk) + goto err_queue; + + sprintf(tdisk->disk_name, "%s", create->tgtname); + tdisk->flags = GENHD_FL_EXT_DEVT; + tdisk->major = 0; + tdisk->first_minor = 0; + tdisk->fops = &nvm_fops; + tdisk->queue = tqueue; + + targetdata = tt->init(tgt_dev, tdisk); + if (IS_ERR(targetdata)) + goto err_init; + + tdisk->private_data = targetdata; + tqueue->queuedata = targetdata; + + blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect); + + set_capacity(tdisk, tt->capacity(targetdata)); + add_disk(tdisk); + + t->type = tt; + t->disk = tdisk; + t->dev = tgt_dev; + + mutex_lock(&dev->mlock); + list_add_tail(&t->list, &dev->targets); + mutex_unlock(&dev->mlock); + + return 0; +err_init: + put_disk(tdisk); +err_queue: + blk_cleanup_queue(tqueue); +err_dev: + kfree(tgt_dev); +err_t: + kfree(t); +err_reserve: + nvm_release_luns_err(dev, s->lun_begin, s->lun_end); + return -ENOMEM; +} + +static void __nvm_remove_target(struct nvm_target *t) +{ + struct nvm_tgt_type *tt = t->type; + struct gendisk *tdisk = t->disk; + struct request_queue *q = tdisk->queue; + + del_gendisk(tdisk); + blk_cleanup_queue(q); + + if (tt->exit) + tt->exit(tdisk->private_data); + + nvm_remove_tgt_dev(t->dev); + put_disk(tdisk); + + list_del(&t->list); + kfree(t); +} + +/** + * nvm_remove_tgt - Removes a target from the media manager + * @dev: device + * @remove: ioctl structure with target name to remove. + * + * Returns: + * 0: on success + * 1: on not found + * <0: on error + */ +static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) +{ + struct nvm_target *t; + + mutex_lock(&dev->mlock); + t = nvm_find_target(dev, remove->tgtname); + if (!t) { + mutex_unlock(&dev->mlock); + return 1; + } + __nvm_remove_target(t); + mutex_unlock(&dev->mlock); + + return 0; +} + +static int nvm_register_map(struct nvm_dev *dev) +{ + struct nvm_dev_map *rmap; + int i, j; + + rmap = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); + if (!rmap) + goto err_rmap; + + rmap->chnls = kcalloc(dev->geo.nr_chnls, sizeof(struct nvm_ch_map), + GFP_KERNEL); + if (!rmap->chnls) + goto err_chnls; + + for (i = 0; i < dev->geo.nr_chnls; i++) { + struct nvm_ch_map *ch_rmap; + int *lun_roffs; + int luns_in_chnl = dev->geo.luns_per_chnl; + + ch_rmap = &rmap->chnls[i]; + + ch_rmap->ch_off = -1; + ch_rmap->nr_luns = luns_in_chnl; + + lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); + if (!lun_roffs) + goto err_ch; + + for (j = 0; j < luns_in_chnl; j++) + lun_roffs[j] = -1; + + ch_rmap->lun_offs = lun_roffs; + } + + dev->rmap = rmap; + + return 0; +err_ch: + while (--i >= 0) + kfree(rmap->chnls[i].lun_offs); +err_chnls: + kfree(rmap); +err_rmap: + return -ENOMEM; +} + +static int nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) +{ + struct nvm_dev_map *dev_map = tgt_dev->map; + struct nvm_ch_map *ch_map = &dev_map->chnls[p->g.ch]; + int lun_off = ch_map->lun_offs[p->g.lun]; + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_dev_map *dev_rmap = dev->rmap; + struct nvm_ch_map *ch_rmap; + int lun_roff; + + p->g.ch += ch_map->ch_off; + p->g.lun += lun_off; + + ch_rmap = &dev_rmap->chnls[p->g.ch]; + lun_roff = ch_rmap->lun_offs[p->g.lun]; + + if (unlikely(ch_rmap->ch_off < 0 || lun_roff < 0)) { + pr_err("nvm: corrupted device partition table\n"); + return -EINVAL; + } + + return 0; +} + +static int nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_dev_map *dev_rmap = dev->rmap; + struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->g.ch]; + int lun_roff = ch_rmap->lun_offs[p->g.lun]; + + p->g.ch -= ch_rmap->ch_off; + p->g.lun -= lun_roff; + + return 0; +} + +static int nvm_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, + int flag) +{ + int i; + int ret; + + if (rqd->nr_ppas == 1) { + if (flag == TRANS_TGT_TO_DEV) + return nvm_map_to_dev(tgt_dev, &rqd->ppa_addr); + else + return nvm_map_to_tgt(tgt_dev, &rqd->ppa_addr); + } + + for (i = 0; i < rqd->nr_ppas; i++) { + if (flag == TRANS_TGT_TO_DEV) + ret = nvm_map_to_dev(tgt_dev, &rqd->ppa_list[i]); + else + ret = nvm_map_to_tgt(tgt_dev, &rqd->ppa_list[i]); + + if (ret) + break; + } + + return ret; +} + +static struct ppa_addr nvm_trans_ppa(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr p, int dir) +{ + struct ppa_addr ppa = p; + + if (dir == TRANS_TGT_TO_DEV) + nvm_map_to_dev(tgt_dev, &ppa); + else + nvm_map_to_tgt(tgt_dev, &ppa); + + return ppa; +} + +void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, + int len) +{ + struct nvm_geo *geo = &dev->geo; + struct nvm_dev_map *dev_rmap = dev->rmap; + u64 i; + + for (i = 0; i < len; i++) { + struct nvm_ch_map *ch_rmap; + int *lun_roffs; + struct ppa_addr gaddr; + u64 pba = le64_to_cpu(entries[i]); + int off; + u64 diff; + + if (!pba) + continue; + + gaddr = linear_to_generic_addr(geo, pba); + ch_rmap = &dev_rmap->chnls[gaddr.g.ch]; + lun_roffs = ch_rmap->lun_offs; + + off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun; + + diff = ((ch_rmap->ch_off * geo->luns_per_chnl) + + (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun; + + entries[i] -= cpu_to_le64(diff); + } +} +EXPORT_SYMBOL(nvm_part_to_tgt); + struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) { struct nvm_tgt_type *tmp, *tt = NULL; @@ -92,78 +574,6 @@ void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler) } EXPORT_SYMBOL(nvm_dev_dma_free); -static struct nvmm_type *nvm_find_mgr_type(const char *name) -{ - struct nvmm_type *mt; - - list_for_each_entry(mt, &nvm_mgrs, list) - if (!strcmp(name, mt->name)) - return mt; - - return NULL; -} - -static struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) -{ - struct nvmm_type *mt; - int ret; - - lockdep_assert_held(&nvm_lock); - - list_for_each_entry(mt, &nvm_mgrs, list) { - if (strncmp(dev->sb.mmtype, mt->name, NVM_MMTYPE_LEN)) - continue; - - ret = mt->register_mgr(dev); - if (ret < 0) { - pr_err("nvm: media mgr failed to init (%d) on dev %s\n", - ret, dev->name); - return NULL; /* initialization failed */ - } else if (ret > 0) - return mt; - } - - return NULL; -} - -int nvm_register_mgr(struct nvmm_type *mt) -{ - struct nvm_dev *dev; - int ret = 0; - - down_write(&nvm_lock); - if (nvm_find_mgr_type(mt->name)) { - ret = -EEXIST; - goto finish; - } else { - list_add(&mt->list, &nvm_mgrs); - } - - /* try to register media mgr if any device have none configured */ - list_for_each_entry(dev, &nvm_devices, devices) { - if (dev->mt) - continue; - - dev->mt = nvm_init_mgr(dev); - } -finish: - up_write(&nvm_lock); - - return ret; -} -EXPORT_SYMBOL(nvm_register_mgr); - -void nvm_unregister_mgr(struct nvmm_type *mt) -{ - if (!mt) - return; - - down_write(&nvm_lock); - list_del(&mt->list); - up_write(&nvm_lock); -} -EXPORT_SYMBOL(nvm_unregister_mgr); - static struct nvm_dev *nvm_find_nvm_dev(const char *name) { struct nvm_dev *dev; @@ -183,13 +593,13 @@ static void nvm_tgt_generic_to_addr_mode(struct nvm_tgt_dev *tgt_dev, if (rqd->nr_ppas > 1) { for (i = 0; i < rqd->nr_ppas; i++) { - rqd->ppa_list[i] = dev->mt->trans_ppa(tgt_dev, + rqd->ppa_list[i] = nvm_trans_ppa(tgt_dev, rqd->ppa_list[i], TRANS_TGT_TO_DEV); rqd->ppa_list[i] = generic_to_dev_addr(dev, rqd->ppa_list[i]); } } else { - rqd->ppa_addr = dev->mt->trans_ppa(tgt_dev, rqd->ppa_addr, + rqd->ppa_addr = nvm_trans_ppa(tgt_dev, rqd->ppa_addr, TRANS_TGT_TO_DEV); rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); } @@ -242,7 +652,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); nvm_free_rqd_ppalist(dev, &rqd); if (ret) { - pr_err("nvm: sysblk failed bb mark\n"); + pr_err("nvm: failed bb mark\n"); return -EINVAL; } @@ -262,15 +672,23 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { struct nvm_dev *dev = tgt_dev->parent; - return dev->mt->submit_io(tgt_dev, rqd); + if (!dev->ops->submit_io) + return -ENODEV; + + /* Convert address space */ + nvm_generic_to_addr_mode(dev, rqd); + + rqd->dev = tgt_dev; + return dev->ops->submit_io(dev, rqd); } EXPORT_SYMBOL(nvm_submit_io); int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, int flags) { - struct nvm_dev *dev = tgt_dev->parent; + /* Convert address space */ + nvm_map_to_dev(tgt_dev, p); - return dev->mt->erase_blk(tgt_dev, p, flags); + return nvm_erase_ppa(tgt_dev->parent, p, 1, flags); } EXPORT_SYMBOL(nvm_erase_blk); @@ -289,16 +707,65 @@ EXPORT_SYMBOL(nvm_get_l2p_tbl); int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len) { struct nvm_dev *dev = tgt_dev->parent; + struct nvm_geo *geo = &dev->geo; + struct nvm_area *area, *prev, *next; + sector_t begin = 0; + sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9; + + if (len > max_sectors) + return -EINVAL; + + area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL); + if (!area) + return -ENOMEM; + + prev = NULL; + + spin_lock(&dev->lock); + list_for_each_entry(next, &dev->area_list, list) { + if (begin + len > next->begin) { + begin = next->end; + prev = next; + continue; + } + break; + } + + if ((begin + len) > max_sectors) { + spin_unlock(&dev->lock); + kfree(area); + return -EINVAL; + } - return dev->mt->get_area(dev, lba, len); + area->begin = *lba = begin; + area->end = begin + len; + + if (prev) /* insert into sorted order */ + list_add(&area->list, &prev->list); + else + list_add(&area->list, &dev->area_list); + spin_unlock(&dev->lock); + + return 0; } EXPORT_SYMBOL(nvm_get_area); -void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t lba) +void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) { struct nvm_dev *dev = tgt_dev->parent; + struct nvm_area *area; - dev->mt->put_area(dev, lba); + spin_lock(&dev->lock); + list_for_each_entry(area, &dev->area_list, list) { + if (area->begin != begin) + continue; + + list_del(&area->list); + spin_unlock(&dev->lock); + kfree(area); + return; + } + spin_unlock(&dev->lock); } EXPORT_SYMBOL(nvm_put_area); @@ -409,8 +876,15 @@ EXPORT_SYMBOL(nvm_erase_ppa); void nvm_end_io(struct nvm_rq *rqd, int error) { + struct nvm_tgt_dev *tgt_dev = rqd->dev; + struct nvm_tgt_instance *ins = rqd->ins; + + /* Convert address space */ + if (tgt_dev) + nvm_trans_rq(tgt_dev, rqd, TRANS_DEV_TO_TGT); + rqd->error = error; - rqd->end_io(rqd); + ins->tt->end_io(rqd); } EXPORT_SYMBOL(nvm_end_io); @@ -570,10 +1044,9 @@ EXPORT_SYMBOL(nvm_get_bb_tbl); int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, u8 *blks) { - struct nvm_dev *dev = tgt_dev->parent; + ppa = nvm_trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV); - ppa = dev->mt->trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV); - return nvm_get_bb_tbl(dev, ppa, blks); + return nvm_get_bb_tbl(tgt_dev->parent, ppa, blks); } EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); @@ -691,36 +1164,31 @@ static int nvm_core_init(struct nvm_dev *dev) goto err_fmtype; } + INIT_LIST_HEAD(&dev->area_list); + INIT_LIST_HEAD(&dev->targets); mutex_init(&dev->mlock); spin_lock_init(&dev->lock); - blk_queue_logical_block_size(dev->q, geo->sec_size); + ret = nvm_register_map(dev); + if (ret) + goto err_fmtype; + blk_queue_logical_block_size(dev->q, geo->sec_size); return 0; err_fmtype: kfree(dev->lun_map); return ret; } -static void nvm_free_mgr(struct nvm_dev *dev) -{ - if (!dev->mt) - return; - - dev->mt->unregister_mgr(dev); - dev->mt = NULL; -} - void nvm_free(struct nvm_dev *dev) { if (!dev) return; - nvm_free_mgr(dev); - if (dev->dma_pool) dev->ops->destroy_dma_pool(dev->dma_pool); + kfree(dev->rmap); kfree(dev->lptbl); kfree(dev->lun_map); kfree(dev); @@ -731,9 +1199,6 @@ static int nvm_init(struct nvm_dev *dev) struct nvm_geo *geo = &dev->geo; int ret = -EINVAL; - if (!dev->q || !dev->ops) - return ret; - if (dev->ops->identity(dev, &dev->identity)) { pr_err("nvm: device could not be identified\n"); goto err; @@ -779,49 +1244,50 @@ int nvm_register(struct nvm_dev *dev) { int ret; - ret = nvm_init(dev); - if (ret) - goto err_init; + if (!dev->q || !dev->ops) + return -EINVAL; if (dev->ops->max_phys_sect > 256) { pr_info("nvm: max sectors supported is 256.\n"); - ret = -EINVAL; - goto err_init; + return -EINVAL; } if (dev->ops->max_phys_sect > 1) { dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist"); if (!dev->dma_pool) { pr_err("nvm: could not create dma pool\n"); - ret = -ENOMEM; - goto err_init; + return -ENOMEM; } } - if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) { - ret = nvm_get_sysblock(dev, &dev->sb); - if (!ret) - pr_err("nvm: device not initialized.\n"); - else if (ret < 0) - pr_err("nvm: err (%d) on device initialization\n", ret); - } + ret = nvm_init(dev); + if (ret) + goto err_init; /* register device with a supported media manager */ down_write(&nvm_lock); - if (ret > 0) - dev->mt = nvm_init_mgr(dev); list_add(&dev->devices, &nvm_devices); up_write(&nvm_lock); return 0; err_init: - kfree(dev->lun_map); + dev->ops->destroy_dma_pool(dev->dma_pool); return ret; } EXPORT_SYMBOL(nvm_register); void nvm_unregister(struct nvm_dev *dev) { + struct nvm_target *t, *tmp; + + mutex_lock(&dev->mlock); + list_for_each_entry_safe(t, tmp, &dev->targets, list) { + if (t->dev->parent != dev) + continue; + __nvm_remove_target(t); + } + mutex_unlock(&dev->mlock); + down_write(&nvm_lock); list_del(&dev->devices); up_write(&nvm_lock); @@ -844,11 +1310,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) return -EINVAL; } - if (!dev->mt) { - pr_info("nvm: device has no media manager registered.\n"); - return -ENODEV; - } - if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) { pr_err("nvm: config type not valid\n"); return -EINVAL; @@ -861,7 +1322,7 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) return -EINVAL; } - return dev->mt->create_tgt(dev, create); + return nvm_create_tgt(dev, create); } static long nvm_ioctl_info(struct file *file, void __user *arg) @@ -923,16 +1384,14 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg) struct nvm_ioctl_device_info *info = &devices->info[i]; sprintf(info->devname, "%s", dev->name); - if (dev->mt) { - info->bmversion[0] = dev->mt->version[0]; - info->bmversion[1] = dev->mt->version[1]; - info->bmversion[2] = dev->mt->version[2]; - sprintf(info->bmname, "%s", dev->mt->name); - } else { - sprintf(info->bmname, "none"); - } + /* kept for compatibility */ + info->bmversion[0] = 1; + info->bmversion[1] = 0; + info->bmversion[2] = 0; + sprintf(info->bmname, "%s", "gennvm"); i++; + if (i > 31) { pr_err("nvm: max 31 devices can be reported.\n"); break; @@ -994,7 +1453,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) } list_for_each_entry(dev, &nvm_devices, devices) { - ret = dev->mt->remove_tgt(dev, &remove); + ret = nvm_remove_tgt(dev, &remove); if (!ret) break; } @@ -1002,47 +1461,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) return ret; } -static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info) -{ - info->seqnr = 1; - info->erase_cnt = 0; - info->version = 1; -} - -static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init) -{ - struct nvm_dev *dev; - struct nvm_sb_info info; - int ret; - - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(init->dev); - up_write(&nvm_lock); - if (!dev) { - pr_err("nvm: device not found\n"); - return -EINVAL; - } - - nvm_setup_nvm_sb_info(&info); - - strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN); - info.fs_ppa.ppa = -1; - - if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) { - ret = nvm_init_sysblock(dev, &info); - if (ret) - return ret; - } - - memcpy(&dev->sb, &info, sizeof(struct nvm_sb_info)); - - down_write(&nvm_lock); - dev->mt = nvm_init_mgr(dev); - up_write(&nvm_lock); - - return 0; -} - +/* kept for compatibility reasons */ static long nvm_ioctl_dev_init(struct file *file, void __user *arg) { struct nvm_ioctl_dev_init init; @@ -1058,15 +1477,13 @@ static long nvm_ioctl_dev_init(struct file *file, void __user *arg) return -EINVAL; } - init.dev[DISK_NAME_LEN - 1] = '\0'; - - return __nvm_ioctl_dev_init(&init); + return 0; } +/* Kept for compatibility reasons */ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) { struct nvm_ioctl_dev_factory fact; - struct nvm_dev *dev; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1079,19 +1496,6 @@ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) return -EINVAL; - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(fact.dev); - up_write(&nvm_lock); - if (!dev) { - pr_err("nvm: device not found\n"); - return -EINVAL; - } - - nvm_free_mgr(dev); - - if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) - return nvm_dev_factory(dev, fact.flags); - return 0; } diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c deleted file mode 100644 index ca7880082d80..000000000000 --- a/drivers/lightnvm/gennvm.c +++ /dev/null @@ -1,657 +0,0 @@ -/* - * Copyright (C) 2015 Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - * - * Implementation of a general nvm manager for Open-Channel SSDs. - */ - -#include "gennvm.h" - -static struct nvm_target *gen_find_target(struct gen_dev *gn, const char *name) -{ - struct nvm_target *tgt; - - list_for_each_entry(tgt, &gn->targets, list) - if (!strcmp(name, tgt->disk->disk_name)) - return tgt; - - return NULL; -} - -static const struct block_device_operations gen_fops = { - .owner = THIS_MODULE, -}; - -static int gen_reserve_luns(struct nvm_dev *dev, struct nvm_target *t, - int lun_begin, int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) { - if (test_and_set_bit(i, dev->lun_map)) { - pr_err("nvm: lun %d already allocated\n", i); - goto err; - } - } - - return 0; - -err: - while (--i > lun_begin) - clear_bit(i, dev->lun_map); - - return -EBUSY; -} - -static void gen_release_luns_err(struct nvm_dev *dev, int lun_begin, - int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) - WARN_ON(!test_and_clear_bit(i, dev->lun_map)); -} - -static void gen_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct gen_dev_map *dev_map = tgt_dev->map; - int i, j; - - for (i = 0; i < dev_map->nr_chnls; i++) { - struct gen_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs = ch_map->lun_offs; - int ch = i + ch_map->ch_off; - - for (j = 0; j < ch_map->nr_luns; j++) { - int lun = j + lun_offs[j]; - int lunid = (ch * dev->geo.luns_per_chnl) + lun; - - WARN_ON(!test_and_clear_bit(lunid, dev->lun_map)); - } - - kfree(ch_map->lun_offs); - } - - kfree(dev_map->chnls); - kfree(dev_map); - kfree(tgt_dev->luns); - kfree(tgt_dev); -} - -static struct nvm_tgt_dev *gen_create_tgt_dev(struct nvm_dev *dev, - int lun_begin, int lun_end) -{ - struct nvm_tgt_dev *tgt_dev = NULL; - struct gen_dev_map *dev_rmap = dev->rmap; - struct gen_dev_map *dev_map; - struct ppa_addr *luns; - int nr_luns = lun_end - lun_begin + 1; - int luns_left = nr_luns; - int nr_chnls = nr_luns / dev->geo.luns_per_chnl; - int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl; - int bch = lun_begin / dev->geo.luns_per_chnl; - int blun = lun_begin % dev->geo.luns_per_chnl; - int lunid = 0; - int lun_balanced = 1; - int prev_nr_luns; - int i, j; - - nr_chnls = nr_luns / dev->geo.luns_per_chnl; - nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1; - - dev_map = kmalloc(sizeof(struct gen_dev_map), GFP_KERNEL); - if (!dev_map) - goto err_dev; - - dev_map->chnls = kcalloc(nr_chnls, sizeof(struct gen_ch_map), - GFP_KERNEL); - if (!dev_map->chnls) - goto err_chnls; - - luns = kcalloc(nr_luns, sizeof(struct ppa_addr), GFP_KERNEL); - if (!luns) - goto err_luns; - - prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ? - dev->geo.luns_per_chnl : luns_left; - for (i = 0; i < nr_chnls; i++) { - struct gen_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; - int *lun_roffs = ch_rmap->lun_offs; - struct gen_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs; - int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ? - dev->geo.luns_per_chnl : luns_left; - - if (lun_balanced && prev_nr_luns != luns_in_chnl) - lun_balanced = 0; - - ch_map->ch_off = ch_rmap->ch_off = bch; - ch_map->nr_luns = luns_in_chnl; - - lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_offs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) { - luns[lunid].ppa = 0; - luns[lunid].g.ch = i; - luns[lunid++].g.lun = j; - - lun_offs[j] = blun; - lun_roffs[j + blun] = blun; - } - - ch_map->lun_offs = lun_offs; - - /* when starting a new channel, lun offset is reset */ - blun = 0; - luns_left -= luns_in_chnl; - } - - dev_map->nr_chnls = nr_chnls; - - tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL); - if (!tgt_dev) - goto err_ch; - - memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); - /* Target device only owns a portion of the physical device */ - tgt_dev->geo.nr_chnls = nr_chnls; - tgt_dev->geo.nr_luns = nr_luns; - tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1; - tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; - tgt_dev->q = dev->q; - tgt_dev->map = dev_map; - tgt_dev->luns = luns; - memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id)); - - tgt_dev->parent = dev; - - return tgt_dev; -err_ch: - while (--i > 0) - kfree(dev_map->chnls[i].lun_offs); - kfree(luns); -err_luns: - kfree(dev_map->chnls); -err_chnls: - kfree(dev_map); -err_dev: - return tgt_dev; -} - -static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) -{ - struct gen_dev *gn = dev->mp; - struct nvm_ioctl_create_simple *s = &create->conf.s; - struct request_queue *tqueue; - struct gendisk *tdisk; - struct nvm_tgt_type *tt; - struct nvm_target *t; - struct nvm_tgt_dev *tgt_dev; - void *targetdata; - - tt = nvm_find_target_type(create->tgttype, 1); - if (!tt) { - pr_err("nvm: target type %s not found\n", create->tgttype); - return -EINVAL; - } - - mutex_lock(&gn->lock); - t = gen_find_target(gn, create->tgtname); - if (t) { - pr_err("nvm: target name already exists.\n"); - mutex_unlock(&gn->lock); - return -EINVAL; - } - mutex_unlock(&gn->lock); - - t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); - if (!t) - return -ENOMEM; - - if (gen_reserve_luns(dev, t, s->lun_begin, s->lun_end)) - goto err_t; - - tgt_dev = gen_create_tgt_dev(dev, s->lun_begin, s->lun_end); - if (!tgt_dev) { - pr_err("nvm: could not create target device\n"); - goto err_reserve; - } - - tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); - if (!tqueue) - goto err_dev; - blk_queue_make_request(tqueue, tt->make_rq); - - tdisk = alloc_disk(0); - if (!tdisk) - goto err_queue; - - sprintf(tdisk->disk_name, "%s", create->tgtname); - tdisk->flags = GENHD_FL_EXT_DEVT; - tdisk->major = 0; - tdisk->first_minor = 0; - tdisk->fops = &gen_fops; - tdisk->queue = tqueue; - - targetdata = tt->init(tgt_dev, tdisk); - if (IS_ERR(targetdata)) - goto err_init; - - tdisk->private_data = targetdata; - tqueue->queuedata = targetdata; - - blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect); - - set_capacity(tdisk, tt->capacity(targetdata)); - add_disk(tdisk); - - t->type = tt; - t->disk = tdisk; - t->dev = tgt_dev; - - mutex_lock(&gn->lock); - list_add_tail(&t->list, &gn->targets); - mutex_unlock(&gn->lock); - - return 0; -err_init: - put_disk(tdisk); -err_queue: - blk_cleanup_queue(tqueue); -err_dev: - kfree(tgt_dev); -err_reserve: - gen_release_luns_err(dev, s->lun_begin, s->lun_end); -err_t: - kfree(t); - return -ENOMEM; -} - -static void __gen_remove_target(struct nvm_target *t) -{ - struct nvm_tgt_type *tt = t->type; - struct gendisk *tdisk = t->disk; - struct request_queue *q = tdisk->queue; - - del_gendisk(tdisk); - blk_cleanup_queue(q); - - if (tt->exit) - tt->exit(tdisk->private_data); - - gen_remove_tgt_dev(t->dev); - put_disk(tdisk); - - list_del(&t->list); - kfree(t); -} - -/** - * gen_remove_tgt - Removes a target from the media manager - * @dev: device - * @remove: ioctl structure with target name to remove. - * - * Returns: - * 0: on success - * 1: on not found - * <0: on error - */ -static int gen_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) -{ - struct gen_dev *gn = dev->mp; - struct nvm_target *t; - - if (!gn) - return 1; - - mutex_lock(&gn->lock); - t = gen_find_target(gn, remove->tgtname); - if (!t) { - mutex_unlock(&gn->lock); - return 1; - } - __gen_remove_target(t); - mutex_unlock(&gn->lock); - - return 0; -} - -static int gen_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len) -{ - struct nvm_geo *geo = &dev->geo; - struct gen_dev *gn = dev->mp; - struct gen_area *area, *prev, *next; - sector_t begin = 0; - sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9; - - if (len > max_sectors) - return -EINVAL; - - area = kmalloc(sizeof(struct gen_area), GFP_KERNEL); - if (!area) - return -ENOMEM; - - prev = NULL; - - spin_lock(&dev->lock); - list_for_each_entry(next, &gn->area_list, list) { - if (begin + len > next->begin) { - begin = next->end; - prev = next; - continue; - } - break; - } - - if ((begin + len) > max_sectors) { - spin_unlock(&dev->lock); - kfree(area); - return -EINVAL; - } - - area->begin = *lba = begin; - area->end = begin + len; - - if (prev) /* insert into sorted order */ - list_add(&area->list, &prev->list); - else - list_add(&area->list, &gn->area_list); - spin_unlock(&dev->lock); - - return 0; -} - -static void gen_put_area(struct nvm_dev *dev, sector_t begin) -{ - struct gen_dev *gn = dev->mp; - struct gen_area *area; - - spin_lock(&dev->lock); - list_for_each_entry(area, &gn->area_list, list) { - if (area->begin != begin) - continue; - - list_del(&area->list); - spin_unlock(&dev->lock); - kfree(area); - return; - } - spin_unlock(&dev->lock); -} - -static void gen_free(struct nvm_dev *dev) -{ - kfree(dev->mp); - kfree(dev->rmap); - dev->mp = NULL; -} - -static int gen_register(struct nvm_dev *dev) -{ - struct gen_dev *gn; - struct gen_dev_map *dev_rmap; - int i, j; - - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - - gn = kzalloc(sizeof(struct gen_dev), GFP_KERNEL); - if (!gn) - goto err_gn; - - dev_rmap = kmalloc(sizeof(struct gen_dev_map), GFP_KERNEL); - if (!dev_rmap) - goto err_rmap; - - dev_rmap->chnls = kcalloc(dev->geo.nr_chnls, sizeof(struct gen_ch_map), - GFP_KERNEL); - if (!dev_rmap->chnls) - goto err_chnls; - - for (i = 0; i < dev->geo.nr_chnls; i++) { - struct gen_ch_map *ch_rmap; - int *lun_roffs; - int luns_in_chnl = dev->geo.luns_per_chnl; - - ch_rmap = &dev_rmap->chnls[i]; - - ch_rmap->ch_off = -1; - ch_rmap->nr_luns = luns_in_chnl; - - lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_roffs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) - lun_roffs[j] = -1; - - ch_rmap->lun_offs = lun_roffs; - } - - gn->dev = dev; - gn->nr_luns = dev->geo.nr_luns; - INIT_LIST_HEAD(&gn->area_list); - mutex_init(&gn->lock); - INIT_LIST_HEAD(&gn->targets); - dev->mp = gn; - dev->rmap = dev_rmap; - - return 1; -err_ch: - while (--i >= 0) - kfree(dev_rmap->chnls[i].lun_offs); -err_chnls: - kfree(dev_rmap); -err_rmap: - gen_free(dev); -err_gn: - module_put(THIS_MODULE); - return -ENOMEM; -} - -static void gen_unregister(struct nvm_dev *dev) -{ - struct gen_dev *gn = dev->mp; - struct nvm_target *t, *tmp; - - mutex_lock(&gn->lock); - list_for_each_entry_safe(t, tmp, &gn->targets, list) { - if (t->dev->parent != dev) - continue; - __gen_remove_target(t); - } - mutex_unlock(&gn->lock); - - gen_free(dev); - module_put(THIS_MODULE); -} - -static int gen_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct gen_dev_map *dev_map = tgt_dev->map; - struct gen_ch_map *ch_map = &dev_map->chnls[p->g.ch]; - int lun_off = ch_map->lun_offs[p->g.lun]; - struct nvm_dev *dev = tgt_dev->parent; - struct gen_dev_map *dev_rmap = dev->rmap; - struct gen_ch_map *ch_rmap; - int lun_roff; - - p->g.ch += ch_map->ch_off; - p->g.lun += lun_off; - - ch_rmap = &dev_rmap->chnls[p->g.ch]; - lun_roff = ch_rmap->lun_offs[p->g.lun]; - - if (unlikely(ch_rmap->ch_off < 0 || lun_roff < 0)) { - pr_err("nvm: corrupted device partition table\n"); - return -EINVAL; - } - - return 0; -} - -static int gen_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct gen_dev_map *dev_rmap = dev->rmap; - struct gen_ch_map *ch_rmap = &dev_rmap->chnls[p->g.ch]; - int lun_roff = ch_rmap->lun_offs[p->g.lun]; - - p->g.ch -= ch_rmap->ch_off; - p->g.lun -= lun_roff; - - return 0; -} - -static int gen_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - int flag) -{ - gen_trans_fn *f; - int i; - int ret = 0; - - f = (flag == TRANS_TGT_TO_DEV) ? gen_map_to_dev : gen_map_to_tgt; - - if (rqd->nr_ppas == 1) - return f(tgt_dev, &rqd->ppa_addr); - - for (i = 0; i < rqd->nr_ppas; i++) { - ret = f(tgt_dev, &rqd->ppa_list[i]); - if (ret) - goto out; - } - -out: - return ret; -} - -static void gen_end_io(struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *tgt_dev = rqd->dev; - struct nvm_tgt_instance *ins = rqd->ins; - - /* Convert address space */ - if (tgt_dev) - gen_trans_rq(tgt_dev, rqd, TRANS_DEV_TO_TGT); - - ins->tt->end_io(rqd); -} - -static int gen_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - struct nvm_dev *dev = tgt_dev->parent; - - if (!dev->ops->submit_io) - return -ENODEV; - - /* Convert address space */ - gen_trans_rq(tgt_dev, rqd, TRANS_TGT_TO_DEV); - nvm_generic_to_addr_mode(dev, rqd); - - rqd->dev = tgt_dev; - rqd->end_io = gen_end_io; - return dev->ops->submit_io(dev, rqd); -} - -static int gen_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, - int flags) -{ - /* Convert address space */ - gen_map_to_dev(tgt_dev, p); - - return nvm_erase_ppa(tgt_dev->parent, p, 1, flags); -} - -static struct ppa_addr gen_trans_ppa(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr p, int direction) -{ - gen_trans_fn *f; - struct ppa_addr ppa = p; - - f = (direction == TRANS_TGT_TO_DEV) ? gen_map_to_dev : gen_map_to_tgt; - f(tgt_dev, &ppa); - - return ppa; -} - -static void gen_part_to_tgt(struct nvm_dev *dev, sector_t *entries, - int len) -{ - struct nvm_geo *geo = &dev->geo; - struct gen_dev_map *dev_rmap = dev->rmap; - u64 i; - - for (i = 0; i < len; i++) { - struct gen_ch_map *ch_rmap; - int *lun_roffs; - struct ppa_addr gaddr; - u64 pba = le64_to_cpu(entries[i]); - int off; - u64 diff; - - if (!pba) - continue; - - gaddr = linear_to_generic_addr(geo, pba); - ch_rmap = &dev_rmap->chnls[gaddr.g.ch]; - lun_roffs = ch_rmap->lun_offs; - - off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun; - - diff = ((ch_rmap->ch_off * geo->luns_per_chnl) + - (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun; - - entries[i] -= cpu_to_le64(diff); - } -} - -static struct nvmm_type gen = { - .name = "gennvm", - .version = {0, 1, 0}, - - .register_mgr = gen_register, - .unregister_mgr = gen_unregister, - - .create_tgt = gen_create_tgt, - .remove_tgt = gen_remove_tgt, - - .submit_io = gen_submit_io, - .erase_blk = gen_erase_blk, - - .get_area = gen_get_area, - .put_area = gen_put_area, - - .trans_ppa = gen_trans_ppa, - .part_to_tgt = gen_part_to_tgt, -}; - -static int __init gen_module_init(void) -{ - return nvm_register_mgr(&gen); -} - -static void gen_module_exit(void) -{ - nvm_unregister_mgr(&gen); -} - -module_init(gen_module_init); -module_exit(gen_module_exit); -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("General media manager for Open-Channel SSDs"); diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h deleted file mode 100644 index 6a4b3f368848..000000000000 --- a/drivers/lightnvm/gennvm.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright: Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - */ - -#ifndef GENNVM_H_ -#define GENNVM_H_ - -#include -#include - -#include - -struct gen_dev { - struct nvm_dev *dev; - - int nr_luns; - struct list_head area_list; - - struct mutex lock; - struct list_head targets; -}; - -/* Map between virtual and physical channel and lun */ -struct gen_ch_map { - int ch_off; - int nr_luns; - int *lun_offs; -}; - -struct gen_dev_map { - struct gen_ch_map *chnls; - int nr_chnls; -}; - -struct gen_area { - struct list_head list; - sector_t begin; - sector_t end; /* end is excluded */ -}; - -static inline void *ch_map_to_lun_offs(struct gen_ch_map *ch_map) -{ - return ch_map + 1; -} - -typedef int (gen_trans_fn)(struct nvm_tgt_dev *, struct ppa_addr *); - -#define gen_for_each_lun(bm, lun, i) \ - for ((i) = 0, lun = &(bm)->luns[0]; \ - (i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)]) - -#endif /* GENNVM_H_ */ diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c deleted file mode 100644 index 12002bf4efc2..000000000000 --- a/drivers/lightnvm/sysblk.c +++ /dev/null @@ -1,733 +0,0 @@ -/* - * Copyright (C) 2015 Matias Bjorling. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - * - */ - -#include - -#define MAX_SYSBLKS 3 /* remember to update mapping scheme on change */ -#define MAX_BLKS_PR_SYSBLK 2 /* 2 blks with 256 pages and 3000 erases - * enables ~1.5M updates per sysblk unit - */ - -struct sysblk_scan { - /* A row is a collection of flash blocks for a system block. */ - int nr_rows; - int row; - int act_blk[MAX_SYSBLKS]; - - int nr_ppas; - struct ppa_addr ppas[MAX_SYSBLKS * MAX_BLKS_PR_SYSBLK];/* all sysblks */ -}; - -static inline int scan_ppa_idx(int row, int blkid) -{ - return (row * MAX_BLKS_PR_SYSBLK) + blkid; -} - -static void nvm_sysblk_to_cpu(struct nvm_sb_info *info, - struct nvm_system_block *sb) -{ - info->seqnr = be32_to_cpu(sb->seqnr); - info->erase_cnt = be32_to_cpu(sb->erase_cnt); - info->version = be16_to_cpu(sb->version); - strncpy(info->mmtype, sb->mmtype, NVM_MMTYPE_LEN); - info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa); -} - -static void nvm_cpu_to_sysblk(struct nvm_system_block *sb, - struct nvm_sb_info *info) -{ - sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC); - sb->seqnr = cpu_to_be32(info->seqnr); - sb->erase_cnt = cpu_to_be32(info->erase_cnt); - sb->version = cpu_to_be16(info->version); - strncpy(sb->mmtype, info->mmtype, NVM_MMTYPE_LEN); - sb->fs_ppa = cpu_to_be64(info->fs_ppa.ppa); -} - -static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas) -{ - struct nvm_geo *geo = &dev->geo; - int nr_rows = min_t(int, MAX_SYSBLKS, geo->nr_chnls); - int i; - - for (i = 0; i < nr_rows; i++) - sysblk_ppas[i].ppa = 0; - - /* if possible, place sysblk at first channel, middle channel and last - * channel of the device. If not, create only one or two sys blocks - */ - switch (geo->nr_chnls) { - case 2: - sysblk_ppas[1].g.ch = 1; - /* fall-through */ - case 1: - sysblk_ppas[0].g.ch = 0; - break; - default: - sysblk_ppas[0].g.ch = 0; - sysblk_ppas[1].g.ch = geo->nr_chnls / 2; - sysblk_ppas[2].g.ch = geo->nr_chnls - 1; - break; - } - - return nr_rows; -} - -static void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, - struct ppa_addr *sysblk_ppas) -{ - memset(s, 0, sizeof(struct sysblk_scan)); - s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas); -} - -static int sysblk_get_free_blks(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, - struct sysblk_scan *s) -{ - struct ppa_addr *sppa; - int i, blkid = 0; - - nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); - if (nr_blks < 0) - return nr_blks; - - for (i = 0; i < nr_blks; i++) { - if (blks[i] == NVM_BLK_T_HOST) - return -EEXIST; - - if (blks[i] != NVM_BLK_T_FREE) - continue; - - sppa = &s->ppas[scan_ppa_idx(s->row, blkid)]; - sppa->g.ch = ppa.g.ch; - sppa->g.lun = ppa.g.lun; - sppa->g.blk = i; - s->nr_ppas++; - blkid++; - - pr_debug("nvm: use (%u %u %u) as sysblk\n", - sppa->g.ch, sppa->g.lun, sppa->g.blk); - if (blkid > MAX_BLKS_PR_SYSBLK - 1) - return 0; - } - - pr_err("nvm: sysblk failed get sysblk\n"); - return -EINVAL; -} - -static int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, - struct sysblk_scan *s) -{ - int i, nr_sysblk = 0; - - nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); - if (nr_blks < 0) - return nr_blks; - - for (i = 0; i < nr_blks; i++) { - if (blks[i] != NVM_BLK_T_HOST) - continue; - - if (s->nr_ppas == MAX_BLKS_PR_SYSBLK * MAX_SYSBLKS) { - pr_err("nvm: too many host blks\n"); - return -EINVAL; - } - - ppa.g.blk = i; - - s->ppas[scan_ppa_idx(s->row, nr_sysblk)] = ppa; - s->nr_ppas++; - nr_sysblk++; - } - - return 0; -} - -static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s, - struct ppa_addr *ppas, int get_free) -{ - struct nvm_geo *geo = &dev->geo; - int i, nr_blks, ret = 0; - u8 *blks; - - s->nr_ppas = 0; - nr_blks = geo->blks_per_lun * geo->plane_mode; - - blks = kmalloc(nr_blks, GFP_KERNEL); - if (!blks) - return -ENOMEM; - - for (i = 0; i < s->nr_rows; i++) { - s->row = i; - - ret = nvm_get_bb_tbl(dev, ppas[i], blks); - if (ret) { - pr_err("nvm: failed bb tbl for ppa (%u %u)\n", - ppas[i].g.ch, - ppas[i].g.blk); - goto err_get; - } - - if (get_free) - ret = sysblk_get_free_blks(dev, ppas[i], blks, nr_blks, - s); - else - ret = sysblk_get_host_blks(dev, ppas[i], blks, nr_blks, - s); - - if (ret) - goto err_get; - } - -err_get: - kfree(blks); - return ret; -} - -/* - * scans a block for latest sysblk. - * Returns: - * 0 - newer sysblk not found. PPA is updated to latest page. - * 1 - newer sysblk found and stored in *cur. PPA is updated to - * next valid page. - * <0- error. - */ -static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa, - struct nvm_system_block *sblk) -{ - struct nvm_geo *geo = &dev->geo; - struct nvm_system_block *cur; - int pg, ret, found = 0; - - /* the full buffer for a flash page is allocated. Only the first of it - * contains the system block information - */ - cur = kmalloc(geo->pfpg_size, GFP_KERNEL); - if (!cur) - return -ENOMEM; - - /* perform linear scan through the block */ - for (pg = 0; pg < dev->lps_per_blk; pg++) { - ppa->g.pg = ppa_to_slc(dev, pg); - - ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE, - cur, geo->pfpg_size); - if (ret) { - if (ret == NVM_RSP_ERR_EMPTYPAGE) { - pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n", - ppa->g.ch, - ppa->g.lun, - ppa->g.blk, - ppa->g.pg); - break; - } - pr_err("nvm: read failed (%x) for ppa (%u %u %u %u)", - ret, - ppa->g.ch, - ppa->g.lun, - ppa->g.blk, - ppa->g.pg); - break; /* if we can't read a page, continue to the - * next blk - */ - } - - if (be32_to_cpu(cur->magic) != NVM_SYSBLK_MAGIC) { - pr_debug("nvm: scan break for ppa (%u %u %u %u)\n", - ppa->g.ch, - ppa->g.lun, - ppa->g.blk, - ppa->g.pg); - break; /* last valid page already found */ - } - - if (be32_to_cpu(cur->seqnr) < be32_to_cpu(sblk->seqnr)) - continue; - - memcpy(sblk, cur, sizeof(struct nvm_system_block)); - found = 1; - } - - kfree(cur); - - return found; -} - -static int nvm_sysblk_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, - int type) -{ - return nvm_set_bb_tbl(dev, s->ppas, s->nr_ppas, type); -} - -static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, - struct sysblk_scan *s) -{ - struct nvm_geo *geo = &dev->geo; - struct nvm_system_block nvmsb; - void *buf; - int i, sect, ret = 0; - struct ppa_addr *ppas; - - nvm_cpu_to_sysblk(&nvmsb, info); - - buf = kzalloc(geo->pfpg_size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - memcpy(buf, &nvmsb, sizeof(struct nvm_system_block)); - - ppas = kcalloc(geo->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL); - if (!ppas) { - ret = -ENOMEM; - goto err; - } - - /* Write and verify */ - for (i = 0; i < s->nr_rows; i++) { - ppas[0] = s->ppas[scan_ppa_idx(i, s->act_blk[i])]; - - pr_debug("nvm: writing sysblk to ppa (%u %u %u %u)\n", - ppas[0].g.ch, - ppas[0].g.lun, - ppas[0].g.blk, - ppas[0].g.pg); - - /* Expand to all sectors within a flash page */ - if (geo->sec_per_pg > 1) { - for (sect = 1; sect < geo->sec_per_pg; sect++) { - ppas[sect].ppa = ppas[0].ppa; - ppas[sect].g.sec = sect; - } - } - - ret = nvm_submit_ppa(dev, ppas, geo->sec_per_pg, NVM_OP_PWRITE, - NVM_IO_SLC_MODE, buf, geo->pfpg_size); - if (ret) { - pr_err("nvm: sysblk failed program (%u %u %u)\n", - ppas[0].g.ch, - ppas[0].g.lun, - ppas[0].g.blk); - break; - } - - ret = nvm_submit_ppa(dev, ppas, geo->sec_per_pg, NVM_OP_PREAD, - NVM_IO_SLC_MODE, buf, geo->pfpg_size); - if (ret) { - pr_err("nvm: sysblk failed read (%u %u %u)\n", - ppas[0].g.ch, - ppas[0].g.lun, - ppas[0].g.blk); - break; - } - - if (memcmp(buf, &nvmsb, sizeof(struct nvm_system_block))) { - pr_err("nvm: sysblk failed verify (%u %u %u)\n", - ppas[0].g.ch, - ppas[0].g.lun, - ppas[0].g.blk); - ret = -EINVAL; - break; - } - } - - kfree(ppas); -err: - kfree(buf); - - return ret; -} - -static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s) -{ - int i, ret; - unsigned long nxt_blk; - struct ppa_addr *ppa; - - for (i = 0; i < s->nr_rows; i++) { - nxt_blk = (s->act_blk[i] + 1) % MAX_BLKS_PR_SYSBLK; - ppa = &s->ppas[scan_ppa_idx(i, nxt_blk)]; - ppa->g.pg = ppa_to_slc(dev, 0); - - ret = nvm_erase_ppa(dev, ppa, 1, 0); - if (ret) - return ret; - - s->act_blk[i] = nxt_blk; - } - - return 0; -} - -int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) -{ - struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; - struct sysblk_scan s; - struct nvm_system_block *cur; - int i, j, found = 0; - int ret = -ENOMEM; - - /* - * 1. setup sysblk locations - * 2. get bad block list - * 3. filter on host-specific (type 3) - * 4. iterate through all and find the highest seq nr. - * 5. return superblock information - */ - - if (!dev->ops->get_bb_tbl) - return -EINVAL; - - nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); - - mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0); - if (ret) - goto err_sysblk; - - /* no sysblocks initialized */ - if (!s.nr_ppas) - goto err_sysblk; - - cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); - if (!cur) - goto err_sysblk; - - /* find the latest block across all sysblocks */ - for (i = 0; i < s.nr_rows; i++) { - for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { - struct ppa_addr ppa = s.ppas[scan_ppa_idx(i, j)]; - - ret = nvm_scan_block(dev, &ppa, cur); - if (ret > 0) - found = 1; - else if (ret < 0) - break; - } - } - - nvm_sysblk_to_cpu(info, cur); - - kfree(cur); -err_sysblk: - mutex_unlock(&dev->mlock); - - if (found) - return 1; - return ret; -} - -int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new) -{ - /* 1. for each latest superblock - * 2. if room - * a. write new flash page entry with the updated information - * 3. if no room - * a. find next available block on lun (linear search) - * if none, continue to next lun - * if none at all, report error. also report that it wasn't - * possible to write to all superblocks. - * c. write data to block. - */ - struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; - struct sysblk_scan s; - struct nvm_system_block *cur; - int i, j, ppaidx, found = 0; - int ret = -ENOMEM; - - if (!dev->ops->get_bb_tbl) - return -EINVAL; - - nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); - - mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0); - if (ret) - goto err_sysblk; - - cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); - if (!cur) - goto err_sysblk; - - /* Get the latest sysblk for each sysblk row */ - for (i = 0; i < s.nr_rows; i++) { - found = 0; - for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { - ppaidx = scan_ppa_idx(i, j); - ret = nvm_scan_block(dev, &s.ppas[ppaidx], cur); - if (ret > 0) { - s.act_blk[i] = j; - found = 1; - } else if (ret < 0) - break; - } - } - - if (!found) { - pr_err("nvm: no valid sysblks found to update\n"); - ret = -EINVAL; - goto err_cur; - } - - /* - * All sysblocks found. Check that they have same page id in their flash - * blocks - */ - for (i = 1; i < s.nr_rows; i++) { - struct ppa_addr l = s.ppas[scan_ppa_idx(0, s.act_blk[0])]; - struct ppa_addr r = s.ppas[scan_ppa_idx(i, s.act_blk[i])]; - - if (l.g.pg != r.g.pg) { - pr_err("nvm: sysblks not on same page. Previous update failed.\n"); - ret = -EINVAL; - goto err_cur; - } - } - - /* - * Check that there haven't been another update to the seqnr since we - * began - */ - if ((new->seqnr - 1) != be32_to_cpu(cur->seqnr)) { - pr_err("nvm: seq is not sequential\n"); - ret = -EINVAL; - goto err_cur; - } - - /* - * When all pages in a block has been written, a new block is selected - * and writing is performed on the new block. - */ - if (s.ppas[scan_ppa_idx(0, s.act_blk[0])].g.pg == - dev->lps_per_blk - 1) { - ret = nvm_prepare_new_sysblks(dev, &s); - if (ret) - goto err_cur; - } - - ret = nvm_write_and_verify(dev, new, &s); -err_cur: - kfree(cur); -err_sysblk: - mutex_unlock(&dev->mlock); - - return ret; -} - -int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; - struct sysblk_scan s; - int ret; - - /* - * 1. select master blocks and select first available blks - * 2. get bad block list - * 3. mark MAX_SYSBLKS block as host-based device allocated. - * 4. write and verify data to block - */ - - if (!dev->ops->get_bb_tbl || !dev->ops->set_bb_tbl) - return -EINVAL; - - if (!(geo->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) { - pr_err("nvm: memory does not support SLC access\n"); - return -EINVAL; - } - - /* Index all sysblocks and mark them as host-driven */ - nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); - - mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 1); - if (ret) - goto err_mark; - - ret = nvm_sysblk_set_bb_tbl(dev, &s, NVM_BLK_T_HOST); - if (ret) - goto err_mark; - - /* Write to the first block of each row */ - ret = nvm_write_and_verify(dev, info, &s); -err_mark: - mutex_unlock(&dev->mlock); - return ret; -} - -static int factory_nblks(int nblks) -{ - /* Round up to nearest BITS_PER_LONG */ - return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); -} - -static unsigned int factory_blk_offset(struct nvm_geo *geo, struct ppa_addr ppa) -{ - int nblks = factory_nblks(geo->blks_per_lun); - - return ((ppa.g.ch * geo->luns_per_chnl * nblks) + (ppa.g.lun * nblks)) / - BITS_PER_LONG; -} - -static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, - unsigned long *blk_bitmap, int flags) -{ - int i, lunoff; - - nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks); - if (nr_blks < 0) - return nr_blks; - - lunoff = factory_blk_offset(&dev->geo, ppa); - - /* non-set bits correspond to the block must be erased */ - for (i = 0; i < nr_blks; i++) { - switch (blks[i]) { - case NVM_BLK_T_FREE: - if (flags & NVM_FACTORY_ERASE_ONLY_USER) - set_bit(i, &blk_bitmap[lunoff]); - break; - case NVM_BLK_T_HOST: - if (!(flags & NVM_FACTORY_RESET_HOST_BLKS)) - set_bit(i, &blk_bitmap[lunoff]); - break; - case NVM_BLK_T_GRWN_BAD: - if (!(flags & NVM_FACTORY_RESET_GRWN_BBLKS)) - set_bit(i, &blk_bitmap[lunoff]); - break; - default: - set_bit(i, &blk_bitmap[lunoff]); - break; - } - } - - return 0; -} - -static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, - int max_ppas, unsigned long *blk_bitmap) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - int ch, lun, blkid, idx, done = 0, ppa_cnt = 0; - unsigned long *offset; - - while (!done) { - done = 1; - nvm_for_each_lun_ppa(geo, ppa, ch, lun) { - idx = factory_blk_offset(geo, ppa); - offset = &blk_bitmap[idx]; - - blkid = find_first_zero_bit(offset, geo->blks_per_lun); - if (blkid >= geo->blks_per_lun) - continue; - set_bit(blkid, offset); - - ppa.g.blk = blkid; - pr_debug("nvm: erase ppa (%u %u %u)\n", - ppa.g.ch, - ppa.g.lun, - ppa.g.blk); - - erase_list[ppa_cnt] = ppa; - ppa_cnt++; - done = 0; - - if (ppa_cnt == max_ppas) - return ppa_cnt; - } - } - - return ppa_cnt; -} - -static int nvm_fact_select_blks(struct nvm_dev *dev, unsigned long *blk_bitmap, - int flags) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - int ch, lun, nr_blks, ret = 0; - u8 *blks; - - nr_blks = geo->blks_per_lun * geo->plane_mode; - blks = kmalloc(nr_blks, GFP_KERNEL); - if (!blks) - return -ENOMEM; - - nvm_for_each_lun_ppa(geo, ppa, ch, lun) { - ret = nvm_get_bb_tbl(dev, ppa, blks); - if (ret) - pr_err("nvm: failed bb tbl for ch%u lun%u\n", - ppa.g.ch, ppa.g.blk); - - ret = nvm_factory_blks(dev, ppa, blks, nr_blks, blk_bitmap, - flags); - if (ret) - break; - } - - kfree(blks); - return ret; -} - -int nvm_dev_factory(struct nvm_dev *dev, int flags) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr *ppas; - int ppa_cnt, ret = -ENOMEM; - int max_ppas = dev->ops->max_phys_sect / geo->nr_planes; - struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; - struct sysblk_scan s; - unsigned long *blk_bitmap; - - blk_bitmap = kzalloc(factory_nblks(geo->blks_per_lun) * geo->nr_luns, - GFP_KERNEL); - if (!blk_bitmap) - return ret; - - ppas = kcalloc(max_ppas, sizeof(struct ppa_addr), GFP_KERNEL); - if (!ppas) - goto err_blks; - - /* create list of blks to be erased */ - ret = nvm_fact_select_blks(dev, blk_bitmap, flags); - if (ret) - goto err_ppas; - - /* continue to erase until list of blks until empty */ - while ((ppa_cnt = - nvm_fact_get_blks(dev, ppas, max_ppas, blk_bitmap)) > 0) - nvm_erase_ppa(dev, ppas, ppa_cnt, 0); - - /* mark host reserved blocks free */ - if (flags & NVM_FACTORY_RESET_HOST_BLKS) { - nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); - mutex_lock(&dev->mlock); - ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0); - if (!ret) - ret = nvm_sysblk_set_bb_tbl(dev, &s, NVM_BLK_T_FREE); - mutex_unlock(&dev->mlock); - } -err_ppas: - kfree(ppas); -err_blks: - kfree(blk_bitmap); - return ret; -} -EXPORT_SYMBOL(nvm_dev_factory); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 588d4a34c083..ad54ec9c986f 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -372,7 +372,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, } /* Transform physical address to target address space */ - nvmdev->mt->part_to_tgt(nvmdev, entries, cmd_nlb); + nvm_part_to_tgt(nvmdev, entries, cmd_nlb); if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) { ret = -EINTR; @@ -633,10 +633,9 @@ static ssize_t nvm_dev_attr_show(struct device *dev, return scnprintf(page, PAGE_SIZE, "%u\n", id->cap); } else if (strcmp(attr->name, "device_mode") == 0) { return scnprintf(page, PAGE_SIZE, "%u\n", id->dom); + /* kept for compatibility */ } else if (strcmp(attr->name, "media_manager") == 0) { - if (!ndev->mt) - return scnprintf(page, PAGE_SIZE, "%s\n", "none"); - return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name); + return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm"); } else if (strcmp(attr->name, "ppa_format") == 0) { return scnprintf(page, PAGE_SIZE, "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7c273bbc5351..84309fe27472 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -80,8 +80,6 @@ struct nvm_dev_ops { unsigned int max_phys_sect; }; - - #ifdef CONFIG_NVM #include @@ -272,15 +270,6 @@ enum { NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; -/* system block cpu representation */ -struct nvm_sb_info { - unsigned long seqnr; - unsigned long erase_cnt; - unsigned int version; - char mmtype[NVM_MMTYPE_LEN]; - struct ppa_addr fs_ppa; -}; - /* Device generic information */ struct nvm_geo { int nr_chnls; @@ -308,6 +297,7 @@ struct nvm_geo { int sec_per_lun; }; +/* sub-device structure */ struct nvm_tgt_dev { /* Device information */ struct nvm_geo geo; @@ -329,17 +319,10 @@ struct nvm_dev { struct list_head devices; - /* Media manager */ - struct nvmm_type *mt; - void *mp; - - /* System blocks */ - struct nvm_sb_info sb; - /* Device information */ struct nvm_geo geo; - /* lower page table */ + /* lower page table */ int lps_per_blk; int *lptbl; @@ -359,6 +342,10 @@ struct nvm_dev { struct mutex mlock; spinlock_t lock; + + /* target management */ + struct list_head area_list; + struct list_head targets; }; static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, @@ -452,11 +439,6 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) (ppa1.g.blk == ppa2.g.blk)); } -static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) -{ - return dev->lptbl[slc_pg]; -} - typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); @@ -487,49 +469,6 @@ extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *); extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); -typedef int (nvmm_register_fn)(struct nvm_dev *); -typedef void (nvmm_unregister_fn)(struct nvm_dev *); - -typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); -typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); -typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *); -typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int); -typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); -typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t); -typedef struct ppa_addr (nvmm_trans_ppa_fn)(struct nvm_tgt_dev *, - struct ppa_addr, int); -typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int); - -enum { - TRANS_TGT_TO_DEV = 0x0, - TRANS_DEV_TO_TGT = 0x1, -}; - -struct nvmm_type { - const char *name; - unsigned int version[3]; - - nvmm_register_fn *register_mgr; - nvmm_unregister_fn *unregister_mgr; - - nvmm_create_tgt_fn *create_tgt; - nvmm_remove_tgt_fn *remove_tgt; - - nvmm_submit_io_fn *submit_io; - nvmm_erase_blk_fn *erase_blk; - - nvmm_get_area_fn *get_area; - nvmm_put_area_fn *put_area; - - nvmm_trans_ppa_fn *trans_ppa; - nvmm_part_to_tgt_fn *part_to_tgt; - - struct list_head list; -}; - -extern int nvm_register_mgr(struct nvmm_type *); -extern void nvm_unregister_mgr(struct nvmm_type *); - extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); @@ -559,31 +498,9 @@ extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -/* sysblk.c */ -#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ - -/* system block on disk representation */ -struct nvm_system_block { - __be32 magic; /* magic signature */ - __be32 seqnr; /* sequence number */ - __be32 erase_cnt; /* erase count */ - __be16 version; /* version number */ - u8 mmtype[NVM_MMTYPE_LEN]; /* media manager name */ - __be64 fs_ppa; /* PPA for media manager - * superblock */ -}; - -extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); -extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); -extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); - extern int nvm_dev_factory(struct nvm_dev *, int flags); -#define nvm_for_each_lun_ppa(geo, ppa, chid, lunid) \ - for ((chid) = 0, (ppa).ppa = 0; (chid) < (geo)->nr_chnls; \ - (chid)++, (ppa).g.ch = (chid)) \ - for ((lunid) = 0; (lunid) < (geo)->luns_per_chnl; \ - (lunid)++, (ppa).g.lun = (lunid)) +extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From 10995c3dc9d7f47b92ff3e74b4bd191ddb7991ff Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 31 Jan 2017 13:17:10 +0100 Subject: lightnvm: collapse nvm_erase_ppa and nvm_erase_blk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After gennvm and core have been merged, there are no more callers to nvm_erase_ppa. Therefore collapse the device specific and target specific erase functions. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 57 ++++++++++++++++++++++-------------------------- include/linux/lightnvm.h | 1 - 2 files changed, 26 insertions(+), 32 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index e9a495650dd0..a4e2e3b01ae4 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -683,12 +683,34 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_submit_io); -int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, int flags) +int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags) { - /* Convert address space */ - nvm_map_to_dev(tgt_dev, p); + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_rq rqd; + int ret; + + if (!dev->ops->erase_block) + return 0; + + ret = nvm_map_to_dev(tgt_dev, ppas); + if (ret) + return ret; + + memset(&rqd, 0, sizeof(struct nvm_rq)); - return nvm_erase_ppa(tgt_dev->parent, p, 1, flags); + ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, 1, 1); + if (ret) + return ret; + + nvm_generic_to_addr_mode(dev, &rqd); + + rqd.flags = flags; + + ret = dev->ops->erase_block(dev, &rqd); + + nvm_free_rqd_ppalist(dev, &rqd); + + return ret; } EXPORT_SYMBOL(nvm_erase_blk); @@ -847,33 +869,6 @@ void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_free_rqd_ppalist); -int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas, - int flags) -{ - struct nvm_rq rqd; - int ret; - - if (!dev->ops->erase_block) - return 0; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1); - if (ret) - return ret; - - nvm_generic_to_addr_mode(dev, &rqd); - - rqd.flags = flags; - - ret = dev->ops->erase_block(dev, &rqd); - - nvm_free_rqd_ppalist(dev, &rqd); - - return ret; -} -EXPORT_SYMBOL(nvm_erase_ppa); - void nvm_end_io(struct nvm_rq *rqd, int error) { struct nvm_tgt_dev *tgt_dev = rqd->dev; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 84309fe27472..f2007b2c4979 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -483,7 +483,6 @@ extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int); extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From 583b7058b2e8071f59646c8fb027f6c3597417ac Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 31 Jan 2017 13:17:11 +0100 Subject: lightnvm: remove nvm_submit_ppa* functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvm_submit_ppa* functions are no longer needed after gennvm and core have been merged. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 109 ----------------------------------------------- include/linux/lightnvm.h | 4 -- 2 files changed, 113 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index a4e2e3b01ae4..ffdf048d58d1 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -883,115 +883,6 @@ void nvm_end_io(struct nvm_rq *rqd, int error) } EXPORT_SYMBOL(nvm_end_io); -static void nvm_end_io_sync(struct nvm_rq *rqd) -{ - struct completion *waiting = rqd->wait; - - rqd->wait = NULL; - - complete(waiting); -} - -static int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode, - int flags, void *buf, int len) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct bio *bio; - int ret; - unsigned long hang_check; - - bio = bio_map_kern(dev->q, buf, len, GFP_KERNEL); - if (IS_ERR_OR_NULL(bio)) - return -ENOMEM; - - nvm_generic_to_addr_mode(dev, rqd); - - rqd->dev = NULL; - rqd->opcode = opcode; - rqd->flags = flags; - rqd->bio = bio; - rqd->wait = &wait; - rqd->end_io = nvm_end_io_sync; - - ret = dev->ops->submit_io(dev, rqd); - if (ret) { - bio_put(bio); - return ret; - } - - /* Prevent hang_check timer from firing at us during very long I/O */ - hang_check = sysctl_hung_task_timeout_secs; - if (hang_check) - while (!wait_for_completion_io_timeout(&wait, - hang_check * (HZ/2))) - ; - else - wait_for_completion_io(&wait); - - return rqd->error; -} - -/** - * nvm_submit_ppa_list - submit user-defined ppa list to device. The user must - * take to free ppa list if necessary. - * @dev: device - * @ppa_list: user created ppa_list - * @nr_ppas: length of ppa_list - * @opcode: device opcode - * @flags: device flags - * @buf: data buffer - * @len: data buffer length - */ -int nvm_submit_ppa_list(struct nvm_dev *dev, struct ppa_addr *ppa_list, - int nr_ppas, int opcode, int flags, void *buf, int len) -{ - struct nvm_rq rqd; - - if (dev->ops->max_phys_sect < nr_ppas) - return -EINVAL; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - rqd.nr_ppas = nr_ppas; - if (nr_ppas > 1) - rqd.ppa_list = ppa_list; - else - rqd.ppa_addr = ppa_list[0]; - - return __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len); -} -EXPORT_SYMBOL(nvm_submit_ppa_list); - -/** - * nvm_submit_ppa - submit PPAs to device. PPAs will automatically be unfolded - * as single, dual, quad plane PPAs depending on device type. - * @dev: device - * @ppa: user created ppa_list - * @nr_ppas: length of ppa_list - * @opcode: device opcode - * @flags: device flags - * @buf: data buffer - * @len: data buffer length - */ -int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, - int opcode, int flags, void *buf, int len) -{ - struct nvm_rq rqd; - int ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas, 1); - if (ret) - return ret; - - ret = __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len); - - nvm_free_rqd_ppalist(dev, &rqd); - - return ret; -} -EXPORT_SYMBOL(nvm_submit_ppa); - /* * folds a bad block list from its plane representation to its virtual * block representation. The fold is done in place and reduced size is diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index f2007b2c4979..abb3d55c107f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -489,10 +489,6 @@ extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *, int); -extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, - void *, int); -extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int, - int, void *, int); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -- cgit v1.2.3 From 8f4fe008fb256649bd0e16c96a6eafa3bd916ac3 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 31 Jan 2017 13:17:12 +0100 Subject: lightnvm: remove nvm_get_bb_tbl and nvm_set_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the merge of gennvm and core, there is no longer a need for the device specific bad block functions. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 40 ++++------------------------------------ include/linux/lightnvm.h | 2 -- 2 files changed, 4 insertions(+), 38 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ffdf048d58d1..18d48732fe74 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -605,33 +605,6 @@ static void nvm_tgt_generic_to_addr_mode(struct nvm_tgt_dev *tgt_dev, } } -int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas, - int type) -{ - struct nvm_rq rqd; - int ret; - - if (nr_ppas > dev->ops->max_phys_sect) { - pr_err("nvm: unable to update all sysblocks atomically\n"); - return -EINVAL; - } - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1); - nvm_generic_to_addr_mode(dev, &rqd); - - ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); - nvm_free_rqd_ppalist(dev, &rqd); - if (ret) { - pr_err("nvm: sysblk failed bb mark\n"); - return -EINVAL; - } - - return 0; -} -EXPORT_SYMBOL(nvm_set_bb_tbl); - int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas, int type) { @@ -919,20 +892,15 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) } EXPORT_SYMBOL(nvm_bb_tbl_fold); -int nvm_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, u8 *blks) -{ - ppa = generic_to_dev_addr(dev, ppa); - - return dev->ops->get_bb_tbl(dev, ppa, blks); -} -EXPORT_SYMBOL(nvm_get_bb_tbl); - int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, u8 *blks) { + struct nvm_dev *dev = tgt_dev->parent; + ppa = nvm_trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV); + ppa = generic_to_dev_addr(dev, ppa); - return nvm_get_bb_tbl(tgt_dev->parent, ppa, blks); + return dev->ops->get_bb_tbl(dev, ppa, blks); } EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index abb3d55c107f..cad1e1cb0635 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -473,7 +473,6 @@ extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); -extern int nvm_set_bb_tbl(struct nvm_dev *, struct ppa_addr *, int, int); extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); @@ -490,7 +489,6 @@ extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); -extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); extern int nvm_dev_factory(struct nvm_dev *, int flags); -- cgit v1.2.3 From 61a561d8d7d1e1cac46618d01aa3298eb447eece Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 31 Jan 2017 13:17:13 +0100 Subject: lightnvm: make nvm_map_* return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only check there was done was a debugging check. Remove it and replace the return value with void to reduce error checking. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 41 +++++++++-------------------------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 18d48732fe74..a1a7a5a303bd 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -407,31 +407,17 @@ err_rmap: return -ENOMEM; } -static int nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) +static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) { struct nvm_dev_map *dev_map = tgt_dev->map; struct nvm_ch_map *ch_map = &dev_map->chnls[p->g.ch]; int lun_off = ch_map->lun_offs[p->g.lun]; - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_dev_map *dev_rmap = dev->rmap; - struct nvm_ch_map *ch_rmap; - int lun_roff; p->g.ch += ch_map->ch_off; p->g.lun += lun_off; - - ch_rmap = &dev_rmap->chnls[p->g.ch]; - lun_roff = ch_rmap->lun_offs[p->g.lun]; - - if (unlikely(ch_rmap->ch_off < 0 || lun_roff < 0)) { - pr_err("nvm: corrupted device partition table\n"); - return -EINVAL; - } - - return 0; } -static int nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) +static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) { struct nvm_dev *dev = tgt_dev->parent; struct nvm_dev_map *dev_rmap = dev->rmap; @@ -440,34 +426,27 @@ static int nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) p->g.ch -= ch_rmap->ch_off; p->g.lun -= lun_roff; - - return 0; } -static int nvm_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, +static void nvm_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, int flag) { int i; - int ret; if (rqd->nr_ppas == 1) { if (flag == TRANS_TGT_TO_DEV) - return nvm_map_to_dev(tgt_dev, &rqd->ppa_addr); + nvm_map_to_dev(tgt_dev, &rqd->ppa_addr); else - return nvm_map_to_tgt(tgt_dev, &rqd->ppa_addr); + nvm_map_to_tgt(tgt_dev, &rqd->ppa_addr); + return; } for (i = 0; i < rqd->nr_ppas; i++) { if (flag == TRANS_TGT_TO_DEV) - ret = nvm_map_to_dev(tgt_dev, &rqd->ppa_list[i]); + nvm_map_to_dev(tgt_dev, &rqd->ppa_list[i]); else - ret = nvm_map_to_tgt(tgt_dev, &rqd->ppa_list[i]); - - if (ret) - break; + nvm_map_to_tgt(tgt_dev, &rqd->ppa_list[i]); } - - return ret; } static struct ppa_addr nvm_trans_ppa(struct nvm_tgt_dev *tgt_dev, @@ -665,9 +644,7 @@ int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags) if (!dev->ops->erase_block) return 0; - ret = nvm_map_to_dev(tgt_dev, ppas); - if (ret) - return ret; + nvm_map_to_dev(tgt_dev, ppas); memset(&rqd, 0, sizeof(struct nvm_rq)); -- cgit v1.2.3 From dab8ee9e8a30620a5b5f22d6c0b3749217093803 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 31 Jan 2017 13:17:14 +0100 Subject: lightnvm: cleanup nvm transformation functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Going from target specific ppa addresses to device was accomplished by first converting target to generic ppa addresses and generic to device addresses. The conversion was either open-coded or used the built-in nvm_trans_* and nvm_map_* functions for conversion. Simplify the interface and cleanup the calls to provide clean functions that now either take a list of ppas or a nvm_rq, and is exposed through: void nvm_ppa_* - target to/from device with a list of PPAs, void nvm_rq_* - target to/from device with a nvm_rq. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 115 ++++++++++++++--------------------------------- include/linux/lightnvm.h | 14 +++--- 2 files changed, 40 insertions(+), 89 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index a1a7a5a303bd..0842c85d3b84 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -50,11 +50,6 @@ struct nvm_area { sector_t end; /* end is excluded */ }; -enum { - TRANS_TGT_TO_DEV = 0x0, - TRANS_DEV_TO_TGT = 0x1, -}; - static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) { struct nvm_target *tgt; @@ -428,38 +423,46 @@ static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) p->g.lun -= lun_roff; } -static void nvm_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - int flag) +static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr *ppa_list, int nr_ppas) { int i; - if (rqd->nr_ppas == 1) { - if (flag == TRANS_TGT_TO_DEV) - nvm_map_to_dev(tgt_dev, &rqd->ppa_addr); - else - nvm_map_to_tgt(tgt_dev, &rqd->ppa_addr); - return; + for (i = 0; i < nr_ppas; i++) { + nvm_map_to_dev(tgt_dev, &ppa_list[i]); + ppa_list[i] = generic_to_dev_addr(tgt_dev, ppa_list[i]); } +} - for (i = 0; i < rqd->nr_ppas; i++) { - if (flag == TRANS_TGT_TO_DEV) - nvm_map_to_dev(tgt_dev, &rqd->ppa_list[i]); - else - nvm_map_to_tgt(tgt_dev, &rqd->ppa_list[i]); +static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr *ppa_list, int nr_ppas) +{ + int i; + + for (i = 0; i < nr_ppas; i++) { + ppa_list[i] = dev_to_generic_addr(tgt_dev, ppa_list[i]); + nvm_map_to_tgt(tgt_dev, &ppa_list[i]); } } -static struct ppa_addr nvm_trans_ppa(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr p, int dir) +static void nvm_rq_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { - struct ppa_addr ppa = p; + if (rqd->nr_ppas == 1) { + nvm_ppa_tgt_to_dev(tgt_dev, &rqd->ppa_addr, 1); + return; + } - if (dir == TRANS_TGT_TO_DEV) - nvm_map_to_dev(tgt_dev, &ppa); - else - nvm_map_to_tgt(tgt_dev, &ppa); + nvm_ppa_tgt_to_dev(tgt_dev, rqd->ppa_list, rqd->nr_ppas); +} + +static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +{ + if (rqd->nr_ppas == 1) { + nvm_ppa_dev_to_tgt(tgt_dev, &rqd->ppa_addr, 1); + return; + } - return ppa; + nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas); } void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, @@ -564,26 +567,6 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } -static void nvm_tgt_generic_to_addr_mode(struct nvm_tgt_dev *tgt_dev, - struct nvm_rq *rqd) -{ - struct nvm_dev *dev = tgt_dev->parent; - int i; - - if (rqd->nr_ppas > 1) { - for (i = 0; i < rqd->nr_ppas; i++) { - rqd->ppa_list[i] = nvm_trans_ppa(tgt_dev, - rqd->ppa_list[i], TRANS_TGT_TO_DEV); - rqd->ppa_list[i] = generic_to_dev_addr(dev, - rqd->ppa_list[i]); - } - } else { - rqd->ppa_addr = nvm_trans_ppa(tgt_dev, rqd->ppa_addr, - TRANS_TGT_TO_DEV); - rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); - } -} - int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas, int type) { @@ -599,7 +582,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, memset(&rqd, 0, sizeof(struct nvm_rq)); nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1); - nvm_tgt_generic_to_addr_mode(tgt_dev, &rqd); + nvm_rq_tgt_to_dev(tgt_dev, &rqd); ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); nvm_free_rqd_ppalist(dev, &rqd); @@ -627,8 +610,7 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) if (!dev->ops->submit_io) return -ENODEV; - /* Convert address space */ - nvm_generic_to_addr_mode(dev, rqd); + nvm_rq_tgt_to_dev(tgt_dev, rqd); rqd->dev = tgt_dev; return dev->ops->submit_io(dev, rqd); @@ -652,7 +634,7 @@ int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags) if (ret) return ret; - nvm_generic_to_addr_mode(dev, &rqd); + nvm_rq_tgt_to_dev(tgt_dev, &rqd); rqd.flags = flags; @@ -741,34 +723,6 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) } EXPORT_SYMBOL(nvm_put_area); -void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - int i; - - if (rqd->nr_ppas > 1) { - for (i = 0; i < rqd->nr_ppas; i++) - rqd->ppa_list[i] = dev_to_generic_addr(dev, - rqd->ppa_list[i]); - } else { - rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); - } -} -EXPORT_SYMBOL(nvm_addr_to_generic_mode); - -void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - int i; - - if (rqd->nr_ppas > 1) { - for (i = 0; i < rqd->nr_ppas; i++) - rqd->ppa_list[i] = generic_to_dev_addr(dev, - rqd->ppa_list[i]); - } else { - rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); - } -} -EXPORT_SYMBOL(nvm_generic_to_addr_mode); - int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, const struct ppa_addr *ppas, int nr_ppas, int vblk) { @@ -826,7 +780,7 @@ void nvm_end_io(struct nvm_rq *rqd, int error) /* Convert address space */ if (tgt_dev) - nvm_trans_rq(tgt_dev, rqd, TRANS_DEV_TO_TGT); + nvm_rq_dev_to_tgt(tgt_dev, rqd); rqd->error = error; ins->tt->end_io(rqd); @@ -874,8 +828,7 @@ int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, { struct nvm_dev *dev = tgt_dev->parent; - ppa = nvm_trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV); - ppa = generic_to_dev_addr(dev, ppa); + nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1); return dev->ops->get_bb_tbl(dev, ppa, blks); } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index cad1e1cb0635..faa0fbfe339a 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -378,10 +378,10 @@ static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, return l; } -static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, - struct ppa_addr r) +static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr r) { - struct nvm_geo *geo = &dev->geo; + struct nvm_geo *geo = &tgt_dev->geo; struct ppa_addr l; l.ppa = ((u64)r.g.blk) << geo->ppaf.blk_offset; @@ -394,10 +394,10 @@ static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, return l; } -static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, - struct ppa_addr r) +static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr r) { - struct nvm_geo *geo = &dev->geo; + struct nvm_geo *geo = &tgt_dev->geo; struct ppa_addr l; l.ppa = 0; @@ -477,8 +477,6 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); -extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); -extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -- cgit v1.2.3 From 19bd6fe73ca812964963aa30827cff9aae64a715 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 31 Jan 2017 13:17:15 +0100 Subject: lightnvm: reduce number of nvm_id groups to one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The number of configuration groups has been limited to one in current code, even if there is support for up to four. With the introduction of the open-channel SSD 1.3 specification, only a single group is exposed onwards. Reflect this in the nvm_id structure. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 3 +- drivers/lightnvm/core.c | 12 ++---- drivers/nvme/host/lightnvm.c | 87 +++++++++++++++++++++----------------------- include/linux/lightnvm.h | 3 +- 4 files changed, 47 insertions(+), 58 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index c0e14e54909b..e666095278ab 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -460,7 +460,6 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) id->ver_id = 0x1; id->vmnt = 0; - id->cgrps = 1; id->cap = 0x2; id->dom = 0x1; @@ -479,7 +478,7 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) sector_div(size, bs); /* convert size to pages */ size >>= 8; /* concert size to pgs pr blk */ - grp = &id->groups[0]; + grp = &id->grp; grp->mtype = 0; grp->fmtype = 0; grp->num_ch = 1; diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 0842c85d3b84..80cd7677762d 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -884,7 +884,7 @@ static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) static int nvm_core_init(struct nvm_dev *dev) { struct nvm_id *id = &dev->identity; - struct nvm_id_group *grp = &id->groups[0]; + struct nvm_id_group *grp = &id->grp; struct nvm_geo *geo = &dev->geo; int ret; @@ -988,20 +988,14 @@ static int nvm_init(struct nvm_dev *dev) goto err; } - pr_debug("nvm: ver:%x nvm_vendor:%x groups:%u\n", - dev->identity.ver_id, dev->identity.vmnt, - dev->identity.cgrps); + pr_debug("nvm: ver:%x nvm_vendor:%x\n", + dev->identity.ver_id, dev->identity.vmnt); if (dev->identity.ver_id != 1) { pr_err("nvm: device not supported by kernel."); goto err; } - if (dev->identity.cgrps != 1) { - pr_err("nvm: only one group configuration supported."); - goto err; - } - ret = nvm_core_init(dev); if (ret) { pr_err("nvm: could not initialize core structures.\n"); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ad54ec9c986f..733992a25d6a 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -248,50 +248,48 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) { struct nvme_nvm_id_group *src; struct nvm_id_group *dst; - int i, end; - - end = min_t(u32, 4, nvm_id->cgrps); - - for (i = 0; i < end; i++) { - src = &nvme_nvm_id->groups[i]; - dst = &nvm_id->groups[i]; - - dst->mtype = src->mtype; - dst->fmtype = src->fmtype; - dst->num_ch = src->num_ch; - dst->num_lun = src->num_lun; - dst->num_pln = src->num_pln; - - dst->num_pg = le16_to_cpu(src->num_pg); - dst->num_blk = le16_to_cpu(src->num_blk); - dst->fpg_sz = le16_to_cpu(src->fpg_sz); - dst->csecs = le16_to_cpu(src->csecs); - dst->sos = le16_to_cpu(src->sos); - - dst->trdt = le32_to_cpu(src->trdt); - dst->trdm = le32_to_cpu(src->trdm); - dst->tprt = le32_to_cpu(src->tprt); - dst->tprm = le32_to_cpu(src->tprm); - dst->tbet = le32_to_cpu(src->tbet); - dst->tbem = le32_to_cpu(src->tbem); - dst->mpos = le32_to_cpu(src->mpos); - dst->mccap = le32_to_cpu(src->mccap); - - dst->cpar = le16_to_cpu(src->cpar); - - if (dst->fmtype == NVM_ID_FMTYPE_MLC) { - memcpy(dst->lptbl.id, src->lptbl.id, 8); - dst->lptbl.mlc.num_pairs = - le16_to_cpu(src->lptbl.mlc.num_pairs); - - if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { - pr_err("nvm: number of MLC pairs not supported\n"); - return -EINVAL; - } - - memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, - dst->lptbl.mlc.num_pairs); + + if (nvme_nvm_id->cgrps != 1) + return -EINVAL; + + src = &nvme_nvm_id->groups[0]; + dst = &nvm_id->grp; + + dst->mtype = src->mtype; + dst->fmtype = src->fmtype; + dst->num_ch = src->num_ch; + dst->num_lun = src->num_lun; + dst->num_pln = src->num_pln; + + dst->num_pg = le16_to_cpu(src->num_pg); + dst->num_blk = le16_to_cpu(src->num_blk); + dst->fpg_sz = le16_to_cpu(src->fpg_sz); + dst->csecs = le16_to_cpu(src->csecs); + dst->sos = le16_to_cpu(src->sos); + + dst->trdt = le32_to_cpu(src->trdt); + dst->trdm = le32_to_cpu(src->trdm); + dst->tprt = le32_to_cpu(src->tprt); + dst->tprm = le32_to_cpu(src->tprm); + dst->tbet = le32_to_cpu(src->tbet); + dst->tbem = le32_to_cpu(src->tbem); + dst->mpos = le32_to_cpu(src->mpos); + dst->mccap = le32_to_cpu(src->mccap); + + dst->cpar = le16_to_cpu(src->cpar); + + if (dst->fmtype == NVM_ID_FMTYPE_MLC) { + memcpy(dst->lptbl.id, src->lptbl.id, 8); + dst->lptbl.mlc.num_pairs = + le16_to_cpu(src->lptbl.mlc.num_pairs); + + if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { + pr_err("nvm: number of MLC pairs not supported\n"); + return -EINVAL; } + + memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, + dst->lptbl.mlc.num_pairs); } return 0; @@ -321,7 +319,6 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) nvm_id->ver_id = nvme_nvm_id->ver_id; nvm_id->vmnt = nvme_nvm_id->vmnt; - nvm_id->cgrps = nvme_nvm_id->cgrps; nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap); nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom); memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf, @@ -622,7 +619,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev, return 0; id = &ndev->identity; - grp = &id->groups[0]; + grp = &id->grp; attr = &dattr->attr; if (strcmp(attr->name, "version") == 0) { diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index faa0fbfe339a..ce0b2dac84ac 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -200,11 +200,10 @@ struct nvm_addr_format { struct nvm_id { u8 ver_id; u8 vmnt; - u8 cgrps; u32 cap; u32 dom; struct nvm_addr_format ppaf; - struct nvm_id_group groups[4]; + struct nvm_id_group grp; } __packed; struct nvm_target { -- cgit v1.2.3 From 84d4add793c65b5bda802dcefcf0d7ab1a8e22ed Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 31 Jan 2017 13:17:16 +0100 Subject: lightnvm: add ioctls for vector I/Os MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable user-space to issue vector I/O commands through ioctls. To issue a vector I/O, the ppa list with addresses is also required and must be mapped for the controller to access. For each ioctl, the result and status bits are returned as well, such that user-space can retrieve the open-channel SSD completion bits. The implementation covers the traditional use-cases of bad block management, and vectored read/write/erase. Signed-off-by: Matias Bjørling Metadata implementation, test, and fixes. Signed-off-by: Simon A.F. Lund Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 + drivers/nvme/host/lightnvm.c | 220 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 6 ++ include/uapi/linux/lightnvm.h | 50 ++++++++++ 4 files changed, 280 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2fc86dc7a8df..037ee999e759 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -784,6 +784,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return nvme_sg_io(ns, (void __user *)arg); #endif default: +#ifdef CONFIG_NVM + if (ns->ndev) + return nvme_nvm_ioctl(ns, cmd, arg); +#endif return -ENOTTY; } } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 733992a25d6a..3b6cd9bdba7e 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include enum nvme_nvm_admin_opcode { nvme_nvm_admin_identity = 0xe2, @@ -583,6 +585,224 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; +static void nvme_nvm_end_user_vio(struct request *rq, int error) +{ + struct completion *waiting = rq->end_io_data; + + complete(waiting); +} + +static int nvme_nvm_submit_user_cmd(struct request_queue *q, + struct nvme_ns *ns, + struct nvme_nvm_command *vcmd, + void __user *ubuf, unsigned int bufflen, + void __user *meta_buf, unsigned int meta_len, + void __user *ppa_buf, unsigned int ppa_len, + u32 *result, u64 *status, unsigned int timeout) +{ + bool write = nvme_is_write((struct nvme_command *)vcmd); + struct nvm_dev *dev = ns->ndev; + struct gendisk *disk = ns->disk; + struct request *rq; + struct bio *bio = NULL; + __le64 *ppa_list = NULL; + dma_addr_t ppa_dma; + __le64 *metadata = NULL; + dma_addr_t metadata_dma; + DECLARE_COMPLETION_ONSTACK(wait); + int ret; + + rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0, + NVME_QID_ANY); + if (IS_ERR(rq)) { + ret = -ENOMEM; + goto err_cmd; + } + + rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; + + rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; + rq->end_io_data = &wait; + + if (ppa_buf && ppa_len) { + ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); + if (!ppa_list) { + ret = -ENOMEM; + goto err_rq; + } + if (copy_from_user(ppa_list, (void __user *)ppa_buf, + sizeof(u64) * (ppa_len + 1))) { + ret = -EFAULT; + goto err_ppa; + } + vcmd->ph_rw.spba = cpu_to_le64(ppa_dma); + } else { + vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf); + } + + if (ubuf && bufflen) { + ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL); + if (ret) + goto err_ppa; + bio = rq->bio; + + if (meta_buf && meta_len) { + metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, + &metadata_dma); + if (!metadata) { + ret = -ENOMEM; + goto err_map; + } + + if (write) { + if (copy_from_user(metadata, + (void __user *)meta_buf, + meta_len)) { + ret = -EFAULT; + goto err_meta; + } + } + vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); + } + + if (!disk) + goto submit; + + bio->bi_bdev = bdget_disk(disk, 0); + if (!bio->bi_bdev) { + ret = -ENODEV; + goto err_meta; + } + } + +submit: + blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_user_vio); + + wait_for_completion_io(&wait); + + ret = nvme_error_status(rq->errors); + if (result) + *result = rq->errors & 0x7ff; + if (status) + *status = le64_to_cpu(nvme_req(rq)->result.u64); + + if (metadata && !ret && !write) { + if (copy_to_user(meta_buf, (void *)metadata, meta_len)) + ret = -EFAULT; + } +err_meta: + if (meta_buf && meta_len) + dma_pool_free(dev->dma_pool, metadata, metadata_dma); +err_map: + if (bio) { + if (disk && bio->bi_bdev) + bdput(bio->bi_bdev); + blk_rq_unmap_user(bio); + } +err_ppa: + if (ppa_buf && ppa_len) + dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); +err_rq: + blk_mq_free_request(rq); +err_cmd: + return ret; +} + +static int nvme_nvm_submit_vio(struct nvme_ns *ns, + struct nvm_user_vio __user *uvio) +{ + struct nvm_user_vio vio; + struct nvme_nvm_command c; + unsigned int length; + int ret; + + if (copy_from_user(&vio, uvio, sizeof(vio))) + return -EFAULT; + if (vio.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.ph_rw.opcode = vio.opcode; + c.ph_rw.nsid = cpu_to_le32(ns->ns_id); + c.ph_rw.control = cpu_to_le16(vio.control); + c.ph_rw.length = cpu_to_le16(vio.nppas); + + length = (vio.nppas + 1) << ns->lba_shift; + + ret = nvme_nvm_submit_user_cmd(ns->queue, ns, &c, + (void __user *)(uintptr_t)vio.addr, length, + (void __user *)(uintptr_t)vio.metadata, + vio.metadata_len, + (void __user *)(uintptr_t)vio.ppa_list, vio.nppas, + &vio.result, &vio.status, 0); + + if (ret && copy_to_user(uvio, &vio, sizeof(vio))) + return -EFAULT; + + return ret; +} + +static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, + struct nvm_passthru_vio __user *uvcmd) +{ + struct nvm_passthru_vio vcmd; + struct nvme_nvm_command c; + struct request_queue *q; + unsigned int timeout = 0; + int ret; + + if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd))) + return -EFAULT; + if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN))) + return -EACCES; + if (vcmd.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = vcmd.opcode; + c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); + /* cdw11-12 */ + c.ph_rw.length = cpu_to_le16(vcmd.nppas); + c.ph_rw.control = cpu_to_le32(vcmd.control); + c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15); + + if (vcmd.timeout_ms) + timeout = msecs_to_jiffies(vcmd.timeout_ms); + + q = admin ? ns->ctrl->admin_q : ns->queue; + + ret = nvme_nvm_submit_user_cmd(q, ns, + (struct nvme_nvm_command *)&c, + (void __user *)(uintptr_t)vcmd.addr, vcmd.data_len, + (void __user *)(uintptr_t)vcmd.metadata, + vcmd.metadata_len, + (void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas, + &vcmd.result, &vcmd.status, timeout); + + if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd))) + return -EFAULT; + + return ret; +} + +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case NVME_NVM_IOCTL_ADMIN_VIO: + return nvme_nvm_user_vcmd(ns, 1, (void __user *)arg); + case NVME_NVM_IOCTL_IO_VIO: + return nvme_nvm_user_vcmd(ns, 0, (void __user *)arg); + case NVME_NVM_IOCTL_SUBMIT_VIO: + return nvme_nvm_submit_vio(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { struct request_queue *q = ns->queue; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6377e14586dc..330713c4abdb 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -326,6 +326,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); int nvme_nvm_register_sysfs(struct nvme_ns *ns); void nvme_nvm_unregister_sysfs(struct nvme_ns *ns); +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); #else static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) @@ -343,6 +344,11 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i { return 0; } +static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} #endif /* CONFIG_NVM */ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 774a43128a7a..fd19f36b3129 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -122,6 +122,44 @@ struct nvm_ioctl_dev_factory { __u32 flags; }; +struct nvm_user_vio { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nppas; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 ppa_list; + __u32 metadata_len; + __u32 data_len; + __u64 status; + __u32 result; + __u32 rsvd3[3]; +}; + +struct nvm_passthru_vio { + __u8 opcode; + __u8 flags; + __u8 rsvd[2]; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u64 ppa_list; + __u16 nppas; + __u16 control; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u64 status; + __u32 result; + __u32 timeout_ms; +}; + /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ enum { /* top level cmds */ @@ -137,6 +175,11 @@ enum { /* Factory reset device */ NVM_DEV_FACTORY_CMD, + + /* Vector user I/O */ + NVM_DEV_VIO_ADMIN_CMD = 0x41, + NVM_DEV_VIO_CMD = 0x42, + NVM_DEV_VIO_USER_CMD = 0x43, }; #define NVM_IOCTL 'L' /* 0x4c */ @@ -154,6 +197,13 @@ enum { #define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \ struct nvm_ioctl_dev_factory) +#define NVME_NVM_IOCTL_IO_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_USER_CMD, \ + struct nvm_passthru_vio) +#define NVME_NVM_IOCTL_ADMIN_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_ADMIN_CMD,\ + struct nvm_passthru_vio) +#define NVME_NVM_IOCTL_SUBMIT_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_CMD,\ + struct nvm_user_vio) + #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCHLEVEL 0 -- cgit v1.2.3 From 06894efea706b3cd4ce31e341ec51b4c62c34a86 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 31 Jan 2017 13:17:17 +0100 Subject: lightnvm: use end_io callback instead of instance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the lightnvm core had the "gennvm" layer between the device and the target, there was a need for the core to be able to figure out which target it should send an end_io callback to. Leading to a "double" end_io, first for the media manager instance, and then for the target instance. Now that core and gennvm is merged, there is no longer a need for this, and a single end_io callback will do. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 3 ++- drivers/lightnvm/core.c | 7 +++---- drivers/lightnvm/rrpc.c | 7 +++---- drivers/lightnvm/rrpc.h | 3 --- drivers/nvme/host/lightnvm.c | 3 ++- include/linux/lightnvm.h | 10 +++------- 6 files changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index e666095278ab..a67b7ea1e3bf 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -420,7 +420,8 @@ static void null_lnvm_end_io(struct request *rq, int error) { struct nvm_rq *rqd = rq->end_io_data; - nvm_end_io(rqd, error); + rqd->error = error; + nvm_end_io(rqd); blk_put_request(rq); } diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 80cd7677762d..4f4db991c4a6 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -773,17 +773,16 @@ void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_free_rqd_ppalist); -void nvm_end_io(struct nvm_rq *rqd, int error) +void nvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_dev *tgt_dev = rqd->dev; - struct nvm_tgt_instance *ins = rqd->ins; /* Convert address space */ if (tgt_dev) nvm_rq_dev_to_tgt(tgt_dev, rqd); - rqd->error = error; - ins->tt->end_io(rqd); + if (rqd->end_io) + rqd->end_io(rqd); } EXPORT_SYMBOL(nvm_end_io); diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 9fb7de395915..e00b1d7b976f 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -779,7 +779,7 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd, static void rrpc_end_io(struct nvm_rq *rqd) { - struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance); + struct rrpc *rrpc = rqd->private; struct nvm_tgt_dev *dev = rrpc->dev; struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd); uint8_t npages = rqd->nr_ppas; @@ -972,8 +972,9 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, bio_get(bio); rqd->bio = bio; - rqd->ins = &rrpc->instance; + rqd->private = rrpc; rqd->nr_ppas = nr_pages; + rqd->end_io = rrpc_end_io; rrq->flags = flags; err = nvm_submit_io(dev, rqd); @@ -1532,7 +1533,6 @@ static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk) if (!rrpc) return ERR_PTR(-ENOMEM); - rrpc->instance.tt = &tt_rrpc; rrpc->dev = dev; rrpc->disk = tdisk; @@ -1611,7 +1611,6 @@ static struct nvm_tgt_type tt_rrpc = { .make_rq = rrpc_make_rq, .capacity = rrpc_capacity, - .end_io = rrpc_end_io, .init = rrpc_init, .exit = rrpc_exit, diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index 94e4d73116b2..fdb6ff902903 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -102,9 +102,6 @@ struct rrpc_lun { }; struct rrpc { - /* instance must be kept in top to resolve rrpc in unprep */ - struct nvm_tgt_instance instance; - struct nvm_tgt_dev *dev; struct gendisk *disk; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 3b6cd9bdba7e..21cac8523bd8 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -484,7 +484,8 @@ static void nvme_nvm_end_io(struct request *rq, int error) struct nvm_rq *rqd = rq->end_io_data; rqd->ppa_status = nvme_req(rq)->result.u64; - nvm_end_io(rqd, error); + rqd->error = error; + nvm_end_io(rqd); kfree(nvme_req(rq)->cmd); blk_mq_free_request(rq); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ce0b2dac84ac..17cd454f0d87 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -213,10 +213,6 @@ struct nvm_target { struct gendisk *disk; }; -struct nvm_tgt_instance { - struct nvm_tgt_type *tt; -}; - #define ADDR_EMPTY (~0ULL) #define NVM_VERSION_MAJOR 1 @@ -227,7 +223,6 @@ struct nvm_rq; typedef void (nvm_end_io_fn)(struct nvm_rq *); struct nvm_rq { - struct nvm_tgt_instance *ins; struct nvm_tgt_dev *dev; struct bio *bio; @@ -251,6 +246,8 @@ struct nvm_rq { u64 ppa_status; /* ppa media status */ int error; + + void *private; }; static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu) @@ -450,7 +447,6 @@ struct nvm_tgt_type { /* target entry points */ nvm_tgt_make_rq_fn *make_rq; nvm_tgt_capacity_fn *capacity; - nvm_end_io_fn *end_io; /* module-specific init/teardown */ nvm_tgt_init_fn *init; @@ -484,7 +480,7 @@ extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); -extern void nvm_end_io(struct nvm_rq *, int); +extern void nvm_end_io(struct nvm_rq *); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -- cgit v1.2.3 From 38ea2f7656f815e7330868cbec7bada0fd7933a8 Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 31 Jan 2017 13:17:18 +0100 Subject: lightnvm: Add CRC read error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let the host differentiate between a read error and a CRC check error on the device side. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 17cd454f0d87..bc282d26017a 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -107,6 +107,7 @@ enum { NVM_RSP_ERR_FAILWRITE = 0x40ff, NVM_RSP_ERR_EMPTYPAGE = 0x42ff, NVM_RSP_ERR_FAILECC = 0x4281, + NVM_RSP_ERR_FAILCRC = 0x4004, NVM_RSP_WARN_HIGHECC = 0x4700, /* Device opcodes */ -- cgit v1.2.3 From deccf5a52ea59843f5575cb49fe532c7cb8801e4 Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 31 Jan 2017 13:17:19 +0100 Subject: lightnvm: free properly on target creation error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a memory leak when target creation fails. More specifically, free the entire device structure given to the target (tgt_dev). Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 4f4db991c4a6..b2cd3d6f2a31 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -303,7 +303,7 @@ err_init: err_queue: blk_cleanup_queue(tqueue); err_dev: - kfree(tgt_dev); + nvm_remove_tgt_dev(tgt_dev); err_t: kfree(t); err_reserve: -- cgit v1.2.3 From 9a69b0ed6257ae5e71c99bf21ce53f98c558476a Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 31 Jan 2017 13:17:20 +0100 Subject: lightnvm: allow targets to use sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to register through the sysfs interface, a driver needs to know its kobject. On a disk structure, this happens when the partition information is added (device_add_disk), which for lightnvm takes place after the target has been initialized. This means that on target initialization, the kboject has not been created yet. This patch adds a target function to let targets initialize their own kboject as a child of the disk kobject. Signed-off-by: Javier González Added exit typedef and passed gendisk instead of void pointer for exit. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 9 +++++++++ include/linux/lightnvm.h | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index b2cd3d6f2a31..9bfe0352d093 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -289,6 +289,9 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) set_capacity(tdisk, tt->capacity(targetdata)); add_disk(tdisk); + if (tt->sysfs_init && tt->sysfs_init(tdisk)) + goto err_sysfs; + t->type = tt; t->disk = tdisk; t->dev = tgt_dev; @@ -298,6 +301,9 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) mutex_unlock(&dev->mlock); return 0; +err_sysfs: + if (tt->exit) + tt->exit(targetdata); err_init: put_disk(tdisk); err_queue: @@ -320,6 +326,9 @@ static void __nvm_remove_target(struct nvm_target *t) del_gendisk(tdisk); blk_cleanup_queue(q); + if (tt->sysfs_exit) + tt->sysfs_exit(tdisk); + if (tt->exit) tt->exit(tdisk->private_data); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index bc282d26017a..ca45e4a088a9 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -440,6 +440,8 @@ typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); typedef void (nvm_tgt_exit_fn)(void *); +typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); +typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); struct nvm_tgt_type { const char *name; @@ -453,6 +455,10 @@ struct nvm_tgt_type { nvm_tgt_init_fn *init; nvm_tgt_exit_fn *exit; + /* sysfs */ + nvm_tgt_sysfs_init_fn *sysfs_init; + nvm_tgt_sysfs_exit_fn *sysfs_exit; + /* For internal use */ struct list_head list; }; -- cgit v1.2.3 From 7bf7d778620d83f14fcd92d0938fb97c7d78bf19 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 24 Jan 2017 18:07:00 -0500 Subject: nvme/pci: Don't mark IOD as aborted if abort wasn't sent This patch sets the aborted flag only if an abort was sent, reducing excessive kernel message spamming for completed IO that wasn't actually aborted. Reported-by: Jens Axboe Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e1b4e603b1cf..06875bc1ba80 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -898,12 +898,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_HANDLED; } - iod->aborted = 1; - if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } + iod->aborted = 1; memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; -- cgit v1.2.3 From 12d70958a2e8d587acaa51dafd5d6620e00b7543 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 31 Jan 2017 23:32:50 -0700 Subject: blk-mq: don't fail allocating driver tag for stopped hw queue We rely on blk_mq_get_driver_tag() not failing if 'wait' is true, but it currently fails in that case if the queue happens to be stopped at the time of the call. We don't need to check for stopped here, it's just assigning the tag. If the queue is stopped, we'll handle it when attempting to run the queue. This fixes a stall/crash on flush intensive workloads, where we proceed to process a flush that doesn't have a valid tag assigned. Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 60dac10228fe..489076e7ae15 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -856,9 +856,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, }; - if (blk_mq_hctx_stopped(data.hctx)) - return false; - if (rq->tag != -1) { done: if (hctx) -- cgit v1.2.3 From 5fad1b64aed8bd63ca7da2ba92107ba9ecd9a2c8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 1 Feb 2017 08:20:08 -0800 Subject: block: Update comments that refer to __bio_map_user() and bio_map_user() Since __bio_map_user() and bio_map_user() have been removed, update the comments that still refer to these functions. Signed-off-by: Bart Van Assche References: commit ddad8dd0a162 ("block: use blk_rq_map_user_iov to implement blk_rq_map_user") Cc: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/bio.c b/block/bio.c index 2b375020fc49..d3c26d1cb1da 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1403,7 +1403,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, bio_set_flag(bio, BIO_USER_MAPPED); /* - * subtle -- if __bio_map_user() ended up bouncing a bio, + * subtle -- if bio_map_user_iov() ended up bouncing a bio, * it would normally disappear when its bi_end_io is run. * however, we need it for the unmap, so grab an extra * reference to it @@ -1445,8 +1445,8 @@ static void __bio_unmap_user(struct bio *bio) * bio_unmap_user - unmap a bio * @bio: the bio being unmapped * - * Unmap a bio previously mapped by bio_map_user(). Must be called with - * a process context. + * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from + * process context. * * bio_unmap_user() may sleep. */ -- cgit v1.2.3 From bbfc3c5d6c7882dc65c1230e781644e35c29839f Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Tue, 31 Jan 2017 22:36:50 -0800 Subject: block: queue lock must be acquired when iterating over rls blk_set_queue_dying() does not acquire queue lock before it calls blk_queue_for_each_rl(). This allows a racing blkg_destroy() to remove blkg->q_node from the linked list and have blk_queue_for_each_rl() loop infitely over the removed blkg->q_node list node. Signed-off-by: Tahsin Erdogan Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 02833ce5664e..b2df55a65250 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -527,12 +527,14 @@ void blk_set_queue_dying(struct request_queue *q) else { struct request_list *rl; + spin_lock_irq(q->queue_lock); blk_queue_for_each_rl(rl, q) { if (rl->rq_pool) { wake_up(&rl->wait[BLK_RW_SYNC]); wake_up(&rl->wait[BLK_RW_ASYNC]); } } + spin_unlock_irq(q->queue_lock); } } EXPORT_SYMBOL_GPL(blk_set_queue_dying); -- cgit v1.2.3 From 19641f2d7674fbf2891e9579f61c1b23821086e8 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 3 Feb 2017 12:50:30 -0700 Subject: Include: Uapi: Add user ABI for Sed/Opal Signed-off-by: Scott Bauer Signed-off-by: Rafael Antognolli Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 118 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 include/uapi/linux/sed-opal.h diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h new file mode 100644 index 000000000000..31799526082a --- /dev/null +++ b/include/uapi/linux/sed-opal.h @@ -0,0 +1,118 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli + * Scott Bauer + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _UAPI_SED_OPAL_H +#define _UAPI_SED_OPAL_H + +#include + +#define OPAL_KEY_MAX 256 +#define OPAL_MAX_LRS 9 + +enum opal_mbr { + OPAL_MBR_ENABLE = 0x0, + OPAL_MBR_DISABLE = 0x01, +}; + +enum opal_user { + OPAL_ADMIN1 = 0x0, + OPAL_USER1 = 0x01, + OPAL_USER2 = 0x02, + OPAL_USER3 = 0x03, + OPAL_USER4 = 0x04, + OPAL_USER5 = 0x05, + OPAL_USER6 = 0x06, + OPAL_USER7 = 0x07, + OPAL_USER8 = 0x08, + OPAL_USER9 = 0x09, +}; + +enum opal_lock_state { + OPAL_RO = 0x01, /* 0001 */ + OPAL_RW = 0x02, /* 0010 */ + OPAL_LK = 0x04, /* 0100 */ +}; + +struct opal_key { + uint8_t lr; + uint8_t key_len; + char key[OPAL_KEY_MAX]; +}; + +struct opal_lr_act { + int sum; + uint8_t num_lrs; + uint8_t lr[OPAL_MAX_LRS]; + struct opal_key key; +}; + +struct opal_session_info { + int sum; + enum opal_user who; + struct opal_key opal_key; + uint8_t __align[2]; +}; + +struct opal_user_lr_setup { + size_t range_start; + size_t range_length; + int RLE; /* Read Lock enabled */ + int WLE; /* Write Lock Enabled */ + struct opal_session_info session; + uint8_t __align[4]; +}; + +struct opal_lock_unlock { + enum opal_lock_state l_state; + struct opal_session_info session; +}; + +struct opal_new_pw { + struct opal_session_info session; + + /* When we're not operating in sum, and we first set + * passwords we need to set them via ADMIN authority. + * After passwords are changed, we can set them via, + * User authorities. + * Because of this restriction we need to know about + * Two different users. One in 'session' which we will use + * to start the session and new_userr_pw as the user we're + * chaning the pw for. + */ + struct opal_session_info new_user_pw; +}; + +struct opal_mbr_data { + u8 enable_disable; + struct opal_key key; + uint8_t __align[5]; +}; + +#define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) +#define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) +#define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) +#define IOC_OPAL_ACTIVATE_LSP _IOW('p', 223, struct opal_key) +#define IOC_OPAL_SET_PW _IOW('p', 224, struct opal_new_pw) +#define IOC_OPAL_ACTIVATE_USR _IOW('p', 225, struct opal_session_info) +#define IOC_OPAL_REVERT_TPR _IOW('p', 226, struct opal_key) +#define IOC_OPAL_LR_SETUP _IOW('p', 227, struct opal_user_lr_setup) +#define IOC_OPAL_ADD_USR_TO_LR _IOW('p', 228, struct opal_lock_unlock) +#define IOC_OPAL_ENABLE_DISABLE_MBR _IOW('p', 229, struct opal_mbr_data) +#define IOC_OPAL_ERASE_LR _IOW('p', 230, struct opal_session_info) +#define IOC_OPAL_SECURE_ERASE_LR _IOW('p', 231, struct opal_session_info) + +#endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 455a7b238cd6bc68c4a550cbbd37c1e22b64f71c Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 3 Feb 2017 12:50:31 -0700 Subject: block: Add Sed-opal library This patch implements the necessary logic to bring an Opal enabled drive out of a factory-enabled into a working Opal state. This patch set also enables logic to save a password to be replayed during a resume from suspend. Signed-off-by: Scott Bauer Signed-off-by: Rafael Antognolli Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- MAINTAINERS | 11 + block/Kconfig | 7 + block/Makefile | 1 + block/opal_proto.h | 429 ++++++++ block/sed-opal.c | 2448 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 178 ++++ 6 files changed, 3074 insertions(+) create mode 100644 block/opal_proto.h create mode 100644 block/sed-opal.c create mode 100644 include/linux/sed-opal.h diff --git a/MAINTAINERS b/MAINTAINERS index f481713fe44c..f96181c927be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11075,6 +11075,17 @@ L: linux-mmc@vger.kernel.org S: Maintained F: drivers/mmc/host/sdhci-spear.c +SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER +M: Scott Bauer +M: Jonathan Derrick +M: Rafael Antognolli +L: linux-nvme@lists.infradead.org +S: Supported +F: block/sed* +F: block/opal_proto.h +F: include/linux/sed* +F: include/uapi/linux/sed* + SECURITY SUBSYSTEM M: James Morris M: "Serge E. Hallyn" diff --git a/block/Kconfig b/block/Kconfig index 7f659b23850c..1aef809affae 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -159,6 +159,13 @@ config BLK_DEBUG_FS Unless you are building a kernel for a tiny system, you should say Y here. +config BLK_SED_OPAL + bool "Logic for interfacing with Opal enabled SEDs" + ---help--- + Builds Logic for interfacing with Opal enabled controllers. + Enabling this option enables users to setup/unlock/lock + Locking ranges for SED devices using the Opal protocol. + menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 317165f8708c..6ba1b1bc9529 100644 --- a/block/Makefile +++ b/block/Makefile @@ -27,3 +27,4 @@ obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o +obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o diff --git a/block/opal_proto.h b/block/opal_proto.h new file mode 100644 index 000000000000..af9abc56c157 --- /dev/null +++ b/block/opal_proto.h @@ -0,0 +1,429 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli + * Scott Bauer + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include + +#ifndef _OPAL_PROTO_H +#define _OPAL_PROTO_H + +#define DTAERROR_NO_METHOD_STATUS 0x89 +#define GENERIC_HOST_SESSION_NUM 0x41 + +#define TPER_SYNC_SUPPORTED 0x01 + +#define TINY_ATOM_DATA_MASK 0x3F +#define TINY_ATOM_SIGNED 0x40 + +#define SHORT_ATOM_ID 0x80 +#define SHORT_ATOM_BYTESTRING 0x20 +#define SHORT_ATOM_SIGNED 0x10 +#define SHORT_ATOM_LEN_MASK 0xF + +#define MEDIUM_ATOM_ID 0xC0 +#define MEDIUM_ATOM_BYTESTRING 0x10 +#define MEDIUM_ATOM_SIGNED 0x8 +#define MEDIUM_ATOM_LEN_MASK 0x7 + +#define LONG_ATOM_ID 0xe0 +#define LONG_ATOM_BYTESTRING 0x2 +#define LONG_ATOM_SIGNED 0x1 + +/* Derived from TCG Core spec 2.01 Section: + * 3.2.2.1 + * Data Type + */ +#define TINY_ATOM_BYTE 0x7F +#define SHORT_ATOM_BYTE 0xBF +#define MEDIUM_ATOM_BYTE 0xDF +#define LONG_ATOM_BYTE 0xE3 + +#define OPAL_INVAL_PARAM 12 +#define OPAL_MANUFACTURED_INACTIVE 0x08 +#define OPAL_DISCOVERY_COMID 0x0001 + +#define LOCKING_RANGE_NON_GLOBAL 0x03 +/* + * User IDs used in the TCG storage SSCs + * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 6.3 Assigned UIDs + */ +#define OPAL_UID_LENGTH 8 +#define OPAL_METHOD_LENGTH 8 +#define OPAL_MSID_KEYLEN 15 +#define OPAL_UID_LENGTH_HALF 4 + +/* Enum to index OPALUID array */ +enum opal_uid { + /* users */ + OPAL_SMUID_UID, + OPAL_THISSP_UID, + OPAL_ADMINSP_UID, + OPAL_LOCKINGSP_UID, + OPAL_ENTERPRISE_LOCKINGSP_UID, + OPAL_ANYBODY_UID, + OPAL_SID_UID, + OPAL_ADMIN1_UID, + OPAL_USER1_UID, + OPAL_USER2_UID, + OPAL_PSID_UID, + OPAL_ENTERPRISE_BANDMASTER0_UID, + OPAL_ENTERPRISE_ERASEMASTER_UID, + /* tables */ + OPAL_LOCKINGRANGE_GLOBAL, + OPAL_LOCKINGRANGE_ACE_RDLOCKED, + OPAL_LOCKINGRANGE_ACE_WRLOCKED, + OPAL_MBRCONTROL, + OPAL_MBR, + OPAL_AUTHORITY_TABLE, + OPAL_C_PIN_TABLE, + OPAL_LOCKING_INFO_TABLE, + OPAL_ENTERPRISE_LOCKING_INFO_TABLE, + /* C_PIN_TABLE object ID's */ + OPAL_C_PIN_MSID, + OPAL_C_PIN_SID, + OPAL_C_PIN_ADMIN1, + /* half UID's (only first 4 bytes used) */ + OPAL_HALF_UID_AUTHORITY_OBJ_REF, + OPAL_HALF_UID_BOOLEAN_ACE, + /* omitted optional parameter */ + OPAL_UID_HEXFF, +}; + +#define OPAL_METHOD_LENGTH 8 + +/* Enum for indexing the OPALMETHOD array */ +enum opal_method { + OPAL_PROPERTIES, + OPAL_STARTSESSION, + OPAL_REVERT, + OPAL_ACTIVATE, + OPAL_EGET, + OPAL_ESET, + OPAL_NEXT, + OPAL_EAUTHENTICATE, + OPAL_GETACL, + OPAL_GENKEY, + OPAL_REVERTSP, + OPAL_GET, + OPAL_SET, + OPAL_AUTHENTICATE, + OPAL_RANDOM, + OPAL_ERASE, +}; + +enum opal_token { + /* Boolean */ + OPAL_TRUE = 0x01, + OPAL_FALSE = 0x00, + OPAL_BOOLEAN_EXPR = 0x03, + /* cellblocks */ + OPAL_TABLE = 0x00, + OPAL_STARTROW = 0x01, + OPAL_ENDROW = 0x02, + OPAL_STARTCOLUMN = 0x03, + OPAL_ENDCOLUMN = 0x04, + OPAL_VALUES = 0x01, + /* authority table */ + OPAL_PIN = 0x03, + /* locking tokens */ + OPAL_RANGESTART = 0x03, + OPAL_RANGELENGTH = 0x04, + OPAL_READLOCKENABLED = 0x05, + OPAL_WRITELOCKENABLED = 0x06, + OPAL_READLOCKED = 0x07, + OPAL_WRITELOCKED = 0x08, + OPAL_ACTIVEKEY = 0x0A, + /* locking info table */ + OPAL_MAXRANGES = 0x04, + /* mbr control */ + OPAL_MBRENABLE = 0x01, + OPAL_MBRDONE = 0x02, + /* properties */ + OPAL_HOSTPROPERTIES = 0x00, + /* atoms */ + OPAL_STARTLIST = 0xf0, + OPAL_ENDLIST = 0xf1, + OPAL_STARTNAME = 0xf2, + OPAL_ENDNAME = 0xf3, + OPAL_CALL = 0xf8, + OPAL_ENDOFDATA = 0xf9, + OPAL_ENDOFSESSION = 0xfa, + OPAL_STARTTRANSACTON = 0xfb, + OPAL_ENDTRANSACTON = 0xfC, + OPAL_EMPTYATOM = 0xff, + OPAL_WHERE = 0x00, +}; + +/* Locking state for a locking range */ +enum opal_lockingstate { + OPAL_LOCKING_READWRITE = 0x01, + OPAL_LOCKING_READONLY = 0x02, + OPAL_LOCKING_LOCKED = 0x03, +}; + +/* Packets derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Secion: 3.2.3 ComPackets, Packets & Subpackets + */ + +/* Comm Packet (header) for transmissions. */ +struct opal_compacket { + __be32 reserved0; + u8 extendedComID[4]; + __be32 outstandingData; + __be32 minTransfer; + __be32 length; +}; + +/* Packet structure. */ +struct opal_packet { + __be32 tsn; + __be32 hsn; + __be32 seq_number; + __be16 reserved0; + __be16 ack_type; + __be32 acknowledgment; + __be32 length; +}; + +/* Data sub packet header */ +struct opal_data_subpacket { + u8 reserved0[6]; + __be16 kind; + __be32 length; +}; + +/* header of a response */ +struct opal_header { + struct opal_compacket cp; + struct opal_packet pkt; + struct opal_data_subpacket subpkt; +}; + +#define FC_TPER 0x0001 +#define FC_LOCKING 0x0002 +#define FC_GEOMETRY 0x0003 +#define FC_ENTERPRISE 0x0100 +#define FC_DATASTORE 0x0202 +#define FC_SINGLEUSER 0x0201 +#define FC_OPALV100 0x0200 +#define FC_OPALV200 0x0203 + +/* + * The Discovery 0 Header. As defined in + * Opal SSC Documentation + * Section: 3.3.5 Capability Discovery + */ +struct d0_header { + __be32 length; /* the length of the header 48 in 2.00.100 */ + __be32 revision; /**< revision of the header 1 in 2.00.100 */ + __be32 reserved01; + __be32 reserved02; + /* + * the remainder of the structure is vendor specific and will not be + * addressed now + */ + u8 ignored[32]; +}; + +/* + * TPer Feature Descriptor. Contains flags indicating support for the + * TPer features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x001 in 2.00.100 + */ +struct d0_tper_features { + /* + * supported_features bits: + * bit 7: reserved + * bit 6: com ID management + * bit 5: reserved + * bit 4: streaming support + * bit 3: buffer management + * bit 2: ACK/NACK + * bit 1: async + * bit 0: sync + */ + u8 supported_features; + /* + * bytes 5 through 15 are reserved, but we represent the first 3 as + * u8 to keep the other two 32bits integers aligned. + */ + u8 reserved01[3]; + __be32 reserved02; + __be32 reserved03; +}; + +/* + * Locking Feature Descriptor. Contains flags indicating support for the + * locking features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x0002 in 2.00.100 + */ +struct d0_locking_features { + /* + * supported_features bits: + * bits 6-7: reserved + * bit 5: MBR done + * bit 4: MBR enabled + * bit 3: media encryption + * bit 2: locked + * bit 1: locking enabled + * bit 0: locking supported + */ + u8 supported_features; + /* + * bytes 5 through 15 are reserved, but we represent the first 3 as + * u8 to keep the other two 32bits integers aligned. + */ + u8 reserved01[3]; + __be32 reserved02; + __be32 reserved03; +}; + +/* + * Geometry Feature Descriptor. Contains flags indicating support for the + * geometry features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x0003 in 2.00.100 + */ +struct d0_geometry_features { + /* + * skip 32 bits from header, needed to align the struct to 64 bits. + */ + u8 header[4]; + /* + * reserved01: + * bits 1-6: reserved + * bit 0: align + */ + u8 reserved01; + u8 reserved02[7]; + __be32 logical_block_size; + __be64 alignment_granularity; + __be64 lowest_aligned_lba; +}; + +/* + * Enterprise SSC Feature + * + * code == 0x0100 + */ +struct d0_enterprise_ssc { + __be16 baseComID; + __be16 numComIDs; + /* range_crossing: + * bits 1-6: reserved + * bit 0: range crossing + */ + u8 range_crossing; + u8 reserved01; + __be16 reserved02; + __be32 reserved03; + __be32 reserved04; +}; + +/* + * Opal V1 feature + * + * code == 0x0200 + */ +struct d0_opal_v100 { + __be16 baseComID; + __be16 numComIDs; +}; + +/* + * Single User Mode feature + * + * code == 0x0201 + */ +struct d0_single_user_mode { + __be32 num_locking_objects; + /* reserved01: + * bit 0: any + * bit 1: all + * bit 2: policy + * bits 3-7: reserved + */ + u8 reserved01; + u8 reserved02; + __be16 reserved03; + __be32 reserved04; +}; + +/* + * Additonal Datastores feature + * + * code == 0x0202 + */ +struct d0_datastore_table { + __be16 reserved01; + __be16 max_tables; + __be32 max_size_tables; + __be32 table_size_alignment; +}; + +/* + * OPAL 2.0 feature + * + * code == 0x0203 + */ +struct d0_opal_v200 { + __be16 baseComID; + __be16 numComIDs; + /* range_crossing: + * bits 1-6: reserved + * bit 0: range crossing + */ + u8 range_crossing; + /* num_locking_admin_auth: + * not aligned to 16 bits, so use two u8. + * stored in big endian: + * 0: MSB + * 1: LSB + */ + u8 num_locking_admin_auth[2]; + /* num_locking_user_auth: + * not aligned to 16 bits, so use two u8. + * stored in big endian: + * 0: MSB + * 1: LSB + */ + u8 num_locking_user_auth[2]; + u8 initialPIN; + u8 revertedPIN; + u8 reserved01; + __be32 reserved02; +}; + +/* Union of features used to parse the discovery 0 response */ +struct d0_features { + __be16 code; + /* + * r_version bits: + * bits 4-7: version + * bits 0-3: reserved + */ + u8 r_version; + u8 length; + u8 features[]; +}; + +#endif /* _OPAL_PROTO_H */ diff --git a/block/sed-opal.c b/block/sed-opal.c new file mode 100644 index 000000000000..bf1406e5159b --- /dev/null +++ b/block/sed-opal.c @@ -0,0 +1,2448 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Scott Bauer + * Rafael Antognolli + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":OPAL: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opal_proto.h" + +static const u8 opaluid[][OPAL_UID_LENGTH] = { + /* users */ + [OPAL_SMUID_UID] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff }, + [OPAL_THISSP_UID] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_ADMINSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 }, + [OPAL_ENTERPRISE_LOCKINGSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x01, 0x00, 0x01 }, + [OPAL_ANYBODY_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_SID_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 }, + [OPAL_ADMIN1_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 }, + [OPAL_USER1_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 }, + [OPAL_USER2_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 }, + [OPAL_PSID_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 }, + [OPAL_ENTERPRISE_BANDMASTER0_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x80, 0x01 }, + [OPAL_ENTERPRISE_ERASEMASTER_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 }, + + /* tables */ + + [OPAL_LOCKINGRANGE_GLOBAL] = + { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_WRLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 }, + [OPAL_MBRCONTROL] = + { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_MBR] = + { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_AUTHORITY_TABLE] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00}, + [OPAL_C_PIN_TABLE] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00}, + [OPAL_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, + + /* C_PIN_TABLE object ID's */ + + [OPAL_C_PIN_MSID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, + [OPAL_C_PIN_SID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01}, + [OPAL_C_PIN_ADMIN1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01}, + + /* half UID's (only first 4 bytes used) */ + + [OPAL_HALF_UID_AUTHORITY_OBJ_REF] = + { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff }, + [OPAL_HALF_UID_BOOLEAN_ACE] = + { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff }, + + /* special value for omitted optional parameter */ + [OPAL_UID_HEXFF] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, +}; + +/* + * TCG Storage SSC Methods. + * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 6.3 Assigned UIDs + */ +static const u8 opalmethod[][OPAL_UID_LENGTH] = { + [OPAL_PROPERTIES] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 }, + [OPAL_STARTSESSION] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 }, + [OPAL_REVERT] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 }, + [OPAL_ACTIVATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 }, + [OPAL_EGET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06 }, + [OPAL_ESET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07 }, + [OPAL_NEXT] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 }, + [OPAL_EAUTHENTICATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0c }, + [OPAL_GETACL] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d }, + [OPAL_GENKEY] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 }, + [OPAL_REVERTSP] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 }, + [OPAL_GET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 }, + [OPAL_SET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 }, + [OPAL_AUTHENTICATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c }, + [OPAL_RANDOM] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, + [OPAL_ERASE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, +}; + +typedef int (cont_fn)(struct opal_dev *dev); + +static int end_opal_session_error(struct opal_dev *dev); + +struct opal_suspend_data { + struct opal_lock_unlock unlk; + u8 lr; + struct list_head node; +}; + +/* + * Derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 5.1.5 Method Status Codes + */ +static const char * const opal_errors[] = { + "Success", + "Not Authorized", + "Unknown Error", + "SP Busy", + "SP Failed", + "SP Disabled", + "SP Frozen", + "No Sessions Available", + "Uniqueness Conflict", + "Insufficient Space", + "Insufficient Rows", + "Invalid Function", + "Invalid Parameter", + "Invalid Reference", + "Unknown Error", + "TPER Malfunction", + "Transaction Failure", + "Response Overflow", + "Authority Locked Out", +}; + +static const char *opal_error_to_human(int error) +{ + if (error == 0x3f) + return "Failed"; + + if (error >= ARRAY_SIZE(opal_errors) || error < 0) + return "Unknown Error"; + + return opal_errors[error]; +} + +static void print_buffer(const u8 *ptr, u32 length) +{ +#ifdef DEBUG + print_hex_dump_bytes("OPAL: ", DUMP_PREFIX_OFFSET, ptr, length); + pr_debug("\n"); +#endif +} + +static bool check_tper(const void *data) +{ + const struct d0_tper_features *tper = data; + u8 flags = tper->supported_features; + + if (!(flags & TPER_SYNC_SUPPORTED)) { + pr_err("TPer sync not supported. flags = %d\n", + tper->supported_features); + return false; + } + + return true; +} + +static bool check_sum(const void *data) +{ + const struct d0_single_user_mode *sum = data; + u32 nlo = be32_to_cpu(sum->num_locking_objects); + + if (nlo == 0) { + pr_err("Need at least one locking object.\n"); + return false; + } + + pr_debug("Number of locking objects: %d\n", nlo); + + return true; +} + +static u16 get_comid_v100(const void *data) +{ + const struct d0_opal_v100 *v100 = data; + + return be16_to_cpu(v100->baseComID); +} + +static u16 get_comid_v200(const void *data) +{ + const struct d0_opal_v200 *v200 = data; + + return be16_to_cpu(v200->baseComID); +} + +static int opal_send_cmd(struct opal_dev *dev) +{ + return dev->send_recv(dev, dev->comid, TCG_SECP_01, + dev->cmd, IO_BUFFER_LENGTH, + true); +} + +static int opal_recv_cmd(struct opal_dev *dev) +{ + return dev->send_recv(dev, dev->comid, TCG_SECP_01, + dev->resp, IO_BUFFER_LENGTH, + false); +} + +static int opal_recv_check(struct opal_dev *dev) +{ + size_t buflen = IO_BUFFER_LENGTH; + void *buffer = dev->resp; + struct opal_header *hdr = buffer; + int ret; + + do { + pr_debug("Sent OPAL command: outstanding=%d, minTransfer=%d\n", + hdr->cp.outstandingData, + hdr->cp.minTransfer); + + if (hdr->cp.outstandingData == 0 || + hdr->cp.minTransfer != 0) + return 0; + + memset(buffer, 0, buflen); + ret = opal_recv_cmd(dev); + } while (!ret); + + return ret; +} + +static int opal_send_recv(struct opal_dev *dev, cont_fn *cont) +{ + int ret; + + ret = opal_send_cmd(dev); + if (ret) + return ret; + ret = opal_recv_cmd(dev); + if (ret) + return ret; + ret = opal_recv_check(dev); + if (ret) + return ret; + return cont(dev); +} + +static void check_geometry(struct opal_dev *dev, const void *data) +{ + const struct d0_geometry_features *geo = data; + + dev->align = geo->alignment_granularity; + dev->lowest_lba = geo->lowest_aligned_lba; +} + +static int next(struct opal_dev *dev) +{ + opal_step func; + int error = 0; + + do { + func = dev->funcs[dev->state]; + if (!func) + break; + + error = func(dev); + if (error) { + pr_err("Error on step function: %d with error %d: %s\n", + dev->state, error, + opal_error_to_human(error)); + + /* For each OPAL command we do a discovery0 then we + * start some sort of session. + * If we haven't passed state 1 then there was an error + * on discovery0 or during the attempt to start a + * session. Therefore we shouldn't attempt to terminate + * a session, as one has not yet been created. + */ + if (dev->state > 1) + return end_opal_session_error(dev); + } + dev->state++; + } while (!error); + + return error; +} + +static int opal_discovery0_end(struct opal_dev *dev) +{ + bool found_com_id = false, supported = true, single_user = false; + const struct d0_header *hdr = (struct d0_header *)dev->resp; + const u8 *epos = dev->resp, *cpos = dev->resp; + u16 comid = 0; + + print_buffer(dev->resp, be32_to_cpu(hdr->length)); + + epos += be32_to_cpu(hdr->length); /* end of buffer */ + cpos += sizeof(*hdr); /* current position on buffer */ + + while (cpos < epos && supported) { + const struct d0_features *body = + (const struct d0_features *)cpos; + + switch (be16_to_cpu(body->code)) { + case FC_TPER: + supported = check_tper(body->features); + break; + case FC_SINGLEUSER: + single_user = check_sum(body->features); + break; + case FC_GEOMETRY: + check_geometry(dev, body); + break; + case FC_LOCKING: + case FC_ENTERPRISE: + case FC_DATASTORE: + /* some ignored properties */ + pr_debug("Found OPAL feature description: %d\n", + be16_to_cpu(body->code)); + break; + case FC_OPALV100: + comid = get_comid_v100(body->features); + found_com_id = true; + break; + case FC_OPALV200: + comid = get_comid_v200(body->features); + found_com_id = true; + break; + case 0xbfff ... 0xffff: + /* vendor specific, just ignore */ + break; + default: + pr_debug("OPAL Unknown feature: %d\n", + be16_to_cpu(body->code)); + + } + cpos += body->length + 4; + } + + if (!supported) { + pr_err("This device is not Opal enabled. Not Supported!\n"); + return -EOPNOTSUPP; + } + + if (!single_user) + pr_warn("Device doesn't support single user mode\n"); + + + if (!found_com_id) { + pr_warn("Could not find OPAL comid for device. Returning early\n"); + return -EOPNOTSUPP;; + } + + dev->comid = comid; + + return 0; +} + +static int opal_discovery0(struct opal_dev *dev) +{ + int ret; + + memset(dev->resp, 0, IO_BUFFER_LENGTH); + dev->comid = OPAL_DISCOVERY_COMID; + ret = opal_recv_cmd(dev); + if (ret) + return ret; + return opal_discovery0_end(dev); +} + +static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok) +{ + if (*err) + return; + if (cmd->pos >= IO_BUFFER_LENGTH - 1) { + pr_err("Error adding u8: end of buffer.\n"); + *err = -ERANGE; + return; + } + cmd->cmd[cmd->pos++] = tok; +} + +static void add_short_atom_header(struct opal_dev *cmd, bool bytestring, + bool has_sign, int len) +{ + u8 atom; + int err = 0; + + atom = SHORT_ATOM_ID; + atom |= bytestring ? SHORT_ATOM_BYTESTRING : 0; + atom |= has_sign ? SHORT_ATOM_SIGNED : 0; + atom |= len & SHORT_ATOM_LEN_MASK; + + add_token_u8(&err, cmd, atom); +} + +static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring, + bool has_sign, int len) +{ + u8 header0; + + header0 = MEDIUM_ATOM_ID; + header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0; + header0 |= has_sign ? MEDIUM_ATOM_SIGNED : 0; + header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK; + cmd->cmd[cmd->pos++] = header0; + cmd->cmd[cmd->pos++] = len; +} + +static void add_token_u64(int *err, struct opal_dev *cmd, u64 number) +{ + + size_t len; + int msb; + u8 n; + + if (!(number & ~TINY_ATOM_DATA_MASK)) { + add_token_u8(err, cmd, number); + return; + } + + msb = fls(number); + len = DIV_ROUND_UP(msb, 4); + + if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) { + pr_err("Error adding u64: end of buffer.\n"); + *err = -ERANGE; + return; + } + add_short_atom_header(cmd, false, false, len); + while (len--) { + n = number >> (len * 8); + add_token_u8(err, cmd, n); + } +} + +static void add_token_bytestring(int *err, struct opal_dev *cmd, + const u8 *bytestring, size_t len) +{ + size_t header_len = 1; + bool is_short_atom = true; + + if (*err) + return; + + if (len & ~SHORT_ATOM_LEN_MASK) { + header_len = 2; + is_short_atom = false; + } + + if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) { + pr_err("Error adding bytestring: end of buffer.\n"); + *err = -ERANGE; + return; + } + + if (is_short_atom) + add_short_atom_header(cmd, true, false, len); + else + add_medium_atom_header(cmd, true, false, len); + + memcpy(&cmd->cmd[cmd->pos], bytestring, len); + cmd->pos += len; + +} + +static int build_locking_range(u8 *buffer, size_t length, u8 lr) +{ + if (length > OPAL_UID_LENGTH) { + pr_err("Can't build locking range. Length OOB\n"); + return -ERANGE; + } + + memcpy(buffer, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH); + + if (lr == 0) + return 0; + buffer[5] = LOCKING_RANGE_NON_GLOBAL; + buffer[7] = lr; + + return 0; +} + +static int build_locking_user(u8 *buffer, size_t length, u8 lr) +{ + if (length > OPAL_UID_LENGTH) { + pr_err("Can't build locking range user, Length OOB\n"); + return -ERANGE; + } + + memcpy(buffer, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + + buffer[7] = lr + 1; + + return 0; +} + +static void set_comid(struct opal_dev *cmd, u16 comid) +{ + struct opal_header *hdr = (struct opal_header *)cmd->cmd; + + hdr->cp.extendedComID[0] = comid >> 8; + hdr->cp.extendedComID[1] = comid; + hdr->cp.extendedComID[2] = 0; + hdr->cp.extendedComID[3] = 0; +} + +static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) +{ + struct opal_header *hdr; + int err = 0; + + add_token_u8(&err, cmd, OPAL_ENDOFDATA); + add_token_u8(&err, cmd, OPAL_STARTLIST); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, OPAL_ENDLIST); + + if (err) { + pr_err("Error finalizing command.\n"); + return -EFAULT; + } + + hdr = (struct opal_header *) cmd->cmd; + + hdr->pkt.tsn = cpu_to_be32(tsn); + hdr->pkt.hsn = cpu_to_be32(hsn); + + hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr)); + while (cmd->pos % 4) { + if (cmd->pos >= IO_BUFFER_LENGTH) { + pr_err("Error: Buffer overrun\n"); + return -ERANGE; + } + cmd->cmd[cmd->pos++] = 0; + } + hdr->pkt.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp) - + sizeof(hdr->pkt)); + hdr->cp.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp)); + + return 0; +} + +static enum opal_response_token token_type(const struct parsed_resp *resp, + int n) +{ + const struct opal_resp_tok *tok; + + if (n >= resp->num) { + pr_err("Token number doesn't exist: %d, resp: %d\n", + n, resp->num); + return OPAL_DTA_TOKENID_INVALID; + } + + tok = &resp->toks[n]; + if (tok->len == 0) { + pr_err("Token length must be non-zero\n"); + return OPAL_DTA_TOKENID_INVALID; + } + + return tok->type; +} + +/* + * This function returns 0 in case of invalid token. One should call + * token_type() first to find out if the token is valid or not. + */ +static enum opal_token response_get_token(const struct parsed_resp *resp, + int n) +{ + const struct opal_resp_tok *tok; + + if (n >= resp->num) { + pr_err("Token number doesn't exist: %d, resp: %d\n", + n, resp->num); + return 0; + } + + tok = &resp->toks[n]; + if (tok->len == 0) { + pr_err("Token length must be non-zero\n"); + return 0; + } + + return tok->pos[0]; +} + +static size_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = 1; + tok->width = OPAL_WIDTH_TINY; + + if (pos[0] & TINY_ATOM_SIGNED) { + tok->type = OPAL_DTA_TOKENID_SINT; + } else { + tok->type = OPAL_DTA_TOKENID_UINT; + tok->stored.u = pos[0] & 0x3f; + } + + return tok->len; +} + +static size_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; + tok->width = OPAL_WIDTH_SHORT; + + if (pos[0] & SHORT_ATOM_BYTESTRING) { + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SHORT_ATOM_SIGNED) { + tok->type = OPAL_DTA_TOKENID_SINT; + } else { + u64 u_integer = 0; + int i, b = 0; + + tok->type = OPAL_DTA_TOKENID_UINT; + if (tok->len > 9) { + pr_warn("uint64 with more than 8 bytes\n"); + return -EINVAL; + } + for (i = tok->len - 1; i > 0; i--) { + u_integer |= ((u64)pos[i] << (8 * b)); + b++; + } + tok->stored.u = u_integer; + } + + return tok->len; +} + +static size_t response_parse_medium(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; + tok->width = OPAL_WIDTH_MEDIUM; + + if (pos[0] & MEDIUM_ATOM_BYTESTRING) + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + else if (pos[0] & MEDIUM_ATOM_SIGNED) + tok->type = OPAL_DTA_TOKENID_SINT; + else + tok->type = OPAL_DTA_TOKENID_UINT; + + return tok->len; +} + +static size_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; + tok->width = OPAL_WIDTH_LONG; + + if (pos[0] & LONG_ATOM_BYTESTRING) + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + else if (pos[0] & LONG_ATOM_SIGNED) + tok->type = OPAL_DTA_TOKENID_SINT; + else + tok->type = OPAL_DTA_TOKENID_UINT; + + return tok->len; +} + +static size_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = 1; + tok->type = OPAL_DTA_TOKENID_TOKEN; + tok->width = OPAL_WIDTH_TOKEN; + + return tok->len; +} + +static int response_parse(const u8 *buf, size_t length, + struct parsed_resp *resp) +{ + const struct opal_header *hdr; + struct opal_resp_tok *iter; + int num_entries = 0; + int total; + size_t token_length; + const u8 *pos; + + if (!buf) + return -EFAULT; + + if (!resp) + return -EFAULT; + + hdr = (struct opal_header *)buf; + pos = buf; + pos += sizeof(*hdr); + + pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", + be32_to_cpu(hdr->cp.length), + be32_to_cpu(hdr->pkt.length), + be32_to_cpu(hdr->subpkt.length)); + + if (hdr->cp.length == 0 || hdr->pkt.length == 0 || + hdr->subpkt.length == 0) { + pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", + be32_to_cpu(hdr->cp.length), + be32_to_cpu(hdr->pkt.length), + be32_to_cpu(hdr->subpkt.length)); + print_buffer(pos, sizeof(*hdr)); + return -EINVAL; + } + + if (pos > buf + length) + return -EFAULT; + + iter = resp->toks; + total = be32_to_cpu(hdr->subpkt.length); + print_buffer(pos, total); + while (total > 0) { + if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ + token_length = response_parse_tiny(iter, pos); + else if (pos[0] <= SHORT_ATOM_BYTE) /* short atom */ + token_length = response_parse_short(iter, pos); + else if (pos[0] <= MEDIUM_ATOM_BYTE) /* medium atom */ + token_length = response_parse_medium(iter, pos); + else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */ + token_length = response_parse_long(iter, pos); + else /* TOKEN */ + token_length = response_parse_token(iter, pos); + + if (token_length == -EINVAL) + return -EINVAL; + + pos += token_length; + total -= token_length; + iter++; + num_entries++; + } + + if (num_entries == 0) { + pr_err("Couldn't parse response.\n"); + return -EINVAL; + } + resp->num = num_entries; + + return 0; +} + +static size_t response_get_string(const struct parsed_resp *resp, int n, + const char **store) +{ + *store = NULL; + if (!resp) { + pr_err("Response is NULL\n"); + return 0; + } + + if (n > resp->num) { + pr_err("Response has %d tokens. Can't access %d\n", + resp->num, n); + return 0; + } + + if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) { + pr_err("Token is not a byte string!\n"); + return 0; + } + + *store = resp->toks[n].pos + 1; + return resp->toks[n].len - 1; +} + +static u64 response_get_u64(const struct parsed_resp *resp, int n) +{ + if (!resp) { + pr_err("Response is NULL\n"); + return 0; + } + + if (n > resp->num) { + pr_err("Response has %d tokens. Can't access %d\n", + resp->num, n); + return 0; + } + + if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) { + pr_err("Token is not unsigned it: %d\n", + resp->toks[n].type); + return 0; + } + + if (!(resp->toks[n].width == OPAL_WIDTH_TINY || + resp->toks[n].width == OPAL_WIDTH_SHORT)) { + pr_err("Atom is not short or tiny: %d\n", + resp->toks[n].width); + return 0; + } + + return resp->toks[n].stored.u; +} + +static u8 response_status(const struct parsed_resp *resp) +{ + if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && + response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + return 0; + } + + if (resp->num < 5) + return DTAERROR_NO_METHOD_STATUS; + + if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || + token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || + response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || + response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + return DTAERROR_NO_METHOD_STATUS; + + return response_get_u64(resp, resp->num - 4); +} + +/* Parses and checks for errors */ +static int parse_and_check_status(struct opal_dev *dev) +{ + int error; + + print_buffer(dev->cmd, dev->pos); + + error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed); + if (error) { + pr_err("Couldn't parse response.\n"); + return error; + } + + return response_status(&dev->parsed); +} + +static void clear_opal_cmd(struct opal_dev *dev) +{ + dev->pos = sizeof(struct opal_header); + memset(dev->cmd, 0, IO_BUFFER_LENGTH); +} + +static int start_opal_session_cont(struct opal_dev *dev) +{ + u32 hsn, tsn; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + + hsn = response_get_u64(&dev->parsed, 4); + tsn = response_get_u64(&dev->parsed, 5); + + if (hsn == 0 && tsn == 0) { + pr_err("Couldn't authenticate session\n"); + return -EPERM; + } + + dev->hsn = hsn; + dev->tsn = tsn; + return 0; +} + +static void add_suspend_info(struct opal_dev *dev, + struct opal_suspend_data *sus) +{ + struct opal_suspend_data *iter; + + list_for_each_entry(iter, &dev->unlk_lst, node) { + if (iter->lr == sus->lr) { + list_del(&iter->node); + kfree(iter); + break; + } + } + list_add_tail(&sus->node, &dev->unlk_lst); +} + +static int end_session_cont(struct opal_dev *dev) +{ + dev->hsn = 0; + dev->tsn = 0; + return parse_and_check_status(dev); +} + +static int finalize_and_send(struct opal_dev *dev, cont_fn cont) +{ + int ret; + + ret = cmd_finalize(dev, dev->hsn, dev->tsn); + if (ret) { + pr_err("Error finalizing command buffer: %d\n", ret); + return ret; + } + + print_buffer(dev->cmd, dev->pos); + + return opal_send_recv(dev, cont); +} + +static int gen_key(struct opal_dev *dev) +{ + const u8 *method; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len)); + method = opalmethod[OPAL_GENKEY]; + kfree(dev->prev_data); + dev->prev_data = NULL; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GENKEY], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building gen key command\n"); + return err; + + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int get_active_key_cont(struct opal_dev *dev) +{ + const char *activekey; + size_t keylen; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + keylen = response_get_string(&dev->parsed, 4, &activekey); + if (!activekey) { + pr_err("%s: Couldn't extract the Activekey from the response\n", + __func__); + return OPAL_INVAL_PARAM; + } + dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL); + + if (!dev->prev_data) + return -ENOMEM; + + dev->prev_d_len = keylen; + + return 0; +} + +static int get_active_key(struct opal_dev *dev) +{ + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + u8 *lr; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + lr = dev->func_data[dev->state]; + + err = build_locking_range(uid, sizeof(uid), *lr); + if (err) + return err; + + err = 0; + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* startCloumn */ + add_token_u8(&err, dev, 10); /* ActiveKey */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* endColumn */ + add_token_u8(&err, dev, 10); /* ActiveKey */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + if (err) { + pr_err("Error building get active key command\n"); + return err; + } + + return finalize_and_send(dev, get_active_key_cont); +} + +static int generic_lr_enable_disable(struct opal_dev *dev, + u8 *uid, bool rle, bool wle, + bool rl, bool wl) +{ + int err = 0; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /* ReadLockEnabled */ + add_token_u8(&err, dev, rle); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 6); /* WriteLockEnabled */ + add_token_u8(&err, dev, wle); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKED); + add_token_u8(&err, dev, rl); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKED); + add_token_u8(&err, dev, wl); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + return err; +} + +static inline int enable_global_lr(struct opal_dev *dev, u8 *uid, + struct opal_user_lr_setup *setup) +{ + int err; + + err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, + 0, 0); + if (err) + pr_err("Failed to create enable global lr command\n"); + return err; +} + +static int setup_locking_range(struct opal_dev *dev) +{ + u8 uid[OPAL_UID_LENGTH]; + struct opal_user_lr_setup *setup; + u8 lr; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + setup = dev->func_data[dev->state]; + lr = setup->session.opal_key.lr; + err = build_locking_range(uid, sizeof(uid), lr); + if (err) + return err; + + if (lr == 0) + err = enable_global_lr(dev, uid, setup); + else { + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Ranges Start */ + add_token_u64(&err, dev, setup->range_start); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* Ranges length */ + add_token_u64(&err, dev, setup->range_length); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /*ReadLockEnabled */ + add_token_u64(&err, dev, !!setup->RLE); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 6); /*WriteLockEnabled*/ + add_token_u64(&err, dev, !!setup->WLE); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + } + if (err) { + pr_err("Error building Setup Locking range command.\n"); + return err; + + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int start_generic_opal_session(struct opal_dev *dev, + enum opal_uid auth, + enum opal_uid sp_type, + const char *key, + u8 key_len) +{ + u32 hsn; + int err = 0; + + if (key == NULL && auth != OPAL_ANYBODY_UID) { + pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \ + "Challenge, and not as the Anybody UID\n", __func__); + return OPAL_INVAL_PARAM; + } + + clear_opal_cmd(dev); + + set_comid(dev, dev->comid); + hsn = GENERIC_HOST_SESSION_NUM; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u64(&err, dev, hsn); + add_token_bytestring(&err, dev, opaluid[sp_type], OPAL_UID_LENGTH); + add_token_u8(&err, dev, 1); + + switch (auth) { + case OPAL_ANYBODY_UID: + add_token_u8(&err, dev, OPAL_ENDLIST); + break; + case OPAL_ADMIN1_UID: + case OPAL_SID_UID: + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 0); /* HostChallenge */ + add_token_bytestring(&err, dev, key, key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* HostSignAuth */ + add_token_bytestring(&err, dev, opaluid[auth], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + break; + default: + pr_err("Cannot start Admin SP session with auth %d\n", auth); + return OPAL_INVAL_PARAM; + } + + if (err) { + pr_err("Error building start adminsp session command.\n"); + return err; + } + + return finalize_and_send(dev, start_opal_session_cont); +} + +static int start_anybodyASP_opal_session(struct opal_dev *dev) +{ + return start_generic_opal_session(dev, OPAL_ANYBODY_UID, + OPAL_ADMINSP_UID, NULL, 0); +} + +static int start_SIDASP_opal_session(struct opal_dev *dev) +{ + int ret; + const u8 *key = dev->prev_data; + struct opal_key *okey; + + if (!key) { + okey = dev->func_data[dev->state]; + ret = start_generic_opal_session(dev, OPAL_SID_UID, + OPAL_ADMINSP_UID, + okey->key, + okey->key_len); + } else { + ret = start_generic_opal_session(dev, OPAL_SID_UID, + OPAL_ADMINSP_UID, + key, dev->prev_d_len); + kfree(key); + dev->prev_data = NULL; + } + return ret; +} + +static inline int start_admin1LSP_opal_session(struct opal_dev *dev) +{ + struct opal_key *key = dev->func_data[dev->state]; + + return start_generic_opal_session(dev, OPAL_ADMIN1_UID, + OPAL_LOCKINGSP_UID, + key->key, key->key_len); +} + +static int start_auth_opal_session(struct opal_dev *dev) +{ + u8 lk_ul_user[OPAL_UID_LENGTH]; + int err = 0; + + struct opal_session_info *session = dev->func_data[dev->state]; + size_t keylen = session->opal_key.key_len; + u8 *key = session->opal_key.key; + u32 hsn = GENERIC_HOST_SESSION_NUM; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + if (session->sum) { + err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), + session->opal_key.lr); + if (err) + return err; + + } else if (session->who != OPAL_ADMIN1 && !session->sum) { + err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), + session->who - 1); + if (err) + return err; + } else + memcpy(lk_ul_user, opaluid[OPAL_ADMIN1_UID], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u64(&err, dev, hsn); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 0); + add_token_bytestring(&err, dev, key, keylen); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); + add_token_bytestring(&err, dev, lk_ul_user, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building STARTSESSION command.\n"); + return err; + } + + return finalize_and_send(dev, start_opal_session_cont); +} + +static int revert_tper(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_ADMINSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_REVERT], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + if (err) { + pr_err("Error building REVERT TPER command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int internal_activate_user(struct opal_dev *dev) +{ + struct opal_session_info *session = dev->func_data[dev->state]; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + memcpy(uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + uid[7] = session->who; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /* Enabled */ + add_token_u8(&err, dev, OPAL_TRUE); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Activate UserN command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int erase_locking_range(struct opal_dev *dev) +{ + struct opal_session_info *session; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + session = dev->func_data[dev->state]; + + if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0) + return -ERANGE; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_ERASE], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Erase Locking Range Command.\n"); + return err; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_mbr_done(struct opal_dev *dev) +{ + u8 mbr_done_tf = *(u8 *)dev->func_data[dev->state]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 2); /* Done */ + add_token_u8(&err, dev, mbr_done_tf); /* Done T or F */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building set MBR Done command\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_mbr_enable_disable(struct opal_dev *dev) +{ + u8 mbr_en_dis = *(u8 *)dev->func_data[dev->state]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, mbr_en_dis); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building set MBR done command\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, + struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, cpin_uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* PIN */ + add_token_bytestring(&err, dev, key, key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + return err; +} + +static int set_new_pw(struct opal_dev *dev) +{ + u8 cpin_uid[OPAL_UID_LENGTH]; + struct opal_session_info *usr = dev->func_data[dev->state]; + + + memcpy(cpin_uid, opaluid[OPAL_C_PIN_ADMIN1], OPAL_UID_LENGTH); + + if (usr->who != OPAL_ADMIN1) { + cpin_uid[5] = 0x03; + if (usr->sum) + cpin_uid[7] = usr->opal_key.lr + 1; + else + cpin_uid[7] = usr->who; + } + + if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len, + cpin_uid, dev)) { + pr_err("Error building set password command.\n"); + return -ERANGE; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_sid_cpin_pin(struct opal_dev *dev) +{ + u8 cpin_uid[OPAL_UID_LENGTH]; + struct opal_key *key = dev->func_data[dev->state]; + + memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH); + + if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) { + pr_err("Error building Set SID cpin\n"); + return -ERANGE; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int add_user_to_lr(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u8 user_uid[OPAL_UID_LENGTH]; + struct opal_lock_unlock *lkul; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + lkul = dev->func_data[dev->state]; + + memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED], + OPAL_UID_LENGTH); + + if (lkul->l_state == OPAL_RW) + memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED], + OPAL_UID_LENGTH); + + lr_buffer[7] = lkul->session.opal_key.lr; + + memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + + user_uid[7] = lkul->session.who; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); + + add_token_u8(&err, dev, OPAL_STARTLIST); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], + OPAL_UID_LENGTH/2); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building add user to locking range command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int lock_unlock_locking_range(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + const u8 *method; + struct opal_lock_unlock *lkul; + u8 read_locked = 1, write_locked = 1; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + method = opalmethod[OPAL_SET]; + lkul = dev->func_data[dev->state]; + if (build_locking_range(lr_buffer, sizeof(lr_buffer), + lkul->session.opal_key.lr) < 0) + return -ERANGE; + + switch (lkul->l_state) { + case OPAL_RO: + read_locked = 0; + write_locked = 1; + break; + case OPAL_RW: + read_locked = 0; + write_locked = 0; + break; + case OPAL_LK: + /* vars are initalized to locked */ + break; + default: + pr_err("Tried to set an invalid locking state... returning to uland\n"); + return OPAL_INVAL_PARAM; + } + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKED); + add_token_u8(&err, dev, read_locked); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKED); + add_token_u8(&err, dev, write_locked); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building SET command.\n"); + return err; + } + return finalize_and_send(dev, parse_and_check_status); +} + + +static int lock_unlock_locking_range_sum(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u8 read_locked = 1, write_locked = 1; + const u8 *method; + struct opal_lock_unlock *lkul; + int ret; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + method = opalmethod[OPAL_SET]; + lkul = dev->func_data[dev->state]; + if (build_locking_range(lr_buffer, sizeof(lr_buffer), + lkul->session.opal_key.lr) < 0) + return -ERANGE; + + switch (lkul->l_state) { + case OPAL_RO: + read_locked = 0; + write_locked = 1; + break; + case OPAL_RW: + read_locked = 0; + write_locked = 0; + break; + case OPAL_LK: + /* vars are initalized to locked */ + break; + default: + pr_err("Tried to set an invalid locking state.\n"); + return OPAL_INVAL_PARAM; + } + ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1, + read_locked, write_locked); + + if (ret < 0) { + pr_err("Error building SET command.\n"); + return ret; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int activate_lsp(struct opal_dev *dev) +{ + struct opal_lr_act *opal_act; + u8 user_lr[OPAL_UID_LENGTH]; + u8 uint_3 = 0x83; + int err = 0, i; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + opal_act = dev->func_data[dev->state]; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_ACTIVATE], + OPAL_UID_LENGTH); + + + if (opal_act->sum) { + err = build_locking_range(user_lr, sizeof(user_lr), + opal_act->lr[0]); + if (err) + return err; + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, uint_3); + add_token_u8(&err, dev, 6); + add_token_u8(&err, dev, 0); + add_token_u8(&err, dev, 0); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + for (i = 1; i < opal_act->num_lrs; i++) { + user_lr[7] = opal_act->lr[i]; + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + } + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + } else { + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + } + + if (err) { + pr_err("Error building Activate LockingSP command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int get_lsp_lifecycle_cont(struct opal_dev *dev) +{ + u8 lc_status; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + + lc_status = response_get_u64(&dev->parsed, 4); + /* 0x08 is Manufacured Inactive */ + /* 0x09 is Manufactured */ + if (lc_status != OPAL_MANUFACTURED_INACTIVE) { + pr_err("Couldn't determine the status of the Lifcycle state\n"); + return -ENODEV; + } + + return 0; +} + +/* Determine if we're in the Manufactured Inactive or Active state */ +static int get_lsp_lifecycle(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Start Column */ + add_token_u8(&err, dev, 6); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* End Column */ + add_token_u8(&err, dev, 6); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building GET Lifecycle Status command\n"); + return err; + } + + return finalize_and_send(dev, get_lsp_lifecycle_cont); +} + +static int get_msid_cpin_pin_cont(struct opal_dev *dev) +{ + const char *msid_pin; + size_t strlen; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + + strlen = response_get_string(&dev->parsed, 4, &msid_pin); + if (!msid_pin) { + pr_err("%s: Couldn't extract PIN from response\n", __func__); + return OPAL_INVAL_PARAM; + } + + dev->prev_data = kmemdup(msid_pin, strlen, GFP_KERNEL); + if (!dev->prev_data) + return -ENOMEM; + + dev->prev_d_len = strlen; + + return 0; +} + +static int get_msid_cpin_pin(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_C_PIN_MSID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Start Column */ + add_token_u8(&err, dev, 3); /* PIN */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* End Column */ + add_token_u8(&err, dev, 3); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Get MSID CPIN PIN command.\n"); + return err; + } + + return finalize_and_send(dev, get_msid_cpin_pin_cont); +} + +static int build_end_opal_session(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + + set_comid(dev, dev->comid); + add_token_u8(&err, dev, OPAL_ENDOFSESSION); + return err; +} + +static int end_opal_session(struct opal_dev *dev) +{ + int ret = build_end_opal_session(dev); + + if (ret < 0) + return ret; + return finalize_and_send(dev, end_session_cont); +} + +static int end_opal_session_error(struct opal_dev *dev) +{ + const opal_step error_end_session[] = { + end_opal_session, + NULL, + }; + dev->funcs = error_end_session; + dev->state = 0; + return next(dev); +} + +static inline void setup_opal_dev(struct opal_dev *dev, + const opal_step *funcs) +{ + dev->state = 0; + dev->funcs = funcs; + dev->tsn = 0; + dev->hsn = 0; + dev->func_data = NULL; + dev->prev_data = NULL; +} + +static int check_opal_support(struct opal_dev *dev) +{ + static const opal_step funcs[] = { + opal_discovery0, + NULL + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, funcs); + ret = next(dev); + dev->supported = !ret; + mutex_unlock(&dev->dev_lock); + return ret; +} + +void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv) +{ + if (opal_dev->initialized) + return; + INIT_LIST_HEAD(&opal_dev->unlk_lst); + mutex_init(&opal_dev->dev_lock); + opal_dev->send_recv = send_recv; + if (check_opal_support(opal_dev) < 0) + pr_warn("Opal is not supported on this device\n"); + opal_dev->initialized = true; +} +EXPORT_SYMBOL(init_opal_dev); + +static int opal_secure_erase_locking_range(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + void *data[3] = { NULL }; + static const opal_step erase_funcs[] = { + opal_discovery0, + start_auth_opal_session, + get_active_key, + gen_key, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, erase_funcs); + + dev->func_data = data; + dev->func_data[1] = opal_session; + dev->func_data[2] = &opal_session->opal_key.lr; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_erase_locking_range(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + void *data[3] = { NULL }; + static const opal_step erase_funcs[] = { + opal_discovery0, + start_auth_opal_session, + erase_locking_range, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, erase_funcs); + + dev->func_data = data; + dev->func_data[1] = opal_session; + dev->func_data[2] = opal_session; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, + struct opal_mbr_data *opal_mbr) +{ + void *func_data[6] = { NULL }; + static const opal_step mbr_funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + set_mbr_done, + end_opal_session, + start_admin1LSP_opal_session, + set_mbr_enable_disable, + end_opal_session, + NULL, + }; + int ret; + + if (opal_mbr->enable_disable != OPAL_MBR_ENABLE && + opal_mbr->enable_disable != OPAL_MBR_DISABLE) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, mbr_funcs); + dev->func_data = func_data; + dev->func_data[1] = &opal_mbr->key; + dev->func_data[2] = &opal_mbr->enable_disable; + dev->func_data[4] = &opal_mbr->key; + dev->func_data[5] = &opal_mbr->enable_disable; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) +{ + struct opal_suspend_data *suspend; + + suspend = kzalloc(sizeof(*suspend), GFP_KERNEL); + if (!suspend) + return -ENOMEM; + + suspend->unlk = *lk_unlk; + suspend->lr = lk_unlk->session.opal_key.lr; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + add_suspend_info(dev, suspend); + mutex_unlock(&dev->dev_lock); + return 0; +} + +static int opal_add_user_to_lr(struct opal_dev *dev, + struct opal_lock_unlock *lk_unlk) +{ + void *func_data[3] = { NULL }; + static const opal_step funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + add_user_to_lr, + end_opal_session, + NULL + }; + int ret; + + if (lk_unlk->l_state != OPAL_RO && + lk_unlk->l_state != OPAL_RW) { + pr_err("Locking state was not RO or RW\n"); + return -EINVAL; + } + if (lk_unlk->session.who < OPAL_USER1 && + lk_unlk->session.who > OPAL_USER9) { + pr_err("Authority was not within the range of users: %d\n", + lk_unlk->session.who); + return -EINVAL; + } + if (lk_unlk->session.sum) { + pr_err("%s not supported in sum. Use setup locking range\n", + __func__); + return -EINVAL; + } + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, funcs); + dev->func_data = func_data; + dev->func_data[1] = &lk_unlk->session.opal_key; + dev->func_data[2] = lk_unlk; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal) +{ + void *data[2] = { NULL }; + static const opal_step revert_funcs[] = { + opal_discovery0, + start_SIDASP_opal_session, + revert_tper, /* controller will terminate session */ + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, revert_funcs); + dev->func_data = data; + dev->func_data[1] = opal; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int __opal_lock_unlock_sum(struct opal_dev *dev) +{ + static const opal_step ulk_funcs_sum[] = { + opal_discovery0, + start_auth_opal_session, + lock_unlock_locking_range_sum, + end_opal_session, + NULL + }; + + dev->funcs = ulk_funcs_sum; + return next(dev); +} + +static int __opal_lock_unlock(struct opal_dev *dev) +{ + static const opal_step _unlock_funcs[] = { + opal_discovery0, + start_auth_opal_session, + lock_unlock_locking_range, + end_opal_session, + NULL + }; + + dev->funcs = _unlock_funcs; + return next(dev); +} + +static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) +{ + void *func_data[3] = { NULL }; + int ret; + + if (lk_unlk->session.who < OPAL_ADMIN1 || + lk_unlk->session.who > OPAL_USER9) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + dev->func_data = func_data; + dev->func_data[1] = &lk_unlk->session; + dev->func_data[2] = lk_unlk; + + if (lk_unlk->session.sum) + ret = __opal_lock_unlock_sum(dev); + else + ret = __opal_lock_unlock(dev); + + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) +{ + static const opal_step owner_funcs[] = { + opal_discovery0, + start_anybodyASP_opal_session, + get_msid_cpin_pin, + end_opal_session, + start_SIDASP_opal_session, + set_sid_cpin_pin, + end_opal_session, + NULL + }; + void *data[6] = { NULL }; + int ret; + + if (!dev) + return -ENODEV; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, owner_funcs); + dev->func_data = data; + dev->func_data[4] = opal; + dev->func_data[5] = opal; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_act) +{ + void *data[4] = { NULL }; + static const opal_step active_funcs[] = { + opal_discovery0, + start_SIDASP_opal_session, /* Open session as SID auth */ + get_lsp_lifecycle, + activate_lsp, + end_opal_session, + NULL + }; + int ret; + + if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, active_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_lr_act->key; + dev->func_data[3] = opal_lr_act; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_setup_locking_range(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + void *data[3] = { NULL }; + static const opal_step lr_funcs[] = { + opal_discovery0, + start_auth_opal_session, + setup_locking_range, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, lr_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_lrs->session; + dev->func_data[2] = opal_lrs; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) +{ + static const opal_step pw_funcs[] = { + opal_discovery0, + start_auth_opal_session, + set_new_pw, + end_opal_session, + NULL + }; + void *data[3] = { NULL }; + int ret; + + if (opal_pw->session.who < OPAL_ADMIN1 || + opal_pw->session.who > OPAL_USER9 || + opal_pw->new_user_pw.who < OPAL_ADMIN1 || + opal_pw->new_user_pw.who > OPAL_USER9) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, pw_funcs); + dev->func_data = data; + dev->func_data[1] = (void *) &opal_pw->session; + dev->func_data[2] = (void *) &opal_pw->new_user_pw; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_activate_user(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + static const opal_step act_funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + internal_activate_user, + end_opal_session, + NULL + }; + void *data[3] = { NULL }; + int ret; + + /* We can't activate Admin1 it's active as manufactured */ + if (opal_session->who < OPAL_USER1 && + opal_session->who > OPAL_USER9) { + pr_err("Who was not a valid user: %d\n", opal_session->who); + return -EINVAL; + } + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, act_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_session->opal_key; + dev->func_data[2] = opal_session; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +bool opal_unlock_from_suspend(struct opal_dev *dev) +{ + struct opal_suspend_data *suspend; + void *func_data[3] = { NULL }; + bool was_failure = false; + int ret = 0; + + if (!dev) + return false; + if (!dev->supported) + return false; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + dev->func_data = func_data; + + list_for_each_entry(suspend, &dev->unlk_lst, node) { + dev->state = 0; + dev->func_data[1] = &suspend->unlk.session; + dev->func_data[2] = &suspend->unlk; + dev->tsn = 0; + dev->hsn = 0; + + if (suspend->unlk.session.sum) + ret = __opal_lock_unlock_sum(dev); + else + ret = __opal_lock_unlock(dev); + if (ret) { + pr_warn("Failed to unlock LR %hhu with sum %d\n", + suspend->unlk.session.opal_key.lr, + suspend->unlk.session.sum); + was_failure = true; + } + } + mutex_unlock(&dev->dev_lock); + return was_failure; +} +EXPORT_SYMBOL(opal_unlock_from_suspend); + +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr) +{ + void __user *arg = (void __user *)ptr; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!dev->supported) { + pr_err("Not supported\n"); + return -ENOTSUPP; + } + + switch (cmd) { + case IOC_OPAL_SAVE: { + struct opal_lock_unlock lk_unlk; + + if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk))) + return -EFAULT; + return opal_save(dev, &lk_unlk); + } + case IOC_OPAL_LOCK_UNLOCK: { + struct opal_lock_unlock lk_unlk; + + if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk))) + return -EFAULT; + return opal_lock_unlock(dev, &lk_unlk); + } + case IOC_OPAL_TAKE_OWNERSHIP: { + struct opal_key opal_key; + + if (copy_from_user(&opal_key, arg, sizeof(opal_key))) + return -EFAULT; + return opal_take_ownership(dev, &opal_key); + } + case IOC_OPAL_ACTIVATE_LSP: { + struct opal_lr_act opal_lr_act; + + if (copy_from_user(&opal_lr_act, arg, sizeof(opal_lr_act))) + return -EFAULT; + return opal_activate_lsp(dev, &opal_lr_act); + } + case IOC_OPAL_SET_PW: { + struct opal_new_pw opal_pw; + + if (copy_from_user(&opal_pw, arg, sizeof(opal_pw))) + return -EFAULT; + return opal_set_new_pw(dev, &opal_pw); + } + case IOC_OPAL_ACTIVATE_USR: { + struct opal_session_info session; + + if (copy_from_user(&session, arg, sizeof(session))) + return -EFAULT; + return opal_activate_user(dev, &session); + } + case IOC_OPAL_REVERT_TPR: { + struct opal_key opal_key; + + if (copy_from_user(&opal_key, arg, sizeof(opal_key))) + return -EFAULT; + return opal_reverttper(dev, &opal_key); + } + case IOC_OPAL_LR_SETUP: { + struct opal_user_lr_setup lrs; + + if (copy_from_user(&lrs, arg, sizeof(lrs))) + return -EFAULT; + return opal_setup_locking_range(dev, &lrs); + } + case IOC_OPAL_ADD_USR_TO_LR: { + struct opal_lock_unlock lk_unlk; + + if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk))) + return -EFAULT; + return opal_add_user_to_lr(dev, &lk_unlk); + } + case IOC_OPAL_ENABLE_DISABLE_MBR: { + struct opal_mbr_data mbr; + + if (copy_from_user(&mbr, arg, sizeof(mbr))) + return -EFAULT; + return opal_enable_disable_shadow_mbr(dev, &mbr); + } + case IOC_OPAL_ERASE_LR: { + struct opal_session_info session; + + if (copy_from_user(&session, arg, sizeof(session))) + return -EFAULT; + return opal_erase_locking_range(dev, &session); + } + case IOC_OPAL_SECURE_ERASE_LR: { + struct opal_session_info session; + + if (copy_from_user(&session, arg, sizeof(session))) + return -EFAULT; + return opal_secure_erase_locking_range(dev, &session); + } + default: + pr_warn("No such Opal Ioctl %u\n", cmd); + } + return -ENOTTY; +} +EXPORT_SYMBOL_GPL(sed_ioctl); diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h new file mode 100644 index 000000000000..af1a85eae193 --- /dev/null +++ b/include/linux/sed-opal.h @@ -0,0 +1,178 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli + * Scott Bauer + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef LINUX_OPAL_H +#define LINUX_OPAL_H + +#include +#include + +/* + * These constant values come from: + * SPC-4 section + * 6.30 SECURITY PROTOCOL IN command / table 265. + */ +enum { + TCG_SECP_00 = 0, + TCG_SECP_01, +}; +struct opal_dev; + +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 + +typedef int (*opal_step)(struct opal_dev *dev); +typedef int (sec_send_recv)(struct opal_dev *ctx, u16 spsp, u8 secp, + void *buffer, size_t len, bool send); + + +enum opal_atom_width { + OPAL_WIDTH_TINY, + OPAL_WIDTH_SHORT, + OPAL_WIDTH_MEDIUM, + OPAL_WIDTH_LONG, + OPAL_WIDTH_TOKEN +}; + +/* + * Token defs derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * 3.2.2 Data Stream Encoding + */ +enum opal_response_token { + OPAL_DTA_TOKENID_BYTESTRING = 0xe0, + OPAL_DTA_TOKENID_SINT = 0xe1, + OPAL_DTA_TOKENID_UINT = 0xe2, + OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0 +}; + +/* + * On the parsed response, we don't store again the toks that are already + * stored in the response buffer. Instead, for each token, we just store a + * pointer to the position in the buffer where the token starts, and the size + * of the token in bytes. + */ +struct opal_resp_tok { + const u8 *pos; + size_t len; + enum opal_response_token type; + enum opal_atom_width width; + union { + u64 u; + s64 s; + } stored; +}; + +/* + * From the response header it's not possible to know how many tokens there are + * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later + * if we start dealing with messages that have more than that, we can increase + * this number. This is done to avoid having to make two passes through the + * response, the first one counting how many tokens we have and the second one + * actually storing the positions. + */ +struct parsed_resp { + int num; + struct opal_resp_tok toks[MAX_TOKS]; +}; + +/** + * struct opal_dev - The structure representing a OPAL enabled SED. + * @supported: Whether or not OPAL is supported on this controller. + * @send_recv: The combined sec_send/sec_recv function pointer. + * @opal_step: A series of opal methods that are necessary to complete a command. + * @func_data: An array of parameters for the opal methods above. + * @state: Describes the current opal_step we're working on. + * @dev_lock: Locks the entire opal_dev structure. + * @parsed: Parsed response from controller. + * @prev_data: Data returned from a method to the controller. + * @unlk_lst: A list of Locking ranges to unlock on this device during a resume. + */ +struct opal_dev { + bool initialized; + bool supported; + sec_send_recv *send_recv; + + const opal_step *funcs; + void **func_data; + int state; + struct mutex dev_lock; + u16 comid; + u32 hsn; + u32 tsn; + u64 align; + u64 lowest_lba; + + size_t pos; + u8 cmd[IO_BUFFER_LENGTH]; + u8 resp[IO_BUFFER_LENGTH]; + + struct parsed_resp parsed; + size_t prev_d_len; + void *prev_data; + + struct list_head unlk_lst; +}; + +#ifdef CONFIG_BLK_SED_OPAL +bool opal_unlock_from_suspend(struct opal_dev *dev); +void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv); +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr); + +static inline bool is_sed_ioctl(unsigned int cmd) +{ + switch (cmd) { + case IOC_OPAL_SAVE: + case IOC_OPAL_LOCK_UNLOCK: + case IOC_OPAL_TAKE_OWNERSHIP: + case IOC_OPAL_ACTIVATE_LSP: + case IOC_OPAL_SET_PW: + case IOC_OPAL_ACTIVATE_USR: + case IOC_OPAL_REVERT_TPR: + case IOC_OPAL_LR_SETUP: + case IOC_OPAL_ADD_USR_TO_LR: + case IOC_OPAL_ENABLE_DISABLE_MBR: + case IOC_OPAL_ERASE_LR: + case IOC_OPAL_SECURE_ERASE_LR: + return true; + } + return false; +} +#else +static inline bool is_sed_ioctl(unsigned int cmd) +{ + return false; +} + +static inline int sed_ioctl(struct opal_dev *dev, unsigned int cmd, + unsigned long ptr) +{ + return 0; +} +static inline bool opal_unlock_from_suspend(struct opal_dev *dev) +{ + return false; +} +static inline void init_opal_dev(struct opal_dev *opal_dev, + sec_send_recv *send_recv) +{ + opal_dev->supported = false; + opal_dev->initialized = true; +} +#endif /* CONFIG_BLK_SED_OPAL */ +#endif /* LINUX_OPAL_H */ -- cgit v1.2.3 From a98e58e54fbd0c80b6a46a7cac6e231eed3b3efa Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 3 Feb 2017 12:50:32 -0700 Subject: nvme: Add Support for Opal: Unlock from S3 & Opal Allocation/Ioctls This patch implements the necessary logic to unlock an Opal enabled device coming back from an S3. The patch also implements the SED/Opal allocation necessary to support the opal ioctls. Signed-off-by: Scott Bauer Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 25 +++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 14 ++++++++++++++ drivers/nvme/host/pci.c | 7 +++++++ 3 files changed, 46 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 037ee999e759..26ae4afd3737 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -788,6 +788,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, if (ns->ndev) return nvme_nvm_ioctl(ns, cmd, arg); #endif + if (is_sed_ioctl(cmd)) + return sed_ioctl(&ns->ctrl->opal_dev, cmd, arg); return -ENOTTY; } } @@ -1055,6 +1057,29 @@ static const struct pr_ops nvme_pr_ops = { .pr_clear = nvme_pr_clear, }; +#ifdef CONFIG_BLK_SED_OPAL +int nvme_sec_submit(struct opal_dev *dev, u16 spsp, u8 secp, + void *buffer, size_t len, bool send) +{ + struct nvme_command cmd; + struct nvme_ctrl *ctrl = NULL; + + memset(&cmd, 0, sizeof(cmd)); + if (send) + cmd.common.opcode = nvme_admin_security_send; + else + cmd.common.opcode = nvme_admin_security_recv; + ctrl = container_of(dev, struct nvme_ctrl, opal_dev); + cmd.common.nsid = 0; + cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); + cmd.common.cdw10[1] = cpu_to_le32(len); + + return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, + ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0); +} +EXPORT_SYMBOL_GPL(nvme_sec_submit); +#endif /* CONFIG_BLK_SED_OPAL */ + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 330713c4abdb..1aa353a848e3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -19,6 +19,7 @@ #include #include #include +#include enum { /* @@ -125,6 +126,8 @@ struct nvme_ctrl { struct list_head node; struct ida ns_ida; + struct opal_dev opal_dev; + char name[12]; char serial[20]; char model[40]; @@ -275,6 +278,17 @@ int nvme_init_identify(struct nvme_ctrl *ctrl); void nvme_queue_scan(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); +#ifdef CONFIG_BLK_SED_OPAL +int nvme_sec_submit(struct opal_dev *dev, u16 spsp, u8 secp, + void *buffer, size_t len, bool send); +#else +static inline int nvme_sec_submit(struct opal_dev *dev, u16 spsp, u8 secp, + void *buffer, size_t len, bool send) +{ + return 0; +} +#endif /* CONFIG_BLK_DEV_SED_OPAL */ + #define NVME_NR_AERS 1 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 06875bc1ba80..f08e86e73dda 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "nvme.h" @@ -1757,6 +1758,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result = -ENODEV; if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING)) @@ -1789,6 +1791,11 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; + init_opal_dev(&dev->ctrl.opal_dev, &nvme_sec_submit); + + if (was_suspend) + opal_unlock_from_suspend(&dev->ctrl.opal_dev); + result = nvme_setup_io_queues(dev); if (result) goto out; -- cgit v1.2.3 From 8c87fe722053658467bcc9e5ea82051ce3d3a693 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Mon, 6 Feb 2017 17:22:49 -0700 Subject: Fix SED-OPAL UAPI structs to prevent 32/64 bit size differences. This patch is a quick fixup of the user structures that will prevent the structures from being different sizes on 32 and 64 bit archs. Taking this fix will allow us to *NOT* have to do compat ioctls for the sed code. Signed-off-by: Scott Bauer Fixes: 19641f2d7674 ("Include: Uapi: Add user ABI for Sed/Opal") Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 31799526082a..fc06e3a20a51 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -48,37 +48,38 @@ enum opal_lock_state { }; struct opal_key { - uint8_t lr; - uint8_t key_len; - char key[OPAL_KEY_MAX]; + __u8 lr; + __u8 key_len; + __u8 __align[6]; + __u8 key[OPAL_KEY_MAX]; }; struct opal_lr_act { - int sum; - uint8_t num_lrs; - uint8_t lr[OPAL_MAX_LRS]; struct opal_key key; + __u32 sum; + __u8 num_lrs; + __u8 lr[OPAL_MAX_LRS]; + __u8 align[2]; /* Align to 8 byte boundary */ }; struct opal_session_info { - int sum; - enum opal_user who; + __u32 sum; + __u32 who; struct opal_key opal_key; - uint8_t __align[2]; }; struct opal_user_lr_setup { - size_t range_start; - size_t range_length; - int RLE; /* Read Lock enabled */ - int WLE; /* Write Lock Enabled */ + __u64 range_start; + __u64 range_length; + __u32 RLE; /* Read Lock enabled */ + __u32 WLE; /* Write Lock Enabled */ struct opal_session_info session; - uint8_t __align[4]; }; struct opal_lock_unlock { - enum opal_lock_state l_state; struct opal_session_info session; + __u32 l_state; + __u8 __align[4]; }; struct opal_new_pw { @@ -97,9 +98,9 @@ struct opal_new_pw { }; struct opal_mbr_data { - u8 enable_disable; struct opal_key key; - uint8_t __align[5]; + __u8 enable_disable; + __u8 __align[7]; }; #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) -- cgit v1.2.3 From 605cdf0875f80300be47e79bd91b2d60916407d3 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 6 Feb 2017 23:00:13 +0100 Subject: gdrom: Add missing error code In case of error, 'err' is known to be 0 here, because of the previous test. Set it to a -ENOMEM instead. Signed-off-by: Christophe JAILLET Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/cdrom/gdrom.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 584bc3126403..46ecd95d7161 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -807,16 +807,20 @@ static int probe_gdrom(struct platform_device *devptr) if (err) goto probe_fail_cmdirq_register; gd.gdrom_rq = blk_init_queue(gdrom_request, &gdrom_lock); - if (!gd.gdrom_rq) + if (!gd.gdrom_rq) { + err = -ENOMEM; goto probe_fail_requestq; + } err = probe_gdrom_setupqueue(); if (err) goto probe_fail_toc; gd.toc = kzalloc(sizeof(struct gdromtoc), GFP_KERNEL); - if (!gd.toc) + if (!gd.toc) { + err = -ENOMEM; goto probe_fail_toc; + } add_disk(gd.disk); return 0; -- cgit v1.2.3 From ecdd09597a57251323b0de50e3d45e69298c4a83 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 11 Feb 2017 11:40:45 +0800 Subject: block/loop: fix race between I/O and set_status Inside set_status, transfer need to setup again, so we have to drain IO before the transition, otherwise oops may be triggered like the following: divide error: 0000 [#1] SMP KASAN CPU: 0 PID: 2935 Comm: loop7 Not tainted 4.10.0-rc7+ #213 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff88006ba1e840 task.stack: ffff880067338000 RIP: 0010:transfer_xor+0x1d1/0x440 drivers/block/loop.c:110 RSP: 0018:ffff88006733f108 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8800688d7000 RCX: 0000000000000059 RDX: 0000000000000000 RSI: 1ffff1000d743f43 RDI: ffff880068891c08 RBP: ffff88006733f160 R08: ffff8800688d7001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800688d7000 R13: ffff880067b7d000 R14: dffffc0000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88006d000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000006c17e0 CR3: 0000000066e3b000 CR4: 00000000001406f0 Call Trace: lo_do_transfer drivers/block/loop.c:251 [inline] lo_read_transfer drivers/block/loop.c:392 [inline] do_req_filebacked drivers/block/loop.c:541 [inline] loop_handle_cmd drivers/block/loop.c:1677 [inline] loop_queue_work+0xda0/0x49b0 drivers/block/loop.c:1689 kthread_worker_fn+0x4c3/0xa30 kernel/kthread.c:630 kthread+0x326/0x3f0 kernel/kthread.c:227 ret_from_fork+0x31/0x40 arch/x86/entry/entry_64.S:430 Code: 03 83 e2 07 41 29 df 42 0f b6 04 30 4d 8d 44 24 01 38 d0 7f 08 84 c0 0f 85 62 02 00 00 44 89 f8 41 0f b6 48 ff 25 ff 01 00 00 99 7d c8 48 63 d2 48 03 55 d0 48 89 d0 48 89 d7 48 c1 e8 03 83 RIP: transfer_xor+0x1d1/0x440 drivers/block/loop.c:110 RSP: ffff88006733f108 ---[ end trace 0166f7bd3b0c0933 ]--- Reported-by: Dmitry Vyukov Cc: stable@vger.kernel.org Signed-off-by: Ming Lei Tested-by: Dmitry Vyukov Signed-off-by: Jens Axboe --- drivers/block/loop.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f347285c67ec..304377182c1a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1097,9 +1097,12 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) return -EINVAL; + /* I/O need to be drained during transfer transition */ + blk_mq_freeze_queue(lo->lo_queue); + err = loop_release_xfer(lo); if (err) - return err; + goto exit; if (info->lo_encrypt_type) { unsigned int type = info->lo_encrypt_type; @@ -1114,12 +1117,14 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) err = loop_init_xfer(lo, xfer, info); if (err) - return err; + goto exit; if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) - if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) - return -EFBIG; + if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { + err = -EFBIG; + goto exit; + } loop_config_discard(lo); @@ -1156,7 +1161,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) /* update dio if lo_offset or transfer is changed */ __loop_update_dio(lo, lo->use_dio); - return 0; + exit: + blk_mq_unfreeze_queue(lo->lo_queue); + return err; } static int -- cgit v1.2.3 From c5c9b26ee5f1295c77d8f2ff5f804ed6c0b07cc4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Feb 2017 00:13:20 +0100 Subject: cciss: switch to pci_irq_alloc_vectors Simple cleanup to use the new APIs. Signed-off-by: Christoph Hellwig Acked-by: Don Brace Tested-by: Don Brace Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 54 ++++++++++++++------------------------------------- drivers/block/cciss.h | 6 ++---- 2 files changed, 17 insertions(+), 43 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index e5c5b8eb14a9..3a44438a1195 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4074,41 +4074,27 @@ clean_up: static void cciss_interrupt_mode(ctlr_info_t *h) { -#ifdef CONFIG_PCI_MSI - int err; - struct msix_entry cciss_msix_entries[4] = { {0, 0}, {0, 1}, - {0, 2}, {0, 3} - }; + int ret; /* Some boards advertise MSI but don't really support it */ if ((h->board_id == 0x40700E11) || (h->board_id == 0x40800E11) || (h->board_id == 0x40820E11) || (h->board_id == 0x40830E11)) goto default_int_mode; - if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) { - err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4); - if (!err) { - h->intr[0] = cciss_msix_entries[0].vector; - h->intr[1] = cciss_msix_entries[1].vector; - h->intr[2] = cciss_msix_entries[2].vector; - h->intr[3] = cciss_msix_entries[3].vector; - h->msix_vector = 1; - return; - } else { - dev_warn(&h->pdev->dev, - "MSI-X init failed %d\n", err); - } - } - if (pci_find_capability(h->pdev, PCI_CAP_ID_MSI)) { - if (!pci_enable_msi(h->pdev)) - h->msi_vector = 1; - else - dev_warn(&h->pdev->dev, "MSI init failed\n"); + ret = pci_alloc_irq_vectors(h->pdev, 4, 4, PCI_IRQ_MSIX); + if (ret >= 0) { + h->intr[0] = pci_irq_vector(h->pdev, 0); + h->intr[1] = pci_irq_vector(h->pdev, 1); + h->intr[2] = pci_irq_vector(h->pdev, 2); + h->intr[3] = pci_irq_vector(h->pdev, 3); + return; } + + ret = pci_alloc_irq_vectors(h->pdev, 1, 1, PCI_IRQ_MSI); + default_int_mode: -#endif /* CONFIG_PCI_MSI */ /* if we get here we're going to use the default interrupt mode */ - h->intr[h->intr_mode] = h->pdev->irq; + h->intr[h->intr_mode] = pci_irq_vector(h->pdev, 0); return; } @@ -4888,7 +4874,7 @@ static int cciss_request_irq(ctlr_info_t *h, irqreturn_t (*msixhandler)(int, void *), irqreturn_t (*intxhandler)(int, void *)) { - if (h->msix_vector || h->msi_vector) { + if (h->pdev->msi_enabled || h->pdev->msix_enabled) { if (!request_irq(h->intr[h->intr_mode], msixhandler, 0, h->devname, h)) return 0; @@ -4934,12 +4920,7 @@ static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h) int ctlr = h->ctlr; free_irq(h->intr[h->intr_mode], h); -#ifdef CONFIG_PCI_MSI - if (h->msix_vector) - pci_disable_msix(h->pdev); - else if (h->msi_vector) - pci_disable_msi(h->pdev); -#endif /* CONFIG_PCI_MSI */ + pci_free_irq_vectors(h->pdev); cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); cciss_free_scatterlists(h); cciss_free_cmd_pool(h); @@ -5295,12 +5276,7 @@ static void cciss_remove_one(struct pci_dev *pdev) cciss_shutdown(pdev); -#ifdef CONFIG_PCI_MSI - if (h->msix_vector) - pci_disable_msix(h->pdev); - else if (h->msi_vector) - pci_disable_msi(h->pdev); -#endif /* CONFIG_PCI_MSI */ + pci_free_irq_vectors(h->pdev); iounmap(h->transtable); iounmap(h->cfgtable); diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 7fda30e4a241..4affa94ca17b 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -90,8 +90,6 @@ struct ctlr_info # define SIMPLE_MODE_INT 2 # define MEMQ_MODE_INT 3 unsigned int intr[4]; - unsigned int msix_vector; - unsigned int msi_vector; int intr_mode; int cciss_max_sectors; BYTE cciss_read; @@ -333,7 +331,7 @@ static unsigned long SA5_performant_completed(ctlr_info_t *h) */ register_value = readl(h->vaddr + SA5_OUTDB_STATUS); /* msi auto clears the interrupt pending bit. */ - if (!(h->msi_vector || h->msix_vector)) { + if (!(h->pdev->msi_enabled || h->pdev->msix_enabled)) { writel(SA5_OUTDB_CLEAR_PERF_BIT, h->vaddr + SA5_OUTDB_CLEAR); /* Do a read in order to flush the write to the controller * (as per spec.) @@ -393,7 +391,7 @@ static bool SA5_performant_intr_pending(ctlr_info_t *h) if (!register_value) return false; - if (h->msi_vector || h->msix_vector) + if (h->pdev->msi_enabled || h->pdev->msix_enabled) return true; /* Read outbound doorbell to flush */ -- cgit v1.2.3 From d1a987f35ebf859a771ac530e95a89933b6fcce8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Feb 2017 08:16:41 -0700 Subject: elevator: fix loading wrong elevator type for blk-mq devices The old elevator= boot parameter blindly attempts to load the same scheduler for mq and !mq devices, leading to a crash if we specify the wrong one. Ensure that we only apply this boot parameter to old !mq devices. Signed-off-by: Jens Axboe --- block/elevator.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index ef7f59469acc..b2a55167f0c2 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -207,11 +207,12 @@ int elevator_init(struct request_queue *q, char *name) } /* - * Use the default elevator specified by config boot param or - * config option. Don't try to load modules as we could be running - * off async and request_module() isn't allowed from async. + * Use the default elevator specified by config boot param for + * non-mq devices, or by config option. Don't try to load modules + * as we could be running off async and request_module() isn't + * allowed from async. */ - if (!e && *chosen_elevator) { + if (!e && !q->mq_ops && *chosen_elevator) { e = elevator_get(chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", -- cgit v1.2.3 From 853fe1bf7554155376bb3b231112cdff9ff79177 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 13 Feb 2017 16:25:26 -0800 Subject: cdrom: Make device operations read-only Since function tables are a common target for attackers, it's best to keep them in read-only memory. As such, this makes the CDROM device ops tables const. This drops additionally n_minors, since it isn't used meaningfully, and sets the only user of cdrom_dummy_generic_packet explicitly so the variables can all be const. Inspired by similar changes in grsecurity/PaX. Signed-off-by: Kees Cook Acked-by: David S. Miller Signed-off-by: Jens Axboe --- Documentation/cdrom/cdrom-standard.tex | 9 +----- drivers/block/paride/pcd.c | 2 +- drivers/cdrom/cdrom.c | 58 ++++++++++++++++------------------ drivers/cdrom/gdrom.c | 4 +-- drivers/ide/ide-cd.c | 2 +- drivers/scsi/sr.c | 2 +- include/linux/cdrom.h | 5 +-- 7 files changed, 37 insertions(+), 45 deletions(-) diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex index c06233fe52ac..8f85b0e41046 100644 --- a/Documentation/cdrom/cdrom-standard.tex +++ b/Documentation/cdrom/cdrom-standard.tex @@ -249,7 +249,6 @@ struct& cdrom_device_ops\ \{ \hidewidth\cr unsigned\ long);\cr \noalign{\medskip} &const\ int& capability;& capability flags \cr - &int& n_minors;& number of active minor devices \cr \};\cr } $$ @@ -258,13 +257,7 @@ it should add a function pointer to this $struct$. When a particular function is not implemented, however, this $struct$ should contain a NULL instead. The $capability$ flags specify the capabilities of the \cdrom\ hardware and/or low-level \cdrom\ driver when a \cdrom\ drive -is registered with the \UCD. The value $n_minors$ should be a positive -value indicating the number of minor devices that are supported by -the low-level device driver, normally~1. Although these two variables -are `informative' rather than `operational,' they are included in -$cdrom_device_ops$ because they describe the capability of the {\em -driver\/} rather than the {\em drive}. Nomenclature has always been -difficult in computer programming. +is registered with the \UCD. Note that most functions have fewer parameters than their $blkdev_fops$ counterparts. This is because very little of the diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 5fd2d0e25567..10aed84244f5 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -273,7 +273,7 @@ static const struct block_device_operations pcd_bdops = { .check_events = pcd_block_check_events, }; -static struct cdrom_device_ops pcd_dops = { +static const struct cdrom_device_ops pcd_dops = { .open = pcd_open, .release = pcd_release, .drive_status = pcd_drive_status, diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 59cca72647a6..bbbd3caa927c 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -342,8 +342,8 @@ static void cdrom_sysctl_register(void); static LIST_HEAD(cdrom_list); -static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, - struct packet_command *cgc) +int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, + struct packet_command *cgc) { if (cgc->sense) { cgc->sense->sense_key = 0x05; @@ -354,6 +354,7 @@ static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, cgc->stat = -EIO; return -EIO; } +EXPORT_SYMBOL(cdrom_dummy_generic_packet); static int cdrom_flush_cache(struct cdrom_device_info *cdi) { @@ -371,7 +372,7 @@ static int cdrom_flush_cache(struct cdrom_device_info *cdi) static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct packet_command cgc; int ret, buflen; @@ -586,7 +587,7 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space) int register_cdrom(struct cdrom_device_info *cdi) { static char banner_printed; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; int *change_capability = (int *)&cdo->capability; /* hack */ cd_dbg(CD_OPEN, "entering register_cdrom\n"); @@ -610,7 +611,6 @@ int register_cdrom(struct cdrom_device_info *cdi) ENSURE(reset, CDC_RESET); ENSURE(generic_packet, CDC_GENERIC_PACKET); cdi->mc_flags = 0; - cdo->n_minors = 0; cdi->options = CDO_USE_FFLAGS; if (autoclose == 1 && CDROM_CAN(CDC_CLOSE_TRAY)) @@ -630,8 +630,7 @@ int register_cdrom(struct cdrom_device_info *cdi) else cdi->cdda_method = CDDA_OLD; - if (!cdo->generic_packet) - cdo->generic_packet = cdrom_dummy_generic_packet; + WARN_ON(!cdo->generic_packet); cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name); mutex_lock(&cdrom_mutex); @@ -652,7 +651,6 @@ void unregister_cdrom(struct cdrom_device_info *cdi) if (cdi->exit) cdi->exit(cdi); - cdi->ops->n_minors--; cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name); } @@ -1036,7 +1034,7 @@ static int open_for_data(struct cdrom_device_info *cdi) { int ret; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; tracktype tracks; cd_dbg(CD_OPEN, "entering open_for_data\n"); /* Check if the driver can report drive status. If it can, we @@ -1198,8 +1196,8 @@ err: /* This code is similar to that in open_for_data. The routine is called whenever an audio play operation is requested. */ -static int check_for_audio_disc(struct cdrom_device_info * cdi, - struct cdrom_device_ops * cdo) +static int check_for_audio_disc(struct cdrom_device_info *cdi, + const struct cdrom_device_ops *cdo) { int ret; tracktype tracks; @@ -1254,7 +1252,7 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi, void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; int opened_for_data; cd_dbg(CD_CLOSE, "entering cdrom_release\n"); @@ -1294,7 +1292,7 @@ static int cdrom_read_mech_status(struct cdrom_device_info *cdi, struct cdrom_changer_info *buf) { struct packet_command cgc; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; int length; /* @@ -1643,7 +1641,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) int ret; u_char buf[20]; struct packet_command cgc; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; rpc_state_t rpc_state; memset(buf, 0, sizeof(buf)); @@ -1791,7 +1789,7 @@ static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s, { unsigned char buf[21], *base; struct dvd_layer *layer; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; int ret, layer_num = s->physical.layer_num; if (layer_num >= DVD_LAYERS) @@ -1842,7 +1840,7 @@ static int dvd_read_copyright(struct cdrom_device_info *cdi, dvd_struct *s, { int ret; u_char buf[8]; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; init_cdrom_command(cgc, buf, sizeof(buf), CGC_DATA_READ); cgc->cmd[0] = GPCMD_READ_DVD_STRUCTURE; @@ -1866,7 +1864,7 @@ static int dvd_read_disckey(struct cdrom_device_info *cdi, dvd_struct *s, { int ret, size; u_char *buf; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; size = sizeof(s->disckey.value) + 4; @@ -1894,7 +1892,7 @@ static int dvd_read_bca(struct cdrom_device_info *cdi, dvd_struct *s, { int ret, size = 4 + 188; u_char *buf; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; buf = kmalloc(size, GFP_KERNEL); if (!buf) @@ -1928,7 +1926,7 @@ static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s, { int ret = 0, size; u_char *buf; - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; size = sizeof(s->manufact.value) + 4; @@ -1995,7 +1993,7 @@ int cdrom_mode_sense(struct cdrom_device_info *cdi, struct packet_command *cgc, int page_code, int page_control) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; memset(cgc->cmd, 0, sizeof(cgc->cmd)); @@ -2010,7 +2008,7 @@ int cdrom_mode_sense(struct cdrom_device_info *cdi, int cdrom_mode_select(struct cdrom_device_info *cdi, struct packet_command *cgc) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; memset(cgc->cmd, 0, sizeof(cgc->cmd)); memset(cgc->buffer, 0, 2); @@ -2025,7 +2023,7 @@ int cdrom_mode_select(struct cdrom_device_info *cdi, static int cdrom_read_subchannel(struct cdrom_device_info *cdi, struct cdrom_subchnl *subchnl, int mcn) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct packet_command cgc; char buffer[32]; int ret; @@ -2073,7 +2071,7 @@ static int cdrom_read_cd(struct cdrom_device_info *cdi, struct packet_command *cgc, int lba, int blocksize, int nblocks) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; memset(&cgc->cmd, 0, sizeof(cgc->cmd)); cgc->cmd[0] = GPCMD_READ_10; @@ -2093,7 +2091,7 @@ static int cdrom_read_block(struct cdrom_device_info *cdi, struct packet_command *cgc, int lba, int nblocks, int format, int blksize) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; memset(&cgc->cmd, 0, sizeof(cgc->cmd)); cgc->cmd[0] = GPCMD_READ_CD; @@ -2764,7 +2762,7 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi, */ static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct packet_command cgc; struct modesel_head mh; @@ -2790,7 +2788,7 @@ static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size) static int cdrom_get_track_info(struct cdrom_device_info *cdi, __u16 track, __u8 type, track_information *ti) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct packet_command cgc; int ret, buflen; @@ -3049,7 +3047,7 @@ static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi, void __user *arg, struct packet_command *cgc) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct cdrom_msf msf; cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n"); if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf))) @@ -3069,7 +3067,7 @@ static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi, void __user *arg, struct packet_command *cgc) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; struct cdrom_blk blk; cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYBLK\n"); if (copy_from_user(&blk, (struct cdrom_blk __user *)arg, sizeof(blk))) @@ -3164,7 +3162,7 @@ static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi, struct packet_command *cgc, int cmd) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; cd_dbg(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n"); cgc->cmd[0] = GPCMD_START_STOP_UNIT; cgc->cmd[1] = 1; @@ -3177,7 +3175,7 @@ static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi, struct packet_command *cgc, int cmd) { - struct cdrom_device_ops *cdo = cdi->ops; + const struct cdrom_device_ops *cdo = cdi->ops; cd_dbg(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n"); cgc->cmd[0] = GPCMD_PAUSE_RESUME; cgc->cmd[8] = (cmd == CDROMRESUME) ? 1 : 0; diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 46ecd95d7161..1afab6558d0c 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -481,7 +481,7 @@ static int gdrom_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, return -EINVAL; } -static struct cdrom_device_ops gdrom_ops = { +static const struct cdrom_device_ops gdrom_ops = { .open = gdrom_open, .release = gdrom_release, .drive_status = gdrom_drivestatus, @@ -489,9 +489,9 @@ static struct cdrom_device_ops gdrom_ops = { .get_last_session = gdrom_get_last_session, .reset = gdrom_hardreset, .audio_ioctl = gdrom_audio_ioctl, + .generic_packet = cdrom_dummy_generic_packet, .capability = CDC_MULTI_SESSION | CDC_MEDIA_CHANGED | CDC_RESET | CDC_DRIVE_STATUS | CDC_CD_R, - .n_minors = 1, }; static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 9cbd217bc0c9..ab9232e1e16f 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1166,7 +1166,7 @@ void ide_cdrom_update_speed(ide_drive_t *drive, u8 *buf) CDC_CD_RW | CDC_DVD | CDC_DVD_R | CDC_DVD_RAM | CDC_GENERIC_PACKET | \ CDC_MO_DRIVE | CDC_MRW | CDC_MRW_W | CDC_RAM) -static struct cdrom_device_ops ide_cdrom_dops = { +static const struct cdrom_device_ops ide_cdrom_dops = { .open = ide_cdrom_open_real, .release = ide_cdrom_release_real, .drive_status = ide_cdrom_drive_status, diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 94352e4df831..013bfe049a48 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -117,7 +117,7 @@ static unsigned int sr_check_events(struct cdrom_device_info *cdi, unsigned int clearing, int slot); static int sr_packet(struct cdrom_device_info *, struct packet_command *); -static struct cdrom_device_ops sr_dops = { +static const struct cdrom_device_ops sr_dops = { .open = sr_open, .release = sr_release, .drive_status = sr_drive_status, diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 8609d577bb66..6e8f209a6dff 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -36,7 +36,7 @@ struct packet_command /* Uniform cdrom data structures for cdrom.c */ struct cdrom_device_info { - struct cdrom_device_ops *ops; /* link to device_ops */ + const struct cdrom_device_ops *ops; /* link to device_ops */ struct list_head list; /* linked list of all device_info */ struct gendisk *disk; /* matching block layer disk */ void *handle; /* driver-dependent data */ @@ -87,7 +87,6 @@ struct cdrom_device_ops { /* driver specifications */ const int capability; /* capability flags */ - int n_minors; /* number of active minor devices */ /* handle uniform packets for scsi type devices (scsi,atapi) */ int (*generic_packet) (struct cdrom_device_info *, struct packet_command *); @@ -123,6 +122,8 @@ extern int cdrom_mode_sense(struct cdrom_device_info *cdi, int page_code, int page_control); extern void init_cdrom_command(struct packet_command *cgc, void *buffer, int len, int type); +extern int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, + struct packet_command *cgc); /* The SCSI spec says there could be 256 slots. */ #define CDROM_MAX_SLOTS 256 -- cgit v1.2.3 From 24bff4d78a572d25fe2a0818f55bebda8a2d4709 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Tue, 14 Feb 2017 17:29:35 -0700 Subject: uapi: sed-opal fix IOW for activate lsp to use correct struct The IOC_OPAL_ACTIVATE_LSP took the wrong strcure which would give us the wrong size when using _IOC_SIZE, switch it to the right structure. Fixes: 058f8a2 ("Include: Uapi: Add user ABI for Sed/Opal") Signed-off-by: Scott Bauer Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index fc06e3a20a51..c72e0735532d 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -106,7 +106,7 @@ struct opal_mbr_data { #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) -#define IOC_OPAL_ACTIVATE_LSP _IOW('p', 223, struct opal_key) +#define IOC_OPAL_ACTIVATE_LSP _IOW('p', 223, struct opal_lr_act) #define IOC_OPAL_SET_PW _IOW('p', 224, struct opal_new_pw) #define IOC_OPAL_ACTIVATE_USR _IOW('p', 225, struct opal_session_info) #define IOC_OPAL_REVERT_TPR _IOW('p', 226, struct opal_key) -- cgit v1.2.3 From e225c20eb0fd0b6657e640408f11ee392dc82b5b Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Tue, 14 Feb 2017 17:29:36 -0700 Subject: Move stack parameters for sed_ioctl to prevent oversized stack with CONFIG_KASAN When CONFIG_KASAN is enabled, compilation fails: block/sed-opal.c: In function 'sed_ioctl': block/sed-opal.c:2447:1: error: the frame size of 2256 bytes is larger than 2048 bytes [-Werror=frame-larger-than=] Moved all the ioctl structures off the stack and dynamically allocate using _IOC_SIZE() Fixes: 455a7b238cd6 ("block: Add Sed-opal library") Reported-by: Arnd Bergmann Signed-off-by: Scott Bauer Signed-off-by: Jens Axboe --- block/sed-opal.c | 133 ++++++++++++++++------------------------------- drivers/nvme/host/core.c | 3 +- include/linux/sed-opal.h | 4 +- 3 files changed, 50 insertions(+), 90 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index bf1406e5159b..e95b8a57053d 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -2344,9 +2344,10 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) } EXPORT_SYMBOL(opal_unlock_from_suspend); -int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr) +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { - void __user *arg = (void __user *)ptr; + void *p; + int ret = -ENOTTY; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -2355,94 +2356,52 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr) return -ENOTSUPP; } - switch (cmd) { - case IOC_OPAL_SAVE: { - struct opal_lock_unlock lk_unlk; - - if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk))) - return -EFAULT; - return opal_save(dev, &lk_unlk); - } - case IOC_OPAL_LOCK_UNLOCK: { - struct opal_lock_unlock lk_unlk; - - if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk))) - return -EFAULT; - return opal_lock_unlock(dev, &lk_unlk); - } - case IOC_OPAL_TAKE_OWNERSHIP: { - struct opal_key opal_key; - - if (copy_from_user(&opal_key, arg, sizeof(opal_key))) - return -EFAULT; - return opal_take_ownership(dev, &opal_key); - } - case IOC_OPAL_ACTIVATE_LSP: { - struct opal_lr_act opal_lr_act; - - if (copy_from_user(&opal_lr_act, arg, sizeof(opal_lr_act))) - return -EFAULT; - return opal_activate_lsp(dev, &opal_lr_act); - } - case IOC_OPAL_SET_PW: { - struct opal_new_pw opal_pw; - - if (copy_from_user(&opal_pw, arg, sizeof(opal_pw))) - return -EFAULT; - return opal_set_new_pw(dev, &opal_pw); - } - case IOC_OPAL_ACTIVATE_USR: { - struct opal_session_info session; - - if (copy_from_user(&session, arg, sizeof(session))) - return -EFAULT; - return opal_activate_user(dev, &session); - } - case IOC_OPAL_REVERT_TPR: { - struct opal_key opal_key; - - if (copy_from_user(&opal_key, arg, sizeof(opal_key))) - return -EFAULT; - return opal_reverttper(dev, &opal_key); - } - case IOC_OPAL_LR_SETUP: { - struct opal_user_lr_setup lrs; + p = memdup_user(arg, _IOC_SIZE(cmd)); + if (IS_ERR(p)) + return PTR_ERR(p); - if (copy_from_user(&lrs, arg, sizeof(lrs))) - return -EFAULT; - return opal_setup_locking_range(dev, &lrs); - } - case IOC_OPAL_ADD_USR_TO_LR: { - struct opal_lock_unlock lk_unlk; - - if (copy_from_user(&lk_unlk, arg, sizeof(lk_unlk))) - return -EFAULT; - return opal_add_user_to_lr(dev, &lk_unlk); - } - case IOC_OPAL_ENABLE_DISABLE_MBR: { - struct opal_mbr_data mbr; - - if (copy_from_user(&mbr, arg, sizeof(mbr))) - return -EFAULT; - return opal_enable_disable_shadow_mbr(dev, &mbr); - } - case IOC_OPAL_ERASE_LR: { - struct opal_session_info session; - - if (copy_from_user(&session, arg, sizeof(session))) - return -EFAULT; - return opal_erase_locking_range(dev, &session); - } - case IOC_OPAL_SECURE_ERASE_LR: { - struct opal_session_info session; - - if (copy_from_user(&session, arg, sizeof(session))) - return -EFAULT; - return opal_secure_erase_locking_range(dev, &session); - } + switch (cmd) { + case IOC_OPAL_SAVE: + ret = opal_save(dev, p); + break; + case IOC_OPAL_LOCK_UNLOCK: + ret = opal_lock_unlock(dev, p); + break; + case IOC_OPAL_TAKE_OWNERSHIP: + ret = opal_take_ownership(dev, p); + break; + case IOC_OPAL_ACTIVATE_LSP: + ret = opal_activate_lsp(dev, p); + break; + case IOC_OPAL_SET_PW: + ret = opal_set_new_pw(dev, p); + break; + case IOC_OPAL_ACTIVATE_USR: + ret = opal_activate_user(dev, p); + break; + case IOC_OPAL_REVERT_TPR: + ret = opal_reverttper(dev, p); + break; + case IOC_OPAL_LR_SETUP: + ret = opal_setup_locking_range(dev, p); + break; + case IOC_OPAL_ADD_USR_TO_LR: + ret = opal_add_user_to_lr(dev, p); + break; + case IOC_OPAL_ENABLE_DISABLE_MBR: + ret = opal_enable_disable_shadow_mbr(dev, p); + break; + case IOC_OPAL_ERASE_LR: + ret = opal_erase_locking_range(dev, p); + break; + case IOC_OPAL_SECURE_ERASE_LR: + ret = opal_secure_erase_locking_range(dev, p); + break; default: pr_warn("No such Opal Ioctl %u\n", cmd); } - return -ENOTTY; + + kfree(p); + return ret; } EXPORT_SYMBOL_GPL(sed_ioctl); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 26ae4afd3737..b92a79281611 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -789,7 +789,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return nvme_nvm_ioctl(ns, cmd, arg); #endif if (is_sed_ioctl(cmd)) - return sed_ioctl(&ns->ctrl->opal_dev, cmd, arg); + return sed_ioctl(&ns->ctrl->opal_dev, cmd, + (void __user *) arg); return -ENOTTY; } } diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index af1a85eae193..205d520ea688 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -132,7 +132,7 @@ struct opal_dev { #ifdef CONFIG_BLK_SED_OPAL bool opal_unlock_from_suspend(struct opal_dev *dev); void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv); -int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr); +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); static inline bool is_sed_ioctl(unsigned int cmd) { @@ -160,7 +160,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) } static inline int sed_ioctl(struct opal_dev *dev, unsigned int cmd, - unsigned long ptr) + void __user *ioctl_ptr) { return 0; } -- cgit v1.2.3 From 0222967ba05a0a0f845f7dd54fb3da82afae024d Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Tue, 14 Feb 2017 17:29:37 -0700 Subject: Maintainers: Modify SED list from nvme to block Signed-off-by: Scott Bauer Signed-off-by: Jens Axboe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f96181c927be..c0e6e9d58460 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11079,7 +11079,7 @@ SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER M: Scott Bauer M: Jonathan Derrick M: Rafael Antognolli -L: linux-nvme@lists.infradead.org +L: linux-block@vger.kernel.org S: Supported F: block/sed* F: block/opal_proto.h -- cgit v1.2.3 From 0e5ffd1cb5f7ce19f23cc829d5dc3ebb1491570f Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Wed, 15 Feb 2017 16:25:32 +0100 Subject: lightnvm: fix off-by-one error on target initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If one specifies the end lun id to be the absolute number of luns, without taking zero indexing into account, the lightnvm core will pass the off-by-one end lun id to target creation, which then panics during nvm_ioctl_dev_create. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 9bfe0352d093..6ce76c0a75e1 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -1102,9 +1102,9 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) } s = &create->conf.s; - if (s->lun_begin > s->lun_end || s->lun_end > dev->geo.nr_luns) { + if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) { pr_err("nvm: lun out of bound (%u:%u > %u)\n", - s->lun_begin, s->lun_end, dev->geo.nr_luns); + s->lun_begin, s->lun_end, dev->geo.nr_luns - 1); return -EINVAL; } -- cgit v1.2.3 From 6732c7401035c8464fd4ab5ff2e1bf86e5fcd74c Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Wed, 15 Feb 2017 16:25:33 +0100 Subject: lightnvm: set default lun range when no luns are specified MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The create target ioctl takes a lun begin and lun end parameter, which defines the range of luns to initialize a target with. If the user does not set the parameters, it default to only using lun 0. Instead, defaults to use all luns in the OCSSD, as it is the usual behaviour users want. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 6ce76c0a75e1..5262ba66a7a7 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -1102,6 +1102,11 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) } s = &create->conf.s; + if (s->lun_begin == -1 && s->lun_end == -1) { + s->lun_begin = 0; + s->lun_end = dev->geo.nr_luns - 1; + } + if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) { pr_err("nvm: lun out of bound (%u:%u > %u)\n", s->lun_begin, s->lun_end, dev->geo.nr_luns - 1); -- cgit v1.2.3 From f5b37b7c23915af93081a8711e0a0f0219063756 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Feb 2017 13:59:38 +0100 Subject: block/sed-opal: tone down not supported warnings Not having OPAL or a sub-feature supported is an entirely normal condition for many drives, so don't warn about it. Keep the messages, but tone them down to debug only. Signed-off-by: Christoph Hellwig Reviewed-by: Scott Bauer Signed-off-by: Jens Axboe --- block/sed-opal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a57053d..bcdd5b6d02e8 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -387,16 +387,16 @@ static int opal_discovery0_end(struct opal_dev *dev) } if (!supported) { - pr_err("This device is not Opal enabled. Not Supported!\n"); + pr_debug("This device is not Opal enabled. Not Supported!\n"); return -EOPNOTSUPP; } if (!single_user) - pr_warn("Device doesn't support single user mode\n"); + pr_debug("Device doesn't support single user mode\n"); if (!found_com_id) { - pr_warn("Could not find OPAL comid for device. Returning early\n"); + pr_debug("Could not find OPAL comid for device. Returning early\n"); return -EOPNOTSUPP;; } @@ -1951,7 +1951,7 @@ void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv) mutex_init(&opal_dev->dev_lock); opal_dev->send_recv = send_recv; if (check_opal_support(opal_dev) < 0) - pr_warn("Opal is not supported on this device\n"); + pr_debug("Opal is not supported on this device\n"); opal_dev->initialized = true; } EXPORT_SYMBOL(init_opal_dev); -- cgit v1.2.3 From 4f1244c8298606b8fae64b4d78b820ae6b896e3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Feb 2017 13:59:39 +0100 Subject: block/sed-opal: allocate struct opal_dev dynamically Insted of bloating the containing structure with it all the time this allocates struct opal_dev dynamically. Additionally this allows moving the definition of struct opal_dev into sed-opal.c. For this a new private data field is added to it that is passed to the send/receive callback. After that a lot of internals can be made private as well. Signed-off-by: Christoph Hellwig Tested-by: Scott Bauer Reviewed-by: Scott Bauer Signed-off-by: Jens Axboe --- block/opal_proto.h | 23 ++++++++++ block/sed-opal.c | 101 +++++++++++++++++++++++++++++++++++++---- drivers/nvme/host/core.c | 9 ++-- drivers/nvme/host/nvme.h | 14 ++---- drivers/nvme/host/pci.c | 8 +++- include/linux/sed-opal.h | 116 ++--------------------------------------------- 6 files changed, 131 insertions(+), 140 deletions(-) diff --git a/block/opal_proto.h b/block/opal_proto.h index af9abc56c157..f40c9acf8895 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -19,6 +19,29 @@ #ifndef _OPAL_PROTO_H #define _OPAL_PROTO_H +/* + * These constant values come from: + * SPC-4 section + * 6.30 SECURITY PROTOCOL IN command / table 265. + */ +enum { + TCG_SECP_00 = 0, + TCG_SECP_01, +}; + +/* + * Token defs derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * 3.2.2 Data Stream Encoding + */ +enum opal_response_token { + OPAL_DTA_TOKENID_BYTESTRING = 0xe0, + OPAL_DTA_TOKENID_SINT = 0xe1, + OPAL_DTA_TOKENID_UINT = 0xe2, + OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0 +}; + #define DTAERROR_NO_METHOD_STATUS 0x89 #define GENERIC_HOST_SESSION_NUM 0x41 diff --git a/block/sed-opal.c b/block/sed-opal.c index bcdd5b6d02e8..d1c52ba4d62d 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -31,6 +31,77 @@ #include "opal_proto.h" +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 + +typedef int (*opal_step)(struct opal_dev *dev); + +enum opal_atom_width { + OPAL_WIDTH_TINY, + OPAL_WIDTH_SHORT, + OPAL_WIDTH_MEDIUM, + OPAL_WIDTH_LONG, + OPAL_WIDTH_TOKEN +}; + +/* + * On the parsed response, we don't store again the toks that are already + * stored in the response buffer. Instead, for each token, we just store a + * pointer to the position in the buffer where the token starts, and the size + * of the token in bytes. + */ +struct opal_resp_tok { + const u8 *pos; + size_t len; + enum opal_response_token type; + enum opal_atom_width width; + union { + u64 u; + s64 s; + } stored; +}; + +/* + * From the response header it's not possible to know how many tokens there are + * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later + * if we start dealing with messages that have more than that, we can increase + * this number. This is done to avoid having to make two passes through the + * response, the first one counting how many tokens we have and the second one + * actually storing the positions. + */ +struct parsed_resp { + int num; + struct opal_resp_tok toks[MAX_TOKS]; +}; + +struct opal_dev { + bool supported; + + void *data; + sec_send_recv *send_recv; + + const opal_step *funcs; + void **func_data; + int state; + struct mutex dev_lock; + u16 comid; + u32 hsn; + u32 tsn; + u64 align; + u64 lowest_lba; + + size_t pos; + u8 cmd[IO_BUFFER_LENGTH]; + u8 resp[IO_BUFFER_LENGTH]; + + struct parsed_resp parsed; + size_t prev_d_len; + void *prev_data; + + struct list_head unlk_lst; +}; + + static const u8 opaluid[][OPAL_UID_LENGTH] = { /* users */ [OPAL_SMUID_UID] = @@ -243,14 +314,14 @@ static u16 get_comid_v200(const void *data) static int opal_send_cmd(struct opal_dev *dev) { - return dev->send_recv(dev, dev->comid, TCG_SECP_01, + return dev->send_recv(dev->data, dev->comid, TCG_SECP_01, dev->cmd, IO_BUFFER_LENGTH, true); } static int opal_recv_cmd(struct opal_dev *dev) { - return dev->send_recv(dev, dev->comid, TCG_SECP_01, + return dev->send_recv(dev->data, dev->comid, TCG_SECP_01, dev->resp, IO_BUFFER_LENGTH, false); } @@ -1943,16 +2014,24 @@ static int check_opal_support(struct opal_dev *dev) return ret; } -void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv) +struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) { - if (opal_dev->initialized) - return; - INIT_LIST_HEAD(&opal_dev->unlk_lst); - mutex_init(&opal_dev->dev_lock); - opal_dev->send_recv = send_recv; - if (check_opal_support(opal_dev) < 0) + struct opal_dev *dev; + + dev = kmalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + + INIT_LIST_HEAD(&dev->unlk_lst); + mutex_init(&dev->dev_lock); + dev->data = data; + dev->send_recv = send_recv; + if (check_opal_support(dev) != 0) { pr_debug("Opal is not supported on this device\n"); - opal_dev->initialized = true; + kfree(dev); + return NULL; + } + return dev; } EXPORT_SYMBOL(init_opal_dev); @@ -2351,6 +2430,8 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; + if (!dev) + return -ENOTSUPP; if (!dev->supported) { pr_err("Not supported\n"); return -ENOTSUPP; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b92a79281611..f6b56a12457a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -789,7 +789,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return nvme_nvm_ioctl(ns, cmd, arg); #endif if (is_sed_ioctl(cmd)) - return sed_ioctl(&ns->ctrl->opal_dev, cmd, + return sed_ioctl(ns->ctrl->opal_dev, cmd, (void __user *) arg); return -ENOTTY; } @@ -1059,18 +1059,17 @@ static const struct pr_ops nvme_pr_ops = { }; #ifdef CONFIG_BLK_SED_OPAL -int nvme_sec_submit(struct opal_dev *dev, u16 spsp, u8 secp, - void *buffer, size_t len, bool send) +int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send) { + struct nvme_ctrl *ctrl = data; struct nvme_command cmd; - struct nvme_ctrl *ctrl = NULL; memset(&cmd, 0, sizeof(cmd)); if (send) cmd.common.opcode = nvme_admin_security_send; else cmd.common.opcode = nvme_admin_security_recv; - ctrl = container_of(dev, struct nvme_ctrl, opal_dev); cmd.common.nsid = 0; cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); cmd.common.cdw10[1] = cpu_to_le32(len); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1aa353a848e3..fd94c94ccbcb 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -126,7 +126,7 @@ struct nvme_ctrl { struct list_head node; struct ida ns_ida; - struct opal_dev opal_dev; + struct opal_dev *opal_dev; char name[12]; char serial[20]; @@ -278,16 +278,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl); void nvme_queue_scan(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); -#ifdef CONFIG_BLK_SED_OPAL -int nvme_sec_submit(struct opal_dev *dev, u16 spsp, u8 secp, - void *buffer, size_t len, bool send); -#else -static inline int nvme_sec_submit(struct opal_dev *dev, u16 spsp, u8 secp, - void *buffer, size_t len, bool send) -{ - return 0; -} -#endif /* CONFIG_BLK_DEV_SED_OPAL */ +int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send); #define NVME_NR_AERS 1 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f08e86e73dda..50b070528c50 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1742,6 +1742,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) if (dev->ctrl.admin_q) blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); + kfree(dev->ctrl.opal_dev); kfree(dev); } @@ -1791,10 +1792,13 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - init_opal_dev(&dev->ctrl.opal_dev, &nvme_sec_submit); + if (!dev->ctrl.opal_dev) { + dev->ctrl.opal_dev = + init_opal_dev(&dev->ctrl, &nvme_sec_submit); + } if (was_suspend) - opal_unlock_from_suspend(&dev->ctrl.opal_dev); + opal_unlock_from_suspend(dev->ctrl.opal_dev); result = nvme_setup_io_queues(dev); if (result) diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 205d520ea688..deee23d012e7 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -21,117 +21,14 @@ #include #include -/* - * These constant values come from: - * SPC-4 section - * 6.30 SECURITY PROTOCOL IN command / table 265. - */ -enum { - TCG_SECP_00 = 0, - TCG_SECP_01, -}; struct opal_dev; -#define IO_BUFFER_LENGTH 2048 -#define MAX_TOKS 64 - -typedef int (*opal_step)(struct opal_dev *dev); -typedef int (sec_send_recv)(struct opal_dev *ctx, u16 spsp, u8 secp, - void *buffer, size_t len, bool send); - - -enum opal_atom_width { - OPAL_WIDTH_TINY, - OPAL_WIDTH_SHORT, - OPAL_WIDTH_MEDIUM, - OPAL_WIDTH_LONG, - OPAL_WIDTH_TOKEN -}; - -/* - * Token defs derived from: - * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 - * 3.2.2 Data Stream Encoding - */ -enum opal_response_token { - OPAL_DTA_TOKENID_BYTESTRING = 0xe0, - OPAL_DTA_TOKENID_SINT = 0xe1, - OPAL_DTA_TOKENID_UINT = 0xe2, - OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ - OPAL_DTA_TOKENID_INVALID = 0X0 -}; - -/* - * On the parsed response, we don't store again the toks that are already - * stored in the response buffer. Instead, for each token, we just store a - * pointer to the position in the buffer where the token starts, and the size - * of the token in bytes. - */ -struct opal_resp_tok { - const u8 *pos; - size_t len; - enum opal_response_token type; - enum opal_atom_width width; - union { - u64 u; - s64 s; - } stored; -}; - -/* - * From the response header it's not possible to know how many tokens there are - * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later - * if we start dealing with messages that have more than that, we can increase - * this number. This is done to avoid having to make two passes through the - * response, the first one counting how many tokens we have and the second one - * actually storing the positions. - */ -struct parsed_resp { - int num; - struct opal_resp_tok toks[MAX_TOKS]; -}; - -/** - * struct opal_dev - The structure representing a OPAL enabled SED. - * @supported: Whether or not OPAL is supported on this controller. - * @send_recv: The combined sec_send/sec_recv function pointer. - * @opal_step: A series of opal methods that are necessary to complete a command. - * @func_data: An array of parameters for the opal methods above. - * @state: Describes the current opal_step we're working on. - * @dev_lock: Locks the entire opal_dev structure. - * @parsed: Parsed response from controller. - * @prev_data: Data returned from a method to the controller. - * @unlk_lst: A list of Locking ranges to unlock on this device during a resume. - */ -struct opal_dev { - bool initialized; - bool supported; - sec_send_recv *send_recv; - - const opal_step *funcs; - void **func_data; - int state; - struct mutex dev_lock; - u16 comid; - u32 hsn; - u32 tsn; - u64 align; - u64 lowest_lba; - - size_t pos; - u8 cmd[IO_BUFFER_LENGTH]; - u8 resp[IO_BUFFER_LENGTH]; - - struct parsed_resp parsed; - size_t prev_d_len; - void *prev_data; - - struct list_head unlk_lst; -}; +typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer, + size_t len, bool send); #ifdef CONFIG_BLK_SED_OPAL bool opal_unlock_from_suspend(struct opal_dev *dev); -void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv); +struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv); int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); static inline bool is_sed_ioctl(unsigned int cmd) @@ -168,11 +65,6 @@ static inline bool opal_unlock_from_suspend(struct opal_dev *dev) { return false; } -static inline void init_opal_dev(struct opal_dev *opal_dev, - sec_send_recv *send_recv) -{ - opal_dev->supported = false; - opal_dev->initialized = true; -} +#define init_opal_dev(data, send_recv) NULL #endif /* CONFIG_BLK_SED_OPAL */ #endif /* LINUX_OPAL_H */ -- cgit v1.2.3 From 8a9ae523282f324989850fcf41312b42a2fb9296 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 17 Feb 2017 13:59:40 +0100 Subject: nvme: Check for Security send/recv support before issuing commands. We need to verify that the controller supports the security commands before actually trying to issue them. Signed-off-by: Scott Bauer [hch: moved the check so that we don't call into the OPAL code if not supported] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 2 +- include/linux/nvme.h | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f6b56a12457a..b4e743d1c0b4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1264,6 +1264,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + ctrl->oacs = le16_to_cpu(id->oacs); ctrl->vid = le16_to_cpu(id->vid); ctrl->oncs = le16_to_cpup(&id->oncs); atomic_set(&ctrl->abort_limit, id->acl + 1); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index fd94c94ccbcb..b0977229e219 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -140,6 +140,7 @@ struct nvme_ctrl { u32 max_hw_sectors; u16 oncs; u16 vid; + u16 oacs; atomic_t abort_limit; u8 event_limit; u8 vwc; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 50b070528c50..85896d46aebc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1792,7 +1792,7 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - if (!dev->ctrl.opal_dev) { + if ((dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) && !dev->ctrl.opal_dev) { dev->ctrl.opal_dev = init_opal_dev(&dev->ctrl, &nvme_sec_submit); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3d1c6f1b15c9..00eac863a9c7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -244,6 +244,7 @@ enum { NVME_CTRL_ONCS_DSM = 1 << 2, NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, + NVME_CTRL_OACS_SEC_SUPP = 1 << 0, }; struct nvme_lbaf { -- cgit v1.2.3