summaryrefslogtreecommitdiff
path: root/drivers/nvme/host/multipath.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/multipath.c')
-rw-r--r--drivers/nvme/host/multipath.c223
1 files changed, 161 insertions, 62 deletions
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 0a88d7bdc5e3..48e7a8906d01 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
+ [NVME_IOPOLICY_QD] = "queue-depth",
};
static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
iopolicy = NVME_IOPOLICY_NUMA;
else if (!strncmp(val, "round-robin", 11))
iopolicy = NVME_IOPOLICY_RR;
+ else if (!strncmp(val, "queue-depth", 11))
+ iopolicy = NVME_IOPOLICY_QD;
else
return -EINVAL;
@@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
- "Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+ "Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
@@ -83,7 +86,7 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
void nvme_failover_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
- u16 status = nvme_req(req)->status & 0x7ff;
+ u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
unsigned long flags;
struct bio *bio;
@@ -118,7 +121,8 @@ void nvme_failover_req(struct request *req)
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
- blk_mq_end_request(req, 0);
+ nvme_req(req)->status = 0;
+ nvme_end_req(req);
kblockd_schedule_work(&ns->head->requeue_work);
}
@@ -127,6 +131,11 @@ void nvme_mpath_start_request(struct request *rq)
struct nvme_ns *ns = rq->q->queuedata;
struct gendisk *disk = ns->head->disk;
+ if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
+ atomic_inc(&ns->ctrl->nr_active);
+ nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
+ }
+
if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
return;
@@ -140,6 +149,9 @@ void nvme_mpath_end_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;
+ if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
+ atomic_dec_if_positive(&ns->ctrl->nr_active);
+
if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
return;
bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
@@ -150,16 +162,17 @@ void nvme_mpath_end_request(struct request *rq)
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list) {
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
if (!ns->head->disk)
continue;
kblockd_schedule_work(&ns->head->requeue_work);
- if (ctrl->state == NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
disk_uevent(ns->head->disk, KOBJ_CHANGE);
}
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
static const char *nvme_ana_state_names[] = {
@@ -193,13 +206,14 @@ out:
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list) {
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
nvme_mpath_clear_current_path(ns);
kblockd_schedule_work(&ns->head->requeue_work);
}
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -223,13 +237,14 @@ void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
+ enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);
+
/*
* We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
* still be able to complete assuming that the controller is connected.
* Otherwise it will fail immediately and return to the requeue list.
*/
- if (ns->ctrl->state != NVME_CTRL_LIVE &&
- ns->ctrl->state != NVME_CTRL_DELETING)
+ if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
return true;
if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
!test_bit(NVME_NS_READY, &ns->flags))
@@ -246,7 +261,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
if (nvme_path_is_disabled(ns))
continue;
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+ if (ns->ctrl->numa_node != NUMA_NO_NODE &&
+ READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
distance = node_distance(node, ns->ctrl->numa_node);
else
distance = LOCAL_DISTANCE;
@@ -286,10 +302,15 @@ static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}
-static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
- int node, struct nvme_ns *old)
+static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns, *found = NULL;
+ int node = numa_node_id();
+ struct nvme_ns *old = srcu_dereference(head->current_path[node],
+ &head->srcu);
+
+ if (unlikely(!old))
+ return __nvme_find_path(head, node);
if (list_is_singular(&head->list)) {
if (nvme_path_is_disabled(old))
@@ -329,13 +350,49 @@ out:
return found;
}
+static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
+{
+ struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
+ unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+ unsigned int depth;
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ if (nvme_path_is_disabled(ns))
+ continue;
+
+ depth = atomic_read(&ns->ctrl->nr_active);
+
+ switch (ns->ana_state) {
+ case NVME_ANA_OPTIMIZED:
+ if (depth < min_depth_opt) {
+ min_depth_opt = depth;
+ best_opt = ns;
+ }
+ break;
+ case NVME_ANA_NONOPTIMIZED:
+ if (depth < min_depth_nonopt) {
+ min_depth_nonopt = depth;
+ best_nonopt = ns;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (min_depth_opt == 0)
+ return best_opt;
+ }
+
+ return best_opt ? best_opt : best_nonopt;
+}
+
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
- return ns->ctrl->state == NVME_CTRL_LIVE &&
+ return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
ns->ana_state == NVME_ANA_OPTIMIZED;
}
-inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
+static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
{
int node = numa_node_id();
struct nvme_ns *ns;
@@ -343,22 +400,34 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
ns = srcu_dereference(head->current_path[node], &head->srcu);
if (unlikely(!ns))
return __nvme_find_path(head, node);
-
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
- return nvme_round_robin_path(head, node, ns);
if (unlikely(!nvme_path_is_optimized(ns)))
return __nvme_find_path(head, node);
return ns;
}
+inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
+{
+ switch (READ_ONCE(head->subsys->iopolicy)) {
+ case NVME_IOPOLICY_QD:
+ return nvme_queue_depth_path(head);
+ case NVME_IOPOLICY_RR:
+ return nvme_round_robin_path(head);
+ default:
+ return nvme_numa_path(head);
+ }
+}
+
static bool nvme_available_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns;
+ if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
+ return NULL;
+
list_for_each_entry_rcu(ns, &head->list, siblings) {
if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
continue;
- switch (ns->ctrl->state) {
+ switch (nvme_ctrl_state(ns->ctrl)) {
case NVME_CTRL_LIVE:
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
@@ -422,6 +491,21 @@ static void nvme_ns_head_release(struct gendisk *disk)
nvme_put_ns_head(disk->private_data);
}
+static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
+ enum blk_unique_id type)
+{
+ struct nvme_ns_head *head = disk->private_data;
+ struct nvme_ns *ns;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ srcu_idx = srcu_read_lock(&head->srcu);
+ ns = nvme_find_path(head);
+ if (ns)
+ ret = nvme_ns_get_unique_id(ns, id, type);
+ srcu_read_unlock(&head->srcu, srcu_idx);
+ return ret;
+}
+
#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data)
@@ -449,6 +533,7 @@ const struct block_device_operations nvme_ns_head_ops = {
.ioctl = nvme_ns_head_ioctl,
.compat_ioctl = blkdev_compat_ptr_ioctl,
.getgeo = nvme_getgeo,
+ .get_unique_id = nvme_ns_head_get_unique_id,
.report_zones = nvme_ns_head_report_zones,
.pr_ops = &nvme_pr_ops,
};
@@ -515,7 +600,7 @@ static void nvme_requeue_work(struct work_struct *work)
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
- bool vwc = false;
+ struct queue_limits lim;
mutex_init(&head->lock);
bio_list_init(&head->requeue_list);
@@ -531,36 +616,21 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
!nvme_is_unique_nsid(ctrl, head) || !multipath)
return 0;
- head->disk = blk_alloc_disk(ctrl->numa_node);
- if (!head->disk)
- return -ENOMEM;
+ blk_set_stacking_limits(&lim);
+ lim.dma_alignment = 3;
+ lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL;
+ if (head->ids.csi == NVME_CSI_ZNS)
+ lim.features |= BLK_FEAT_ZONED;
+ else
+ lim.max_zone_append_sectors = 0;
+
+ head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
+ if (IS_ERR(head->disk))
+ return PTR_ERR(head->disk);
head->disk->fops = &nvme_ns_head_ops;
head->disk->private_data = head;
sprintf(head->disk->disk_name, "nvme%dn%d",
ctrl->subsys->instance, head->instance);
-
- blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
- blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
- blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
- /*
- * This assumes all controllers that refer to a namespace either
- * support poll queues or not. That is not a strict guarantee,
- * but if the assumption is wrong the effect is only suboptimal
- * performance but not correctness problem.
- */
- if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
- ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
- blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
-
- /* set to a default value of 512 until the disk is validated */
- blk_queue_logical_block_size(head->disk->queue, 512);
- blk_set_stacking_limits(&head->disk->queue->limits);
- blk_queue_dma_alignment(head->disk->queue, 3);
-
- /* we need to propagate up the VMC settings */
- if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
- vwc = true;
- blk_queue_write_cache(head->disk->queue, vwc, vwc);
return 0;
}
@@ -579,9 +649,9 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
*/
if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
rc = device_add_disk(&head->subsys->dev, head->disk,
- nvme_ns_id_attr_groups);
+ nvme_ns_attr_groups);
if (rc) {
- clear_bit(NVME_NSHEAD_DISK_LIVE, &ns->flags);
+ clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
return;
}
nvme_add_ns_head_cdev(head);
@@ -592,7 +662,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
int node, srcu_idx;
srcu_idx = srcu_read_lock(&head->srcu);
- for_each_node(node)
+ for_each_online_node(node)
__nvme_find_path(head, node);
srcu_read_unlock(&head->srcu, srcu_idx);
}
@@ -667,7 +737,7 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
* controller is ready.
*/
if (nvme_state_is_live(ns->ana_state) &&
- ns->ctrl->state == NVME_CTRL_LIVE)
+ nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
nvme_mpath_set_live(ns);
}
@@ -677,6 +747,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
unsigned *nr_change_groups = data;
struct nvme_ns *ns;
+ int srcu_idx;
dev_dbg(ctrl->device, "ANA group %d: %s.\n",
le32_to_cpu(desc->grpid),
@@ -688,8 +759,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
if (!nr_nsids)
return 0;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list) {
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
unsigned nsid;
again:
nsid = le32_to_cpu(desc->nsids[n]);
@@ -702,7 +773,7 @@ again:
if (ns->head->ns_id > nsid)
goto again;
}
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
return 0;
}
@@ -748,7 +819,7 @@ static void nvme_ana_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
- if (ctrl->state != NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
return;
nvme_read_ana_log(ctrl);
@@ -796,6 +867,29 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}
+static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
+ int iopolicy)
+{
+ struct nvme_ctrl *ctrl;
+ int old_iopolicy = READ_ONCE(subsys->iopolicy);
+
+ if (old_iopolicy == iopolicy)
+ return;
+
+ WRITE_ONCE(subsys->iopolicy, iopolicy);
+
+ /* iopolicy changes clear the mpath by design */
+ mutex_lock(&nvme_subsystems_lock);
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
+ nvme_mpath_clear_ctrl_paths(ctrl);
+ mutex_unlock(&nvme_subsystems_lock);
+
+ pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
+ subsys->subnqn,
+ nvme_iopolicy_names[old_iopolicy],
+ nvme_iopolicy_names[iopolicy]);
+}
+
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
@@ -805,7 +899,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
- WRITE_ONCE(subsys->iopolicy, i);
+ nvme_subsys_iopolicy_update(subsys, i);
return count;
}
}
@@ -868,9 +962,6 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
nvme_mpath_set_live(ns);
}
- if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
- blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
- ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
ns->head->disk->nr_zones = ns->disk->nr_zones;
@@ -881,11 +972,16 @@ void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
- kblockd_schedule_work(&head->requeue_work);
- if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+ if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
nvme_cdev_del(&head->cdev, &head->cdev_device);
del_gendisk(head->disk);
}
+ /*
+ * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
+ * to allow multipath to fail all I/O.
+ */
+ synchronize_srcu(&head->srcu);
+ kblockd_schedule_work(&head->requeue_work);
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@@ -916,6 +1012,9 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
!(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
return 0;
+ /* initialize this in the identify path to cover controller resets */
+ atomic_set(&ctrl->nr_active, 0);
+
if (!ctrl->max_namespaces ||
ctrl->max_namespaces > le32_to_cpu(id->nn)) {
dev_err(ctrl->device,