Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r--	drivers/nvme/host/core.c	| 346
1 file changed, 206 insertions(+), 140 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 46a4c9c5ea96..85ab0fcf9e88 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -20,6 +20,7 @@
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/pm_qos.h>
+#include <linux/ratelimit.h>
#include <asm/unaligned.h>
#include "nvme.h"
@@ -131,7 +132,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl)
/*
* Only queue new scan work when admin and IO queues are both alive
*/
- if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
+ if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
queue_work(nvme_wq, &ctrl->scan_work);
}
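
The open-coded reads of ctrl->state in this file are converted to the nvme_ctrl_state() helper. The accessor itself lives in drivers/nvme/host/nvme.h and is not part of this diff; a minimal sketch of the shape it is assumed to have, pairing a READ_ONCE() with the WRITE_ONCE() added to nvme_change_ctrl_state() further down:

static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
{
	/*
	 * Lockless read; pairs with the WRITE_ONCE() done under ctrl->lock
	 * in nvme_change_ctrl_state().
	 */
	return READ_ONCE(ctrl->state);
}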
@@ -143,7 +144,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl)
*/
int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
{
- if (ctrl->state != NVME_CTRL_RESETTING)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
return -EBUSY;
if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
return -EBUSY;
@@ -156,7 +157,7 @@ static void nvme_failfast_work(struct work_struct *work)
struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvme_ctrl, failfast_work);
- if (ctrl->state != NVME_CTRL_CONNECTING)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
return;
set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
@@ -200,7 +201,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
ret = nvme_reset_ctrl(ctrl);
if (!ret) {
flush_work(&ctrl->reset_work);
- if (ctrl->state != NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
ret = -ENETRESET;
}
@@ -312,12 +313,12 @@ static void nvme_log_error(struct request *req)
struct nvme_request *nr = nvme_req(req);
if (ns) {
- pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
+ pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
ns->disk ? ns->disk->disk_name : "?",
nvme_get_opcode_str(nr->cmd->common.opcode),
nr->cmd->common.opcode,
- (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
- (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
+ nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
+ blk_rq_bytes(req) >> ns->head->lba_shift,
nvme_get_error_status_str(nr->status),
nr->status >> 8 & 7, /* Status Code Type */
nr->status & 0xff, /* Status Code */
@@ -372,9 +373,12 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
static inline void nvme_end_req_zoned(struct request *req)
{
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = nvme_lba_to_sect(req->q->queuedata,
+ req_op(req) == REQ_OP_ZONE_APPEND) {
+ struct nvme_ns *ns = req->q->queuedata;
+
+ req->__sector = nvme_lba_to_sect(ns->head,
le64_to_cpu(nvme_req(req)->result.u64));
+ }
}
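
The LBA/sector conversion helpers used above now take the shared struct nvme_ns_head instead of the per-path struct nvme_ns. They are defined in drivers/nvme/host/nvme.h rather than in this file; a sketch of their assumed shape, plain shifts between 512-byte sectors and namespace logical blocks:

static inline u64 nvme_sect_to_lba(struct nvme_ns_head *head, sector_t sector)
{
	/* 512-byte sectors -> namespace logical blocks */
	return sector >> (head->lba_shift - SECTOR_SHIFT);
}

static inline sector_t nvme_lba_to_sect(struct nvme_ns_head *head, u64 lba)
{
	/* namespace logical blocks -> 512-byte sectors */
	return lba << (head->lba_shift - SECTOR_SHIFT);
}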
static inline void nvme_end_req(struct request *req)
@@ -499,7 +503,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
spin_lock_irqsave(&ctrl->lock, flags);
- old_state = ctrl->state;
+ old_state = nvme_ctrl_state(ctrl);
switch (new_state) {
case NVME_CTRL_LIVE:
switch (old_state) {
@@ -567,7 +571,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
}
if (changed) {
- ctrl->state = new_state;
+ WRITE_ONCE(ctrl->state, new_state);
wake_up_all(&ctrl->state_wq);
}
@@ -575,11 +579,11 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
if (!changed)
return false;
- if (ctrl->state == NVME_CTRL_LIVE) {
+ if (new_state == NVME_CTRL_LIVE) {
if (old_state == NVME_CTRL_CONNECTING)
nvme_stop_failfast_work(ctrl);
nvme_kick_requeue_lists(ctrl);
- } else if (ctrl->state == NVME_CTRL_CONNECTING &&
+ } else if (new_state == NVME_CTRL_CONNECTING &&
old_state == NVME_CTRL_RESETTING) {
nvme_start_failfast_work(ctrl);
}
@@ -592,7 +596,7 @@ EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
*/
static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
{
- switch (ctrl->state) {
+ switch (nvme_ctrl_state(ctrl)) {
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
case NVME_CTRL_RESETTING:
@@ -617,7 +621,7 @@ bool nvme_wait_reset(struct nvme_ctrl *ctrl)
wait_event(ctrl->state_wq,
nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
nvme_state_terminal(ctrl));
- return ctrl->state == NVME_CTRL_RESETTING;
+ return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
}
EXPORT_SYMBOL_GPL(nvme_wait_reset);
@@ -704,9 +708,11 @@ EXPORT_SYMBOL_GPL(nvme_init_request);
blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
struct request *rq)
{
- if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
- ctrl->state != NVME_CTRL_DELETING &&
- ctrl->state != NVME_CTRL_DEAD &&
+ enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+
+ if (state != NVME_CTRL_DELETING_NOIO &&
+ state != NVME_CTRL_DELETING &&
+ state != NVME_CTRL_DEAD &&
!test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;
@@ -736,7 +742,7 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
* command, which is required to set the queue live in the
* appropriate states.
*/
- switch (ctrl->state) {
+ switch (nvme_ctrl_state(ctrl)) {
case NVME_CTRL_CONNECTING:
if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
(req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
@@ -791,8 +797,8 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
}
if (queue_max_discard_segments(req->q) == 1) {
- u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
- u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9);
+ u64 slba = nvme_sect_to_lba(ns->head, blk_rq_pos(req));
+ u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
range[0].cattr = cpu_to_le32(0);
range[0].nlb = cpu_to_le32(nlb);
@@ -800,8 +806,9 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
n = 1;
} else {
__rq_for_each_bio(bio, req) {
- u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
- u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
+ u64 slba = nvme_sect_to_lba(ns->head,
+ bio->bi_iter.bi_sector);
+ u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
if (n < segments) {
range[n].cattr = cpu_to_le32(0);
@@ -839,7 +846,7 @@ static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
u64 ref48;
/* both rw and write zeroes share the same reftag format */
- switch (ns->guard_type) {
+ switch (ns->head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
@@ -867,17 +874,18 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->write_zeroes.slba =
- cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+ cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
cmnd->write_zeroes.length =
- cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
- if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
+ if (!(req->cmd_flags & REQ_NOUNMAP) &&
+ (ns->head->features & NVME_NS_DEAC))
cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
- if (nvme_ns_has_pi(ns)) {
+ if (nvme_ns_has_pi(ns->head)) {
cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
- switch (ns->pi_type) {
+ switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
nvme_set_ref_tag(ns, cmnd, req);
@@ -909,13 +917,15 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.cdw2 = 0;
cmnd->rw.cdw3 = 0;
cmnd->rw.metadata = 0;
- cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
- cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ cmnd->rw.slba =
+ cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
+ cmnd->rw.length =
+ cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
cmnd->rw.reftag = 0;
cmnd->rw.apptag = 0;
cmnd->rw.appmask = 0;
- if (ns->ms) {
+ if (ns->head->ms) {
/*
* If formatted with metadata, the block layer always provides a
* metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
@@ -923,12 +933,12 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
* namespace capacity to zero to prevent any I/O.
*/
if (!blk_integrity_rq(req)) {
- if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
+ if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
return BLK_STS_NOTSUPP;
control |= NVME_RW_PRINFO_PRACT;
}
- switch (ns->pi_type) {
+ switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
control |= NVME_RW_PRINFO_PRCHK_GUARD;
break;
@@ -1192,8 +1202,16 @@ static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
{
- queue_delayed_work(nvme_wq, &ctrl->ka_work,
- nvme_keep_alive_work_period(ctrl));
+ unsigned long now = jiffies;
+ unsigned long delay = nvme_keep_alive_work_period(ctrl);
+ unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
+
+ if (time_after(now, ka_next_check_tm))
+ delay = 0;
+ else
+ delay = ka_next_check_tm - now;
+
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
}
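
The rescheduling above anchors the next keep-alive to the last successful check (ctrl->ka_last_check_time) instead of to the moment the work is queued. A worked example in jiffies, assuming HZ=1000 and a keep-alive period of 5 seconds (nvme_keep_alive_work_period() is typically on the order of half the KATO value):

/*
 * ka_last_check_time = 10000, delay = 5000  =>  ka_next_check_tm = 15000
 *
 *   now = 16000: time_after(now, ka_next_check_tm) is true, so delay = 0
 *                and the keep-alive is sent immediately.
 *   now = 12000: delay = 15000 - 12000 = 3000 jiffies (3 s), keeping the
 *                keep-alive on its original cadence.
 */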
static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
@@ -1442,7 +1460,7 @@ free_data:
return status;
}
-static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_id_ns **id)
{
struct nvme_command c = { };
@@ -1479,7 +1497,8 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
if (id->ncap == 0) {
/* namespace not allocated or attached */
info->is_removed = true;
- return -ENODEV;
+ ret = -ENODEV;
+ goto error;
}
info->anagrpid = id->anagrpid;
@@ -1497,8 +1516,10 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
}
+
+error:
kfree(id);
- return 0;
+ return ret;
}
static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
@@ -1658,14 +1679,14 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
- u32 max_integrity_segments)
+static void nvme_init_integrity(struct gendisk *disk,
+ struct nvme_ns_head *head, u32 max_integrity_segments)
{
struct blk_integrity integrity = { };
- switch (ns->pi_type) {
+ switch (head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
- switch (ns->guard_type) {
+ switch (head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
integrity.profile = &t10_pi_type3_crc;
integrity.tag_size = sizeof(u16) + sizeof(u32);
@@ -1683,7 +1704,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
break;
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
- switch (ns->guard_type) {
+ switch (head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
integrity.profile = &t10_pi_type1_crc;
integrity.tag_size = sizeof(u16);
@@ -1704,27 +1725,28 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
break;
}
- integrity.tuple_size = ns->ms;
+ integrity.tuple_size = head->ms;
blk_integrity_register(disk, &integrity);
blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
}
#else
-static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
- u32 max_integrity_segments)
+static void nvme_init_integrity(struct gendisk *disk,
+ struct nvme_ns_head *head, u32 max_integrity_segments)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
-static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
+static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
+ struct nvme_ns_head *head)
{
- struct nvme_ctrl *ctrl = ns->ctrl;
struct request_queue *queue = disk->queue;
- u32 size = queue_logical_block_size(queue);
-
- if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
- ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);
+ u32 max_discard_sectors;
- if (ctrl->max_discard_sectors == 0) {
+ if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
+ max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
+ } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
+ max_discard_sectors = UINT_MAX;
+ } else {
blk_queue_max_discard_sectors(queue, 0);
return;
}
@@ -1732,14 +1754,22 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- queue->limits.discard_granularity = size;
-
- /* If discard is already enabled, don't reset queue limits */
+ /*
+ * If discard is already enabled, don't reset queue limits.
+ *
+ * This works around the fact that the block layer can't cope well with
+ * updating the hardware limits when overridden through sysfs. This is
+ * harmless because discard limits in NVMe are purely advisory.
+ */
if (queue->limits.max_discard_sectors)
return;
- blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
- blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
+ blk_queue_max_discard_sectors(queue, max_discard_sectors);
+ if (ctrl->dmrl)
+ blk_queue_max_discard_segments(queue, ctrl->dmrl);
+ else
+ blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
+ queue->limits.discard_granularity = queue_logical_block_size(queue);
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
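
DMRSL (Dataset Management Range Size Limit) is reported in logical blocks, so the cap computed above is run through nvme_lba_to_sect() to express it in 512-byte sectors. A worked example for a hypothetical namespace formatted with 4 KiB LBAs (head->lba_shift == 12):

/*
 * ctrl->dmrsl = 0x10000 logical blocks
 * max_discard_sectors = nvme_lba_to_sect(head, 0x10000)
 *                     = 0x10000 << (12 - 9)
 *                     = 0x80000 sectors (256 MiB per discard request)
 */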
@@ -1753,21 +1783,21 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
a->csi == b->csi;
}
-static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
+ struct nvme_id_ns *id)
{
bool first = id->dps & NVME_NS_DPS_PI_FIRST;
unsigned lbaf = nvme_lbaf_index(id->flbas);
- struct nvme_ctrl *ctrl = ns->ctrl;
struct nvme_command c = { };
struct nvme_id_ns_nvm *nvm;
int ret = 0;
u32 elbaf;
- ns->pi_size = 0;
- ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+ head->pi_size = 0;
+ head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
- ns->pi_size = sizeof(struct t10_pi_tuple);
- ns->guard_type = NVME_NVM_NS_16B_GUARD;
+ head->pi_size = sizeof(struct t10_pi_tuple);
+ head->guard_type = NVME_NVM_NS_16B_GUARD;
goto set_pi;
}
@@ -1776,11 +1806,11 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
return -ENOMEM;
c.identify.opcode = nvme_admin_identify;
- c.identify.nsid = cpu_to_le32(ns->head->ns_id);
+ c.identify.nsid = cpu_to_le32(head->ns_id);
c.identify.cns = NVME_ID_CNS_CS_NS;
c.identify.csi = NVME_CSI_NVM;
- ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, nvm, sizeof(*nvm));
+ ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
if (ret)
goto free_data;
@@ -1790,13 +1820,13 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
if (nvme_elbaf_sts(elbaf))
goto free_data;
- ns->guard_type = nvme_elbaf_guard_type(elbaf);
- switch (ns->guard_type) {
+ head->guard_type = nvme_elbaf_guard_type(elbaf);
+ switch (head->guard_type) {
case NVME_NVM_NS_64B_GUARD:
- ns->pi_size = sizeof(struct crc64_pi_tuple);
+ head->pi_size = sizeof(struct crc64_pi_tuple);
break;
case NVME_NVM_NS_16B_GUARD:
- ns->pi_size = sizeof(struct t10_pi_tuple);
+ head->pi_size = sizeof(struct t10_pi_tuple);
break;
default:
break;
@@ -1805,25 +1835,25 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
free_data:
kfree(nvm);
set_pi:
- if (ns->pi_size && (first || ns->ms == ns->pi_size))
- ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ if (head->pi_size && (first || head->ms == head->pi_size))
+ head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
else
- ns->pi_type = 0;
+ head->pi_type = 0;
return ret;
}
-static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
+ struct nvme_ns_head *head, struct nvme_id_ns *id)
{
- struct nvme_ctrl *ctrl = ns->ctrl;
int ret;
- ret = nvme_init_ms(ns, id);
+ ret = nvme_init_ms(ctrl, head, id);
if (ret)
return ret;
- ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
- if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+ head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+ if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
return 0;
if (ctrl->ops->flags & NVME_F_FABRICS) {
@@ -1835,7 +1865,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
return 0;
- ns->features |= NVME_NS_EXT_LBAS;
+ head->features |= NVME_NS_EXT_LBAS;
/*
* The current fabrics transport drivers support namespace
@@ -1846,8 +1876,8 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* Note, this check will need to be modified if any drivers
* gain the ability to use other metadata formats.
*/
- if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
- ns->features |= NVME_NS_METADATA_SUPPORTED;
+ if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
+ head->features |= NVME_NS_METADATA_SUPPORTED;
} else {
/*
* For PCIe controllers, we can't easily remap the separate
@@ -1856,9 +1886,9 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* We allow extended LBAs for the passthrough interface, though.
*/
if (id->flbas & NVME_NS_FLBAS_META_EXT)
- ns->features |= NVME_NS_EXT_LBAS;
+ head->features |= NVME_NS_EXT_LBAS;
else
- ns->features |= NVME_NS_METADATA_SUPPORTED;
+ head->features |= NVME_NS_METADATA_SUPPORTED;
}
return 0;
}
@@ -1881,18 +1911,19 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
blk_queue_write_cache(q, vwc, vwc);
}
-static void nvme_update_disk_info(struct gendisk *disk,
- struct nvme_ns *ns, struct nvme_id_ns *id)
+static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
+ struct nvme_ns_head *head, struct nvme_id_ns *id)
{
- sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
- u32 bs = 1U << ns->lba_shift;
+ sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze));
+ u32 bs = 1U << head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
/*
* The block layer can't support LBA sizes larger than the page size
- * yet, so catch this early and don't allow block I/O.
+ * or smaller than a sector size yet, so catch this early and don't
+ * allow block I/O.
*/
- if (ns->lba_shift > PAGE_SHIFT) {
+ if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
capacity = 0;
bs = (1 << 9);
}
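
The widened check above now rejects LBA sizes at both extremes. A few concrete cases, assuming the usual 4 KiB PAGE_SIZE (PAGE_SHIFT == 12) and SECTOR_SHIFT == 9:

/*
 * head->lba_shift = 14 (16 KiB LBA) > PAGE_SHIFT   => capacity forced to 0
 * head->lba_shift =  8 (256 B LBA)  < SECTOR_SHIFT => capacity forced to 0
 * head->lba_shift =  9..12                         => accepted, bs = 512..4096
 */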
@@ -1909,7 +1940,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else
- atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
+ atomic_bs = (1 + ctrl->subsys->awupf) * bs;
}
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
@@ -1935,20 +1966,20 @@ static void nvme_update_disk_info(struct gendisk *disk,
* I/O to namespaces with metadata except when the namespace supports
* PI, as it can strip/insert in that case.
*/
- if (ns->ms) {
+ if (head->ms) {
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
- (ns->features & NVME_NS_METADATA_SUPPORTED))
- nvme_init_integrity(disk, ns,
- ns->ctrl->max_integrity_segments);
- else if (!nvme_ns_has_pi(ns))
+ (head->features & NVME_NS_METADATA_SUPPORTED))
+ nvme_init_integrity(disk, head,
+ ctrl->max_integrity_segments);
+ else if (!nvme_ns_has_pi(head))
capacity = 0;
}
set_capacity_and_notify(disk, capacity);
- nvme_config_discard(disk, ns);
+ nvme_config_discard(ctrl, disk, head);
blk_queue_max_write_zeroes_sectors(disk->queue,
- ns->ctrl->max_zeroes_sectors);
+ ctrl->max_zeroes_sectors);
}
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
@@ -1971,7 +2002,7 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
is_power_of_2(ctrl->max_hw_sectors))
iob = ctrl->max_hw_sectors;
else
- iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
+ iob = nvme_lba_to_sect(ns->head, le16_to_cpu(id->noiob));
if (!iob)
return;
@@ -2029,18 +2060,26 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (ret)
return ret;
+ if (id->ncap == 0) {
+ /* namespace not allocated or attached */
+ info->is_removed = true;
+ ret = -ENODEV;
+ goto error;
+ }
+
blk_mq_freeze_queue(ns->disk->queue);
lbaf = nvme_lbaf_index(id->flbas);
- ns->lba_shift = id->lbaf[lbaf].ds;
+ ns->head->lba_shift = id->lbaf[lbaf].ds;
+ ns->head->nuse = le64_to_cpu(id->nuse);
nvme_set_queue_limits(ns->ctrl, ns->queue);
- ret = nvme_configure_metadata(ns, id);
+ ret = nvme_configure_metadata(ns->ctrl, ns->head, id);
if (ret < 0) {
blk_mq_unfreeze_queue(ns->disk->queue);
goto out;
}
nvme_set_chunk_sectors(ns, id);
- nvme_update_disk_info(ns->disk, ns, id);
+ nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id);
if (ns->head->ids.csi == NVME_CSI_ZNS) {
ret = nvme_update_zone_info(ns, lbaf);
@@ -2057,7 +2096,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
* do not return zeroes.
*/
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
- ns->features |= NVME_NS_DEAC;
+ ns->head->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
@@ -2070,7 +2109,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (nvme_ns_head_multipath(ns->head)) {
blk_mq_freeze_queue(ns->head->disk->queue);
- nvme_update_disk_info(ns->head->disk, ns, id);
+ nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id);
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
blk_stack_limits(&ns->head->disk->queue->limits,
@@ -2090,6 +2129,8 @@ out:
set_bit(NVME_NS_READY, &ns->flags);
ret = 0;
}
+
+error:
kfree(id);
return ret;
}
@@ -2529,7 +2570,7 @@ static void nvme_set_latency_tolerance(struct device *dev, s32 val)
if (ctrl->ps_max_latency_us != latency) {
ctrl->ps_max_latency_us = latency;
- if (ctrl->state == NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
nvme_configure_apst(ctrl);
}
}
@@ -2897,14 +2938,6 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
struct nvme_id_ctrl_nvm *id;
int ret;
- if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
- ctrl->max_discard_sectors = UINT_MAX;
- ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
- } else {
- ctrl->max_discard_sectors = 0;
- ctrl->max_discard_segments = 0;
- }
-
/*
* Even though NVMe spec explicitly states that MDTS is not applicable
* to the write-zeroes, we are cautious and limit the size to the
@@ -2934,8 +2967,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
if (ret)
goto free_data;
- if (id->dmrl)
- ctrl->max_discard_segments = id->dmrl;
+ ctrl->dmrl = id->dmrl;
ctrl->dmrsl = le32_to_cpu(id->dmrsl);
if (id->wzsl)
ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
@@ -3003,6 +3035,42 @@ static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
return 0;
}
+static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+ /*
+ * In fabrics we need to verify the cntlid matches the
+ * admin connect
+ */
+ if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
+ dev_err(ctrl->device,
+ "Mismatching cntlid: Connect %u vs Identify %u, rejecting\n",
+ ctrl->cntlid, le16_to_cpu(id->cntlid));
+ return -EINVAL;
+ }
+
+ if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
+ dev_err(ctrl->device,
+ "keep-alive support is mandatory for fabrics\n");
+ return -EINVAL;
+ }
+
+ if (!nvme_discovery_ctrl(ctrl) && ctrl->ioccsz < 4) {
+ dev_err(ctrl->device,
+ "I/O queue command capsule supported size %d < 4\n",
+ ctrl->ioccsz);
+ return -EINVAL;
+ }
+
+ if (!nvme_discovery_ctrl(ctrl) && ctrl->iorcsz < 1) {
+ dev_err(ctrl->device,
+ "I/O queue response capsule supported size %d < 1\n",
+ ctrl->iorcsz);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int nvme_init_identify(struct nvme_ctrl *ctrl)
{
struct nvme_id_ctrl *id;
@@ -3115,25 +3183,9 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->iorcsz = le32_to_cpu(id->iorcsz);
ctrl->maxcmd = le16_to_cpu(id->maxcmd);
- /*
- * In fabrics we need to verify the cntlid matches the
- * admin connect
- */
- if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
- dev_err(ctrl->device,
- "Mismatching cntlid: Connect %u vs Identify "
- "%u, rejecting\n",
- ctrl->cntlid, le16_to_cpu(id->cntlid));
- ret = -EINVAL;
- goto out_free;
- }
-
- if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
- dev_err(ctrl->device,
- "keep-alive support is mandatory for fabrics\n");
- ret = -EINVAL;
+ ret = nvme_check_ctrl_fabric_info(ctrl, id);
+ if (ret)
goto out_free;
- }
} else {
ctrl->hmpre = le32_to_cpu(id->hmpre);
ctrl->hmmin = le32_to_cpu(id->hmmin);
@@ -3217,7 +3269,7 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
struct nvme_ctrl *ctrl =
container_of(inode->i_cdev, struct nvme_ctrl, cdev);
- switch (ctrl->state) {
+ switch (nvme_ctrl_state(ctrl)) {
case NVME_CTRL_LIVE:
break;
default:
@@ -3392,6 +3444,8 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ns_id = info->nsid;
head->ids = info->ids;
head->shared = info->is_shared;
+ ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
+ ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
kref_init(&head->ref);
if (head->ids.csi) {
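
The new rs_nuse state initialized above allows at most one pass per 5-second window, with the "callbacks suppressed" message deferred until the state is released. Its consumer is outside this file; a hypothetical sketch of how such a throttle is typically used (the function name and the refresh step are illustrative, not taken from this patch):

static int nvme_ns_head_refresh_nuse(struct nvme_ns_head *head)
{
	if (!__ratelimit(&head->rs_nuse))
		return 0;	/* within the window: keep the cached head->nuse */

	/* outside the window: re-read NUSE via Identify Namespace (not shown) */
	return 0;
}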
@@ -3639,11 +3693,19 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
goto out_unlink_ns;
down_write(&ctrl->namespaces_rwsem);
+ /*
+ * Ensure that no namespaces are added to the ctrl list after the queues
+ * are frozen, thereby avoiding a deadlock between scan and reset.
+ */
+ if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
+ up_write(&ctrl->namespaces_rwsem);
+ goto out_unlink_ns;
+ }
nvme_ns_add_to_ctrl_list(ns);
up_write(&ctrl->namespaces_rwsem);
nvme_get_ctrl(ctrl);
- if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
+ if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
goto out_cleanup_ns_from_list;
if (!nvme_ns_head_multipath(ns->head))
@@ -3903,7 +3965,7 @@ static void nvme_scan_work(struct work_struct *work)
int ret;
/* No tagset on a live ctrl means IO queues could not be created */
- if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset)
return;
/*
@@ -3973,7 +4035,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
* removing the namespaces' disks; fail all the queues now to avoid
* potentially having to clean up the failed sync later.
*/
- if (ctrl->state == NVME_CTRL_DEAD)
+ if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
nvme_mark_namespaces_dead(ctrl);
/* this is a no-op when called from the controller reset handler */
@@ -4055,7 +4117,7 @@ static void nvme_async_event_work(struct work_struct *work)
* flushing ctrl async_event_work after changing the controller state
* from LIVE and before freeing the admin queue.
*/
- if (ctrl->state == NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
ctrl->ops->submit_async_event(ctrl);
}
@@ -4106,6 +4168,8 @@ static void nvme_fw_act_work(struct work_struct *work)
struct nvme_ctrl, fw_act_work);
unsigned long fw_act_timeout;
+ nvme_auth_stop(ctrl);
+
if (ctrl->mtfa)
fw_act_timeout = jiffies +
msecs_to_jiffies(ctrl->mtfa * 100);
@@ -4161,7 +4225,6 @@ static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
* firmware activation.
*/
if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
- nvme_auth_stop(ctrl);
requeue = false;
queue_work(nvme_wq, &ctrl->fw_act_work);
}
@@ -4450,7 +4513,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
{
int ret;
- ctrl->state = NVME_CTRL_NEW;
+ WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->scan_lock);
@@ -4471,6 +4534,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
+ ctrl->ka_last_check_time = jiffies;
BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
PAGE_SIZE);
@@ -4559,6 +4623,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl)
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_mq_unfreeze_queue(ns->queue);
up_read(&ctrl->namespaces_rwsem);
+ clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);
@@ -4592,6 +4657,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_freeze_queue_start(ns->queue);