summaryrefslogtreecommitdiff
path: root/drivers/nvme/host/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r--drivers/nvme/host/core.c567
1 files changed, 416 insertions, 151 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4ee2330c603e..88cff309d8e4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
static struct class *nvme_class;
static struct class *nvme_subsys_class;
-static int nvme_revalidate_disk(struct gendisk *disk);
+static int _nvme_revalidate_disk(struct gendisk *disk);
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
@@ -100,7 +100,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
* Revalidating a dead namespace sets capacity to 0. This will end
* buffered writers dirtying pages that can't be synced.
*/
- if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+ if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;
blk_set_queue_dying(ns->queue);
/* Forcibly unquiesce queues to avoid blocking dispatch */
@@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
nvme_retry_req(req);
return;
}
+ } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ req_op(req) == REQ_OP_ZONE_APPEND) {
+ req->__sector = nvme_lba_to_sect(req->q->queuedata,
+ le64_to_cpu(nvme_req(req)->result.u64));
}
nvme_trace_bio_complete(req, status);
@@ -304,7 +308,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
return true;
nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
- blk_mq_force_complete_rq(req);
+ blk_mq_complete_request(req);
return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@ -362,6 +366,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
break;
}
break;
+ case NVME_CTRL_DELETING_NOIO:
+ switch (old_state) {
+ case NVME_CTRL_DELETING:
+ case NVME_CTRL_DEAD:
+ changed = true;
+ /* FALLTHRU */
+ default:
+ break;
+ }
+ break;
case NVME_CTRL_DEAD:
switch (old_state) {
case NVME_CTRL_DELETING:
@@ -399,6 +413,7 @@ static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
case NVME_CTRL_CONNECTING:
return false;
case NVME_CTRL_DELETING:
+ case NVME_CTRL_DELETING_NOIO:
case NVME_CTRL_DEAD:
return true;
default:
@@ -450,10 +465,11 @@ static void nvme_free_ns(struct kref *kref)
kfree(ns);
}
-static void nvme_put_ns(struct nvme_ns *ns)
+void nvme_put_ns(struct nvme_ns *ns)
{
kref_put(&ns->kref, nvme_free_ns);
}
+EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
static inline void nvme_clear_nvme_request(struct request *req)
{
@@ -555,7 +571,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl)
goto out_disable_stream;
}
- ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
+ ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
return 0;
@@ -589,6 +605,14 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
}
+static void nvme_setup_passthrough(struct request *req,
+ struct nvme_command *cmd)
+{
+ memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
+ /* passthru commands should let the driver set the SGL flags */
+ cmd->common.flags &= ~NVME_CMD_SGL_ALL;
+}
+
static inline void nvme_setup_flush(struct nvme_ns *ns,
struct nvme_command *cmnd)
{
@@ -673,7 +697,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
}
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
- struct request *req, struct nvme_command *cmnd)
+ struct request *req, struct nvme_command *cmnd,
+ enum nvme_opcode op)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u16 control = 0;
@@ -687,7 +712,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
- cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+ cmnd->rw.opcode = op;
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
@@ -716,6 +741,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
+ if (op == nvme_cmd_zone_append)
+ control |= NVME_RW_APPEND_PIREMAP;
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
}
@@ -751,11 +778,24 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
switch (req_op(req)) {
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
- memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
+ nvme_setup_passthrough(req, cmd);
break;
case REQ_OP_FLUSH:
nvme_setup_flush(ns, cmd);
break;
+ case REQ_OP_ZONE_RESET_ALL:
+ case REQ_OP_ZONE_RESET:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
+ break;
+ case REQ_OP_ZONE_OPEN:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
+ break;
+ case REQ_OP_ZONE_FINISH:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
+ break;
case REQ_OP_WRITE_ZEROES:
ret = nvme_setup_write_zeroes(ns, req, cmd);
break;
@@ -763,8 +803,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
ret = nvme_setup_discard(ns, req, cmd);
break;
case REQ_OP_READ:
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
+ break;
case REQ_OP_WRITE:
- ret = nvme_setup_rw(ns, req, cmd);
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+ break;
+ case REQ_OP_ZONE_APPEND:
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
break;
default:
WARN_ON_ONCE(1);
@@ -884,6 +929,120 @@ out:
return ERR_PTR(ret);
}
+static u32 nvme_known_admin_effects(u8 opcode)
+{
+ switch (opcode) {
+ case nvme_admin_format_nvm:
+ return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
+ NVME_CMD_EFFECTS_CSE_MASK;
+ case nvme_admin_sanitize_nvm:
+ return NVME_CMD_EFFECTS_CSE_MASK;
+ default:
+ break;
+ }
+ return 0;
+}
+
+u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
+{
+ u32 effects = 0;
+
+ if (ns) {
+ if (ns->head->effects)
+ effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
+ if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
+ dev_warn(ctrl->device,
+ "IO command:%02x has unhandled effects:%08x\n",
+ opcode, effects);
+ return 0;
+ }
+
+ if (ctrl->effects)
+ effects = le32_to_cpu(ctrl->effects->acs[opcode]);
+ effects |= nvme_known_admin_effects(opcode);
+
+ return effects;
+}
+EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
+
+static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+ u8 opcode)
+{
+ u32 effects = nvme_command_effects(ctrl, ns, opcode);
+
+ /*
+ * For simplicity, IO to all namespaces is quiesced even if the command
+ * effects say only one namespace is affected.
+ */
+ if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+ mutex_lock(&ctrl->scan_lock);
+ mutex_lock(&ctrl->subsys->lock);
+ nvme_mpath_start_freeze(ctrl->subsys);
+ nvme_mpath_wait_freeze(ctrl->subsys);
+ nvme_start_freeze(ctrl);
+ nvme_wait_freeze(ctrl);
+ }
+ return effects;
+}
+
+static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
+{
+ struct nvme_ns *ns;
+
+ down_read(&ctrl->namespaces_rwsem);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ if (_nvme_revalidate_disk(ns->disk))
+ nvme_set_queue_dying(ns);
+ else if (blk_queue_is_zoned(ns->disk->queue)) {
+ /*
+ * IO commands are required to fully revalidate a zoned
+ * device. Force the command effects to trigger rescan
+ * work so report zones can run in a context with
+ * unfrozen IO queues.
+ */
+ *effects |= NVME_CMD_EFFECTS_NCC;
+ }
+ up_read(&ctrl->namespaces_rwsem);
+}
+
+static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
+{
+ /*
+ * Revalidate LBA changes prior to unfreezing. This is necessary to
+ * prevent memory corruption if a logical block size was changed by
+ * this command.
+ */
+ if (effects & NVME_CMD_EFFECTS_LBCC)
+ nvme_update_formats(ctrl, &effects);
+ if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+ nvme_unfreeze(ctrl);
+ nvme_mpath_unfreeze(ctrl->subsys);
+ mutex_unlock(&ctrl->subsys->lock);
+ nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
+ mutex_unlock(&ctrl->scan_lock);
+ }
+ if (effects & NVME_CMD_EFFECTS_CCC)
+ nvme_init_identify(ctrl);
+ if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
+ nvme_queue_scan(ctrl);
+ flush_work(&ctrl->scan_work);
+ }
+}
+
+void nvme_execute_passthru_rq(struct request *rq)
+{
+ struct nvme_command *cmd = nvme_req(rq)->cmd;
+ struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
+ struct nvme_ns *ns = rq->q->queuedata;
+ struct gendisk *disk = ns ? ns->disk : NULL;
+ u32 effects;
+
+ effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
+ blk_execute_rq(rq->q, disk, rq, 0);
+ nvme_passthru_end(ctrl, effects);
+}
+EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
+
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
@@ -922,7 +1081,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
}
}
- blk_execute_rq(req->q, disk, req, 0);
+ nvme_execute_passthru_rq(req);
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
ret = -EINTR;
else
@@ -1056,8 +1215,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
return error;
}
+static bool nvme_multi_css(struct nvme_ctrl *ctrl)
+{
+ return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
+}
+
static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
- struct nvme_ns_id_desc *cur)
+ struct nvme_ns_id_desc *cur, bool *csi_seen)
{
const char *warn_str = "ctrl returned bogus length:";
void *data = cur;
@@ -1087,6 +1251,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
}
uuid_copy(&ids->uuid, data + sizeof(*cur));
return NVME_NIDT_UUID_LEN;
+ case NVME_NIDT_CSI:
+ if (cur->nidl != NVME_NIDT_CSI_LEN) {
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
+ warn_str, cur->nidl);
+ return -1;
+ }
+ memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
+ *csi_seen = true;
+ return NVME_NIDT_CSI_LEN;
default:
/* Skip unknown types */
return cur->nidl;
@@ -1097,10 +1270,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_ns_ids *ids)
{
struct nvme_command c = { };
- int status;
+ bool csi_seen = false;
+ int status, pos, len;
void *data;
- int pos;
- int len;
if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
return 0;
@@ -1127,12 +1299,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (cur->nidl == 0)
break;
- len = nvme_process_ns_desc(ctrl, ids, cur);
+ len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
if (len < 0)
- goto free_data;
+ break;
len += sizeof(*cur);
}
+
+ if (nvme_multi_css(ctrl) && !csi_seen) {
+ dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
+ nsid);
+ status = -EINVAL;
+ }
+
free_data:
kfree(data);
return status;
@@ -1321,96 +1500,12 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
}
-static u32 nvme_known_admin_effects(u8 opcode)
-{
- switch (opcode) {
- case nvme_admin_format_nvm:
- return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
- NVME_CMD_EFFECTS_CSE_MASK;
- case nvme_admin_sanitize_nvm:
- return NVME_CMD_EFFECTS_CSE_MASK;
- default:
- break;
- }
- return 0;
-}
-
-static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- u8 opcode)
-{
- u32 effects = 0;
-
- if (ns) {
- if (ctrl->effects)
- effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
- if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
- dev_warn(ctrl->device,
- "IO command:%02x has unhandled effects:%08x\n",
- opcode, effects);
- return 0;
- }
-
- if (ctrl->effects)
- effects = le32_to_cpu(ctrl->effects->acs[opcode]);
- effects |= nvme_known_admin_effects(opcode);
-
- /*
- * For simplicity, IO to all namespaces is quiesced even if the command
- * effects say only one namespace is affected.
- */
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
- mutex_lock(&ctrl->scan_lock);
- mutex_lock(&ctrl->subsys->lock);
- nvme_mpath_start_freeze(ctrl->subsys);
- nvme_mpath_wait_freeze(ctrl->subsys);
- nvme_start_freeze(ctrl);
- nvme_wait_freeze(ctrl);
- }
- return effects;
-}
-
-static void nvme_update_formats(struct nvme_ctrl *ctrl)
-{
- struct nvme_ns *ns;
-
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
- if (ns->disk && nvme_revalidate_disk(ns->disk))
- nvme_set_queue_dying(ns);
- up_read(&ctrl->namespaces_rwsem);
-}
-
-static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
-{
- /*
- * Revalidate LBA changes prior to unfreezing. This is necessary to
- * prevent memory corruption if a logical block size was changed by
- * this command.
- */
- if (effects & NVME_CMD_EFFECTS_LBCC)
- nvme_update_formats(ctrl);
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
- nvme_unfreeze(ctrl);
- nvme_mpath_unfreeze(ctrl->subsys);
- mutex_unlock(&ctrl->subsys->lock);
- nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
- mutex_unlock(&ctrl->scan_lock);
- }
- if (effects & NVME_CMD_EFFECTS_CCC)
- nvme_init_identify(ctrl);
- if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
- nvme_queue_scan(ctrl);
- flush_work(&ctrl->scan_work);
- }
-}
-
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd __user *ucmd)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
unsigned timeout = 0;
- u32 effects;
u64 result;
int status;
@@ -1437,12 +1532,10 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
- effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
nvme_to_user_ptr(cmd.addr), cmd.data_len,
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &result, timeout);
- nvme_passthru_end(ctrl, effects);
if (status >= 0) {
if (put_user(result, &ucmd->result))
@@ -1458,7 +1551,6 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd64 cmd;
struct nvme_command c;
unsigned timeout = 0;
- u32 effects;
int status;
if (!capable(CAP_SYS_ADMIN))
@@ -1484,12 +1576,10 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
- effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
nvme_to_user_ptr(cmd.addr), cmd.data_len,
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &cmd.result, timeout);
- nvme_passthru_end(ctrl, effects);
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
@@ -1503,7 +1593,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
* Issue ioctl requests on the first available path. Note that unlike normal
* block layer requests we will not retry failed request on another controller.
*/
-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
struct nvme_ns_head **head, int *srcu_idx)
{
#ifdef CONFIG_NVME_MULTIPATH
@@ -1523,7 +1613,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
return disk->private_data;
}
-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
{
if (head)
srcu_read_unlock(&head->srcu, idx);
@@ -1789,7 +1879,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0))
memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
- if (ctrl->vs >= NVME_VS(1, 3, 0))
+ if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
return nvme_identify_ns_descs(ctrl, nsid, ids);
return 0;
}
@@ -1805,7 +1895,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
{
return uuid_equal(&a->uuid, &b->uuid) &&
memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
- memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
+ memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
+ a->csi == b->csi;
}
static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@@ -1915,18 +2006,38 @@ static void nvme_update_disk_info(struct gendisk *disk,
static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
+ unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
+ int ret;
u32 iob;
/*
* If identify namespace failed, use default 512 byte block size so
* block layer can use before failing read/write for 0 capacity.
*/
- ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
+ ns->lba_shift = id->lbaf[lbaf].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
+ switch (ns->head->ids.csi) {
+ case NVME_CSI_NVM:
+ break;
+ case NVME_CSI_ZNS:
+ ret = nvme_update_zone_info(disk, ns, lbaf);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "failed to add zoned namespace:%u ret:%d\n",
+ ns->head->ns_id, ret);
+ return ret;
+ }
+ break;
+ default:
+ dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
+ ns->head->ids.csi, ns->head->ns_id);
+ return -ENODEV;
+ }
+
if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
is_power_of_2(ctrl->max_hw_sectors))
iob = ctrl->max_hw_sectors;
@@ -1934,7 +2045,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
ns->features = 0;
- ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+ ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
/* the PI implementation requires metadata equal t10 pi tuple size */
if (ns->ms == sizeof(struct t10_pi_tuple))
ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
@@ -1970,14 +2081,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
#ifdef CONFIG_NVME_MULTIPATH
if (ns->head->disk) {
nvme_update_disk_info(ns->head->disk, ns, id);
- blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
+ blk_stack_limits(&ns->head->disk->queue->limits,
+ &ns->queue->limits, 0);
nvme_mpath_update_disk_size(ns->head->disk);
}
#endif
return 0;
}
-static int nvme_revalidate_disk(struct gendisk *disk)
+static int _nvme_revalidate_disk(struct gendisk *disk)
{
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
@@ -2025,6 +2137,28 @@ out:
return ret;
}
+static int nvme_revalidate_disk(struct gendisk *disk)
+{
+ int ret;
+
+ ret = _nvme_revalidate_disk(disk);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (blk_queue_is_zoned(disk->queue)) {
+ struct nvme_ns *ns = disk->private_data;
+ struct nvme_ctrl *ctrl = ns->ctrl;
+
+ ret = blk_revalidate_disk_zones(disk, NULL);
+ if (!ret)
+ blk_queue_max_zone_append_sectors(disk->queue,
+ ctrl->max_zone_append);
+ }
+#endif
+ return ret;
+}
+
static char nvme_pr_type(enum pr_type type)
{
switch (type) {
@@ -2155,6 +2289,7 @@ static const struct block_device_operations nvme_fops = {
.release = nvme_release,
.getgeo = nvme_getgeo,
.revalidate_disk= nvme_revalidate_disk,
+ .report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
@@ -2175,11 +2310,13 @@ static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
const struct block_device_operations nvme_ns_head_ops = {
.owner = THIS_MODULE,
+ .submit_bio = nvme_ns_head_submit_bio,
.open = nvme_ns_head_open,
.release = nvme_ns_head_release,
.ioctl = nvme_ioctl,
.compat_ioctl = nvme_compat_ioctl,
.getgeo = nvme_getgeo,
+ .report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
#endif /* CONFIG_NVME_MULTIPATH */
@@ -2237,12 +2374,7 @@ EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
- /*
- * Default to a 4K page size, with the intention to update this
- * path in the future to accomodate architectures with differing
- * kernel and IO page sizes.
- */
- unsigned dev_page_min, page_shift = 12;
+ unsigned dev_page_min;
int ret;
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
@@ -2252,17 +2384,18 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
}
dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
- if (page_shift < dev_page_min) {
+ if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
dev_err(ctrl->device,
"Minimum device page size %u too large for host (%u)\n",
- 1 << dev_page_min, 1 << page_shift);
+ 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
return -ENODEV;
}
- ctrl->page_size = 1 << page_shift;
-
- ctrl->ctrl_config = NVME_CC_CSS_NVM;
- ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
+ if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
+ ctrl->ctrl_config = NVME_CC_CSS_CSI;
+ else
+ ctrl->ctrl_config = NVME_CC_CSS_NVM;
+ ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
ctrl->ctrl_config |= NVME_CC_ENABLE;
@@ -2312,13 +2445,13 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
if (ctrl->max_hw_sectors) {
u32 max_segments =
- (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
+ (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
max_segments = min_not_zero(max_segments, ctrl->max_segments);
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
- blk_queue_virt_boundary(q, ctrl->page_size - 1);
+ blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
blk_queue_dma_alignment(q, 7);
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
@@ -2809,7 +2942,7 @@ out_unlock:
return ret;
}
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
void *log, size_t size, u64 offset)
{
struct nvme_command c = { };
@@ -2823,27 +2956,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
+ c.get_log_page.csi = csi;
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
-static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
+static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
+{
+ struct nvme_cel *cel, *ret = NULL;
+
+ spin_lock(&ctrl->lock);
+ list_for_each_entry(cel, &ctrl->cels, entry) {
+ if (cel->csi == csi) {
+ ret = cel;
+ break;
+ }
+ }
+ spin_unlock(&ctrl->lock);
+
+ return ret;
+}
+
+static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
+ struct nvme_effects_log **log)
{
+ struct nvme_cel *cel = nvme_find_cel(ctrl, csi);
int ret;
- if (!ctrl->effects)
- ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
+ if (cel)
+ goto out;
- if (!ctrl->effects)
- return 0;
+ cel = kzalloc(sizeof(*cel), GFP_KERNEL);
+ if (!cel)
+ return -ENOMEM;
- ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
- ctrl->effects, sizeof(*ctrl->effects), 0);
+ ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi,
+ &cel->log, sizeof(cel->log), 0);
if (ret) {
- kfree(ctrl->effects);
- ctrl->effects = NULL;
+ kfree(cel);
+ return ret;
}
- return ret;
+
+ cel->csi = csi;
+
+ spin_lock(&ctrl->lock);
+ list_add_tail(&cel->entry, &ctrl->cels);
+ spin_unlock(&ctrl->lock);
+out:
+ *log = &cel->log;
+ return 0;
}
/*
@@ -2864,7 +3025,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
return ret;
}
page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
- ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
+ ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
if (ctrl->vs >= NVME_VS(1, 1, 0))
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
@@ -2876,7 +3037,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
}
if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
- ret = nvme_get_effects_log(ctrl);
+ ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
if (ret < 0)
goto out_free;
}
@@ -2938,7 +3099,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
if (id->rtd3e) {
/* us -> s */
- u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
+ u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
shutdown_timeout, 60);
@@ -3344,6 +3505,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
[NVME_CTRL_RESETTING] = "resetting",
[NVME_CTRL_CONNECTING] = "connecting",
[NVME_CTRL_DELETING] = "deleting",
+ [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
[NVME_CTRL_DEAD] = "dead",
};
@@ -3396,6 +3558,66 @@ static ssize_t nvme_sysfs_show_address(struct device *dev,
}
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
+static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+
+ if (ctrl->opts->max_reconnects == -1)
+ return sprintf(buf, "off\n");
+ return sprintf(buf, "%d\n",
+ opts->max_reconnects * opts->reconnect_delay);
+}
+
+static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ int ctrl_loss_tmo, err;
+
+ err = kstrtoint(buf, 10, &ctrl_loss_tmo);
+ if (err)
+ return -EINVAL;
+
+ else if (ctrl_loss_tmo < 0)
+ opts->max_reconnects = -1;
+ else
+ opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
+ opts->reconnect_delay);
+ return count;
+}
+static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
+ nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
+
+static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ if (ctrl->opts->reconnect_delay == -1)
+ return sprintf(buf, "off\n");
+ return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay);
+}
+
+static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ unsigned int v;
+ int err;
+
+ err = kstrtou32(buf, 10, &v);
+ if (err)
+ return err;
+
+ ctrl->opts->reconnect_delay = v;
+ return count;
+}
+static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
+ nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
+
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@@ -3413,6 +3635,8 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_sqsize.attr,
&dev_attr_hostnqn.attr,
&dev_attr_hostid.attr,
+ &dev_attr_ctrl_loss_tmo.attr,
+ &dev_attr_reconnect_delay.attr,
NULL
};
@@ -3509,6 +3733,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
goto out_cleanup_srcu;
}
+ if (head->ids.csi) {
+ ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
+ if (ret)
+ goto out_cleanup_srcu;
+ } else
+ head->effects = ctrl->effects;
+
ret = nvme_mpath_alloc_disk(ctrl, head);
if (ret)
goto out_cleanup_srcu;
@@ -3590,7 +3821,7 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
return nsa->head->ns_id - nsb->head->ns_id;
}
-static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns, *ret = NULL;
@@ -3608,6 +3839,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
up_read(&ctrl->namespaces_rwsem);
return ret;
}
+EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
@@ -3725,7 +3957,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_mpath_clear_current_path(ns);
synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
- if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
+ if (ns->disk->flags & GENHD_FL_UP) {
del_gendisk(ns->disk);
blk_cleanup_queue(ns->queue);
if (blk_get_integrity(ns->disk))
@@ -3756,7 +3988,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
ns = nvme_find_get_ns(ctrl, nsid);
if (ns) {
- if (ns->disk && revalidate_disk(ns->disk))
+ if (revalidate_disk(ns->disk))
nvme_ns_remove(ns);
nvme_put_ns(ns);
} else
@@ -3849,8 +4081,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
* raced with us in reading the log page, which could cause us to miss
* updates.
*/
- error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
- log_size, 0);
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
+ NVME_CSI_NVM, log, log_size, 0);
if (error)
dev_warn(ctrl->device,
"reading changed ns log failed: %d\n", error);
@@ -3911,6 +4143,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
if (ctrl->state == NVME_CTRL_DEAD)
nvme_kill_queues(ctrl);
+ /* this is a no-op when called from the controller reset handler */
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
+
down_write(&ctrl->namespaces_rwsem);
list_splice_init(&ctrl->namespaces, &ns_list);
up_write(&ctrl->namespaces_rwsem);
@@ -3994,8 +4229,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
if (!log)
return;
- if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
- sizeof(*log), 0))
+ if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
+ log, sizeof(*log), 0))
dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
kfree(log);
}
@@ -4105,8 +4340,7 @@ EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
- if (ctrl->kato)
- nvme_start_keep_alive(ctrl);
+ nvme_start_keep_alive(ctrl);
nvme_enable_aen(ctrl);
@@ -4132,11 +4366,16 @@ static void nvme_free_ctrl(struct device *dev)
struct nvme_ctrl *ctrl =
container_of(dev, struct nvme_ctrl, ctrl_device);
struct nvme_subsystem *subsys = ctrl->subsys;
+ struct nvme_cel *cel, *next;
if (subsys && ctrl->instance != subsys->instance)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
- kfree(ctrl->effects);
+ list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
+ list_del(&cel->entry);
+ kfree(cel);
+ }
+
nvme_mpath_uninit(ctrl);
__free_page(ctrl->discard_page);
@@ -4167,6 +4406,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
+ INIT_LIST_HEAD(&ctrl->cels);
init_rwsem(&ctrl->namespaces_rwsem);
ctrl->dev = dev;
ctrl->ops = ops;
@@ -4345,6 +4585,29 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);
+struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path)
+{
+ struct nvme_ctrl *ctrl;
+ struct file *f;
+
+ f = filp_open(path, O_RDWR, 0);
+ if (IS_ERR(f))
+ return ERR_CAST(f);
+
+ if (f->f_op != &nvme_dev_fops) {
+ ctrl = ERR_PTR(-EINVAL);
+ goto out_close;
+ }
+
+ ctrl = f->private_data;
+ nvme_get_ctrl(ctrl);
+
+out_close:
+ filp_close(f, NULL);
+ return ctrl;
+}
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU);
+
/*
* Check we didn't inadvertently grow the command structure sizes:
*/
@@ -4363,6 +4626,8 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);