summaryrefslogtreecommitdiff
path: root/drivers/nvme/host
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--drivers/nvme/host/Kconfig29
-rw-r--r--drivers/nvme/host/Makefile2
-rw-r--r--drivers/nvme/host/apple.c21
-rw-r--r--drivers/nvme/host/auth.c73
-rw-r--r--drivers/nvme/host/constants.c10
-rw-r--r--drivers/nvme/host/core.c1025
-rw-r--r--drivers/nvme/host/fabrics.c164
-rw-r--r--drivers/nvme/host/fabrics.h26
-rw-r--r--drivers/nvme/host/fc.c124
-rw-r--r--drivers/nvme/host/hwmon.c2
-rw-r--r--drivers/nvme/host/ioctl.c251
-rw-r--r--drivers/nvme/host/multipath.c63
-rw-r--r--drivers/nvme/host/nvme.h189
-rw-r--r--drivers/nvme/host/pci.c93
-rw-r--r--drivers/nvme/host/pr.c5
-rw-r--r--drivers/nvme/host/rdma.c90
-rw-r--r--drivers/nvme/host/sysfs.c194
-rw-r--r--drivers/nvme/host/tcp.c294
-rw-r--r--drivers/nvme/host/trace.c105
-rw-r--r--drivers/nvme/host/zns.c66
20 files changed, 1789 insertions, 1037 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 2f6a7f8c94e8..b309c8be720f 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -92,18 +92,27 @@ config NVME_TCP
If unsure, say N.
-config NVME_AUTH
- bool "NVM Express over Fabrics In-Band Authentication"
+config NVME_TCP_TLS
+ bool "NVMe over Fabrics TCP TLS encryption support"
+ depends on NVME_TCP
+ select NVME_KEYRING
+ select NET_HANDSHAKE
+ select KEYS
+ help
+ Enables TLS encryption for NVMe TCP using the netlink handshake API.
+
+ The TLS handshake daemon is availble at
+ https://github.com/oracle/ktls-utils.
+
+ If unsure, say N.
+
+config NVME_HOST_AUTH
+ bool "NVMe over Fabrics In-Band Authentication in host side"
depends on NVME_CORE
- select NVME_COMMON
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_SHA256
- select CRYPTO_SHA512
- select CRYPTO_DH
- select CRYPTO_DH_RFC7919_GROUPS
+ select NVME_AUTH
help
- This provides support for NVMe over Fabrics In-Band Authentication.
+ This provides support for NVMe over Fabrics In-Band Authentication in
+ host side.
If unsure, say N.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index c7c3cf202d12..6414ec968f99 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -17,7 +17,7 @@ nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
-nvme-core-$(CONFIG_NVME_AUTH) += auth.o
+nvme-core-$(CONFIG_NVME_HOST_AUTH) += auth.o
nvme-y += pci.o
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index 596bb11eeba5..dd6ec0865141 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -797,6 +797,7 @@ static int apple_nvme_init_request(struct blk_mq_tag_set *set,
static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
{
+ enum nvme_ctrl_state state = nvme_ctrl_state(&anv->ctrl);
u32 csts = readl(anv->mmio_nvme + NVME_REG_CSTS);
bool dead = false, freeze = false;
unsigned long flags;
@@ -808,8 +809,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
if (csts & NVME_CSTS_CFS)
dead = true;
- if (anv->ctrl.state == NVME_CTRL_LIVE ||
- anv->ctrl.state == NVME_CTRL_RESETTING) {
+ if (state == NVME_CTRL_LIVE ||
+ state == NVME_CTRL_RESETTING) {
freeze = true;
nvme_start_freeze(&anv->ctrl);
}
@@ -881,7 +882,7 @@ static enum blk_eh_timer_return apple_nvme_timeout(struct request *req)
unsigned long flags;
u32 csts = readl(anv->mmio_nvme + NVME_REG_CSTS);
- if (anv->ctrl.state != NVME_CTRL_LIVE) {
+ if (nvme_ctrl_state(&anv->ctrl) != NVME_CTRL_LIVE) {
/*
* From rdma.c:
* If we are resetting, connecting or deleting we should
@@ -985,10 +986,10 @@ static void apple_nvme_reset_work(struct work_struct *work)
u32 boot_status, aqa;
struct apple_nvme *anv =
container_of(work, struct apple_nvme, ctrl.reset_work);
+ enum nvme_ctrl_state state = nvme_ctrl_state(&anv->ctrl);
- if (anv->ctrl.state != NVME_CTRL_RESETTING) {
- dev_warn(anv->dev, "ctrl state %d is not RESETTING\n",
- anv->ctrl.state);
+ if (state != NVME_CTRL_RESETTING) {
+ dev_warn(anv->dev, "ctrl state %d is not RESETTING\n", state);
ret = -ENODEV;
goto out;
}
@@ -1515,7 +1516,7 @@ static int apple_nvme_probe(struct platform_device *pdev)
goto put_dev;
}
- anv->ctrl.admin_q = blk_mq_init_queue(&anv->admin_tagset);
+ anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL);
if (IS_ERR(anv->ctrl.admin_q)) {
ret = -ENOMEM;
goto put_dev;
@@ -1531,7 +1532,7 @@ put_dev:
return ret;
}
-static int apple_nvme_remove(struct platform_device *pdev)
+static void apple_nvme_remove(struct platform_device *pdev)
{
struct apple_nvme *anv = platform_get_drvdata(pdev);
@@ -1546,8 +1547,6 @@ static int apple_nvme_remove(struct platform_device *pdev)
apple_rtkit_shutdown(anv->rtk);
apple_nvme_detach_genpd(anv);
-
- return 0;
}
static void apple_nvme_shutdown(struct platform_device *pdev)
@@ -1597,7 +1596,7 @@ static struct platform_driver apple_nvme_driver = {
.pm = pm_sleep_ptr(&apple_nvme_pm_ops),
},
.probe = apple_nvme_probe,
- .remove = apple_nvme_remove,
+ .remove_new = apple_nvme_remove,
.shutdown = apple_nvme_shutdown,
};
module_platform_driver(apple_nvme_driver);
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index daf5d144a8ea..371e14f0a203 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -23,11 +23,13 @@ struct nvme_dhchap_queue_context {
struct nvme_ctrl *ctrl;
struct crypto_shash *shash_tfm;
struct crypto_kpp *dh_tfm;
+ struct nvme_dhchap_key *transformed_key;
void *buf;
int qid;
int error;
u32 s1;
u32 s2;
+ bool bi_directional;
u16 transaction;
u8 status;
u8 dhgroup_id;
@@ -36,7 +38,6 @@ struct nvme_dhchap_queue_context {
u8 c1[64];
u8 c2[64];
u8 response[64];
- u8 *host_response;
u8 *ctrl_key;
u8 *host_key;
u8 *sess_key;
@@ -47,11 +48,6 @@ struct nvme_dhchap_queue_context {
static struct workqueue_struct *nvme_auth_wq;
-#define nvme_auth_flags_from_qid(qid) \
- (qid == 0) ? 0 : BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED
-#define nvme_auth_queue_from_qid(ctrl, qid) \
- (qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q
-
static inline int ctrl_max_dhchaps(struct nvme_ctrl *ctrl)
{
return ctrl->opts->nr_io_queues + ctrl->opts->nr_write_queues +
@@ -62,10 +58,15 @@ static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid,
void *data, size_t data_len, bool auth_send)
{
struct nvme_command cmd = {};
- blk_mq_req_flags_t flags = nvme_auth_flags_from_qid(qid);
- struct request_queue *q = nvme_auth_queue_from_qid(ctrl, qid);
+ nvme_submit_flags_t flags = NVME_SUBMIT_RETRY;
+ struct request_queue *q = ctrl->fabrics_q;
int ret;
+ if (qid != 0) {
+ flags |= NVME_SUBMIT_NOWAIT | NVME_SUBMIT_RESERVED;
+ q = ctrl->connect_q;
+ }
+
cmd.auth_common.opcode = nvme_fabrics_command;
cmd.auth_common.secp = NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER;
cmd.auth_common.spsp0 = 0x01;
@@ -79,8 +80,7 @@ static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid,
}
ret = __nvme_submit_sync_cmd(q, &cmd, NULL, data, data_len,
- qid == 0 ? NVME_QID_ANY : qid,
- 0, flags);
+ qid == 0 ? NVME_QID_ANY : qid, flags);
if (ret > 0)
dev_warn(ctrl->device,
"qid %d auth_send failed with status %d\n", qid, ret);
@@ -312,17 +312,17 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
data->dhvlen = cpu_to_le16(chap->host_key_len);
memcpy(data->rval, chap->response, chap->hash_len);
if (ctrl->ctrl_key) {
+ chap->bi_directional = true;
get_random_bytes(chap->c2, chap->hash_len);
data->cvalid = 1;
- chap->s2 = nvme_auth_get_seqnum();
memcpy(data->rval + chap->hash_len, chap->c2,
chap->hash_len);
dev_dbg(ctrl->device, "%s: qid %d ctrl challenge %*ph\n",
__func__, chap->qid, (int)chap->hash_len, chap->c2);
} else {
memset(chap->c2, 0, chap->hash_len);
- chap->s2 = 0;
}
+ chap->s2 = nvme_auth_get_seqnum();
data->seqnum = cpu_to_le32(chap->s2);
if (chap->host_key_len) {
dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n",
@@ -339,10 +339,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl,
struct nvme_dhchap_queue_context *chap)
{
struct nvmf_auth_dhchap_success1_data *data = chap->buf;
- size_t size = sizeof(*data);
-
- if (chap->ctrl_key)
- size += chap->hash_len;
+ size_t size = sizeof(*data) + chap->hash_len;
if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
@@ -428,12 +425,12 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
dev_dbg(ctrl->device, "%s: qid %d host response seq %u transaction %d\n",
__func__, chap->qid, chap->s1, chap->transaction);
- if (!chap->host_response) {
- chap->host_response = nvme_auth_transform_key(ctrl->host_key,
+ if (!chap->transformed_key) {
+ chap->transformed_key = nvme_auth_transform_key(ctrl->host_key,
ctrl->opts->host->nqn);
- if (IS_ERR(chap->host_response)) {
- ret = PTR_ERR(chap->host_response);
- chap->host_response = NULL;
+ if (IS_ERR(chap->transformed_key)) {
+ ret = PTR_ERR(chap->transformed_key);
+ chap->transformed_key = NULL;
return ret;
}
} else {
@@ -442,7 +439,7 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
}
ret = crypto_shash_setkey(chap->shash_tfm,
- chap->host_response, ctrl->host_key->len);
+ chap->transformed_key->key, chap->transformed_key->len);
if (ret) {
dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
chap->qid, ret);
@@ -508,19 +505,19 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
struct nvme_dhchap_queue_context *chap)
{
SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
- u8 *ctrl_response;
+ struct nvme_dhchap_key *transformed_key;
u8 buf[4], *challenge = chap->c2;
int ret;
- ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+ transformed_key = nvme_auth_transform_key(ctrl->ctrl_key,
ctrl->opts->subsysnqn);
- if (IS_ERR(ctrl_response)) {
- ret = PTR_ERR(ctrl_response);
+ if (IS_ERR(transformed_key)) {
+ ret = PTR_ERR(transformed_key);
return ret;
}
ret = crypto_shash_setkey(chap->shash_tfm,
- ctrl_response, ctrl->ctrl_key->len);
+ transformed_key->key, transformed_key->len);
if (ret) {
dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
chap->qid, ret);
@@ -586,7 +583,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
out:
if (challenge != chap->c2)
kfree(challenge);
- kfree(ctrl_response);
+ nvme_auth_free_key(transformed_key);
return ret;
}
@@ -648,8 +645,8 @@ gen_sesskey:
static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
{
- kfree_sensitive(chap->host_response);
- chap->host_response = NULL;
+ nvme_auth_free_key(chap->transformed_key);
+ chap->transformed_key = NULL;
kfree_sensitive(chap->host_key);
chap->host_key = NULL;
chap->host_key_len = 0;
@@ -663,6 +660,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
chap->error = 0;
chap->s1 = 0;
chap->s2 = 0;
+ chap->bi_directional = false;
chap->transaction = 0;
memset(chap->c1, 0, sizeof(chap->c1));
memset(chap->c2, 0, sizeof(chap->c2));
@@ -732,7 +730,7 @@ static void nvme_queue_auth_work(struct work_struct *work)
NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE);
if (ret) {
chap->status = ret;
- chap->error = -ECONNREFUSED;
+ chap->error = -EKEYREJECTED;
return;
}
@@ -758,12 +756,11 @@ static void nvme_queue_auth_work(struct work_struct *work)
__func__, chap->qid);
mutex_lock(&ctrl->dhchap_auth_mutex);
ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
if (ret) {
- mutex_unlock(&ctrl->dhchap_auth_mutex);
chap->error = ret;
goto fail2;
}
- mutex_unlock(&ctrl->dhchap_auth_mutex);
/* DH-HMAC-CHAP Step 3: send reply */
dev_dbg(ctrl->device, "%s: qid %d send reply\n",
@@ -800,7 +797,7 @@ static void nvme_queue_auth_work(struct work_struct *work)
NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1);
if (ret) {
chap->status = ret;
- chap->error = -ECONNREFUSED;
+ chap->error = -EKEYREJECTED;
return;
}
@@ -821,11 +818,11 @@ static void nvme_queue_auth_work(struct work_struct *work)
ret = nvme_auth_process_dhchap_success1(ctrl, chap);
if (ret) {
/* Controller authentication failed */
- chap->error = -ECONNREFUSED;
+ chap->error = -EKEYREJECTED;
goto fail2;
}
- if (chap->ctrl_key) {
+ if (chap->bi_directional) {
/* DH-HMAC-CHAP Step 5: send success2 */
dev_dbg(ctrl->device, "%s: qid %d send success2\n",
__func__, chap->qid);
@@ -840,6 +837,8 @@ static void nvme_queue_auth_work(struct work_struct *work)
}
fail2:
+ if (chap->status == 0)
+ chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
dev_dbg(ctrl->device, "%s: qid %d send failure2, status %x\n",
__func__, chap->qid, chap->status);
tl = nvme_auth_set_dhchap_failure2_data(ctrl, chap);
@@ -897,7 +896,7 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
* If the ctrl is no connected, bail as reconnect will handle
* authentication.
*/
- if (ctrl->state != NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
return;
/* Authenticate admin queue first */
diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index 20f46c230885..6f2ebb5fcdb0 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -171,15 +171,15 @@ static const char * const nvme_statuses[] = {
[NVME_SC_HOST_ABORTED_CMD] = "Host Aborted Command",
};
-const unsigned char *nvme_get_error_status_str(u16 status)
+const char *nvme_get_error_status_str(u16 status)
{
status &= 0x7ff;
if (status < ARRAY_SIZE(nvme_statuses) && nvme_statuses[status])
- return nvme_statuses[status & 0x7ff];
+ return nvme_statuses[status];
return "Unknown";
}
-const unsigned char *nvme_get_opcode_str(u8 opcode)
+const char *nvme_get_opcode_str(u8 opcode)
{
if (opcode < ARRAY_SIZE(nvme_ops) && nvme_ops[opcode])
return nvme_ops[opcode];
@@ -187,7 +187,7 @@ const unsigned char *nvme_get_opcode_str(u8 opcode)
}
EXPORT_SYMBOL_GPL(nvme_get_opcode_str);
-const unsigned char *nvme_get_admin_opcode_str(u8 opcode)
+const char *nvme_get_admin_opcode_str(u8 opcode)
{
if (opcode < ARRAY_SIZE(nvme_admin_ops) && nvme_admin_ops[opcode])
return nvme_admin_ops[opcode];
@@ -195,7 +195,7 @@ const unsigned char *nvme_get_admin_opcode_str(u8 opcode)
}
EXPORT_SYMBOL_GPL(nvme_get_admin_opcode_str);
-const unsigned char *nvme_get_fabrics_opcode_str(u8 opcode) {
+const char *nvme_get_fabrics_opcode_str(u8 opcode) {
if (opcode < ARRAY_SIZE(nvme_fabrics_ops) && nvme_fabrics_ops[opcode])
return nvme_fabrics_ops[opcode];
return "Unknown";
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f3a01b79148c..782090ce0bc1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -20,6 +20,7 @@
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/pm_qos.h>
+#include <linux/ratelimit.h>
#include <asm/unaligned.h>
#include "nvme.h"
@@ -113,12 +114,21 @@ static DEFINE_MUTEX(nvme_subsystems_lock);
static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
-static struct class *nvme_class;
-static struct class *nvme_subsys_class;
+static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
+static const struct class nvme_class = {
+ .name = "nvme",
+ .dev_uevent = nvme_class_uevent,
+};
+
+static const struct class nvme_subsys_class = {
+ .name = "nvme-subsystem",
+};
static DEFINE_IDA(nvme_ns_chr_minor_ida);
static dev_t nvme_ns_chr_devt;
-static struct class *nvme_ns_chr_class;
+static const struct class nvme_ns_chr_class = {
+ .name = "nvme-generic",
+};
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -131,7 +141,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl)
/*
* Only new queue scan work when admin and IO queues are both alive
*/
- if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
+ if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
queue_work(nvme_wq, &ctrl->scan_work);
}
@@ -143,7 +153,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl)
*/
int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
{
- if (ctrl->state != NVME_CTRL_RESETTING)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
return -EBUSY;
if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
return -EBUSY;
@@ -156,7 +166,7 @@ static void nvme_failfast_work(struct work_struct *work)
struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvme_ctrl, failfast_work);
- if (ctrl->state != NVME_CTRL_CONNECTING)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
return;
set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
@@ -200,7 +210,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
ret = nvme_reset_ctrl(ctrl);
if (!ret) {
flush_work(&ctrl->reset_work);
- if (ctrl->state != NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
ret = -ENETRESET;
}
@@ -312,12 +322,12 @@ static void nvme_log_error(struct request *req)
struct nvme_request *nr = nvme_req(req);
if (ns) {
- pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
+ pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
ns->disk ? ns->disk->disk_name : "?",
nvme_get_opcode_str(nr->cmd->common.opcode),
nr->cmd->common.opcode,
- (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
- (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
+ nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
+ blk_rq_bytes(req) >> ns->head->lba_shift,
nvme_get_error_status_str(nr->status),
nr->status >> 8 & 7, /* Status Code Type */
nr->status & 0xff, /* Status Code */
@@ -337,6 +347,30 @@ static void nvme_log_error(struct request *req)
nr->status & NVME_SC_DNR ? "DNR " : "");
}
+static void nvme_log_err_passthru(struct request *req)
+{
+ struct nvme_ns *ns = req->q->queuedata;
+ struct nvme_request *nr = nvme_req(req);
+
+ pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s"
+ "cdw10=0x%x cdw11=0x%x cdw12=0x%x cdw13=0x%x cdw14=0x%x cdw15=0x%x\n",
+ ns ? ns->disk->disk_name : dev_name(nr->ctrl->device),
+ ns ? nvme_get_opcode_str(nr->cmd->common.opcode) :
+ nvme_get_admin_opcode_str(nr->cmd->common.opcode),
+ nr->cmd->common.opcode,
+ nvme_get_error_status_str(nr->status),
+ nr->status >> 8 & 7, /* Status Code Type */
+ nr->status & 0xff, /* Status Code */
+ nr->status & NVME_SC_MORE ? "MORE " : "",
+ nr->status & NVME_SC_DNR ? "DNR " : "",
+ nr->cmd->common.cdw10,
+ nr->cmd->common.cdw11,
+ nr->cmd->common.cdw12,
+ nr->cmd->common.cdw13,
+ nr->cmd->common.cdw14,
+ nr->cmd->common.cdw14);
+}
+
enum nvme_disposition {
COMPLETE,
RETRY,
@@ -349,14 +383,14 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
if (likely(nvme_req(req)->status == 0))
return COMPLETE;
- if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
- return AUTHENTICATE;
-
if (blk_noretry_request(req) ||
(nvme_req(req)->status & NVME_SC_DNR) ||
nvme_req(req)->retries >= nvme_max_retries)
return COMPLETE;
+ if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
+ return AUTHENTICATE;
+
if (req->cmd_flags & REQ_NVME_MPATH) {
if (nvme_is_path_error(nvme_req(req)->status) ||
blk_queue_dying(req->q))
@@ -372,21 +406,33 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
static inline void nvme_end_req_zoned(struct request *req)
{
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = nvme_lba_to_sect(req->q->queuedata,
+ req_op(req) == REQ_OP_ZONE_APPEND) {
+ struct nvme_ns *ns = req->q->queuedata;
+
+ req->__sector = nvme_lba_to_sect(ns->head,
le64_to_cpu(nvme_req(req)->result.u64));
+ }
}
-static inline void nvme_end_req(struct request *req)
+static inline void __nvme_end_req(struct request *req)
{
- blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
- if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET)))
- nvme_log_error(req);
nvme_end_req_zoned(req);
nvme_trace_bio_complete(req);
if (req->cmd_flags & REQ_NVME_MPATH)
nvme_mpath_end_request(req);
+}
+
+void nvme_end_req(struct request *req)
+{
+ blk_status_t status = nvme_error_status(nvme_req(req)->status);
+
+ if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
+ if (blk_rq_is_passthrough(req))
+ nvme_log_err_passthru(req);
+ else
+ nvme_log_error(req);
+ }
+ __nvme_end_req(req);
blk_mq_end_request(req, status);
}
@@ -420,7 +466,7 @@ void nvme_complete_rq(struct request *req)
nvme_failover_req(req);
return;
case AUTHENTICATE:
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
queue_work(nvme_wq, &ctrl->dhchap_auth_work);
nvme_retry_req(req);
#else
@@ -435,7 +481,7 @@ void nvme_complete_batch_req(struct request *req)
{
trace_nvme_complete_rq(req);
nvme_cleanup_cmd(req);
- nvme_end_req_zoned(req);
+ __nvme_end_req(req);
}
EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
@@ -499,7 +545,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
spin_lock_irqsave(&ctrl->lock, flags);
- old_state = ctrl->state;
+ old_state = nvme_ctrl_state(ctrl);
switch (new_state) {
case NVME_CTRL_LIVE:
switch (old_state) {
@@ -567,7 +613,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
}
if (changed) {
- ctrl->state = new_state;
+ WRITE_ONCE(ctrl->state, new_state);
wake_up_all(&ctrl->state_wq);
}
@@ -575,11 +621,11 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
if (!changed)
return false;
- if (ctrl->state == NVME_CTRL_LIVE) {
+ if (new_state == NVME_CTRL_LIVE) {
if (old_state == NVME_CTRL_CONNECTING)
nvme_stop_failfast_work(ctrl);
nvme_kick_requeue_lists(ctrl);
- } else if (ctrl->state == NVME_CTRL_CONNECTING &&
+ } else if (new_state == NVME_CTRL_CONNECTING &&
old_state == NVME_CTRL_RESETTING) {
nvme_start_failfast_work(ctrl);
}
@@ -588,27 +634,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
/*
- * Returns true for sink states that can't ever transition back to live.
- */
-static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
-{
- switch (ctrl->state) {
- case NVME_CTRL_NEW:
- case NVME_CTRL_LIVE:
- case NVME_CTRL_RESETTING:
- case NVME_CTRL_CONNECTING:
- return false;
- case NVME_CTRL_DELETING:
- case NVME_CTRL_DELETING_NOIO:
- case NVME_CTRL_DEAD:
- return true;
- default:
- WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
- return true;
- }
-}
-
-/*
* Waits for the controller state to be resetting, or returns false if it is
* not possible to ever transition to that state.
*/
@@ -617,7 +642,7 @@ bool nvme_wait_reset(struct nvme_ctrl *ctrl)
wait_event(ctrl->state_wq,
nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
nvme_state_terminal(ctrl));
- return ctrl->state == NVME_CTRL_RESETTING;
+ return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
}
EXPORT_SYMBOL_GPL(nvme_wait_reset);
@@ -653,7 +678,7 @@ static void nvme_free_ns(struct kref *kref)
kfree(ns);
}
-static inline bool nvme_get_ns(struct nvme_ns *ns)
+bool nvme_get_ns(struct nvme_ns *ns)
{
return kref_get_unless_zero(&ns->kref);
}
@@ -675,10 +700,21 @@ static inline void nvme_clear_nvme_request(struct request *req)
/* initialize a passthrough request */
void nvme_init_request(struct request *req, struct nvme_command *cmd)
{
- if (req->q->queuedata)
+ struct nvme_request *nr = nvme_req(req);
+ bool logging_enabled;
+
+ if (req->q->queuedata) {
+ struct nvme_ns *ns = req->q->disk->private_data;
+
+ logging_enabled = ns->head->passthru_err_log_enabled;
req->timeout = NVME_IO_TIMEOUT;
- else /* no queuedata implies admin queue */
+ } else { /* no queuedata implies admin queue */
+ logging_enabled = nr->ctrl->passthru_err_log_enabled;
req->timeout = NVME_ADMIN_TIMEOUT;
+ }
+
+ if (!logging_enabled)
+ req->rq_flags |= RQF_QUIET;
/* passthru commands should let the driver set the SGL flags */
cmd->common.flags &= ~NVME_CMD_SGL_ALL;
@@ -687,8 +723,7 @@ void nvme_init_request(struct request *req, struct nvme_command *cmd)
if (req->mq_hctx->type == HCTX_TYPE_POLL)
req->cmd_flags |= REQ_POLLED;
nvme_clear_nvme_request(req);
- req->rq_flags |= RQF_QUIET;
- memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
+ memcpy(nr->cmd, cmd, sizeof(*cmd));
}
EXPORT_SYMBOL_GPL(nvme_init_request);
@@ -704,9 +739,11 @@ EXPORT_SYMBOL_GPL(nvme_init_request);
blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
struct request *rq)
{
- if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
- ctrl->state != NVME_CTRL_DELETING &&
- ctrl->state != NVME_CTRL_DEAD &&
+ enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+
+ if (state != NVME_CTRL_DELETING_NOIO &&
+ state != NVME_CTRL_DELETING &&
+ state != NVME_CTRL_DEAD &&
!test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;
@@ -715,7 +752,7 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
- bool queue_live)
+ bool queue_live, enum nvme_ctrl_state state)
{
struct nvme_request *req = nvme_req(rq);
@@ -736,7 +773,7 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
* command, which is require to set the queue live in the
* appropinquate states.
*/
- switch (ctrl->state) {
+ switch (state) {
case NVME_CTRL_CONNECTING:
if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
(req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
@@ -791,8 +828,8 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
}
if (queue_max_discard_segments(req->q) == 1) {
- u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
- u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9);
+ u64 slba = nvme_sect_to_lba(ns->head, blk_rq_pos(req));
+ u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
range[0].cattr = cpu_to_le32(0);
range[0].nlb = cpu_to_le32(nlb);
@@ -800,8 +837,9 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
n = 1;
} else {
__rq_for_each_bio(bio, req) {
- u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
- u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
+ u64 slba = nvme_sect_to_lba(ns->head,
+ bio->bi_iter.bi_sector);
+ u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
if (n < segments) {
range[n].cattr = cpu_to_le32(0);
@@ -839,7 +877,7 @@ static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
u64 ref48;
/* both rw and write zeroes share the same reftag format */
- switch (ns->guard_type) {
+ switch (ns->head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
@@ -867,17 +905,18 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->write_zeroes.slba =
- cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+ cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
cmnd->write_zeroes.length =
- cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
- if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
+ if (!(req->cmd_flags & REQ_NOUNMAP) &&
+ (ns->head->features & NVME_NS_DEAC))
cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
- if (nvme_ns_has_pi(ns)) {
+ if (nvme_ns_has_pi(ns->head)) {
cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
- switch (ns->pi_type) {
+ switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
nvme_set_ref_tag(ns, cmnd, req);
@@ -909,13 +948,15 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.cdw2 = 0;
cmnd->rw.cdw3 = 0;
cmnd->rw.metadata = 0;
- cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
- cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ cmnd->rw.slba =
+ cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
+ cmnd->rw.length =
+ cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
cmnd->rw.reftag = 0;
cmnd->rw.apptag = 0;
cmnd->rw.appmask = 0;
- if (ns->ms) {
+ if (ns->head->ms) {
/*
* If formated with metadata, the block layer always provides a
* metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
@@ -923,12 +964,12 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
* namespace capacity to zero to prevent any I/O.
*/
if (!blk_integrity_rq(req)) {
- if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
+ if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
return BLK_STS_NOTSUPP;
control |= NVME_RW_PRINFO_PRACT;
}
- switch (ns->pi_type) {
+ switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
control |= NVME_RW_PRINFO_PRCHK_GUARD;
break;
@@ -957,6 +998,7 @@ void nvme_cleanup_cmd(struct request *req)
clear_bit_unlock(0, &ctrl->discard_page_busy);
else
kfree(bvec_virt(&req->special_vec));
+ req->rq_flags &= ~RQF_SPECIAL_PAYLOAD;
}
}
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
@@ -1041,20 +1083,27 @@ EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, NVME_TARGET_PASSTHRU);
*/
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
- int qid, int at_head, blk_mq_req_flags_t flags)
+ int qid, nvme_submit_flags_t flags)
{
struct request *req;
int ret;
+ blk_mq_req_flags_t blk_flags = 0;
+ if (flags & NVME_SUBMIT_NOWAIT)
+ blk_flags |= BLK_MQ_REQ_NOWAIT;
+ if (flags & NVME_SUBMIT_RESERVED)
+ blk_flags |= BLK_MQ_REQ_RESERVED;
if (qid == NVME_QID_ANY)
- req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
+ req = blk_mq_alloc_request(q, nvme_req_op(cmd), blk_flags);
else
- req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
+ req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), blk_flags,
qid - 1);
if (IS_ERR(req))
return PTR_ERR(req);
nvme_init_request(req, cmd);
+ if (flags & NVME_SUBMIT_RETRY)
+ req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
if (buffer && bufflen) {
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
@@ -1062,7 +1111,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
goto out;
}
- ret = nvme_execute_rq(req, at_head);
+ ret = nvme_execute_rq(req, flags & NVME_SUBMIT_AT_HEAD);
if (result && ret >= 0)
*result = nvme_req(req)->result;
out:
@@ -1075,7 +1124,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buffer, unsigned bufflen)
{
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
@@ -1098,6 +1147,10 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
} else {
effects = le32_to_cpu(ctrl->effects->acs[opcode]);
+
+ /* Ignore execution restrictions if any relaxation bits are set */
+ if (effects & NVME_CMD_EFFECTS_CSER_MASK)
+ effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
}
return effects;
@@ -1192,8 +1245,16 @@ static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
{
- queue_delayed_work(nvme_wq, &ctrl->ka_work,
- nvme_keep_alive_work_period(ctrl));
+ unsigned long now = jiffies;
+ unsigned long delay = nvme_keep_alive_work_period(ctrl);
+ unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
+
+ if (time_after(now, ka_next_check_tm))
+ delay = 0;
+ else
+ delay = ka_next_check_tm - now;
+
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
}
static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
@@ -1331,8 +1392,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
sizeof(struct nvme_id_ctrl));
- if (error)
+ if (error) {
kfree(*id);
+ *id = NULL;
+ }
return error;
}
@@ -1442,7 +1505,7 @@ free_data:
return status;
}
-static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_id_ns **id)
{
struct nvme_command c = { };
@@ -1461,6 +1524,7 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (error) {
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
kfree(*id);
+ *id = NULL;
}
return error;
}
@@ -1479,7 +1543,8 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
if (id->ncap == 0) {
/* namespace not allocated or attached */
info->is_removed = true;
- return -ENODEV;
+ ret = -ENODEV;
+ goto error;
}
info->anagrpid = id->anagrpid;
@@ -1497,8 +1562,10 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
}
+
+error:
kfree(id);
- return 0;
+ return ret;
}
static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
@@ -1539,7 +1606,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
c.features.dword11 = cpu_to_le32(dword11);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
- buffer, buflen, NVME_QID_ANY, 0, 0);
+ buffer, buflen, NVME_QID_ANY, 0);
if (ret >= 0 && result)
*result = le32_to_cpu(res.u32);
return ret;
@@ -1657,15 +1724,26 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
- u32 max_integrity_segments)
+static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head)
{
struct blk_integrity integrity = { };
- switch (ns->pi_type) {
+ blk_integrity_unregister(disk);
+
+ if (!head->ms)
+ return true;
+
+ /*
+ * PI can always be supported as we can ask the controller to simply
+ * insert/strip it, which is not possible for other kinds of metadata.
+ */
+ if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
+ !(head->features & NVME_NS_METADATA_SUPPORTED))
+ return nvme_ns_has_pi(head);
+
+ switch (head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
- switch (ns->guard_type) {
+ switch (head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
integrity.profile = &t10_pi_type3_crc;
integrity.tag_size = sizeof(u16) + sizeof(u32);
@@ -1683,7 +1761,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
break;
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
- switch (ns->guard_type) {
+ switch (head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
integrity.profile = &t10_pi_type1_crc;
integrity.tag_size = sizeof(u16);
@@ -1704,45 +1782,30 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
break;
}
- integrity.tuple_size = ns->ms;
+ integrity.tuple_size = head->ms;
+ integrity.pi_offset = head->pi_offset;
blk_integrity_register(disk, &integrity);
- blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
-}
-#else
-static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
- u32 max_integrity_segments)
-{
+ return true;
}
-#endif /* CONFIG_BLK_DEV_INTEGRITY */
-static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
+static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
{
struct nvme_ctrl *ctrl = ns->ctrl;
- struct request_queue *queue = disk->queue;
- u32 size = queue_logical_block_size(queue);
-
- if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
- ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);
-
- if (ctrl->max_discard_sectors == 0) {
- blk_queue_max_discard_sectors(queue, 0);
- return;
- }
- BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
- NVME_DSM_MAX_RANGES);
-
- queue->limits.discard_granularity = size;
-
- /* If discard is already enabled, don't reset queue limits */
- if (queue->limits.max_discard_sectors)
- return;
+ if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
+ lim->max_hw_discard_sectors =
+ nvme_lba_to_sect(ns->head, ctrl->dmrsl);
+ else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
+ lim->max_hw_discard_sectors = UINT_MAX;
+ else
+ lim->max_hw_discard_sectors = 0;
- blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
- blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
+ lim->discard_granularity = lim->logical_block_size;
- if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
- blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
+ if (ctrl->dmrl)
+ lim->max_discard_segments = ctrl->dmrl;
+ else
+ lim->max_discard_segments = NVME_DSM_MAX_RANGES;
}
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
@@ -1753,76 +1816,75 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
a->csi == b->csi;
}
-static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
+ struct nvme_id_ns_nvm **nvmp)
{
- bool first = id->dps & NVME_NS_DPS_PI_FIRST;
- unsigned lbaf = nvme_lbaf_index(id->flbas);
- struct nvme_ctrl *ctrl = ns->ctrl;
- struct nvme_command c = { };
+ struct nvme_command c = {
+ .identify.opcode = nvme_admin_identify,
+ .identify.nsid = cpu_to_le32(nsid),
+ .identify.cns = NVME_ID_CNS_CS_NS,
+ .identify.csi = NVME_CSI_NVM,
+ };
struct nvme_id_ns_nvm *nvm;
- int ret = 0;
- u32 elbaf;
-
- ns->pi_size = 0;
- ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
- if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
- ns->pi_size = sizeof(struct t10_pi_tuple);
- ns->guard_type = NVME_NVM_NS_16B_GUARD;
- goto set_pi;
- }
+ int ret;
nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
if (!nvm)
return -ENOMEM;
- c.identify.opcode = nvme_admin_identify;
- c.identify.nsid = cpu_to_le32(ns->head->ns_id);
- c.identify.cns = NVME_ID_CNS_CS_NS;
- c.identify.csi = NVME_CSI_NVM;
-
- ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, nvm, sizeof(*nvm));
+ ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
if (ret)
- goto free_data;
+ kfree(nvm);
+ else
+ *nvmp = nvm;
+ return ret;
+}
- elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
+static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
+ struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
+{
+ u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
/* no support for storage tag formats right now */
if (nvme_elbaf_sts(elbaf))
- goto free_data;
+ return;
- ns->guard_type = nvme_elbaf_guard_type(elbaf);
- switch (ns->guard_type) {
+ head->guard_type = nvme_elbaf_guard_type(elbaf);
+ switch (head->guard_type) {
case NVME_NVM_NS_64B_GUARD:
- ns->pi_size = sizeof(struct crc64_pi_tuple);
+ head->pi_size = sizeof(struct crc64_pi_tuple);
break;
case NVME_NVM_NS_16B_GUARD:
- ns->pi_size = sizeof(struct t10_pi_tuple);
+ head->pi_size = sizeof(struct t10_pi_tuple);
break;
default:
break;
}
-
-free_data:
- kfree(nvm);
-set_pi:
- if (ns->pi_size && (first || ns->ms == ns->pi_size))
- ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
- else
- ns->pi_type = 0;
-
- return ret;
}
-static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
+static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
+ struct nvme_ns_head *head, struct nvme_id_ns *id,
+ struct nvme_id_ns_nvm *nvm)
{
- struct nvme_ctrl *ctrl = ns->ctrl;
-
- if (nvme_init_ms(ns, id))
+ head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+ head->pi_type = 0;
+ head->pi_size = 0;
+ head->pi_offset = 0;
+ head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
+ if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
return;
- ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
- if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
- return;
+ if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
+ nvme_configure_pi_elbas(head, id, nvm);
+ } else {
+ head->pi_size = sizeof(struct t10_pi_tuple);
+ head->guard_type = NVME_NVM_NS_16B_GUARD;
+ }
+
+ if (head->pi_size && head->ms >= head->pi_size)
+ head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ if (!(id->dps & NVME_NS_DPS_PI_FIRST))
+ head->pi_offset = head->ms - head->pi_size;
if (ctrl->ops->flags & NVME_F_FABRICS) {
/*
@@ -1833,7 +1895,7 @@ static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
return;
- ns->features |= NVME_NS_EXT_LBAS;
+ head->features |= NVME_NS_EXT_LBAS;
/*
* The current fabrics transport drivers support namespace
@@ -1844,8 +1906,8 @@ static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* Note, this check will need to be modified if any drivers
* gain the ability to use other metadata formats.
*/
- if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
- ns->features |= NVME_NS_METADATA_SUPPORTED;
+ if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
+ head->features |= NVME_NS_METADATA_SUPPORTED;
} else {
/*
* For PCIe controllers, we can't easily remap the separate
@@ -1854,48 +1916,47 @@ static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* We allow extended LBAs for the passthrough interface, though.
*/
if (id->flbas & NVME_NS_FLBAS_META_EXT)
- ns->features |= NVME_NS_EXT_LBAS;
+ head->features |= NVME_NS_EXT_LBAS;
else
- ns->features |= NVME_NS_METADATA_SUPPORTED;
+ head->features |= NVME_NS_METADATA_SUPPORTED;
}
}
-static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
- struct request_queue *q)
+static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
{
- bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
-
- if (ctrl->max_hw_sectors) {
- u32 max_segments =
- (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
+ return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
+}
- max_segments = min_not_zero(max_segments, ctrl->max_segments);
- blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
- blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
- }
- blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
- blk_queue_dma_alignment(q, 3);
- blk_queue_write_cache(q, vwc, vwc);
+static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
+ struct queue_limits *lim)
+{
+ lim->max_hw_sectors = ctrl->max_hw_sectors;
+ lim->max_segments = min_t(u32, USHRT_MAX,
+ min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
+ lim->max_integrity_segments = ctrl->max_integrity_segments;
+ lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
+ lim->max_segment_size = UINT_MAX;
+ lim->dma_alignment = 3;
}
-static void nvme_update_disk_info(struct gendisk *disk,
- struct nvme_ns *ns, struct nvme_id_ns *id)
+static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
+ struct queue_limits *lim)
{
- sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
- u32 bs = 1U << ns->lba_shift;
+ struct nvme_ns_head *head = ns->head;
+ u32 bs = 1U << head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
+ bool valid = true;
/*
* The block layer can't support LBA sizes larger than the page size
- * yet, so catch this early and don't allow block I/O.
+ * or smaller than a sector size yet, so catch this early and don't
+ * allow block I/O.
*/
- if (ns->lba_shift > PAGE_SHIFT) {
- capacity = 0;
+ if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
bs = (1 << 9);
+ valid = false;
}
- blk_integrity_unregister(disk);
-
atomic_bs = phys_bs = bs;
if (id->nabo == 0) {
/*
@@ -1916,36 +1977,20 @@ static void nvme_update_disk_info(struct gendisk *disk,
io_opt = bs * (1 + le16_to_cpu(id->nows));
}
- blk_queue_logical_block_size(disk->queue, bs);
/*
* Linux filesystems assume writing a single physical block is
* an atomic operation. Hence limit the physical block size to the
* value of the Atomic Write Unit Power Fail parameter.
*/
- blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
- blk_queue_io_min(disk->queue, phys_bs);
- blk_queue_io_opt(disk->queue, io_opt);
-
- /*
- * Register a metadata profile for PI, or the plain non-integrity NVMe
- * metadata masquerading as Type 0 if supported, otherwise reject block
- * I/O to namespaces with metadata except when the namespace supports
- * PI, as it can strip/insert in that case.
- */
- if (ns->ms) {
- if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
- (ns->features & NVME_NS_METADATA_SUPPORTED))
- nvme_init_integrity(disk, ns,
- ns->ctrl->max_integrity_segments);
- else if (!nvme_ns_has_pi(ns))
- capacity = 0;
- }
-
- set_capacity_and_notify(disk, capacity);
-
- nvme_config_discard(disk, ns);
- blk_queue_max_write_zeroes_sectors(disk->queue,
- ns->ctrl->max_zeroes_sectors);
+ lim->logical_block_size = bs;
+ lim->physical_block_size = min(phys_bs, atomic_bs);
+ lim->io_min = phys_bs;
+ lim->io_opt = io_opt;
+ if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+ lim->max_write_zeroes_sectors = UINT_MAX;
+ else
+ lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
+ return valid;
}
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
@@ -1959,7 +2004,8 @@ static inline bool nvme_first_scan(struct gendisk *disk)
return !disk_live(disk);
}
-static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
+static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
+ struct queue_limits *lim)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u32 iob;
@@ -1968,7 +2014,7 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
is_power_of_2(ctrl->max_hw_sectors))
iob = ctrl->max_hw_sectors;
else
- iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
+ iob = nvme_lba_to_sect(ns->head, le16_to_cpu(id->noiob));
if (!iob)
return;
@@ -1987,38 +2033,37 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
return;
}
- blk_queue_chunk_sectors(ns->queue, iob);
+ lim->chunk_sectors = iob;
}
static int nvme_update_ns_info_generic(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
+ struct queue_limits lim;
+ int ret;
+
blk_mq_freeze_queue(ns->disk->queue);
- nvme_set_queue_limits(ns->ctrl, ns->queue);
+ lim = queue_limits_start_update(ns->disk->queue);
+ nvme_set_ctrl_limits(ns->ctrl, &lim);
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
blk_mq_unfreeze_queue(ns->disk->queue);
- if (nvme_ns_head_multipath(ns->head)) {
- blk_mq_freeze_queue(ns->head->disk->queue);
- set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
- nvme_mpath_revalidate_paths(ns);
- blk_stack_limits(&ns->head->disk->queue->limits,
- &ns->queue->limits, 0);
- ns->head->disk->flags |= GENHD_FL_HIDDEN;
- blk_mq_unfreeze_queue(ns->head->disk->queue);
- }
-
/* Hide the block-interface for these devices */
- ns->disk->flags |= GENHD_FL_HIDDEN;
- set_bit(NVME_NS_READY, &ns->flags);
-
- return 0;
+ if (!ret)
+ ret = -ENODEV;
+ return ret;
}
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
+ bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT;
+ struct queue_limits lim;
+ struct nvme_id_ns_nvm *nvm = NULL;
+ struct nvme_zone_info zi = {};
struct nvme_id_ns *id;
+ sector_t capacity;
unsigned lbaf;
int ret;
@@ -2026,86 +2071,163 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (ret)
return ret;
- blk_mq_freeze_queue(ns->disk->queue);
+ if (id->ncap == 0) {
+ /* namespace not allocated or attached */
+ info->is_removed = true;
+ ret = -ENXIO;
+ goto out;
+ }
lbaf = nvme_lbaf_index(id->flbas);
- ns->lba_shift = id->lbaf[lbaf].ds;
- nvme_set_queue_limits(ns->ctrl, ns->queue);
- nvme_configure_metadata(ns, id);
- nvme_set_chunk_sectors(ns, id);
- nvme_update_disk_info(ns->disk, ns, id);
+ if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
+ ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
+ if (ret < 0)
+ goto out;
+ }
- if (ns->head->ids.csi == NVME_CSI_ZNS) {
- ret = nvme_update_zone_info(ns, lbaf);
- if (ret) {
- blk_mq_unfreeze_queue(ns->disk->queue);
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ ns->head->ids.csi == NVME_CSI_ZNS) {
+ ret = nvme_query_zone_info(ns, lbaf, &zi);
+ if (ret < 0)
goto out;
- }
+ }
+
+ blk_mq_freeze_queue(ns->disk->queue);
+ ns->head->lba_shift = id->lbaf[lbaf].ds;
+ ns->head->nuse = le64_to_cpu(id->nuse);
+ capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
+
+ lim = queue_limits_start_update(ns->disk->queue);
+ nvme_set_ctrl_limits(ns->ctrl, &lim);
+ nvme_configure_metadata(ns->ctrl, ns->head, id, nvm);
+ nvme_set_chunk_sectors(ns, id, &lim);
+ if (!nvme_update_disk_info(ns, id, &lim))
+ capacity = 0;
+ nvme_config_discard(ns, &lim);
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ ns->head->ids.csi == NVME_CSI_ZNS)
+ nvme_update_zone_info(ns, &lim, &zi);
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
+ if (ret) {
+ blk_mq_unfreeze_queue(ns->disk->queue);
+ goto out;
}
/*
+ * Register a metadata profile for PI, or the plain non-integrity NVMe
+ * metadata masquerading as Type 0 if supported, otherwise reject block
+ * I/O to namespaces with metadata except when the namespace supports
+ * PI, as it can strip/insert in that case.
+ */
+ if (!nvme_init_integrity(ns->disk, ns->head))
+ capacity = 0;
+
+ set_capacity_and_notify(ns->disk, capacity);
+
+ /*
* Only set the DEAC bit if the device guarantees that reads from
* deallocated data return zeroes. While the DEAC bit does not
* require that, it must be a no-op if reads from deallocated data
* do not return zeroes.
*/
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
- ns->features |= NVME_NS_DEAC;
+ ns->head->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
+ blk_queue_write_cache(ns->disk->queue, vwc, vwc);
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) {
- ret = nvme_revalidate_zones(ns);
+ ret = blk_revalidate_disk_zones(ns->disk);
if (ret && !nvme_first_scan(ns->disk))
goto out;
}
- if (nvme_ns_head_multipath(ns->head)) {
- blk_mq_freeze_queue(ns->head->disk->queue);
- nvme_update_disk_info(ns->head->disk, ns, id);
- set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
- nvme_mpath_revalidate_paths(ns);
- blk_stack_limits(&ns->head->disk->queue->limits,
- &ns->queue->limits, 0);
- disk_update_readahead(ns->head->disk);
- blk_mq_unfreeze_queue(ns->head->disk->queue);
- }
-
ret = 0;
out:
- /*
- * If probing fails due an unsupported feature, hide the block device,
- * but still allow other access.
- */
- if (ret == -ENODEV) {
- ns->disk->flags |= GENHD_FL_HIDDEN;
- set_bit(NVME_NS_READY, &ns->flags);
- ret = 0;
- }
+ kfree(nvm);
kfree(id);
return ret;
}
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
+ bool unsupported = false;
+ int ret;
+
switch (info->ids.csi) {
case NVME_CSI_ZNS:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_info(ns->ctrl->device,
"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
info->nsid);
- return nvme_update_ns_info_generic(ns, info);
+ ret = nvme_update_ns_info_generic(ns, info);
+ break;
}
- return nvme_update_ns_info_block(ns, info);
+ ret = nvme_update_ns_info_block(ns, info);
+ break;
case NVME_CSI_NVM:
- return nvme_update_ns_info_block(ns, info);
+ ret = nvme_update_ns_info_block(ns, info);
+ break;
default:
dev_info(ns->ctrl->device,
"block device for nsid %u not supported (csi %u)\n",
info->nsid, info->ids.csi);
- return nvme_update_ns_info_generic(ns, info);
+ ret = nvme_update_ns_info_generic(ns, info);
+ break;
+ }
+
+ /*
+ * If probing fails due an unsupported feature, hide the block device,
+ * but still allow other access.
+ */
+ if (ret == -ENODEV) {
+ ns->disk->flags |= GENHD_FL_HIDDEN;
+ set_bit(NVME_NS_READY, &ns->flags);
+ unsupported = true;
+ ret = 0;
}
+
+ if (!ret && nvme_ns_head_multipath(ns->head)) {
+ struct queue_limits *ns_lim = &ns->disk->queue->limits;
+ struct queue_limits lim;
+
+ blk_mq_freeze_queue(ns->head->disk->queue);
+ if (unsupported)
+ ns->head->disk->flags |= GENHD_FL_HIDDEN;
+ else
+ nvme_init_integrity(ns->head->disk, ns->head);
+ set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
+ set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
+ nvme_mpath_revalidate_paths(ns);
+
+ /*
+ * queue_limits mixes values that are the hardware limitations
+ * for bio splitting with what is the device configuration.
+ *
+ * For NVMe the device configuration can change after e.g. a
+ * Format command, and we really want to pick up the new format
+ * value here. But we must still stack the queue limits to the
+ * least common denominator for multipathing to split the bios
+ * properly.
+ *
+ * To work around this, we explicitly set the device
+ * configuration to those that we just queried, but only stack
+ * the splitting limits in to make sure we still obey possibly
+ * lower limitations of other controllers.
+ */
+ lim = queue_limits_start_update(ns->head->disk->queue);
+ lim.logical_block_size = ns_lim->logical_block_size;
+ lim.physical_block_size = ns_lim->physical_block_size;
+ lim.io_min = ns_lim->io_min;
+ lim.io_opt = ns_lim->io_opt;
+ queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
+ ns->head->disk->disk_name);
+ ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
+ blk_mq_unfreeze_queue(ns->head->disk->queue);
+ }
+
+ return ret;
}
#ifdef CONFIG_BLK_SED_OPAL
@@ -2124,7 +2246,7 @@ static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t l
cmd.common.cdw11 = cpu_to_le32(len);
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
- NVME_QID_ANY, 1, 0);
+ NVME_QID_ANY, NVME_SUBMIT_AT_HEAD);
}
static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
@@ -2245,25 +2367,8 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
else
ctrl->ctrl_config = NVME_CC_CSS_NVM;
- if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
- u32 crto;
-
- ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
- if (ret) {
- dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
- ret);
- return ret;
- }
-
- if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
- ctrl->ctrl_config |= NVME_CC_CRIME;
- timeout = NVME_CRTO_CRIMT(crto);
- } else {
- timeout = NVME_CRTO_CRWMT(crto);
- }
- } else {
- timeout = NVME_CAP_TIMEOUT(ctrl->cap);
- }
+ if (ctrl->cap & NVME_CAP_CRMS_CRWMS && ctrl->cap & NVME_CAP_CRMS_CRIMS)
+ ctrl->ctrl_config |= NVME_CC_CRIME;
ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
@@ -2277,6 +2382,39 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
if (ret)
return ret;
+ /* CAP value may change after initial CC write */
+ ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+ if (ret)
+ return ret;
+
+ timeout = NVME_CAP_TIMEOUT(ctrl->cap);
+ if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
+ u32 crto, ready_timeout;
+
+ ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
+ if (ret) {
+ dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
+ ret);
+ return ret;
+ }
+
+ /*
+ * CRTO should always be greater or equal to CAP.TO, but some
+ * devices are known to get this wrong. Use the larger of the
+ * two values.
+ */
+ if (ctrl->ctrl_config & NVME_CC_CRIME)
+ ready_timeout = NVME_CRTO_CRIMT(crto);
+ else
+ ready_timeout = NVME_CRTO_CRWMT(crto);
+
+ if (ready_timeout < timeout)
+ dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n",
+ crto, ctrl->cap);
+ else
+ timeout = ready_timeout;
+ }
+
ctrl->ctrl_config |= NVME_CC_ENABLE;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
@@ -2506,7 +2644,7 @@ static void nvme_set_latency_tolerance(struct device *dev, s32 val)
if (ctrl->ps_max_latency_us != latency) {
ctrl->ps_max_latency_us = latency;
- if (ctrl->state == NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
nvme_configure_apst(ctrl);
}
}
@@ -2764,7 +2902,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
subsys->awupf = le16_to_cpu(id->awupf);
nvme_mpath_default_iopolicy(subsys);
- subsys->dev.class = nvme_subsys_class;
+ subsys->dev.class = &nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem;
subsys->dev.groups = nvme_subsys_attrs_groups;
dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
@@ -2874,14 +3012,6 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
struct nvme_id_ctrl_nvm *id;
int ret;
- if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
- ctrl->max_discard_sectors = UINT_MAX;
- ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
- } else {
- ctrl->max_discard_sectors = 0;
- ctrl->max_discard_segments = 0;
- }
-
/*
* Even though NVMe spec explicitly states that MDTS is not applicable
* to the write-zeroes, we are cautious and limit the size to the
@@ -2911,8 +3041,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
if (ret)
goto free_data;
- if (id->dmrl)
- ctrl->max_discard_segments = id->dmrl;
+ ctrl->dmrl = id->dmrl;
ctrl->dmrsl = le32_to_cpu(id->dmrsl);
if (id->wzsl)
ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
@@ -2980,8 +3109,50 @@ static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
return 0;
}
+static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+ /*
+ * In fabrics we need to verify the cntlid matches the
+ * admin connect
+ */
+ if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
+ dev_err(ctrl->device,
+ "Mismatching cntlid: Connect %u vs Identify %u, rejecting\n",
+ ctrl->cntlid, le16_to_cpu(id->cntlid));
+ return -EINVAL;
+ }
+
+ if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
+ dev_err(ctrl->device,
+ "keep-alive support is mandatory for fabrics\n");
+ return -EINVAL;
+ }
+
+ if (!nvme_discovery_ctrl(ctrl) && ctrl->ioccsz < 4) {
+ dev_err(ctrl->device,
+ "I/O queue command capsule supported size %d < 4\n",
+ ctrl->ioccsz);
+ return -EINVAL;
+ }
+
+ if (!nvme_discovery_ctrl(ctrl) && ctrl->iorcsz < 1) {
+ dev_err(ctrl->device,
+ "I/O queue response capsule supported size %d < 1\n",
+ ctrl->iorcsz);
+ return -EINVAL;
+ }
+
+ if (!ctrl->maxcmd) {
+ dev_err(ctrl->device, "Maximum outstanding commands is 0\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int nvme_init_identify(struct nvme_ctrl *ctrl)
{
+ struct queue_limits lim;
struct nvme_id_ctrl *id;
u32 max_hw_sectors;
bool prev_apst_enabled;
@@ -3048,7 +3219,12 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
- nvme_set_queue_limits(ctrl, ctrl->admin_q);
+ lim = queue_limits_start_update(ctrl->admin_q);
+ nvme_set_ctrl_limits(ctrl, &lim);
+ ret = queue_limits_commit_update(ctrl->admin_q, &lim);
+ if (ret)
+ goto out_free;
+
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan);
@@ -3066,7 +3242,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
if (ctrl->shutdown_timeout != shutdown_timeout)
dev_info(ctrl->device,
- "Shutdown timeout set to %u seconds\n",
+ "D3 entry latency set to %u seconds\n",
ctrl->shutdown_timeout);
} else
ctrl->shutdown_timeout = shutdown_timeout;
@@ -3092,25 +3268,9 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->iorcsz = le32_to_cpu(id->iorcsz);
ctrl->maxcmd = le16_to_cpu(id->maxcmd);
- /*
- * In fabrics we need to verify the cntlid matches the
- * admin connect
- */
- if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
- dev_err(ctrl->device,
- "Mismatching cntlid: Connect %u vs Identify "
- "%u, rejecting\n",
- ctrl->cntlid, le16_to_cpu(id->cntlid));
- ret = -EINVAL;
- goto out_free;
- }
-
- if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
- dev_err(ctrl->device,
- "keep-alive support is mandatory for fabrics\n");
- ret = -EINVAL;
+ ret = nvme_check_ctrl_fabric_info(ctrl, id);
+ if (ret)
goto out_free;
- }
} else {
ctrl->hmpre = le32_to_cpu(id->hmpre);
ctrl->hmmin = le32_to_cpu(id->hmmin);
@@ -3183,6 +3343,8 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags);
ctrl->identified = true;
+ nvme_start_keep_alive(ctrl);
+
return 0;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
@@ -3192,7 +3354,7 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
struct nvme_ctrl *ctrl =
container_of(inode->i_cdev, struct nvme_ctrl, cdev);
- switch (ctrl->state) {
+ switch (nvme_ctrl_state(ctrl)) {
case NVME_CTRL_LIVE:
break;
default:
@@ -3294,7 +3456,7 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
if (minor < 0)
return minor;
cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
- cdev_device->class = nvme_ns_chr_class;
+ cdev_device->class = &nvme_ns_chr_class;
cdev_device->release = nvme_cdev_rel;
device_initialize(cdev_device);
cdev_init(cdev, fops);
@@ -3367,6 +3529,8 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ns_id = info->nsid;
head->ids = info->ids;
head->shared = info->is_shared;
+ ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
+ ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
kref_init(&head->ref);
if (head->ids.csi) {
@@ -3502,7 +3666,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
"Found shared namespace %d, but multipathing not supported.\n",
info->nsid);
dev_warn_once(ctrl->device,
- "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
+ "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
}
}
@@ -3521,9 +3685,10 @@ out_unlock:
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns, *ret = NULL;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list) {
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
if (ns->head->ns_id == nsid) {
if (!nvme_get_ns(ns))
continue;
@@ -3533,7 +3698,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
if (ns->head->ns_id > nsid)
break;
}
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
return ret;
}
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
@@ -3547,7 +3712,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
if (tmp->head->ns_id < ns->head->ns_id) {
- list_add(&ns->list, &tmp->list);
+ list_add_rcu(&ns->list, &tmp->list);
return;
}
}
@@ -3564,7 +3729,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (!ns)
return;
- disk = blk_mq_alloc_disk(ctrl->tagset, ns);
+ disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns);
if (IS_ERR(disk))
goto out_free_ns;
disk->fops = &nvme_bdev_ops;
@@ -3613,12 +3778,21 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (nvme_update_ns_info(ns, info))
goto out_unlink_ns;
- down_write(&ctrl->namespaces_rwsem);
+ mutex_lock(&ctrl->namespaces_lock);
+ /*
+ * Ensure that no namespaces are added to the ctrl list after the queues
+ * are frozen, thereby avoiding a deadlock between scan and reset.
+ */
+ if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
+ mutex_unlock(&ctrl->namespaces_lock);
+ goto out_unlink_ns;
+ }
nvme_ns_add_to_ctrl_list(ns);
- up_write(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->namespaces_lock);
+ synchronize_srcu(&ctrl->srcu);
nvme_get_ctrl(ctrl);
- if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
+ if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
goto out_cleanup_ns_from_list;
if (!nvme_ns_head_multipath(ns->head))
@@ -3627,13 +3801,21 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
nvme_mpath_add_disk(ns, info->anagrpid);
nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
+ /*
+ * Set ns->disk->device->driver_data to ns so we can access
+ * ns->head->passthru_err_log_enabled in
+ * nvme_io_passthru_err_log_enabled_[store | show]().
+ */
+ dev_set_drvdata(disk_to_dev(ns->disk), ns);
+
return;
out_cleanup_ns_from_list:
nvme_put_ctrl(ctrl);
- down_write(&ctrl->namespaces_rwsem);
- list_del_init(&ns->list);
- up_write(&ctrl->namespaces_rwsem);
+ mutex_lock(&ctrl->namespaces_lock);
+ list_del_rcu(&ns->list);
+ mutex_unlock(&ctrl->namespaces_lock);
+ synchronize_srcu(&ctrl->srcu);
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
@@ -3683,9 +3865,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_cdev_del(&ns->cdev, &ns->cdev_device);
del_gendisk(ns->disk);
- down_write(&ns->ctrl->namespaces_rwsem);
- list_del_init(&ns->list);
- up_write(&ns->ctrl->namespaces_rwsem);
+ mutex_lock(&ns->ctrl->namespaces_lock);
+ list_del_rcu(&ns->list);
+ mutex_unlock(&ns->ctrl->namespaces_lock);
+ synchronize_srcu(&ns->ctrl->srcu);
if (last_path)
nvme_mpath_shutdown_disk(ns->head);
@@ -3775,16 +3958,18 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
struct nvme_ns *ns, *next;
LIST_HEAD(rm_list);
- down_write(&ctrl->namespaces_rwsem);
+ mutex_lock(&ctrl->namespaces_lock);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
- if (ns->head->ns_id > nsid)
- list_move_tail(&ns->list, &rm_list);
+ if (ns->head->ns_id > nsid) {
+ list_del_rcu(&ns->list);
+ synchronize_srcu(&ctrl->srcu);
+ list_add_tail_rcu(&ns->list, &rm_list);
+ }
}
- up_write(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->namespaces_lock);
list_for_each_entry_safe(ns, next, &rm_list, list)
nvme_ns_remove(ns);
-
}
static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
@@ -3878,7 +4063,7 @@ static void nvme_scan_work(struct work_struct *work)
int ret;
/* No tagset on a live ctrl means IO queues could not created */
- if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset)
return;
/*
@@ -3948,15 +4133,16 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
* removing the namespaces' disks; fail all the queues now to avoid
* potentially having to clean up the failed sync later.
*/
- if (ctrl->state == NVME_CTRL_DEAD)
+ if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
nvme_mark_namespaces_dead(ctrl);
/* this is a no-op when called from the controller reset handler */
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
- down_write(&ctrl->namespaces_rwsem);
- list_splice_init(&ctrl->namespaces, &ns_list);
- up_write(&ctrl->namespaces_rwsem);
+ mutex_lock(&ctrl->namespaces_lock);
+ list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
+ mutex_unlock(&ctrl->namespaces_lock);
+ synchronize_srcu(&ctrl->srcu);
list_for_each_entry_safe(ns, next, &ns_list, list)
nvme_ns_remove(ns);
@@ -4030,7 +4216,7 @@ static void nvme_async_event_work(struct work_struct *work)
* flushing ctrl async_event_work after changing the controller state
* from LIVE and before freeing the admin queue.
*/
- if (ctrl->state == NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
ctrl->ops->submit_async_event(ctrl);
}
@@ -4051,14 +4237,30 @@ static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
{
struct nvme_fw_slot_info_log *log;
+ u8 next_fw_slot, cur_fw_slot;
log = kmalloc(sizeof(*log), GFP_KERNEL);
if (!log)
return;
if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
- log, sizeof(*log), 0))
+ log, sizeof(*log), 0)) {
dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
+ goto out_free_log;
+ }
+
+ cur_fw_slot = log->afi & 0x7;
+ next_fw_slot = (log->afi & 0x70) >> 4;
+ if (!cur_fw_slot || (next_fw_slot && (cur_fw_slot != next_fw_slot))) {
+ dev_info(ctrl->device,
+ "Firmware is activated after next Controller Level Reset\n");
+ goto out_free_log;
+ }
+
+ memcpy(ctrl->subsys->firmware_rev, &log->frs[cur_fw_slot - 1],
+ sizeof(ctrl->subsys->firmware_rev));
+
+out_free_log:
kfree(log);
}
@@ -4068,6 +4270,8 @@ static void nvme_fw_act_work(struct work_struct *work)
struct nvme_ctrl, fw_act_work);
unsigned long fw_act_timeout;
+ nvme_auth_stop(ctrl);
+
if (ctrl->mtfa)
fw_act_timeout = jiffies +
msecs_to_jiffies(ctrl->mtfa * 100);
@@ -4123,7 +4327,6 @@ static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
* firmware activation.
*/
if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
- nvme_auth_stop(ctrl);
requeue = false;
queue_work(nvme_wq, &ctrl->fw_act_work);
}
@@ -4193,13 +4396,15 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int cmd_size)
{
+ struct queue_limits lim = {};
int ret;
memset(set, 0, sizeof(*set));
set->ops = ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
if (ctrl->ops->flags & NVME_F_FABRICS)
- set->reserved_tags = NVMF_RESERVED_TAGS;
+ /* Reserved for fabric connect and keep alive */
+ set->reserved_tags = 2;
set->numa_node = ctrl->numa_node;
set->flags = BLK_MQ_F_NO_SCHED;
if (ctrl->ops->flags & NVME_F_BLOCKING)
@@ -4212,14 +4417,14 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ret)
return ret;
- ctrl->admin_q = blk_mq_init_queue(set);
+ ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL);
if (IS_ERR(ctrl->admin_q)) {
ret = PTR_ERR(ctrl->admin_q);
goto out_free_tagset;
}
if (ctrl->ops->flags & NVME_F_FABRICS) {
- ctrl->fabrics_q = blk_mq_init_queue(set);
+ ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->fabrics_q)) {
ret = PTR_ERR(ctrl->fabrics_q);
goto out_cleanup_admin_q;
@@ -4268,7 +4473,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
set->reserved_tags = NVME_AQ_DEPTH;
else if (ctrl->ops->flags & NVME_F_FABRICS)
- set->reserved_tags = NVMF_RESERVED_TAGS;
+ /* Reserved for fabric connect */
+ set->reserved_tags = 1;
set->numa_node = ctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE;
if (ctrl->ops->flags & NVME_F_BLOCKING)
@@ -4283,7 +4489,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
return ret;
if (ctrl->ops->flags & NVME_F_FABRICS) {
- ctrl->connect_q = blk_mq_init_queue(set);
+ ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
@@ -4327,8 +4533,6 @@ EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
- nvme_start_keep_alive(ctrl);
-
nvme_enable_aen(ctrl);
/*
@@ -4383,9 +4587,10 @@ static void nvme_free_ctrl(struct device *dev)
if (!subsys || ctrl->instance != subsys->instance)
ida_free(&nvme_instance_ida, ctrl->instance);
-
+ key_put(ctrl->tls_key);
nvme_free_cels(ctrl);
nvme_mpath_uninit(ctrl);
+ cleanup_srcu_struct(&ctrl->srcu);
nvme_auth_stop(ctrl);
nvme_auth_free(ctrl);
__free_page(ctrl->discard_page);
@@ -4414,13 +4619,19 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
{
int ret;
- ctrl->state = NVME_CTRL_NEW;
+ WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
+ ctrl->passthru_err_log_enabled = false;
clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
spin_lock_init(&ctrl->lock);
+ mutex_init(&ctrl->namespaces_lock);
+
+ ret = init_srcu_struct(&ctrl->srcu);
+ if (ret)
+ return ret;
+
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
xa_init(&ctrl->cels);
- init_rwsem(&ctrl->namespaces_rwsem);
ctrl->dev = dev;
ctrl->ops = ops;
ctrl->quirks = quirks;
@@ -4435,6 +4646,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
+ ctrl->ka_last_check_time = jiffies;
BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
PAGE_SIZE);
@@ -4453,7 +4665,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->device = &ctrl->ctrl_device;
ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
ctrl->instance);
- ctrl->device->class = nvme_class;
+ ctrl->device->class = &nvme_class;
ctrl->device->parent = ctrl->dev;
if (ops->dev_attr_groups)
ctrl->device->groups = ops->dev_attr_groups;
@@ -4499,6 +4711,7 @@ out_release_instance:
out:
if (ctrl->discard_page)
__free_page(ctrl->discard_page);
+ cleanup_srcu_struct(&ctrl->srcu);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -4507,36 +4720,40 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
blk_mark_disk_dead(ns->disk);
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
blk_mq_unfreeze_queue(ns->queue);
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
+ clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);
int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list) {
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
if (timeout <= 0)
break;
}
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
return timeout;
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
@@ -4544,22 +4761,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
void nvme_wait_freeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
blk_mq_freeze_queue_wait(ns->queue);
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze);
void nvme_start_freeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
+ set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
blk_freeze_queue_start(ns->queue);
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
@@ -4602,11 +4822,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
blk_sync_queue(ns->queue);
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
@@ -4684,29 +4905,22 @@ static int __init nvme_core_init(void)
if (result < 0)
goto destroy_delete_wq;
- nvme_class = class_create("nvme");
- if (IS_ERR(nvme_class)) {
- result = PTR_ERR(nvme_class);
+ result = class_register(&nvme_class);
+ if (result)
goto unregister_chrdev;
- }
- nvme_class->dev_uevent = nvme_class_uevent;
- nvme_subsys_class = class_create("nvme-subsystem");
- if (IS_ERR(nvme_subsys_class)) {
- result = PTR_ERR(nvme_subsys_class);
+ result = class_register(&nvme_subsys_class);
+ if (result)
goto destroy_class;
- }
result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
"nvme-generic");
if (result < 0)
goto destroy_subsys_class;
- nvme_ns_chr_class = class_create("nvme-generic");
- if (IS_ERR(nvme_ns_chr_class)) {
- result = PTR_ERR(nvme_ns_chr_class);
+ result = class_register(&nvme_ns_chr_class);
+ if (result)
goto unregister_generic_ns;
- }
result = nvme_init_auth();
if (result)
@@ -4714,13 +4928,13 @@ static int __init nvme_core_init(void)
return 0;
destroy_ns_chr:
- class_destroy(nvme_ns_chr_class);
+ class_unregister(&nvme_ns_chr_class);
unregister_generic_ns:
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
- class_destroy(nvme_subsys_class);
+ class_unregister(&nvme_subsys_class);
destroy_class:
- class_destroy(nvme_class);
+ class_unregister(&nvme_class);
unregister_chrdev:
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_delete_wq:
@@ -4736,9 +4950,9 @@ out:
static void __exit nvme_core_exit(void)
{
nvme_exit_auth();
- class_destroy(nvme_ns_chr_class);
- class_destroy(nvme_subsys_class);
- class_destroy(nvme_class);
+ class_unregister(&nvme_ns_chr_class);
+ class_unregister(&nvme_subsys_class);
+ class_unregister(&nvme_class);
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_workqueue(nvme_delete_wq);
@@ -4750,5 +4964,6 @@ static void __exit nvme_core_exit(void)
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
+MODULE_DESCRIPTION("NVMe host core framework");
module_init(nvme_core_init);
module_exit(nvme_core_exit);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 8175d49f2909..ceb9c0ed3120 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include "nvme.h"
#include "fabrics.h"
+#include <linux/nvme-keyring.h>
static LIST_HEAD(nvmf_transports);
static DECLARE_RWSEM(nvmf_transports_rwsem);
@@ -179,7 +180,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, NVME_SUBMIT_RESERVED);
if (ret >= 0)
*val = le64_to_cpu(res.u64);
@@ -225,7 +226,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, NVME_SUBMIT_RESERVED);
if (ret >= 0)
*val = le64_to_cpu(res.u64);
@@ -270,7 +271,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
cmd.prop_set.value = cpu_to_le64(val);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, NVME_SUBMIT_RESERVED);
if (unlikely(ret))
dev_err(ctrl->device,
"Property Set error: %d, offset %#x\n",
@@ -386,8 +387,8 @@ static struct nvmf_connect_data *nvmf_connect_data_prep(struct nvme_ctrl *ctrl,
uuid_copy(&data->hostid, &ctrl->opts->host->id);
data->cntlid = cpu_to_le16(cntlid);
- strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
- strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
+ strscpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
+ strscpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
return data;
}
@@ -427,12 +428,6 @@ static void nvmf_connect_cmd_prep(struct nvme_ctrl *ctrl, u16 qid,
* fabrics-protocol connection of the NVMe Admin queue between the
* host system device and the allocated NVMe controller on the
* target system via a NVMe Fabrics "Connect" command.
- *
- * Return:
- * 0: success
- * > 0: NVMe error status code
- * < 0: Linux errno error code
- *
*/
int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
{
@@ -449,8 +444,10 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
return -ENOMEM;
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
- data, sizeof(*data), NVME_QID_ANY, 1,
- BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
+ data, sizeof(*data), NVME_QID_ANY,
+ NVME_SUBMIT_AT_HEAD |
+ NVME_SUBMIT_NOWAIT |
+ NVME_SUBMIT_RESERVED);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
&cmd, data);
@@ -464,7 +461,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
if (result & NVME_CONNECT_AUTHREQ_ASCR) {
dev_warn(ctrl->device,
"qid 0: secure concatenation is not supported\n");
- ret = NVME_SC_AUTH_REQUIRED;
+ ret = -EOPNOTSUPP;
goto out_free_data;
}
/* Authentication required */
@@ -472,14 +469,14 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
if (ret) {
dev_warn(ctrl->device,
"qid 0: authentication setup failed\n");
- ret = NVME_SC_AUTH_REQUIRED;
goto out_free_data;
}
ret = nvme_auth_wait(ctrl, 0);
- if (ret)
+ if (ret) {
dev_warn(ctrl->device,
- "qid 0: authentication failed\n");
- else
+ "qid 0: authentication failed, error %d\n",
+ ret);
+ } else
dev_info(ctrl->device,
"qid 0: authenticated\n");
}
@@ -524,11 +521,14 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
return -ENOMEM;
ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res,
- data, sizeof(*data), qid, 1,
- BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
+ data, sizeof(*data), qid,
+ NVME_SUBMIT_AT_HEAD |
+ NVME_SUBMIT_RESERVED |
+ NVME_SUBMIT_NOWAIT);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
&cmd, data);
+ goto out_free_data;
}
result = le32_to_cpu(res.u32);
if (result & (NVME_CONNECT_AUTHREQ_ATR | NVME_CONNECT_AUTHREQ_ASCR)) {
@@ -536,7 +536,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
if (result & NVME_CONNECT_AUTHREQ_ASCR) {
dev_warn(ctrl->device,
"qid 0: secure concatenation is not supported\n");
- ret = NVME_SC_AUTH_REQUIRED;
+ ret = -EOPNOTSUPP;
goto out_free_data;
}
/* Authentication required */
@@ -544,12 +544,13 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
if (ret) {
dev_warn(ctrl->device,
"qid %d: authentication setup failed\n", qid);
- ret = NVME_SC_AUTH_REQUIRED;
- } else {
- ret = nvme_auth_wait(ctrl, qid);
- if (ret)
- dev_warn(ctrl->device,
- "qid %u: authentication failed\n", qid);
+ goto out_free_data;
+ }
+ ret = nvme_auth_wait(ctrl, qid);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "qid %u: authentication failed, error %d\n",
+ qid, ret);
}
}
out_free_data:
@@ -558,8 +559,26 @@ out_free_data:
}
EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
-bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
+/*
+ * Evaluate the status information returned by the transport in order to decided
+ * if a reconnect attempt should be scheduled.
+ *
+ * Do not retry when:
+ *
+ * - the DNR bit is set and the specification states no further connect
+ * attempts with the same set of paramenters should be attempted.
+ *
+ * - when the authentication attempt fails, because the key was invalid.
+ * This error code is set on the host side.
+ */
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl, int status)
{
+ if (status > 0 && (status & NVME_SC_DNR))
+ return false;
+
+ if (status == -EKEYREJECTED)
+ return false;
+
if (ctrl->opts->max_reconnects == -1 ||
ctrl->nr_reconnects < ctrl->opts->max_reconnects)
return true;
@@ -622,6 +641,23 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
return NULL;
}
+static struct key *nvmf_parse_key(int key_id)
+{
+ struct key *key;
+
+ if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) {
+ pr_err("TLS is not supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ key = key_lookup(key_id);
+ if (IS_ERR(key))
+ pr_err("key id %08x not found\n", key_id);
+ else
+ pr_debug("Using key id %08x\n", key_id);
+ return key;
+}
+
static const match_table_t opt_tokens = {
{ NVMF_OPT_TRANSPORT, "transport=%s" },
{ NVMF_OPT_TRADDR, "traddr=%s" },
@@ -643,10 +679,19 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
{ NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
{ NVMF_OPT_TOS, "tos=%d" },
+#ifdef CONFIG_NVME_TCP_TLS
+ { NVMF_OPT_KEYRING, "keyring=%d" },
+ { NVMF_OPT_TLS_KEY, "tls_key=%d" },
+#endif
{ NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" },
{ NVMF_OPT_DISCOVERY, "discovery" },
+#ifdef CONFIG_NVME_HOST_AUTH
{ NVMF_OPT_DHCHAP_SECRET, "dhchap_secret=%s" },
{ NVMF_OPT_DHCHAP_CTRL_SECRET, "dhchap_ctrl_secret=%s" },
+#endif
+#ifdef CONFIG_NVME_TCP_TLS
+ { NVMF_OPT_TLS, "tls" },
+#endif
{ NVMF_OPT_ERR, NULL }
};
@@ -657,9 +702,10 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
char *options, *o, *p;
int token, ret = 0;
size_t nqnlen = 0;
- int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO;
+ int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO, key_id;
uuid_t hostid;
char hostnqn[NVMF_NQN_SIZE];
+ struct key *key;
/* Set defaults */
opts->queue_size = NVMF_DEF_QUEUE_SIZE;
@@ -671,6 +717,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->hdr_digest = false;
opts->data_digest = false;
opts->tos = -1; /* < 0 == use transport default */
+ opts->tls = false;
+ opts->tls_key = NULL;
+ opts->keyring = NULL;
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
@@ -924,6 +973,32 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->tos = token;
break;
+ case NVMF_OPT_KEYRING:
+ if (match_int(args, &key_id) || key_id <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ key = nvmf_parse_key(key_id);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto out;
+ }
+ key_put(opts->keyring);
+ opts->keyring = key;
+ break;
+ case NVMF_OPT_TLS_KEY:
+ if (match_int(args, &key_id) || key_id <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ key = nvmf_parse_key(key_id);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto out;
+ }
+ key_put(opts->tls_key);
+ opts->tls_key = key;
+ break;
case NVMF_OPT_DISCOVERY:
opts->discovery_nqn = true;
break;
@@ -955,6 +1030,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
kfree(opts->dhchap_ctrl_secret);
opts->dhchap_ctrl_secret = p;
break;
+ case NVMF_OPT_TLS:
+ if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) {
+ pr_err("TLS is not supported\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->tls = true;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -1156,6 +1239,8 @@ static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts,
void nvmf_free_options(struct nvmf_ctrl_options *opts)
{
nvmf_host_put(opts->host);
+ key_put(opts->keyring);
+ key_put(opts->tls_key);
kfree(opts->transport);
kfree(opts->traddr);
kfree(opts->trsvcid);
@@ -1247,7 +1332,10 @@ out_free_opts:
return ERR_PTR(ret);
}
-static struct class *nvmf_class;
+static const struct class nvmf_class = {
+ .name = "nvme-fabrics",
+};
+
static struct device *nvmf_device;
static DEFINE_MUTEX(nvmf_dev_mutex);
@@ -1367,15 +1455,14 @@ static int __init nvmf_init(void)
if (!nvmf_default_host)
return -ENOMEM;
- nvmf_class = class_create("nvme-fabrics");
- if (IS_ERR(nvmf_class)) {
+ ret = class_register(&nvmf_class);
+ if (ret) {
pr_err("couldn't register class nvme-fabrics\n");
- ret = PTR_ERR(nvmf_class);
goto out_free_host;
}
nvmf_device =
- device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
+ device_create(&nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
if (IS_ERR(nvmf_device)) {
pr_err("couldn't create nvme-fabrics device!\n");
ret = PTR_ERR(nvmf_device);
@@ -1391,9 +1478,9 @@ static int __init nvmf_init(void)
return 0;
out_destroy_device:
- device_destroy(nvmf_class, MKDEV(0, 0));
+ device_destroy(&nvmf_class, MKDEV(0, 0));
out_destroy_class:
- class_destroy(nvmf_class);
+ class_unregister(&nvmf_class);
out_free_host:
nvmf_host_put(nvmf_default_host);
return ret;
@@ -1402,8 +1489,8 @@ out_free_host:
static void __exit nvmf_exit(void)
{
misc_deregister(&nvmf_misc);
- device_destroy(nvmf_class, MKDEV(0, 0));
- class_destroy(nvmf_class);
+ device_destroy(&nvmf_class, MKDEV(0, 0));
+ class_unregister(&nvmf_class);
nvmf_host_put(nvmf_default_host);
BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64);
@@ -1421,6 +1508,7 @@ static void __exit nvmf_exit(void)
}
MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("NVMe host fabrics library");
module_init(nvmf_init);
module_exit(nvmf_exit);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 82e7a27ffbde..602135910ae9 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -19,13 +19,6 @@
#define NVMF_DEF_FAIL_FAST_TMO -1
/*
- * Reserved one command for internal usage. This command is used for sending
- * the connect command, as well as for the keep alive command on the admin
- * queue once live.
- */
-#define NVMF_RESERVED_TAGS 1
-
-/*
* Define a host as seen by the target. We allocate one at boot, but also
* allow the override it when creating controllers. This is both to provide
* persistence of the Host NQN over multiple boots, and to allow using
@@ -70,6 +63,9 @@ enum {
NVMF_OPT_DISCOVERY = 1 << 22,
NVMF_OPT_DHCHAP_SECRET = 1 << 23,
NVMF_OPT_DHCHAP_CTRL_SECRET = 1 << 24,
+ NVMF_OPT_TLS = 1 << 25,
+ NVMF_OPT_KEYRING = 1 << 26,
+ NVMF_OPT_TLS_KEY = 1 << 27,
};
/**
@@ -102,6 +98,9 @@ enum {
* @dhchap_secret: DH-HMAC-CHAP secret
* @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional
* authentication
+ * @keyring: Keyring to use for key lookups
+ * @tls_key: TLS key for encrypted connections (TCP)
+ * @tls: Start TLS encrypted connections (TCP)
* @disable_sqflow: disable controller sq flow control
* @hdr_digest: generate/verify header digest (TCP)
* @data_digest: generate/verify data digest (TCP)
@@ -128,6 +127,9 @@ struct nvmf_ctrl_options {
struct nvmf_host *host;
char *dhchap_secret;
char *dhchap_ctrl_secret;
+ struct key *keyring;
+ struct key *tls_key;
+ bool tls;
bool disable_sqflow;
bool hdr_digest;
bool data_digest;
@@ -176,9 +178,11 @@ static inline bool
nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl,
struct nvmf_ctrl_options *opts)
{
- if (ctrl->state == NVME_CTRL_DELETING ||
- ctrl->state == NVME_CTRL_DELETING_NOIO ||
- ctrl->state == NVME_CTRL_DEAD ||
+ enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+
+ if (state == NVME_CTRL_DELETING ||
+ state == NVME_CTRL_DELETING_NOIO ||
+ state == NVME_CTRL_DEAD ||
strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) ||
strcmp(opts->host->nqn, ctrl->opts->host->nqn) ||
!uuid_equal(&opts->host->id, &ctrl->opts->host->id))
@@ -219,7 +223,7 @@ int nvmf_register_transport(struct nvmf_transport_ops *ops);
void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
void nvmf_free_options(struct nvmf_ctrl_options *opts);
int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
-bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl, int status);
bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
struct nvmf_ctrl_options *opts);
void nvmf_set_io_queues(struct nvmf_ctrl_options *opts, u32 nr_io_queues,
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 1cd2bf82319a..f0b081332749 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -221,11 +221,6 @@ static LIST_HEAD(nvme_fc_lport_list);
static DEFINE_IDA(nvme_fc_local_port_cnt);
static DEFINE_IDA(nvme_fc_ctrl_cnt);
-static struct workqueue_struct *nvme_fc_wq;
-
-static bool nvme_fc_waiting_to_unload;
-static DECLARE_COMPLETION(nvme_fc_unload_proceed);
-
/*
* These items are short-term. They will eventually be moved into
* a generic FC class. See comments in module init.
@@ -255,8 +250,6 @@ nvme_fc_free_lport(struct kref *ref)
/* remove from transport list */
spin_lock_irqsave(&nvme_fc_lock, flags);
list_del(&lport->port_list);
- if (nvme_fc_waiting_to_unload && list_empty(&nvme_fc_lport_list))
- complete(&nvme_fc_unload_proceed);
spin_unlock_irqrestore(&nvme_fc_lock, flags);
ida_free(&nvme_fc_local_port_cnt, lport->localport.port_num);
@@ -557,7 +550,7 @@ nvme_fc_rport_get(struct nvme_fc_rport *rport)
static void
nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl)
{
- switch (ctrl->ctrl.state) {
+ switch (nvme_ctrl_state(&ctrl->ctrl)) {
case NVME_CTRL_NEW:
case NVME_CTRL_CONNECTING:
/*
@@ -793,7 +786,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
"NVME-FC{%d}: controller connectivity lost. Awaiting "
"Reconnect", ctrl->cnum);
- switch (ctrl->ctrl.state) {
+ switch (nvme_ctrl_state(&ctrl->ctrl)) {
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
/*
@@ -1218,10 +1211,10 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
/* Linux supports only Dynamic controllers */
assoc_rqst->assoc_cmd.cntlid = cpu_to_be16(0xffff);
uuid_copy(&assoc_rqst->assoc_cmd.hostid, &ctrl->ctrl.opts->host->id);
- strncpy(assoc_rqst->assoc_cmd.hostnqn, ctrl->ctrl.opts->host->nqn,
- min(FCNVME_ASSOC_HOSTNQN_LEN, NVMF_NQN_SIZE));
- strncpy(assoc_rqst->assoc_cmd.subnqn, ctrl->ctrl.opts->subsysnqn,
- min(FCNVME_ASSOC_SUBNQN_LEN, NVMF_NQN_SIZE));
+ strscpy(assoc_rqst->assoc_cmd.hostnqn, ctrl->ctrl.opts->host->nqn,
+ sizeof(assoc_rqst->assoc_cmd.hostnqn));
+ strscpy(assoc_rqst->assoc_cmd.subnqn, ctrl->ctrl.opts->subsysnqn,
+ sizeof(assoc_rqst->assoc_cmd.subnqn));
lsop->queue = queue;
lsreq->rqstaddr = assoc_rqst;
@@ -1924,7 +1917,7 @@ char *nvme_fc_io_getuuid(struct nvmefc_fcp_req *req)
struct nvme_fc_fcp_op *op = fcp_req_to_fcp_op(req);
struct request *rq = op->rq;
- if (!IS_ENABLED(CONFIG_BLK_CGROUP_FC_APPID) || !rq->bio)
+ if (!IS_ENABLED(CONFIG_BLK_CGROUP_FC_APPID) || !rq || !rq->bio)
return NULL;
return blkcg_get_fc_appid(rq->bio);
}
@@ -2435,7 +2428,7 @@ nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl)
* controller. Called after last nvme_put_ctrl() call
*/
static void
-nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl)
+nvme_fc_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
@@ -2548,24 +2541,17 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
* the controller. Abort any ios on the association and let the
* create_association error path resolve things.
*/
- enum nvme_ctrl_state state;
- unsigned long flags;
-
- spin_lock_irqsave(&ctrl->lock, flags);
- state = ctrl->ctrl.state;
- if (state == NVME_CTRL_CONNECTING) {
- set_bit(ASSOC_FAILED, &ctrl->flags);
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) {
__nvme_fc_abort_outstanding_ios(ctrl, true);
+ set_bit(ASSOC_FAILED, &ctrl->flags);
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: transport error during (re)connect\n",
ctrl->cnum);
return;
}
- spin_unlock_irqrestore(&ctrl->lock, flags);
/* Otherwise, only proceed if in LIVE state - e.g. on first error */
- if (state != NVME_CTRL_LIVE)
+ if (ctrl->ctrl.state != NVME_CTRL_LIVE)
return;
dev_warn(ctrl->ctrl.device,
@@ -2581,6 +2567,7 @@ static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
{
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
struct nvme_fc_ctrl *ctrl = op->ctrl;
+ u16 qnum = op->queue->qnum;
struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
struct nvme_command *sqe = &cmdiu->sqe;
@@ -2589,10 +2576,11 @@ static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
* will detect the aborted io and will fail the connection.
*/
dev_info(ctrl->ctrl.device,
- "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d w10/11: "
+ "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d (%s) w10/11: "
"x%08x/x%08x\n",
- ctrl->cnum, op->queue->qnum, sqe->common.opcode,
- sqe->connect.fctype, sqe->common.cdw10, sqe->common.cdw11);
+ ctrl->cnum, qnum, sqe->common.opcode, sqe->fabrics.fctype,
+ nvme_fabrics_opcode_str(qnum, sqe),
+ sqe->common.cdw10, sqe->common.cdw11);
if (__nvme_fc_abort_op(ctrl, op))
nvme_fc_error_recovery(ctrl, "io timeout abort failed");
@@ -3132,11 +3120,12 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
nvme_unquiesce_admin_queue(&ctrl->ctrl);
ret = nvme_init_ctrl_finish(&ctrl->ctrl, false);
- if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags))
- ret = -EIO;
if (ret)
goto out_disconnect_admin_queue;
-
+ if (test_bit(ASSOC_FAILED, &ctrl->flags)) {
+ ret = -EIO;
+ goto out_stop_keep_alive;
+ }
/* sanity checks */
/* FC-NVME does not have other data in the capsule */
@@ -3144,7 +3133,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
ctrl->ctrl.icdoff);
ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
- goto out_disconnect_admin_queue;
+ goto out_stop_keep_alive;
}
/* FC-NVME supports normal SGL Data Block Descriptors */
@@ -3152,7 +3141,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
dev_err(ctrl->ctrl.device,
"Mandatory sgls are not supported!\n");
ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
- goto out_disconnect_admin_queue;
+ goto out_stop_keep_alive;
}
if (opts->queue_size > ctrl->ctrl.maxcmd) {
@@ -3179,16 +3168,12 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
else
ret = nvme_fc_recreate_io_queues(ctrl);
}
-
- spin_lock_irqsave(&ctrl->lock, flags);
if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags))
ret = -EIO;
- if (ret) {
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ if (ret)
goto out_term_aen_ops;
- }
+
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- spin_unlock_irqrestore(&ctrl->lock, flags);
ctrl->ctrl.nr_reconnects = 0;
@@ -3199,6 +3184,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
out_term_aen_ops:
nvme_fc_term_aen_ops(ctrl);
+out_stop_keep_alive:
+ nvme_stop_keep_alive(&ctrl->ctrl);
out_disconnect_admin_queue:
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: create_assoc failed, assoc_id %llx ret %d\n",
@@ -3316,19 +3303,17 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ;
bool recon = true;
- if (ctrl->ctrl.state != NVME_CTRL_CONNECTING)
+ if (nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_CONNECTING)
return;
if (portptr->port_state == FC_OBJSTATE_ONLINE) {
dev_info(ctrl->ctrl.device,
"NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
ctrl->cnum, status);
- if (status > 0 && (status & NVME_SC_DNR))
- recon = false;
} else if (time_after_eq(jiffies, rport->dev_loss_end))
recon = false;
- if (recon && nvmf_should_reconnect(&ctrl->ctrl)) {
+ if (recon && nvmf_should_reconnect(&ctrl->ctrl, status)) {
if (portptr->port_state == FC_OBJSTATE_ONLINE)
dev_info(ctrl->ctrl.device,
"NVME-FC{%d}: Reconnect attempt in %ld "
@@ -3397,7 +3382,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
.reg_read32 = nvmf_reg_read32,
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
- .free_ctrl = nvme_fc_nvme_ctrl_freed,
+ .free_ctrl = nvme_fc_free_ctrl,
.submit_async_event = nvme_fc_submit_async_event,
.delete_ctrl = nvme_fc_delete_ctrl,
.get_address = nvmf_get_address,
@@ -3506,10 +3491,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->ctrl.opts = opts;
ctrl->ctrl.nr_reconnects = 0;
- if (lport->dev)
- ctrl->ctrl.numa_node = dev_to_node(lport->dev);
- else
- ctrl->ctrl.numa_node = NUMA_NO_NODE;
INIT_LIST_HEAD(&ctrl->ctrl_list);
ctrl->lport = lport;
ctrl->rport = rport;
@@ -3554,6 +3535,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0);
if (ret)
goto out_free_queues;
+ if (lport->dev)
+ ctrl->ctrl.numa_node = dev_to_node(lport->dev);
/* at this point, teardown path changes to ref counting on nvme ctrl */
@@ -3585,8 +3568,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
flush_delayed_work(&ctrl->connect_work);
dev_info(ctrl->ctrl.device,
- "NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
- ctrl->cnum, nvmf_ctrl_subsysnqn(&ctrl->ctrl));
+ "NVME-FC{%d}: new ctrl: NQN \"%s\", hostnqn: %s\n",
+ ctrl->cnum, nvmf_ctrl_subsysnqn(&ctrl->ctrl), opts->host->nqn);
return &ctrl->ctrl;
@@ -3904,10 +3887,6 @@ static int __init nvme_fc_init_module(void)
{
int ret;
- nvme_fc_wq = alloc_workqueue("nvme_fc_wq", WQ_MEM_RECLAIM, 0);
- if (!nvme_fc_wq)
- return -ENOMEM;
-
/*
* NOTE:
* It is expected that in the future the kernel will combine
@@ -3925,7 +3904,7 @@ static int __init nvme_fc_init_module(void)
ret = class_register(&fc_class);
if (ret) {
pr_err("couldn't register class fc\n");
- goto out_destroy_wq;
+ return ret;
}
/*
@@ -3949,8 +3928,6 @@ out_destroy_device:
device_destroy(&fc_class, MKDEV(0, 0));
out_destroy_class:
class_unregister(&fc_class);
-out_destroy_wq:
- destroy_workqueue(nvme_fc_wq);
return ret;
}
@@ -3970,48 +3947,27 @@ nvme_fc_delete_controllers(struct nvme_fc_rport *rport)
spin_unlock(&rport->lock);
}
-static void
-nvme_fc_cleanup_for_unload(void)
+static void __exit nvme_fc_exit_module(void)
{
struct nvme_fc_lport *lport;
struct nvme_fc_rport *rport;
-
- list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
- list_for_each_entry(rport, &lport->endp_list, endp_list) {
- nvme_fc_delete_controllers(rport);
- }
- }
-}
-
-static void __exit nvme_fc_exit_module(void)
-{
unsigned long flags;
- bool need_cleanup = false;
spin_lock_irqsave(&nvme_fc_lock, flags);
- nvme_fc_waiting_to_unload = true;
- if (!list_empty(&nvme_fc_lport_list)) {
- need_cleanup = true;
- nvme_fc_cleanup_for_unload();
- }
+ list_for_each_entry(lport, &nvme_fc_lport_list, port_list)
+ list_for_each_entry(rport, &lport->endp_list, endp_list)
+ nvme_fc_delete_controllers(rport);
spin_unlock_irqrestore(&nvme_fc_lock, flags);
- if (need_cleanup) {
- pr_info("%s: waiting for ctlr deletes\n", __func__);
- wait_for_completion(&nvme_fc_unload_proceed);
- pr_info("%s: ctrl deletes complete\n", __func__);
- }
+ flush_workqueue(nvme_delete_wq);
nvmf_unregister_transport(&nvme_fc_transport);
- ida_destroy(&nvme_fc_local_port_cnt);
- ida_destroy(&nvme_fc_ctrl_cnt);
-
device_destroy(&fc_class, MKDEV(0, 0));
class_unregister(&fc_class);
- destroy_workqueue(nvme_fc_wq);
}
module_init(nvme_fc_init_module);
module_exit(nvme_fc_exit_module);
+MODULE_DESCRIPTION("NVMe host FC transport driver");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
index 316f3e4ca7cc..8df73a0b3980 100644
--- a/drivers/nvme/host/hwmon.c
+++ b/drivers/nvme/host/hwmon.c
@@ -187,7 +187,7 @@ static umode_t nvme_hwmon_is_visible(const void *_data,
return 0;
}
-static const struct hwmon_channel_info *nvme_hwmon_info[] = {
+static const struct hwmon_channel_info *const nvme_hwmon_info[] = {
HWMON_CHANNEL_INFO(chip, HWMON_C_REGISTER_TZ),
HWMON_CHANNEL_INFO(temp,
HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN |
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d39f3219358b..8b69427a4476 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -5,7 +5,7 @@
*/
#include <linux/ptrace.h> /* for force_successful_syscall_return */
#include <linux/nvme_ioctl.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include "nvme.h"
enum {
@@ -18,15 +18,12 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
{
u32 effects;
- if (capable(CAP_SYS_ADMIN))
- return true;
-
/*
* Do not allow unprivileged passthrough on partitions, as that allows an
* escape from the containment of the partition.
*/
if (flags & NVME_IOCTL_PARTITION)
- return false;
+ goto admin;
/*
* Do not allow unprivileged processes to send vendor specific or fabrics
@@ -34,7 +31,7 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
*/
if (c->common.opcode >= nvme_cmd_vendor_start ||
c->common.opcode == nvme_fabrics_command)
- return false;
+ goto admin;
/*
* Do not allow unprivileged passthrough of admin commands except
@@ -53,7 +50,7 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
return true;
}
}
- return false;
+ goto admin;
}
/*
@@ -63,7 +60,7 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
*/
effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode);
if (!(effects & NVME_CMD_EFFECTS_CSUPP))
- return false;
+ goto admin;
/*
* Don't allow passthrough for command that have intrusive (or unknown)
@@ -72,16 +69,20 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
NVME_CMD_EFFECTS_UUID_SEL |
NVME_CMD_EFFECTS_SCOPE_MASK))
- return false;
+ goto admin;
/*
* Only allow I/O commands that transfer data to the controller or that
* change the logical block contents if the file descriptor is open for
* writing.
*/
- if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC))
- return open_for_write;
+ if ((nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) &&
+ !open_for_write)
+ goto admin;
+
return true;
+admin:
+ return capable(CAP_SYS_ADMIN);
}
/*
@@ -96,55 +97,6 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
return (void __user *)ptrval;
}
-static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
- unsigned len, u32 seed)
-{
- struct bio_integrity_payload *bip;
- int ret = -ENOMEM;
- void *buf;
- struct bio *bio = req->bio;
-
- buf = kmalloc(len, GFP_KERNEL);
- if (!buf)
- goto out;
-
- ret = -EFAULT;
- if ((req_op(req) == REQ_OP_DRV_OUT) && copy_from_user(buf, ubuf, len))
- goto out_free_meta;
-
- bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
- if (IS_ERR(bip)) {
- ret = PTR_ERR(bip);
- goto out_free_meta;
- }
-
- bip->bip_iter.bi_size = len;
- bip->bip_iter.bi_sector = seed;
- ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
- offset_in_page(buf));
- if (ret != len) {
- ret = -ENOMEM;
- goto out_free_meta;
- }
-
- req->cmd_flags |= REQ_INTEGRITY;
- return buf;
-out_free_meta:
- kfree(buf);
-out:
- return ERR_PTR(ret);
-}
-
-static int nvme_finish_user_metadata(struct request *req, void __user *ubuf,
- void *meta, unsigned len, int ret)
-{
- if (!ret && req_op(req) == REQ_OP_DRV_IN &&
- copy_to_user(ubuf, meta, len))
- ret = -EFAULT;
- kfree(meta);
- return ret;
-}
-
static struct request *nvme_alloc_user_request(struct request_queue *q,
struct nvme_command *cmd, blk_opf_t rq_flags,
blk_mq_req_flags_t blk_flags)
@@ -159,16 +111,21 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
return req;
}
+static void nvme_unmap_bio(struct bio *bio)
+{
+ if (bio_integrity(bio))
+ bio_integrity_unmap_free_user(bio);
+ blk_rq_unmap_user(bio);
+}
+
static int nvme_map_user_request(struct request *req, u64 ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
- u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd,
- unsigned int flags)
+ u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags)
{
struct request_queue *q = req->q;
struct nvme_ns *ns = q->queuedata;
struct block_device *bdev = ns ? ns->disk->part0 : NULL;
struct bio *bio = NULL;
- void *meta = NULL;
int ret;
if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
@@ -190,25 +147,24 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
if (ret)
goto out;
+
bio = req->bio;
- if (bdev)
+ if (bdev) {
bio_set_dev(bio, bdev);
-
- if (bdev && meta_buffer && meta_len) {
- meta = nvme_add_user_metadata(req, meta_buffer, meta_len,
- meta_seed);
- if (IS_ERR(meta)) {
- ret = PTR_ERR(meta);
- goto out_unmap;
+ if (meta_buffer && meta_len) {
+ ret = bio_integrity_map_user(bio, meta_buffer, meta_len,
+ meta_seed);
+ if (ret)
+ goto out_unmap;
+ req->cmd_flags |= REQ_INTEGRITY;
}
- *metap = meta;
}
return ret;
out_unmap:
if (bio)
- blk_rq_unmap_user(bio);
+ nvme_unmap_bio(bio);
out:
blk_mq_free_request(req);
return ret;
@@ -222,7 +178,6 @@ static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_ns *ns = q->queuedata;
struct nvme_ctrl *ctrl;
struct request *req;
- void *meta = NULL;
struct bio *bio;
u32 effects;
int ret;
@@ -234,7 +189,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
req->timeout = timeout;
if (ubuffer && bufflen) {
ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
- meta_len, meta_seed, &meta, NULL, flags);
+ meta_len, meta_seed, NULL, flags);
if (ret)
return ret;
}
@@ -246,11 +201,8 @@ static int nvme_submit_user_cmd(struct request_queue *q,
ret = nvme_execute_rq(req, false);
if (result)
*result = le64_to_cpu(nvme_req(req)->result.u64);
- if (meta)
- ret = nvme_finish_user_metadata(req, meta_buffer, meta,
- meta_len, ret);
if (bio)
- blk_rq_unmap_user(bio);
+ nvme_unmap_bio(bio);
blk_mq_free_request(req);
if (effects)
@@ -280,10 +232,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
return -EINVAL;
}
- length = (io.nblocks + 1) << ns->lba_shift;
+ length = (io.nblocks + 1) << ns->head->lba_shift;
if ((io.control & NVME_RW_PRINFO_PRACT) &&
- ns->ms == sizeof(struct t10_pi_tuple)) {
+ (ns->head->ms == ns->head->pi_size)) {
/*
* Protection information is stripped/inserted by the
* controller.
@@ -293,11 +245,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
meta_len = 0;
metadata = NULL;
} else {
- meta_len = (io.nblocks + 1) * ns->ms;
+ meta_len = (io.nblocks + 1) * ns->head->ms;
metadata = nvme_to_user_ptr(io.metadata);
}
- if (ns->features & NVME_NS_EXT_LBAS) {
+ if (ns->head->features & NVME_NS_EXT_LBAS) {
length += meta_len;
meta_len = 0;
} else if (meta_len) {
@@ -443,19 +395,10 @@ struct nvme_uring_data {
* Expect build errors if this grows larger than that.
*/
struct nvme_uring_cmd_pdu {
- union {
- struct bio *bio;
- struct request *req;
- };
- u32 meta_len;
- u32 nvme_status;
- union {
- struct {
- void *meta; /* kernel-resident buffer */
- void __user *meta_buffer;
- };
- u64 result;
- } u;
+ struct request *req;
+ struct bio *bio;
+ u64 result;
+ int status;
};
static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
@@ -464,40 +407,14 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
}
-static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd,
- unsigned issue_flags)
-{
- struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
- struct request *req = pdu->req;
- int status;
- u64 result;
-
- if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
- status = -EINTR;
- else
- status = nvme_req(req)->status;
-
- result = le64_to_cpu(nvme_req(req)->result.u64);
-
- if (pdu->meta_len)
- status = nvme_finish_user_metadata(req, pdu->u.meta_buffer,
- pdu->u.meta, pdu->meta_len, status);
- if (req->bio)
- blk_rq_unmap_user(req->bio);
- blk_mq_free_request(req);
-
- io_uring_cmd_done(ioucmd, status, result, issue_flags);
-}
-
static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
unsigned issue_flags)
{
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
if (pdu->bio)
- blk_rq_unmap_user(pdu->bio);
-
- io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result, issue_flags);
+ nvme_unmap_bio(pdu->bio);
+ io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags);
}
static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
@@ -506,20 +423,24 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
struct io_uring_cmd *ioucmd = req->end_io_data;
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
- req->bio = pdu->bio;
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
- pdu->nvme_status = -EINTR;
+ pdu->status = -EINTR;
else
- pdu->nvme_status = nvme_req(req)->status;
- pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64);
+ pdu->status = nvme_req(req)->status;
+ pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
/*
- * For iopoll, complete it directly.
+ * For iopoll, complete it directly. Note that using the uring_cmd
+ * helper for this is safe only because we check blk_rq_is_poll().
+ * As that returns false if we're NOT on a polled queue, then it's
+ * safe to use the polled completion helper.
+ *
* Otherwise, move the completion to task work.
*/
if (blk_rq_is_poll(req)) {
- WRITE_ONCE(ioucmd->cookie, NULL);
- nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED);
+ if (pdu->bio)
+ nvme_unmap_bio(pdu->bio);
+ io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
} else {
io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
}
@@ -527,29 +448,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
return RQ_END_IO_FREE;
}
-static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
- blk_status_t err)
-{
- struct io_uring_cmd *ioucmd = req->end_io_data;
- struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-
- req->bio = pdu->bio;
- pdu->req = req;
-
- /*
- * For iopoll, complete it directly.
- * Otherwise, move the completion to task work.
- */
- if (blk_rq_is_poll(req)) {
- WRITE_ONCE(ioucmd->cookie, NULL);
- nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED);
- } else {
- io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb);
- }
-
- return RQ_END_IO_NONE;
-}
-
static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
{
@@ -561,7 +459,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct request *req;
blk_opf_t rq_flags = REQ_ALLOC_CACHE;
blk_mq_req_flags_t blk_flags = 0;
- void *meta = NULL;
int ret;
c.common.opcode = READ_ONCE(cmd->opcode);
@@ -609,27 +506,16 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (d.addr && d.data_len) {
ret = nvme_map_user_request(req, d.addr,
d.data_len, nvme_to_user_ptr(d.metadata),
- d.metadata_len, 0, &meta, ioucmd, vec);
+ d.metadata_len, 0, ioucmd, vec);
if (ret)
return ret;
}
- if (blk_rq_is_poll(req)) {
- ioucmd->flags |= IORING_URING_CMD_POLLED;
- WRITE_ONCE(ioucmd->cookie, req);
- }
-
/* to free bio on completion, as req->bio will be null at that time */
pdu->bio = req->bio;
- pdu->meta_len = d.metadata_len;
+ pdu->req = req;
req->end_io_data = ioucmd;
- if (pdu->meta_len) {
- pdu->u.meta = meta;
- pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata);
- req->end_io = nvme_uring_cmd_end_io_meta;
- } else {
- req->end_io = nvme_uring_cmd_end_io;
- }
+ req->end_io = nvme_uring_cmd_end_io;
blk_execute_rq_nowait(req, false);
return -EIOCBQUEUED;
}
@@ -780,16 +666,12 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
struct io_comp_batch *iob,
unsigned int poll_flags)
{
- struct request *req;
- int ret = 0;
-
- if (!(ioucmd->flags & IORING_URING_CMD_POLLED))
- return 0;
+ struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+ struct request *req = pdu->req;
- req = READ_ONCE(ioucmd->cookie);
if (req && blk_rq_is_poll(req))
- ret = blk_rq_poll(req, iob, poll_flags);
- return ret;
+ return blk_rq_poll(req, iob, poll_flags);
+ return 0;
}
#ifdef CONFIG_NVME_MULTIPATH
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
@@ -914,15 +796,15 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
bool open_for_write)
{
struct nvme_ns *ns;
- int ret;
+ int ret, srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
if (list_empty(&ctrl->namespaces)) {
ret = -ENOTTY;
goto out_unlock;
}
- ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
+ ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
dev_warn(ctrl->device,
"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
@@ -932,15 +814,18 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
dev_warn(ctrl->device,
"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
- kref_get(&ns->kref);
- up_read(&ctrl->namespaces_rwsem);
+ if (!nvme_get_ns(ns)) {
+ ret = -ENXIO;
+ goto out_unlock;
+ }
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
nvme_put_ns(ns);
return ret;
out_unlock:
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
return ret;
}
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 0a88d7bdc5e3..d8b6b4648eaf 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req)
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
- blk_mq_end_request(req, 0);
+ nvme_req(req)->status = 0;
+ nvme_end_req(req);
kblockd_schedule_work(&ns->head->requeue_work);
}
@@ -150,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq)
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list) {
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
if (!ns->head->disk)
continue;
kblockd_schedule_work(&ns->head->requeue_work);
- if (ctrl->state == NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
disk_uevent(ns->head->disk, KOBJ_CHANGE);
}
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
static const char *nvme_ana_state_names[] = {
@@ -193,13 +195,14 @@ out:
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list) {
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
nvme_mpath_clear_current_path(ns);
kblockd_schedule_work(&ns->head->requeue_work);
}
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -223,13 +226,14 @@ void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
+ enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);
+
/*
* We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
* still be able to complete assuming that the controller is connected.
* Otherwise it will fail immediately and return to the requeue list.
*/
- if (ns->ctrl->state != NVME_CTRL_LIVE &&
- ns->ctrl->state != NVME_CTRL_DELETING)
+ if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
return true;
if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
!test_bit(NVME_NS_READY, &ns->flags))
@@ -246,7 +250,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
if (nvme_path_is_disabled(ns))
continue;
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+ if (ns->ctrl->numa_node != NUMA_NO_NODE &&
+ READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
distance = node_distance(node, ns->ctrl->numa_node);
else
distance = LOCAL_DISTANCE;
@@ -331,7 +336,7 @@ out:
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
- return ns->ctrl->state == NVME_CTRL_LIVE &&
+ return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
ns->ana_state == NVME_ANA_OPTIMIZED;
}
@@ -358,7 +363,7 @@ static bool nvme_available_path(struct nvme_ns_head *head)
list_for_each_entry_rcu(ns, &head->list, siblings) {
if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
continue;
- switch (ns->ctrl->state) {
+ switch (nvme_ctrl_state(ns->ctrl)) {
case NVME_CTRL_LIVE:
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
@@ -515,6 +520,7 @@ static void nvme_requeue_work(struct work_struct *work)
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
+ struct queue_limits lim;
bool vwc = false;
mutex_init(&head->lock);
@@ -531,9 +537,14 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
!nvme_is_unique_nsid(ctrl, head) || !multipath)
return 0;
- head->disk = blk_alloc_disk(ctrl->numa_node);
- if (!head->disk)
- return -ENOMEM;
+ blk_set_stacking_limits(&lim);
+ lim.dma_alignment = 3;
+ if (head->ids.csi != NVME_CSI_ZNS)
+ lim.max_zone_append_sectors = 0;
+
+ head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
+ if (IS_ERR(head->disk))
+ return PTR_ERR(head->disk);
head->disk->fops = &nvme_ns_head_ops;
head->disk->private_data = head;
sprintf(head->disk->disk_name, "nvme%dn%d",
@@ -552,11 +563,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
- /* set to a default value of 512 until the disk is validated */
- blk_queue_logical_block_size(head->disk->queue, 512);
- blk_set_stacking_limits(&head->disk->queue->limits);
- blk_queue_dma_alignment(head->disk->queue, 3);
-
/* we need to propagate up the VMC settings */
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
@@ -579,7 +585,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
*/
if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
rc = device_add_disk(&head->subsys->dev, head->disk,
- nvme_ns_id_attr_groups);
+ nvme_ns_attr_groups);
if (rc) {
clear_bit(NVME_NSHEAD_DISK_LIVE, &ns->flags);
return;
@@ -592,7 +598,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
int node, srcu_idx;
srcu_idx = srcu_read_lock(&head->srcu);
- for_each_node(node)
+ for_each_online_node(node)
__nvme_find_path(head, node);
srcu_read_unlock(&head->srcu, srcu_idx);
}
@@ -667,7 +673,7 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
* controller is ready.
*/
if (nvme_state_is_live(ns->ana_state) &&
- ns->ctrl->state == NVME_CTRL_LIVE)
+ nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
nvme_mpath_set_live(ns);
}
@@ -677,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
unsigned *nr_change_groups = data;
struct nvme_ns *ns;
+ int srcu_idx;
dev_dbg(ctrl->device, "ANA group %d: %s.\n",
le32_to_cpu(desc->grpid),
@@ -688,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
if (!nr_nsids)
return 0;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list) {
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
unsigned nsid;
again:
nsid = le32_to_cpu(desc->nsids[n]);
@@ -702,7 +709,7 @@ again:
if (ns->head->ns_id > nsid)
goto again;
}
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
return 0;
}
@@ -748,7 +755,7 @@ static void nvme_ana_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
- if (ctrl->state != NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
return;
nvme_read_ana_log(ctrl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index f35647c470af..f3a41133ac3f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,7 @@
#include <linux/rcupdate.h>
#include <linux/wait.h>
#include <linux/t10-pi.h>
+#include <linux/ratelimit_types.h>
#include <trace/events/block.h>
@@ -156,6 +157,16 @@ enum nvme_quirks {
* No temperature thresholds for channels other than 0 (Composite).
*/
NVME_QUIRK_NO_SECONDARY_TEMP_THRESH = (1 << 19),
+
+ /*
+ * Disables simple suspend/resume path.
+ */
+ NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND = (1 << 20),
+
+ /*
+ * MSI (but not MSI-X) interrupts are broken and never fire.
+ */
+ NVME_QUIRK_BROKEN_MSI = (1 << 21),
};
/*
@@ -251,11 +262,13 @@ enum nvme_ctrl_flags {
NVME_CTRL_STOPPED = 3,
NVME_CTRL_SKIP_ID_CNS_CS = 4,
NVME_CTRL_DIRTY_CAPABILITY = 5,
+ NVME_CTRL_FROZEN = 6,
};
struct nvme_ctrl {
bool comp_seen;
bool identified;
+ bool passthru_err_log_enabled;
enum nvme_ctrl_state state;
spinlock_t lock;
struct mutex scan_lock;
@@ -269,7 +282,8 @@ struct nvme_ctrl {
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
- struct rw_semaphore namespaces_rwsem;
+ struct mutex namespaces_lock;
+ struct srcu_struct srcu;
struct device ctrl_device;
struct device *device; /* char device */
#ifdef CONFIG_NVME_HWMON
@@ -296,14 +310,13 @@ struct nvme_ctrl {
u32 max_hw_sectors;
u32 max_segments;
u32 max_integrity_segments;
- u32 max_discard_sectors;
- u32 max_discard_segments;
u32 max_zeroes_sectors;
#ifdef CONFIG_BLK_DEV_ZONED
u32 max_zone_append;
#endif
u16 crdt[3];
u16 oncs;
+ u8 dmrl;
u32 dmrsl;
u16 oacs;
u16 sqsize;
@@ -349,7 +362,7 @@ struct nvme_ctrl {
struct work_struct ana_work;
#endif
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
struct work_struct dhchap_auth_work;
struct mutex dhchap_auth_mutex;
struct nvme_dhchap_queue_context *dhchap_ctxs;
@@ -357,6 +370,7 @@ struct nvme_ctrl {
struct nvme_dhchap_key *ctrl_key;
u16 transaction;
#endif
+ struct key *tls_key;
/* Power saving configuration */
u64 ps_max_latency_us;
@@ -386,6 +400,11 @@ struct nvme_ctrl {
enum nvme_dctype dctype;
};
+static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
+{
+ return READ_ONCE(ctrl->state);
+}
+
enum nvme_iopolicy {
NVME_IOPOLICY_NUMA,
NVME_IOPOLICY_RR,
@@ -438,13 +457,27 @@ struct nvme_ns_head {
struct list_head list;
struct srcu_struct srcu;
struct nvme_subsystem *subsys;
- unsigned ns_id;
struct nvme_ns_ids ids;
struct list_head entry;
struct kref ref;
bool shared;
+ bool passthru_err_log_enabled;
int instance;
struct nvme_effects_log *effects;
+ u64 nuse;
+ unsigned ns_id;
+ int lba_shift;
+ u16 ms;
+ u16 pi_size;
+ u8 pi_type;
+ u8 pi_offset;
+ u8 guard_type;
+#ifdef CONFIG_BLK_DEV_ZONED
+ u64 zsze;
+#endif
+ unsigned long features;
+
+ struct ratelimit_state rs_nuse;
struct cdev cdev;
struct device cdev_device;
@@ -486,17 +519,6 @@ struct nvme_ns {
struct kref kref;
struct nvme_ns_head *head;
- int lba_shift;
- u16 ms;
- u16 pi_size;
- u16 sgs;
- u32 sws;
- u8 pi_type;
- u8 guard_type;
-#ifdef CONFIG_BLK_DEV_ZONED
- u64 zsze;
-#endif
- unsigned long features;
unsigned long flags;
#define NVME_NS_REMOVING 0
#define NVME_NS_ANA_PENDING 2
@@ -507,13 +529,12 @@ struct nvme_ns {
struct device cdev_device;
struct nvme_fault_inject fault_inject;
-
};
/* NVMe ns supports metadata actions by the controller (generate/strip) */
-static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
+static inline bool nvme_ns_has_pi(struct nvme_ns_head *head)
{
- return ns->pi_type && ns->ms == ns->pi_size;
+ return head->pi_type && head->ms == head->pi_size;
}
struct nvme_ctrl_ops {
@@ -645,17 +666,17 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
/*
* Convert a 512B sector number to a device logical block number.
*/
-static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
+static inline u64 nvme_sect_to_lba(struct nvme_ns_head *head, sector_t sector)
{
- return sector >> (ns->lba_shift - SECTOR_SHIFT);
+ return sector >> (head->lba_shift - SECTOR_SHIFT);
}
/*
* Convert a device logical block number to a 512B sector number.
*/
-static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
+static inline sector_t nvme_lba_to_sect(struct nvme_ns_head *head, u64 lba)
{
- return lba << (ns->lba_shift - SECTOR_SHIFT);
+ return lba << (head->lba_shift - SECTOR_SHIFT);
}
/*
@@ -724,6 +745,28 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH;
}
+/*
+ * Returns true for sink states that can't ever transition back to live.
+ */
+static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl)
+{
+ switch (nvme_ctrl_state(ctrl)) {
+ case NVME_CTRL_NEW:
+ case NVME_CTRL_LIVE:
+ case NVME_CTRL_RESETTING:
+ case NVME_CTRL_CONNECTING:
+ return false;
+ case NVME_CTRL_DELETING:
+ case NVME_CTRL_DELETING_NOIO:
+ case NVME_CTRL_DEAD:
+ return true;
+ default:
+ WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
+ return true;
+ }
+}
+
+void nvme_end_req(struct request *req);
void nvme_complete_rq(struct request *req);
void nvme_complete_batch_req(struct request *req);
@@ -790,17 +833,18 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req);
blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
struct request *req);
bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
- bool queue_live);
+ bool queue_live, enum nvme_ctrl_state state);
static inline bool nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
bool queue_live)
{
- if (likely(ctrl->state == NVME_CTRL_LIVE))
+ enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+
+ if (likely(state == NVME_CTRL_LIVE))
return true;
- if (ctrl->ops->flags & NVME_F_FABRICS &&
- ctrl->state == NVME_CTRL_DELETING)
+ if (ctrl->ops->flags & NVME_F_FABRICS && state == NVME_CTRL_DELETING)
return queue_live;
- return __nvme_check_ready(ctrl, rq, queue_live);
+ return __nvme_check_ready(ctrl, rq, queue_live, state);
}
/*
@@ -821,12 +865,27 @@ static inline bool nvme_is_unique_nsid(struct nvme_ctrl *ctrl,
(ctrl->ctratt & NVME_CTRL_CTRATT_NVM_SETS);
}
+/*
+ * Flags for __nvme_submit_sync_cmd()
+ */
+typedef __u32 __bitwise nvme_submit_flags_t;
+
+enum {
+ /* Insert request at the head of the queue */
+ NVME_SUBMIT_AT_HEAD = (__force nvme_submit_flags_t)(1 << 0),
+ /* Set BLK_MQ_REQ_NOWAIT when allocating request */
+ NVME_SUBMIT_NOWAIT = (__force nvme_submit_flags_t)(1 << 1),
+ /* Set BLK_MQ_REQ_RESERVED when allocating request */
+ NVME_SUBMIT_RESERVED = (__force nvme_submit_flags_t)(1 << 2),
+ /* Retry command when NVME_SC_DNR is not set in the result */
+ NVME_SUBMIT_RETRY = (__force nvme_submit_flags_t)(1 << 3),
+};
+
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buf, unsigned bufflen);
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
- int qid, int at_head,
- blk_mq_req_flags_t flags);
+ int qid, nvme_submit_flags_t flags);
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
u32 *result);
@@ -861,10 +920,12 @@ int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
+int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+ struct nvme_id_ns **id);
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
-extern const struct attribute_group *nvme_ns_id_attr_groups[];
+extern const struct attribute_group *nvme_ns_attr_groups[];
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
extern const struct attribute_group nvme_dev_attrs_group;
@@ -914,6 +975,10 @@ extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
+static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
+{
+ return disk->fops == &nvme_ns_head_ops;
+}
#else
#define multipath false
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
@@ -991,13 +1056,25 @@ static inline void nvme_mpath_start_request(struct request *rq)
static inline void nvme_mpath_end_request(struct request *rq)
{
}
+static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
+{
+ return false;
+}
#endif /* CONFIG_NVME_MULTIPATH */
-int nvme_revalidate_zones(struct nvme_ns *ns);
+struct nvme_zone_info {
+ u64 zone_size;
+ unsigned int max_open_zones;
+ unsigned int max_active_zones;
+};
+
int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
+int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf,
+ struct nvme_zone_info *zi);
+void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
+ struct nvme_zone_info *zi);
#ifdef CONFIG_BLK_DEV_ZONED
-int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd,
enum nvme_zone_mgmt_action action);
@@ -1008,18 +1085,14 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
{
return BLK_STS_NOTSUPP;
}
-
-static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
-{
- dev_warn(ns->ctrl->device,
- "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
- return -EPROTONOSUPPORT;
-}
#endif
static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
{
- return dev_to_disk(dev)->private_data;
+ struct gendisk *disk = dev_to_disk(dev);
+
+ WARN_ON(nvme_disk_is_ns_head(disk));
+ return disk->private_data;
}
#ifdef CONFIG_NVME_HWMON
@@ -1048,7 +1121,7 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
return ctrl->sgls & ((1 << 0) | (1 << 1));
}
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
int __init nvme_init_auth(void);
void __exit nvme_exit_auth(void);
int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
@@ -1075,7 +1148,7 @@ static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
}
static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
{
- return NVME_SC_AUTH_REQUIRED;
+ return -EPROTONOSUPPORT;
}
static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
#endif
@@ -1088,6 +1161,7 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
struct nvme_command *cmd, int status);
struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
+bool nvme_get_ns(struct nvme_ns *ns);
void nvme_put_ns(struct nvme_ns *ns);
static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)
@@ -1096,35 +1170,42 @@ static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)
}
#ifdef CONFIG_NVME_VERBOSE_ERRORS
-const unsigned char *nvme_get_error_status_str(u16 status);
-const unsigned char *nvme_get_opcode_str(u8 opcode);
-const unsigned char *nvme_get_admin_opcode_str(u8 opcode);
-const unsigned char *nvme_get_fabrics_opcode_str(u8 opcode);
+const char *nvme_get_error_status_str(u16 status);
+const char *nvme_get_opcode_str(u8 opcode);
+const char *nvme_get_admin_opcode_str(u8 opcode);
+const char *nvme_get_fabrics_opcode_str(u8 opcode);
#else /* CONFIG_NVME_VERBOSE_ERRORS */
-static inline const unsigned char *nvme_get_error_status_str(u16 status)
+static inline const char *nvme_get_error_status_str(u16 status)
{
return "I/O Error";
}
-static inline const unsigned char *nvme_get_opcode_str(u8 opcode)
+static inline const char *nvme_get_opcode_str(u8 opcode)
{
return "I/O Cmd";
}
-static inline const unsigned char *nvme_get_admin_opcode_str(u8 opcode)
+static inline const char *nvme_get_admin_opcode_str(u8 opcode)
{
return "Admin Cmd";
}
-static inline const unsigned char *nvme_get_fabrics_opcode_str(u8 opcode)
+static inline const char *nvme_get_fabrics_opcode_str(u8 opcode)
{
return "Fabrics Cmd";
}
#endif /* CONFIG_NVME_VERBOSE_ERRORS */
-static inline const unsigned char *nvme_opcode_str(int qid, u8 opcode, u8 fctype)
+static inline const char *nvme_opcode_str(int qid, u8 opcode)
{
- if (opcode == nvme_fabrics_command)
- return nvme_get_fabrics_opcode_str(fctype);
return qid ? nvme_get_opcode_str(opcode) :
nvme_get_admin_opcode_str(opcode);
}
+
+static inline const char *nvme_fabrics_opcode_str(
+ int qid, const struct nvme_command *cmd)
+{
+ if (nvme_is_fabrics(cmd))
+ return nvme_get_fabrics_opcode_str(cmd->fabrics.fctype);
+
+ return nvme_opcode_str(qid, cmd->common.opcode);
+}
#endif /* _NVME_H */
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2f57da12d983..102a9fb0c65f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -778,7 +778,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
- if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
+ if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
+ bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
return nvme_setup_prp_simple(dev, req,
&cmnd->rw, &bv);
@@ -924,7 +925,6 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
return false;
- req->mq_hctx->tags->rqs[req->tag] = req;
return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
}
@@ -1234,7 +1234,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
/* If there is a reset/reinit ongoing, we shouldn't reset again. */
- switch (dev->ctrl.state) {
+ switch (nvme_ctrl_state(&dev->ctrl)) {
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
return false;
@@ -1285,6 +1285,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
struct request *abort_req;
struct nvme_command cmd = { };
u32 csts = readl(dev->bar + NVME_REG_CSTS);
+ u8 opcode;
+
+ if (nvme_state_terminal(&dev->ctrl))
+ goto disable;
/* If PCI error recovery process is happening, we cannot reset or
* the recovery mechanism will surely fail.
@@ -1311,8 +1315,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) {
dev_warn(dev->ctrl.device,
- "I/O %d QID %d timeout, completion polled\n",
- req->tag, nvmeq->qid);
+ "I/O tag %d (%04x) QID %d timeout, completion polled\n",
+ req->tag, nvme_cid(req), nvmeq->qid);
return BLK_EH_DONE;
}
@@ -1322,14 +1326,14 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
* cancellation error. All outstanding requests are completed on
* shutdown, so we return BLK_EH_DONE.
*/
- switch (dev->ctrl.state) {
+ switch (nvme_ctrl_state(&dev->ctrl)) {
case NVME_CTRL_CONNECTING:
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
fallthrough;
case NVME_CTRL_DELETING:
dev_warn_ratelimited(dev->ctrl.device,
- "I/O %d QID %d timeout, disable controller\n",
- req->tag, nvmeq->qid);
+ "I/O tag %d (%04x) QID %d timeout, disable controller\n",
+ req->tag, nvme_cid(req), nvmeq->qid);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
nvme_dev_disable(dev, true);
return BLK_EH_DONE;
@@ -1344,10 +1348,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
* command was already aborted once before and still hasn't been
* returned to the driver, or if this is the admin queue.
*/
+ opcode = nvme_req(req)->cmd->common.opcode;
if (!nvmeq->qid || iod->aborted) {
dev_warn(dev->ctrl.device,
- "I/O %d QID %d timeout, reset controller\n",
- req->tag, nvmeq->qid);
+ "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
+ req->tag, nvme_cid(req), opcode,
+ nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
goto disable;
}
@@ -1363,10 +1369,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
dev_warn(nvmeq->dev->ctrl.device,
- "I/O %d (%s) QID %d timeout, aborting\n",
- req->tag,
- nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
- nvmeq->qid);
+ "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n",
+ req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode),
+ nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
+ blk_rq_bytes(req));
abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
BLK_MQ_REQ_NOWAIT);
@@ -1388,8 +1394,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
return BLK_EH_RESET_TIMER;
disable:
- if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+ if (nvme_state_terminal(&dev->ctrl))
+ nvme_dev_disable(dev, true);
return BLK_EH_DONE;
+ }
nvme_dev_disable(dev, false);
if (nvme_try_sched_reset(&dev->ctrl))
@@ -1594,7 +1603,7 @@ static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
/*
* Controller is in wrong state, fail early.
*/
- if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
+ if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_CONNECTING) {
mutex_unlock(&dev->shutdown_lock);
return -ENODEV;
}
@@ -2216,6 +2225,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
.priv = dev,
};
unsigned int irq_queues, poll_queues;
+ unsigned int flags = PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY;
/*
* Poll queues don't need interrupts, but we need at least one I/O queue
@@ -2239,8 +2249,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
irq_queues = 1;
if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
irq_queues += (nr_io_queues - poll_queues);
- return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
- PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+ if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI)
+ flags &= ~PCI_IRQ_MSI;
+ return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, flags,
+ &affd);
}
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
@@ -2469,6 +2481,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
{
int result = -ENOMEM;
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ unsigned int flags = PCI_IRQ_ALL_TYPES;
if (pci_enable_device_mem(pdev))
return result;
@@ -2485,7 +2498,9 @@ static int nvme_pci_enable(struct nvme_dev *dev)
* interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
* adjust this later.
*/
- result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
+ if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI)
+ flags &= ~PCI_IRQ_MSI;
+ result = pci_alloc_irq_vectors(pdev, 1, 1, flags);
if (result < 0)
goto disable;
@@ -2574,13 +2589,13 @@ static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev)
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
+ enum nvme_ctrl_state state = nvme_ctrl_state(&dev->ctrl);
struct pci_dev *pdev = to_pci_dev(dev->dev);
bool dead;
mutex_lock(&dev->shutdown_lock);
dead = nvme_pci_ctrl_is_dead(dev);
- if (dev->ctrl.state == NVME_CTRL_LIVE ||
- dev->ctrl.state == NVME_CTRL_RESETTING) {
+ if (state == NVME_CTRL_LIVE || state == NVME_CTRL_RESETTING) {
if (pci_is_enabled(pdev))
nvme_start_freeze(&dev->ctrl);
/*
@@ -2691,7 +2706,7 @@ static void nvme_reset_work(struct work_struct *work)
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result;
- if (dev->ctrl.state != NVME_CTRL_RESETTING) {
+ if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) {
dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
dev->ctrl.state);
result = -ENODEV;
@@ -2744,10 +2759,10 @@ static void nvme_reset_work(struct work_struct *work)
* controller around but remove all namespaces.
*/
if (dev->online_queues > 1) {
+ nvme_dbbuf_set(dev);
nvme_unquiesce_io_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
nvme_pci_update_nr_queues(dev);
- nvme_dbbuf_set(dev);
nvme_unfreeze(&dev->ctrl);
} else {
dev_warn(dev->ctrl.device, "IO queues lost\n");
@@ -2903,6 +2918,18 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
return NVME_QUIRK_SIMPLE_SUSPEND;
+ } else if (pdev->vendor == 0x2646 && (pdev->device == 0x2263 ||
+ pdev->device == 0x500f)) {
+ /*
+ * Exclude some Kingston NV1 and A2000 devices from
+ * NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a
+ * lot fo energy with s2idle sleep on some TUXEDO platforms.
+ */
+ if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
+ dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") ||
+ dmi_match(DMI_BOARD_NAME, "NS5x_7xPU") ||
+ dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1"))
+ return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND;
}
return 0;
@@ -2916,9 +2943,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
struct nvme_dev *dev;
int ret = -ENOMEM;
- if (node == NUMA_NO_NODE)
- set_dev_node(&pdev->dev, first_memory_node);
-
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return ERR_PTR(-ENOMEM);
@@ -2936,7 +2960,9 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
dev->dev = get_device(&pdev->dev);
quirks |= check_vendor_combination_bug(pdev);
- if (!noacpi && acpi_storage_d3(&pdev->dev)) {
+ if (!noacpi &&
+ !(quirks & NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND) &&
+ acpi_storage_d3(&pdev->dev)) {
/*
* Some systems use a bios work around to ask for D3 on
* platforms that support kernel managed suspend.
@@ -3196,7 +3222,7 @@ static int nvme_suspend(struct device *dev)
nvme_wait_freeze(ctrl);
nvme_sync_queues(ctrl);
- if (ctrl->state != NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
goto unfreeze;
/*
@@ -3332,7 +3358,8 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES |
- NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ NVME_QUIRK_IGNORE_DEV_SUBNQN |
+ NVME_QUIRK_BOGUS_NID, },
{ PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES, },
@@ -3349,6 +3376,9 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_BOGUS_NID, },
{ PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+ NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */
.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
NVME_QUIRK_BOGUS_NID, },
@@ -3367,6 +3397,8 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
NVME_QUIRK_DISABLE_WRITE_ZEROES|
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x15b7, 0x5008), /* Sandisk SN530 */
+ .driver_data = NVME_QUIRK_BROKEN_MSI },
{ PCI_DEVICE(0x1987, 0x5012), /* Phison E12 */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */
@@ -3397,6 +3429,8 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x1c5c, 0x174a), /* SK Hynix P31 SSD */
.driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1c5c, 0x1D59), /* SK Hynix BC901 */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x15b7, 0x2001), /* Sandisk Skyhawk */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x1d97, 0x2263), /* SPCC */
@@ -3527,5 +3561,6 @@ static void __exit nvme_exit(void)
MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
+MODULE_DESCRIPTION("NVMe host PCIe transport driver");
module_init(nvme_init);
module_exit(nvme_exit);
diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c
index 391b1465ebfd..8fa1ffcdaed4 100644
--- a/drivers/nvme/host/pr.c
+++ b/drivers/nvme/host/pr.c
@@ -77,7 +77,7 @@ static int nvme_sc_to_pr_err(int nvme_sc)
if (nvme_is_path_error(nvme_sc))
return PR_STS_PATH_FAILED;
- switch (nvme_sc) {
+ switch (nvme_sc & 0x7ff) {
case NVME_SC_SUCCESS:
return PR_STS_SUCCESS;
case NVME_SC_RESERVATION_CONFLICT:
@@ -97,8 +97,7 @@ static int nvme_sc_to_pr_err(int nvme_sc)
static int nvme_send_pr_command(struct block_device *bdev,
struct nvme_command *c, void *data, unsigned int data_len)
{
- if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
- bdev->bd_disk->fops == &nvme_ns_head_ops)
+ if (nvme_disk_is_ns_head(bdev->bd_disk))
return nvme_send_ns_head_pr_command(bdev, c, data, data_len);
return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c, data,
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 337a624a537c..51a62b0c645a 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -638,6 +638,9 @@ static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
{
+ if (!test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
+ return;
+
mutex_lock(&queue->queue_lock);
if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
__nvme_rdma_stop_queue(queue);
@@ -979,16 +982,18 @@ free_ctrl:
kfree(ctrl);
}
-static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
+static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl,
+ int status)
{
+ enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
+
/* If we are resetting/deleting then do nothing */
- if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
- WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
- ctrl->ctrl.state == NVME_CTRL_LIVE);
+ if (state != NVME_CTRL_CONNECTING) {
+ WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
return;
}
- if (nvmf_should_reconnect(&ctrl->ctrl)) {
+ if (nvmf_should_reconnect(&ctrl->ctrl, status)) {
dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
ctrl->ctrl.opts->reconnect_delay);
queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
@@ -1002,6 +1007,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
{
int ret;
bool changed;
+ u16 max_queue_size;
ret = nvme_rdma_configure_admin_queue(ctrl, new);
if (ret)
@@ -1026,11 +1032,16 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
}
- if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) {
+ if (ctrl->ctrl.max_integrity_segments)
+ max_queue_size = NVME_RDMA_MAX_METADATA_QUEUE_SIZE;
+ else
+ max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE;
+
+ if (ctrl->ctrl.sqsize + 1 > max_queue_size) {
dev_warn(ctrl->ctrl.device,
- "ctrl sqsize %u > max queue size %u, clamping down\n",
- ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE);
- ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1;
+ "ctrl sqsize %u > max queue size %u, clamping down\n",
+ ctrl->ctrl.sqsize + 1, max_queue_size);
+ ctrl->ctrl.sqsize = max_queue_size - 1;
}
if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
@@ -1056,8 +1067,10 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
* unless we're during creation of a new controller to
* avoid races with teardown flow.
*/
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
- ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
+ enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
+
+ WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
+ state != NVME_CTRL_DELETING_NOIO);
WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
@@ -1077,6 +1090,7 @@ destroy_io:
nvme_rdma_free_io_queues(ctrl);
}
destroy_admin:
+ nvme_stop_keep_alive(&ctrl->ctrl);
nvme_quiesce_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
@@ -1091,10 +1105,12 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvme_rdma_ctrl, reconnect_work);
+ int ret;
++ctrl->ctrl.nr_reconnects;
- if (nvme_rdma_setup_ctrl(ctrl, false))
+ ret = nvme_rdma_setup_ctrl(ctrl, false);
+ if (ret)
goto requeue;
dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
@@ -1105,9 +1121,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
return;
requeue:
- dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
- ctrl->ctrl.nr_reconnects);
- nvme_rdma_reconnect_or_remove(ctrl);
+ dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d/%d\n",
+ ctrl->ctrl.nr_reconnects, ctrl->ctrl.opts->max_reconnects);
+ nvme_rdma_reconnect_or_remove(ctrl, ret);
}
static void nvme_rdma_error_recovery_work(struct work_struct *work)
@@ -1125,12 +1141,14 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we started ctrl delete */
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
- ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
+ enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
+
+ WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
+ state != NVME_CTRL_DELETING_NOIO);
return;
}
- nvme_rdma_reconnect_or_remove(ctrl);
+ nvme_rdma_reconnect_or_remove(ctrl, 0);
}
static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
@@ -1158,7 +1176,7 @@ static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
struct nvme_rdma_queue *queue = wc->qp->qp_context;
struct nvme_rdma_ctrl *ctrl = queue->ctrl;
- if (ctrl->ctrl.state == NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_LIVE)
dev_info(ctrl->ctrl.device,
"%s for CQE 0x%p failed with status %s (%d)\n",
op, wc->wr_cqe,
@@ -1401,6 +1419,8 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
struct nvme_ns *ns = rq->q->queuedata;
struct bio *bio = rq->bio;
struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+ u32 xfer_len;
int nr;
req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs);
@@ -1413,8 +1433,7 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
if (unlikely(nr))
goto mr_put;
- nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
- req->mr->sig_attrs, ns->pi_type);
+ nvme_rdma_set_sig_attrs(bi, c, req->mr->sig_attrs, ns->head->pi_type);
nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
@@ -1432,7 +1451,11 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
IB_ACCESS_REMOTE_WRITE;
sg->addr = cpu_to_le64(req->mr->iova);
- put_unaligned_le24(req->mr->length, sg->length);
+ xfer_len = req->mr->length;
+ /* Check if PI is added by the HW */
+ if (!pi_count)
+ xfer_len += (xfer_len >> bi->interval_exp) * ns->head->pi_size;
+ put_unaligned_le24(xfer_len, sg->length);
put_unaligned_le32(req->mr->rkey, sg->key);
sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
@@ -1937,11 +1960,15 @@ static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq)
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_rdma_queue *queue = req->queue;
struct nvme_rdma_ctrl *ctrl = queue->ctrl;
+ struct nvme_command *cmd = req->req.cmd;
+ int qid = nvme_rdma_queue_idx(queue);
- dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
- rq->tag, nvme_rdma_queue_idx(queue));
+ dev_warn(ctrl->ctrl.device,
+ "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout\n",
+ rq->tag, nvme_cid(rq), cmd->common.opcode,
+ nvme_fabrics_opcode_str(qid, cmd), qid);
- if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+ if (nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_LIVE) {
/*
* If we are resetting, connecting or deleting we should
* complete immediately because we may block controller
@@ -2008,7 +2035,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
queue->pi_support &&
(c->common.opcode == nvme_cmd_write ||
c->common.opcode == nvme_cmd_read) &&
- nvme_ns_has_pi(ns))
+ nvme_ns_has_pi(ns->head))
req->use_sig_mr = true;
else
req->use_sig_mr = false;
@@ -2145,6 +2172,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl =
container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
+ int ret;
nvme_stop_ctrl(&ctrl->ctrl);
nvme_rdma_shutdown_ctrl(ctrl, false);
@@ -2155,14 +2183,15 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
return;
}
- if (nvme_rdma_setup_ctrl(ctrl, false))
+ ret = nvme_rdma_setup_ctrl(ctrl, false);
+ if (ret)
goto out_fail;
return;
out_fail:
++ctrl->ctrl.nr_reconnects;
- nvme_rdma_reconnect_or_remove(ctrl);
+ nvme_rdma_reconnect_or_remove(ctrl, ret);
}
static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
@@ -2282,8 +2311,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
if (ret)
goto out_uninit_ctrl;
- dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
- nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
+ dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs, hostnqn: %s\n",
+ nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr, opts->host->nqn);
mutex_lock(&nvme_rdma_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
@@ -2386,4 +2415,5 @@ static void __exit nvme_rdma_cleanup_module(void)
module_init(nvme_rdma_init_module);
module_exit(nvme_rdma_cleanup_module);
+MODULE_DESCRIPTION("NVMe host RDMA transport driver");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 212e1b05d298..3c55f7edd181 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -35,16 +35,71 @@ static ssize_t nvme_sysfs_rescan(struct device *dev,
}
static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
+static ssize_t nvme_adm_passthru_err_log_enabled_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf,
+ ctrl->passthru_err_log_enabled ? "on\n" : "off\n");
+}
+
+static ssize_t nvme_adm_passthru_err_log_enabled_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ bool passthru_err_log_enabled;
+ int err;
+
+ err = kstrtobool(buf, &passthru_err_log_enabled);
+ if (err)
+ return -EINVAL;
+
+ ctrl->passthru_err_log_enabled = passthru_err_log_enabled;
+
+ return count;
+}
+
static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
{
struct gendisk *disk = dev_to_disk(dev);
- if (disk->fops == &nvme_bdev_ops)
- return nvme_get_ns_from_dev(dev)->head;
- else
+ if (nvme_disk_is_ns_head(disk))
return disk->private_data;
+ return nvme_get_ns_from_dev(dev)->head;
}
+static ssize_t nvme_io_passthru_err_log_enabled_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ns_head *head = dev_to_ns_head(dev);
+
+ return sysfs_emit(buf, head->passthru_err_log_enabled ? "on\n" : "off\n");
+}
+
+static ssize_t nvme_io_passthru_err_log_enabled_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_ns_head *head = dev_to_ns_head(dev);
+ bool passthru_err_log_enabled;
+ int err;
+
+ err = kstrtobool(buf, &passthru_err_log_enabled);
+ if (err)
+ return -EINVAL;
+ head->passthru_err_log_enabled = passthru_err_log_enabled;
+
+ return count;
+}
+
+static struct device_attribute dev_attr_adm_passthru_err_log_enabled = \
+ __ATTR(passthru_err_log_enabled, S_IRUGO | S_IWUSR, \
+ nvme_adm_passthru_err_log_enabled_show, nvme_adm_passthru_err_log_enabled_store);
+
+static struct device_attribute dev_attr_io_passthru_err_log_enabled = \
+ __ATTR(passthru_err_log_enabled, S_IRUGO | S_IWUSR, \
+ nvme_io_passthru_err_log_enabled_show, nvme_io_passthru_err_log_enabled_store);
+
static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -114,20 +169,102 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(nsid);
-static struct attribute *nvme_ns_id_attrs[] = {
+static ssize_t csi_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sysfs_emit(buf, "%u\n", dev_to_ns_head(dev)->ids.csi);
+}
+static DEVICE_ATTR_RO(csi);
+
+static ssize_t metadata_bytes_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", dev_to_ns_head(dev)->ms);
+}
+static DEVICE_ATTR_RO(metadata_bytes);
+
+static int ns_head_update_nuse(struct nvme_ns_head *head)
+{
+ struct nvme_id_ns *id;
+ struct nvme_ns *ns;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ /* Avoid issuing commands too often by rate limiting the update */
+ if (!__ratelimit(&head->rs_nuse))
+ return 0;
+
+ srcu_idx = srcu_read_lock(&head->srcu);
+ ns = nvme_find_path(head);
+ if (!ns)
+ goto out_unlock;
+
+ ret = nvme_identify_ns(ns->ctrl, head->ns_id, &id);
+ if (ret)
+ goto out_unlock;
+
+ head->nuse = le64_to_cpu(id->nuse);
+ kfree(id);
+
+out_unlock:
+ srcu_read_unlock(&head->srcu, srcu_idx);
+ return ret;
+}
+
+static int ns_update_nuse(struct nvme_ns *ns)
+{
+ struct nvme_id_ns *id;
+ int ret;
+
+ /* Avoid issuing commands too often by rate limiting the update. */
+ if (!__ratelimit(&ns->head->rs_nuse))
+ return 0;
+
+ ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id);
+ if (ret)
+ return ret;
+
+ ns->head->nuse = le64_to_cpu(id->nuse);
+ kfree(id);
+ return 0;
+}
+
+static ssize_t nuse_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ns_head *head = dev_to_ns_head(dev);
+ struct gendisk *disk = dev_to_disk(dev);
+ struct block_device *bdev = disk->part0;
+ int ret;
+
+ if (nvme_disk_is_ns_head(bdev->bd_disk))
+ ret = ns_head_update_nuse(head);
+ else
+ ret = ns_update_nuse(bdev->bd_disk->private_data);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "%llu\n", head->nuse);
+}
+static DEVICE_ATTR_RO(nuse);
+
+static struct attribute *nvme_ns_attrs[] = {
&dev_attr_wwid.attr,
&dev_attr_uuid.attr,
&dev_attr_nguid.attr,
&dev_attr_eui.attr,
+ &dev_attr_csi.attr,
&dev_attr_nsid.attr,
+ &dev_attr_metadata_bytes.attr,
+ &dev_attr_nuse.attr,
#ifdef CONFIG_NVME_MULTIPATH
&dev_attr_ana_grpid.attr,
&dev_attr_ana_state.attr,
#endif
+ &dev_attr_io_passthru_err_log_enabled.attr,
NULL,
};
-static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
+static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
@@ -148,7 +285,8 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
}
#ifdef CONFIG_NVME_MULTIPATH
if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
- if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */
+ /* per-path attr */
+ if (nvme_disk_is_ns_head(dev_to_disk(dev)))
return 0;
if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
return 0;
@@ -157,13 +295,13 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
return a->mode;
}
-static const struct attribute_group nvme_ns_id_attr_group = {
- .attrs = nvme_ns_id_attrs,
- .is_visible = nvme_ns_id_attrs_are_visible,
+static const struct attribute_group nvme_ns_attr_group = {
+ .attrs = nvme_ns_attrs,
+ .is_visible = nvme_ns_attrs_are_visible,
};
-const struct attribute_group *nvme_ns_id_attr_groups[] = {
- &nvme_ns_id_attr_group,
+const struct attribute_group *nvme_ns_attr_groups[] = {
+ &nvme_ns_attr_group,
NULL,
};
@@ -226,6 +364,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ unsigned state = (unsigned)nvme_ctrl_state(ctrl);
static const char *const state_name[] = {
[NVME_CTRL_NEW] = "new",
[NVME_CTRL_LIVE] = "live",
@@ -236,9 +375,8 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
[NVME_CTRL_DEAD] = "dead",
};
- if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
- state_name[ctrl->state])
- return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
+ if (state < ARRAY_SIZE(state_name) && state_name[state])
+ return sysfs_emit(buf, "%s\n", state_name[state]);
return sysfs_emit(buf, "unknown state\n");
}
@@ -409,7 +547,7 @@ static ssize_t dctype_show(struct device *dev,
}
static DEVICE_ATTR_RO(dctype);
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -527,6 +665,19 @@ static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
#endif
+#ifdef CONFIG_NVME_TCP_TLS
+static ssize_t tls_key_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ if (!ctrl->tls_key)
+ return 0;
+ return sysfs_emit(buf, "%08x", key_serial(ctrl->tls_key));
+}
+static DEVICE_ATTR_RO(tls_key);
+#endif
+
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@@ -550,10 +701,14 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_kato.attr,
&dev_attr_cntrltype.attr,
&dev_attr_dctype.attr,
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
&dev_attr_dhchap_secret.attr,
&dev_attr_dhchap_ctrl_secret.attr,
#endif
+#ifdef CONFIG_NVME_TCP_TLS
+ &dev_attr_tls_key.attr,
+#endif
+ &dev_attr_adm_passthru_err_log_enabled.attr,
NULL
};
@@ -577,12 +732,17 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return 0;
if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
return 0;
-#ifdef CONFIG_NVME_AUTH
+#ifdef CONFIG_NVME_HOST_AUTH
if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts)
return 0;
if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
return 0;
#endif
+#ifdef CONFIG_NVME_TCP_TLS
+ if (a == &dev_attr_tls_key.attr &&
+ (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp")))
+ return 0;
+#endif
return a->mode;
}
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 5b332d9f87fc..8b5e4327fe83 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -8,9 +8,14 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
+#include <linux/key.h>
#include <linux/nvme-tcp.h>
+#include <linux/nvme-keyring.h>
#include <net/sock.h>
#include <net/tcp.h>
+#include <net/tls.h>
+#include <net/tls_prot.h>
+#include <net/handshake.h>
#include <linux/blk-mq.h>
#include <crypto/hash.h>
#include <net/busy_poll.h>
@@ -31,6 +36,24 @@ static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
+/*
+ * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu affinity
+ * from sysfs.
+ */
+static bool wq_unbound;
+module_param(wq_unbound, bool, 0644);
+MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)");
+
+/*
+ * TLS handshake timeout
+ */
+static int tls_handshake_timeout = 10;
+#ifdef CONFIG_NVME_TCP_TLS
+module_param(tls_handshake_timeout, int, 0644);
+MODULE_PARM_DESC(tls_handshake_timeout,
+ "nvme TLS handshake timeout in seconds (default 10)");
+#endif
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
* sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
@@ -146,7 +169,8 @@ struct nvme_tcp_queue {
struct ahash_request *snd_hash;
__le32 exp_ddgst;
__le32 recv_ddgst;
-
+ struct completion tls_complete;
+ int tls_err;
struct page_frag_cache pf_cache;
void (*state_change)(struct sock *);
@@ -189,6 +213,14 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
return queue - queue->ctrl->queues;
}
+static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl)
+{
+ if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
+ return 0;
+
+ return ctrl->opts->tls;
+}
+
static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
{
u32 queue_idx = nvme_tcp_queue_id(queue);
@@ -328,12 +360,18 @@ static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
} while (ret > 0);
}
-static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
+static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue)
{
return !list_empty(&queue->send_list) ||
!llist_empty(&queue->req_list);
}
+static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
+{
+ return !nvme_tcp_tls(&queue->ctrl->ctrl) &&
+ nvme_tcp_queue_has_pending(queue);
+}
+
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
bool sync, bool last)
{
@@ -354,7 +392,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
mutex_unlock(&queue->send_mutex);
}
- if (last && nvme_tcp_queue_more(queue))
+ if (last && nvme_tcp_queue_has_pending(queue))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
@@ -1320,7 +1358,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
{
- struct page *page;
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
unsigned int noreclaim_flag;
@@ -1331,14 +1368,12 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
- if (queue->pf_cache.va) {
- page = virt_to_head_page(queue->pf_cache.va);
- __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
- queue->pf_cache.va = NULL;
- }
+ page_frag_cache_drain(&queue->pf_cache);
noreclaim_flag = memalloc_noreclaim_save();
- sock_release(queue->sock);
+ /* ->sock will be released by fput() */
+ fput(queue->sock->file);
+ queue->sock = NULL;
memalloc_noreclaim_restore(noreclaim_flag);
kfree(queue->pdu);
@@ -1350,6 +1385,8 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_icreq_pdu *icreq;
struct nvme_tcp_icresp_pdu *icresp;
+ char cbuf[CMSG_LEN(sizeof(char))] = {};
+ u8 ctype;
struct msghdr msg = {};
struct kvec iov;
bool ctrl_hdgst, ctrl_ddgst;
@@ -1381,17 +1418,36 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
iov.iov_base = icreq;
iov.iov_len = sizeof(*icreq);
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
- if (ret < 0)
+ if (ret < 0) {
+ pr_warn("queue %d: failed to send icreq, error %d\n",
+ nvme_tcp_queue_id(queue), ret);
goto free_icresp;
+ }
memset(&msg, 0, sizeof(msg));
iov.iov_base = icresp;
iov.iov_len = sizeof(*icresp);
+ if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
+ msg.msg_control = cbuf;
+ msg.msg_controllen = sizeof(cbuf);
+ }
ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
iov.iov_len, msg.msg_flags);
- if (ret < 0)
+ if (ret < 0) {
+ pr_warn("queue %d: failed to receive icresp, error %d\n",
+ nvme_tcp_queue_id(queue), ret);
goto free_icresp;
-
+ }
+ ret = -ENOTCONN;
+ if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
+ ctype = tls_get_record_type(queue->sock->sk,
+ (struct cmsghdr *)cbuf);
+ if (ctype != TLS_RECORD_TYPE_DATA) {
+ pr_err("queue %d: unhandled TLS record %d\n",
+ nvme_tcp_queue_id(queue), ctype);
+ goto free_icresp;
+ }
+ }
ret = -EINVAL;
if (icresp->hdr.type != nvme_tcp_icresp) {
pr_err("queue %d: bad type returned %d\n",
@@ -1504,14 +1560,96 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
else if (nvme_tcp_poll_queue(queue))
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
ctrl->io_queues[HCTX_TYPE_READ] - 1;
- queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+ if (wq_unbound)
+ queue->io_cpu = WORK_CPU_UNBOUND;
+ else
+ queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+}
+
+static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
+{
+ struct nvme_tcp_queue *queue = data;
+ struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+ int qid = nvme_tcp_queue_id(queue);
+ struct key *tls_key;
+
+ dev_dbg(ctrl->ctrl.device, "queue %d: TLS handshake done, key %x, status %d\n",
+ qid, pskid, status);
+
+ if (status) {
+ queue->tls_err = -status;
+ goto out_complete;
+ }
+
+ tls_key = key_lookup(pskid);
+ if (IS_ERR(tls_key)) {
+ dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n",
+ qid, pskid);
+ queue->tls_err = -ENOKEY;
+ } else {
+ ctrl->ctrl.tls_key = tls_key;
+ queue->tls_err = 0;
+ }
+
+out_complete:
+ complete(&queue->tls_complete);
}
-static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
+static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
+ struct nvme_tcp_queue *queue,
+ key_serial_t pskid)
+{
+ int qid = nvme_tcp_queue_id(queue);
+ int ret;
+ struct tls_handshake_args args;
+ unsigned long tmo = tls_handshake_timeout * HZ;
+ key_serial_t keyring = nvme_keyring_id();
+
+ dev_dbg(nctrl->device, "queue %d: start TLS with key %x\n",
+ qid, pskid);
+ memset(&args, 0, sizeof(args));
+ args.ta_sock = queue->sock;
+ args.ta_done = nvme_tcp_tls_done;
+ args.ta_data = queue;
+ args.ta_my_peerids[0] = pskid;
+ args.ta_num_peerids = 1;
+ if (nctrl->opts->keyring)
+ keyring = key_serial(nctrl->opts->keyring);
+ args.ta_keyring = keyring;
+ args.ta_timeout_ms = tls_handshake_timeout * 1000;
+ queue->tls_err = -EOPNOTSUPP;
+ init_completion(&queue->tls_complete);
+ ret = tls_client_hello_psk(&args, GFP_KERNEL);
+ if (ret) {
+ dev_err(nctrl->device, "queue %d: failed to start TLS: %d\n",
+ qid, ret);
+ return ret;
+ }
+ ret = wait_for_completion_interruptible_timeout(&queue->tls_complete, tmo);
+ if (ret <= 0) {
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+
+ dev_err(nctrl->device,
+ "queue %d: TLS handshake failed, error %d\n",
+ qid, ret);
+ tls_handshake_cancel(queue->sock->sk);
+ } else {
+ dev_dbg(nctrl->device,
+ "queue %d: TLS handshake complete, error %d\n",
+ qid, queue->tls_err);
+ ret = queue->tls_err;
+ }
+ return ret;
+}
+
+static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
+ key_serial_t pskid)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
int ret, rcv_pdu_size;
+ struct file *sock_file;
mutex_init(&queue->queue_lock);
queue->ctrl = ctrl;
@@ -1534,6 +1672,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
goto err_destroy_mutex;
}
+ sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL);
+ if (IS_ERR(sock_file)) {
+ ret = PTR_ERR(sock_file);
+ goto err_destroy_mutex;
+ }
nvme_tcp_reclassify_socket(queue->sock);
/* Single syn retry */
@@ -1624,6 +1767,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
goto err_rcv_pdu;
}
+ /* If PSKs are configured try to start TLS */
+ if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) {
+ ret = nvme_tcp_start_tls(nctrl, queue, pskid);
+ if (ret)
+ goto err_init_connect;
+ }
+
ret = nvme_tcp_init_connection(queue);
if (ret)
goto err_init_connect;
@@ -1640,7 +1790,8 @@ err_crypto:
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
err_sock:
- sock_release(queue->sock);
+ /* ->sock will be released by fput() */
+ fput(queue->sock->file);
queue->sock = NULL;
err_destroy_mutex:
mutex_destroy(&queue->send_mutex);
@@ -1772,8 +1923,22 @@ out_stop_queues:
static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
{
int ret;
+ key_serial_t pskid = 0;
- ret = nvme_tcp_alloc_queue(ctrl, 0);
+ if (nvme_tcp_tls(ctrl)) {
+ if (ctrl->opts->tls_key)
+ pskid = key_serial(ctrl->opts->tls_key);
+ else
+ pskid = nvme_tls_psk_default(ctrl->opts->keyring,
+ ctrl->opts->host->nqn,
+ ctrl->opts->subsysnqn);
+ if (!pskid) {
+ dev_err(ctrl->device, "no valid PSK found\n");
+ return -ENOKEY;
+ }
+ }
+
+ ret = nvme_tcp_alloc_queue(ctrl, 0, pskid);
if (ret)
return ret;
@@ -1792,8 +1957,13 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
int i, ret;
+ if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) {
+ dev_err(ctrl->device, "no PSK negotiated\n");
+ return -ENOKEY;
+ }
for (i = 1; i < ctrl->queue_count; i++) {
- ret = nvme_tcp_alloc_queue(ctrl, i);
+ ret = nvme_tcp_alloc_queue(ctrl, i,
+ key_serial(ctrl->tls_key));
if (ret)
goto out_free_queues;
}
@@ -1991,22 +2161,25 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
nvme_tcp_destroy_io_queues(ctrl, remove);
}
-static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
+static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
+ int status)
{
+ enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+
/* If we are resetting/deleting then do nothing */
- if (ctrl->state != NVME_CTRL_CONNECTING) {
- WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
- ctrl->state == NVME_CTRL_LIVE);
+ if (state != NVME_CTRL_CONNECTING) {
+ WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
return;
}
- if (nvmf_should_reconnect(ctrl)) {
+ if (nvmf_should_reconnect(ctrl, status)) {
dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
ctrl->opts->reconnect_delay);
queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
ctrl->opts->reconnect_delay * HZ);
} else {
- dev_info(ctrl->device, "Removing controller...\n");
+ dev_info(ctrl->device, "Removing controller (%d)...\n",
+ status);
nvme_delete_ctrl(ctrl);
}
}
@@ -2056,8 +2229,10 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
* unless we're during creation of a new controller to
* avoid races with teardown flow.
*/
- WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
- ctrl->state != NVME_CTRL_DELETING_NOIO);
+ enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+
+ WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
+ state != NVME_CTRL_DELETING_NOIO);
WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
@@ -2075,11 +2250,8 @@ destroy_io:
nvme_tcp_destroy_io_queues(ctrl, new);
}
destroy_admin:
- nvme_quiesce_admin_queue(ctrl);
- blk_sync_queue(ctrl->admin_q);
- nvme_tcp_stop_queue(ctrl, 0);
- nvme_cancel_admin_tagset(ctrl);
- nvme_tcp_destroy_admin_queue(ctrl, new);
+ nvme_stop_keep_alive(ctrl);
+ nvme_tcp_teardown_admin_queue(ctrl, false);
return ret;
}
@@ -2088,23 +2260,25 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
struct nvme_tcp_ctrl, connect_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+ int ret;
++ctrl->nr_reconnects;
- if (nvme_tcp_setup_ctrl(ctrl, false))
+ ret = nvme_tcp_setup_ctrl(ctrl, false);
+ if (ret)
goto requeue;
- dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
- ctrl->nr_reconnects);
+ dev_info(ctrl->device, "Successfully reconnected (attempt %d/%d)\n",
+ ctrl->nr_reconnects, ctrl->opts->max_reconnects);
ctrl->nr_reconnects = 0;
return;
requeue:
- dev_info(ctrl->device, "Failed reconnect attempt %d\n",
- ctrl->nr_reconnects);
- nvme_tcp_reconnect_or_remove(ctrl);
+ dev_info(ctrl->device, "Failed reconnect attempt %d/%d\n",
+ ctrl->nr_reconnects, ctrl->opts->max_reconnects);
+ nvme_tcp_reconnect_or_remove(ctrl, ret);
}
static void nvme_tcp_error_recovery_work(struct work_struct *work)
@@ -2124,12 +2298,14 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we started ctrl delete */
- WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
- ctrl->state != NVME_CTRL_DELETING_NOIO);
+ enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+
+ WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
+ state != NVME_CTRL_DELETING_NOIO);
return;
}
- nvme_tcp_reconnect_or_remove(ctrl);
+ nvme_tcp_reconnect_or_remove(ctrl, 0);
}
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
@@ -2149,25 +2325,29 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, reset_work);
+ int ret;
nvme_stop_ctrl(ctrl);
nvme_tcp_teardown_ctrl(ctrl, false);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we started ctrl delete */
- WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
- ctrl->state != NVME_CTRL_DELETING_NOIO);
+ enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+
+ WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
+ state != NVME_CTRL_DELETING_NOIO);
return;
}
- if (nvme_tcp_setup_ctrl(ctrl, false))
+ ret = nvme_tcp_setup_ctrl(ctrl, false);
+ if (ret)
goto out_fail;
return;
out_fail:
++ctrl->nr_reconnects;
- nvme_tcp_reconnect_or_remove(ctrl);
+ nvme_tcp_reconnect_or_remove(ctrl, ret);
}
static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
@@ -2266,15 +2446,15 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
- u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype;
+ struct nvme_command *cmd = &pdu->cmd;
int qid = nvme_tcp_queue_id(req->queue);
dev_warn(ctrl->device,
- "queue %d: timeout cid %#x type %d opcode %#x (%s)\n",
- nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type,
- opc, nvme_opcode_str(qid, opc, fctype));
+ "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
+ rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
+ nvme_fabrics_opcode_str(qid, cmd), qid);
- if (ctrl->state != NVME_CTRL_LIVE) {
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
/*
* If we are resetting, connecting or deleting we should
* complete immediately because we may block controller
@@ -2591,8 +2771,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
if (ret)
goto out_uninit_ctrl;
- dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
- nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
+ dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp, hostnqn: %s\n",
+ nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr, opts->host->nqn);
mutex_lock(&nvme_tcp_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
@@ -2621,12 +2801,15 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
- NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
+ NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
+ NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY,
.create_ctrl = nvme_tcp_create_ctrl,
};
static int __init nvme_tcp_init_module(void)
{
+ unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
+
BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
@@ -2636,8 +2819,10 @@ static int __init nvme_tcp_init_module(void)
BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);
- nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
- WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ if (wq_unbound)
+ wq_flags |= WQ_UNBOUND;
+
+ nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
if (!nvme_tcp_wq)
return -ENOMEM;
@@ -2663,4 +2848,5 @@ static void __exit nvme_tcp_cleanup_module(void)
module_init(nvme_tcp_init_module);
module_exit(nvme_tcp_cleanup_module);
+MODULE_DESCRIPTION("NVMe host TCP transport driver");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 1c36fcedea20..0288315f0050 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -119,7 +119,10 @@ static const char *nvme_trace_get_lba_status(struct trace_seq *p,
static const char *nvme_trace_admin_format_nvm(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
- u8 lbaf = cdw10[0] & 0xF;
+ /*
+ * lbafu(bit 13:12) is already in the upper 4 bits, lbafl: bit 03:00.
+ */
+ u8 lbaf = (cdw10[1] & 0x30) | (cdw10[0] & 0xF);
u8 mset = (cdw10[0] >> 4) & 0x1;
u8 pi = (cdw10[0] >> 5) & 0x7;
u8 pil = cdw10[1] & 0x1;
@@ -164,12 +167,27 @@ static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10)
{
+ static const char * const zsa_strs[] = {
+ [0x01] = "close zone",
+ [0x02] = "finish zone",
+ [0x03] = "open zone",
+ [0x04] = "reset zone",
+ [0x05] = "offline zone",
+ [0x10] = "set zone descriptor extension"
+ };
const char *ret = trace_seq_buffer_ptr(p);
u64 slba = get_unaligned_le64(cdw10);
+ const char *zsa_str;
u8 zsa = cdw10[12];
u8 all = cdw10[13];
- trace_seq_printf(p, "slba=%llu, zsa=%u, all=%u", slba, zsa, all);
+ if (zsa < ARRAY_SIZE(zsa_strs) && zsa_strs[zsa])
+ zsa_str = zsa_strs[zsa];
+ else
+ zsa_str = "reserved";
+
+ trace_seq_printf(p, "slba=%llu, zsa=%u:%s, all=%u",
+ slba, zsa, zsa_str, all);
trace_seq_putc(p, 0);
return ret;
@@ -177,15 +195,86 @@ static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10)
static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10)
{
+ static const char * const zrasf_strs[] = {
+ [0x00] = "list all zones",
+ [0x01] = "list the zones in the ZSE: Empty state",
+ [0x02] = "list the zones in the ZSIO: Implicitly Opened state",
+ [0x03] = "list the zones in the ZSEO: Explicitly Opened state",
+ [0x04] = "list the zones in the ZSC: Closed state",
+ [0x05] = "list the zones in the ZSF: Full state",
+ [0x06] = "list the zones in the ZSRO: Read Only state",
+ [0x07] = "list the zones in the ZSO: Offline state",
+ [0x09] = "list the zones that have the zone attribute"
+ };
const char *ret = trace_seq_buffer_ptr(p);
u64 slba = get_unaligned_le64(cdw10);
u32 numd = get_unaligned_le32(cdw10 + 8);
u8 zra = cdw10[12];
u8 zrasf = cdw10[13];
+ const char *zrasf_str;
u8 pr = cdw10[14];
- trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u, pr=%u",
- slba, numd, zra, zrasf, pr);
+ if (zrasf < ARRAY_SIZE(zrasf_strs) && zrasf_strs[zrasf])
+ zrasf_str = zrasf_strs[zrasf];
+ else
+ zrasf_str = "reserved";
+
+ trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u:%s, pr=%u",
+ slba, numd, zra, zrasf, zrasf_str, pr);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_resv_reg(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 rrega = cdw10[0] & 0x7;
+ u8 iekey = (cdw10[0] >> 3) & 0x1;
+ u8 ptpl = (cdw10[3] >> 6) & 0x3;
+
+ trace_seq_printf(p, "rrega=%u, iekey=%u, ptpl=%u",
+ rrega, iekey, ptpl);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 racqa = cdw10[0] & 0x7;
+ u8 iekey = (cdw10[0] >> 3) & 0x1;
+ u8 rtype = cdw10[1];
+
+ trace_seq_printf(p, "racqa=%u, iekey=%u, rtype=%u",
+ racqa, iekey, rtype);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_resv_rel(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 rrela = cdw10[0] & 0x7;
+ u8 iekey = (cdw10[0] >> 3) & 0x1;
+ u8 rtype = cdw10[1];
+
+ trace_seq_printf(p, "rrela=%u, iekey=%u, rtype=%u",
+ rrela, iekey, rtype);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_resv_report(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u32 numd = get_unaligned_le32(cdw10);
+ u8 eds = cdw10[4] & 0x1;
+
+ trace_seq_printf(p, "numd=%u, eds=%u", numd, eds);
trace_seq_putc(p, 0);
return ret;
@@ -243,6 +332,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
return nvme_trace_zone_mgmt_send(p, cdw10);
case nvme_cmd_zone_mgmt_recv:
return nvme_trace_zone_mgmt_recv(p, cdw10);
+ case nvme_cmd_resv_register:
+ return nvme_trace_resv_reg(p, cdw10);
+ case nvme_cmd_resv_acquire:
+ return nvme_trace_resv_acq(p, cdw10);
+ case nvme_cmd_resv_release:
+ return nvme_trace_resv_rel(p, cdw10);
+ case nvme_cmd_resv_report:
+ return nvme_trace_resv_report(p, cdw10);
default:
return nvme_trace_common(p, cdw10);
}
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index ec8557810c21..77aa0f440a6d 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -7,16 +7,6 @@
#include <linux/vmalloc.h>
#include "nvme.h"
-int nvme_revalidate_zones(struct nvme_ns *ns)
-{
- struct request_queue *q = ns->queue;
-
- blk_queue_chunk_sectors(q, ns->zsze);
- blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
-
- return blk_revalidate_disk_zones(ns->disk, NULL);
-}
-
static int nvme_set_max_append(struct nvme_ctrl *ctrl)
{
struct nvme_command c = { };
@@ -45,10 +35,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl)
return 0;
}
-int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
+int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf,
+ struct nvme_zone_info *zi)
{
struct nvme_effects_log *log = ns->head->effects;
- struct request_queue *q = ns->queue;
struct nvme_command c = { };
struct nvme_id_ns_zns *id;
int status;
@@ -99,24 +89,34 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
goto free_data;
}
- ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
- if (!is_power_of_2(ns->zsze)) {
+ zi->zone_size = le64_to_cpu(id->lbafe[lbaf].zsze);
+ if (!is_power_of_2(zi->zone_size)) {
dev_warn(ns->ctrl->device,
- "invalid zone size:%llu for namespace:%u\n",
- ns->zsze, ns->head->ns_id);
+ "invalid zone size: %llu for namespace: %u\n",
+ zi->zone_size, ns->head->ns_id);
status = -ENODEV;
goto free_data;
}
+ zi->max_open_zones = le32_to_cpu(id->mor) + 1;
+ zi->max_active_zones = le32_to_cpu(id->mar) + 1;
- disk_set_zoned(ns->disk, BLK_ZONED_HM);
- blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
- disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1);
- disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1);
free_data:
kfree(id);
return status;
}
+void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
+ struct nvme_zone_info *zi)
+{
+ lim->zoned = 1;
+ lim->max_open_zones = zi->max_open_zones;
+ lim->max_active_zones = zi->max_active_zones;
+ lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
+ lim->chunk_sectors = ns->head->zsze =
+ nvme_lba_to_sect(ns->head, zi->zone_size);
+ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue);
+}
+
static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
unsigned int nr_zones, size_t *buflen)
{
@@ -128,7 +128,7 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
sizeof(struct nvme_zone_descriptor);
nr_zones = min_t(unsigned int, nr_zones,
- get_capacity(ns->disk) >> ilog2(ns->zsze));
+ get_capacity(ns->disk) >> ilog2(ns->head->zsze));
bufsize = sizeof(struct nvme_zone_report) +
nr_zones * sizeof(struct nvme_zone_descriptor);
@@ -147,7 +147,8 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
return NULL;
}
-static int nvme_zone_parse_entry(struct nvme_ns *ns,
+static int nvme_zone_parse_entry(struct nvme_ctrl *ctrl,
+ struct nvme_ns_head *head,
struct nvme_zone_descriptor *entry,
unsigned int idx, report_zones_cb cb,
void *data)
@@ -155,20 +156,20 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
struct blk_zone zone = { };
if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
- dev_err(ns->ctrl->device, "invalid zone type %#x\n",
+ dev_err(ctrl->device, "invalid zone type %#x\n",
entry->zt);
return -EINVAL;
}
zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
zone.cond = entry->zs >> 4;
- zone.len = ns->zsze;
- zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
- zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
+ zone.len = head->zsze;
+ zone.capacity = nvme_lba_to_sect(head, le64_to_cpu(entry->zcap));
+ zone.start = nvme_lba_to_sect(head, le64_to_cpu(entry->zslba));
if (zone.cond == BLK_ZONE_COND_FULL)
zone.wp = zone.start + zone.len;
else
- zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
+ zone.wp = nvme_lba_to_sect(head, le64_to_cpu(entry->wp));
return cb(&zone, idx, data);
}
@@ -196,11 +197,11 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
- sector &= ~(ns->zsze - 1);
+ sector &= ~(ns->head->zsze - 1);
while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
memset(report, 0, buflen);
- c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
+ c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns->head, sector));
ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
if (ret) {
if (ret > 0)
@@ -213,14 +214,15 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
break;
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
- ret = nvme_zone_parse_entry(ns, &report->entries[i],
+ ret = nvme_zone_parse_entry(ns->ctrl, ns->head,
+ &report->entries[i],
zone_idx, cb, data);
if (ret)
goto out_free;
zone_idx++;
}
- sector += ns->zsze * nz;
+ sector += ns->head->zsze * nz;
}
if (zone_idx > 0)
@@ -239,7 +241,7 @@ blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
c->zms.opcode = nvme_cmd_zone_mgmt_send;
c->zms.nsid = cpu_to_le32(ns->head->ns_id);
- c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+ c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
c->zms.zsa = action;
if (req_op(req) == REQ_OP_ZONE_RESET_ALL)