Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--  drivers/nvme/host/Kconfig     |   4
-rw-r--r--  drivers/nvme/host/core.c      | 322
-rw-r--r--  drivers/nvme/host/fc.c        | 582
-rw-r--r--  drivers/nvme/host/fc.h        | 227
-rw-r--r--  drivers/nvme/host/lightnvm.c  |   7
-rw-r--r--  drivers/nvme/host/multipath.c |  16
-rw-r--r--  drivers/nvme/host/nvme.h      |  31
-rw-r--r--  drivers/nvme/host/pci.c       | 134
-rw-r--r--  drivers/nvme/host/rdma.c      | 321
-rw-r--r--  drivers/nvme/host/tcp.c       | 125
10 files changed, 1357 insertions, 412 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 9c17ed32be64..3ed9786b88d8 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -7,7 +7,7 @@ config BLK_DEV_NVME
tristate "NVM Express block device"
depends on PCI && BLOCK
select NVME_CORE
- ---help---
+ help
The NVM Express driver is for solid state drives directly
connected to the PCI or PCI Express bus. If you know you
don't have one of these, it is safe to answer N.
@@ -18,7 +18,7 @@ config BLK_DEV_NVME
config NVME_MULTIPATH
bool "NVMe multipath support"
depends on NVME_CORE
- ---help---
+ help
This option enables support for multipath access to NVMe
subsystems. If this option is enabled only a single
/dev/nvmeXnY device will show up for each NVMe namespace,
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f3c037f5a9ba..c2c5bc4fb702 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -19,7 +19,6 @@
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
-#include <linux/t10-pi.h>
#include <linux/pm_qos.h>
#include <asm/unaligned.h>
@@ -204,11 +203,6 @@ static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
nvme_put_ctrl(ctrl);
}
-static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
-{
- return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
-}
-
static blk_status_t nvme_error_status(u16 status)
{
switch (status & 0x7ff) {
@@ -310,7 +304,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
return true;
nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
- blk_mq_complete_request(req);
+ blk_mq_force_complete_rq(req);
return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@ -433,7 +427,6 @@ static void nvme_free_ns_head(struct kref *ref)
nvme_mpath_remove_disk(head);
ida_simple_remove(&head->subsys->ns_ida, head->instance);
- list_del_init(&head->entry);
cleanup_srcu_struct(&head->srcu);
nvme_put_subsystem(head->subsys);
kfree(head);
@@ -530,7 +523,7 @@ static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
c.directive.opcode = nvme_admin_directive_recv;
c.directive.nsid = cpu_to_le32(nsid);
- c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
+ c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s)));
c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
c.directive.dtype = NVME_DIR_STREAMS;
@@ -553,19 +546,22 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl)
ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
if (ret)
- return ret;
+ goto out_disable_stream;
ctrl->nssa = le16_to_cpu(s.nssa);
if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
dev_info(ctrl->device, "too few streams (%u) available\n",
ctrl->nssa);
- nvme_disable_streams(ctrl);
- return 0;
+ goto out_disable_stream;
}
ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
return 0;
+
+out_disable_stream:
+ nvme_disable_streams(ctrl);
+ return ret;
}
/*
@@ -1027,6 +1023,19 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
+/*
+ * In NVMe 1.0 the CNS field was just a binary controller or namespace
+ * flag, thus sending any new CNS opcodes has a big chance of not working.
+ * Qemu unfortunately had that bug after reporting a 1.1 version compliance
+ * (but not for any later version).
+ */
+static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
+{
+ if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
+ return ctrl->vs < NVME_VS(1, 2, 0);
+ return ctrl->vs < NVME_VS(1, 1, 0);
+}
+
static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
struct nvme_command c = { };
@@ -1290,7 +1299,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
meta_len = (io.nblocks + 1) * ns->ms;
metadata = nvme_to_user_ptr(io.metadata);
- if (ns->ext) {
+ if (ns->features & NVME_NS_EXT_LBAS) {
length += meta_len;
meta_len = 0;
} else if (meta_len) {
@@ -1392,8 +1401,10 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
}
if (effects & NVME_CMD_EFFECTS_CCC)
nvme_init_identify(ctrl);
- if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
+ if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
nvme_queue_scan(ctrl);
+ flush_work(&ctrl->scan_work);
+ }
}
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@@ -1682,7 +1693,8 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
+ u32 max_integrity_segments)
{
struct blk_integrity integrity;
@@ -1705,20 +1717,15 @@ static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
}
integrity.tuple_size = ms;
blk_integrity_register(disk, &integrity);
- blk_queue_max_integrity_segments(disk->queue, 1);
+ blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
}
#else
-static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
+ u32 max_integrity_segments)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
-static void nvme_set_chunk_size(struct nvme_ns *ns)
-{
- u32 chunk_size = nvme_lba_to_sect(ns, ns->noiob);
- blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
-}
-
static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
{
struct nvme_ctrl *ctrl = ns->ctrl;
@@ -1804,12 +1811,37 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
}
+static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+ u32 *phys_bs, u32 *io_opt)
+{
+ struct streams_directive_params s;
+ int ret;
+
+ if (!ctrl->nr_streams)
+ return 0;
+
+ ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
+ if (ret)
+ return ret;
+
+ ns->sws = le32_to_cpu(s.sws);
+ ns->sgs = le16_to_cpu(s.sgs);
+
+ if (ns->sws) {
+ *phys_bs = ns->sws * (1 << ns->lba_shift);
+ if (ns->sgs)
+ *io_opt = *phys_bs * ns->sgs;
+ }
+
+ return 0;
+}
+
static void nvme_update_disk_info(struct gendisk *disk,
struct nvme_ns *ns, struct nvme_id_ns *id)
{
sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
unsigned short bs = 1 << ns->lba_shift;
- u32 atomic_bs, phys_bs, io_opt;
+ u32 atomic_bs, phys_bs, io_opt = 0;
if (ns->lba_shift > PAGE_SHIFT) {
/* unsupported block size, set capacity to 0 later */
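A worked example for the nvme_setup_streams_ns() helper added above (illustrative values, not taken from the patch): with a 4KiB LBA format (ns->lba_shift = 12), sws = 16 and sgs = 4, the helper computes

	phys_bs = 16 << 12 = 65536  (64KiB)
	io_opt  = 65536 * 4 = 262144 (256KiB)

which nvme_update_disk_info() then feeds into blk_queue_io_min() and blk_queue_io_opt(), taking over the per-namespace queue-limit setup done by the nvme_setup_streams_ns() variant removed from nvme_alloc_ns() later in this patch.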
@@ -1818,26 +1850,25 @@ static void nvme_update_disk_info(struct gendisk *disk,
blk_mq_freeze_queue(disk->queue);
blk_integrity_unregister(disk);
+ atomic_bs = phys_bs = bs;
+ nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt);
if (id->nabo == 0) {
/*
* Bit 1 indicates whether NAWUPF is defined for this namespace
* and whether it should be used instead of AWUPF. If NAWUPF ==
* 0 then AWUPF must be used instead.
*/
- if (id->nsfeat & (1 << 1) && id->nawupf)
+ if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else
atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
- } else {
- atomic_bs = bs;
}
- phys_bs = bs;
- io_opt = bs;
- if (id->nsfeat & (1 << 4)) {
+
+ if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
/* NPWG = Namespace Preferred Write Granularity */
- phys_bs *= 1 + le16_to_cpu(id->npwg);
+ phys_bs = bs * (1 + le16_to_cpu(id->npwg));
/* NOWS = Namespace Optimal Write Size */
- io_opt *= 1 + le16_to_cpu(id->nows);
+ io_opt = bs * (1 + le16_to_cpu(id->nows));
}
blk_queue_logical_block_size(disk->queue, bs);
@@ -1850,19 +1881,34 @@ static void nvme_update_disk_info(struct gendisk *disk,
blk_queue_io_min(disk->queue, phys_bs);
blk_queue_io_opt(disk->queue, io_opt);
- if (ns->ms && !ns->ext &&
- (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
- nvme_init_integrity(disk, ns->ms, ns->pi_type);
- if ((ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) ||
- ns->lba_shift > PAGE_SHIFT)
+ /*
+ * The block layer can't support LBA sizes larger than the page size
+ * yet, so catch this early and don't allow block I/O.
+ */
+ if (ns->lba_shift > PAGE_SHIFT)
capacity = 0;
+ /*
+ * Register a metadata profile for PI, or the plain non-integrity NVMe
+ * metadata masquerading as Type 0 if supported, otherwise reject block
+ * I/O to namespaces with metadata except when the namespace supports
+ * PI, as it can strip/insert in that case.
+ */
+ if (ns->ms) {
+ if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
+ (ns->features & NVME_NS_METADATA_SUPPORTED))
+ nvme_init_integrity(disk, ns->ms, ns->pi_type,
+ ns->ctrl->max_integrity_segments);
+ else if (!nvme_ns_has_pi(ns))
+ capacity = 0;
+ }
+
set_capacity_revalidate_and_notify(disk, capacity, false);
nvme_config_discard(disk, ns);
nvme_config_write_zeroes(disk, ns);
- if (id->nsattr & (1 << 0))
+ if (id->nsattr & NVME_NS_ATTR_RO)
set_disk_ro(disk, true);
else
set_disk_ro(disk, false);
@@ -1870,9 +1916,11 @@ static void nvme_update_disk_info(struct gendisk *disk,
blk_mq_unfreeze_queue(disk->queue);
}
-static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
+static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
struct nvme_ns *ns = disk->private_data;
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ u32 iob;
/*
* If identify namespace failed, use default 512 byte block size so
@@ -1881,32 +1929,55 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
- ns->noiob = le16_to_cpu(id->noiob);
+
+ if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
+ is_power_of_2(ctrl->max_hw_sectors))
+ iob = ctrl->max_hw_sectors;
+ else
+ iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
+
+ ns->features = 0;
ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
- ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
/* the PI implementation requires metadata equal t10 pi tuple size */
if (ns->ms == sizeof(struct t10_pi_tuple))
ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
else
ns->pi_type = 0;
- if (ns->noiob)
- nvme_set_chunk_size(ns);
+ if (ns->ms) {
+ /*
+ * For PCIe only the separate metadata pointer is supported,
+ * as the block layer supplies metadata in a separate bio_vec
+ * chain. For Fabrics, only metadata as part of extended data
+ * LBA is supported on the wire per the Fabrics specification,
+ * but the HBA/HCA will do the remapping from the separate
+ * metadata buffers for us.
+ */
+ if (id->flbas & NVME_NS_FLBAS_META_EXT) {
+ ns->features |= NVME_NS_EXT_LBAS;
+ if ((ctrl->ops->flags & NVME_F_FABRICS) &&
+ (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) &&
+ ctrl->max_integrity_segments)
+ ns->features |= NVME_NS_METADATA_SUPPORTED;
+ } else {
+ if (WARN_ON_ONCE(ctrl->ops->flags & NVME_F_FABRICS))
+ return -EINVAL;
+ if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
+ ns->features |= NVME_NS_METADATA_SUPPORTED;
+ }
+ }
+
+ if (iob)
+ blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob));
nvme_update_disk_info(disk, ns, id);
#ifdef CONFIG_NVME_MULTIPATH
if (ns->head->disk) {
nvme_update_disk_info(ns->head->disk, ns, id);
blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
- if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
- struct backing_dev_info *info =
- ns->head->disk->queue->backing_dev_info;
-
- info->capabilities |= BDI_CAP_STABLE_WRITES;
- }
-
revalidate_disk(ns->head->disk);
}
#endif
+ return 0;
}
static int nvme_revalidate_disk(struct gendisk *disk)
@@ -1931,7 +2002,6 @@ static int nvme_revalidate_disk(struct gendisk *disk)
goto free_id;
}
- __nvme_revalidate_disk(disk, id);
ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
if (ret)
goto free_id;
@@ -1940,8 +2010,10 @@ static int nvme_revalidate_disk(struct gendisk *disk)
dev_err(ctrl->device,
"identifiers changed for nsid %d\n", ns->head->ns_id);
ret = -ENODEV;
+ goto free_id;
}
+ ret = __nvme_revalidate_disk(disk, id);
free_id:
kfree(id);
out:
@@ -2249,10 +2321,8 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
- if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
- is_power_of_2(ctrl->max_hw_sectors))
- blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
blk_queue_virt_boundary(q, ctrl->page_size - 1);
+ blk_queue_dma_alignment(q, 7);
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
blk_queue_write_cache(q, vwc, vwc);
@@ -2655,7 +2725,7 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
return false;
}
- if ((id->cmic & (1 << 1)) ||
+ if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
(ctrl->opts && ctrl->opts->discovery_nqn))
continue;
@@ -2746,7 +2816,7 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
void *log, size_t size, u64 offset)
{
struct nvme_command c = { };
- unsigned long dwlen = size / 4 - 1;
+ u32 dwlen = nvme_bytes_to_numd(size);
c.get_log_page.opcode = nvme_admin_get_log_page;
c.get_log_page.nsid = cpu_to_le32(nsid);
@@ -3401,7 +3471,6 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys,
list_for_each_entry(h, &subsys->nsheads, entry) {
if (nvme_ns_ids_valid(&new->ids) &&
- !list_empty(&h->list) &&
nvme_ns_ids_equal(&new->ids, &h->ids))
return -EINVAL;
}
@@ -3410,8 +3479,7 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys,
}
static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
- unsigned nsid, struct nvme_id_ns *id,
- struct nvme_ns_ids *ids)
+ unsigned nsid, struct nvme_ns_ids *ids)
{
struct nvme_ns_head *head;
size_t size = sizeof(*head);
@@ -3469,42 +3537,51 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
struct nvme_id_ns *id)
{
struct nvme_ctrl *ctrl = ns->ctrl;
- bool is_shared = id->nmic & (1 << 0);
+ bool is_shared = id->nmic & NVME_NS_NMIC_SHARED;
struct nvme_ns_head *head = NULL;
struct nvme_ns_ids ids;
int ret = 0;
ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
- if (ret)
- goto out;
+ if (ret) {
+ if (ret < 0)
+ return ret;
+ return blk_status_to_errno(nvme_error_status(ret));
+ }
mutex_lock(&ctrl->subsys->lock);
- if (is_shared)
- head = nvme_find_ns_head(ctrl->subsys, nsid);
+ head = nvme_find_ns_head(ctrl->subsys, nsid);
if (!head) {
- head = nvme_alloc_ns_head(ctrl, nsid, id, &ids);
+ head = nvme_alloc_ns_head(ctrl, nsid, &ids);
if (IS_ERR(head)) {
ret = PTR_ERR(head);
goto out_unlock;
}
+ head->shared = is_shared;
} else {
+ ret = -EINVAL;
+ if (!is_shared || !head->shared) {
+ dev_err(ctrl->device,
+ "Duplicate unshared namespace %d\n", nsid);
+ goto out_put_ns_head;
+ }
if (!nvme_ns_ids_equal(&head->ids, &ids)) {
dev_err(ctrl->device,
"IDs don't match for shared namespace %d\n",
nsid);
- ret = -EINVAL;
- goto out_unlock;
+ goto out_put_ns_head;
}
}
list_add_tail(&ns->siblings, &head->list);
ns->head = head;
+ mutex_unlock(&ctrl->subsys->lock);
+ return 0;
+out_put_ns_head:
+ nvme_put_ns_head(head);
out_unlock:
mutex_unlock(&ctrl->subsys->lock);
-out:
- if (ret > 0)
- ret = blk_status_to_errno(nvme_error_status(ret));
return ret;
}
@@ -3535,32 +3612,6 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
return ret;
}
-static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
-{
- struct streams_directive_params s;
- int ret;
-
- if (!ctrl->nr_streams)
- return 0;
-
- ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
- if (ret)
- return ret;
-
- ns->sws = le32_to_cpu(s.sws);
- ns->sgs = le16_to_cpu(s.sgs);
-
- if (ns->sws) {
- unsigned int bs = 1 << ns->lba_shift;
-
- blk_queue_io_min(ns->queue, bs * ns->sws);
- if (ns->sgs)
- blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
- }
-
- return 0;
-}
-
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
@@ -3604,7 +3655,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
ret = nvme_init_ns_head(ns, nsid, id);
if (ret)
goto out_free_id;
- nvme_setup_streams_ns(ctrl, ns);
nvme_set_disk_name(disk_name, ns, ctrl, &flags);
disk = alloc_disk_node(0, node);
@@ -3618,7 +3668,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
ns->disk = disk;
- __nvme_revalidate_disk(disk, id);
+ if (__nvme_revalidate_disk(disk, id))
+ goto out_put_disk;
if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
ret = nvme_nvm_register(ns, disk_name, node);
@@ -3648,6 +3699,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
+ if (list_empty(&ns->head->list))
+ list_del_init(&ns->head->entry);
mutex_unlock(&ctrl->subsys->lock);
nvme_put_ns_head(ns->head);
out_free_id:
@@ -3667,7 +3720,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
mutex_lock(&ns->ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
+ if (list_empty(&ns->head->list))
+ list_del_init(&ns->head->entry);
mutex_unlock(&ns->ctrl->subsys->lock);
+
synchronize_rcu(); /* guarantee not available in head->list */
nvme_mpath_clear_current_path(ns);
synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
@@ -3687,6 +3743,16 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_put_ns(ns);
}
+static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
+{
+ struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
+
+ if (ns) {
+ nvme_ns_remove(ns);
+ nvme_put_ns(ns);
+ }
+}
+
static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
@@ -3718,39 +3784,34 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
}
-static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
+static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
{
- struct nvme_ns *ns;
+ const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
__le32 *ns_list;
- unsigned i, j, nsid, prev = 0;
- unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024);
- int ret = 0;
+ u32 prev = 0;
+ int ret = 0, i;
+
+ if (nvme_ctrl_limited_cns(ctrl))
+ return -EOPNOTSUPP;
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
if (!ns_list)
return -ENOMEM;
- for (i = 0; i < num_lists; i++) {
+ for (;;) {
ret = nvme_identify_ns_list(ctrl, prev, ns_list);
if (ret)
goto free;
- for (j = 0; j < min(nn, 1024U); j++) {
- nsid = le32_to_cpu(ns_list[j]);
- if (!nsid)
- goto out;
+ for (i = 0; i < nr_entries; i++) {
+ u32 nsid = le32_to_cpu(ns_list[i]);
+ if (!nsid) /* end of the list? */
+ goto out;
nvme_validate_ns(ctrl, nsid);
-
- while (++prev < nsid) {
- ns = nvme_find_get_ns(ctrl, prev);
- if (ns) {
- nvme_ns_remove(ns);
- nvme_put_ns(ns);
- }
- }
+ while (++prev < nsid)
+ nvme_ns_remove_by_nsid(ctrl, prev);
}
- nn -= j;
}
out:
nvme_remove_invalid_namespaces(ctrl, prev);
@@ -3759,9 +3820,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
return ret;
}
-static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
+static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
{
- unsigned i;
+ struct nvme_id_ctrl *id;
+ u32 nn, i;
+
+ if (nvme_identify_ctrl(ctrl, &id))
+ return;
+ nn = le32_to_cpu(id->nn);
+ kfree(id);
for (i = 1; i <= nn; i++)
nvme_validate_ns(ctrl, i);
@@ -3798,8 +3865,6 @@ static void nvme_scan_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, scan_work);
- struct nvme_id_ctrl *id;
- unsigned nn;
/* No tagset on a live ctrl means IO queues could not be created */
if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
@@ -3810,20 +3875,11 @@ static void nvme_scan_work(struct work_struct *work)
nvme_clear_changed_ns_log(ctrl);
}
- if (nvme_identify_ctrl(ctrl, &id))
- return;
-
mutex_lock(&ctrl->scan_lock);
- nn = le32_to_cpu(id->nn);
- if (ctrl->vs >= NVME_VS(1, 1, 0) &&
- !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
- if (!nvme_scan_ns_list(ctrl, nn))
- goto out_free_id;
- }
- nvme_scan_ns_sequential(ctrl, nn);
-out_free_id:
+ if (nvme_scan_ns_list(ctrl) != 0)
+ nvme_scan_ns_sequential(ctrl);
mutex_unlock(&ctrl->scan_lock);
- kfree(id);
+
down_write(&ctrl->namespaces_rwsem);
list_sort(NULL, &ctrl->namespaces, ns_cmp);
up_write(&ctrl->namespaces_rwsem);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 7dfc4a2ecf1e..e999a8c4b7e8 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -14,6 +14,7 @@
#include "fabrics.h"
#include <linux/nvme-fc-driver.h>
#include <linux/nvme-fc.h>
+#include "fc.h"
#include <scsi/scsi_transport_fc.h>
/* *************************** Data Structures/Defines ****************** */
@@ -61,6 +62,17 @@ struct nvmefc_ls_req_op {
bool req_queued;
};
+struct nvmefc_ls_rcv_op {
+ struct nvme_fc_rport *rport;
+ struct nvmefc_ls_rsp *lsrsp;
+ union nvmefc_ls_requests *rqstbuf;
+ union nvmefc_ls_responses *rspbuf;
+ u16 rqstdatalen;
+ bool handled;
+ dma_addr_t rspdma;
+ struct list_head lsrcv_list; /* rport->ls_rcv_list */
+} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */
+
enum nvme_fcpop_state {
FCPOP_STATE_UNINIT = 0,
FCPOP_STATE_IDLE = 1,
@@ -96,7 +108,7 @@ struct nvme_fc_fcp_op {
struct nvme_fcp_op_w_sgl {
struct nvme_fc_fcp_op op;
struct scatterlist sgl[NVME_INLINE_SG_CNT];
- uint8_t priv[0];
+ uint8_t priv[];
};
struct nvme_fc_lport {
@@ -117,6 +129,7 @@ struct nvme_fc_rport {
struct list_head endp_list; /* for lport->endp_list */
struct list_head ctrl_list;
struct list_head ls_req_list;
+ struct list_head ls_rcv_list;
struct list_head disc_list;
struct device *dev; /* physical device for dma */
struct nvme_fc_lport *lport;
@@ -124,11 +137,12 @@ struct nvme_fc_rport {
struct kref ref;
atomic_t act_ctrl_cnt;
unsigned long dev_loss_end;
+ struct work_struct lsrcv_work;
} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */
-enum nvme_fcctrl_flags {
- FCCTRL_TERMIO = (1 << 0),
-};
+/* fc_ctrl flags values - specified as bit positions */
+#define ASSOC_ACTIVE 0
+#define FCCTRL_TERMIO 1
struct nvme_fc_ctrl {
spinlock_t lock;
@@ -139,9 +153,9 @@ struct nvme_fc_ctrl {
u32 cnum;
bool ioq_live;
- bool assoc_active;
atomic_t err_work_active;
u64 association_id;
+ struct nvmefc_ls_rcv_op *rcv_disconn;
struct list_head ctrl_list; /* rport->ctrl_list */
@@ -152,7 +166,7 @@ struct nvme_fc_ctrl {
struct work_struct err_work;
struct kref ref;
- u32 flags;
+ unsigned long flags;
u32 iocnt;
wait_queue_head_t ioabort_wait;
@@ -219,6 +233,9 @@ static struct device *fc_udev_device;
static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *,
struct nvme_fc_queue *, unsigned int);
+static void nvme_fc_handle_ls_rqst_work(struct work_struct *work);
+
+
static void
nvme_fc_free_lport(struct kref *ref)
{
@@ -394,7 +411,10 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
newrec->ops = template;
newrec->dev = dev;
ida_init(&newrec->endp_cnt);
- newrec->localport.private = &newrec[1];
+ if (template->local_priv_sz)
+ newrec->localport.private = &newrec[1];
+ else
+ newrec->localport.private = NULL;
newrec->localport.node_name = pinfo->node_name;
newrec->localport.port_name = pinfo->port_name;
newrec->localport.port_role = pinfo->port_role;
@@ -701,9 +721,13 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
atomic_set(&newrec->act_ctrl_cnt, 0);
spin_lock_init(&newrec->lock);
newrec->remoteport.localport = &lport->localport;
+ INIT_LIST_HEAD(&newrec->ls_rcv_list);
newrec->dev = lport->dev;
newrec->lport = lport;
- newrec->remoteport.private = &newrec[1];
+ if (lport->ops->remote_priv_sz)
+ newrec->remoteport.private = &newrec[1];
+ else
+ newrec->remoteport.private = NULL;
newrec->remoteport.port_role = pinfo->port_role;
newrec->remoteport.node_name = pinfo->node_name;
newrec->remoteport.port_name = pinfo->port_name;
@@ -711,6 +735,7 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
newrec->remoteport.port_state = FC_OBJSTATE_ONLINE;
newrec->remoteport.port_num = idx;
__nvme_fc_set_dev_loss_tmo(newrec, pinfo);
+ INIT_WORK(&newrec->lsrcv_work, nvme_fc_handle_ls_rqst_work);
spin_lock_irqsave(&nvme_fc_lock, flags);
list_add_tail(&newrec->endp_list, &lport->endp_list);
@@ -1000,6 +1025,7 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
+static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
static void
__nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
@@ -1140,41 +1166,6 @@ nvme_fc_send_ls_req_async(struct nvme_fc_rport *rport,
return __nvme_fc_send_ls_req(rport, lsop, done);
}
-/* Validation Error indexes into the string table below */
-enum {
- VERR_NO_ERROR = 0,
- VERR_LSACC = 1,
- VERR_LSDESC_RQST = 2,
- VERR_LSDESC_RQST_LEN = 3,
- VERR_ASSOC_ID = 4,
- VERR_ASSOC_ID_LEN = 5,
- VERR_CONN_ID = 6,
- VERR_CONN_ID_LEN = 7,
- VERR_CR_ASSOC = 8,
- VERR_CR_ASSOC_ACC_LEN = 9,
- VERR_CR_CONN = 10,
- VERR_CR_CONN_ACC_LEN = 11,
- VERR_DISCONN = 12,
- VERR_DISCONN_ACC_LEN = 13,
-};
-
-static char *validation_errors[] = {
- "OK",
- "Not LS_ACC",
- "Not LSDESC_RQST",
- "Bad LSDESC_RQST Length",
- "Not Association ID",
- "Bad Association ID Length",
- "Not Connection ID",
- "Bad Connection ID Length",
- "Not CR_ASSOC Rqst",
- "Bad CR_ASSOC ACC Length",
- "Not CR_CONN Rqst",
- "Bad CR_CONN ACC Length",
- "Not Disconnect Rqst",
- "Bad Disconnect ACC Length",
-};
-
static int
nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
struct nvme_fc_queue *queue, u16 qsize, u16 ersp_ratio)
@@ -1183,21 +1174,27 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
struct nvmefc_ls_req *lsreq;
struct fcnvme_ls_cr_assoc_rqst *assoc_rqst;
struct fcnvme_ls_cr_assoc_acc *assoc_acc;
+ unsigned long flags;
int ret, fcret = 0;
lsop = kzalloc((sizeof(*lsop) +
- ctrl->lport->ops->lsrqst_priv_sz +
- sizeof(*assoc_rqst) + sizeof(*assoc_acc)), GFP_KERNEL);
+ sizeof(*assoc_rqst) + sizeof(*assoc_acc) +
+ ctrl->lport->ops->lsrqst_priv_sz), GFP_KERNEL);
if (!lsop) {
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: send Create Association failed: ENOMEM\n",
+ ctrl->cnum);
ret = -ENOMEM;
goto out_no_memory;
}
- lsreq = &lsop->ls_req;
- lsreq->private = (void *)&lsop[1];
- assoc_rqst = (struct fcnvme_ls_cr_assoc_rqst *)
- (lsreq->private + ctrl->lport->ops->lsrqst_priv_sz);
+ assoc_rqst = (struct fcnvme_ls_cr_assoc_rqst *)&lsop[1];
assoc_acc = (struct fcnvme_ls_cr_assoc_acc *)&assoc_rqst[1];
+ lsreq = &lsop->ls_req;
+ if (ctrl->lport->ops->lsrqst_priv_sz)
+ lsreq->private = &assoc_acc[1];
+ else
+ lsreq->private = NULL;
assoc_rqst->w0.ls_cmd = FCNVME_LS_CREATE_ASSOCIATION;
assoc_rqst->desc_list_len =
@@ -1267,11 +1264,13 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
"q %d Create Association LS failed: %s\n",
queue->qnum, validation_errors[fcret]);
} else {
+ spin_lock_irqsave(&ctrl->lock, flags);
ctrl->association_id =
be64_to_cpu(assoc_acc->associd.association_id);
queue->connection_id =
be64_to_cpu(assoc_acc->connectid.connection_id);
set_bit(NVME_FC_Q_CONNECTED, &queue->flags);
+ spin_unlock_irqrestore(&ctrl->lock, flags);
}
out_free_buffer:
@@ -1295,18 +1294,23 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
int ret, fcret = 0;
lsop = kzalloc((sizeof(*lsop) +
- ctrl->lport->ops->lsrqst_priv_sz +
- sizeof(*conn_rqst) + sizeof(*conn_acc)), GFP_KERNEL);
+ sizeof(*conn_rqst) + sizeof(*conn_acc) +
+ ctrl->lport->ops->lsrqst_priv_sz), GFP_KERNEL);
if (!lsop) {
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: send Create Connection failed: ENOMEM\n",
+ ctrl->cnum);
ret = -ENOMEM;
goto out_no_memory;
}
- lsreq = &lsop->ls_req;
- lsreq->private = (void *)&lsop[1];
- conn_rqst = (struct fcnvme_ls_cr_conn_rqst *)
- (lsreq->private + ctrl->lport->ops->lsrqst_priv_sz);
+ conn_rqst = (struct fcnvme_ls_cr_conn_rqst *)&lsop[1];
conn_acc = (struct fcnvme_ls_cr_conn_acc *)&conn_rqst[1];
+ lsreq = &lsop->ls_req;
+ if (ctrl->lport->ops->lsrqst_priv_sz)
+ lsreq->private = (void *)&conn_acc[1];
+ else
+ lsreq->private = NULL;
conn_rqst->w0.ls_cmd = FCNVME_LS_CREATE_CONNECTION;
conn_rqst->desc_list_len = cpu_to_be32(
@@ -1420,54 +1424,385 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
int ret;
lsop = kzalloc((sizeof(*lsop) +
- ctrl->lport->ops->lsrqst_priv_sz +
- sizeof(*discon_rqst) + sizeof(*discon_acc)),
- GFP_KERNEL);
- if (!lsop)
- /* couldn't sent it... too bad */
+ sizeof(*discon_rqst) + sizeof(*discon_acc) +
+ ctrl->lport->ops->lsrqst_priv_sz), GFP_KERNEL);
+ if (!lsop) {
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: send Disconnect Association "
+ "failed: ENOMEM\n",
+ ctrl->cnum);
return;
+ }
+ discon_rqst = (struct fcnvme_ls_disconnect_assoc_rqst *)&lsop[1];
+ discon_acc = (struct fcnvme_ls_disconnect_assoc_acc *)&discon_rqst[1];
lsreq = &lsop->ls_req;
+ if (ctrl->lport->ops->lsrqst_priv_sz)
+ lsreq->private = (void *)&discon_acc[1];
+ else
+ lsreq->private = NULL;
- lsreq->private = (void *)&lsop[1];
- discon_rqst = (struct fcnvme_ls_disconnect_assoc_rqst *)
- (lsreq->private + ctrl->lport->ops->lsrqst_priv_sz);
- discon_acc = (struct fcnvme_ls_disconnect_assoc_acc *)&discon_rqst[1];
+ nvmefc_fmt_lsreq_discon_assoc(lsreq, discon_rqst, discon_acc,
+ ctrl->association_id);
- discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT_ASSOC;
- discon_rqst->desc_list_len = cpu_to_be32(
- sizeof(struct fcnvme_lsdesc_assoc_id) +
- sizeof(struct fcnvme_lsdesc_disconn_cmd));
+ ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop,
+ nvme_fc_disconnect_assoc_done);
+ if (ret)
+ kfree(lsop);
+}
- discon_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID);
- discon_rqst->associd.desc_len =
- fcnvme_lsdesc_len(
- sizeof(struct fcnvme_lsdesc_assoc_id));
+static void
+nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
+{
+ struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
+ struct nvme_fc_rport *rport = lsop->rport;
+ struct nvme_fc_lport *lport = rport->lport;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rport->lock, flags);
+ list_del(&lsop->lsrcv_list);
+ spin_unlock_irqrestore(&rport->lock, flags);
+
+ fc_dma_sync_single_for_cpu(lport->dev, lsop->rspdma,
+ sizeof(*lsop->rspbuf), DMA_TO_DEVICE);
+ fc_dma_unmap_single(lport->dev, lsop->rspdma,
+ sizeof(*lsop->rspbuf), DMA_TO_DEVICE);
+
+ kfree(lsop);
+
+ nvme_fc_rport_put(rport);
+}
+
+static void
+nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
+{
+ struct nvme_fc_rport *rport = lsop->rport;
+ struct nvme_fc_lport *lport = rport->lport;
+ struct fcnvme_ls_rqst_w0 *w0 = &lsop->rqstbuf->w0;
+ int ret;
+
+ fc_dma_sync_single_for_device(lport->dev, lsop->rspdma,
+ sizeof(*lsop->rspbuf), DMA_TO_DEVICE);
+
+ ret = lport->ops->xmt_ls_rsp(&lport->localport, &rport->remoteport,
+ lsop->lsrsp);
+ if (ret) {
+ dev_warn(lport->dev,
+ "LLDD rejected LS RSP xmt: LS %d status %d\n",
+ w0->ls_cmd, ret);
+ nvme_fc_xmt_ls_rsp_done(lsop->lsrsp);
+ return;
+ }
+}
+
+static struct nvme_fc_ctrl *
+nvme_fc_match_disconn_ls(struct nvme_fc_rport *rport,
+ struct nvmefc_ls_rcv_op *lsop)
+{
+ struct fcnvme_ls_disconnect_assoc_rqst *rqst =
+ &lsop->rqstbuf->rq_dis_assoc;
+ struct nvme_fc_ctrl *ctrl, *ret = NULL;
+ struct nvmefc_ls_rcv_op *oldls = NULL;
+ u64 association_id = be64_to_cpu(rqst->associd.association_id);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rport->lock, flags);
+
+ list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
+ if (!nvme_fc_ctrl_get(ctrl))
+ continue;
+ spin_lock(&ctrl->lock);
+ if (association_id == ctrl->association_id) {
+ oldls = ctrl->rcv_disconn;
+ ctrl->rcv_disconn = lsop;
+ ret = ctrl;
+ }
+ spin_unlock(&ctrl->lock);
+ if (ret)
+ /* leave the ctrl get reference */
+ break;
+ nvme_fc_ctrl_put(ctrl);
+ }
+
+ spin_unlock_irqrestore(&rport->lock, flags);
+
+ /* transmit a response for anything that was pending */
+ if (oldls) {
+ dev_info(rport->lport->dev,
+ "NVME-FC{%d}: Multiple Disconnect Association "
+ "LS's received\n", ctrl->cnum);
+ /* overwrite good response with bogus failure */
+ oldls->lsrsp->rsplen = nvme_fc_format_rjt(oldls->rspbuf,
+ sizeof(*oldls->rspbuf),
+ rqst->w0.ls_cmd,
+ FCNVME_RJT_RC_UNAB,
+ FCNVME_RJT_EXP_NONE, 0);
+ nvme_fc_xmt_ls_rsp(oldls);
+ }
+
+ return ret;
+}
+
+/*
+ * returns true to mean LS handled and ls_rsp can be sent
+ * returns false to defer ls_rsp xmt (will be done as part of
+ * association termination)
+ */
+static bool
+nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop)
+{
+ struct nvme_fc_rport *rport = lsop->rport;
+ struct fcnvme_ls_disconnect_assoc_rqst *rqst =
+ &lsop->rqstbuf->rq_dis_assoc;
+ struct fcnvme_ls_disconnect_assoc_acc *acc =
+ &lsop->rspbuf->rsp_dis_assoc;
+ struct nvme_fc_ctrl *ctrl = NULL;
+ int ret = 0;
+
+ memset(acc, 0, sizeof(*acc));
+
+ ret = nvmefc_vldt_lsreq_discon_assoc(lsop->rqstdatalen, rqst);
+ if (!ret) {
+ /* match an active association */
+ ctrl = nvme_fc_match_disconn_ls(rport, lsop);
+ if (!ctrl)
+ ret = VERR_NO_ASSOC;
+ }
+
+ if (ret) {
+ dev_info(rport->lport->dev,
+ "Disconnect LS failed: %s\n",
+ validation_errors[ret]);
+ lsop->lsrsp->rsplen = nvme_fc_format_rjt(acc,
+ sizeof(*acc), rqst->w0.ls_cmd,
+ (ret == VERR_NO_ASSOC) ?
+ FCNVME_RJT_RC_INV_ASSOC :
+ FCNVME_RJT_RC_LOGIC,
+ FCNVME_RJT_EXP_NONE, 0);
+ return true;
+ }
+
+ /* format an ACCept response */
- discon_rqst->associd.association_id = cpu_to_be64(ctrl->association_id);
+ lsop->lsrsp->rsplen = sizeof(*acc);
- discon_rqst->discon_cmd.desc_tag = cpu_to_be32(
- FCNVME_LSDESC_DISCONN_CMD);
- discon_rqst->discon_cmd.desc_len =
+ nvme_fc_format_rsp_hdr(acc, FCNVME_LS_ACC,
fcnvme_lsdesc_len(
- sizeof(struct fcnvme_lsdesc_disconn_cmd));
+ sizeof(struct fcnvme_ls_disconnect_assoc_acc)),
+ FCNVME_LS_DISCONNECT_ASSOC);
- lsreq->rqstaddr = discon_rqst;
- lsreq->rqstlen = sizeof(*discon_rqst);
- lsreq->rspaddr = discon_acc;
- lsreq->rsplen = sizeof(*discon_acc);
- lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC;
+ /*
+ * the transmit of the response will occur after the exchanges
+ * for the association have been ABTS'd by
+ * nvme_fc_delete_association().
+ */
- ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop,
- nvme_fc_disconnect_assoc_done);
- if (ret)
- kfree(lsop);
+ /* fail the association */
+ nvme_fc_error_recovery(ctrl, "Disconnect Association LS received");
+
+ /* release the reference taken by nvme_fc_match_disconn_ls() */
+ nvme_fc_ctrl_put(ctrl);
+
+ return false;
}
+/*
+ * Actual Processing routine for received FC-NVME LS Requests from the LLD
+ * returns true if a response should be sent afterward, false if rsp will
+ * be sent asynchronously.
+ */
+static bool
+nvme_fc_handle_ls_rqst(struct nvmefc_ls_rcv_op *lsop)
+{
+ struct fcnvme_ls_rqst_w0 *w0 = &lsop->rqstbuf->w0;
+ bool ret = true;
+
+ lsop->lsrsp->nvme_fc_private = lsop;
+ lsop->lsrsp->rspbuf = lsop->rspbuf;
+ lsop->lsrsp->rspdma = lsop->rspdma;
+ lsop->lsrsp->done = nvme_fc_xmt_ls_rsp_done;
+ /* Be preventative. handlers will later set to valid length */
+ lsop->lsrsp->rsplen = 0;
-/* *********************** NVME Ctrl Routines **************************** */
+ /*
+ * handlers:
+ * parse request input, execute the request, and format the
+ * LS response
+ */
+ switch (w0->ls_cmd) {
+ case FCNVME_LS_DISCONNECT_ASSOC:
+ ret = nvme_fc_ls_disconnect_assoc(lsop);
+ break;
+ case FCNVME_LS_DISCONNECT_CONN:
+ lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf,
+ sizeof(*lsop->rspbuf), w0->ls_cmd,
+ FCNVME_RJT_RC_UNSUP, FCNVME_RJT_EXP_NONE, 0);
+ break;
+ case FCNVME_LS_CREATE_ASSOCIATION:
+ case FCNVME_LS_CREATE_CONNECTION:
+ lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf,
+ sizeof(*lsop->rspbuf), w0->ls_cmd,
+ FCNVME_RJT_RC_LOGIC, FCNVME_RJT_EXP_NONE, 0);
+ break;
+ default:
+ lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf,
+ sizeof(*lsop->rspbuf), w0->ls_cmd,
+ FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0);
+ break;
+ }
-static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
+ return(ret);
+}
+
+static void
+nvme_fc_handle_ls_rqst_work(struct work_struct *work)
+{
+ struct nvme_fc_rport *rport =
+ container_of(work, struct nvme_fc_rport, lsrcv_work);
+ struct fcnvme_ls_rqst_w0 *w0;
+ struct nvmefc_ls_rcv_op *lsop;
+ unsigned long flags;
+ bool sendrsp;
+
+restart:
+ sendrsp = true;
+ spin_lock_irqsave(&rport->lock, flags);
+ list_for_each_entry(lsop, &rport->ls_rcv_list, lsrcv_list) {
+ if (lsop->handled)
+ continue;
+
+ lsop->handled = true;
+ if (rport->remoteport.port_state == FC_OBJSTATE_ONLINE) {
+ spin_unlock_irqrestore(&rport->lock, flags);
+ sendrsp = nvme_fc_handle_ls_rqst(lsop);
+ } else {
+ spin_unlock_irqrestore(&rport->lock, flags);
+ w0 = &lsop->rqstbuf->w0;
+ lsop->lsrsp->rsplen = nvme_fc_format_rjt(
+ lsop->rspbuf,
+ sizeof(*lsop->rspbuf),
+ w0->ls_cmd,
+ FCNVME_RJT_RC_UNAB,
+ FCNVME_RJT_EXP_NONE, 0);
+ }
+ if (sendrsp)
+ nvme_fc_xmt_ls_rsp(lsop);
+ goto restart;
+ }
+ spin_unlock_irqrestore(&rport->lock, flags);
+}
+
+/**
+ * nvme_fc_rcv_ls_req - transport entry point called by an LLDD
+ * upon the reception of a NVME LS request.
+ *
+ * The nvme-fc layer will copy payload to an internal structure for
+ * processing. As such, upon completion of the routine, the LLDD may
+ * immediately free/reuse the LS request buffer passed in the call.
+ *
+ * If this routine returns error, the LLDD should abort the exchange.
+ *
+ * @remoteport: pointer to the (registered) remote port that the LS
+ * was received from. The remoteport is associated with
+ * a specific localport.
+ * @lsrsp: pointer to a nvmefc_ls_rsp response structure to be
+ * used to reference the exchange corresponding to the LS
+ * when issuing an ls response.
+ * @lsreqbuf: pointer to the buffer containing the LS Request
+ * @lsreqbuf_len: length, in bytes, of the received LS request
+ */
+int
+nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr,
+ struct nvmefc_ls_rsp *lsrsp,
+ void *lsreqbuf, u32 lsreqbuf_len)
+{
+ struct nvme_fc_rport *rport = remoteport_to_rport(portptr);
+ struct nvme_fc_lport *lport = rport->lport;
+ struct fcnvme_ls_rqst_w0 *w0 = (struct fcnvme_ls_rqst_w0 *)lsreqbuf;
+ struct nvmefc_ls_rcv_op *lsop;
+ unsigned long flags;
+ int ret;
+
+ nvme_fc_rport_get(rport);
+
+ /* validate there's a routine to transmit a response */
+ if (!lport->ops->xmt_ls_rsp) {
+ dev_info(lport->dev,
+ "RCV %s LS failed: no LLDD xmt_ls_rsp\n",
+ (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
+ nvmefc_ls_names[w0->ls_cmd] : "");
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ if (lsreqbuf_len > sizeof(union nvmefc_ls_requests)) {
+ dev_info(lport->dev,
+ "RCV %s LS failed: payload too large\n",
+ (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
+ nvmefc_ls_names[w0->ls_cmd] : "");
+ ret = -E2BIG;
+ goto out_put;
+ }
+
+ lsop = kzalloc(sizeof(*lsop) +
+ sizeof(union nvmefc_ls_requests) +
+ sizeof(union nvmefc_ls_responses),
+ GFP_KERNEL);
+ if (!lsop) {
+ dev_info(lport->dev,
+ "RCV %s LS failed: No memory\n",
+ (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
+ nvmefc_ls_names[w0->ls_cmd] : "");
+ ret = -ENOMEM;
+ goto out_put;
+ }
+ lsop->rqstbuf = (union nvmefc_ls_requests *)&lsop[1];
+ lsop->rspbuf = (union nvmefc_ls_responses *)&lsop->rqstbuf[1];
+
+ lsop->rspdma = fc_dma_map_single(lport->dev, lsop->rspbuf,
+ sizeof(*lsop->rspbuf),
+ DMA_TO_DEVICE);
+ if (fc_dma_mapping_error(lport->dev, lsop->rspdma)) {
+ dev_info(lport->dev,
+ "RCV %s LS failed: DMA mapping failure\n",
+ (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
+ nvmefc_ls_names[w0->ls_cmd] : "");
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ lsop->rport = rport;
+ lsop->lsrsp = lsrsp;
+
+ memcpy(lsop->rqstbuf, lsreqbuf, lsreqbuf_len);
+ lsop->rqstdatalen = lsreqbuf_len;
+
+ spin_lock_irqsave(&rport->lock, flags);
+ if (rport->remoteport.port_state != FC_OBJSTATE_ONLINE) {
+ spin_unlock_irqrestore(&rport->lock, flags);
+ ret = -ENOTCONN;
+ goto out_unmap;
+ }
+ list_add_tail(&lsop->lsrcv_list, &rport->ls_rcv_list);
+ spin_unlock_irqrestore(&rport->lock, flags);
+
+ schedule_work(&rport->lsrcv_work);
+
+ return 0;
+
+out_unmap:
+ fc_dma_unmap_single(lport->dev, lsop->rspdma,
+ sizeof(*lsop->rspbuf), DMA_TO_DEVICE);
+out_free:
+ kfree(lsop);
+out_put:
+ nvme_fc_rport_put(rport);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_fc_rcv_ls_req);
+
+
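A minimal LLDD-side sketch (not part of this patch) of how a driver might hand a received FC-NVME LS frame to the new nvme_fc_rcv_ls_req() entry point; the per-exchange context structure and the my_lldd_abort_exchange() helper below are assumptions made purely for illustration:

	#include <linux/nvme-fc-driver.h>

	/* Hypothetical per-exchange context owned by the LLDD. */
	struct my_lldd_ls_exchange {
		struct nvmefc_ls_rsp ls_rsp;	/* filled in by nvme-fc, completed via ->done */
		void *frame_payload;		/* received LS request payload */
		u32 frame_len;
	};

	/* Assumed LLDD helper that aborts the FC exchange. */
	static void my_lldd_abort_exchange(struct my_lldd_ls_exchange *xchg);

	static void my_lldd_deliver_ls(struct nvme_fc_remote_port *remoteport,
				       struct my_lldd_ls_exchange *xchg)
	{
		int ret;

		/*
		 * nvme-fc copies the payload into its own buffers, so the
		 * frame buffer may be reused as soon as this call returns.
		 */
		ret = nvme_fc_rcv_ls_req(remoteport, &xchg->ls_rsp,
					 xchg->frame_payload, xchg->frame_len);
		if (ret)
			my_lldd_abort_exchange(xchg);	/* per the kernel-doc above */
	}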
+/* *********************** NVME Ctrl Routines **************************** */
static void
__nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl,
@@ -1500,7 +1835,7 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
if (opstate != FCPOP_STATE_ACTIVE)
atomic_set(&op->state, opstate);
- else if (ctrl->flags & FCCTRL_TERMIO)
+ else if (test_bit(FCCTRL_TERMIO, &ctrl->flags))
ctrl->iocnt++;
spin_unlock_irqrestore(&ctrl->lock, flags);
@@ -1537,7 +1872,7 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
if (opstate == FCPOP_STATE_ABORTED) {
spin_lock_irqsave(&ctrl->lock, flags);
- if (ctrl->flags & FCCTRL_TERMIO) {
+ if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) {
if (!--ctrl->iocnt)
wake_up(&ctrl->ioabort_wait);
}
@@ -1771,7 +2106,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq,
res = __nvme_fc_init_request(ctrl, queue, &op->op, rq, queue->rqcnt++);
if (res)
return res;
- op->op.fcp_req.first_sgl = &op->sgl[0];
+ op->op.fcp_req.first_sgl = op->sgl;
op->op.fcp_req.private = &op->priv[0];
nvme_req(rq)->ctrl = &ctrl->ctrl;
return res;
@@ -1783,15 +2118,17 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
struct nvme_fc_fcp_op *aen_op;
struct nvme_fc_cmd_iu *cmdiu;
struct nvme_command *sqe;
- void *private;
+ void *private = NULL;
int i, ret;
aen_op = ctrl->aen_ops;
for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
- private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz,
+ if (ctrl->lport->ops->fcprqst_priv_sz) {
+ private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz,
GFP_KERNEL);
- if (!private)
- return -ENOMEM;
+ if (!private)
+ return -ENOMEM;
+ }
cmdiu = &aen_op->cmd_iu;
sqe = &cmdiu->sqe;
@@ -1822,9 +2159,6 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl)
aen_op = ctrl->aen_ops;
for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
- if (!aen_op->fcp_req.private)
- continue;
-
__nvme_fc_exit_request(ctrl, aen_op);
kfree(aen_op->fcp_req.private);
@@ -2300,10 +2634,11 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
- if (!(op->flags & FCOP_FLAGS_AEN))
+ if (!(op->flags & FCOP_FLAGS_AEN)) {
nvme_fc_unmap_data(ctrl, op->rq, op);
+ nvme_cleanup_cmd(op->rq);
+ }
- nvme_cleanup_cmd(op->rq);
nvme_fc_ctrl_put(ctrl);
if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE &&
@@ -2366,16 +2701,9 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg);
struct nvme_fc_fcp_op *aen_op;
- unsigned long flags;
- bool terminating = false;
blk_status_t ret;
- spin_lock_irqsave(&ctrl->lock, flags);
- if (ctrl->flags & FCCTRL_TERMIO)
- terminating = true;
- spin_unlock_irqrestore(&ctrl->lock, flags);
-
- if (terminating)
+ if (test_bit(FCCTRL_TERMIO, &ctrl->flags))
return;
aen_op = &ctrl->aen_ops[0];
@@ -2584,10 +2912,9 @@ nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl)
struct nvme_fc_rport *rport = ctrl->rport;
u32 cnt;
- if (ctrl->assoc_active)
+ if (test_and_set_bit(ASSOC_ACTIVE, &ctrl->flags))
return 1;
- ctrl->assoc_active = true;
cnt = atomic_inc_return(&rport->act_ctrl_cnt);
if (cnt == 1)
nvme_fc_rport_active_on_lport(rport);
@@ -2602,7 +2929,7 @@ nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl)
struct nvme_fc_lport *lport = rport->lport;
u32 cnt;
- /* ctrl->assoc_active=false will be set independently */
+ /* clearing of ctrl->flags ASSOC_ACTIVE bit is in association delete */
cnt = atomic_dec_return(&rport->act_ctrl_cnt);
if (cnt == 0) {
@@ -2622,6 +2949,8 @@ static int
nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
{
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+ struct nvmefc_ls_rcv_op *disls = NULL;
+ unsigned long flags;
int ret;
bool changed;
@@ -2739,12 +3068,18 @@ out_term_aen_ops:
out_disconnect_admin_queue:
/* send a Disconnect(association) LS to fc-nvme target */
nvme_fc_xmt_disconnect_assoc(ctrl);
+ spin_lock_irqsave(&ctrl->lock, flags);
ctrl->association_id = 0;
+ disls = ctrl->rcv_disconn;
+ ctrl->rcv_disconn = NULL;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ if (disls)
+ nvme_fc_xmt_ls_rsp(disls);
out_delete_hw_queue:
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
out_free_queue:
nvme_fc_free_queue(&ctrl->queues[0]);
- ctrl->assoc_active = false;
+ clear_bit(ASSOC_ACTIVE, &ctrl->flags);
nvme_fc_ctlr_inactive_on_rport(ctrl);
return ret;
@@ -2759,14 +3094,14 @@ out_free_queue:
static void
nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
{
+ struct nvmefc_ls_rcv_op *disls = NULL;
unsigned long flags;
- if (!ctrl->assoc_active)
+ if (!test_and_clear_bit(ASSOC_ACTIVE, &ctrl->flags))
return;
- ctrl->assoc_active = false;
spin_lock_irqsave(&ctrl->lock, flags);
- ctrl->flags |= FCCTRL_TERMIO;
+ set_bit(FCCTRL_TERMIO, &ctrl->flags);
ctrl->iocnt = 0;
spin_unlock_irqrestore(&ctrl->lock, flags);
@@ -2817,7 +3152,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
/* wait for all io that had to be aborted */
spin_lock_irq(&ctrl->lock);
wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock);
- ctrl->flags &= ~FCCTRL_TERMIO;
+ clear_bit(FCCTRL_TERMIO, &ctrl->flags);
spin_unlock_irq(&ctrl->lock);
nvme_fc_term_aen_ops(ctrl);
@@ -2831,7 +3166,17 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
if (ctrl->association_id)
nvme_fc_xmt_disconnect_assoc(ctrl);
+ spin_lock_irqsave(&ctrl->lock, flags);
ctrl->association_id = 0;
+ disls = ctrl->rcv_disconn;
+ ctrl->rcv_disconn = NULL;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ if (disls)
+ /*
+ * if a Disconnect Request was waiting for a response, send
+ * now that all ABTS's have been issued (and are complete).
+ */
+ nvme_fc_xmt_ls_rsp(disls);
if (ctrl->ctrl.tagset) {
nvme_fc_delete_hw_io_queues(ctrl);
@@ -2902,7 +3247,9 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: dev_loss_tmo (%d) expired "
"while waiting for remoteport connectivity.\n",
- ctrl->cnum, portptr->dev_loss_tmo);
+ ctrl->cnum, min_t(int, portptr->dev_loss_tmo,
+ (ctrl->ctrl.opts->max_reconnects *
+ ctrl->ctrl.opts->reconnect_delay)));
WARN_ON(nvme_delete_ctrl(&ctrl->ctrl));
}
}
@@ -3089,7 +3436,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->dev = lport->dev;
ctrl->cnum = idx;
ctrl->ioq_live = false;
- ctrl->assoc_active = false;
atomic_set(&ctrl->err_work_active, 0);
init_waitqueue_head(&ctrl->ioabort_wait);
diff --git a/drivers/nvme/host/fc.h b/drivers/nvme/host/fc.h
new file mode 100644
index 000000000000..05ce566f2caf
--- /dev/null
+++ b/drivers/nvme/host/fc.h
@@ -0,0 +1,227 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2016, Avago Technologies
+ */
+
+#ifndef _NVME_FC_TRANSPORT_H
+#define _NVME_FC_TRANSPORT_H 1
+
+
+/*
+ * Common definitions between the nvme_fc (host) transport and
+ * nvmet_fc (target) transport implementation.
+ */
+
+/*
+ * ****************** FC-NVME LS HANDLING ******************
+ */
+
+union nvmefc_ls_requests {
+ struct fcnvme_ls_rqst_w0 w0;
+ struct fcnvme_ls_cr_assoc_rqst rq_cr_assoc;
+ struct fcnvme_ls_cr_conn_rqst rq_cr_conn;
+ struct fcnvme_ls_disconnect_assoc_rqst rq_dis_assoc;
+ struct fcnvme_ls_disconnect_conn_rqst rq_dis_conn;
+} __aligned(128); /* alignment for other things alloc'd with */
+
+union nvmefc_ls_responses {
+ struct fcnvme_ls_rjt rsp_rjt;
+ struct fcnvme_ls_cr_assoc_acc rsp_cr_assoc;
+ struct fcnvme_ls_cr_conn_acc rsp_cr_conn;
+ struct fcnvme_ls_disconnect_assoc_acc rsp_dis_assoc;
+ struct fcnvme_ls_disconnect_conn_acc rsp_dis_conn;
+} __aligned(128); /* alignment for other things alloc'd with */
+
+static inline void
+nvme_fc_format_rsp_hdr(void *buf, u8 ls_cmd, __be32 desc_len, u8 rqst_ls_cmd)
+{
+ struct fcnvme_ls_acc_hdr *acc = buf;
+
+ acc->w0.ls_cmd = ls_cmd;
+ acc->desc_list_len = desc_len;
+ acc->rqst.desc_tag = cpu_to_be32(FCNVME_LSDESC_RQST);
+ acc->rqst.desc_len =
+ fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst));
+ acc->rqst.w0.ls_cmd = rqst_ls_cmd;
+}
+
+static inline int
+nvme_fc_format_rjt(void *buf, u16 buflen, u8 ls_cmd,
+ u8 reason, u8 explanation, u8 vendor)
+{
+ struct fcnvme_ls_rjt *rjt = buf;
+
+ nvme_fc_format_rsp_hdr(buf, FCNVME_LSDESC_RQST,
+ fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_rjt)),
+ ls_cmd);
+ rjt->rjt.desc_tag = cpu_to_be32(FCNVME_LSDESC_RJT);
+ rjt->rjt.desc_len = fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rjt));
+ rjt->rjt.reason_code = reason;
+ rjt->rjt.reason_explanation = explanation;
+ rjt->rjt.vendor = vendor;
+
+ return sizeof(struct fcnvme_ls_rjt);
+}
+
+/* Validation Error indexes into the string table below */
+enum {
+ VERR_NO_ERROR = 0,
+ VERR_CR_ASSOC_LEN = 1,
+ VERR_CR_ASSOC_RQST_LEN = 2,
+ VERR_CR_ASSOC_CMD = 3,
+ VERR_CR_ASSOC_CMD_LEN = 4,
+ VERR_ERSP_RATIO = 5,
+ VERR_ASSOC_ALLOC_FAIL = 6,
+ VERR_QUEUE_ALLOC_FAIL = 7,
+ VERR_CR_CONN_LEN = 8,
+ VERR_CR_CONN_RQST_LEN = 9,
+ VERR_ASSOC_ID = 10,
+ VERR_ASSOC_ID_LEN = 11,
+ VERR_NO_ASSOC = 12,
+ VERR_CONN_ID = 13,
+ VERR_CONN_ID_LEN = 14,
+ VERR_INVAL_CONN = 15,
+ VERR_CR_CONN_CMD = 16,
+ VERR_CR_CONN_CMD_LEN = 17,
+ VERR_DISCONN_LEN = 18,
+ VERR_DISCONN_RQST_LEN = 19,
+ VERR_DISCONN_CMD = 20,
+ VERR_DISCONN_CMD_LEN = 21,
+ VERR_DISCONN_SCOPE = 22,
+ VERR_RS_LEN = 23,
+ VERR_RS_RQST_LEN = 24,
+ VERR_RS_CMD = 25,
+ VERR_RS_CMD_LEN = 26,
+ VERR_RS_RCTL = 27,
+ VERR_RS_RO = 28,
+ VERR_LSACC = 29,
+ VERR_LSDESC_RQST = 30,
+ VERR_LSDESC_RQST_LEN = 31,
+ VERR_CR_ASSOC = 32,
+ VERR_CR_ASSOC_ACC_LEN = 33,
+ VERR_CR_CONN = 34,
+ VERR_CR_CONN_ACC_LEN = 35,
+ VERR_DISCONN = 36,
+ VERR_DISCONN_ACC_LEN = 37,
+};
+
+static char *validation_errors[] = {
+ "OK",
+ "Bad CR_ASSOC Length",
+ "Bad CR_ASSOC Rqst Length",
+ "Not CR_ASSOC Cmd",
+ "Bad CR_ASSOC Cmd Length",
+ "Bad Ersp Ratio",
+ "Association Allocation Failed",
+ "Queue Allocation Failed",
+ "Bad CR_CONN Length",
+ "Bad CR_CONN Rqst Length",
+ "Not Association ID",
+ "Bad Association ID Length",
+ "No Association",
+ "Not Connection ID",
+ "Bad Connection ID Length",
+ "Invalid Connection ID",
+ "Not CR_CONN Cmd",
+ "Bad CR_CONN Cmd Length",
+ "Bad DISCONN Length",
+ "Bad DISCONN Rqst Length",
+ "Not DISCONN Cmd",
+ "Bad DISCONN Cmd Length",
+ "Bad Disconnect Scope",
+ "Bad RS Length",
+ "Bad RS Rqst Length",
+ "Not RS Cmd",
+ "Bad RS Cmd Length",
+ "Bad RS R_CTL",
+ "Bad RS Relative Offset",
+ "Not LS_ACC",
+ "Not LSDESC_RQST",
+ "Bad LSDESC_RQST Length",
+ "Not CR_ASSOC Rqst",
+ "Bad CR_ASSOC ACC Length",
+ "Not CR_CONN Rqst",
+ "Bad CR_CONN ACC Length",
+ "Not Disconnect Rqst",
+ "Bad Disconnect ACC Length",
+};
+
+#define NVME_FC_LAST_LS_CMD_VALUE FCNVME_LS_DISCONNECT_CONN
+
+static char *nvmefc_ls_names[] = {
+ "Reserved (0)",
+ "RJT (1)",
+ "ACC (2)",
+ "Create Association",
+ "Create Connection",
+ "Disconnect Association",
+ "Disconnect Connection",
+};
+
+static inline void
+nvmefc_fmt_lsreq_discon_assoc(struct nvmefc_ls_req *lsreq,
+ struct fcnvme_ls_disconnect_assoc_rqst *discon_rqst,
+ struct fcnvme_ls_disconnect_assoc_acc *discon_acc,
+ u64 association_id)
+{
+ lsreq->rqstaddr = discon_rqst;
+ lsreq->rqstlen = sizeof(*discon_rqst);
+ lsreq->rspaddr = discon_acc;
+ lsreq->rsplen = sizeof(*discon_acc);
+ lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC;
+
+ discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT_ASSOC;
+ discon_rqst->desc_list_len = cpu_to_be32(
+ sizeof(struct fcnvme_lsdesc_assoc_id) +
+ sizeof(struct fcnvme_lsdesc_disconn_cmd));
+
+ discon_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID);
+ discon_rqst->associd.desc_len =
+ fcnvme_lsdesc_len(
+ sizeof(struct fcnvme_lsdesc_assoc_id));
+
+ discon_rqst->associd.association_id = cpu_to_be64(association_id);
+
+ discon_rqst->discon_cmd.desc_tag = cpu_to_be32(
+ FCNVME_LSDESC_DISCONN_CMD);
+ discon_rqst->discon_cmd.desc_len =
+ fcnvme_lsdesc_len(
+ sizeof(struct fcnvme_lsdesc_disconn_cmd));
+}
+
+static inline int
+nvmefc_vldt_lsreq_discon_assoc(u32 rqstlen,
+ struct fcnvme_ls_disconnect_assoc_rqst *rqst)
+{
+ int ret = 0;
+
+ if (rqstlen < sizeof(struct fcnvme_ls_disconnect_assoc_rqst))
+ ret = VERR_DISCONN_LEN;
+ else if (rqst->desc_list_len !=
+ fcnvme_lsdesc_len(
+ sizeof(struct fcnvme_ls_disconnect_assoc_rqst)))
+ ret = VERR_DISCONN_RQST_LEN;
+ else if (rqst->associd.desc_tag != cpu_to_be32(FCNVME_LSDESC_ASSOC_ID))
+ ret = VERR_ASSOC_ID;
+ else if (rqst->associd.desc_len !=
+ fcnvme_lsdesc_len(
+ sizeof(struct fcnvme_lsdesc_assoc_id)))
+ ret = VERR_ASSOC_ID_LEN;
+ else if (rqst->discon_cmd.desc_tag !=
+ cpu_to_be32(FCNVME_LSDESC_DISCONN_CMD))
+ ret = VERR_DISCONN_CMD;
+ else if (rqst->discon_cmd.desc_len !=
+ fcnvme_lsdesc_len(
+ sizeof(struct fcnvme_lsdesc_disconn_cmd)))
+ ret = VERR_DISCONN_CMD_LEN;
+ /*
+ * As the standard changed on the LS, check if old format and scope
+ * something other than Association (e.g. 0).
+ */
+ else if (rqst->discon_cmd.rsvd8[0])
+ ret = VERR_DISCONN_SCOPE;
+
+ return ret;
+}
+
+#endif /* _NVME_FC_TRANSPORT_H */
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index ec46693f6b64..69608755d415 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -171,7 +171,7 @@ struct nvme_nvm_bb_tbl {
__le32 tdresv;
__le32 thresv;
__le32 rsvd2[8];
- __u8 blk[0];
+ __u8 blk[];
};
struct nvme_nvm_id20_addrf {
@@ -961,7 +961,10 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
geo = &dev->geo;
geo->csecs = 1 << ns->lba_shift;
geo->sos = ns->ms;
- geo->ext = ns->ext;
+ if (ns->features & NVME_NS_EXT_LBAS)
+ geo->ext = true;
+ else
+ geo->ext = false;
geo->mdts = ns->ctrl->max_hw_sectors;
dev->q = q;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 54603bd3e02d..da78e499947a 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -3,6 +3,7 @@
* Copyright (c) 2017-2018 Christoph Hellwig.
*/
+#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"
@@ -293,7 +294,7 @@ static bool nvme_available_path(struct nvme_ns_head *head)
static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
struct bio *bio)
{
- struct nvme_ns_head *head = q->queuedata;
+ struct nvme_ns_head *head = bio->bi_disk->private_data;
struct device *dev = disk_to_dev(head->disk);
struct nvme_ns *ns;
blk_qc_t ret = BLK_QC_T_NONE;
@@ -371,13 +372,12 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
* We also do this for private namespaces as the namespace sharing data could
* change after a rescan.
*/
- if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
+ if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
return 0;
q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node);
if (!q)
goto out;
- q->queuedata = head;
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
/* set to a default value for 512 until disk is validated */
blk_queue_logical_block_size(q, 512);
@@ -666,6 +666,13 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
nvme_mpath_set_live(ns);
mutex_unlock(&ns->head->lock);
}
+
+ if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
+ struct backing_dev_info *info =
+ ns->head->disk->queue->backing_dev_info;
+
+ info->capabilities |= BDI_CAP_STABLE_WRITES;
+ }
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@@ -687,7 +694,8 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
int error;
/* check if multipath is enabled and we have the capability */
- if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
+ if (!multipath || !ctrl->subsys ||
+ !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
return 0;
ctrl->anacap = id->anacap;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2e04a36296d9..c0f4226d3299 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,7 @@
#include <linux/fault-inject.h>
#include <linux/rcupdate.h>
#include <linux/wait.h>
+#include <linux/t10-pi.h>
#include <trace/events/block.h>
@@ -30,8 +31,10 @@ extern unsigned int admin_timeout;
#ifdef CONFIG_ARCH_NO_SG_CHAIN
#define NVME_INLINE_SG_CNT 0
+#define NVME_INLINE_METADATA_SG_CNT 0
#else
#define NVME_INLINE_SG_CNT 2
+#define NVME_INLINE_METADATA_SG_CNT 1
#endif
extern struct workqueue_struct *nvme_wq;
@@ -228,6 +231,7 @@ struct nvme_ctrl {
u32 page_size;
u32 max_hw_sectors;
u32 max_segments;
+ u32 max_integrity_segments;
u16 crdt[3];
u16 oncs;
u16 oacs;
@@ -352,6 +356,7 @@ struct nvme_ns_head {
struct nvme_ns_ids ids;
struct list_head entry;
struct kref ref;
+ bool shared;
int instance;
#ifdef CONFIG_NVME_MULTIPATH
struct gendisk *disk;
@@ -363,6 +368,11 @@ struct nvme_ns_head {
#endif
};
+enum nvme_ns_features {
+ NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */
+ NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
+};
+
struct nvme_ns {
struct list_head list;
@@ -382,18 +392,23 @@ struct nvme_ns {
u16 ms;
u16 sgs;
u32 sws;
- bool ext;
u8 pi_type;
+ unsigned long features;
unsigned long flags;
#define NVME_NS_REMOVING 0
#define NVME_NS_DEAD 1
#define NVME_NS_ANA_PENDING 2
- u16 noiob;
struct nvme_fault_inject fault_inject;
};
+/* NVMe ns supports metadata actions by the controller (generate/strip) */
+static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
+{
+ return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
+}
+
struct nvme_ctrl_ops {
const char *name;
struct module *module;
@@ -449,6 +464,14 @@ static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
return lba << (ns->lba_shift - SECTOR_SHIFT);
}
+/*
+ * Convert a byte length to NVMe's 0's-based number of dwords (NUMD)
+ */
+static inline u32 nvme_bytes_to_numd(size_t len)
+{
+ return (len >> 2) - 1;
+}
+
static inline void nvme_end_request(struct request *req, __le16 status,
union nvme_result result)
{
@@ -489,7 +512,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl);
-void nvme_put_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_identify(struct nvme_ctrl *ctrl);
void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
@@ -577,8 +599,7 @@ static inline void nvme_trace_bio_complete(struct request *req,
struct nvme_ns *ns = req->q->queuedata;
if (req->cmd_flags & REQ_NVME_MPATH)
- trace_block_bio_complete(ns->head->disk->queue,
- req->bio, status);
+ trace_block_bio_complete(ns->head->disk->queue, req->bio);
}
extern struct device_attribute dev_attr_ana_grpid;
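nvme_bytes_to_numd(), added above, centralizes the 0's-based dword-count conversion that NVMe NUMD fields expect; a quick worked example with an arbitrary length:

	u32 numd = nvme_bytes_to_numd(32);	/* (32 >> 2) - 1 == 7, i.e. eight dwords */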
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3726dc780d15..e2bacd369a88 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -68,14 +68,30 @@ static int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should be >= 2");
+static int io_queue_count_set(const char *val, const struct kernel_param *kp)
+{
+ unsigned int n;
+ int ret;
+
+ ret = kstrtouint(val, 10, &n);
+ if (ret != 0 || n > num_possible_cpus())
+ return -EINVAL;
+ return param_set_uint(val, kp);
+}
+
+static const struct kernel_param_ops io_queue_count_ops = {
+ .set = io_queue_count_set,
+ .get = param_get_uint,
+};
+
static unsigned int write_queues;
-module_param(write_queues, uint, 0644);
+module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
MODULE_PARM_DESC(write_queues,
"Number of queues to use for writes. If not set, reads and writes "
"will share a queue set.");
static unsigned int poll_queues;
-module_param(poll_queues, uint, 0644);
+module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
struct nvme_dev;
@@ -128,6 +144,9 @@ struct nvme_dev {
dma_addr_t host_mem_descs_dma;
struct nvme_host_mem_buf_desc *host_mem_descs;
void **host_mem_desc_bufs;
+ unsigned int nr_allocated_queues;
+ unsigned int nr_write_queues;
+ unsigned int nr_poll_queues;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -166,14 +185,13 @@ struct nvme_queue {
void *sq_cmds;
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
- volatile struct nvme_completion *cqes;
+ struct nvme_completion *cqes;
dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
u32 __iomem *q_db;
u16 q_depth;
u16 cq_vector;
u16 sq_tail;
- u16 last_sq_tail;
u16 cq_head;
u16 qid;
u8 cq_phase;
@@ -209,25 +227,14 @@ struct nvme_iod {
struct scatterlist *sg;
};
-static unsigned int max_io_queues(void)
+static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
{
- return num_possible_cpus() + write_queues + poll_queues;
-}
-
-static unsigned int max_queue_count(void)
-{
- /* IO queues + admin queue */
- return 1 + max_io_queues();
-}
-
-static inline unsigned int nvme_dbbuf_size(u32 stride)
-{
- return (max_queue_count() * 8 * stride);
+ return dev->nr_allocated_queues * 8 * dev->db_stride;
}
static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
- unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+ unsigned int mem_size = nvme_dbbuf_size(dev);
if (dev->dbbuf_dbs)
return 0;
@@ -252,7 +259,7 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
- unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+ unsigned int mem_size = nvme_dbbuf_size(dev);
if (dev->dbbuf_dbs) {
dma_free_coherent(dev->dev, mem_size,
@@ -446,24 +453,11 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
return 0;
}
-/*
- * Write sq tail if we are asked to, or if the next command would wrap.
- */
-static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
+static inline void nvme_write_sq_db(struct nvme_queue *nvmeq)
{
- if (!write_sq) {
- u16 next_tail = nvmeq->sq_tail + 1;
-
- if (next_tail == nvmeq->q_depth)
- next_tail = 0;
- if (next_tail != nvmeq->last_sq_tail)
- return;
- }
-
if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
writel(nvmeq->sq_tail, nvmeq->q_db);
- nvmeq->last_sq_tail = nvmeq->sq_tail;
}
/**
@@ -480,7 +474,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
cmd, sizeof(*cmd));
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
- nvme_write_sq_db(nvmeq, write_sq);
+ if (write_sq)
+ nvme_write_sq_db(nvmeq);
spin_unlock(&nvmeq->sq_lock);
}
@@ -489,8 +484,7 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
struct nvme_queue *nvmeq = hctx->driver_data;
spin_lock(&nvmeq->sq_lock);
- if (nvmeq->sq_tail != nvmeq->last_sq_tail)
- nvme_write_sq_db(nvmeq, true);
+ nvme_write_sq_db(nvmeq);
spin_unlock(&nvmeq->sq_lock);
}
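With last_sq_tail gone, the submission doorbell is written either directly from nvme_submit_cmd() when write_sq is set, or once per batch from nvme_commit_rqs(); a sketch of how blk-mq drives this, assuming queue_rq() keeps passing blk-mq's bd->last flag as write_sq:

	/*
	 * queue_rq(..., bd->last == false): nvme_submit_cmd(nvmeq, cmd, false)
	 *	-> command copied into the SQ, doorbell untouched
	 * queue_rq(..., bd->last == true):  nvme_submit_cmd(nvmeq, cmd, true)
	 *	-> nvme_write_sq_db() rings the doorbell for the whole batch
	 * ->commit_rqs():                   nvme_write_sq_db()
	 *	-> fallback when blk-mq ends a batch without a "last" request
	 */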
@@ -922,8 +916,9 @@ static void nvme_pci_complete_rq(struct request *req)
/* We read the CQE phase first to check if the rest of the entry is valid */
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
- return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
- nvmeq->cq_phase;
+ struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];
+
+ return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase;
}
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
@@ -944,7 +939,7 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
- volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
+ struct nvme_completion *cqe = &nvmeq->cqes[idx];
struct request *req;
if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
@@ -1382,16 +1377,19 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
/*
* Called only on a device that has been disabled and after all other threads
- * that can check this device's completion queues have synced. This is the
- * last chance for the driver to see a natural completion before
- * nvme_cancel_request() terminates all incomplete requests.
+ * that can check this device's completion queues have synced, except
+ * nvme_poll(). This is the last chance for the driver to see a natural
+ * completion before nvme_cancel_request() terminates all incomplete requests.
*/
static void nvme_reap_pending_cqes(struct nvme_dev *dev)
{
int i;
- for (i = dev->ctrl.queue_count - 1; i > 0; i--)
+ for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
+ spin_lock(&dev->queues[i].cq_poll_lock);
nvme_process_cq(&dev->queues[i]);
+ spin_unlock(&dev->queues[i].cq_poll_lock);
+ }
}
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
@@ -1498,7 +1496,6 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
struct nvme_dev *dev = nvmeq->dev;
nvmeq->sq_tail = 0;
- nvmeq->last_sq_tail = 0;
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
@@ -2000,7 +1997,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
{
struct nvme_dev *dev = affd->priv;
- unsigned int nr_read_queues;
+ unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
/*
	 * If there is no interrupt available for queues, ensure that
@@ -2016,12 +2013,12 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
if (!nrirqs) {
nrirqs = 1;
nr_read_queues = 0;
- } else if (nrirqs == 1 || !write_queues) {
+ } else if (nrirqs == 1 || !nr_write_queues) {
nr_read_queues = 0;
- } else if (write_queues >= nrirqs) {
+ } else if (nr_write_queues >= nrirqs) {
nr_read_queues = 1;
} else {
- nr_read_queues = nrirqs - write_queues;
+ nr_read_queues = nrirqs - nr_write_queues;
}
dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
@@ -2045,7 +2042,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
* Poll queues don't need interrupts, but we need at least one IO
* queue left over for non-polled IO.
*/
- this_p_queues = poll_queues;
+ this_p_queues = dev->nr_poll_queues;
if (this_p_queues >= nr_io_queues) {
this_p_queues = nr_io_queues - 1;
irq_queues = 1;
@@ -2075,14 +2072,25 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}
+static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
+{
+ return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+}
+
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
- int result, nr_io_queues;
+ unsigned int nr_io_queues;
unsigned long size;
+ int result;
- nr_io_queues = max_io_queues();
+ /*
+ * Sample the module parameters once at reset time so that we have
+ * stable values to work with.
+ */
+ dev->nr_write_queues = write_queues;
+ dev->nr_poll_queues = poll_queues;
/*
* If tags are shared with admin queue (Apple bug), then
@@ -2090,6 +2098,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
nr_io_queues = 1;
+ else
+ nr_io_queues = min(nvme_max_io_queues(dev),
+ dev->nr_allocated_queues - 1);
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
@@ -2562,6 +2573,12 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
+ /*
+ * We do not support an SGL for metadata (yet), so we are limited to a
+ * single integrity segment for the separate metadata pointer.
+ */
+ dev->ctrl.max_integrity_segments = 1;
+
result = nvme_init_identify(&dev->ctrl);
if (result)
goto out;
@@ -2764,8 +2781,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (!dev)
return -ENOMEM;
- dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
- GFP_KERNEL, node);
+ dev->nr_write_queues = write_queues;
+ dev->nr_poll_queues = poll_queues;
+ dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
+ dev->queues = kcalloc_node(dev->nr_allocated_queues,
+ sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues)
goto free;
@@ -2930,9 +2950,15 @@ static int nvme_suspend(struct device *dev)
* the PCI bus layer to put it into D3 in order to take the PCIe link
* down, so as to allow the platform to achieve its minimum low-power
* state (which may not be possible if the link is up).
+ *
+ * If a host memory buffer is enabled, shut down the device as the NVMe
+ * specification allows the device to access the host memory buffer in
+	 * host DRAM from all power states, but the host cannot serve those
+	 * accesses while it is suspended in S3.
*/
if (pm_suspend_via_firmware() || !ctrl->npss ||
!pcie_aspm_enabled(pdev) ||
+ ndev->nr_host_mem_descs ||
(ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
return nvme_disable_prepare_reset(ndev, true);
@@ -3128,8 +3154,6 @@ static int __init nvme_init(void)
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
- write_queues = min(write_queues, num_possible_cpus());
- poll_queues = min(poll_queues, num_possible_cpus());
return pci_register_driver(&nvme_driver);
}
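The queue bookkeeping now derives from the values sampled into the nvme_dev at probe/reset time: nvme_max_io_queues() bounds the I/O queue count, and nr_allocated_queues (I/O queues plus the admin queue) sizes both the queues array and the doorbell buffers. A worked example with made-up numbers:

	/*
	 * Hypothetical probe on a machine with 8 possible CPUs and module
	 * parameters write_queues=2, poll_queues=1:
	 *
	 *	nvme_max_io_queues()	= 8 + 2 + 1		= 11
	 *	nr_allocated_queues	= 11 + 1 (admin queue)	= 12
	 *	nvme_dbbuf_size()	= 12 * 8 * db_stride bytes
	 */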
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index cac8a930396a..f8f856dc0c67 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -34,6 +34,11 @@
#define NVME_RDMA_MAX_INLINE_SEGMENTS 4
+#define NVME_RDMA_DATA_SGL_SIZE \
+ (sizeof(struct scatterlist) * NVME_INLINE_SG_CNT)
+#define NVME_RDMA_METADATA_SGL_SIZE \
+ (sizeof(struct scatterlist) * NVME_INLINE_METADATA_SG_CNT)
+
struct nvme_rdma_device {
struct ib_device *dev;
struct ib_pd *pd;
@@ -48,6 +53,11 @@ struct nvme_rdma_qe {
u64 dma;
};
+struct nvme_rdma_sgl {
+ int nents;
+ struct sg_table sg_table;
+};
+
struct nvme_rdma_queue;
struct nvme_rdma_request {
struct nvme_request req;
@@ -58,12 +68,12 @@ struct nvme_rdma_request {
refcount_t ref;
struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
u32 num_sge;
- int nents;
struct ib_reg_wr reg_wr;
struct ib_cqe reg_cqe;
struct nvme_rdma_queue *queue;
- struct sg_table sg_table;
- struct scatterlist first_sgl[];
+ struct nvme_rdma_sgl data_sgl;
+ struct nvme_rdma_sgl *metadata_sgl;
+ bool use_sig_mr;
};
enum nvme_rdma_queue_flags {
@@ -85,6 +95,7 @@ struct nvme_rdma_queue {
struct rdma_cm_id *cm_id;
int cm_error;
struct completion cm_done;
+ bool pi_support;
};
struct nvme_rdma_ctrl {
@@ -261,6 +272,8 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
init_attr.qp_type = IB_QPT_RC;
init_attr.send_cq = queue->ib_cq;
init_attr.recv_cq = queue->ib_cq;
+ if (queue->pi_support)
+ init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
@@ -290,6 +303,12 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
if (!req->sqe.data)
return -ENOMEM;
+ /* metadata nvme_rdma_sgl struct is located after command's data SGL */
+ if (queue->pi_support)
+ req->metadata_sgl = (void *)nvme_req(rq) +
+ sizeof(struct nvme_rdma_request) +
+ NVME_RDMA_DATA_SGL_SIZE;
+
req->queue = queue;
return 0;
@@ -400,6 +419,8 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
dev = queue->device;
ibdev = dev->dev;
+ if (queue->pi_support)
+ ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs);
ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
/*
@@ -416,10 +437,16 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
nvme_rdma_dev_put(dev);
}
-static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
+static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
{
- return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
- ibdev->attrs.max_fast_reg_page_list_len - 1);
+ u32 max_page_list_len;
+
+ if (pi_support)
+ max_page_list_len = ibdev->attrs.max_pi_fast_reg_page_list_len;
+ else
+ max_page_list_len = ibdev->attrs.max_fast_reg_page_list_len;
+
+ return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
}
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
@@ -476,7 +503,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
* misaligned we'll end up using two entries for a single data page,
* so one additional entry is required.
*/
- pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1;
+ pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1;
ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
queue->queue_size,
IB_MR_TYPE_MEM_REG,
@@ -488,10 +515,24 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
goto out_destroy_ring;
}
+ if (queue->pi_support) {
+ ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs,
+ queue->queue_size, IB_MR_TYPE_INTEGRITY,
+ pages_per_mr, pages_per_mr);
+ if (ret) {
+ dev_err(queue->ctrl->ctrl.device,
+ "failed to initialize PI MR pool sized %d for QID %d\n",
+ queue->queue_size, idx);
+ goto out_destroy_mr_pool;
+ }
+ }
+
set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);
return 0;
+out_destroy_mr_pool:
+ ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
out_destroy_ring:
nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
sizeof(struct nvme_completion), DMA_FROM_DEVICE);
@@ -513,6 +554,10 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
queue = &ctrl->queues[idx];
queue->ctrl = ctrl;
+ if (idx && ctrl->ctrl.max_integrity_segments)
+ queue->pi_support = true;
+ else
+ queue->pi_support = false;
init_completion(&queue->cm_done);
if (idx > 0)
@@ -723,7 +768,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->reserved_tags = 2; /* connect + keep-alive */
set->numa_node = nctrl->numa_node;
set->cmd_size = sizeof(struct nvme_rdma_request) +
- NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
+ NVME_RDMA_DATA_SGL_SIZE;
set->driver_data = ctrl;
set->nr_hw_queues = 1;
set->timeout = ADMIN_TIMEOUT;
@@ -737,7 +782,10 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->cmd_size = sizeof(struct nvme_rdma_request) +
- NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
+ NVME_RDMA_DATA_SGL_SIZE;
+ if (nctrl->max_integrity_segments)
+ set->cmd_size += sizeof(struct nvme_rdma_sgl) +
+ NVME_RDMA_METADATA_SGL_SIZE;
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
@@ -770,6 +818,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
bool new)
{
+ bool pi_capable = false;
int error;
error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
@@ -779,7 +828,13 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
ctrl->device = ctrl->queues[0].device;
ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device);
- ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
+ /* T10-PI support */
+ if (ctrl->device->dev->attrs.device_cap_flags &
+ IB_DEVICE_INTEGRITY_HANDOVER)
+ pi_capable = true;
+
+ ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
+ pi_capable);
/*
* Bind the async event SQE DMA mapping to the admin queue lifetime.
@@ -821,6 +876,10 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
ctrl->ctrl.max_segments = ctrl->max_fr_pages;
ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
+ if (pi_capable)
+ ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages;
+ else
+ ctrl->ctrl.max_integrity_segments = 0;
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
@@ -1149,17 +1208,29 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_rdma_device *dev = queue->device;
struct ib_device *ibdev = dev->dev;
+ struct list_head *pool = &queue->qp->rdma_mrs;
if (!blk_rq_nr_phys_segments(rq))
return;
+ if (blk_integrity_rq(rq)) {
+ ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
+ req->metadata_sgl->nents, rq_dma_dir(rq));
+ sg_free_table_chained(&req->metadata_sgl->sg_table,
+ NVME_INLINE_METADATA_SG_CNT);
+ }
+
+ if (req->use_sig_mr)
+ pool = &queue->qp->sig_mrs;
+
if (req->mr) {
- ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
+ ib_mr_pool_put(queue->qp, pool, req->mr);
req->mr = NULL;
}
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
- sg_free_table_chained(&req->sg_table, NVME_INLINE_SG_CNT);
+ ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
+ rq_dma_dir(rq));
+ sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
}
static int nvme_rdma_set_sg_null(struct nvme_command *c)
@@ -1178,7 +1249,7 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
int count)
{
struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
- struct scatterlist *sgl = req->sg_table.sgl;
+ struct scatterlist *sgl = req->data_sgl.sg_table.sgl;
struct ib_sge *sge = &req->sge[1];
u32 len = 0;
int i;
@@ -1203,8 +1274,8 @@ static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
{
struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
- sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
- put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
+ sg->addr = cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl));
+ put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), sg->length);
put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
return 0;
@@ -1225,7 +1296,8 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
* Align the MR to a 4K page size to match the ctrl page size and
* the block virtual boundary.
*/
- nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
+ nr = ib_map_mr_sg(req->mr, req->data_sgl.sg_table.sgl, count, NULL,
+ SZ_4K);
if (unlikely(nr < count)) {
ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
req->mr = NULL;
@@ -1256,12 +1328,125 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
return 0;
}
+static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
+ struct nvme_command *cmd, struct ib_sig_domain *domain,
+ u16 control, u8 pi_type)
+{
+ domain->sig_type = IB_SIG_TYPE_T10_DIF;
+ domain->sig.dif.bg_type = IB_T10DIF_CRC;
+ domain->sig.dif.pi_interval = 1 << bi->interval_exp;
+ domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
+ if (control & NVME_RW_PRINFO_PRCHK_REF)
+ domain->sig.dif.ref_remap = true;
+
+ domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
+ domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
+ domain->sig.dif.app_escape = true;
+ if (pi_type == NVME_NS_DPS_PI_TYPE3)
+ domain->sig.dif.ref_escape = true;
+}
+
+static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
+ struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs,
+ u8 pi_type)
+{
+ u16 control = le16_to_cpu(cmd->rw.control);
+
+ memset(sig_attrs, 0, sizeof(*sig_attrs));
+ if (control & NVME_RW_PRINFO_PRACT) {
+ /* for WRITE_INSERT/READ_STRIP no memory domain */
+ sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
+ nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
+ pi_type);
+ /* Clear the PRACT bit since HCA will generate/verify the PI */
+ control &= ~NVME_RW_PRINFO_PRACT;
+ cmd->rw.control = cpu_to_le16(control);
+ } else {
+ /* for WRITE_PASS/READ_PASS both wire/memory domains exist */
+ nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
+ pi_type);
+ nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
+ pi_type);
+ }
+}
+
+static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 *mask)
+{
+ *mask = 0;
+ if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF)
+ *mask |= IB_SIG_CHECK_REFTAG;
+ if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD)
+ *mask |= IB_SIG_CHECK_GUARD;
+}
+
+static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ if (unlikely(wc->status != IB_WC_SUCCESS))
+ nvme_rdma_wr_error(cq, wc, "SIG");
+}
+
+static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
+ struct nvme_rdma_request *req, struct nvme_command *c,
+ int count, int pi_count)
+{
+ struct nvme_rdma_sgl *sgl = &req->data_sgl;
+ struct ib_reg_wr *wr = &req->reg_wr;
+ struct request *rq = blk_mq_rq_from_pdu(req);
+ struct nvme_ns *ns = rq->q->queuedata;
+ struct bio *bio = rq->bio;
+ struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
+ int nr;
+
+ req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs);
+ if (WARN_ON_ONCE(!req->mr))
+ return -EAGAIN;
+
+ nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, NULL,
+ req->metadata_sgl->sg_table.sgl, pi_count, NULL,
+ SZ_4K);
+ if (unlikely(nr))
+ goto mr_put;
+
+ nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c,
+ req->mr->sig_attrs, ns->pi_type);
+ nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
+
+ ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
+
+ req->reg_cqe.done = nvme_rdma_sig_done;
+ memset(wr, 0, sizeof(*wr));
+ wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
+ wr->wr.wr_cqe = &req->reg_cqe;
+ wr->wr.num_sge = 0;
+ wr->wr.send_flags = 0;
+ wr->mr = req->mr;
+ wr->key = req->mr->rkey;
+ wr->access = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
+
+ sg->addr = cpu_to_le64(req->mr->iova);
+ put_unaligned_le24(req->mr->length, sg->length);
+ put_unaligned_le32(req->mr->rkey, sg->key);
+ sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
+
+ return 0;
+
+mr_put:
+ ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr);
+ req->mr = NULL;
+ if (nr < 0)
+ return nr;
+ return -EINVAL;
+}
+
static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
struct request *rq, struct nvme_command *c)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_rdma_device *dev = queue->device;
struct ib_device *ibdev = dev->dev;
+ int pi_count = 0;
int count, ret;
req->num_sge = 1;
@@ -1272,22 +1457,52 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
if (!blk_rq_nr_phys_segments(rq))
return nvme_rdma_set_sg_null(c);
- req->sg_table.sgl = req->first_sgl;
- ret = sg_alloc_table_chained(&req->sg_table,
- blk_rq_nr_phys_segments(rq), req->sg_table.sgl,
+ req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1);
+ ret = sg_alloc_table_chained(&req->data_sgl.sg_table,
+ blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl,
NVME_INLINE_SG_CNT);
if (ret)
return -ENOMEM;
- req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
+ req->data_sgl.nents = blk_rq_map_sg(rq->q, rq,
+ req->data_sgl.sg_table.sgl);
- count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
- rq_dma_dir(rq));
+ count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl,
+ req->data_sgl.nents, rq_dma_dir(rq));
if (unlikely(count <= 0)) {
ret = -EIO;
goto out_free_table;
}
+ if (blk_integrity_rq(rq)) {
+ req->metadata_sgl->sg_table.sgl =
+ (struct scatterlist *)(req->metadata_sgl + 1);
+ ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table,
+ blk_rq_count_integrity_sg(rq->q, rq->bio),
+ req->metadata_sgl->sg_table.sgl,
+ NVME_INLINE_METADATA_SG_CNT);
+ if (unlikely(ret)) {
+ ret = -ENOMEM;
+ goto out_unmap_sg;
+ }
+
+ req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q,
+ rq->bio, req->metadata_sgl->sg_table.sgl);
+ pi_count = ib_dma_map_sg(ibdev,
+ req->metadata_sgl->sg_table.sgl,
+ req->metadata_sgl->nents,
+ rq_dma_dir(rq));
+ if (unlikely(pi_count <= 0)) {
+ ret = -EIO;
+ goto out_free_pi_table;
+ }
+ }
+
+ if (req->use_sig_mr) {
+ ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count);
+ goto out;
+ }
+
if (count <= dev->num_inline_segments) {
if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
queue->ctrl->use_inline_data &&
@@ -1306,14 +1521,23 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
ret = nvme_rdma_map_sg_fr(queue, req, c, count);
out:
if (unlikely(ret))
- goto out_unmap_sg;
+ goto out_unmap_pi_sg;
return 0;
+out_unmap_pi_sg:
+ if (blk_integrity_rq(rq))
+ ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
+ req->metadata_sgl->nents, rq_dma_dir(rq));
+out_free_pi_table:
+ if (blk_integrity_rq(rq))
+ sg_free_table_chained(&req->metadata_sgl->sg_table,
+ NVME_INLINE_METADATA_SG_CNT);
out_unmap_sg:
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
+ ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
+ rq_dma_dir(rq));
out_free_table:
- sg_free_table_chained(&req->sg_table, NVME_INLINE_SG_CNT);
+ sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
return ret;
}
@@ -1761,6 +1985,15 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(rq);
+ if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
+ queue->pi_support &&
+ (c->common.opcode == nvme_cmd_write ||
+ c->common.opcode == nvme_cmd_read) &&
+ nvme_ns_has_pi(ns))
+ req->use_sig_mr = true;
+ else
+ req->use_sig_mr = false;
+
err = nvme_rdma_map_data(queue, rq, c);
if (unlikely(err < 0)) {
dev_err(queue->ctrl->ctrl.device,
@@ -1801,12 +2034,46 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
return ib_process_cq_direct(queue->ib_cq, -1);
}
+static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req)
+{
+ struct request *rq = blk_mq_rq_from_pdu(req);
+ struct ib_mr_status mr_status;
+ int ret;
+
+ ret = ib_check_mr_status(req->mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
+ if (ret) {
+ pr_err("ib_check_mr_status failed, ret %d\n", ret);
+ nvme_req(rq)->status = NVME_SC_INVALID_PI;
+ return;
+ }
+
+ if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
+ switch (mr_status.sig_err.err_type) {
+ case IB_SIG_BAD_GUARD:
+ nvme_req(rq)->status = NVME_SC_GUARD_CHECK;
+ break;
+ case IB_SIG_BAD_REFTAG:
+ nvme_req(rq)->status = NVME_SC_REFTAG_CHECK;
+ break;
+ case IB_SIG_BAD_APPTAG:
+ nvme_req(rq)->status = NVME_SC_APPTAG_CHECK;
+ break;
+ }
+ pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n",
+ mr_status.sig_err.err_type, mr_status.sig_err.expected,
+ mr_status.sig_err.actual);
+ }
+}
+
static void nvme_rdma_complete_rq(struct request *rq)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_rdma_queue *queue = req->queue;
struct ib_device *ibdev = queue->device->dev;
+ if (req->use_sig_mr)
+ nvme_rdma_check_pi_status(req);
+
nvme_rdma_unmap_data(queue, rq);
ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
DMA_TO_DEVICE);
@@ -1926,7 +2193,7 @@ out_fail:
static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.name = "rdma",
.module = THIS_MODULE,
- .flags = NVME_F_FABRICS,
+ .flags = NVME_F_FABRICS | NVME_F_METADATA_SUPPORTED,
.reg_read32 = nvmf_reg_read32,
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
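Piecing together nvme_rdma_alloc_tagset(), nvme_rdma_init_request() and nvme_rdma_map_data(), the per-request PDU for a PI-capable queue is laid out roughly as follows (the metadata parts exist only when the controller reports max_integrity_segments):

	/*
	 * struct nvme_rdma_request                          req
	 * struct scatterlist[NVME_INLINE_SG_CNT]            data_sgl.sg_table.sgl = (req + 1)
	 * struct nvme_rdma_sgl                              req->metadata_sgl
	 * struct scatterlist[NVME_INLINE_METADATA_SG_CNT]   metadata sg_table.sgl = (req->metadata_sgl + 1)
	 */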
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c15a92163c1f..3345ec7efaff 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -60,6 +60,7 @@ struct nvme_tcp_request {
enum nvme_tcp_queue_flags {
NVME_TCP_Q_ALLOCATED = 0,
NVME_TCP_Q_LIVE = 1,
+ NVME_TCP_Q_POLLING = 2,
};
enum nvme_tcp_recv_state {
@@ -75,6 +76,7 @@ struct nvme_tcp_queue {
int io_cpu;
spinlock_t lock;
+ struct mutex send_mutex;
struct list_head send_list;
/* recv state */
@@ -129,8 +131,9 @@ struct nvme_tcp_ctrl {
static LIST_HEAD(nvme_tcp_ctrl_list);
static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
static struct workqueue_struct *nvme_tcp_wq;
-static struct blk_mq_ops nvme_tcp_mq_ops;
-static struct blk_mq_ops nvme_tcp_admin_mq_ops;
+static const struct blk_mq_ops nvme_tcp_mq_ops;
+static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
+static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
{
@@ -257,15 +260,29 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
}
}
-static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
+static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
+ bool sync)
{
struct nvme_tcp_queue *queue = req->queue;
+ bool empty;
spin_lock(&queue->lock);
+ empty = list_empty(&queue->send_list) && !queue->request;
list_add_tail(&req->entry, &queue->send_list);
spin_unlock(&queue->lock);
- queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ /*
+	 * If we're the first on the send_list, try to send directly;
+	 * otherwise queue io_work. Only send directly when running on
+	 * the queue's io_cpu, so we don't introduce contention.
+ */
+ if (queue->io_cpu == smp_processor_id() &&
+ sync && empty && mutex_trylock(&queue->send_mutex)) {
+ nvme_tcp_try_send(queue);
+ mutex_unlock(&queue->send_mutex);
+ } else {
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ }
}
static inline struct nvme_tcp_request *
@@ -578,7 +595,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
req->state = NVME_TCP_SEND_H2C_PDU;
req->offset = 0;
- nvme_tcp_queue_request(req);
+ nvme_tcp_queue_request(req, false);
return 0;
}
@@ -794,11 +811,12 @@ static void nvme_tcp_data_ready(struct sock *sk)
{
struct nvme_tcp_queue *queue;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
- if (likely(queue && queue->rd_enabled))
+ if (likely(queue && queue->rd_enabled) &&
+ !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void nvme_tcp_write_space(struct sock *sk)
@@ -867,7 +885,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
if (last && !queue->data_digest)
flags |= MSG_EOR;
else
- flags |= MSG_MORE;
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
/* can't zcopy slab pages */
if (unlikely(PageSlab(page))) {
@@ -906,11 +924,16 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
bool inline_data = nvme_tcp_has_inline_data(req);
- int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) + hdgst - req->offset;
+ int flags = MSG_DONTWAIT;
int ret;
+ if (inline_data)
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ else
+ flags |= MSG_EOR;
+
if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
@@ -949,7 +972,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
offset_in_page(pdu) + req->offset, len,
- MSG_DONTWAIT | MSG_MORE);
+ MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
if (unlikely(ret <= 0))
return ret;
@@ -1063,11 +1086,14 @@ static void nvme_tcp_io_work(struct work_struct *w)
bool pending = false;
int result;
- result = nvme_tcp_try_send(queue);
- if (result > 0)
- pending = true;
- else if (unlikely(result < 0))
- break;
+ if (mutex_trylock(&queue->send_mutex)) {
+ result = nvme_tcp_try_send(queue);
+ mutex_unlock(&queue->send_mutex);
+ if (result > 0)
+ pending = true;
+ else if (unlikely(result < 0))
+ break;
+ }
result = nvme_tcp_try_recv(queue);
if (result > 0)
@@ -1313,12 +1339,12 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
- struct linger sol = { .l_onoff = 1, .l_linger = 0 };
- int ret, opt, rcv_pdu_size;
+ int ret, rcv_pdu_size;
queue->ctrl = ctrl;
INIT_LIST_HEAD(&queue->send_list);
spin_lock_init(&queue->lock);
+ mutex_init(&queue->send_mutex);
INIT_WORK(&queue->io_work, nvme_tcp_io_work);
queue->queue_size = queue_size;
@@ -1337,60 +1363,24 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
}
/* Single syn retry */
- opt = 1;
- ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
- (char *)&opt, sizeof(opt));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set TCP_SYNCNT sock opt %d\n", ret);
- goto err_sock;
- }
+ tcp_sock_set_syncnt(queue->sock->sk, 1);
/* Set TCP no delay */
- opt = 1;
- ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
- TCP_NODELAY, (char *)&opt, sizeof(opt));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set TCP_NODELAY sock opt %d\n", ret);
- goto err_sock;
- }
+ tcp_sock_set_nodelay(queue->sock->sk);
/*
* Cleanup whatever is sitting in the TCP transmit queue on socket
* close. This is done to prevent stale data from being sent should
* the network connection be restored before TCP times out.
*/
- ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
- (char *)&sol, sizeof(sol));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set SO_LINGER sock opt %d\n", ret);
- goto err_sock;
- }
+ sock_no_linger(queue->sock->sk);
- if (so_priority > 0) {
- ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY,
- (char *)&so_priority, sizeof(so_priority));
- if (ret) {
- dev_err(ctrl->ctrl.device,
- "failed to set SO_PRIORITY sock opt, ret %d\n",
- ret);
- goto err_sock;
- }
- }
+ if (so_priority > 0)
+ sock_set_priority(queue->sock->sk, so_priority);
/* Set socket type of service */
- if (nctrl->opts->tos >= 0) {
- opt = nctrl->opts->tos;
- ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
- (char *)&opt, sizeof(opt));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set IP_TOS sock opt %d\n", ret);
- goto err_sock;
- }
- }
+ if (nctrl->opts->tos >= 0)
+ ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
queue->sock->sk->sk_allocation = GFP_ATOMIC;
nvme_tcp_set_queue_io_cpu(queue);
@@ -1543,6 +1533,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
set->numa_node = NUMA_NO_NODE;
+ set->flags = BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = 1;
@@ -1554,7 +1545,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = 1; /* fabric connect */
set->numa_node = NUMA_NO_NODE;
- set->flags = BLK_MQ_F_SHOULD_MERGE;
+ set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
@@ -2113,7 +2104,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
- nvme_tcp_queue_request(&ctrl->async_req);
+ nvme_tcp_queue_request(&ctrl->async_req, true);
}
static enum blk_eh_timer_return
@@ -2244,7 +2235,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(rq);
- nvme_tcp_queue_request(req);
+ nvme_tcp_queue_request(req, true);
return BLK_STS_OK;
}
@@ -2302,13 +2293,15 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
return 0;
+ set_bit(NVME_TCP_Q_POLLING, &queue->flags);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
nvme_tcp_try_recv(queue);
+ clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
return queue->nr_cqe;
}
-static struct blk_mq_ops nvme_tcp_mq_ops = {
+static const struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.complete = nvme_complete_rq,
.init_request = nvme_tcp_init_request,
@@ -2319,7 +2312,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = {
.poll = nvme_tcp_poll,
};
-static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
+static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.complete = nvme_complete_rq,
.init_request = nvme_tcp_init_request,