author     Linus Torvalds <torvalds@linux-foundation.org>  2024-03-15 23:21:13 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-03-15 23:21:13 +0300
commit     4138f02288333cb596885e9af03dd3ea2de845cb (patch)
tree       96567ff0801b022508baa3d42127f721126421e9 /drivers/vfio/pci/mlx5/cmd.c
parent     4f712ee0cbbd5c777d270427092bb301fc31044f (diff)
parent     7447d911af699a15f8d050dfcb7c680a86f87012 (diff)
download   linux-4138f02288333cb596885e9af03dd3ea2de845cb.tar.xz
Merge tag 'vfio-v6.9-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:

 - Add a warning in the unlikely case that a device is not captured with
   driver_override (Kunwu Chan)

 - Error handling improvements in mlx5-vfio-pci to detect firmware
   tracking object error states, log the firmware error syndrome, and
   release firmware resources in an aborted migration sequence (Yishai
   Hadas)

 - Correct an un-alphabetized VFIO MAINTAINERS entry (Alex Williamson)

 - Make mdev_bus_type const and also make the class struct const for a
   couple of the vfio-mdev sample drivers (Ricardo B. Marliere)

 - Add a new vfio-pci variant driver for the GPU of NVIDIA's
   Grace-Hopper superchip. During initialization of the chip-to-chip
   interconnect in this hardware module, the PCI BARs of the device
   become unused in favor of a faster, coherent mechanism for exposing
   device memory. This driver primarily changes the VFIO representation
   of the device, masquerading this coherent aperture in place of the
   physical PCI BARs for userspace drivers. It also uses a new vma flag
   allowing KVM to map uncached device memory with write-combining
   attributes (Ankit Agrawal)

 - Reset fixes and cleanups for the pds-vfio-pci driver. Save and
   restore files were previously leaked if the device didn't pass
   through an error state; this is resolved, and later re-fixed to
   prevent access to the now-freed files. Reset handling is also
   refactored to remove the complicated deferred reset mechanism (Brett
   Creeley)

 - Remove some references to pl330 in the vfio-platform amba driver
   (Geert Uytterhoeven)

 - Remove twice-redundant and ugly code to unpin incidental pins of the
   zero-page (Alex Williamson)

 - Remove the deferred reset logic from the hisi-acc-vfio-pci driver as
   well, as a simplification (Shameer Kolothum)

 - Enforce that mlx5-vfio-pci devices must support PRE_COPY and remove
   the resulting unnecessary code; no publicly released device firmware
   lacks this support (Yishai Hadas)

 - Switch vfio-platform over to the .remove_new callback in support of
   the broader transition to a void remove function (Uwe Kleine-König)

 - Resolve multiple issues in interrupt code for VFIO bus drivers that
   allowed calling eventfd_signal() on a NULL context. This also removes
   a potential race in INTx setup on certain hardware for vfio-pci,
   fixes races with various mechanisms to mask INTx, and fixes leaked
   virqfds in vfio-platform (Alex Williamson); see the sketch of the
   guard pattern after the commit list below

* tag 'vfio-v6.9-rc1' of https://github.com/awilliam/linux-vfio: (29 commits)
  vfio/fsl-mc: Block calling interrupt handler without trigger
  vfio/platform: Create persistent IRQ handlers
  vfio/platform: Disable virqfds on cleanup
  vfio/pci: Create persistent INTx handler
  vfio: Introduce interface to flush virqfd inject workqueue
  vfio/pci: Lock external INTx masking ops
  vfio/pci: Disable auto-enable of exclusive INTx IRQ
  vfio/pds: Refactor/simplify reset logic
  vfio/pds: Make sure migration file isn't accessed after reset
  vfio/platform: Convert to platform remove callback returning void
  vfio/mlx5: Enforce PRE_COPY support
  vfio/mbochs: make mbochs_class constant
  vfio/mdpy: make mdpy_class constant
  hisi_acc_vfio_pci: Remove the deferred_reset logic
  Revert "vfio/type1: Unpin zero pages"
  vfio/nvgrace-gpu: Convey kvm to map device memory region as noncached
  vfio: amba: Rename pl330_ids[] to vfio_amba_ids[]
  vfio/pds: Always clear the save/restore FDs on reset
  vfio/nvgrace-gpu: Add vfio pci variant module for grace hopper
  vfio/pci: rename and export range_intersect_range
  ...
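The interrupt fixes listed above share one failure mode: an IRQ handler or virqfd signalling its trigger eventfd while that context is NULL, either before setup completes or after teardown. A minimal sketch of the guard pattern, with hypothetical names and assuming the single-argument eventfd_signal() of v6.8+; the in-tree handlers additionally rely on persistent handlers and their own locking:

	#include <linux/eventfd.h>
	#include <linux/interrupt.h>

	struct my_irq_ctx {			/* hypothetical per-IRQ state */
		struct eventfd_ctx *trigger;	/* may legitimately be NULL */
	};

	static irqreturn_t my_irq_handler(int irq, void *arg)
	{
		struct my_irq_ctx *ctx = arg;
		struct eventfd_ctx *trigger = READ_ONCE(ctx->trigger);

		/* Never signal a context that isn't, or is no longer, there */
		if (likely(trigger))
			eventfd_signal(trigger);
		return IRQ_HANDLED;
	}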
Diffstat (limited to 'drivers/vfio/pci/mlx5/cmd.c')
-rw-r--r--  drivers/vfio/pci/mlx5/cmd.c | 157 +++++++++++++++++++++++++++++++++++++-------
1 file changed, 132 insertions(+), 25 deletions(-)
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index efd1d252cdc9..41a4b0cf4297 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -108,8 +108,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
if (ret)
return ret;
- if (mvdev->saving_migf->state ==
- MLX5_MIGF_STATE_PRE_COPY_ERROR) {
+ /* Upon cleanup, ignore previous pre_copy error state */
+ if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR &&
+ !(query_flags & MLX5VF_QUERY_CLEANUP)) {
/*
* In case we had a PRE_COPY error, only query full
* image for final image
@@ -121,6 +122,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
}
query_flags &= ~MLX5VF_QUERY_INC;
}
+ /* Block incremental query which is state-dependent */
+ if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) {
+ complete(&mvdev->saving_migf->save_comp);
+ return -ENODEV;
+ }
}
MLX5_SET(query_vhca_migration_state_in, in, opcode,
@@ -149,6 +155,12 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
return 0;
}
+static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev)
+{
+ mvdev->tracker.object_changed = true;
+ complete(&mvdev->tracker_comp);
+}
+
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
/* Mark the tracker under an error and wake it up if it's running */
@@ -189,7 +201,7 @@ void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
/* Must be done outside the lock to let it progress */
set_tracker_error(mvdev);
mutex_lock(&mvdev->state_mutex);
- mlx5vf_disable_fds(mvdev);
+ mlx5vf_disable_fds(mvdev, NULL);
_mlx5vf_free_page_tracker_resources(mvdev);
mlx5vf_state_mutex_unlock(mvdev);
}
@@ -221,6 +233,10 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
if (!MLX5_CAP_GEN(mvdev->mdev, migration))
goto end;
+ if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+ MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)))
+ goto end;
+
mvdev->vf_id = pci_iov_vf_id(pdev);
if (mvdev->vf_id < 0)
goto end;
@@ -250,17 +266,14 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
mvdev->migrate_cap = 1;
mvdev->core_device.vdev.migration_flags =
VFIO_MIGRATION_STOP_COPY |
- VFIO_MIGRATION_P2P;
+ VFIO_MIGRATION_P2P |
+ VFIO_MIGRATION_PRE_COPY;
+
mvdev->core_device.vdev.mig_ops = mig_ops;
init_completion(&mvdev->tracker_comp);
if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
mvdev->core_device.vdev.log_ops = log_ops;
- if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
- MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
- mvdev->core_device.vdev.migration_flags |=
- VFIO_MIGRATION_PRE_COPY;
-
if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
mvdev->chunk_mode = 1;
@@ -402,6 +415,50 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
kfree(buf);
}
+static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+ unsigned int npages)
+{
+ unsigned int to_alloc = npages;
+ struct page **page_list;
+ unsigned long filled;
+ unsigned int to_fill;
+ int ret;
+
+ to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
+ page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
+ if (!page_list)
+ return -ENOMEM;
+
+ do {
+ filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
+ page_list);
+ if (!filled) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ to_alloc -= filled;
+ ret = sg_alloc_append_table_from_pages(
+ &buf->table, page_list, filled, 0,
+ filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
+ GFP_KERNEL_ACCOUNT);
+
+ if (ret)
+ goto err;
+ buf->allocated_length += filled * PAGE_SIZE;
+ /* clean input for another bulk allocation */
+ memset(page_list, 0, filled * sizeof(*page_list));
+ to_fill = min_t(unsigned int, to_alloc,
+ PAGE_SIZE / sizeof(*page_list));
+ } while (to_alloc > 0);
+
+ kvfree(page_list);
+ return 0;
+
+err:
+ kvfree(page_list);
+ return ret;
+}
+
struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
size_t length,
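The hunk above is mostly code movement, but the idiom is worth isolating: allocate pages in bulk batches no larger than one page worth of pointers, append each batch to a growing scatter-gather table, and reuse the pointer array. A reduced sketch with hypothetical names; as in the original, pages of a batch that was allocated but not yet appended are left for the caller's cleanup path:

	#include <linux/gfp.h>
	#include <linux/mm.h>
	#include <linux/scatterlist.h>

	static int grow_table_by_pages(struct sg_append_table *table,
				       unsigned int npages)
	{
		unsigned int batch = min_t(unsigned int, npages,
					   PAGE_SIZE / sizeof(struct page *));
		struct page **list;
		int ret = 0;

		list = kvcalloc(batch, sizeof(*list), GFP_KERNEL_ACCOUNT);
		if (!list)
			return -ENOMEM;

		while (npages) {
			unsigned long filled;

			filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
							batch, list);
			if (!filled) {
				ret = -ENOMEM;
				break;
			}
			ret = sg_alloc_append_table_from_pages(table, list,
					filled, 0, filled << PAGE_SHIFT,
					UINT_MAX, SG_MAX_SINGLE_ALLOC,
					GFP_KERNEL_ACCOUNT);
			if (ret)
				break;	/* appended pages now belong to @table */
			npages -= filled;
			/* the bulk allocator only fills NULL slots */
			memset(list, 0, filled * sizeof(*list));
			batch = min_t(unsigned int, npages, batch);
		}
		kvfree(list);
		return ret;
	}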
@@ -608,8 +665,13 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
err:
/* The error flow can't run from an interrupt context */
- if (status == -EREMOTEIO)
+ if (status == -EREMOTEIO) {
status = MLX5_GET(save_vhca_state_out, async_data->out, status);
+ /* Failed in FW, print cmd out failure details */
+ mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0,
+ async_data->out);
+ }
+
async_data->status = status;
queue_work(migf->mvdev->cb_wq, &async_data->work);
}
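An -EREMOTEIO return from the mlx5 command interface means the command reached firmware and was rejected there, so the errno alone carries no detail; the real status and syndrome sit in the output mailbox, which mlx5_cmd_out_err() logs. A sketch of that decode in an async completion, with a hypothetical wrapper struct and assuming the mlx5_ifc definitions are in scope:

	#include <linux/mlx5/driver.h>
	#include <linux/workqueue.h>

	struct my_async_data {			/* hypothetical */
		struct mlx5_core_dev *mdev;
		struct mlx5_async_work cb_work;
		struct work_struct work;
		struct workqueue_struct *wq;
		void *out;
		int status;
	};

	static void my_save_callback(int status, struct mlx5_async_work *context)
	{
		struct my_async_data *data =
			container_of(context, struct my_async_data, cb_work);

		if (status == -EREMOTEIO) {
			/* FW-level failure: mailbox holds the real status */
			status = MLX5_GET(save_vhca_state_out, data->out, status);
			mlx5_cmd_out_err(data->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE,
					 0, data->out);
		}
		data->status = status;
		/* the error flow can't run from interrupt context, defer it */
		queue_work(data->wq, &data->work);
	}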
@@ -623,6 +685,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
struct mlx5_vhca_data_buffer *header_buf = NULL;
struct mlx5vf_async_data *async_data;
+ bool pre_copy_cleanup = false;
int err;
lockdep_assert_held(&mvdev->state_mutex);
@@ -633,6 +696,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
if (err)
return err;
+ if ((migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc)
+ pre_copy_cleanup = true;
+
if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
/*
* In case we had a PRE_COPY error, SAVE is triggered only for
@@ -651,29 +718,27 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
async_data = &migf->async_data;
async_data->buf = buf;
- async_data->stop_copy_chunk = !track;
+ async_data->stop_copy_chunk = (!track && !pre_copy_cleanup);
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
goto err_out;
}
- if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
- if (async_data->stop_copy_chunk) {
- u8 header_idx = buf->stop_copy_chunk_num ?
- buf->stop_copy_chunk_num - 1 : 0;
+ if (async_data->stop_copy_chunk) {
+ u8 header_idx = buf->stop_copy_chunk_num ?
+ buf->stop_copy_chunk_num - 1 : 0;
- header_buf = migf->buf_header[header_idx];
- migf->buf_header[header_idx] = NULL;
- }
+ header_buf = migf->buf_header[header_idx];
+ migf->buf_header[header_idx] = NULL;
+ }
- if (!header_buf) {
- header_buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
- if (IS_ERR(header_buf)) {
- err = PTR_ERR(header_buf);
- goto err_free;
- }
+ if (!header_buf) {
+ header_buf = mlx5vf_get_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(header_buf)) {
+ err = PTR_ERR(header_buf);
+ goto err_free;
}
}
@@ -900,6 +965,29 @@ static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}
+static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_page_tracker *tracker)
+{
+ u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+ void *obj_context;
+ void *cmd_hdr;
+ int err;
+
+ cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id);
+
+ err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+ if (err)
+ return err;
+
+ obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context);
+ tracker->status = MLX5_GET(page_track, obj_context, state);
+ return 0;
+}
+
static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
struct mlx5_vhca_cq_buf *buf, int nent,
int cqe_size)
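mlx5vf_cmd_query_tracker() follows the mlx5 general-object convention: every object type shares one command header carrying opcode, object type, and object id, which is why the helper can borrow the modify_page_track_obj_in layout just to address that header. The bare addressing pattern is sketched below with a hypothetical tracker_id; a real query must size `out` for the object context, as above:

	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	int err;

	MLX5_SET(general_obj_in_cmd_hdr, in, opcode,
		 MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));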
@@ -957,9 +1045,11 @@ static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
struct mlx5vf_pci_core_device *mvdev = container_of(
tracker, struct mlx5vf_pci_core_device, tracker);
+ struct mlx5_eqe_obj_change *object;
struct mlx5_eqe *eqe = data;
u8 event_type = (u8)type;
u8 queue_type;
+ u32 obj_id;
int qp_num;
switch (event_type) {
@@ -975,6 +1065,12 @@ static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
break;
set_tracker_error(mvdev);
break;
+ case MLX5_EVENT_TYPE_OBJECT_CHANGE:
+ object = &eqe->data.obj_change;
+ obj_id = be32_to_cpu(object->obj_id);
+ if (obj_id == tracker->id)
+ set_tracker_change_event(mvdev);
+ break;
default:
break;
}
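The new case only fires because the tracker's notifier block is subscribed to mlx5 async events and the handler filters by event type itself. A registration sketch using the in-tree mlx5_nb helpers; the handler name is hypothetical, and registering for NOTIFY_ANY (a catch-all) is what makes the switch on event_type above necessary:

	struct mlx5_nb nb;	/* typically embedded in the tracker */

	MLX5_NB_INIT(&nb, my_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &nb);
	/* ... on teardown ... */
	mlx5_eq_notifier_unregister(mdev, &nb);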
@@ -1634,6 +1730,11 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
goto end;
}
+ if (tracker->is_err) {
+ err = -EIO;
+ goto end;
+ }
+
mdev = mvdev->mdev;
err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
MLX5_PAGE_TRACK_STATE_REPORTING);
@@ -1652,6 +1753,12 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
dirty, &tracker->status);
if (poll_err == CQ_EMPTY) {
wait_for_completion(&mvdev->tracker_comp);
+ if (tracker->object_changed) {
+ tracker->object_changed = false;
+ err = mlx5vf_cmd_query_tracker(mdev, tracker);
+ if (err)
+ goto end;
+ }
continue;
}
}
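Taken together, the last two hunks give tracker_comp two wakeup sources: a CQE signalling that report data is ready, and an object-change event signalling that firmware moved the tracker object, typically into an error state. A control-flow sketch with hypothetical helpers, since the surrounding loop is elided from the diff:

	for (;;) {
		if (poll_dirty_cq(&cq) == CQ_EMPTY) {
			/* completed by a CQE or by set_tracker_change_event() */
			wait_for_completion(&tracker_comp);
			if (tracker.object_changed) {
				tracker.object_changed = false;
				err = mlx5vf_cmd_query_tracker(mdev, &tracker);
				if (err)
					break;
				/* an error status can then mark tracker.is_err,
				 * which the entry check above turns into -EIO */
			}
			continue;
		}
		/* ... consume CQEs and report dirty bits to VFIO ... */
	}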