From a39a1466dae5e3ed0dd7c03334a60894e4d1f334 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 13 Oct 2022 14:46:36 -0700 Subject: MAINTAINERS: git://github -> https://github.com for awilliam MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GitHub deprecated the git:// links about a year ago, so let's move to the https:// URLs instead. Reported-by: Conor Dooley Link: https://github.blog/2021-09-01-improving-git-protocol-security-github/ Signed-off-by: Palmer Dabbelt Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20221013214636.30721-1-palmer@rivosinc.com Signed-off-by: Alex Williamson --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 046ff06ff97f..daa6a7a755ec 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21539,7 +21539,7 @@ M: Alex Williamson R: Cornelia Huck L: kvm@vger.kernel.org S: Maintained -T: git git://github.com/awilliam/linux-vfio.git +T: git https://github.com/awilliam/linux-vfio.git F: Documentation/ABI/testing/sysfs-devices-vfio-dev F: Documentation/driver-api/vfio.rst F: drivers/vfio/ -- cgit v1.2.3 From cd48ebc5c4f2e94830b238f035ebf04f1c3a4433 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 22 Sep 2022 20:35:07 +0800 Subject: vfio/mlx5: Switch to use module_pci_driver() macro Since the PCI core provides the helper macro module_pci_driver(), we can replace the module_init()/module_exit() boilerplate with it. Signed-off-by: Shang XiaoJing Reviewed-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220922123507.11222-1-shangxiaojing@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fd6ccb8454a2..457138b92f13 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -676,18 +676,7 @@ static struct pci_driver mlx5vf_pci_driver = { .driver_managed_dma = true, }; -static void __exit mlx5vf_pci_cleanup(void) -{ - pci_unregister_driver(&mlx5vf_pci_driver); -} - -static int __init mlx5vf_pci_init(void) -{ - return pci_register_driver(&mlx5vf_pci_driver); -} - -module_init(mlx5vf_pci_init); -module_exit(mlx5vf_pci_cleanup); +module_pci_driver(mlx5vf_pci_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy "); -- cgit v1.2.3 From e67e070632a665c932d534b8b800477bb3111449 Mon Sep 17 00:00:00 2001 From: Rafael Mendonca Date: Tue, 18 Oct 2022 12:28:25 -0300 Subject: vfio: platform: Do not pass return buffer to ACPI _RST method The ACPI _RST method has no return value, so there's no need to pass a return buffer to acpi_evaluate_object().
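For reference, ACPICA's ACPI_ALLOCATE_BUFFER convention makes the caller own any returned object (it must eventually be freed), so for a method that is defined to return nothing it is simpler to pass a NULL return buffer. A minimal sketch of the resulting call pattern, not the exact driver code:

	acpi_handle handle = ACPI_HANDLE(dev);
	acpi_status status;

	/* _RST returns nothing, so no struct acpi_buffer is needed */
	status = acpi_evaluate_object(handle, "_RST", NULL, NULL);
	if (ACPI_FAILURE(status))
		return -EINVAL;	/* sketch: map the failure as the caller sees fit */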
Fixes: d30daa33ec1d ("vfio: platform: call _RST method when using ACPI") Signed-off-by: Rafael Mendonca Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20221018152825.891032-1-rafaelmendsr@gmail.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_platform_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 55dc4f43c31e..1a0a238ffa35 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -72,12 +72,11 @@ static int vfio_platform_acpi_call_reset(struct vfio_platform_device *vdev, const char **extra_dbg) { #ifdef CONFIG_ACPI - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; struct device *dev = vdev->device; acpi_handle handle = ACPI_HANDLE(dev); acpi_status acpi_ret; - acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, &buffer); + acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, NULL); if (ACPI_FAILURE(acpi_ret)) { if (extra_dbg) *extra_dbg = acpi_format_exception(acpi_ret); -- cgit v1.2.3 From ea00d4ededcd8639f9e814513426cfeccdd3aaf0 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 25 Oct 2022 20:31:13 +0100 Subject: vfio/iova_bitmap: Explicitly include linux/slab.h kzalloc/kzfree are used so include `slab.h`. While it happens to work without it, due to commit 8b9f3ac5b01d ("fs: introduce alloc_inode_sb() to allocate filesystems specific inode") which indirectly includes it via: . ./include/linux/mm.h .. ./include/linux/huge_mm.h ... ./include/linux/fs.h .... ./include/linux/slab.h Make the include explicit in case any of these indirect dependencies is dropped or changed for unrelated reasons, as was the case prior to the commit above (i.e. <= v5.18). Signed-off-by: Joao Martins Link: https://lore.kernel.org/r/20221025193114.58695-2-joao.m.martins@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/iova_bitmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index 6631e8befe1b..56816ba1be9b 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) -- cgit v1.2.3 From f38044e5ef58ad0346fdabd7027ea5c1e1a3b624 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 25 Oct 2022 20:31:14 +0100 Subject: vfio/iova_bitmap: Fix PAGE_SIZE unaligned bitmaps iova_bitmap_set() doesn't consider the end of the page boundary when the first bitmap page offset isn't zero, and wrongly changes the consecutive page right after. Consequently, dirty pages reported by the device can go missing as seen from the VMM. The current logic iterates over a given number of base pages and clamps it to the remaining indexes to iterate in the last page. Instead of having to consider extra pages to pin (e.g. first and extra pages), just handle the first page as its own range and let the rest of the bitmap be handled as if it was base page aligned. This is done by changing iova_bitmap_mapped_remaining() to return PAGE_SIZE - pgoff (on the first bitmap page), which leads to pgoff being set to 0 on following iterations.
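As a worked example (assuming 4K pages and an 8-byte bitmap element, i.e. sizeof(*bitmap->bitmap) == 8; the numbers are illustrative only), with a first-page offset pgoff == 8 the first iteration is now capped at:

	(PAGE_SIZE - pgoff) / sizeof(*bitmap->bitmap) = (4096 - 8) / 8 = 511 elements

after which the mapping is page aligned, pgoff becomes 0, and each subsequent iteration can again cover the full (npages << PAGE_SHIFT) / 8 elements.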
Fixes: 58ccf0190d19 ("vfio: Add an IOVA bitmap support") Reported-by: Avihai Horon Tested-by: Avihai Horon Signed-off-by: Joao Martins Link: https://lore.kernel.org/r/20221025193114.58695-3-joao.m.martins@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/iova_bitmap.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index 56816ba1be9b..de6d6ea5c496 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -296,11 +296,15 @@ void iova_bitmap_free(struct iova_bitmap *bitmap) */ static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) { - unsigned long remaining; + unsigned long remaining, bytes; + + /* Cap to one page in the first iteration, if PAGE_SIZE unaligned. */ + bytes = !bitmap->mapped.pgoff ? bitmap->mapped.npages << PAGE_SHIFT : + PAGE_SIZE - bitmap->mapped.pgoff; remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; remaining = min_t(unsigned long, remaining, - (bitmap->mapped.npages << PAGE_SHIFT) / sizeof(*bitmap->bitmap)); + bytes / sizeof(*bitmap->bitmap)); return remaining; } -- cgit v1.2.3 From 9e6f07cd1eaa72c41719050c5ca9d22a8c0b7c02 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:01 +0100 Subject: vfio/ccw: create a parent struct Move the stuff associated with the mdev parent (and thus the subchannel struct) into its own struct, and leave the rest in the existing private structure. The subchannel will point to the parent, and the parent will point to the private, for the areas where one or both are needed. Further separation of these structs will follow. Signed-off-by: Eric Farman Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-2-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 98 ++++++++++++++++++++++++++++++------- drivers/s390/cio/vfio_ccw_ops.c | 8 ++- drivers/s390/cio/vfio_ccw_private.h | 20 ++++++-- 3 files changed, 101 insertions(+), 25 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 7f5402fe857a..444b32047397 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -36,10 +36,19 @@ debug_info_t *vfio_ccw_debug_trace_id; */ int vfio_ccw_sch_quiesce(struct subchannel *sch) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); DECLARE_COMPLETION_ONSTACK(completion); int iretry, ret = 0; + /* + * Probably an impossible situation, after being called through + * FSM callbacks. But in the event it did, register a warning + * and return as if things were fine. + */ + if (WARN_ON(!private)) + return 0; + iretry = 255; do { @@ -121,7 +130,23 @@ static void vfio_ccw_crw_todo(struct work_struct *work) */ static void vfio_ccw_sch_irq(struct subchannel *sch) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); + + /* + * The subchannel should still be disabled at this point, + * so an interrupt would be quite surprising. As with an + * interrupt while the FSM is closed, let's attempt to + * disable the subchannel again. 
+ */ + if (!private) { + VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: unexpected interrupt\n", + sch->schid.cssid, sch->schid.ssid, + sch->schid.sch_no); + + cio_disable_subchannel(sch); + return; + } inc_irq_stat(IRQIO_CIO); vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); @@ -201,10 +226,19 @@ static void vfio_ccw_free_private(struct vfio_ccw_private *private) mutex_destroy(&private->io_mutex); kfree(private); } + +static void vfio_ccw_free_parent(struct device *dev) +{ + struct vfio_ccw_parent *parent = container_of(dev, struct vfio_ccw_parent, dev); + + kfree(parent); +} + static int vfio_ccw_sch_probe(struct subchannel *sch) { struct pmcw *pmcw = &sch->schib.pmcw; struct vfio_ccw_private *private; + struct vfio_ccw_parent *parent; int ret = -ENOMEM; if (pmcw->qf) { @@ -213,38 +247,58 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) return -ENODEV; } + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (!parent) + return -ENOMEM; + + dev_set_name(&parent->dev, "parent"); + parent->dev.parent = &sch->dev; + parent->dev.release = &vfio_ccw_free_parent; + ret = device_register(&parent->dev); + if (ret) + goto out_free; + private = vfio_ccw_alloc_private(sch); - if (IS_ERR(private)) + if (IS_ERR(private)) { + device_unregister(&parent->dev); return PTR_ERR(private); + } - dev_set_drvdata(&sch->dev, private); + dev_set_drvdata(&sch->dev, parent); + dev_set_drvdata(&parent->dev, private); - private->mdev_type.sysfs_name = "io"; - private->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)"; - private->mdev_types[0] = &private->mdev_type; - ret = mdev_register_parent(&private->parent, &sch->dev, + parent->mdev_type.sysfs_name = "io"; + parent->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)"; + parent->mdev_types[0] = &parent->mdev_type; + ret = mdev_register_parent(&parent->parent, &sch->dev, &vfio_ccw_mdev_driver, - private->mdev_types, 1); + parent->mdev_types, 1); if (ret) - goto out_free; + goto out_unreg; VFIO_CCW_MSG_EVENT(4, "bound to subchannel %x.%x.%04x\n", sch->schid.cssid, sch->schid.ssid, sch->schid.sch_no); return 0; +out_unreg: + device_unregister(&parent->dev); out_free: + dev_set_drvdata(&parent->dev, NULL); dev_set_drvdata(&sch->dev, NULL); - vfio_ccw_free_private(private); + if (private) + vfio_ccw_free_private(private); return ret; } static void vfio_ccw_sch_remove(struct subchannel *sch) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); - mdev_unregister_parent(&private->parent); + mdev_unregister_parent(&parent->parent); + device_unregister(&parent->dev); dev_set_drvdata(&sch->dev, NULL); vfio_ccw_free_private(private); @@ -256,7 +310,11 @@ static void vfio_ccw_sch_remove(struct subchannel *sch) static void vfio_ccw_sch_shutdown(struct subchannel *sch) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); + + if (WARN_ON(!private)) + return; vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_CLOSE); vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); @@ -274,7 +332,8 @@ static void vfio_ccw_sch_shutdown(struct subchannel *sch) */ static int vfio_ccw_sch_event(struct subchannel *sch, int process) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = 
dev_get_drvdata(&parent->dev); unsigned long flags; int rc = -EAGAIN; @@ -287,8 +346,10 @@ static int vfio_ccw_sch_event(struct subchannel *sch, int process) rc = 0; - if (cio_update_schib(sch)) - vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); + if (cio_update_schib(sch)) { + if (private) + vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); + } out_unlock: spin_unlock_irqrestore(sch->lock, flags); @@ -326,7 +387,8 @@ static void vfio_ccw_queue_crw(struct vfio_ccw_private *private, static int vfio_ccw_chp_event(struct subchannel *sch, struct chp_link *link, int event) { - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); int mask = chp_ssd_get_mask(&sch->ssd_info, link); int retry = 255; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 6ae4d012d800..dc084883d872 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -55,7 +55,9 @@ static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) static int vfio_ccw_mdev_probe(struct mdev_device *mdev) { - struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); + struct subchannel *sch = to_subchannel(mdev->dev.parent); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); int ret; if (private->state == VFIO_CCW_STATE_NOT_OPER) @@ -100,7 +102,9 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) static void vfio_ccw_mdev_remove(struct mdev_device *mdev) { - struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); + struct subchannel *sch = to_subchannel(mdev->dev.parent); + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: remove\n", private->sch->schid.cssid, diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index bd5fb81456af..1f598d58d969 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -67,6 +67,21 @@ struct vfio_ccw_crw { struct crw crw; }; +/** + * struct vfio_ccw_parent + * + * @dev: embedded device struct + * @parent: parent data structures for mdevs created + * @mdev_type(s): identifying information for mdevs created + */ +struct vfio_ccw_parent { + struct device dev; + + struct mdev_parent parent; + struct mdev_type mdev_type; + struct mdev_type *mdev_types[1]; +}; + /** * struct vfio_ccw_private * @vdev: Embedded VFIO device @@ -89,7 +104,6 @@ struct vfio_ccw_crw { * @io_work: work for deferral process of I/O handling * @crw_work: work for deferral process of CRW handling * @release_comp: synchronization helper for vfio device release - * @parent: parent data structures for mdevs created */ struct vfio_ccw_private { struct vfio_device vdev; @@ -116,10 +130,6 @@ struct vfio_ccw_private { struct work_struct crw_work; struct completion release_comp; - - struct mdev_parent parent; - struct mdev_type mdev_type; - struct mdev_type *mdev_types[1]; } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); -- cgit v1.2.3 From 008a011d68036ebfa4dede07cb9b93dedaa958b1 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:02 +0100 Subject: vfio/ccw: remove private->sch These places all rely on the ability to jump from a private struct back to the subchannel struct. 
Rather than keeping a copy in our back pocket, let's use the relationship provided by the vfio_device embedded within the private. Signed-off-by: Eric Farman Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-3-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_chp.c | 5 +++-- drivers/s390/cio/vfio_ccw_drv.c | 3 +-- drivers/s390/cio/vfio_ccw_fsm.c | 27 ++++++++++++--------------- drivers/s390/cio/vfio_ccw_ops.c | 12 ++++++------ drivers/s390/cio/vfio_ccw_private.h | 7 ++++--- 5 files changed, 26 insertions(+), 28 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c index 13b26a1c7988..d3f3a611f95b 100644 --- a/drivers/s390/cio/vfio_ccw_chp.c +++ b/drivers/s390/cio/vfio_ccw_chp.c @@ -16,6 +16,7 @@ static ssize_t vfio_ccw_schib_region_read(struct vfio_ccw_private *private, char __user *buf, size_t count, loff_t *ppos) { + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS; loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; struct ccw_schib_region *region; @@ -27,12 +28,12 @@ static ssize_t vfio_ccw_schib_region_read(struct vfio_ccw_private *private, mutex_lock(&private->io_mutex); region = private->region[i].data; - if (cio_update_schib(private->sch)) { + if (cio_update_schib(sch)) { ret = -ENODEV; goto out; } - memcpy(region, &private->sch->schib, sizeof(*region)); + memcpy(region, &sch->schib, sizeof(*region)); if (copy_to_user(buf, (void *)region + pos, count)) { ret = -EFAULT; diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 444b32047397..2c680a556383 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -160,7 +160,6 @@ static struct vfio_ccw_private *vfio_ccw_alloc_private(struct subchannel *sch) if (!private) return ERR_PTR(-ENOMEM); - private->sch = sch; mutex_init(&private->io_mutex); private->state = VFIO_CCW_STATE_STANDBY; INIT_LIST_HEAD(&private->crw); @@ -395,7 +394,7 @@ static int vfio_ccw_chp_event(struct subchannel *sch, if (!private || !mask) return 0; - trace_vfio_ccw_chp_event(private->sch->schid, mask, event); + trace_vfio_ccw_chp_event(sch->schid, mask, event); VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: mask=0x%x event=%d\n", sch->schid.cssid, sch->schid.ssid, sch->schid.sch_no, diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c index a59c758869f8..e67fad897af3 100644 --- a/drivers/s390/cio/vfio_ccw_fsm.c +++ b/drivers/s390/cio/vfio_ccw_fsm.c @@ -18,15 +18,13 @@ static int fsm_io_helper(struct vfio_ccw_private *private) { - struct subchannel *sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); union orb *orb; int ccode; __u8 lpm; unsigned long flags; int ret; - sch = private->sch; - spin_lock_irqsave(sch->lock, flags); orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm); @@ -80,13 +78,11 @@ out: static int fsm_do_halt(struct vfio_ccw_private *private) { - struct subchannel *sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); unsigned long flags; int ccode; int ret; - sch = private->sch; - spin_lock_irqsave(sch->lock, flags); VFIO_CCW_TRACE_EVENT(2, "haltIO"); @@ -121,13 +117,11 @@ static int fsm_do_halt(struct vfio_ccw_private *private) static int fsm_do_clear(struct vfio_ccw_private *private) { - struct subchannel *sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); unsigned long flags; int ccode; int ret; - sch = private->sch; - 
spin_lock_irqsave(sch->lock, flags); VFIO_CCW_TRACE_EVENT(2, "clearIO"); @@ -160,7 +154,7 @@ static int fsm_do_clear(struct vfio_ccw_private *private) static void fsm_notoper(struct vfio_ccw_private *private, enum vfio_ccw_event event) { - struct subchannel *sch = private->sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: notoper event %x state %x\n", sch->schid.cssid, @@ -228,7 +222,7 @@ static void fsm_async_retry(struct vfio_ccw_private *private, static void fsm_disabled_irq(struct vfio_ccw_private *private, enum vfio_ccw_event event) { - struct subchannel *sch = private->sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); /* * An interrupt in a disabled state means a previous disable was not @@ -238,7 +232,9 @@ static void fsm_disabled_irq(struct vfio_ccw_private *private, } inline struct subchannel_id get_schid(struct vfio_ccw_private *p) { - return p->sch->schid; + struct subchannel *sch = to_subchannel(p->vdev.dev->parent); + + return sch->schid; } /* @@ -360,10 +356,11 @@ static void fsm_async_request(struct vfio_ccw_private *private, static void fsm_irq(struct vfio_ccw_private *private, enum vfio_ccw_event event) { + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); struct irb *irb = this_cpu_ptr(&cio_irb); VFIO_CCW_TRACE_EVENT(6, "IRQ"); - VFIO_CCW_TRACE_EVENT(6, dev_name(&private->sch->dev)); + VFIO_CCW_TRACE_EVENT(6, dev_name(&sch->dev)); memcpy(&private->irb, irb, sizeof(*irb)); @@ -376,7 +373,7 @@ static void fsm_irq(struct vfio_ccw_private *private, static void fsm_open(struct vfio_ccw_private *private, enum vfio_ccw_event event) { - struct subchannel *sch = private->sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); int ret; spin_lock_irq(sch->lock); @@ -397,7 +394,7 @@ err_unlock: static void fsm_close(struct vfio_ccw_private *private, enum vfio_ccw_event event) { - struct subchannel *sch = private->sch; + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); int ret; spin_lock_irq(sch->lock); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index dc084883d872..79c50cb7dcb8 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -68,9 +68,9 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) return ret; VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: create\n", - private->sch->schid.cssid, - private->sch->schid.ssid, - private->sch->schid.sch_no); + sch->schid.cssid, + sch->schid.ssid, + sch->schid.sch_no); ret = vfio_register_emulated_iommu_dev(&private->vdev); if (ret) @@ -107,9 +107,9 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: remove\n", - private->sch->schid.cssid, - private->sch->schid.ssid, - private->sch->schid.sch_no); + sch->schid.cssid, + sch->schid.ssid, + sch->schid.sch_no); vfio_unregister_group_dev(&private->vdev); diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 1f598d58d969..b28af2f63963 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -85,7 +85,6 @@ struct vfio_ccw_parent { /** * struct vfio_ccw_private * @vdev: Embedded VFIO device - * @sch: pointer to the subchannel * @state: internal state of the device * @completion: synchronization helper of the I/O completion * @io_region: MMIO region to input/output I/O arguments/results @@ -107,7 +106,6 @@ struct 
vfio_ccw_parent { */ struct vfio_ccw_private { struct vfio_device vdev; - struct subchannel *sch; int state; struct completion *completion; struct ccw_io_region *io_region; @@ -172,7 +170,10 @@ extern fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS]; static inline void vfio_ccw_fsm_event(struct vfio_ccw_private *private, enum vfio_ccw_event event) { - trace_vfio_ccw_fsm_event(private->sch->schid, private->state, event); + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); + + if (sch) + trace_vfio_ccw_fsm_event(sch->schid, private->state, event); vfio_ccw_jumptable[private->state][event](private, event); } -- cgit v1.2.3 From 06caaa27df8f1a8a6be78212c5ef5cac04500176 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:03 +0100 Subject: vfio/ccw: move private initialization to callback There's already a device initialization callback that is used to initialize the release completion workaround that was introduced by commit ebb72b765fb49 ("vfio/ccw: Use the new device life cycle helpers"). Move the other elements of the vfio_ccw_private struct that require distinct initialization over to that routine. With that done, the vfio_ccw_alloc_private routine only does a kzalloc, so fold it inline. Signed-off-by: Eric Farman Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-4-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 74 +++++-------------------------------- drivers/s390/cio/vfio_ccw_ops.c | 43 +++++++++++++++++++++ drivers/s390/cio/vfio_ccw_private.h | 7 +++- 3 files changed, 58 insertions(+), 66 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 2c680a556383..fbc26338ceab 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -23,10 +23,10 @@ #include "vfio_ccw_private.h" struct workqueue_struct *vfio_ccw_work_q; -static struct kmem_cache *vfio_ccw_io_region; -static struct kmem_cache *vfio_ccw_cmd_region; -static struct kmem_cache *vfio_ccw_schib_region; -static struct kmem_cache *vfio_ccw_crw_region; +struct kmem_cache *vfio_ccw_io_region; +struct kmem_cache *vfio_ccw_cmd_region; +struct kmem_cache *vfio_ccw_schib_region; +struct kmem_cache *vfio_ccw_crw_region; debug_info_t *vfio_ccw_debug_msg_id; debug_info_t *vfio_ccw_debug_trace_id; @@ -79,7 +79,7 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch) return ret; } -static void vfio_ccw_sch_io_todo(struct work_struct *work) +void vfio_ccw_sch_io_todo(struct work_struct *work) { struct vfio_ccw_private *private; struct irb *irb; @@ -115,7 +115,7 @@ static void vfio_ccw_sch_io_todo(struct work_struct *work) eventfd_signal(private->io_trigger, 1); } -static void vfio_ccw_crw_todo(struct work_struct *work) +void vfio_ccw_crw_todo(struct work_struct *work) { struct vfio_ccw_private *private; @@ -152,62 +152,6 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); } -static struct vfio_ccw_private *vfio_ccw_alloc_private(struct subchannel *sch) -{ - struct vfio_ccw_private *private; - - private = kzalloc(sizeof(*private), GFP_KERNEL); - if (!private) - return ERR_PTR(-ENOMEM); - - mutex_init(&private->io_mutex); - private->state = VFIO_CCW_STATE_STANDBY; - INIT_LIST_HEAD(&private->crw); - INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); - INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); - - private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), - GFP_KERNEL); - if 
(!private->cp.guest_cp) - goto out_free_private; - - private->io_region = kmem_cache_zalloc(vfio_ccw_io_region, - GFP_KERNEL | GFP_DMA); - if (!private->io_region) - goto out_free_cp; - - private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region, - GFP_KERNEL | GFP_DMA); - if (!private->cmd_region) - goto out_free_io; - - private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region, - GFP_KERNEL | GFP_DMA); - - if (!private->schib_region) - goto out_free_cmd; - - private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region, - GFP_KERNEL | GFP_DMA); - - if (!private->crw_region) - goto out_free_schib; - return private; - -out_free_schib: - kmem_cache_free(vfio_ccw_schib_region, private->schib_region); -out_free_cmd: - kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); -out_free_io: - kmem_cache_free(vfio_ccw_io_region, private->io_region); -out_free_cp: - kfree(private->cp.guest_cp); -out_free_private: - mutex_destroy(&private->io_mutex); - kfree(private); - return ERR_PTR(-ENOMEM); -} - static void vfio_ccw_free_private(struct vfio_ccw_private *private) { struct vfio_ccw_crw *crw, *temp; @@ -257,10 +201,10 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (ret) goto out_free; - private = vfio_ccw_alloc_private(sch); - if (IS_ERR(private)) { + private = kzalloc(sizeof(*private), GFP_KERNEL); + if (!private) { device_unregister(&parent->dev); - return PTR_ERR(private); + return -ENOMEM; } dev_set_drvdata(&sch->dev, parent); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 79c50cb7dcb8..eb0b8cc210bb 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -49,8 +49,51 @@ static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) struct vfio_ccw_private *private = container_of(vdev, struct vfio_ccw_private, vdev); + mutex_init(&private->io_mutex); + private->state = VFIO_CCW_STATE_STANDBY; + INIT_LIST_HEAD(&private->crw); + INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); + INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); init_completion(&private->release_comp); + + private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), + GFP_KERNEL); + if (!private->cp.guest_cp) + goto out_free_private; + + private->io_region = kmem_cache_zalloc(vfio_ccw_io_region, + GFP_KERNEL | GFP_DMA); + if (!private->io_region) + goto out_free_cp; + + private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region, + GFP_KERNEL | GFP_DMA); + if (!private->cmd_region) + goto out_free_io; + + private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region, + GFP_KERNEL | GFP_DMA); + if (!private->schib_region) + goto out_free_cmd; + + private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region, + GFP_KERNEL | GFP_DMA); + if (!private->crw_region) + goto out_free_schib; + return 0; + +out_free_schib: + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); +out_free_cmd: + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); +out_free_io: + kmem_cache_free(vfio_ccw_io_region, private->io_region); +out_free_cp: + kfree(private->cp.guest_cp); +out_free_private: + mutex_destroy(&private->io_mutex); + return -ENOMEM; } static int vfio_ccw_mdev_probe(struct mdev_device *mdev) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index b28af2f63963..55d636225cff 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -131,6 +131,8 @@ struct vfio_ccw_private { } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); +void 
vfio_ccw_sch_io_todo(struct work_struct *work); +void vfio_ccw_crw_todo(struct work_struct *work); extern struct mdev_driver vfio_ccw_mdev_driver; @@ -178,7 +180,10 @@ static inline void vfio_ccw_fsm_event(struct vfio_ccw_private *private, } extern struct workqueue_struct *vfio_ccw_work_q; - +extern struct kmem_cache *vfio_ccw_io_region; +extern struct kmem_cache *vfio_ccw_cmd_region; +extern struct kmem_cache *vfio_ccw_schib_region; +extern struct kmem_cache *vfio_ccw_crw_region; /* s390 debug feature, similar to base cio */ extern debug_info_t *vfio_ccw_debug_msg_id; -- cgit v1.2.3 From 3d62fe18b6a3a93c5360c2d590b9b40b6842133e Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:04 +0100 Subject: vfio/ccw: move private to mdev lifecycle Now that the mdev parent data is split out into its own struct, it is safe to move the remaining private data to follow the mdev probe/remove lifecycle. The mdev parent data will remain where it is, and follow the subchannel and the css driver interfaces. Signed-off-by: Eric Farman Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-5-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 16 +--------------- drivers/s390/cio/vfio_ccw_ops.c | 26 +++++++++++++------------- drivers/s390/cio/vfio_ccw_private.h | 2 ++ 3 files changed, 16 insertions(+), 28 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index fbc26338ceab..9fbd1b27a1ac 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -152,7 +152,7 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); } -static void vfio_ccw_free_private(struct vfio_ccw_private *private) +void vfio_ccw_free_private(struct vfio_ccw_private *private) { struct vfio_ccw_crw *crw, *temp; @@ -180,7 +180,6 @@ static void vfio_ccw_free_parent(struct device *dev) static int vfio_ccw_sch_probe(struct subchannel *sch) { struct pmcw *pmcw = &sch->schib.pmcw; - struct vfio_ccw_private *private; struct vfio_ccw_parent *parent; int ret = -ENOMEM; @@ -201,14 +200,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (ret) goto out_free; - private = kzalloc(sizeof(*private), GFP_KERNEL); - if (!private) { - device_unregister(&parent->dev); - return -ENOMEM; - } - dev_set_drvdata(&sch->dev, parent); - dev_set_drvdata(&parent->dev, private); parent->mdev_type.sysfs_name = "io"; parent->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)"; @@ -227,25 +219,19 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) out_unreg: device_unregister(&parent->dev); out_free: - dev_set_drvdata(&parent->dev, NULL); dev_set_drvdata(&sch->dev, NULL); - if (private) - vfio_ccw_free_private(private); return ret; } static void vfio_ccw_sch_remove(struct subchannel *sch) { struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); - struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); mdev_unregister_parent(&parent->parent); device_unregister(&parent->dev); dev_set_drvdata(&sch->dev, NULL); - vfio_ccw_free_private(private); - VFIO_CCW_MSG_EVENT(4, "unbound from subchannel %x.%x.%04x\n", sch->schid.cssid, sch->schid.ssid, sch->schid.sch_no); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index eb0b8cc210bb..e45d4acb109b 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -100,15 +100,20 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) { struct 
subchannel *sch = to_subchannel(mdev->dev.parent); struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); - struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); + struct vfio_ccw_private *private; int ret; - if (private->state == VFIO_CCW_STATE_NOT_OPER) - return -ENODEV; + private = kzalloc(sizeof(*private), GFP_KERNEL); + if (!private) + return -ENOMEM; ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops); - if (ret) + if (ret) { + kfree(private); return ret; + } + + dev_set_drvdata(&parent->dev, private); VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: create\n", sch->schid.cssid, @@ -122,6 +127,7 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) return 0; err_put_vdev: + dev_set_drvdata(&parent->dev, NULL); vfio_put_device(&private->vdev); return ret; } @@ -131,15 +137,6 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) struct vfio_ccw_private *private = container_of(vdev, struct vfio_ccw_private, vdev); - /* - * We cannot free vfio_ccw_private here because it includes - * parent info which must be free'ed by css driver. - * - * Use a workaround by memset'ing the core device part and - * then notifying the remove path that all active references - * to this device have been released. - */ - memset(vdev, 0, sizeof(*vdev)); complete(&private->release_comp); } @@ -156,6 +153,7 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) vfio_unregister_group_dev(&private->vdev); + dev_set_drvdata(&parent->dev, NULL); vfio_put_device(&private->vdev); /* * Wait for all active references on mdev are released so it @@ -166,6 +164,8 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) * cycle. */ wait_for_completion(&private->release_comp); + + vfio_ccw_free_private(private); } static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 55d636225cff..747aba5f5272 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -134,6 +134,8 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch); void vfio_ccw_sch_io_todo(struct work_struct *work); void vfio_ccw_crw_todo(struct work_struct *work); +void vfio_ccw_free_private(struct vfio_ccw_private *private); + extern struct mdev_driver vfio_ccw_mdev_driver; /* -- cgit v1.2.3 From f4da83f7e3f096ece936512d86ef3726e470fbfd Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:05 +0100 Subject: vfio/ccw: remove release completion There's enough separation between the parent and private structs now, that it is fine to remove the release completion hack. 
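With that, the driver follows the normal vfio lifecycle: when the last reference is dropped the core invokes the @release callback, which can free the private data directly. Roughly what this patch leaves behind (see the hunk below for the actual change):

	static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev)
	{
		struct vfio_ccw_private *private =
			container_of(vdev, struct vfio_ccw_private, vdev);

		/* no completion handshake with remove() is needed anymore */
		vfio_ccw_free_private(private);
	}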
Signed-off-by: Eric Farman Reviewed-by: Kevin Tian Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-6-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_ops.c | 14 +------------- drivers/s390/cio/vfio_ccw_private.h | 3 --- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index e45d4acb109b..8a929a9cf3c6 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -54,7 +54,6 @@ static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) INIT_LIST_HEAD(&private->crw); INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); - init_completion(&private->release_comp); private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), GFP_KERNEL); @@ -137,7 +136,7 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) struct vfio_ccw_private *private = container_of(vdev, struct vfio_ccw_private, vdev); - complete(&private->release_comp); + vfio_ccw_free_private(private); } static void vfio_ccw_mdev_remove(struct mdev_device *mdev) @@ -155,17 +154,6 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) dev_set_drvdata(&parent->dev, NULL); vfio_put_device(&private->vdev); - /* - * Wait for all active references on mdev are released so it - * is safe to defer kfree() to a later point. - * - * TODO: the clean fix is to split parent/mdev info from ccw - * private structure so each can be managed in its own life - * cycle. - */ - wait_for_completion(&private->release_comp); - - vfio_ccw_free_private(private); } static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 747aba5f5272..2278fd38d34e 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -102,7 +102,6 @@ struct vfio_ccw_parent { * @req_trigger: eventfd ctx for signaling userspace to return device * @io_work: work for deferral process of I/O handling * @crw_work: work for deferral process of CRW handling - * @release_comp: synchronization helper for vfio device release */ struct vfio_ccw_private { struct vfio_device vdev; @@ -126,8 +125,6 @@ struct vfio_ccw_private { struct eventfd_ctx *req_trigger; struct work_struct io_work; struct work_struct crw_work; - - struct completion release_comp; } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); -- cgit v1.2.3 From d1104f9327df9b26901b97cd026949f80ccab0d3 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:06 +0100 Subject: vfio/ccw: replace vfio_init_device with _alloc_ Now that we have a reasonable separation of structs that follow the subchannel and mdev lifecycles, there's no reason we can't call the official vfio_alloc_device routine for our private data, and behave like everyone else. 
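For reference, vfio_alloc_device() is the container_of-style helper from <linux/vfio.h>: it allocates the driver structure embedding a struct vfio_device, runs the core initialization (including the driver's @init callback), and returns the container. The probe path then reduces to the common pattern, as the diff below shows:

	private = vfio_alloc_device(vfio_ccw_private, vdev, &mdev->dev,
				    &vfio_ccw_dev_ops);
	if (IS_ERR(private))
		return PTR_ERR(private);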
Signed-off-by: Eric Farman Reviewed-by: Kevin Tian Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-7-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 18 ------------------ drivers/s390/cio/vfio_ccw_ops.c | 28 ++++++++++++++++++---------- drivers/s390/cio/vfio_ccw_private.h | 2 -- drivers/vfio/vfio_main.c | 10 +++++----- include/linux/vfio.h | 2 -- 5 files changed, 23 insertions(+), 37 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 9fbd1b27a1ac..c2a65808605a 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -152,24 +152,6 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); } -void vfio_ccw_free_private(struct vfio_ccw_private *private) -{ - struct vfio_ccw_crw *crw, *temp; - - list_for_each_entry_safe(crw, temp, &private->crw, next) { - list_del(&crw->next); - kfree(crw); - } - - kmem_cache_free(vfio_ccw_crw_region, private->crw_region); - kmem_cache_free(vfio_ccw_schib_region, private->schib_region); - kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); - kmem_cache_free(vfio_ccw_io_region, private->io_region); - kfree(private->cp.guest_cp); - mutex_destroy(&private->io_mutex); - kfree(private); -} - static void vfio_ccw_free_parent(struct device *dev) { struct vfio_ccw_parent *parent = container_of(dev, struct vfio_ccw_parent, dev); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 8a929a9cf3c6..1155f8bcedd9 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -102,15 +102,10 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) struct vfio_ccw_private *private; int ret; - private = kzalloc(sizeof(*private), GFP_KERNEL); - if (!private) - return -ENOMEM; - - ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops); - if (ret) { - kfree(private); - return ret; - } + private = vfio_alloc_device(vfio_ccw_private, vdev, &mdev->dev, + &vfio_ccw_dev_ops); + if (IS_ERR(private)) + return PTR_ERR(private); dev_set_drvdata(&parent->dev, private); @@ -135,8 +130,21 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) { struct vfio_ccw_private *private = container_of(vdev, struct vfio_ccw_private, vdev); + struct vfio_ccw_crw *crw, *temp; + + list_for_each_entry_safe(crw, temp, &private->crw, next) { + list_del(&crw->next); + kfree(crw); + } + + kmem_cache_free(vfio_ccw_crw_region, private->crw_region); + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); + kmem_cache_free(vfio_ccw_io_region, private->io_region); + kfree(private->cp.guest_cp); + mutex_destroy(&private->io_mutex); - vfio_ccw_free_private(private); + vfio_free_device(vdev); } static void vfio_ccw_mdev_remove(struct mdev_device *mdev) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 2278fd38d34e..b441ae6700fd 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -131,8 +131,6 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch); void vfio_ccw_sch_io_todo(struct work_struct *work); void vfio_ccw_crw_todo(struct work_struct *work); -void vfio_ccw_free_private(struct vfio_ccw_private *private); - extern struct mdev_driver vfio_ccw_mdev_driver; /* diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 2d168793d4e1..2901b8ad5be9 100644 --- 
a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -348,6 +348,9 @@ static void vfio_device_release(struct device *dev) device->ops->release(device); } +static int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops); + /* * Allocate and initialize vfio_device so it can be registered to vfio * core. @@ -386,11 +389,9 @@ EXPORT_SYMBOL_GPL(_vfio_alloc_device); /* * Initialize a vfio_device so it can be registered to vfio core. - * - * Only vfio-ccw driver should call this interface. */ -int vfio_init_device(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops) +static int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops) { int ret; @@ -422,7 +423,6 @@ out_uninit: ida_free(&vfio.device_ida, device->index); return ret; } -EXPORT_SYMBOL_GPL(vfio_init_device); /* * The helper called by driver @release callback to free the device diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7cebeb875dd..ba809268a48e 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -176,8 +176,6 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, dev, ops), \ struct dev_struct, member) -int vfio_init_device(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops); void vfio_free_device(struct vfio_device *device); static inline void vfio_put_device(struct vfio_device *device) { -- cgit v1.2.3 From 913447d06f032a9e9c84870bec0b1adb8c588f29 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:07 +0100 Subject: vfio: Remove vfio_free_device With the "mess" sorted out, we should be able to inline the vfio_free_device call introduced by commit cb9ff3f3b84c ("vfio: Add helpers for unifying vfio_device life cycle") and remove them from driver release callbacks. 
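A side effect visible in the vfio_ap hunk below: since the core now kvfree()s the device unconditionally and only calls @release when one is set, a driver with no private state to tear down can drop the callback entirely. Sketch (the foo_* names are hypothetical):

	static const struct vfio_device_ops foo_dev_ops = {
		.init		= foo_init,
		/* no .release: the vfio core frees the structure itself */
		.open_device	= foo_open,
		.close_device	= foo_close,
	};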
Signed-off-by: Eric Farman Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Tony Krowiak # vfio-ap part Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-8-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/kvmgt.c | 1 - drivers/s390/cio/vfio_ccw_ops.c | 2 -- drivers/s390/crypto/vfio_ap_ops.c | 6 ------ drivers/vfio/fsl-mc/vfio_fsl_mc.c | 1 - drivers/vfio/pci/vfio_pci_core.c | 1 - drivers/vfio/platform/vfio_amba.c | 1 - drivers/vfio/platform/vfio_platform.c | 1 - drivers/vfio/vfio_main.c | 22 ++++------------------ include/linux/vfio.h | 1 - samples/vfio-mdev/mbochs.c | 1 - samples/vfio-mdev/mdpy.c | 1 - samples/vfio-mdev/mtty.c | 1 - 12 files changed, 4 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 7a45e5360caf..eee6805e67de 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1461,7 +1461,6 @@ static void intel_vgpu_release_dev(struct vfio_device *vfio_dev) struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); intel_gvt_destroy_vgpu(vgpu); - vfio_free_device(vfio_dev); } static const struct vfio_device_ops intel_vgpu_dev_ops = { diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 1155f8bcedd9..598a3814d428 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -143,8 +143,6 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) kmem_cache_free(vfio_ccw_io_region, private->io_region); kfree(private->cp.guest_cp); mutex_destroy(&private->io_mutex); - - vfio_free_device(vdev); } static void vfio_ccw_mdev_remove(struct mdev_device *mdev) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 0b4cc8c597ae..f108c0f14712 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -765,11 +765,6 @@ static void vfio_ap_mdev_unlink_fr_queues(struct ap_matrix_mdev *matrix_mdev) } } -static void vfio_ap_mdev_release_dev(struct vfio_device *vdev) -{ - vfio_free_device(vdev); -} - static void vfio_ap_mdev_remove(struct mdev_device *mdev) { struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(&mdev->dev); @@ -1784,7 +1779,6 @@ static const struct attribute_group vfio_queue_attr_group = { static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { .init = vfio_ap_mdev_init_dev, - .release = vfio_ap_mdev_release_dev, .open_device = vfio_ap_mdev_open_device, .close_device = vfio_ap_mdev_close_device, .ioctl = vfio_ap_mdev_ioctl, diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index b16874e913e4..7b8889f55007 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -568,7 +568,6 @@ static void vfio_fsl_mc_release_dev(struct vfio_device *core_vdev) vfio_fsl_uninit_device(vdev); mutex_destroy(&vdev->igate); - vfio_free_device(core_vdev); } static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index badc9d828cac..9be2d5be5d95 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2109,7 +2109,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); - vfio_free_device(core_vdev); } EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); diff --git a/drivers/vfio/platform/vfio_amba.c 
b/drivers/vfio/platform/vfio_amba.c index eaea63e5294c..18faf2678b99 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -95,7 +95,6 @@ static void vfio_amba_release_dev(struct vfio_device *core_vdev) vfio_platform_release_common(vdev); kfree(vdev->name); - vfio_free_device(core_vdev); } static void vfio_amba_remove(struct amba_device *adev) diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 82cedcebfd90..9910451dc341 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -83,7 +83,6 @@ static void vfio_platform_release_dev(struct vfio_device *core_vdev) container_of(core_vdev, struct vfio_platform_device, vdev); vfio_platform_release_common(vdev); - vfio_free_device(core_vdev); } static int vfio_platform_remove(struct platform_device *pdev) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 2901b8ad5be9..9835757e2bee 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -339,13 +339,10 @@ static void vfio_device_release(struct device *dev) vfio_release_device_set(device); ida_free(&vfio.device_ida, device->index); - /* - * kvfree() cannot be done here due to a life cycle mess in - * vfio-ccw. Before the ccw part is fixed all drivers are - * required to support @release and call vfio_free_device() - * from there. - */ - device->ops->release(device); + if (device->ops->release) + device->ops->release(device); + + kvfree(device); } static int vfio_init_device(struct vfio_device *device, struct device *dev, @@ -424,17 +421,6 @@ out_uninit: return ret; } -/* - * The helper called by driver @release callback to free the device - * structure. Drivers which don't have private data to clean can - * simply use this helper as its @release. 
- */ -void vfio_free_device(struct vfio_device *device) -{ - kvfree(device); -} -EXPORT_SYMBOL_GPL(vfio_free_device); - static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, enum vfio_group_type type) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ba809268a48e..e7480154825e 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -176,7 +176,6 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, dev, ops), \ struct dev_struct, member) -void vfio_free_device(struct vfio_device *device); static inline void vfio_put_device(struct vfio_device *device) { put_device(&device->device); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 117a8d799f71..8b5a3a778a25 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -594,7 +594,6 @@ static void mbochs_release_dev(struct vfio_device *vdev) atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); kfree(mdev_state->pages); kfree(mdev_state->vconfig); - vfio_free_device(vdev); } static void mbochs_remove(struct mdev_device *mdev) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 946e8cfde6fd..721fb06c6413 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -283,7 +283,6 @@ static void mdpy_release_dev(struct vfio_device *vdev) vfree(mdev_state->memblk); kfree(mdev_state->vconfig); - vfio_free_device(vdev); } static void mdpy_remove(struct mdev_device *mdev) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index e72085fc1376..3c2a421b9b69 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -784,7 +784,6 @@ static void mtty_release_dev(struct vfio_device *vdev) atomic_add(mdev_state->nr_ports, &mdev_avail_ports); kfree(mdev_state->vconfig); - vfio_free_device(vdev); } static void mtty_remove(struct mdev_device *mdev) -- cgit v1.2.3 From 4e016f969529f2aec0545e90119e7eb3cb124c46 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:18 +0200 Subject: vfio: Add an option to get migration data size Add an option to get migration data size by introducing a new migration feature named VFIO_DEVICE_FEATURE_MIG_DATA_SIZE. Upon VFIO_DEVICE_FEATURE_GET the estimated data length that will be required to complete STOP_COPY is returned. This option may better enable user space to consider before moving to STOP_COPY whether it can meet the downtime SLA based on the returned data. The patch also includes the implementation for mlx5 and hisi for this new option to make it feature complete for the existing drivers in this area. 
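From userspace, the new feature is queried with VFIO_DEVICE_FEATURE_GET like the other migration features. A minimal sketch (the helper name is made up and error handling is trimmed):

	#include <errno.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Query the estimated STOP_COPY data length; returns -errno on error. */
	static long long query_mig_data_size(int device_fd)
	{
		char buf[sizeof(struct vfio_device_feature) +
			 sizeof(struct vfio_device_feature_mig_data_size)];
		struct vfio_device_feature *feature = (void *)buf;
		struct vfio_device_feature_mig_data_size *size =
			(void *)feature->data;

		memset(buf, 0, sizeof(buf));
		feature->argsz = sizeof(buf);
		feature->flags = VFIO_DEVICE_FEATURE_GET |
				 VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;
		if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
			return -errno;

		return size->stop_copy_length;
	}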
Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Reviewed-by: Longfang Liu Link: https://lore.kernel.org/r/20221106174630.25909-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 9 ++++++++ drivers/vfio/pci/mlx5/main.c | 18 +++++++++++++++ drivers/vfio/pci/vfio_pci_core.c | 3 ++- drivers/vfio/vfio_main.c | 32 ++++++++++++++++++++++++++ include/linux/vfio.h | 5 ++++ include/uapi/linux/vfio.h | 13 +++++++++++ 6 files changed, 79 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 39eeca18a0f7..0c0c0c7f0521 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -957,6 +957,14 @@ hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev, return res; } +static int +hisi_acc_vfio_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + *stop_copy_length = sizeof(struct acc_vf_data); + return 0; +} + static int hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) @@ -1213,6 +1221,7 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { .migration_set_state = hisi_acc_vfio_pci_set_device_state, .migration_get_state = hisi_acc_vfio_pci_get_device_state, + .migration_get_data_size = hisi_acc_vfio_pci_get_data_size, }; static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 457138b92f13..6e9cf2aacc52 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -512,6 +512,23 @@ mlx5vf_pci_set_device_state(struct vfio_device *vdev, return res; } +static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + size_t state_size; + int ret; + + mutex_lock(&mvdev->state_mutex); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, + &state_size); + if (!ret) + *stop_copy_length = state_size; + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) { @@ -577,6 +594,7 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { .migration_set_state = mlx5vf_pci_set_device_state, .migration_get_state = mlx5vf_pci_get_device_state, + .migration_get_data_size = mlx5vf_pci_get_data_size, }; static const struct vfio_log_ops mlx5vf_pci_log_ops = { diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9be2d5be5d95..189d4930c276 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2127,7 +2127,8 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if (vdev->vdev.mig_ops) { if (!(vdev->vdev.mig_ops->migration_get_state && - vdev->vdev.mig_ops->migration_set_state) || + vdev->vdev.mig_ops->migration_set_state && + vdev->vdev.mig_ops->migration_get_data_size) || !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY)) return -EINVAL; } diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9835757e2bee..662e267a3e13 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1242,6 
+1242,34 @@ out_copy: return 0; } +static int +vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + struct vfio_device_feature_mig_data_size data_size = {}; + unsigned long stop_copy_length; + int ret; + + if (!device->mig_ops) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(data_size)); + if (ret != 1) + return ret; + + ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length); + if (ret) + return ret; + + data_size.stop_copy_length = stop_copy_length; + if (copy_to_user(arg, &data_size, sizeof(data_size))) + return -EFAULT; + + return 0; +} + static int vfio_ioctl_device_feature_migration(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) @@ -1469,6 +1497,10 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, return vfio_ioctl_device_feature_logging_report( device, feature.flags, arg->data, feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE: + return vfio_ioctl_device_feature_migration_data_size( + device, feature.flags, arg->data, + feature.argsz - minsz); default: if (unlikely(!device->ops->device_feature)) return -EINVAL; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7480154825e..43b67e46a2cb 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -107,6 +107,9 @@ struct vfio_device_ops { * @migration_get_state: Optional callback to get the migration state for * devices that support migration. It's mandatory for * VFIO_DEVICE_FEATURE_MIGRATION migration support. + * @migration_get_data_size: Optional callback to get the estimated data + * length that will be required to complete stop copy. It's mandatory for + * VFIO_DEVICE_FEATURE_MIGRATION migration support. */ struct vfio_migration_ops { struct file *(*migration_set_state)( @@ -114,6 +117,8 @@ struct vfio_migration_ops { enum vfio_device_mig_state new_state); int (*migration_get_state)(struct vfio_device *device, enum vfio_device_mig_state *curr_state); + int (*migration_get_data_size)(struct vfio_device *device, + unsigned long *stop_copy_length); }; /** diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index d7d8e0922376..3e45dbaf190e 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1128,6 +1128,19 @@ struct vfio_device_feature_dma_logging_report { #define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8 +/* + * Upon VFIO_DEVICE_FEATURE_GET read back the estimated data length that will + * be required to complete stop copy. + * + * Note: Can be called on each device state. + */ + +struct vfio_device_feature_mig_data_size { + __aligned_u64 stop_copy_length; +}; + +#define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- cgit v1.2.3 From 2f5d8cef45c30edcf3972d345f606df563d3a48e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:19 +0200 Subject: vfio/mlx5: Fix a typo in mlx5vf_cmd_load_vhca_state() Fix a typo in mlx5vf_cmd_load_vhca_state() to use the 'load' memory layout. As in/out sizes are equal for save and load commands there wasn't any functional issue. 
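For context, MLX5_ST_SZ_DW() sizes the stack array from the mlx5_ifc layout of the named command; its definition in the mlx5 device headers is roughly:

	#define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32)

and the save_*/load_* vhca_state layouts happen to be the same size, which is why the mixed-up names were harmless in practice.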
Fixes: f1d98f346ee3 ("vfio/mlx5: Expose migration commands over mlx5 device") Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221106174630.25909-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index c604b70437a5..0848bc905d3e 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -378,8 +378,8 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf) { struct mlx5_core_dev *mdev; - u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; - u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; u32 pdn, mkey; int err; -- cgit v1.2.3 From b058ea3ab5afea873ab8d976277539ca9e43869a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 29 Nov 2022 13:12:35 +0000 Subject: vfio/iova_bitmap: refactor iova_bitmap_set() to better handle page boundaries Commit f38044e5ef58 ("vfio/iova_bitmap: Fix PAGE_SIZE unaligned bitmaps") had fixed the unaligned bitmaps by capping the remaining iterable set at the start of the bitmap. However, that mistakenly worked around iova_bitmap_set() incorrectly setting bits across a page boundary. Fix this by reworking the loop inside iova_bitmap_set() to iterate over a range of bits to set (cur_bit .. last_bit) which may span different pinned pages, thus updating @page_idx and @offset as it sets the bits. The previous cap to the first page is now always accounted for, rather than only when there's a non-zero pgoff. While at it, make @page_idx, @offset and @nbits unsigned int, given that they won't be more than 512 and 4096 respectively (even a bigger PAGE_SIZE or a smaller struct page size won't make this bigger than the above 32-bit max). Also, delete the stale kdoc on the Return type. Cc: Avihai Horon Fixes: f38044e5ef58 ("vfio/iova_bitmap: Fix PAGE_SIZE unaligned bitmaps") Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Tested-by: Avihai Horon Link: https://lore.kernel.org/r/20221129131235.38880-1-joao.m.martins@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/iova_bitmap.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index de6d6ea5c496..0848f920efb7 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -298,9 +298,7 @@ static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) { unsigned long remaining, bytes; - /* Cap to one page in the first iteration, if PAGE_SIZE unaligned. */ - bytes = !bitmap->mapped.pgoff ? bitmap->mapped.npages << PAGE_SHIFT : - PAGE_SIZE - bitmap->mapped.pgoff; + bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff; remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; remaining = min_t(unsigned long, remaining, @@ -399,29 +397,27 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, * Set the bits corresponding to the range [iova .. iova+length-1] in * the user bitmap. * - * Return: The number of bits set.
*/ void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long iova, size_t length) { struct iova_bitmap_map *mapped = &bitmap->mapped; - unsigned long offset = (iova - mapped->iova) >> mapped->pgshift; - unsigned long nbits = max_t(unsigned long, 1, length >> mapped->pgshift); - unsigned long page_idx = offset / BITS_PER_PAGE; - unsigned long page_offset = mapped->pgoff; - void *kaddr; - - offset = offset % BITS_PER_PAGE; + unsigned long cur_bit = ((iova - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; do { - unsigned long size = min(BITS_PER_PAGE - offset, nbits); + unsigned int page_idx = cur_bit / BITS_PER_PAGE; + unsigned int offset = cur_bit % BITS_PER_PAGE; + unsigned int nbits = min(BITS_PER_PAGE - offset, + last_bit - cur_bit + 1); + void *kaddr; kaddr = kmap_local_page(mapped->pages[page_idx]); - bitmap_set(kaddr + page_offset, offset, size); + bitmap_set(kaddr, offset, nbits); kunmap_local(kaddr); - page_offset = offset = 0; - nbits -= size; - page_idx++; - } while (nbits > 0); + cur_bit += nbits; + } while (cur_bit <= last_bit); } EXPORT_SYMBOL_GPL(iova_bitmap_set); -- cgit v1.2.3 From 8f8bcc8c720c360885639de66fe69756febed824 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:16 -0400 Subject: vfio/pci: Move all the SPAPR PCI specific logic to vfio_pci_core.ko The vfio_spapr_pci_eeh_open/release() functions are one line wrappers around an arch function. Just call them directly. This eliminates some weird exported symbols that don't need to exist. Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/1-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 11 +++++++++-- drivers/vfio/vfio_spapr_eeh.c | 13 ------------- include/linux/vfio.h | 11 ----------- 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 189d4930c276..56501e7ef564 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,6 +27,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#include +#endif #include "vfio_pci_priv.h" @@ -686,7 +689,9 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vdev->sriov_pf_core_dev->vf_token->users--; mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); } - vfio_spapr_pci_eeh_release(vdev->pdev); +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) + eeh_dev_release(vdev->pdev); +#endif vfio_pci_core_disable(vdev); mutex_lock(&vdev->igate); @@ -705,7 +710,9 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { vfio_pci_probe_mmaps(vdev); - vfio_spapr_pci_eeh_open(vdev->pdev); +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) + eeh_dev_open(vdev->pdev); +#endif if (vdev->sriov_pf_core_dev) { mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock); diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c index 67f55ac1d459..c9d102aafbcd 100644 --- a/drivers/vfio/vfio_spapr_eeh.c +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -15,19 +15,6 @@ #define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" #define DRIVER_DESC "VFIO IOMMU SPAPR EEH" -/* We might build address mapping here for "fast" path later */ -void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) -{ - eeh_dev_open(pdev); -} 
-EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open); - -void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) -{ - eeh_dev_release(pdev); -} -EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release); - long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 43b67e46a2cb..9378ca79d548 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -233,21 +233,10 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, int max_irq_type, size_t *data_size); -struct pci_dev; #if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) -void vfio_spapr_pci_eeh_open(struct pci_dev *pdev); -void vfio_spapr_pci_eeh_release(struct pci_dev *pdev); long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg); #else -static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) -{ -} - -static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) -{ -} - static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg) -- cgit v1.2.3 From e5c38a203eb4343993e889eb69f5386f085f25ef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:17 -0400 Subject: vfio/spapr: Move VFIO_CHECK_EXTENSION into tce_iommu_ioctl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PPC64 kconfig is a bit of a rats nest, but it turns out that if CONFIG_SPAPR_TCE_IOMMU is on then EEH must be too: config SPAPR_TCE_IOMMU bool "sPAPR TCE IOMMU Support" depends on PPC_POWERNV || PPC_PSERIES select IOMMU_API help Enables bits of IOMMU API required by VFIO. The iommu_ops is not implemented as it is not necessary for VFIO. config PPC_POWERNV select FORCE_PCI config PPC_PSERIES select FORCE_PCI config EEH bool depends on (PPC_POWERNV || PPC_PSERIES) && PCI default y So, just open code the call to eeh_enabled() into tce_iommu_ioctl(). Reviewed-by: Christoph Hellwig Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_spapr_tce.c | 10 ++++------ drivers/vfio/vfio_spapr_eeh.c | 6 ------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 169f07ac162d..73cec2beae70 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -785,14 +785,12 @@ static long tce_iommu_ioctl(void *iommu_data, switch (arg) { case VFIO_SPAPR_TCE_IOMMU: case VFIO_SPAPR_TCE_v2_IOMMU: - ret = 1; - break; + return 1; + case VFIO_EEH: + return eeh_enabled(); default: - ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg); - break; + return 0; } - - return (ret < 0) ? 0 : ret; } /* diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c index c9d102aafbcd..221b1b637e18 100644 --- a/drivers/vfio/vfio_spapr_eeh.c +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -24,12 +24,6 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, long ret = -EINVAL; switch (cmd) { - case VFIO_CHECK_EXTENSION: - if (arg == VFIO_EEH) - ret = eeh_enabled() ? 
1 : 0; - else - ret = 0; - break; case VFIO_EEH_PE_OP: pe = eeh_iommu_group_to_pe(group); if (!pe) -- cgit v1.2.3 From e276e25819b8a173a21947720bb0a548c0b724b7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:18 -0400 Subject: vfio: Move vfio_spapr_iommu_eeh_ioctl into vfio_iommu_spapr_tce.c As with the previous patch EEH is always enabled if SPAPR_TCE_IOMMU, so move this last bit of code into the main module. Now that this function only processes VFIO_EEH_PE_OP remove a level of indenting as well, it is only called by a case statement that already checked VFIO_EEH_PE_OP. This eliminates an unnecessary module and SPAPR code in a global header. Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Makefile | 1 - drivers/vfio/vfio_iommu_spapr_tce.c | 55 ++++++++++++++++++++++- drivers/vfio/vfio_spapr_eeh.c | 88 ------------------------------------- include/linux/vfio.h | 12 ----- 4 files changed, 53 insertions(+), 103 deletions(-) delete mode 100644 drivers/vfio/vfio_spapr_eeh.c diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index b693a1169286..50b8e8e3fb10 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -10,7 +10,6 @@ vfio-y += vfio_main.o \ obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o -obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ obj-$(CONFIG_VFIO_PLATFORM) += platform/ obj-$(CONFIG_VFIO_MDEV) += mdev/ diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 73cec2beae70..60a50ce8701e 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -4,6 +4,7 @@ * * Copyright (C) 2013 IBM Corp. All rights reserved. * Author: Alexey Kardashevskiy + * Copyright Gavin Shan, IBM Corporation 2014. * * Derived from original vfio_iommu_type1.c: * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 
@@ -773,6 +774,57 @@ static long tce_iommu_create_default_window(struct tce_container *container) return ret; } +static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group, + unsigned long arg) +{ + struct eeh_pe *pe; + struct vfio_eeh_pe_op op; + unsigned long minsz; + + pe = eeh_iommu_group_to_pe(group); + if (!pe) + return -ENODEV; + + minsz = offsetofend(struct vfio_eeh_pe_op, op); + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + if (op.argsz < minsz || op.flags) + return -EINVAL; + + switch (op.op) { + case VFIO_EEH_PE_DISABLE: + return eeh_pe_set_option(pe, EEH_OPT_DISABLE); + case VFIO_EEH_PE_ENABLE: + return eeh_pe_set_option(pe, EEH_OPT_ENABLE); + case VFIO_EEH_PE_UNFREEZE_IO: + return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); + case VFIO_EEH_PE_UNFREEZE_DMA: + return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); + case VFIO_EEH_PE_GET_STATE: + return eeh_pe_get_state(pe); + break; + case VFIO_EEH_PE_RESET_DEACTIVATE: + return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); + case VFIO_EEH_PE_RESET_HOT: + return eeh_pe_reset(pe, EEH_RESET_HOT, true); + case VFIO_EEH_PE_RESET_FUNDAMENTAL: + return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); + case VFIO_EEH_PE_CONFIGURE: + return eeh_pe_configure(pe); + case VFIO_EEH_PE_INJECT_ERR: + minsz = offsetofend(struct vfio_eeh_pe_op, err.mask); + if (op.argsz < minsz) + return -EINVAL; + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + + return eeh_pe_inject_err(pe, op.err.type, op.err.func, + op.err.addr, op.err.mask); + default: + return -EINVAL; + } +} + static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -1044,8 +1096,7 @@ static long tce_iommu_ioctl(void *iommu_data, ret = 0; list_for_each_entry(tcegrp, &container->group_list, next) { - ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp, - cmd, arg); + ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg); if (ret) return ret; } diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c deleted file mode 100644 index 221b1b637e18..000000000000 --- a/drivers/vfio/vfio_spapr_eeh.c +++ /dev/null @@ -1,88 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * EEH functionality support for VFIO devices. The feature is only - * available on sPAPR compatible platforms. - * - * Copyright Gavin Shan, IBM Corporation 2014. 
- */ - -#include -#include -#include -#include - -#define DRIVER_VERSION "0.1" -#define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" -#define DRIVER_DESC "VFIO IOMMU SPAPR EEH" - -long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, - unsigned int cmd, unsigned long arg) -{ - struct eeh_pe *pe; - struct vfio_eeh_pe_op op; - unsigned long minsz; - long ret = -EINVAL; - - switch (cmd) { - case VFIO_EEH_PE_OP: - pe = eeh_iommu_group_to_pe(group); - if (!pe) - return -ENODEV; - - minsz = offsetofend(struct vfio_eeh_pe_op, op); - if (copy_from_user(&op, (void __user *)arg, minsz)) - return -EFAULT; - if (op.argsz < minsz || op.flags) - return -EINVAL; - - switch (op.op) { - case VFIO_EEH_PE_DISABLE: - ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE); - break; - case VFIO_EEH_PE_ENABLE: - ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE); - break; - case VFIO_EEH_PE_UNFREEZE_IO: - ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); - break; - case VFIO_EEH_PE_UNFREEZE_DMA: - ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); - break; - case VFIO_EEH_PE_GET_STATE: - ret = eeh_pe_get_state(pe); - break; - case VFIO_EEH_PE_RESET_DEACTIVATE: - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); - break; - case VFIO_EEH_PE_RESET_HOT: - ret = eeh_pe_reset(pe, EEH_RESET_HOT, true); - break; - case VFIO_EEH_PE_RESET_FUNDAMENTAL: - ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); - break; - case VFIO_EEH_PE_CONFIGURE: - ret = eeh_pe_configure(pe); - break; - case VFIO_EEH_PE_INJECT_ERR: - minsz = offsetofend(struct vfio_eeh_pe_op, err.mask); - if (op.argsz < minsz) - return -EINVAL; - if (copy_from_user(&op, (void __user *)arg, minsz)) - return -EFAULT; - - ret = eeh_pe_inject_err(pe, op.err.type, op.err.func, - op.err.addr, op.err.mask); - break; - default: - ret = -EINVAL; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl); - -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 9378ca79d548..b4d5d4ca3d7d 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -233,18 +233,6 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, int max_irq_type, size_t *data_size); -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) -long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, - unsigned long arg); -#else -static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, - unsigned int cmd, - unsigned long arg) -{ - return -ENOTTY; -} -#endif /* CONFIG_VFIO_SPAPR_EEH */ - /* * IRQfd - generic */ -- cgit v1.2.3 From 20601c45a0fa20bbb5545f4dd69f4f18448f4973 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:19 -0400 Subject: vfio: Remove CONFIG_VFIO_SPAPR_EEH We don't need a kconfig symbol for this, just directly test CONFIG_EEH in the few places that need it. 
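For illustration, the call sites reduce to a plain compile-time guard; a minimal sketch of the pattern (the example_* function is hypothetical, eeh_dev_open() is the powerpc EEH API used in the hunks below):

    #if IS_ENABLED(CONFIG_EEH)
    #include <asm/eeh.h>
    #endif

    static void example_finish_enable(struct pci_dev *pdev)
    {
    #if IS_ENABLED(CONFIG_EEH)
            /* compiled in only when the architecture provides EEH */
            eeh_dev_open(pdev);
    #endif
    }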
Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 5 ----- drivers/vfio/pci/vfio_pci_core.c | 6 +++--- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 86c381ceb9a1..d25b91adfd64 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -20,11 +20,6 @@ config VFIO_IOMMU_SPAPR_TCE depends on SPAPR_TCE_IOMMU default VFIO -config VFIO_SPAPR_EEH - tristate - depends on EEH && VFIO_IOMMU_SPAPR_TCE - default VFIO - config VFIO_VIRQFD tristate select EVENTFD diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 56501e7ef564..f9365a5bc961 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,7 +27,7 @@ #include #include #include -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) #include #endif @@ -689,7 +689,7 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vdev->sriov_pf_core_dev->vf_token->users--; mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); } -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) eeh_dev_release(vdev->pdev); #endif vfio_pci_core_disable(vdev); @@ -710,7 +710,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { vfio_pci_probe_mmaps(vdev); -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) eeh_dev_open(vdev->pdev); #endif -- cgit v1.2.3 From e2d55709398e62cf53e5c7df3758ae52cc62d63a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:20 -0400 Subject: vfio: Fold vfio_virqfd.ko into vfio.ko This is only 1.8k, putting it in its own module is not really necessary. The kconfig infrastructure is still there to completely remove it for systems that are trying for small footprint. Put it in the main vfio.ko module now that kbuild can support multiple .c files. 
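For reference, the usual pattern for an optionally compiled sub-object is a header stub so that callers need no #ifdefs of their own; roughly (a sketch mirroring the vfio.h hunk below):

    /* vfio.h */
    #if IS_ENABLED(CONFIG_VFIO_VIRQFD)
    int __init vfio_virqfd_init(void);
    void vfio_virqfd_exit(void);
    #else
    static inline int __init vfio_virqfd_init(void)
    {
            return 0;
    }
    static inline void vfio_virqfd_exit(void)
    {
    }
    #endif

With the stubs in place, vfio_main.c can call vfio_virqfd_init()/vfio_virqfd_exit() unconditionally.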
Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/5-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 2 +- drivers/vfio/Makefile | 4 +--- drivers/vfio/vfio.h | 13 +++++++++++++ drivers/vfio/vfio_main.c | 7 +++++++ drivers/vfio/virqfd.c | 17 +++-------------- 5 files changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index d25b91adfd64..0b8d53f63c7e 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -21,7 +21,7 @@ config VFIO_IOMMU_SPAPR_TCE default VFIO config VFIO_VIRQFD - tristate + bool select EVENTFD default n diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 50b8e8e3fb10..0721ed4831c9 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,13 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -vfio_virqfd-y := virqfd.o - obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ iova_bitmap.o \ container.o +vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o -obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_PCI) += pci/ diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index bcad54bbab08..a7113b4baaa2 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -124,6 +124,19 @@ long vfio_container_ioctl_check_extension(struct vfio_container *container, int __init vfio_container_init(void); void vfio_container_cleanup(void); +#if IS_ENABLED(CONFIG_VFIO_VIRQFD) +int __init vfio_virqfd_init(void); +void vfio_virqfd_exit(void); +#else +static inline int __init vfio_virqfd_init(void) +{ + return 0; +} +static inline void vfio_virqfd_exit(void) +{ +} +#endif + #ifdef CONFIG_VFIO_NOIOMMU extern bool vfio_noiommu __read_mostly; #else diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 662e267a3e13..7f88569c3eba 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1832,6 +1832,10 @@ static int __init vfio_init(void) if (ret) return ret; + ret = vfio_virqfd_init(); + if (ret) + goto err_virqfd; + /* /dev/vfio/$GROUP */ vfio.class = class_create(THIS_MODULE, "vfio"); if (IS_ERR(vfio.class)) { @@ -1862,6 +1866,8 @@ err_dev_class: class_destroy(vfio.class); vfio.class = NULL; err_group_class: + vfio_virqfd_exit(); +err_virqfd: vfio_container_cleanup(); return ret; } @@ -1876,6 +1882,7 @@ static void __exit vfio_cleanup(void) class_destroy(vfio.device_class); vfio.device_class = NULL; class_destroy(vfio.class); + vfio_virqfd_exit(); vfio_container_cleanup(); vfio.class = NULL; xa_destroy(&vfio_device_set_xa); diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c index 414e98d82b02..497a17b37865 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -12,15 +12,12 @@ #include #include #include - -#define DRIVER_VERSION "0.1" -#define DRIVER_AUTHOR "Alex Williamson " -#define DRIVER_DESC "IRQFD support for VFIO bus drivers" +#include "vfio.h" static struct workqueue_struct *vfio_irqfd_cleanup_wq; static DEFINE_SPINLOCK(virqfd_lock); -static int __init vfio_virqfd_init(void) +int __init vfio_virqfd_init(void) { vfio_irqfd_cleanup_wq = create_singlethread_workqueue("vfio-irqfd-cleanup"); @@ -30,7 +27,7 @@ static int __init vfio_virqfd_init(void) return 0; } -static void __exit vfio_virqfd_exit(void) +void vfio_virqfd_exit(void) { destroy_workqueue(vfio_irqfd_cleanup_wq); } @@ -216,11 +213,3 @@ void vfio_virqfd_disable(struct virqfd **pvirqfd) 
flush_workqueue(vfio_irqfd_cleanup_wq); } EXPORT_SYMBOL_GPL(vfio_virqfd_disable); - -module_init(vfio_virqfd_init); -module_exit(vfio_virqfd_exit); - -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); -- cgit v1.2.3 From ce3895735cc26957dc6b2a8f5af07ddab09483ae Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 2 Dec 2022 09:46:15 -0700 Subject: vfio/ap/ccw/samples: Fix device_register() unwind path We always need to call put_device() if device_register() fails. All vfio drivers calling device_register() include a similar unwind stack via gotos, therefore split device_unregister() into its device_del() and put_device() components in the unwind path, and add a goto target to handle only the put_device() requirement. Reported-by: Ruan Jinjie Link: https://lore.kernel.org/all/20221118032827.3725190-1-ruanjinjie@huawei.com Fixes: d61fc96f47fd ("sample: vfio mdev display - host device") Fixes: 9d1a546c53b4 ("docs: Sample driver to demonstrate how to use Mediated device framework.") Fixes: a5e6e6505f38 ("sample: vfio bochs vbe display (host device for bochs-drm)") Fixes: 9e6f07cd1eaa ("vfio/ccw: create a parent struct") Fixes: 36360658eb5a ("s390: vfio_ap: link the vfio_ap devices to the vfio_ap bus subsystem") Cc: Tony Krowiak Cc: Halil Pasic Cc: Jason Herne Cc: Kirti Wankhede Reviewed-by: Kevin Tian Reviewed-by: Eric Farman Reviewed-by: Tony Krowiak Reviewed-by: Jason J. Herne Link: https://lore.kernel.org/r/166999942139.645727.12439756512449846442.stgit@omen Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 3 ++- drivers/s390/crypto/vfio_ap_drv.c | 2 +- samples/vfio-mdev/mbochs.c | 7 ++++--- samples/vfio-mdev/mdpy.c | 7 ++++--- samples/vfio-mdev/mtty.c | 7 ++++--- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index c2a65808605a..54aba7cceb33 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -199,8 +199,9 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) return 0; out_unreg: - device_unregister(&parent->dev); + device_del(&parent->dev); out_free: + put_device(&parent->dev); dev_set_drvdata(&sch->dev, NULL); return ret; } diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c index f43cfeabd2cc..997b524bdd2b 100644 --- a/drivers/s390/crypto/vfio_ap_drv.c +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -122,7 +122,7 @@ static int vfio_ap_matrix_dev_create(void) return 0; matrix_drv_err: - device_unregister(&matrix_dev->device); + device_del(&matrix_dev->device); matrix_reg_err: put_device(&matrix_dev->device); matrix_alloc_err: diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 8b5a3a778a25..e54eb752e1ba 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1430,7 +1430,7 @@ static int __init mbochs_dev_init(void) ret = device_register(&mbochs_dev); if (ret) - goto err_class; + goto err_put; ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver, mbochs_mdev_types, @@ -1441,8 +1441,9 @@ static int __init mbochs_dev_init(void) return 0; err_device: - device_unregister(&mbochs_dev); -err_class: + device_del(&mbochs_dev); +err_put: + put_device(&mbochs_dev); class_destroy(mbochs_class); err_driver: mdev_unregister_driver(&mbochs_driver); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 721fb06c6413..e8400fdab71d 100644 --- a/samples/vfio-mdev/mdpy.c +++ 
b/samples/vfio-mdev/mdpy.c @@ -717,7 +717,7 @@ static int __init mdpy_dev_init(void) ret = device_register(&mdpy_dev); if (ret) - goto err_class; + goto err_put; ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver, mdpy_mdev_types, @@ -728,8 +728,9 @@ return 0; err_device: - device_unregister(&mdpy_dev); -err_class: + device_del(&mdpy_dev); +err_put: + put_device(&mdpy_dev); class_destroy(mdpy_class); err_driver: mdev_unregister_driver(&mdpy_driver); diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 3c2a421b9b69..e887de672c52 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1330,7 +1330,7 @@ static int __init mtty_dev_init(void) ret = device_register(&mtty_dev.dev); if (ret) - goto err_class; + goto err_put; ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev, &mtty_driver, mtty_mdev_types, @@ -1340,8 +1340,9 @@ return 0; err_device: - device_unregister(&mtty_dev.dev); -err_class: + device_del(&mtty_dev.dev); +err_put: + put_device(&mtty_dev.dev); class_destroy(mtty_dev.vd_class); err_driver: mdev_unregister_driver(&mtty_driver); -- cgit v1.2.3 From c943a9374d12bebca2d0de7e273b1c723b58c122 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 10:34:25 +0200 Subject: net/mlx5: Introduce ifc bits for pre_copy Introduce ifc related stuff to enable PRE_COPY of VF during migration. Signed-off-by: Shay Drory Acked-by: Leon Romanovsky Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6f..230a96626a5f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1882,7 +1882,12 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 max_reformat_remove_size[0x8]; u8 max_reformat_remove_offset[0x8]; - u8 reserved_at_c0[0xe0]; + u8 reserved_at_c0[0x8]; + u8 migration_multi_load[0x1]; + u8 migration_tracking_state[0x1]; + u8 reserved_at_ca[0x16]; + + u8 reserved_at_e0[0xc0]; u8 reserved_at_1a0[0xb]; u8 log_min_mkey_entity_size[0x5]; @@ -11918,7 +11923,8 @@ struct mlx5_ifc_query_vhca_migration_state_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x10]; + u8 incremental[0x1]; + u8 reserved_at_41[0xf]; u8 vhca_id[0x10]; u8 reserved_at_60[0x20]; @@ -11944,7 +11950,9 @@ struct mlx5_ifc_save_vhca_state_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x10]; + u8 incremental[0x1]; + u8 set_track[0x1]; + u8 reserved_at_42[0xe]; u8 vhca_id[0x10]; u8 reserved_at_60[0x20]; -- cgit v1.2.3 From 4db52602a6074e9cc523500b8304600ff63e7b85 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Dec 2022 10:34:26 +0200 Subject: vfio: Extend the device migration protocol with PRE_COPY The optional PRE_COPY states open the saving data transfer FD before reaching STOP_COPY and allow the device to dirty-track internal state changes, with the general idea of reducing the volume of data transferred in the STOP_COPY stage. While in PRE_COPY the device remains RUNNING, but the saving FD is open. Only if the device also supports RUNNING_P2P can it support PRE_COPY_P2P, which halts P2P transfers while continuing the saving FD.
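As a rough userspace sketch of the intended flow (illustrative only, not part of this patch; the precopy ioctl, vfio_precopy_info and initial_bytes are the uAPI additions below, error handling elided; needs <sys/ioctl.h> and <linux/vfio.h>):

    __u8 buf[sizeof(struct vfio_device_feature) +
             sizeof(struct vfio_device_feature_mig_state)] = {};
    struct vfio_device_feature *feat = (void *)buf;
    struct vfio_device_feature_mig_state *mig = (void *)feat->data;
    struct vfio_precopy_info info = { .argsz = sizeof(info) };

    feat->argsz = sizeof(buf);
    feat->flags = VFIO_DEVICE_FEATURE_SET |
                  VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig->device_state = VFIO_DEVICE_STATE_PRE_COPY;
    ioctl(device_fd, VFIO_DEVICE_FEATURE, feat);  /* opens mig->data_fd */

    do {
            ioctl(mig->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info);
            /* read() precopy data from mig->data_fd here */
    } while (info.initial_bytes);  /* then request STOP_COPY */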
PRE_COPY, with P2P support, requires the driver to implement 7 new arcs, and exists as an optional FSM branch between RUNNING and STOP_COPY: RUNNING -> PRE_COPY -> PRE_COPY_P2P -> STOP_COPY A new ioctl VFIO_MIG_GET_PRECOPY_INFO is provided to allow userspace to query the progress of the precopy operation in the driver, with the idea that it will judge when to move to STOP_COPY, at least once the initial data set is transferred, and possibly after the dirty size has shrunk appropriately. This ioctl is valid only in PRE_COPY states and the kernel driver should return -EINVAL from any other migration state. Compared to the v1 clarification, STOP_COPY -> PRE_COPY is blocked and left to be defined in the future. We also split the pending_bytes report into the initial and sustaining values, i.e. initial_bytes and dirty_bytes. initial_bytes: Amount of initial precopy data. dirty_bytes: Device state changes relative to data previously retrieved. These fields are not required to have any bearing on the STOP_COPY phase. It is recommended to leave PRE_COPY for STOP_COPY only after the initial_bytes field reaches zero. Leaving PRE_COPY earlier might make things slower. Signed-off-by: Jason Gunthorpe Signed-off-by: Shay Drory Reviewed-by: Kevin Tian Reviewed-by: Shameer Kolothum Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 74 +++++++++++++++++++++++++++- include/uapi/linux/vfio.h | 123 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 191 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 7f88569c3eba..03dbcd3d96f0 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1042,7 +1042,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, enum vfio_device_mig_state new_fsm, enum vfio_device_mig_state *next_fsm) { - enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 }; + enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 }; /* * The coding in this table requires the driver to implement the * following FSM arcs: @@ -1057,30 +1057,65 @@ int vfio_mig_get_next_state(struct vfio_device *device, * RUNNING_P2P -> RUNNING * RUNNING_P2P -> STOP * STOP -> RUNNING_P2P - * Without P2P the driver must implement: + * + * If precopy is supported then the driver must support these additional + * FSM arcs: + * RUNNING -> PRE_COPY + * PRE_COPY -> RUNNING + * PRE_COPY -> STOP_COPY + * However, if precopy and P2P are supported together then the driver + * must support these additional arcs beyond the P2P arcs above: + * PRE_COPY -> RUNNING + * PRE_COPY -> PRE_COPY_P2P + * PRE_COPY_P2P -> PRE_COPY + * PRE_COPY_P2P -> RUNNING_P2P + * PRE_COPY_P2P -> STOP_COPY + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P + * + * Without P2P and precopy the driver must implement: * RUNNING -> STOP * STOP -> RUNNING * * The coding will step through multiple states for some combination * transitions; if all optional features are supported, this means the * following ones: + * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY + * PRE_COPY -> RUNNING -> RUNNING_P2P + * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP + * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING + * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING + * PRE_COPY_P2P -> RUNNING_P2P -> STOP + * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING * RESUMING -> STOP -> RUNNING_P2P + * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P * RESUMING -> STOP -> RUNNING_P2P -> RUNNING + * RESUMING ->
STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY * RESUMING -> STOP -> STOP_COPY + * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P * RUNNING -> RUNNING_P2P -> STOP * RUNNING -> RUNNING_P2P -> STOP -> RESUMING * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY + * RUNNING_P2P -> RUNNING -> PRE_COPY * RUNNING_P2P -> STOP -> RESUMING * RUNNING_P2P -> STOP -> STOP_COPY + * STOP -> RUNNING_P2P -> PRE_COPY_P2P * STOP -> RUNNING_P2P -> RUNNING + * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY * STOP_COPY -> STOP -> RESUMING * STOP_COPY -> STOP -> RUNNING_P2P * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING + * + * The following transitions are blocked: + * STOP_COPY -> PRE_COPY + * STOP_COPY -> PRE_COPY_P2P */ static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = { [VFIO_DEVICE_STATE_STOP] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, @@ -1089,14 +1124,38 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RUNNING] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, }, + [VFIO_DEVICE_STATE_PRE_COPY] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, [VFIO_DEVICE_STATE_STOP_COPY] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, @@ -1105,6 +1164,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RESUMING] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = 
VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, @@ -1113,6 +1174,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RUNNING_P2P] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, @@ -1121,6 +1184,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_ERROR] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR, @@ -1131,6 +1196,11 @@ int vfio_mig_get_next_state(struct vfio_device *device, static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = { [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY, + [VFIO_DEVICE_STATE_PRE_COPY] = + VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY | + VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RUNNING_P2P] = diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 3e45dbaf190e..23105eb036fa 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -819,12 +819,20 @@ struct vfio_device_feature { * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P * is supported in addition to the STOP_COPY states. * + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY means that + * PRE_COPY is supported in addition to the STOP_COPY states. + * + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY + * means that RUNNING_P2P, PRE_COPY and PRE_COPY_P2P are supported + * in addition to the STOP_COPY states. + * * Other combinations of flags have behavior to be defined in the future. 
*/ struct vfio_device_feature_migration { __aligned_u64 flags; #define VFIO_MIGRATION_STOP_COPY (1 << 0) #define VFIO_MIGRATION_P2P (1 << 1) +#define VFIO_MIGRATION_PRE_COPY (1 << 2) }; #define VFIO_DEVICE_FEATURE_MIGRATION 1 @@ -875,8 +883,13 @@ struct vfio_device_feature_mig_state { * RESUMING - The device is stopped and is loading a new internal state * ERROR - The device has failed and must be reset * - * And 1 optional state to support VFIO_MIGRATION_P2P: + * And optional states to support VFIO_MIGRATION_P2P: * RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA + * And VFIO_MIGRATION_PRE_COPY: + * PRE_COPY - The device is running normally but tracking internal state + * changes + * And VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY: + * PRE_COPY_P2P - PRE_COPY, except the device cannot do peer to peer DMA * * The FSM takes actions on the arcs between FSM states. The driver implements * the following behavior for the FSM arcs: @@ -908,20 +921,48 @@ struct vfio_device_feature_mig_state { * * To abort a RESUMING session the device must be reset. * + * PRE_COPY -> RUNNING * RUNNING_P2P -> RUNNING * While in RUNNING the device is fully operational, the device may generate * interrupts, DMA, respond to MMIO, all vfio device regions are functional, * and the device may advance its internal state. * + * The PRE_COPY arc will terminate a data transfer session. + * + * PRE_COPY_P2P -> RUNNING_P2P * RUNNING -> RUNNING_P2P * STOP -> RUNNING_P2P * While in RUNNING_P2P the device is partially running in the P2P quiescent * state defined below. * + * The PRE_COPY_P2P arc will terminate a data transfer session. + * + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P * STOP -> STOP_COPY - * This arc begin the process of saving the device state and will return a - * new data_fd. + * PRE_COPY, PRE_COPY_P2P and STOP_COPY form the "saving group" of states + * which share a data transfer session. Moving between these states alters + * what is streamed in the session, but does not terminate or otherwise affect + * the associated fd. + * + * These arcs begin the process of saving the device state and will return a + * new data_fd. The migration driver may perform actions such as enabling + * dirty logging of device state when entering PRE_COPY or PRE_COPY_P2P. + * + * Each arc does not change the device operation, the device remains + * RUNNING, P2P quiesced or in STOP. The STOP_COPY state is described below + * in PRE_COPY_P2P -> STOP_COPY. + * + * PRE_COPY -> PRE_COPY_P2P + * Entering PRE_COPY_P2P continues all the behaviors of PRE_COPY above. + * However, while in the PRE_COPY_P2P state, the device is partially running + * in the P2P quiescent state defined below, like RUNNING_P2P. + * + * PRE_COPY_P2P -> PRE_COPY + * This arc allows returning the device to a full RUNNING behavior while + * continuing all the behaviors of PRE_COPY. + * + * PRE_COPY_P2P -> STOP_COPY * While in the STOP_COPY state the device has the same behavior as STOP * with the addition that the data transfer session continues to stream the * migration state. End of stream on the FD indicates the entire device @@ -939,6 +980,13 @@ struct vfio_device_feature_mig_state { * device state for this arc if required to prepare the device to receive the * migration data. * + * STOP_COPY -> PRE_COPY + * STOP_COPY -> PRE_COPY_P2P + * These arcs are not permitted and return error if requested.
Future + * revisions of this API may define behaviors for these arcs, in this case + * support will be discoverable by a new flag in + * VFIO_DEVICE_FEATURE_MIGRATION. + * * any -> ERROR * ERROR cannot be specified as a device state, however any transition request * can be failed with an errno return and may then move the device_state into @@ -950,7 +998,7 @@ struct vfio_device_feature_mig_state { * The optional peer to peer (P2P) quiescent state is intended to be a quiescent * state for the device for the purposes of managing multiple devices within a * user context where peer-to-peer DMA between devices may be active. The - * RUNNING_P2P states must prevent the device from initiating + * RUNNING_P2P and PRE_COPY_P2P states must prevent the device from initiating * any new P2P DMA transactions. If the device can identify P2P transactions * then it can stop only P2P DMA, otherwise it must stop all DMA. The migration * driver must complete any such outstanding operations prior to completing the @@ -963,6 +1011,8 @@ struct vfio_device_feature_mig_state { * above FSM arcs. As there are multiple paths through the FSM arcs the path * should be selected based on the following rules: * - Select the shortest path. + * - The path cannot have saving group states as interior arcs, only + * starting/end states. * Refer to vfio_mig_get_next_state() for the result of the algorithm. * * The automatic transit through the FSM arcs that make up the combination @@ -976,6 +1026,9 @@ struct vfio_device_feature_mig_state { * support them. The user can discover if these states are supported by using * VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can * avoid knowing about these optional states if the kernel driver supports them. + * + * Arcs touching PRE_COPY and PRE_COPY_P2P are removed if support for PRE_COPY + * is not present. */ enum vfio_device_mig_state { VFIO_DEVICE_STATE_ERROR = 0, @@ -984,8 +1037,70 @@ enum vfio_device_mig_state { VFIO_DEVICE_STATE_STOP_COPY = 3, VFIO_DEVICE_STATE_RESUMING = 4, VFIO_DEVICE_STATE_RUNNING_P2P = 5, + VFIO_DEVICE_STATE_PRE_COPY = 6, + VFIO_DEVICE_STATE_PRE_COPY_P2P = 7, +}; + +/** + * VFIO_MIG_GET_PRECOPY_INFO - _IO(VFIO_TYPE, VFIO_BASE + 21) + * + * This ioctl is used on the migration data FD in the precopy phase of the + * migration data transfer. It returns an estimate of the current data sizes + * remaining to be transferred. It allows the user to judge when it is + * appropriate to leave PRE_COPY for STOP_COPY. + * + * This ioctl is valid only in PRE_COPY states and kernel driver should + * return -EINVAL from any other migration state. + * + * The vfio_precopy_info data structure returned by this ioctl provides + * estimates of data available from the device during the PRE_COPY states. + * This estimate is split into two categories, initial_bytes and + * dirty_bytes. + * + * The initial_bytes field indicates the amount of initial precopy + * data available from the device. This field should have a non-zero initial + * value and decrease as migration data is read from the device. + * It is recommended to leave PRE_COPY for STOP_COPY only after this field + * reaches zero. Leaving PRE_COPY earlier might make things slower. + * + * The dirty_bytes field tracks device state changes relative to data + * previously retrieved. This field starts at zero and may increase as + * the internal device state is modified or decrease as that modified + * state is read from the device. 
+ * + * Userspace may use the combination of these fields to estimate the + * potential data size available during the PRE_COPY phases, as well as + * trends relative to the rate the device is dirtying its internal + * state, but these fields are not required to have any bearing relative + * to the data size available during the STOP_COPY phase. + * + * Drivers have a lot of flexibility in when and what they transfer during the + * PRE_COPY phase, and how they report this from VFIO_MIG_GET_PRECOPY_INFO. + * + * During pre-copy the migration data FD has a temporary "end of stream" that is + * reached when both initial_bytes and dirty_bytes are zero. For instance, this + * may indicate that the device is idle and not currently dirtying any internal + * state. When read() is done on this temporary end of stream the kernel driver + * should return ENOMSG from read(). Userspace can wait for more data (which may + * never come) by using poll. + * + * Once in STOP_COPY the migration data FD has a permanent end of stream + * signaled in the usual way by read() always returning 0 and poll always + * returning readable. ENOMSG may not be returned in STOP_COPY. + * Support for this ioctl is mandatory if a driver claims to support + * VFIO_MIGRATION_PRE_COPY. + * + * Return: 0 on success, -1 and errno set on failure. + */ +struct vfio_precopy_info { + __u32 argsz; + __u32 flags; + __aligned_u64 initial_bytes; + __aligned_u64 dirty_bytes; }; +#define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21) + /* * Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power * state with the platform-based power management. Device use of lower power -- cgit v1.2.3 From 0e7caa65d707b93fbb4322c6313f739fa9103dfa Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:27 +0200 Subject: vfio/mlx5: Enforce a single SAVE command at a time Enforce a single SAVE command at a time. As the SAVE command is an asynchronous one, we must enforce running only a single command at a time. This will preserve ordering between multiple calls and protect from races on the migration file data structure. This is a must for the next patches in the series, where, as part of PRE_COPY, we may have multiple images to be saved and multiple SAVE commands may be issued from different flows.
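The pattern is a binary semaphore built from a completion; a minimal standalone sketch of the idea (illustrative only, not the driver code itself; the example_* names are hypothetical):

    #include <linux/completion.h>

    static struct completion save_comp;

    static void example_setup(void)
    {
            init_completion(&save_comp);
            complete(&save_comp);           /* start "released" */
    }

    static int example_issue_save(void)
    {
            /* "down": wait until the previous async SAVE finished */
            int err = wait_for_completion_interruptible(&save_comp);

            if (err)
                    return err;             /* interrupted by a signal */
            /* ... issue the async SAVE command here ... */
            return 0;
    }

    static void example_save_done(void)     /* async completion callback */
    {
            complete(&save_comp);           /* "up": allow the next SAVE */
    }

A completion is used rather than a mutex because the "unlock" happens from a different kernel thread than the "lock", which lockdep cannot model for a mutex.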
Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 6 ++++++ drivers/vfio/pci/mlx5/cmd.h | 1 + drivers/vfio/pci/mlx5/main.c | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0848bc905d3e..55ee8036f59c 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -281,6 +281,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); + complete(&migf->save_comp); fput(migf->filp); } @@ -321,6 +322,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return -ENOTCONN; mdev = mvdev->mdev; + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + err = mlx5_core_alloc_pd(mdev, &pdn); if (err) return err; @@ -371,6 +376,7 @@ err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: mlx5_core_dealloc_pd(mdev, pdn); + complete(&migf->save_comp); return err; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 921d5720a1e5..8ffa7699872c 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -37,6 +37,7 @@ struct mlx5_vf_migration_file { unsigned long last_offset; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; + struct completion save_comp; struct mlx5_async_ctx async_ctx; struct mlx5vf_async_data async_data; }; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 6e9cf2aacc52..0d71ebb2a972 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -245,6 +245,13 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); + init_completion(&migf->save_comp); + /* + * save_comp is being used as a binary semaphore built from + * a completion. A normal mutex cannot be used because the lock is + * passed between kernel threads and lockdep can't model this. + */ + complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, -- cgit v1.2.3 From 9945a67ea4b30657dd998c7fbbea1b3950747168 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:28 +0200 Subject: vfio/mlx5: Refactor PD usage This patch refactors PD usage so that its life cycle matches that of the migration file, instead of allocating/destroying it upon each SAVE/LOAD command. This is a preparation step towards the PRE_COPY series where multiple images will be SAVED/LOADED and a single PD can be simply reused.
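The resulting lifetime rule is simple, sketched schematically below (illustrative flow only, using the helpers this patch adds; error handling elided):

    migf = kzalloc(sizeof(*migf), GFP_KERNEL);
    ret = mlx5vf_cmd_alloc_pd(migf);    /* once per migration file; fills migf->pdn */

    /* every SAVE/LOAD command now reuses migf->pdn instead of
     * calling mlx5_core_alloc_pd()/mlx5_core_dealloc_pd() itself */

    mlx5vf_cmd_dealloc_pd(migf);        /* on migration file teardown */
    kfree(migf);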
Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 53 +++++++++++++++++++++++++++++--------------- drivers/vfio/pci/mlx5/cmd.h | 5 ++++- drivers/vfio/pci/mlx5/main.c | 44 ++++++++++++++++++++++++++---------- 3 files changed, 71 insertions(+), 31 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 55ee8036f59c..a97eac49e3d6 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -279,7 +279,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5_core_destroy_mkey(mdev, async_data->mkey); dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); - mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -314,7 +313,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5vf_async_data *async_data; struct mlx5_core_dev *mdev; - u32 pdn, mkey; + u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -326,16 +325,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) return err; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); if (err) goto err_dma_map; - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); + err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); if (err) goto err_create_mkey; @@ -357,7 +352,6 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, migf->total_length = 0; get_file(migf->filp); async_data->mkey = mkey; - async_data->pdn = pdn; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -375,7 +369,6 @@ err_out: err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: - mlx5_core_dealloc_pd(mdev, pdn); complete(&migf->save_comp); return err; } @@ -386,7 +379,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; - u32 pdn, mkey; + u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -400,15 +393,11 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, } mdev = mvdev->mdev; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - goto end; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); if (err) - goto err_reg; + goto end; - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); + err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); if (err) goto err_mkey; @@ -424,13 +413,41 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, mlx5_core_destroy_mkey(mdev, mkey); err_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -err_reg: - mlx5_core_dealloc_pd(mdev, pdn); end: mutex_unlock(&migf->lock); return err; } +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) +{ + int err; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return -ENOTCONN; + + err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); + return err; +} + +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return; + + 
mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); +} + +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + + WARN_ON(migf->mvdev->mdev_detach); + + mlx5vf_cmd_dealloc_pd(migf); +} + static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, u32 req_nodes) { diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 8ffa7699872c..ba760f956d53 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -16,7 +16,6 @@ struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; int status; - u32 pdn; u32 mkey; void *out; }; @@ -27,6 +26,7 @@ struct mlx5_vf_migration_file { u8 disabled:1; u8 is_err:1; + u32 pdn; struct sg_append_table table; size_t total_length; size_t allocated_length; @@ -127,6 +127,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 0d71ebb2a972..1916f7c1468c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -236,12 +236,15 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, O_RDONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); @@ -257,20 +260,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &migf->total_length); if (ret) - goto out_free; + goto out_pd; ret = mlx5vf_add_migration_pages( migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); if (ret) - goto out_free; + goto out_pd; - migf->mvdev = mvdev; ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); if (ret) - goto out_free; + goto out_save; return migf; +out_save: + mlx5vf_disable_fd(migf); +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); +end: + kfree(migf); return ERR_PTR(ret); } @@ -352,6 +360,7 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); if (!migf) @@ -360,20 +369,30 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, O_WRONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); return migf; +out_free: + fput(migf->filp); +end: + kfree(migf); + 
return ERR_PTR(ret); } void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) { if (mvdev->resuming_migf) { mlx5vf_disable_fd(mvdev->resuming_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); fput(mvdev->resuming_migf->filp); mvdev->resuming_migf = NULL; } @@ -381,6 +400,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; } -- cgit v1.2.3 From 91454f8b9bf4ce6be1d9a0b4de401bc3c6313a95 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:29 +0200 Subject: vfio/mlx5: Refactor MKEY usage This patch refactors MKEY usage so that its life cycle matches that of the migration file, instead of allocating and destroying it upon each SAVE/LOAD command. This is a preparation step towards the PRE_COPY series, where multiple images will be SAVED/LOADED. We achieve this with a new struct named mlx5_vhca_data_buffer, which holds the mkey and its related fields, such as the sg_append_table, allocated_length, etc. These fields were moved out of the migration file's main struct into the dedicated mlx5_vhca_data_buffer struct, with the proper helpers in place. For now there is a single mlx5_vhca_data_buffer per migration file; however, coming patches will use multiple of them to support multiple images. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 162 +++++++++++++++++++++++++++---------------- drivers/vfio/pci/mlx5/cmd.h | 37 +++++++--- drivers/vfio/pci/mlx5/main.c | 92 +++++++++++++----------- 3 files changed, 178 insertions(+), 113 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index a97eac49e3d6..ed4c472d2eae 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -210,11 +210,11 @@ err_exec: } static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf, struct mlx5_vhca_recv_buf *recv_buf, u32 *mkey) { - size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) : + size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : recv_buf->npages; int err = 0, inlen; __be64 *mtt; @@ -232,10 +232,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, DIV_ROUND_UP(npages, 2)); mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - if (migf) { + if (buf) { struct sg_dma_page_iter dma_iter; - for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) + for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); } else { int i; @@ -255,20 +255,99 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); - MLX5_SET64(mkc, mkc, len, - migf ?
migf->total_length : (npages * PAGE_SIZE)); + MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); kvfree(in); return err; } +static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; + int ret; + + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) + return -ENOTCONN; + + if (buf->dmaed || !buf->allocated_length) + return -EINVAL; + + ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + if (ret) + return ret; + + ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); + if (ret) + goto err; + + buf->dmaed = true; + + return 0; +err: + dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + return ret; +} + +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5_vf_migration_file *migf = buf->migf; + struct sg_page_iter sg_iter; + + lockdep_assert_held(&migf->mvdev->state_mutex); + WARN_ON(migf->mvdev->mdev_detach); + + if (buf->dmaed) { + mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, + buf->dma_dir, 0); + } + + /* Undo alloc_pages_bulk_array() */ + for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) + __free_page(sg_page_iter_page(&sg_iter)); + sg_free_append_table(&buf->table); + kfree(buf); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, + enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf; + int ret; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + buf->dma_dir = dma_dir; + buf->migf = migf; + if (length) { + ret = mlx5vf_add_migration_pages(buf, + DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (ret) + goto end; + + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } + + return buf; +end: + mlx5vf_free_data_buffer(buf); + return ERR_PTR(ret); +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, struct mlx5vf_async_data, work); struct mlx5_vf_migration_file *migf = container_of(async_data, struct mlx5_vf_migration_file, async_data); - struct mlx5_core_dev *mdev = migf->mvdev->mdev; mutex_lock(&migf->lock); if (async_data->status) { @@ -276,9 +355,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); - - mlx5_core_destroy_mkey(mdev, async_data->mkey); - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -292,7 +368,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->total_length, + WRITE_ONCE(migf->buf->length, MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size)); wake_up_interruptible(&migf->poll_wait); @@ -307,39 +383,28 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) } int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5vf_async_data *async_data; - struct mlx5_core_dev *mdev; - u32 mkey; int err; 
lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mdev = mvdev->mdev; err = wait_for_completion_interruptible(&migf->save_comp); if (err) return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, - 0); - if (err) - goto err_dma_map; - - err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); - if (err) - goto err_create_mkey; - MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(save_vhca_state_in, in, mkey, mkey); - MLX5_SET(save_vhca_state_in, in, size, migf->total_length); + MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; async_data->out = kvzalloc(out_size, GFP_KERNEL); @@ -348,10 +413,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, goto err_out; } - /* no data exists till the callback comes back */ - migf->total_length = 0; get_file(migf->filp); - async_data->mkey = mkey; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -365,57 +427,33 @@ err_exec: fput(migf->filp); kvfree(async_data->out); err_out: - mlx5_core_destroy_mkey(mdev, mkey); -err_create_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); -err_dma_map: complete(&migf->save_comp); return err; } int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { - struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; - u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mutex_lock(&migf->lock); - if (!migf->total_length) { - err = -EINVAL; - goto end; - } - - mdev = mvdev->mdev; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); + err = mlx5vf_dma_data_buffer(buf); if (err) - goto end; - - err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); - if (err) - goto err_mkey; + return err; MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); MLX5_SET(load_vhca_state_in, in, op_mod, 0); MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(load_vhca_state_in, in, mkey, mkey); - MLX5_SET(load_vhca_state_in, in, size, migf->total_length); - - err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out); - - mlx5_core_destroy_mkey(mdev, mkey); -err_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -end: - mutex_unlock(&migf->lock); - return err; + MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(load_vhca_state_in, in, size, buf->length); + return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); } int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) @@ -445,6 +483,10 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) WARN_ON(migf->mvdev->mdev_detach); + if (migf->buf) { + mlx5vf_free_data_buffer(migf->buf); + migf->buf = NULL; + } mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index ba760f956d53..b0f08dfc8120 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,11 +12,25 @@ #include #include +struct mlx5_vhca_data_buffer { + struct sg_append_table table; + loff_t start_pos; 
+ u64 length; + u64 allocated_length; + u32 mkey; + enum dma_data_direction dma_dir; + u8 dmaed:1; + struct mlx5_vf_migration_file *migf; + /* Optimize mlx5vf_get_migration_page() for sequential access */ + struct scatterlist *last_offset_sg; + unsigned int sg_last_entry; + unsigned long last_offset; +}; + struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; int status; - u32 mkey; void *out; }; @@ -27,14 +41,7 @@ struct mlx5_vf_migration_file { u8 is_err:1; u32 pdn; - struct sg_append_table table; - size_t total_length; - size_t allocated_length; - - /* Optimize mlx5vf_get_migration_page() for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; + struct mlx5_vhca_data_buffer *buf; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; @@ -124,12 +131,20 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 1916f7c1468c..5f694fce854c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -33,7 +33,7 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) } static struct page * -mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, +mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset) { unsigned long cur_offset = 0; @@ -41,20 +41,20 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, unsigned int i; /* All accesses are sequential */ - if (offset < migf->last_offset || !migf->last_offset_sg) { - migf->last_offset = 0; - migf->last_offset_sg = migf->table.sgt.sgl; - migf->sg_last_entry = 0; + if (offset < buf->last_offset || !buf->last_offset_sg) { + buf->last_offset = 0; + buf->last_offset_sg = buf->table.sgt.sgl; + buf->sg_last_entry = 0; } - cur_offset = migf->last_offset; + cur_offset = buf->last_offset; - for_each_sg(migf->last_offset_sg, sg, - migf->table.sgt.orig_nents - migf->sg_last_entry, i) { + for_each_sg(buf->last_offset_sg, sg, + buf->table.sgt.orig_nents - buf->sg_last_entry, i) { if (offset < sg->length + cur_offset) { - migf->last_offset_sg = sg; - migf->sg_last_entry += i; - migf->last_offset = cur_offset; + buf->last_offset_sg = sg; + buf->sg_last_entry 
+= i; + buf->last_offset = cur_offset; return nth_page(sg_page(sg), (offset - cur_offset) / PAGE_SIZE); } @@ -63,8 +63,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, return NULL; } -static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, - unsigned int npages) +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages) { unsigned int to_alloc = npages; struct page **page_list; @@ -85,13 +85,13 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, } to_alloc -= filled; ret = sg_alloc_append_table_from_pages( - &migf->table, page_list, filled, 0, + &buf->table, page_list, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, GFP_KERNEL); if (ret) goto err; - migf->allocated_length += filled * PAGE_SIZE; + buf->allocated_length += filled * PAGE_SIZE; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, @@ -108,16 +108,8 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { - struct sg_page_iter sg_iter; - mutex_lock(&migf->lock); - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&migf->table); migf->disabled = true; - migf->total_length = 0; - migf->allocated_length = 0; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -136,6 +128,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; ssize_t done = 0; if (pos) @@ -144,16 +137,16 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(migf->total_length) || migf->is_err)) + READ_ONCE(vhca_buf->length) || migf->is_err)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) { + if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { done = -EAGAIN; goto out_unlock; } - if (*pos > migf->total_length) { + if (*pos > vhca_buf->length) { done = -EINVAL; goto out_unlock; } @@ -162,7 +155,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, goto out_unlock; } - len = min_t(size_t, migf->total_length - *pos, len); + len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { size_t page_offset; struct page *page; @@ -171,7 +164,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, int ret; page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); + page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); if (!page) { if (done == 0) done = -EINVAL; @@ -208,7 +201,7 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->disabled || migf->is_err) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->total_length)) + else if (READ_ONCE(migf->buf->length)) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -227,6 +220,8 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -257,22 +252,23 @@ 
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &migf->total_length); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; - ret = mlx5vf_add_migration_pages( - migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); - if (ret) + buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); goto out_pd; + } - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; + migf->buf = buf; return migf; out_save: - mlx5vf_disable_fd(migf); + mlx5vf_free_data_buffer(buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: @@ -286,6 +282,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; loff_t requested_length; ssize_t done = 0; @@ -306,10 +303,10 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, goto out_unlock; } - if (migf->allocated_length < requested_length) { + if (vhca_buf->allocated_length < requested_length) { done = mlx5vf_add_migration_pages( - migf, - DIV_ROUND_UP(requested_length - migf->allocated_length, + vhca_buf, + DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, PAGE_SIZE)); if (done) goto out_unlock; @@ -323,7 +320,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, int ret; page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); + page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); if (!page) { if (done == 0) done = -EINVAL; @@ -342,7 +339,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, len -= page_len; done += page_len; buf += page_len; - migf->total_length += page_len; + vhca_buf->length += page_len; } out_unlock: mutex_unlock(&migf->lock); @@ -360,6 +357,7 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -378,9 +376,18 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto out_free; + buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_pd; + } + + migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); return migf; +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); end: @@ -474,7 +481,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { ret = mlx5vf_cmd_load_vhca_state(mvdev, - mvdev->resuming_migf); + mvdev->resuming_migf, + mvdev->resuming_migf->buf); if (ret) return ERR_PTR(ret); mlx5vf_disable_fds(mvdev); -- cgit v1.2.3 From 8b599d143419669e57da3881d8293f17809688d7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:30 +0200 Subject: vfio/mlx5: Refactor migration file state Refactor the migration file state to be an enum whose values are mutually exclusive. As part of this, drop the 'disabled' state, since 'error' is functionally equivalent; a minimal sketch of the gain follows below.
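To make that concrete, here is a short standalone sketch of why one mutually exclusive state beats two independent flags (illustrative C only; migf_before, migf_after and migf_is_usable are hypothetical names, while the enum value mirrors the one this patch adds):

#include <stdbool.h>

/* Prior layout: two flags that can drift apart (disabled=1, is_err=1). */
struct migf_before {
	unsigned int disabled:1;
	unsigned int is_err:1;
};

/* New layout: one state, so contradictory combinations cannot exist. */
enum mlx5_vf_migf_state {
	MLX5_MIGF_STATE_ERROR = 1,
};

struct migf_after {
	enum mlx5_vf_migf_state state; /* 0 means "no special state" */
};

static bool migf_is_usable(const struct migf_after *migf)
{
	/* One comparison replaces "migf->disabled || migf->is_err". */
	return migf->state != MLX5_MIGF_STATE_ERROR;
}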
Next patches from the series will extend this enum for other relevant states. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 2 +- drivers/vfio/pci/mlx5/cmd.h | 7 +++++-- drivers/vfio/pci/mlx5/main.c | 11 ++++++----- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index ed4c472d2eae..fcba12326185 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,7 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { - migf->is_err = true; + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index b0f08dfc8120..14403e654e4e 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,6 +12,10 @@ #include #include +enum mlx5_vf_migf_state { + MLX5_MIGF_STATE_ERROR = 1, +}; + struct mlx5_vhca_data_buffer { struct sg_append_table table; loff_t start_pos; @@ -37,8 +41,7 @@ struct mlx5vf_async_data { struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; - u8 disabled:1; - u8 is_err:1; + enum mlx5_vf_migf_state state; u32 pdn; struct mlx5_vhca_data_buffer *buf; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 5f694fce854c..d95646c2f010 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -109,7 +109,7 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { mutex_lock(&migf->lock); - migf->disabled = true; + migf->state = MLX5_MIGF_STATE_ERROR; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -137,7 +137,8 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || migf->is_err)) + READ_ONCE(vhca_buf->length) || + migf->state == MLX5_MIGF_STATE_ERROR)) return -ERESTARTSYS; } @@ -150,7 +151,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, done = -EINVAL; goto out_unlock; } - if (migf->disabled || migf->is_err) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } @@ -199,7 +200,7 @@ static __poll_t mlx5vf_save_poll(struct file *filp, poll_wait(filp, &migf->poll_wait, wait); mutex_lock(&migf->lock); - if (migf->disabled || migf->is_err) + if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; else if (READ_ONCE(migf->buf->length)) pollflags = EPOLLIN | EPOLLRDNORM; @@ -298,7 +299,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, return -ENOMEM; mutex_lock(&migf->lock); - if (migf->disabled) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } -- cgit v1.2.3 From c668878381b5702f867ec7f43ee3b74259c6ea03 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:31 +0200 Subject: vfio/mlx5: Refactor to use queue based data chunks Refactor to use queue based data chunks on the migration file. The SAVE command adds a chunk to the tail of the queue while the read() API finds the required chunk and returns its data. 
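That lookup can be pictured with a short standalone sketch (the struct and helper names here are hypothetical; the real driver walks a kernel list under migf->list_lock and reports an end_of_data indication, as the diff below shows):

#include <stdbool.h>
#include <stddef.h>

struct chunk {
	struct chunk *next;  /* SAVE completion appends at the tail */
	long long start_pos; /* stream offset where this chunk begins */
	long long length;    /* valid bytes in this chunk */
};

/* The FD is stream based, so the bytes for 'pos' are expected on the
 * first queued chunk; any other outcome is treated as a protocol error
 * by the real driver, which then marks the migration file in error.
 */
static struct chunk *find_chunk(struct chunk *head, long long pos,
				bool *end_of_data)
{
	*end_of_data = (head == NULL);
	if (head && pos >= head->start_pos &&
	    pos < head->start_pos + head->length)
		return head;
	return NULL;
}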
When the queue is empty but the migration file's state is MLX5_MIGF_STATE_COMPLETE, read() does not block; it returns 0 to indicate end of file. This is a step towards maintaining multiple images and their meta data (i.e. headers) on the migration file, as part of the next patches in the series. Note: at this point we still use a single chunk on the migration file, but the code becomes ready to support multiple chunks. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 24 +++++-- drivers/vfio/pci/mlx5/cmd.h | 5 ++ drivers/vfio/pci/mlx5/main.c | 145 +++++++++++++++++++++++++++++++++---------- 3 files changed, 136 insertions(+), 38 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index fcba12326185..0e36b4c8c816 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,6 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { + migf->buf = async_data->buf; migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -368,9 +369,15 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->buf->length, - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size)); + unsigned long flags; + + async_data->buf->length = + MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -407,6 +414,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; + async_data->buf = buf; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -479,14 +487,22 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) { - lockdep_assert_held(&migf->mvdev->state_mutex); + struct mlx5_vhca_data_buffer *entry; + lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); if (migf->buf) { mlx5vf_free_data_buffer(migf->buf); migf->buf = NULL; } + + while ((entry = list_first_entry_or_null(&migf->buf_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&entry->buf_elm); + mlx5vf_free_data_buffer(entry); + } + mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 14403e654e4e..6e594689566e 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -14,6 +14,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_COMPLETE, }; struct mlx5_vhca_data_buffer { @@ -24,6 +25,7 @@ struct mlx5_vhca_data_buffer { u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; + struct list_head buf_elm; struct mlx5_vf_migration_file *migf; /* Optimize mlx5vf_get_migration_page() for sequential access */ struct scatterlist *last_offset_sg; @@ -34,6 +36,7 @@ struct mlx5_vhca_data_buffer { struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; + struct mlx5_vhca_data_buffer *buf; int status; void *out; }; @@ -45,6 +48,8 @@
struct mlx5_vf_migration_file { u32 pdn; struct mlx5_vhca_data_buffer *buf; + spinlock_t list_lock; + struct list_head buf_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index d95646c2f010..ca16425811c4 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -124,11 +124,90 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp) return 0; } +static struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, + bool *end_of_data) +{ + struct mlx5_vhca_data_buffer *buf; + bool found = false; + + *end_of_data = false; + spin_lock_irq(&migf->list_lock); + if (list_empty(&migf->buf_list)) { + *end_of_data = true; + goto end; + } + + buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, + buf_elm); + if (pos >= buf->start_pos && + pos < buf->start_pos + buf->length) { + found = true; + goto end; + } + + /* + * As we use a stream based FD we may expect having the data always + * on first chunk + */ + migf->state = MLX5_MIGF_STATE_ERROR; + +end: + spin_unlock_irq(&migf->list_lock); + return found ? buf : NULL; +} + +static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, + char __user **buf, size_t *len, loff_t *pos) +{ + unsigned long offset; + ssize_t done = 0; + size_t copy_len; + + copy_len = min_t(size_t, + vhca_buf->start_pos + vhca_buf->length - *pos, *len); + while (copy_len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + offset -= page_offset; + page = mlx5vf_get_migration_page(vhca_buf, offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + ret = copy_to_user(*buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + *pos += page_len; + *len -= page_len; + *buf += page_len; + done += page_len; + copy_len -= page_len; + } + + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { + spin_lock_irq(&vhca_buf->migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + spin_unlock_irq(&vhca_buf->migf->list_lock); + } + + return done; +} + static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; - struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf; + bool first_loop_call = true; + bool end_of_data; ssize_t done = 0; if (pos) @@ -137,53 +216,47 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || - migf->state == MLX5_MIGF_STATE_ERROR)) + !list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { - done = -EAGAIN; - goto out_unlock; - } - if (*pos > vhca_buf->length) { - done = -EINVAL; - goto out_unlock; - } if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } - len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *from_buff; - int ret; + ssize_t 
count; + + vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, + &end_of_data); + if (first_loop_call) { + first_loop_call = false; + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { + if (filp->f_flags & O_NONBLOCK) { + done = -EAGAIN; + goto out_unlock; + } + } + } - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; + if (end_of_data) + goto out_unlock; + + if (!vhca_buf) { + done = -EINVAL; goto out_unlock; } - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - from_buff = kmap_local_page(page); - ret = copy_to_user(buf, from_buff + page_offset, page_len); - kunmap_local(from_buff); - if (ret) { - done = -EFAULT; + count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); + if (count < 0) { + done = count; goto out_unlock; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; + done += count; } out_unlock: @@ -202,7 +275,8 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->buf->length)) + else if (!list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_COMPLETE) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -253,6 +327,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; @@ -266,7 +342,6 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; - migf->buf = buf; return migf; out_save: mlx5vf_free_data_buffer(buf); @@ -386,6 +461,8 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); return migf; out_pd: mlx5vf_cmd_dealloc_pd(migf); -- cgit v1.2.3 From 3319d287f4c04b9deece8ea00e27a70bbe32941b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:32 +0200 Subject: vfio/mlx5: Introduce device transitions of PRE_COPY In order to support PRE_COPY, the mlx5 driver transfers multiple states (images) of the device, e.g. the source VF can save and transfer multiple states, and the target VF will load them in that order. The device saves three kinds of states: 1) Initial state - when the device moves to the PRE_COPY state. 2) Middle state - during the PRE_COPY phase via VFIO_MIG_GET_PRECOPY_INFO. There can be multiple states of this type. 3) Final state - when the device moves to the STOP_COPY state. After moving to the PRE_COPY state, the user holds the saving migf FD and can use it. For example, the user can start transferring data via the read() callback. The user can also switch from PRE_COPY to STOP_COPY whenever they see fit, which will invoke saving of the final state. This means that the mlx5 VFIO device can be switched to STOP_COPY without transferring any data in the PRE_COPY state. Therefore, when the device moves to STOP_COPY, mlx5 will store the final state on a dedicated queue entry on the list; the sketch that follows summarizes how these three kinds of states map onto the save flags introduced below.
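Reading ahead in the diffs, the three kinds of states select different values of the inc/track parameters that mlx5vf_cmd_save_vhca_state() gains in this patch; this tiny sketch (the enum and helper are hypothetical names, the flag semantics are taken from the call sites below) summarizes the mapping:

#include <stdbool.h>

enum save_kind { SAVE_INITIAL, SAVE_MIDDLE, SAVE_FINAL };

/* 'track' asks the device to keep tracking changes for a further save;
 * 'inc' asks for only the data dirtied since the previous save.
 */
static void save_flags(enum save_kind kind, bool *inc, bool *track)
{
	*inc = (kind != SAVE_INITIAL);   /* middle/final saves are incremental */
	*track = (kind != SAVE_FINAL);   /* the final save stops tracking */
}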
Co-developed-by: Shay Drory Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-9-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 96 ++++++++++++++++++++++++++++++++++++++++---- drivers/vfio/pci/mlx5/cmd.h | 16 +++++++- drivers/vfio/pci/mlx5/main.c | 90 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 184 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0e36b4c8c816..5fcece201d4c 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {}; u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {}; + int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while the device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the save + * command once it will try to turn on 'tracking' on a suspended device. + */ + if (migf) { + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + } + MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA); MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(suspend_vhca_in, in, op_mod, op_mod); - return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + if (migf) + complete(&migf->save_comp); + + return err; } int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) @@ -45,7 +63,7 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) } int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size) + size_t *state_size, u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; @@ -59,6 +77,8 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); + MLX5_SET(query_vhca_migration_state_in, in, incremental, + query_flags & MLX5VF_QUERY_INC); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); @@ -342,6 +362,56 @@ end: return ERR_PTR(ret); } +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + spin_lock_irq(&buf->migf->list_lock); + list_add_tail(&buf->buf_elm, &buf->migf->avail_list); + spin_unlock_irq(&buf->migf->list_lock); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf, *temp_buf; + struct list_head free_list; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return ERR_PTR(-ENOTCONN); + + INIT_LIST_HEAD(&free_list); + + spin_lock_irq(&migf->list_lock); + list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { + if (buf->dma_dir == dma_dir) { + list_del_init(&buf->buf_elm); + if (buf->allocated_length >= length) { + 
spin_unlock_irq(&migf->list_lock); + goto found; + } + /* + * Prevent holding redundant buffers. Put in a free + * list and call at the end not under the spin lock + * (&migf->list_lock) to mlx5vf_free_data_buffer which + * might sleep. + */ + list_add(&buf->buf_elm, &free_list); + } + } + spin_unlock_irq(&migf->list_lock); + buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + +found: + while ((temp_buf = list_first_entry_or_null(&free_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&temp_buf->buf_elm); + mlx5vf_free_data_buffer(temp_buf); + } + + return buf; +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, @@ -351,7 +421,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { - migf->buf = async_data->buf; + mlx5vf_put_data_buffer(async_data->buf); migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -369,15 +439,19 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { + size_t image_size; unsigned long flags; - async_data->buf->length = - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size); + image_size = MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + async_data->buf->length = image_size; + async_data->buf->start_pos = migf->max_pos; + migf->max_pos += async_data->buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); - migf->state = MLX5_MIGF_STATE_COMPLETE; + if (async_data->last_chunk) + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -391,7 +465,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, - struct mlx5_vhca_data_buffer *buf) + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; @@ -412,9 +487,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, incremental, inc); + MLX5_SET(save_vhca_state_in, in, set_track, track); async_data = &migf->async_data; async_data->buf = buf; + async_data->last_chunk = !track; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -497,6 +575,8 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) migf->buf = NULL; } + list_splice(&migf->avail_list, &migf->buf_list); + while ((entry = list_first_entry_or_null(&migf->buf_list, struct mlx5_vhca_data_buffer, buf_elm))) { list_del(&entry->buf_elm); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 6e594689566e..34e61c7aa23d 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -38,6 +38,7 @@ struct mlx5vf_async_data { struct work_struct work; struct mlx5_vhca_data_buffer *buf; int status; + u8 last_chunk:1; void *out; }; @@ -47,9 +48,11 @@ struct mlx5_vf_migration_file { enum mlx5_vf_migf_state state; u32 pdn; + loff_t max_pos; struct mlx5_vhca_data_buffer *buf; spinlock_t 
list_lock; struct list_head buf_list; + struct list_head avail_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; @@ -129,10 +132,14 @@ struct mlx5vf_pci_core_device { struct mlx5_core_dev *mdev; }; +enum { + MLX5VF_QUERY_INC = (1UL << 0), +}; + int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size); + size_t *state_size, u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); @@ -140,7 +147,8 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, - struct mlx5_vhca_data_buffer *buf); + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *buf); @@ -151,6 +159,10 @@ struct mlx5_vhca_data_buffer * mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, size_t length, enum dma_data_direction dma_dir); void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index ca16425811c4..9cabba456044 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -195,6 +195,7 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, if (*pos >= vhca_buf->start_pos + vhca_buf->length) { spin_lock_irq(&vhca_buf->migf->list_lock); list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); spin_unlock_irq(&vhca_buf->migf->list_lock); } @@ -283,6 +284,16 @@ static __poll_t mlx5vf_save_poll(struct file *filp, return pollflags; } +/* + * FD is exposed and user can use it after receiving an error. + * Mark migf in error, and wake the user. 
+ */ +static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) +{ + migf->state = MLX5_MIGF_STATE_ERROR; + wake_up_interruptible(&migf->poll_wait); +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, @@ -291,8 +302,42 @@ static const struct file_operations mlx5vf_save_fops = { .llseek = no_llseek, }; +static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; + int ret; + + if (migf->state == MLX5_MIGF_STATE_ERROR) + return -ENODEV; + + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, + MLX5VF_QUERY_INC); + if (ret) + goto err; + + buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); + if (ret) + goto err_save; + + return 0; + +err_save: + mlx5vf_put_data_buffer(buf); +err: + mlx5vf_mark_err(migf); + return ret; +} + static struct mlx5_vf_migration_file * -mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) +mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) { struct mlx5_vf_migration_file *migf; struct mlx5_vhca_data_buffer *buf; @@ -328,8 +373,9 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); if (ret) goto out_pd; @@ -339,7 +385,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) goto out_pd; } - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); if (ret) goto out_save; return migf; @@ -462,6 +508,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); return migf; out_pd: @@ -514,7 +561,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { ret = mlx5vf_cmd_suspend_vhca(mvdev, MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); if (ret) @@ -522,7 +570,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { ret = mlx5vf_cmd_resume_vhca(mvdev, MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); if (ret) @@ -533,7 +582,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct mlx5_vf_migration_file *migf; - migf = mlx5vf_pci_save_device_data(mvdev); + migf = mlx5vf_pci_save_device_data(mvdev, false); if 
(IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -541,7 +590,10 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return migf->filp; } - if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) { + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_RUNNING_P2P)) { mlx5vf_disable_fds(mvdev); return NULL; } @@ -567,6 +619,28 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + struct mlx5_vf_migration_file *migf; + + migf = mlx5vf_pci_save_device_data(mvdev, true); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + mvdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { + ret = mlx5vf_cmd_suspend_vhca(mvdev, + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); + if (ret) + return ERR_PTR(ret); + ret = mlx5vf_pci_save_device_inc_data(mvdev); + return ret ? ERR_PTR(ret) : NULL; + } + /* * vfio_mig_get_next_state() does not use arcs other than the above */ @@ -635,7 +709,7 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, mutex_lock(&mvdev->state_mutex); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &state_size); + &state_size, 0); if (!ret) *stop_copy_length = state_size; mlx5vf_state_mutex_unlock(mvdev); -- cgit v1.2.3 From 0c9a38fee8b210a8dfd3f177526daac567ec9265 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:33 +0200 Subject: vfio/mlx5: Introduce SW headers for migration states As mentioned in the previous patches, mlx5 is transferring multiple states when the PRE_COPY protocol is used. This states mechanism requires the target VM to know the states' size in order to execute multiple loads. Therefore, add SW header, with the needed information, for each saved state the source VM is transferring to the target VM. This patch implements the source VM handling of the headers, following patch will implement the target VM handling of the headers. 
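Assuming, as the max_pos accounting in this patch implies, that the stream therefore becomes a back-to-back sequence of [header][image] records, a user-space style parser for one record might look like this sketch (mig_header and parse_one_state are hypothetical names; the field layout mirrors the struct added in the diff below, and this is not the kernel's actual loader):

#include <endian.h>
#include <stdint.h>
#include <string.h>

struct mig_header {          /* mirrors mlx5_vf_migration_header below */
	uint64_t image_size; /* little endian on the wire */
	uint64_t flags;      /* reserved for future protocol changes */
};

/* Consume one [header][image] record; returns bytes consumed, or 0 if
 * the record is still incomplete and more stream data is needed.
 */
static size_t parse_one_state(const uint8_t *buf, size_t len,
			      uint64_t *image_size)
{
	struct mig_header hdr;

	if (len < sizeof(hdr))
		return 0;
	memcpy(&hdr, buf, sizeof(hdr));
	*image_size = le64toh(hdr.image_size);
	if (len - sizeof(hdr) < *image_size)
		return 0;
	return sizeof(hdr) + (size_t)*image_size;
}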
Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-10-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 56 +++++++++++++++++++++++++++++++++++++++++--- drivers/vfio/pci/mlx5/cmd.h | 13 ++++++++++ drivers/vfio/pci/mlx5/main.c | 2 +- 3 files changed, 67 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 5fcece201d4c..160fa38fc78d 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,9 +351,11 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, if (ret) goto end; - ret = mlx5vf_dma_data_buffer(buf); - if (ret) - goto end; + if (dma_dir != DMA_NONE) { + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } } return buf; @@ -422,6 +424,8 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { mlx5vf_put_data_buffer(async_data->buf); + if (async_data->header_buf) + mlx5vf_put_data_buffer(async_data->header_buf); migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -431,6 +435,32 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) fput(migf->filp); } +static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, + size_t image_size) +{ + struct mlx5_vf_migration_file *migf = header_buf->migf; + struct mlx5_vf_migration_header header = {}; + unsigned long flags; + struct page *page; + u8 *to_buff; + + header.image_size = cpu_to_le64(image_size); + page = mlx5vf_get_migration_page(header_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + memcpy(to_buff, &header, sizeof(header)); + kunmap_local(to_buff); + header_buf->length = sizeof(header); + header_buf->header_image_size = image_size; + header_buf->start_pos = header_buf->migf->max_pos; + migf->max_pos += header_buf->length; + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + return 0; +} + static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) { struct mlx5vf_async_data *async_data = container_of(context, @@ -444,6 +474,11 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); + if (async_data->header_buf) { + status = add_buf_header(async_data->header_buf, image_size); + if (status) + goto err; + } async_data->buf->length = image_size; async_data->buf->start_pos = migf->max_pos; migf->max_pos += async_data->buf->length; @@ -455,6 +490,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) wake_up_interruptible(&migf->poll_wait); } +err: /* * The error and the cleanup flows can't run from an * interrupt context @@ -470,6 +506,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + struct mlx5_vhca_data_buffer *header_buf = NULL; struct mlx5vf_async_data *async_data; int err; @@ -499,6 +536,16 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, goto err_out; } + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { + header_buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(header_buf)) { + err = PTR_ERR(header_buf); + goto err_free; + } + } + + async_data->header_buf = header_buf; 
get_file(migf->filp); err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, @@ -510,7 +557,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return 0; err_exec: + if (header_buf) + mlx5vf_put_data_buffer(header_buf); fput(migf->filp); +err_free: kvfree(async_data->out); err_out: complete(&migf->save_comp); return err; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 34e61c7aa23d..3e36ccca820a 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,16 +12,26 @@ #include #include +#define MLX5VF_PRE_COPY_SUPP(mvdev) \ + ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY) + enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, MLX5_MIGF_STATE_COMPLETE, }; +struct mlx5_vf_migration_header { + __le64 image_size; + /* For future use in case we may need to change the kernel protocol */ + __le64 flags; +}; + struct mlx5_vhca_data_buffer { struct sg_append_table table; loff_t start_pos; u64 length; u64 allocated_length; + u64 header_image_size; u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; @@ -37,6 +47,7 @@ struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *header_buf; int status; u8 last_chunk:1; void *out; @@ -165,6 +176,8 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages); +struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, + unsigned long offset); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 9cabba456044..9a36e36ec33b 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -32,7 +32,7 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) core_device); } -static struct page * +struct page * mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset) { -- cgit v1.2.3 From 0dce165b1adf8d7f67030bb257e00107db8022de Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:34 +0200 Subject: vfio/mlx5: Introduce vfio precopy ioctl implementation The vfio precopy ioctl returns an estimation of the data available for transfer from the device. Whenever the user invokes VFIO_MIG_GET_PRECOPY_INFO, track the current state of the device and, if needed, append the dirty data to the transfer FD data. This is done by saving a middle state. As mlx5 runs the SAVE command asynchronously, make sure to query for incremental data only once there is no active save command; running both in parallel might end up with a failure in the incremental query command on an un-tracked vhca. Also, a middle state will be saved only after the previous state has finished its SAVE command and has been fully transferred; this prevents endlessly consuming resources. A short sketch of how user space drives this ioctl follows.
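From user space, the ioctl is issued on the migration data FD while the device is in PRE_COPY or PRE_COPY_P2P; assuming a <linux/vfio.h> that carries the vfio_precopy_info UAPI this series builds on, a minimal caller could look like this sketch (query_precopy is a hypothetical helper):

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Query how much precopy data is pending on the migration data FD.
 * Returns 0 on success; 'initial' is data worth draining via read()
 * while still in PRE_COPY, 'dirty' is only an estimate and may grow.
 */
static int query_precopy(int data_fd, unsigned long long *initial,
			 unsigned long long *dirty)
{
	struct vfio_precopy_info info = { .argsz = sizeof(info) };

	if (ioctl(data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
		return -1;
	*initial = info.initial_bytes;
	*dirty = info.dirty_bytes;
	return 0;
}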
Co-developed-by: Shay Drory Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-11-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 16 +++++++ drivers/vfio/pci/mlx5/main.c | 111 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 160fa38fc78d..12e74ecebe64 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -67,12 +67,25 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; + bool inc = query_flags & MLX5VF_QUERY_INC; int ret; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the + * incremental query command on un-tracked vhca. + */ + if (inc) { + ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); + if (ret) + return ret; + } + MLX5_SET(query_vhca_migration_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); @@ -82,6 +95,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); + if (inc) + complete(&mvdev->saving_migf->save_comp); + if (ret) return ret; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 9a36e36ec33b..2c8ac763057c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -294,10 +294,121 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) wake_up_interruptible(&migf->poll_wait); } +static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5vf_pci_core_device *mvdev = migf->mvdev; + struct mlx5_vhca_data_buffer *buf; + struct vfio_precopy_info info = {}; + loff_t *pos = &filp->f_pos; + unsigned long minsz; + size_t inc_length = 0; + bool end_of_data; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&mvdev->state_mutex); + if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && + mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto err_state_unlock; + } + + /* + * We can't issue a SAVE command when the device is suspended, so as + * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra + * bytes that can't be read. + */ + if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) { + /* + * Once the query returns it's guaranteed that there is no + * active SAVE command. + * As so, the other code below is safe with the proper locks. 
+ */ + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, + MLX5VF_QUERY_INC); + if (ret) + goto err_state_unlock; + } + + mutex_lock(&migf->lock); + if (migf->state == MLX5_MIGF_STATE_ERROR) { + ret = -ENODEV; + goto err_migf_unlock; + } + + buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); + if (buf) { + if (buf->start_pos == 0) { + info.initial_bytes = buf->header_image_size - *pos; + } else if (buf->start_pos == + sizeof(struct mlx5_vf_migration_header)) { + /* First data buffer following the header */ + info.initial_bytes = buf->start_pos + + buf->length - *pos; + } else { + info.dirty_bytes = buf->start_pos + buf->length - *pos; + } + } else { + if (!end_of_data) { + ret = -EINVAL; + goto err_migf_unlock; + } + + info.dirty_bytes = inc_length; + } + + if (!end_of_data || !inc_length) { + mutex_unlock(&migf->lock); + goto done; + } + + mutex_unlock(&migf->lock); + /* + * We finished transferring the current state and the device has a + * dirty state, save a new state to be ready for. + */ + buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + mlx5vf_mark_err(migf); + goto err_state_unlock; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); + if (ret) { + mlx5vf_mark_err(migf); + mlx5vf_put_data_buffer(buf); + goto err_state_unlock; + } + +done: + mlx5vf_state_mutex_unlock(mvdev); + return copy_to_user((void __user *)arg, &info, minsz); +err_migf_unlock: + mutex_unlock(&migf->lock); +err_state_unlock: + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, .poll = mlx5vf_save_poll, + .unlocked_ioctl = mlx5vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = mlx5vf_release_file, .llseek = no_llseek, }; -- cgit v1.2.3 From 81156c27271c4a6c594e492c8d119fbacfc99f36 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:35 +0200 Subject: vfio/mlx5: Consider temporary end of stream as part of PRE_COPY During PRE_COPY the migration data FD may have a temporary "end of stream" that is reached when the initial_bytes were read and no other dirty data exists yet. For instance, this may indicate that the device is idle and not currently dirtying any internal state. When read() is done on this temporary end of stream the kernel driver should return ENOMSG from read(). Userspace can wait for more data or consider moving to STOP_COPY. To not block the user upon read() and let it get ENOMSG we add a new state named MLX5_MIGF_STATE_PRE_COPY on the migration file. In addition, we add the MLX5_MIGF_STATE_SAVE_LAST state to block the read() once we call the last SAVE upon moving to STOP_COPY. Any further error will be marked with MLX5_MIGF_STATE_ERROR and the user won't be blocked. 
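[Editor's illustration, not part of the patch: a userspace read loop that treats -ENOMSG as the temporary end of stream described above. It assumes a blocking migration FD; sink() is a hypothetical callback that forwards data to the target.]

#include <errno.h>
#include <unistd.h>

/* Drain currently available PRE_COPY data; returns bytes drained this
 * round, 0 at the final EOF, or -1 on a real error. */
static ssize_t drain_precopy(int mig_fd, void (*sink)(const char *, ssize_t))
{
	char buf[4096];
	ssize_t n, total = 0;

	for (;;) {
		n = read(mig_fd, buf, sizeof(buf));
		if (n > 0) {		/* another chunk of device state */
			sink(buf, n);
			total += n;
			continue;
		}
		if (n < 0 && errno == ENOMSG)
			return total;	/* temporary end of stream: no dirty
					 * data right now; poll again later or
					 * move the device to STOP_COPY */
		return n;		/* 0 on final EOF, or -1 on error */
	}
}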
Reviewed-by: Jason Gunthorpe
Signed-off-by: Yishai Hadas
Link: https://lore.kernel.org/r/20221206083438.37807-12-yishaih@nvidia.com
Signed-off-by: Alex Williamson
---
 drivers/vfio/pci/mlx5/cmd.c  | 7 +++++--
 drivers/vfio/pci/mlx5/cmd.h  | 2 ++
 drivers/vfio/pci/mlx5/main.c | 7 +++++++
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 12e74ecebe64..f6293da033cc 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -501,8 +501,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
 	spin_lock_irqsave(&migf->list_lock, flags);
 	list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
 	spin_unlock_irqrestore(&migf->list_lock, flags);
-	if (async_data->last_chunk)
-		migf->state = MLX5_MIGF_STATE_COMPLETE;
+	migf->state = async_data->last_chunk ?
+		MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
 	wake_up_interruptible(&migf->poll_wait);
 }

@@ -561,6 +561,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 		}
 	}

+	if (async_data->last_chunk)
+		migf->state = MLX5_MIGF_STATE_SAVE_LAST;
+
 	async_data->header_buf = header_buf;
 	get_file(migf->filp);
 	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 3e36ccca820a..d048f23977dd 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -17,6 +17,8 @@
 enum mlx5_vf_migf_state {
 	MLX5_MIGF_STATE_ERROR = 1,
+	MLX5_MIGF_STATE_PRE_COPY,
+	MLX5_MIGF_STATE_SAVE_LAST,
 	MLX5_MIGF_STATE_COMPLETE,
 };

diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 2c8ac763057c..44b1543c751c 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -219,6 +219,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
 		if (wait_event_interruptible(migf->poll_wait,
 				!list_empty(&migf->buf_list) ||
 				migf->state == MLX5_MIGF_STATE_ERROR ||
+				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
 				migf->state == MLX5_MIGF_STATE_COMPLETE))
 			return -ERESTARTSYS;
 	}
@@ -236,6 +237,12 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
 		if (first_loop_call) {
 			first_loop_call = false;
+			/* Temporary end of file as part of PRE_COPY */
+			if (end_of_data && migf->state == MLX5_MIGF_STATE_PRE_COPY) {
+				done = -ENOMSG;
+				goto out_unlock;
+			}
+
 			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
 				if (filp->f_flags & O_NONBLOCK) {
 					done = -EAGAIN;
-- 
cgit v1.2.3 


From 34e2f27143d1b373f088e805f7e11cdf778f791d Mon Sep 17 00:00:00 2001
From: Yishai Hadas
Date: Tue, 6 Dec 2022 10:34:36 +0200
Subject: vfio/mlx5: Introduce multiple loads

In order to support PRE_COPY, the mlx5 driver transfers multiple states
(images) of the device; e.g., the source VF can save and transfer
multiple states, and the target VF will load them in that order.

This patch implements the changes for the target VF to decompose the
header for each state and to write and load multiple states.
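[Editor's sketch of the resulting stream layout, derived from struct mlx5_vf_migration_header and the load states introduced below; it is a reading aid, not part of the patch.]

    | image_size (__le64) | flags (__le64) | image payload (image_size bytes) |
    | image_size (__le64) | flags (__le64) | image payload ...                |
    | ...                                                                     |

On the target, mlx5vf_resume_write() cycles READ_HEADER -> PREP_IMAGE ->
READ_IMAGE -> LOAD_IMAGE for each such record, returning to READ_HEADER
until the final image has been loaded.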
Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-13-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 13 +- drivers/vfio/pci/mlx5/cmd.h | 10 ++ drivers/vfio/pci/mlx5/main.c | 279 ++++++++++++++++++++++++++++++++++++------- 3 files changed, 257 insertions(+), 45 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index f6293da033cc..993749818d90 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -598,9 +598,11 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (mvdev->mdev_detach) return -ENOTCONN; - err = mlx5vf_dma_data_buffer(buf); - if (err) - return err; + if (!buf->dmaed) { + err = mlx5vf_dma_data_buffer(buf); + if (err) + return err; + } MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); @@ -644,6 +646,11 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) migf->buf = NULL; } + if (migf->buf_header) { + mlx5vf_free_data_buffer(migf->buf_header); + migf->buf_header = NULL; + } + list_splice(&migf->avail_list, &migf->buf_list); while ((entry = list_first_entry_or_null(&migf->buf_list, diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index d048f23977dd..7729eac8c78c 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -22,6 +22,14 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_COMPLETE, }; +enum mlx5_vf_load_state { + MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, + MLX5_VF_LOAD_STATE_READ_HEADER, + MLX5_VF_LOAD_STATE_PREP_IMAGE, + MLX5_VF_LOAD_STATE_READ_IMAGE, + MLX5_VF_LOAD_STATE_LOAD_IMAGE, +}; + struct mlx5_vf_migration_header { __le64 image_size; /* For future use in case we may need to change the kernel protocol */ @@ -60,9 +68,11 @@ struct mlx5_vf_migration_file { struct mutex lock; enum mlx5_vf_migf_state state; + enum mlx5_vf_load_state load_state; u32 pdn; loff_t max_pos; struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *buf_header; spinlock_t list_lock; struct list_head buf_list; struct list_head avail_list; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 44b1543c751c..5a669b73994a 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -518,13 +518,162 @@ end: return ERR_PTR(ret); } +static int +mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + unsigned long offset; + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + + page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + page_offset, *buf, page_len); + kunmap_local(to_buff); + if (ret) + return -EFAULT; + + *pos += page_len; + *done += page_len; + *buf += page_len; + *len -= page_len; + vhca_buf->length += page_len; + return 0; +} + +static int +mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, + loff_t requested_length, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + int ret; + + if (requested_length > MAX_MIGRATION_SIZE) + return -ENOMEM; + + if (vhca_buf->allocated_length < requested_length) { + ret = mlx5vf_add_migration_pages( + vhca_buf, + 
DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, + PAGE_SIZE)); + if (ret) + return ret; + } + + while (*len) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, + done); + if (ret) + return ret; + } + + return 0; +} + +static ssize_t +mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + size_t image_size, const char __user **buf, + size_t *len, loff_t *pos, ssize_t *done, + bool *has_work) +{ + size_t copy_len, to_copy; + int ret; + + to_copy = min_t(size_t, *len, image_size - vhca_buf->length); + copy_len = to_copy; + while (to_copy) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, + done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == image_size) { + migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE; + migf->max_pos += image_size; + *has_work = true; + } + + return 0; +} + +static int +mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, + size_t *len, loff_t *pos, + ssize_t *done, bool *has_work) +{ + struct page *page; + size_t copy_len; + u8 *to_buff; + int ret; + + copy_len = min_t(size_t, *len, + sizeof(struct mlx5_vf_migration_header) - vhca_buf->length); + page = mlx5vf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); + if (ret) { + ret = -EFAULT; + goto end; + } + + *buf += copy_len; + *pos += copy_len; + *done += copy_len; + *len -= copy_len; + vhca_buf->length += copy_len; + if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) { + u64 flags; + + vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff); + if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) { + ret = -ENOMEM; + goto end; + } + + flags = le64_to_cpup((__le64 *)(to_buff + + offsetof(struct mlx5_vf_migration_header, flags))); + if (flags) { + ret = -EOPNOTSUPP; + goto end; + } + + migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; + migf->max_pos += vhca_buf->length; + *has_work = true; + } +end: + kunmap_local(to_buff); + return ret; +} + static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; loff_t requested_length; + bool has_work = false; ssize_t done = 0; + int ret = 0; if (pos) return -ESPIPE; @@ -534,56 +683,83 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, check_add_overflow((loff_t)len, *pos, &requested_length)) return -EINVAL; - if (requested_length > MAX_MIGRATION_SIZE) - return -ENOMEM; - + mutex_lock(&migf->mvdev->state_mutex); mutex_lock(&migf->lock); if (migf->state == MLX5_MIGF_STATE_ERROR) { - done = -ENODEV; + ret = -ENODEV; goto out_unlock; } - if (vhca_buf->allocated_length < requested_length) { - done = mlx5vf_add_migration_pages( - vhca_buf, - DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, - PAGE_SIZE)); - if (done) - goto out_unlock; - } + while (len || has_work) { + has_work = false; + switch (migf->load_state) { + case MLX5_VF_LOAD_STATE_READ_HEADER: + ret = mlx5vf_resume_read_header(migf, vhca_buf_header, + &buf, &len, pos, + &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_PREP_IMAGE: + { + u64 size = vhca_buf_header->header_image_size; + + if 
(vhca_buf->allocated_length < size) { + mlx5vf_free_data_buffer(vhca_buf); + + migf->buf = mlx5vf_alloc_data_buffer(migf, + size, DMA_TO_DEVICE); + if (IS_ERR(migf->buf)) { + ret = PTR_ERR(migf->buf); + migf->buf = NULL; + goto out_unlock; + } - while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *to_buff; - int ret; + vhca_buf = migf->buf; + } - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; - goto out_unlock; + vhca_buf->start_pos = migf->max_pos; + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; + break; } + case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: + ret = mlx5vf_resume_read_image_no_header(vhca_buf, + requested_length, + &buf, &len, pos, &done); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_READ_IMAGE: + ret = mlx5vf_resume_read_image(migf, vhca_buf, + vhca_buf_header->header_image_size, + &buf, &len, pos, &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_LOAD_IMAGE: + ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf); + if (ret) + goto out_unlock; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - to_buff = kmap_local_page(page); - ret = copy_from_user(to_buff + page_offset, buf, page_len); - kunmap_local(to_buff); - if (ret) { - done = -EFAULT; - goto out_unlock; + /* prep header buf for next image */ + vhca_buf_header->length = 0; + vhca_buf_header->header_image_size = 0; + /* prep data buf for next image */ + vhca_buf->length = 0; + + break; + default: + break; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; - vhca_buf->length += page_len; } + out_unlock: + if (ret) + migf->state = MLX5_MIGF_STATE_ERROR; mutex_unlock(&migf->lock); - return done; + mlx5vf_state_mutex_unlock(migf->mvdev); + return ret ? 
ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
@@ -623,12 +799,29 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
 	}
 	migf->buf = buf;
+	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+		buf = mlx5vf_alloc_data_buffer(migf,
+			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
+			goto out_buf;
+		}
+
+		migf->buf_header = buf;
+		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+	} else {
+		/* Initial state will be to read the image */
+		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
+	}
+
 	stream_open(migf->filp->f_inode, migf->filp);
 	mutex_init(&migf->lock);
 	INIT_LIST_HEAD(&migf->buf_list);
 	INIT_LIST_HEAD(&migf->avail_list);
 	spin_lock_init(&migf->list_lock);
 	return migf;
+out_buf:
+	mlx5vf_free_data_buffer(buf);
 out_pd:
 	mlx5vf_cmd_dealloc_pd(migf);
 out_free:
@@ -728,11 +921,13 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
 	}

 	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
-		ret = mlx5vf_cmd_load_vhca_state(mvdev,
-						 mvdev->resuming_migf,
-						 mvdev->resuming_migf->buf);
-		if (ret)
-			return ERR_PTR(ret);
+		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
+			ret = mlx5vf_cmd_load_vhca_state(mvdev,
+							 mvdev->resuming_migf,
+							 mvdev->resuming_migf->buf);
+			if (ret)
+				return ERR_PTR(ret);
+		}
 		mlx5vf_disable_fds(mvdev);
 		return NULL;
 	}
-- 
cgit v1.2.3 


From d6e18a4bec431c181a60d32876c6c89955b2a4f8 Mon Sep 17 00:00:00 2001
From: Shay Drory
Date: Tue, 6 Dec 2022 10:34:37 +0200
Subject: vfio/mlx5: Fallback to STOP_COPY upon specific PRE_COPY error

Before a SAVE command is issued, a QUERY command is issued in order to
know the device data size.

In case PRE_COPY is used, the above commands are issued while the
device is running. Thus, it is possible that between the QUERY and the
SAVE commands the state of the device changes significantly, making the
SAVE fail.

Currently, if a SAVE command fails, the driver fails the migration. In
the above case, don't fail the migration, but don't allow new SAVEs to
be executed while the device is in a RUNNING state. Once the device is
moved to STOP_COPY, SAVE can be executed again and the full device
state will be read.
Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-14-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 27 ++++++++++++++++++++++++++- drivers/vfio/pci/mlx5/cmd.h | 2 ++ drivers/vfio/pci/mlx5/main.c | 6 ++++-- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 993749818d90..01ef695e9441 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -84,6 +84,19 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); if (ret) return ret; + if (mvdev->saving_migf->state == + MLX5_MIGF_STATE_PRE_COPY_ERROR) { + /* + * In case we had a PRE_COPY error, only query full + * image for final image + */ + if (!(query_flags & MLX5VF_QUERY_FINAL)) { + *state_size = 0; + complete(&mvdev->saving_migf->save_comp); + return 0; + } + query_flags &= ~MLX5VF_QUERY_INC; + } } MLX5_SET(query_vhca_migration_state_in, in, opcode, @@ -442,7 +455,10 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5vf_put_data_buffer(async_data->buf); if (async_data->header_buf) mlx5vf_put_data_buffer(async_data->header_buf); - migf->state = MLX5_MIGF_STATE_ERROR; + if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) + migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; + else + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); @@ -511,6 +527,8 @@ err: * The error and the cleanup flows can't run from an * interrupt context */ + if (status == -EREMOTEIO) + status = MLX5_GET(save_vhca_state_out, async_data->out, status); async_data->status = status; queue_work(migf->mvdev->cb_wq, &async_data->work); } @@ -534,6 +552,13 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) return err; + if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) + /* + * In case we had a PRE_COPY error, SAVE is triggered only for + * the final image, read device full image. 
+	 */
+	inc = false;
+
 	MLX5_SET(save_vhca_state_in, in, opcode,
 		 MLX5_CMD_OP_SAVE_VHCA_STATE);
 	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 7729eac8c78c..5483171d57ad 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -17,6 +17,7 @@
 enum mlx5_vf_migf_state {
 	MLX5_MIGF_STATE_ERROR = 1,
+	MLX5_MIGF_STATE_PRE_COPY_ERROR,
 	MLX5_MIGF_STATE_PRE_COPY,
 	MLX5_MIGF_STATE_SAVE_LAST,
 	MLX5_MIGF_STATE_COMPLETE,
@@ -157,6 +158,7 @@ struct mlx5vf_pci_core_device {

 enum {
 	MLX5VF_QUERY_INC = (1UL << 0),
+	MLX5VF_QUERY_FINAL = (1UL << 1),
 };

 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 5a669b73994a..cd90eb86128c 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -219,6 +219,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
 		if (wait_event_interruptible(migf->poll_wait,
 				!list_empty(&migf->buf_list) ||
 				migf->state == MLX5_MIGF_STATE_ERROR ||
+				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
 				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
 				migf->state == MLX5_MIGF_STATE_COMPLETE))
 			return -ERESTARTSYS;
@@ -238,7 +239,8 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
 		if (first_loop_call) {
 			first_loop_call = false;
 			/* Temporary end of file as part of PRE_COPY */
-			if (end_of_data && migf->state == MLX5_MIGF_STATE_PRE_COPY) {
+			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
 				done = -ENOMSG;
 				goto out_unlock;
 			}
@@ -431,7 +433,7 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
 		return -ENODEV;

 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
-						    MLX5VF_QUERY_INC);
+				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
 	if (ret)
 		goto err;

-- 
cgit v1.2.3 


From ccc2a52e464d1c983efede1a6d44728c151cb2ed Mon Sep 17 00:00:00 2001
From: Shay Drory
Date: Tue, 6 Dec 2022 10:34:38 +0200
Subject: vfio/mlx5: Enable MIGRATION_PRE_COPY flag

Now that everything has been set up for MIGRATION_PRE_COPY, enable it.

Signed-off-by: Shay Drory
Reviewed-by: Jason Gunthorpe
Signed-off-by: Yishai Hadas
Link: https://lore.kernel.org/r/20221206083438.37807-15-yishaih@nvidia.com
Signed-off-by: Alex Williamson
---
 drivers/vfio/pci/mlx5/cmd.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 01ef695e9441..64e68d13cb98 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -222,6 +222,11 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
 	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
 		mvdev->core_device.vdev.log_ops = log_ops;

+	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
+		mvdev->core_device.vdev.migration_flags |=
+			VFIO_MIGRATION_PRE_COPY;
+
 end:
 	mlx5_vf_put_core_dev(mvdev->mdev);
 }
-- 
cgit v1.2.3 


From 64ffbbb1e948876dd9044c32a3ab8a790662b9bb Mon Sep 17 00:00:00 2001
From: Shameer Kolothum
Date: Wed, 23 Nov 2022 11:32:33 +0000
Subject: hisi_acc_vfio_pci: Add support for precopy IOCTL

The PRECOPY IOCTL, in the case of the HiSilicon ACC driver, can be used
to perform the device compatibility check earlier during migration.
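[Editor's illustration, not part of the patch: using the precopy ioctl to size and fetch the early device-match data before committing to the migration. Everything except the VFIO UAPI names is an assumption for the example.]

#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vfio.h>

/* Read the match data found at the head of the PRE_COPY stream. */
static ssize_t read_match_data(int mig_fd, void *buf, size_t buflen)
{
	struct vfio_precopy_info info = { .argsz = sizeof(info) };

	if (ioctl(mig_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
		return -1;
	if (info.initial_bytes > buflen)
		return -1;	/* caller's buffer is too small */

	/* initial_bytes covers exactly the device match data here */
	return read(mig_fd, buf, info.initial_bytes);
}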
Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-2-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 52 ++++++++++++++++++++++++++ drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 1 + 2 files changed, 53 insertions(+) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 0c0c0c7f0521..f3b74a06edb6 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -764,9 +764,58 @@ hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + migf->hisi_acc_vdev = hisi_acc_vdev; return migf; } +static long hisi_acc_vf_precopy_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct hisi_acc_vf_migration_file *migf = filp->private_data; + struct hisi_acc_vf_core_device *hisi_acc_vdev = migf->hisi_acc_vdev; + loff_t *pos = &filp->f_pos; + struct vfio_precopy_info info; + unsigned long minsz; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&hisi_acc_vdev->state_mutex); + if (hisi_acc_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY) { + mutex_unlock(&hisi_acc_vdev->state_mutex); + return -EINVAL; + } + + mutex_lock(&migf->lock); + + if (migf->disabled) { + ret = -ENODEV; + goto out; + } + + if (*pos > migf->total_length) { + ret = -EINVAL; + goto out; + } + + info.dirty_bytes = 0; + info.initial_bytes = migf->total_length - *pos; + + ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +out: + mutex_unlock(&migf->lock); + mutex_unlock(&hisi_acc_vdev->state_mutex); + return ret; +} + static ssize_t hisi_acc_vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { @@ -807,6 +856,8 @@ out_unlock: static const struct file_operations hisi_acc_vf_save_fops = { .owner = THIS_MODULE, .read = hisi_acc_vf_save_read, + .unlocked_ioctl = hisi_acc_vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = hisi_acc_vf_release_file, .llseek = no_llseek, }; @@ -832,6 +883,7 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + migf->hisi_acc_vdev = hisi_acc_vdev; ret = vf_qm_state_save(hisi_acc_vdev, migf); if (ret) { diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 67343325b320..11d51345f5b5 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -91,6 +91,7 @@ struct hisi_acc_vf_migration_file { struct mutex lock; bool disabled; + struct hisi_acc_vf_core_device *hisi_acc_vdev; struct acc_vf_data vf_data; size_t total_length; }; -- cgit v1.2.3 From d9a871e4a143047d1d84a606772af319f11516f9 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:34 +0000 Subject: hisi_acc_vfio_pci: Introduce support for PRE_COPY state transitions The saving_migf is open in PRE_COPY state if it is supported and reads initial device match data. hisi_acc_vf_stop_copy() is refactored to make use of common code. 
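[Editor's illustration, not part of the patch: how userspace triggers the RUNNING -> PRE_COPY transition that opens the saving_migf, using the standard VFIO migration FEATURE ioctl; the returned data_fd is the saving migration file whose initial bytes are the match data.]

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Move the device to PRE_COPY; returns the saving migration FD or -1. */
static int enter_pre_copy(int device_fd)
{
	char reqbuf[sizeof(struct vfio_device_feature) +
		    sizeof(struct vfio_device_feature_mig_state)] = {};
	struct vfio_device_feature *feature = (void *)reqbuf;
	struct vfio_device_feature_mig_state *state = (void *)feature->data;

	feature->argsz = sizeof(reqbuf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	state->device_state = VFIO_DEVICE_STATE_PRE_COPY;

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;

	return state->data_fd;	/* saving migration FD */
}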
Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-3-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 74 ++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index f3b74a06edb6..c8658636a84c 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -863,7 +863,7 @@ static const struct file_operations hisi_acc_vf_save_fops = { }; static struct hisi_acc_vf_migration_file * -hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) +hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct hisi_acc_vf_migration_file *migf; int ret; @@ -885,7 +885,7 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) mutex_init(&migf->lock); migf->hisi_acc_vdev = hisi_acc_vdev; - ret = vf_qm_state_save(hisi_acc_vdev, migf); + ret = vf_qm_get_match_data(hisi_acc_vdev, &migf->vf_data); if (ret) { fput(migf->filp); return ERR_PTR(ret); @@ -894,6 +894,44 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) return migf; } +static struct hisi_acc_vf_migration_file * +hisi_acc_vf_pre_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) +{ + struct hisi_acc_vf_migration_file *migf; + + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); + if (IS_ERR(migf)) + return migf; + + migf->total_length = QM_MATCH_SIZE; + return migf; +} + +static struct hisi_acc_vf_migration_file * +hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev, bool open) +{ + int ret; + struct hisi_acc_vf_migration_file *migf = NULL; + + if (open) { + /* + * Userspace didn't use PRECOPY support. Hence saving_migf + * is not opened yet. + */ + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); + if (IS_ERR(migf)) + return migf; + } else { + migf = hisi_acc_vdev->saving_migf; + } + + ret = vf_qm_state_save(hisi_acc_vdev, migf); + if (ret) + return ERR_PTR(ret); + + return open ? 
migf : NULL; +} + static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct device *dev = &hisi_acc_vdev->vf_dev->dev; @@ -921,6 +959,31 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, u32 cur = hisi_acc_vdev->mig_state; int ret; + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) { + struct hisi_acc_vf_migration_file *migf; + + migf = hisi_acc_vf_pre_copy(hisi_acc_vdev); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + hisi_acc_vdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct hisi_acc_vf_migration_file *migf; + + ret = hisi_acc_vf_stop_device(hisi_acc_vdev); + if (ret) + return ERR_PTR(ret); + + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, false); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + return NULL; + } + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) { ret = hisi_acc_vf_stop_device(hisi_acc_vdev); if (ret) @@ -931,7 +994,7 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct hisi_acc_vf_migration_file *migf; - migf = hisi_acc_vf_stop_copy(hisi_acc_vdev); + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, true); if (IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -963,6 +1026,11 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, return NULL; } + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) { + hisi_acc_vf_disable_fds(hisi_acc_vdev); + return NULL; + } + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING) { hisi_acc_vf_start_device(hisi_acc_vdev); return NULL; -- cgit v1.2.3 From 190125adcad4c5850fd74ecd697e20a446b74ed8 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:35 +0000 Subject: hisi_acc_vfio_pci: Move the dev compatibility tests for early check Instead of waiting till data transfer is complete to perform dev compatibility, do it as soon as we have enough data to perform the check. This will be useful when we enable the support for PRE_COPY. 
Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-4-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 19 +++++++------------ drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 1 + 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index c8658636a84c..9a51f41e1d2a 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -360,8 +360,8 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, u32 que_iso_state; int ret; - if (migf->total_length < QM_MATCH_SIZE) - return -EINVAL; + if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done) + return 0; if (vf_data->acc_magic != ACC_DEV_MAGIC) { dev_err(dev, "failed to match ACC_DEV_MAGIC\n"); @@ -406,6 +406,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, } hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; + hisi_acc_vdev->match_done = true; return 0; } @@ -493,10 +494,6 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct device *dev = &vf_qm->pdev->dev; int ret; - ret = vf_qm_get_match_data(hisi_acc_vdev, vf_data); - if (ret) - return ret; - if (unlikely(qm_wait_dev_not_ready(vf_qm))) { /* Update state and return with match data */ vf_data->vf_qm_state = QM_NOT_READY; @@ -673,12 +670,6 @@ static int hisi_acc_vf_load_state(struct hisi_acc_vf_core_device *hisi_acc_vdev) struct hisi_acc_vf_migration_file *migf = hisi_acc_vdev->resuming_migf; int ret; - /* Check dev compatibility */ - ret = vf_qm_check_match(hisi_acc_vdev, migf); - if (ret) { - dev_err(dev, "failed to match the VF!\n"); - return ret; - } /* Recover data to VF */ ret = vf_qm_load_data(hisi_acc_vdev, migf); if (ret) { @@ -732,6 +723,10 @@ static ssize_t hisi_acc_vf_resume_write(struct file *filp, const char __user *bu *pos += len; done = len; migf->total_length += len; + + ret = vf_qm_check_match(migf->hisi_acc_vdev, migf); + if (ret) + done = -EFAULT; out_unlock: mutex_unlock(&migf->lock); return done; diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 11d51345f5b5..dcabfeec6ca1 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -98,6 +98,7 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; + u8 match_done:1; u8 deferred_reset:1; /* For migration state */ struct mutex state_mutex; -- cgit v1.2.3 From f2240b4441cc5bd87820371ee79ad7c9125e76b6 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:36 +0000 Subject: hisi_acc_vfio_pci: Enable PRE_COPY flag Now that we have everything to support the PRE_COPY state, enable it. 
Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-5-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 9a51f41e1d2a..51941bb4f31f 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1351,7 +1351,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) hisi_acc_vdev->vf_dev = pdev; mutex_init(&hisi_acc_vdev->state_mutex); - core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY; + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY; core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops; return vfio_pci_core_init_dev(core_vdev); -- cgit v1.2.3 From d1f0f50fbbbbca1e3e8157e51934613bf88f6d44 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 8 Dec 2022 09:33:41 +0800 Subject: samples: vfio-mdev: Fix missing pci_disable_device() in mdpy_fb_probe() Add missing pci_disable_device() in fail path of mdpy_fb_probe(). Besides, fix missing release functions in mdpy_fb_remove(). Fixes: cacade1946a4 ("sample: vfio mdev display - guest driver") Signed-off-by: Shang XiaoJing Link: https://lore.kernel.org/r/20221208013341.3999-1-shangxiaojing@huawei.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mdpy-fb.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c index 9ec93d90e8a5..4eb7aa11cfbb 100644 --- a/samples/vfio-mdev/mdpy-fb.c +++ b/samples/vfio-mdev/mdpy-fb.c @@ -109,7 +109,7 @@ static int mdpy_fb_probe(struct pci_dev *pdev, ret = pci_request_regions(pdev, "mdpy-fb"); if (ret < 0) - return ret; + goto err_disable_dev; pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format); pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width); @@ -191,6 +191,9 @@ err_release_fb: err_release_regions: pci_release_regions(pdev); +err_disable_dev: + pci_disable_device(pdev); + return ret; } @@ -199,7 +202,10 @@ static void mdpy_fb_remove(struct pci_dev *pdev) struct fb_info *info = pci_get_drvdata(pdev); unregister_framebuffer(info); + iounmap(info->screen_base); framebuffer_release(info); + pci_release_regions(pdev); + pci_disable_device(pdev); } static struct pci_device_id mdpy_fb_pci_table[] = { -- cgit v1.2.3 From fe3dd71db2b81c202bc80532bbe0e07238a45ed9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Dec 2022 19:01:26 +0300 Subject: vfio/mlx5: fix error code in mlx5vf_precopy_ioctl() The copy_to_user() function returns the number of bytes remaining to be copied but we want to return a negative error code here. 
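[Editor's note on the bug class, for illustration only; the hunk below applies this to mlx5vf_precopy_ioctl():]

	/* Wrong: copy_to_user() returns the number of bytes NOT copied,
	 * so this may return a positive remainder instead of a -errno. */
	return copy_to_user(uarg, &info, minsz);

	/* Right: fold any partial copy into -EFAULT. */
	if (copy_to_user(uarg, &info, minsz))
		return -EFAULT;
	return 0;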
Fixes: 0dce165b1adf ("vfio/mlx5: Introduce vfio precopy ioctl implementation") Signed-off-by: Dan Carpenter Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/Y5IKVknlf5Z5NPtU@kili Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index cd90eb86128c..94f7a0fd10e8 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -404,7 +404,10 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, done: mlx5vf_state_mutex_unlock(mvdev); - return copy_to_user((void __user *)arg, &info, minsz); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + return 0; + err_migf_unlock: mutex_unlock(&migf->lock); err_state_unlock: -- cgit v1.2.3 From 70be6f322860d322ebcd120cf0c05402ead5c6de Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Dec 2022 19:02:17 +0300 Subject: vfio/mlx5: error pointer dereference in error handling This code frees the wrong "buf" variable and results in an error pointer dereference. Fixes: 34e2f27143d1 ("vfio/mlx5: Introduce multiple loads") Signed-off-by: Dan Carpenter Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/Y5IKia5SaiVxYmG5@kili Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 94f7a0fd10e8..031ac8cc215d 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -826,7 +826,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) spin_lock_init(&migf->list_lock); return migf; out_buf: - mlx5vf_free_data_buffer(buf); + mlx5vf_free_data_buffer(migf->buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: -- cgit v1.2.3