diff options
Diffstat (limited to 'drivers/vfio/pci/vfio_pci_core.c')
-rw-r--r-- | drivers/vfio/pci/vfio_pci_core.c | 1105 |
1 files changed, 690 insertions, 415 deletions
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index c8d3b0450fb3..badc9d828cac 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,7 +28,7 @@ #include <linux/nospec.h> #include <linux/sched/mm.h> -#include <linux/vfio_pci_core.h> +#include "vfio_pci_priv.h" #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" #define DRIVER_DESC "core driver for VFIO based PCI devices" @@ -41,6 +41,23 @@ static bool disable_idle_d3; static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex); static LIST_HEAD(vfio_pci_sriov_pfs); +struct vfio_pci_dummy_resource { + struct resource resource; + int index; + struct list_head res_next; +}; + +struct vfio_pci_vf_token { + struct mutex lock; + uuid_t uuid; + int users; +}; + +struct vfio_pci_mmap_vma { + struct vm_area_struct *vma; + struct list_head vma_next; +}; + static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA @@ -260,16 +277,189 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } +static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, + struct eventfd_ctx *efdctx) +{ + /* + * The vdev power related flags are protected with 'memory_lock' + * semaphore. + */ + vfio_pci_zap_and_down_write_memory_lock(vdev); + if (vdev->pm_runtime_engaged) { + up_write(&vdev->memory_lock); + return -EINVAL; + } + + vdev->pm_runtime_engaged = true; + vdev->pm_wake_eventfd_ctx = efdctx; + pm_runtime_put_noidle(&vdev->pdev->dev); + up_write(&vdev->memory_lock); + + return 0; +} + +static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + int ret; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); + if (ret != 1) + return ret; + + /* + * Inside vfio_pci_runtime_pm_entry(), only the runtime PM usage count + * will be decremented. The pm_runtime_put() will be invoked again + * while returning from the ioctl and then the device can go into + * runtime suspended state. + */ + return vfio_pci_runtime_pm_entry(vdev, NULL); +} + +static int vfio_pci_core_pm_entry_with_wakeup( + struct vfio_device *device, u32 flags, + struct vfio_device_low_power_entry_with_wakeup __user *arg, + size_t argsz) +{ + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + struct vfio_device_low_power_entry_with_wakeup entry; + struct eventfd_ctx *efdctx; + int ret; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, + sizeof(entry)); + if (ret != 1) + return ret; + + if (copy_from_user(&entry, arg, sizeof(entry))) + return -EFAULT; + + if (entry.wakeup_eventfd < 0) + return -EINVAL; + + efdctx = eventfd_ctx_fdget(entry.wakeup_eventfd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + + ret = vfio_pci_runtime_pm_entry(vdev, efdctx); + if (ret) + eventfd_ctx_put(efdctx); + + return ret; +} + +static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) +{ + if (vdev->pm_runtime_engaged) { + vdev->pm_runtime_engaged = false; + pm_runtime_get_noresume(&vdev->pdev->dev); + + if (vdev->pm_wake_eventfd_ctx) { + eventfd_ctx_put(vdev->pm_wake_eventfd_ctx); + vdev->pm_wake_eventfd_ctx = NULL; + } + } +} + +static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) +{ + /* + * The vdev power related flags are protected with 'memory_lock' + * semaphore. + */ + down_write(&vdev->memory_lock); + __vfio_pci_runtime_pm_exit(vdev); + up_write(&vdev->memory_lock); +} + +static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + int ret; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); + if (ret != 1) + return ret; + + /* + * The device is always in the active state here due to pm wrappers + * around ioctls. If the device had entered a low power state and + * pm_wake_eventfd_ctx is valid, vfio_pci_core_runtime_resume() has + * already signaled the eventfd and exited low power mode itself. + * pm_runtime_engaged protects the redundant call here. + */ + vfio_pci_runtime_pm_exit(vdev); + return 0; +} + +#ifdef CONFIG_PM +static int vfio_pci_core_runtime_suspend(struct device *dev) +{ + struct vfio_pci_core_device *vdev = dev_get_drvdata(dev); + + down_write(&vdev->memory_lock); + /* + * The user can move the device into D3hot state before invoking + * power management IOCTL. Move the device into D0 state here and then + * the pci-driver core runtime PM suspend function will move the device + * into the low power state. Also, for the devices which have + * NoSoftRst-, it will help in restoring the original state + * (saved locally in 'vdev->pm_save'). + */ + vfio_pci_set_power_state(vdev, PCI_D0); + up_write(&vdev->memory_lock); + + /* + * If INTx is enabled, then mask INTx before going into the runtime + * suspended state and unmask the same in the runtime resume. + * If INTx has already been masked by the user, then + * vfio_pci_intx_mask() will return false and in that case, INTx + * should not be unmasked in the runtime resume. + */ + vdev->pm_intx_masked = ((vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) && + vfio_pci_intx_mask(vdev)); + + return 0; +} + +static int vfio_pci_core_runtime_resume(struct device *dev) +{ + struct vfio_pci_core_device *vdev = dev_get_drvdata(dev); + + /* + * Resume with a pm_wake_eventfd_ctx signals the eventfd and exit + * low power mode. + */ + down_write(&vdev->memory_lock); + if (vdev->pm_wake_eventfd_ctx) { + eventfd_signal(vdev->pm_wake_eventfd_ctx, 1); + __vfio_pci_runtime_pm_exit(vdev); + } + up_write(&vdev->memory_lock); + + if (vdev->pm_intx_masked) + vfio_pci_intx_unmask(vdev); + + return 0; +} +#endif /* CONFIG_PM */ + /* - * The dev_pm_ops needs to be provided to make pci-driver runtime PM working, - * so use structure without any callbacks. - * * The pci-driver core runtime PM routines always save the device state * before going into suspended state. If the device is going into low power * state with only with runtime PM ops, then no explicit handling is needed * for the devices which have NoSoftRst-. */ -static const struct dev_pm_ops vfio_pci_core_pm_ops = { }; +static const struct dev_pm_ops vfio_pci_core_pm_ops = { + SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend, + vfio_pci_core_runtime_resume, + NULL) +}; int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) { @@ -371,6 +561,18 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) /* * This function can be invoked while the power state is non-D0. + * This non-D0 power state can be with or without runtime PM. + * vfio_pci_runtime_pm_exit() will internally increment the usage + * count corresponding to pm_runtime_put() called during low power + * feature entry and then pm_runtime_resume() will wake up the device, + * if the device has already gone into the suspended state. Otherwise, + * the vfio_pci_set_power_state() will change the device power state + * to D0. + */ + vfio_pci_runtime_pm_exit(vdev); + pm_runtime_resume(&pdev->dev); + + /* * This function calls __pci_reset_function_locked() which internally * can use pci_pm_reset() for the function reset. pci_pm_reset() will * fail if the power state is non-D0. Also, for the devices which @@ -645,10 +847,10 @@ static int msix_mmappable_cap(struct vfio_pci_core_device *vdev, return vfio_info_add_capability(caps, &header, sizeof(header)); } -int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, - unsigned int type, unsigned int subtype, - const struct vfio_pci_regops *ops, - size_t size, u32 flags, void *data) +int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data) { struct vfio_pci_region *region; @@ -670,508 +872,532 @@ int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, return 0; } -EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region); +EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region); -long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) +static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_device_info __user *arg) { - struct vfio_pci_core_device *vdev = - container_of(core_vdev, struct vfio_pci_core_device, vdev); - unsigned long minsz; - - if (cmd == VFIO_DEVICE_GET_INFO) { - struct vfio_device_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz; - int ret; - - minsz = offsetofend(struct vfio_device_info, num_irqs); + unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); + struct vfio_device_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + unsigned long capsz; + int ret; - /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); + /* For backward compatibility, cannot require this */ + capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz < minsz) + return -EINVAL; - if (info.argsz >= capsz) { - minsz = capsz; - info.cap_offset = 0; - } + if (info.argsz >= capsz) { + minsz = capsz; + info.cap_offset = 0; + } - info.flags = VFIO_DEVICE_FLAGS_PCI; + info.flags = VFIO_DEVICE_FLAGS_PCI; - if (vdev->reset_works) - info.flags |= VFIO_DEVICE_FLAGS_RESET; + if (vdev->reset_works) + info.flags |= VFIO_DEVICE_FLAGS_RESET; - info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; - info.num_irqs = VFIO_PCI_NUM_IRQS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; + info.num_irqs = VFIO_PCI_NUM_IRQS; - ret = vfio_pci_info_zdev_add_caps(vdev, &caps); - if (ret && ret != -ENODEV) { - pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); - return ret; - } + ret = vfio_pci_info_zdev_add_caps(vdev, &caps); + if (ret && ret != -ENODEV) { + pci_warn(vdev->pdev, + "Failed to setup zPCI info capabilities\n"); + return ret; + } - if (caps.size) { - info.flags |= VFIO_DEVICE_FLAGS_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); + if (caps.size) { + info.flags |= VFIO_DEVICE_FLAGS_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; } - - kfree(caps.buf); + info.cap_offset = sizeof(*arg); } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + kfree(caps.buf); + } - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - int i, ret; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} - minsz = offsetofend(struct vfio_region_info, offset); +static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct pci_dev *pdev = vdev->pdev; + struct vfio_region_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + int i, ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz < minsz) + return -EINVAL; - switch (info.index) { - case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = pdev->cfg_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = pci_resource_len(pdev, info.index); + if (!info.size) { + info.flags = 0; break; - case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; - break; - } + } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); - if (ret) - return ret; - } + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info.index]) { + info.flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info.index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, &caps); + if (ret) + return ret; } + } - break; - case VFIO_PCI_ROM_REGION_INDEX: - { - void __iomem *io; - size_t size; - u16 cmd; - - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; + break; + case VFIO_PCI_ROM_REGION_INDEX: { + void __iomem *io; + size_t size; + u16 cmd; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.flags = 0; + + /* Report the BAR size, not the ROM size */ + info.size = pci_resource_len(pdev, info.index); + if (!info.size) { + /* Shadow ROMs appear as PCI option ROMs */ + if (pdev->resource[PCI_ROM_RESOURCE].flags & + IORESOURCE_ROM_SHADOW) + info.size = 0x20000; + else + break; + } - /* Report the BAR size, not the ROM size */ - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - /* Shadow ROMs appear as PCI option ROMs */ - if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) - info.size = 0x20000; - else - break; - } + /* + * Is it really there? Enable memory decode for implicit access + * in pci_map_rom(). + */ + cmd = vfio_pci_memory_lock_and_enable(vdev); + io = pci_map_rom(pdev, &size); + if (io) { + info.flags = VFIO_REGION_INFO_FLAG_READ; + pci_unmap_rom(pdev, io); + } else { + info.size = 0; + } + vfio_pci_memory_unlock_and_restore(vdev, cmd); - /* - * Is it really there? Enable memory decode for - * implicit access in pci_map_rom(). - */ - cmd = vfio_pci_memory_lock_and_enable(vdev); - io = pci_map_rom(pdev, &size); - if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; - pci_unmap_rom(pdev, io); - } else { - info.size = 0; - } - vfio_pci_memory_unlock_and_restore(vdev, cmd); + break; + } + case VFIO_PCI_VGA_REGION_INDEX: + if (!vdev->has_vga) + return -EINVAL; - break; - } - case VFIO_PCI_VGA_REGION_INDEX: - if (!vdev->has_vga) - return -EINVAL; + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0xc0000; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + break; + default: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 + }; - break; - default: - { - struct vfio_region_info_cap_type cap_type = { - .header.id = VFIO_REGION_INFO_CAP_TYPE, - .header.version = 1 }; + if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + return -EINVAL; + info.index = array_index_nospec( + info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); - if (info.index >= - VFIO_PCI_NUM_REGIONS + vdev->num_regions) - return -EINVAL; - info.index = array_index_nospec(info.index, - VFIO_PCI_NUM_REGIONS + - vdev->num_regions); + i = info.index - VFIO_PCI_NUM_REGIONS; - i = info.index - VFIO_PCI_NUM_REGIONS; + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vdev->region[i].size; + info.flags = vdev->region[i].flags; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; + cap_type.type = vdev->region[i].type; + cap_type.subtype = vdev->region[i].subtype; - cap_type.type = vdev->region[i].type; - cap_type.subtype = vdev->region[i].subtype; + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; - ret = vfio_info_add_capability(&caps, &cap_type.header, - sizeof(cap_type)); + if (vdev->region[i].ops->add_capability) { + ret = vdev->region[i].ops->add_capability( + vdev, &vdev->region[i], &caps); if (ret) return ret; - - if (vdev->region[i].ops->add_capability) { - ret = vdev->region[i].ops->add_capability(vdev, - &vdev->region[i], &caps); - if (ret) - return ret; - } - } } + } + } - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; } - - kfree(caps.buf); + info.cap_offset = sizeof(*arg); } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + kfree(caps.buf); + } - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { - struct vfio_irq_info info; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} - minsz = offsetofend(struct vfio_irq_info, count); +static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, + struct vfio_irq_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_irq_info, count); + struct vfio_irq_info info; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; - if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) - return -EINVAL; + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) + return -EINVAL; - switch (info.index) { - case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: - case VFIO_PCI_REQ_IRQ_INDEX: + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + break; + case VFIO_PCI_ERR_IRQ_INDEX: + if (pci_is_pcie(vdev->pdev)) break; - case VFIO_PCI_ERR_IRQ_INDEX: - if (pci_is_pcie(vdev->pdev)) - break; - fallthrough; - default: - return -EINVAL; - } - - info.flags = VFIO_IRQ_INFO_EVENTFD; - - info.count = vfio_pci_get_irq_count(vdev, info.index); + fallthrough; + default: + return -EINVAL; + } - if (info.index == VFIO_PCI_INTX_IRQ_INDEX) - info.flags |= (VFIO_IRQ_INFO_MASKABLE | - VFIO_IRQ_INFO_AUTOMASKED); - else - info.flags |= VFIO_IRQ_INFO_NORESIZE; + info.flags = VFIO_IRQ_INFO_EVENTFD; - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + info.count = vfio_pci_get_irq_count(vdev, info.index); - } else if (cmd == VFIO_DEVICE_SET_IRQS) { - struct vfio_irq_set hdr; - u8 *data = NULL; - int max, ret = 0; - size_t data_size = 0; + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) + info.flags |= + (VFIO_IRQ_INFO_MASKABLE | VFIO_IRQ_INFO_AUTOMASKED); + else + info.flags |= VFIO_IRQ_INFO_NORESIZE; - minsz = offsetofend(struct vfio_irq_set, count); + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; +static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, + struct vfio_irq_set __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_irq_set, count); + struct vfio_irq_set hdr; + u8 *data = NULL; + int max, ret = 0; + size_t data_size = 0; - max = vfio_pci_get_irq_count(vdev, hdr.index); + if (copy_from_user(&hdr, arg, minsz)) + return -EFAULT; - ret = vfio_set_irqs_validate_and_prepare(&hdr, max, - VFIO_PCI_NUM_IRQS, &data_size); - if (ret) - return ret; + max = vfio_pci_get_irq_count(vdev, hdr.index); - if (data_size) { - data = memdup_user((void __user *)(arg + minsz), - data_size); - if (IS_ERR(data)) - return PTR_ERR(data); - } + ret = vfio_set_irqs_validate_and_prepare(&hdr, max, VFIO_PCI_NUM_IRQS, + &data_size); + if (ret) + return ret; - mutex_lock(&vdev->igate); + if (data_size) { + data = memdup_user(&arg->data, data_size); + if (IS_ERR(data)) + return PTR_ERR(data); + } - ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, - hdr.start, hdr.count, data); + mutex_lock(&vdev->igate); - mutex_unlock(&vdev->igate); - kfree(data); + ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, hdr.start, + hdr.count, data); - return ret; + mutex_unlock(&vdev->igate); + kfree(data); - } else if (cmd == VFIO_DEVICE_RESET) { - int ret; + return ret; +} - if (!vdev->reset_works) - return -EINVAL; +static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + int ret; - vfio_pci_zap_and_down_write_memory_lock(vdev); + if (!vdev->reset_works) + return -EINVAL; - /* - * This function can be invoked while the power state is non-D0. - * If pci_try_reset_function() has been called while the power - * state is non-D0, then pci_try_reset_function() will - * internally set the power state to D0 without vfio driver - * involvement. For the devices which have NoSoftRst-, the - * reset function can cause the PCI config space reset without - * restoring the original state (saved locally in - * 'vdev->pm_save'). - */ - vfio_pci_set_power_state(vdev, PCI_D0); + vfio_pci_zap_and_down_write_memory_lock(vdev); - ret = pci_try_reset_function(vdev->pdev); - up_write(&vdev->memory_lock); + /* + * This function can be invoked while the power state is non-D0. If + * pci_try_reset_function() has been called while the power state is + * non-D0, then pci_try_reset_function() will internally set the power + * state to D0 without vfio driver involvement. For the devices which + * have NoSoftRst-, the reset function can cause the PCI config space + * reset without restoring the original state (saved locally in + * 'vdev->pm_save'). + */ + vfio_pci_set_power_state(vdev, PCI_D0); - return ret; + ret = pci_try_reset_function(vdev->pdev); + up_write(&vdev->memory_lock); - } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { - struct vfio_pci_hot_reset_info hdr; - struct vfio_pci_fill_info fill = { 0 }; - struct vfio_pci_dependent_device *devices = NULL; - bool slot = false; - int ret = 0; + return ret; +} - minsz = offsetofend(struct vfio_pci_hot_reset_info, count); +static int vfio_pci_ioctl_get_pci_hot_reset_info( + struct vfio_pci_core_device *vdev, + struct vfio_pci_hot_reset_info __user *arg) +{ + unsigned long minsz = + offsetofend(struct vfio_pci_hot_reset_info, count); + struct vfio_pci_hot_reset_info hdr; + struct vfio_pci_fill_info fill = { 0 }; + struct vfio_pci_dependent_device *devices = NULL; + bool slot = false; + int ret = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&hdr, arg, minsz)) + return -EFAULT; - if (hdr.argsz < minsz) - return -EINVAL; + if (hdr.argsz < minsz) + return -EINVAL; - hdr.flags = 0; + hdr.flags = 0; - /* Can we do a slot or bus reset or neither? */ - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return -ENODEV; + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; - /* How many devices are affected? */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_count_devs, - &fill.max, slot); - if (ret) - return ret; + /* How many devices are affected? */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &fill.max, slot); + if (ret) + return ret; - WARN_ON(!fill.max); /* Should always be at least one */ + WARN_ON(!fill.max); /* Should always be at least one */ - /* - * If there's enough space, fill it now, otherwise return - * -ENOSPC and the number of devices affected. - */ - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { - ret = -ENOSPC; - hdr.count = fill.max; - goto reset_info_exit; - } + /* + * If there's enough space, fill it now, otherwise return -ENOSPC and + * the number of devices affected. + */ + if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { + ret = -ENOSPC; + hdr.count = fill.max; + goto reset_info_exit; + } - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; + devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; - fill.devices = devices; + fill.devices = devices; - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_fill_devs, - &fill, slot); + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, + &fill, slot); - /* - * If a device was removed between counting and filling, - * we may come up short of fill.max. If a device was - * added, we'll have a return of -EAGAIN above. - */ - if (!ret) - hdr.count = fill.cur; + /* + * If a device was removed between counting and filling, we may come up + * short of fill.max. If a device was added, we'll have a return of + * -EAGAIN above. + */ + if (!ret) + hdr.count = fill.cur; reset_info_exit: - if (copy_to_user((void __user *)arg, &hdr, minsz)) + if (copy_to_user(arg, &hdr, minsz)) + ret = -EFAULT; + + if (!ret) { + if (copy_to_user(&arg->devices, devices, + hdr.count * sizeof(*devices))) ret = -EFAULT; + } - if (!ret) { - if (copy_to_user((void __user *)(arg + minsz), devices, - hdr.count * sizeof(*devices))) - ret = -EFAULT; - } + kfree(devices); + return ret; +} - kfree(devices); - return ret; +static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, + struct vfio_pci_hot_reset __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); + struct vfio_pci_hot_reset hdr; + int32_t *group_fds; + struct file **files; + struct vfio_pci_group_info info; + bool slot = false; + int file_idx, count = 0, ret = 0; - } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { - struct vfio_pci_hot_reset hdr; - int32_t *group_fds; - struct file **files; - struct vfio_pci_group_info info; - bool slot = false; - int file_idx, count = 0, ret = 0; + if (copy_from_user(&hdr, arg, minsz)) + return -EFAULT; - minsz = offsetofend(struct vfio_pci_hot_reset, count); + if (hdr.argsz < minsz || hdr.flags) + return -EINVAL; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; - if (hdr.argsz < minsz || hdr.flags) - return -EINVAL; + /* + * We can't let userspace give us an arbitrarily large buffer to copy, + * so verify how many we think there could be. Note groups can have + * multiple devices so one group per device is the max. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &count, slot); + if (ret) + return ret; - /* Can we do a slot or bus reset or neither? */ - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return -ENODEV; + /* Somewhere between 1 and count is OK */ + if (!hdr.count || hdr.count > count) + return -EINVAL; - /* - * We can't let userspace give us an arbitrarily large - * buffer to copy, so verify how many we think there - * could be. Note groups can have multiple devices so - * one group per device is the max. - */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_count_devs, - &count, slot); - if (ret) - return ret; + group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); + files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); + if (!group_fds || !files) { + kfree(group_fds); + kfree(files); + return -ENOMEM; + } - /* Somewhere between 1 and count is OK */ - if (!hdr.count || hdr.count > count) - return -EINVAL; + if (copy_from_user(group_fds, arg->group_fds, + hdr.count * sizeof(*group_fds))) { + kfree(group_fds); + kfree(files); + return -EFAULT; + } - group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); - files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); - if (!group_fds || !files) { - kfree(group_fds); - kfree(files); - return -ENOMEM; - } + /* + * For each group_fd, get the group through the vfio external user + * interface and store the group and iommu ID. This ensures the group + * is held across the reset. + */ + for (file_idx = 0; file_idx < hdr.count; file_idx++) { + struct file *file = fget(group_fds[file_idx]); - if (copy_from_user(group_fds, (void __user *)(arg + minsz), - hdr.count * sizeof(*group_fds))) { - kfree(group_fds); - kfree(files); - return -EFAULT; + if (!file) { + ret = -EBADF; + break; } - /* - * For each group_fd, get the group through the vfio external - * user interface and store the group and iommu ID. This - * ensures the group is held across the reset. - */ - for (file_idx = 0; file_idx < hdr.count; file_idx++) { - struct file *file = fget(group_fds[file_idx]); - - if (!file) { - ret = -EBADF; - break; - } - - /* Ensure the FD is a vfio group FD.*/ - if (!vfio_file_iommu_group(file)) { - fput(file); - ret = -EINVAL; - break; - } - - files[file_idx] = file; + /* Ensure the FD is a vfio group FD.*/ + if (!vfio_file_is_group(file)) { + fput(file); + ret = -EINVAL; + break; } - kfree(group_fds); + files[file_idx] = file; + } - /* release reference to groups on error */ - if (ret) - goto hot_reset_release; + kfree(group_fds); + + /* release reference to groups on error */ + if (ret) + goto hot_reset_release; - info.count = hdr.count; - info.files = files; + info.count = hdr.count; + info.files = files; - ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); + ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); hot_reset_release: - for (file_idx--; file_idx >= 0; file_idx--) - fput(files[file_idx]); + for (file_idx--; file_idx >= 0; file_idx--) + fput(files[file_idx]); - kfree(files); - return ret; - } else if (cmd == VFIO_DEVICE_IOEVENTFD) { - struct vfio_device_ioeventfd ioeventfd; - int count; + kfree(files); + return ret; +} - minsz = offsetofend(struct vfio_device_ioeventfd, fd); +static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, + struct vfio_device_ioeventfd __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd); + struct vfio_device_ioeventfd ioeventfd; + int count; - if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&ioeventfd, arg, minsz)) + return -EFAULT; - if (ioeventfd.argsz < minsz) - return -EINVAL; + if (ioeventfd.argsz < minsz) + return -EINVAL; - if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) - return -EINVAL; + if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) + return -EINVAL; - count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; + count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; - if (hweight8(count) != 1 || ioeventfd.fd < -1) - return -EINVAL; + if (hweight8(count) != 1 || ioeventfd.fd < -1) + return -EINVAL; - return vfio_pci_ioeventfd(vdev, ioeventfd.offset, - ioeventfd.data, count, ioeventfd.fd); + return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count, + ioeventfd.fd); +} + +long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + return vfio_pci_ioctl_get_info(vdev, uarg); + case VFIO_DEVICE_GET_IRQ_INFO: + return vfio_pci_ioctl_get_irq_info(vdev, uarg); + case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: + return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); + case VFIO_DEVICE_GET_REGION_INFO: + return vfio_pci_ioctl_get_region_info(vdev, uarg); + case VFIO_DEVICE_IOEVENTFD: + return vfio_pci_ioctl_ioeventfd(vdev, uarg); + case VFIO_DEVICE_PCI_HOT_RESET: + return vfio_pci_ioctl_pci_hot_reset(vdev, uarg); + case VFIO_DEVICE_RESET: + return vfio_pci_ioctl_reset(vdev, uarg); + case VFIO_DEVICE_SET_IRQS: + return vfio_pci_ioctl_set_irqs(vdev, uarg); + default: + return -ENOTTY; } - return -ENOTTY; } EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, - void __user *arg, size_t argsz) + uuid_t __user *arg, size_t argsz) { struct vfio_pci_core_device *vdev = container_of(device, struct vfio_pci_core_device, vdev); @@ -1202,6 +1428,13 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) { switch (flags & VFIO_DEVICE_FEATURE_MASK) { + case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: + return vfio_pci_core_pm_entry(device, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP: + return vfio_pci_core_pm_entry_with_wakeup(device, flags, + arg, argsz); + case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: + return vfio_pci_core_pm_exit(device, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: return vfio_pci_core_feature_token(device, flags, arg, argsz); default: @@ -1214,31 +1447,47 @@ static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int ret; if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; + ret = pm_runtime_resume_and_get(&vdev->pdev->dev); + if (ret) { + pci_info_ratelimited(vdev->pdev, "runtime resume failed %d\n", + ret); + return -EIO; + } + switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: - return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); + ret = vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); + break; case VFIO_PCI_ROM_REGION_INDEX: if (iswrite) - return -EINVAL; - return vfio_pci_bar_rw(vdev, buf, count, ppos, false); + ret = -EINVAL; + else + ret = vfio_pci_bar_rw(vdev, buf, count, ppos, false); + break; case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); + ret = vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); + break; case VFIO_PCI_VGA_REGION_INDEX: - return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); + ret = vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); + break; + default: index -= VFIO_PCI_NUM_REGIONS; - return vdev->region[index].ops->rw(vdev, buf, + ret = vdev->region[index].ops->rw(vdev, buf, count, ppos, iswrite); + break; } - return -EINVAL; + pm_runtime_put(&vdev->pdev->dev); + return ret; } ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, @@ -1433,7 +1682,11 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) mutex_lock(&vdev->vma_lock); down_read(&vdev->memory_lock); - if (!__vfio_pci_memory_enabled(vdev)) { + /* + * Memory region cannot be accessed if the low power feature is engaged + * or memory access is disabled. + */ + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { ret = VM_FAULT_SIGBUS; goto up_out; } @@ -1825,12 +2078,12 @@ static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev) VGA_RSRC_LEGACY_MEM); } -void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, - struct pci_dev *pdev, - const struct vfio_device_ops *vfio_pci_ops) +int vfio_pci_core_init_dev(struct vfio_device *core_vdev) { - vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops); - vdev->pdev = pdev; + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + + vdev->pdev = to_pci_dev(core_vdev->dev); vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); @@ -1841,19 +2094,24 @@ void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, INIT_LIST_HEAD(&vdev->vma_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); init_rwsem(&vdev->memory_lock); + + return 0; } -EXPORT_SYMBOL_GPL(vfio_pci_core_init_device); +EXPORT_SYMBOL_GPL(vfio_pci_core_init_dev); -void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev) +void vfio_pci_core_release_dev(struct vfio_device *core_vdev) { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + mutex_destroy(&vdev->igate); mutex_destroy(&vdev->ioeventfds_lock); mutex_destroy(&vdev->vma_lock); - vfio_uninit_group_dev(&vdev->vdev); kfree(vdev->region); kfree(vdev->pm_save); + vfio_free_device(core_vdev); } -EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); +EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) { @@ -1875,6 +2133,11 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) return -EINVAL; } + if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start && + vdev->vdev.log_ops->log_stop && + vdev->vdev.log_ops->log_read_and_clear)) + return -EINVAL; + /* * Prevent binding to PFs with VFs enabled, the VFs might be in use * by the host or other users. We cannot capture the VFs if they @@ -2148,6 +2411,15 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, goto err_unlock; } + /* + * Some of the devices in the dev_set can be in the runtime suspended + * state. Increment the usage count for all the devices in the dev_set + * before reset and decrement the same after reset. + */ + ret = vfio_pci_dev_set_pm_runtime_get(dev_set); + if (ret) + goto err_unlock; + list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { /* * Test whether all the affected devices are contained by the @@ -2203,6 +2475,9 @@ err_undo: else mutex_unlock(&cur->vma_lock); } + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) + pm_runtime_put(&cur->pdev->dev); err_unlock: mutex_unlock(&dev_set->lock); return ret; |