// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/sizes.h>
#include <linux/vfio_pci_core.h>

/*
 * The device memory usable to the workloads running in the VM is cached
 * and exposed as a 64b device BAR (comprising the BAR4 and BAR5 regions)
 * to the VM and is represented as usemem.
 * Moreover, the VM GPU device driver needs a non-cacheable region to
 * support the MIG feature. This region is also exposed as a 64b BAR
 * (comprising the BAR2 and BAR3 regions) and represented as resmem.
 */
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX

/* Memory size expected as non cached and reserved by the VM driver */
#define RESMEM_SIZE SZ_1G

/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE SZ_512M

/*
 * The state of the two device memory regions - resmem and usemem - is
 * saved as struct mem_region.
 */
struct mem_region {
	phys_addr_t memphys;	/* Base physical address of the region */
	size_t memlength;	/* Region size */
	size_t bar_size;	/* Reported region BAR size */
	__le64 bar_val;		/* Emulated BAR offset registers */
	union {
		void *memaddr;
		void __iomem *ioaddr;
	};			/* Base virtual address of the region */
};

struct nvgrace_gpu_pci_core_device {
	struct vfio_pci_core_device core_device;
	/* Cached and usable memory for the VM. */
	struct mem_region usemem;
	/* Non cached memory carved out from the end of device memory */
	struct mem_region resmem;
	/* Lock to control device memory kernel mapping */
	struct mutex remap_lock;
};

static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	nvdev->resmem.bar_val = 0;
	nvdev->usemem.bar_val = 0;
}
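/*
 * Illustration of the standard vfio-pci region offset encoding this driver
 * relies on (the macros come from <linux/vfio_pci_core.h>; nothing below is
 * specific to this file): userspace addresses a region as
 *
 *   offset = VFIO_PCI_INDEX_TO_OFFSET(index)
 *
 * so, for example, a read at VFIO_PCI_INDEX_TO_OFFSET(USEMEM_REGION_INDEX)
 * + 0x10 targets byte 0x10 of the fake usemem BAR. The handlers later in
 * this file recover the index with VFIO_PCI_OFFSET_TO_INDEX() (or the
 * equivalent vm_pgoff shift in mmap) and use nvgrace_gpu_memregion() to
 * pick the matching mem_region.
 */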
/* Choose the structure corresponding to the fake BAR with a given index. */
static struct mem_region *
nvgrace_gpu_memregion(int index,
		      struct nvgrace_gpu_pci_core_device *nvdev)
{
	if (index == USEMEM_REGION_INDEX)
		return &nvdev->usemem;

	if (index == RESMEM_REGION_INDEX)
		return &nvdev->resmem;

	return NULL;
}

static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (nvdev->usemem.memlength) {
		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
		mutex_init(&nvdev->remap_lock);
	}

	vfio_pci_core_finish_enable(vdev);

	return 0;
}

static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	/* Unmap the mapping to the device memory cached region */
	if (nvdev->usemem.memaddr) {
		memunmap(nvdev->usemem.memaddr);
		nvdev->usemem.memaddr = NULL;
	}

	/* Unmap the mapping to the device memory non-cached region */
	if (nvdev->resmem.ioaddr) {
		iounmap(nvdev->resmem.ioaddr);
		nvdev->resmem.ioaddr = NULL;
	}

	mutex_destroy(&nvdev->remap_lock);

	vfio_pci_core_close_device(core_vdev);
}

static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
			    struct vm_area_struct *vma)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	struct mem_region *memregion;
	unsigned long start_pfn;
	u64 req_len, pgoff, end;
	unsigned int index;
	int ret = 0;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return vfio_pci_core_mmap(core_vdev, vma);

	/*
	 * Request to mmap the BAR. Map to the CPU accessible memory on the
	 * GPU using the memory information gathered from the system ACPI
	 * tables.
	 */
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);

	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
	    check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
	    check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
		return -EOVERFLOW;

	/*
	 * Check that the mapping request does not go beyond the available
	 * device memory size.
	 */
	if (end > memregion->memlength)
		return -EINVAL;

	/*
	 * The carved out region of the device memory needs the NORMAL_NC
	 * property. Communicate as such to the hypervisor.
	 */
	if (index == RESMEM_REGION_INDEX)
		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

	/*
	 * Perform a PFN map to the memory and back the device BAR by the
	 * GPU memory.
	 *
	 * The available GPU memory size may not be power-of-2 aligned. The
	 * remainder is only backed by vfio_device_ops read/write handlers.
	 *
	 * During device reset, the GPU is safely disconnected from the CPU
	 * and access to the BAR will be immediately returned, preventing a
	 * machine check.
	 */
	ret = remap_pfn_range(vma, vma->vm_start, start_pfn, req_len,
			      vma->vm_page_prot);
	if (ret)
		return ret;

	vma->vm_pgoff = start_pfn;

	return 0;
}

static long
nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
				  unsigned long arg)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct vfio_region_info_cap_sparse_mmap *sparse;
	struct vfio_region_info info;
	struct mem_region *memregion;
	u32 size;
	int ret;

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	/*
	 * Request to determine the BAR region information. Send the
	 * GPU memory information.
	 */
	memregion = nvgrace_gpu_memregion(info.index, nvdev);
	if (!memregion)
		return vfio_pci_core_ioctl(core_vdev,
					   VFIO_DEVICE_GET_REGION_INFO, arg);

	size = struct_size(sparse, areas, 1);

	/*
	 * Setup for sparse mapping for the device memory. Only the
	 * available device memory on the hardware is shown as a
	 * mappable region.
	 */
	sparse = kzalloc(size, GFP_KERNEL);
	if (!sparse)
		return -ENOMEM;

	sparse->nr_areas = 1;
	sparse->areas[0].offset = 0;
	sparse->areas[0].size = memregion->memlength;
	sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
	sparse->header.version = 1;

	ret = vfio_info_add_capability(&caps, &sparse->header, size);
	kfree(sparse);
	if (ret)
		return ret;

	info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
	/*
	 * The region memory size may not be power-of-2 aligned. Since the
	 * memory is exposed as a BAR, round the size up to the next
	 * power-of-2.
	 */
	info.size = memregion->bar_size;
	info.flags = VFIO_REGION_INFO_FLAG_READ |
		     VFIO_REGION_INFO_FLAG_WRITE |
		     VFIO_REGION_INFO_FLAG_MMAP;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user((void __user *)arg + sizeof(info),
					 caps.buf, caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info.cap_offset = sizeof(info);
		}
		kfree(caps.buf);
	}
	return copy_to_user((void __user *)arg, &info, minsz) ?
			    -EFAULT : 0;
}

static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
			      unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case VFIO_DEVICE_GET_REGION_INFO:
		return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg);
	case VFIO_DEVICE_IOEVENTFD:
		return -ENOTTY;
	case VFIO_DEVICE_RESET:
		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
		fallthrough;
	default:
		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
	}
}

static __le64
nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64)
{
	u64 tmp_val;

	tmp_val = le64_to_cpu(val64);
	tmp_val &= ~(bar_size - 1);
	tmp_val |= flags;

	return cpu_to_le64(tmp_val);
}
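/*
 * Illustration with made-up numbers of what the masking above provides: for
 * a bar_size of 2 GiB (0x80000000), a guest sizing the BAR by writing ~0
 * reads back
 *
 *   (~0ULL & ~(0x80000000 - 1)) | PCI_BASE_ADDRESS_MEM_TYPE_64 |
 *   PCI_BASE_ADDRESS_MEM_PREFETCH == 0xffffffff8000000c
 *
 * from which it derives the 2 GiB size in the usual way, while later base
 * address writes stored in bar_val are echoed back with the same low flag
 * bits set.
 */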
/*
 * Both the usable (usemem) and the reserved (resmem) device memory regions
 * are exposed as 64b fake device BARs in the VM. These fake BARs must
 * respond to accesses at their respective PCI config space offsets.
 *
 * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3.
 * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5.
 */
static ssize_t
nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev,
			    char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion = NULL;
	__le64 val64;
	size_t register_offset;
	loff_t copy_offset;
	size_t copy_count;
	int ret;

	ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
	if (ret < 0)
		return ret;

	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
						sizeof(val64),
						&copy_offset, &copy_count,
						&register_offset))
		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
	else if (vfio_pci_core_range_intersect_range(pos, count,
						     PCI_BASE_ADDRESS_4,
						     sizeof(val64),
						     &copy_offset, &copy_count,
						     &register_offset))
		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

	if (memregion) {
		val64 = nvgrace_gpu_get_read_value(memregion->bar_size,
						   PCI_BASE_ADDRESS_MEM_TYPE_64 |
						   PCI_BASE_ADDRESS_MEM_PREFETCH,
						   memregion->bar_val);
		if (copy_to_user(buf + copy_offset,
				 (void *)&val64 + register_offset, copy_count)) {
			/*
			 * The position has been incremented in
			 * vfio_pci_core_read. Reset the offset back to the
			 * starting position.
			 */
			*ppos -= count;
			return -EFAULT;
		}
	}

	return count;
}

static ssize_t
nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
			     const char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion = NULL;
	size_t register_offset;
	loff_t copy_offset;
	size_t copy_count;

	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
						sizeof(u64),
						&copy_offset, &copy_count,
						&register_offset))
		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
	else if (vfio_pci_core_range_intersect_range(pos, count,
						     PCI_BASE_ADDRESS_4,
						     sizeof(u64),
						     &copy_offset, &copy_count,
						     &register_offset))
		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

	if (memregion) {
		if (copy_from_user((void *)&memregion->bar_val + register_offset,
				   buf + copy_offset, copy_count))
			return -EFAULT;
		*ppos += copy_count;
		return copy_count;
	}

	return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

/*
 * Ad hoc map the device memory in the module kernel VA space. Primarily
 * needed because vfio does not require the userspace driver to perform
 * accesses only through mmaps of the vfio-pci BAR regions; such accesses
 * must also be supported through the vfio_device_ops read/write
 * implementations.
 *
 * The usemem region is cacheable memory and hence is memremapped.
 * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC).
 */
static int nvgrace_gpu_map_device_mem(int index,
				      struct nvgrace_gpu_pci_core_device *nvdev)
{
	struct mem_region *memregion;
	int ret = 0;

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return -EINVAL;

	mutex_lock(&nvdev->remap_lock);

	if (memregion->memaddr)
		goto unlock;

	if (index == USEMEM_REGION_INDEX)
		memregion->memaddr = memremap(memregion->memphys,
					      memregion->memlength,
					      MEMREMAP_WB);
	else
		memregion->ioaddr = ioremap_wc(memregion->memphys,
					       memregion->memlength);

	if (!memregion->memaddr)
		ret = -ENOMEM;

unlock:
	mutex_unlock(&nvdev->remap_lock);

	return ret;
}
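/*
 * Note: memaddr and ioaddr alias the same storage in struct mem_region, so
 * the !memregion->memaddr checks above cover the result of both memremap()
 * and ioremap_wc().
 */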
/*
 * Read the data from the device memory (mapped either through ioremap
 * or memremap) into the user buffer.
 */
static int
nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
			 char __user *buf, size_t mem_count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret;

	if (!mem_count)
		return 0;

	/*
	 * Handle read on the BAR regions. Map to the target device memory
	 * physical address and copy to the requested read buffer.
	 */
	ret = nvgrace_gpu_map_device_mem(index, nvdev);
	if (ret)
		return ret;

	if (index == USEMEM_REGION_INDEX) {
		if (copy_to_user(buf,
				 (u8 *)nvdev->usemem.memaddr + offset,
				 mem_count))
			ret = -EFAULT;
	} else {
		/*
		 * The hardware ensures that the system does not crash when
		 * the device memory is accessed with the memory enable
		 * turned off. It synthesizes ~0 on such read. So there is
		 * no need to check or support the disablement/enablement of
		 * BAR through PCI_COMMAND config space register. Pass
		 * test_mem flag as false.
		 */
		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
					     nvdev->resmem.ioaddr,
					     buf, offset, mem_count,
					     0, 0, false);
	}

	return ret;
}

/*
 * Read count bytes from the device memory at an offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes
 * the size to a power-of-2 (reported) when exposing it to a user space
 * driver.
 *
 * Reads starting beyond the reported size generate -EINVAL; reads extending
 * beyond the actual device size are filled with ~0; reads extending beyond
 * the reported size are truncated.
 */
static ssize_t
nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
		     char __user *buf, size_t count, loff_t *ppos)
{
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct mem_region *memregion;
	size_t mem_count, i;
	u8 val = 0xFF;
	int ret;

	/* No need to do NULL check as caller does. */
	memregion = nvgrace_gpu_memregion(index, nvdev);

	if (offset >= memregion->bar_size)
		return -EINVAL;

	/* Clip short the read request beyond reported BAR size */
	count = min(count, memregion->bar_size - (size_t)offset);

	/*
	 * Determine how many bytes to be actually read from the device memory.
	 * Read requests beyond the actual device memory size are filled with
	 * ~0, while those beyond the reported size are skipped.
	 */
	if (offset >= memregion->memlength)
		mem_count = 0;
	else
		mem_count = min(count, memregion->memlength - (size_t)offset);

	ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
	if (ret)
		return ret;

	/*
	 * Only the device memory present on the hardware is mapped, which may
	 * not be power-of-2 aligned. A read to an offset beyond the device
	 * memory size is filled with ~0.
	 */
	for (i = mem_count; i < count; i++) {
		ret = put_user(val, (unsigned char __user *)(buf + i));
		if (ret)
			return ret;
	}

	*ppos += count;

	return count;
}

static ssize_t
nvgrace_gpu_read(struct vfio_device *core_vdev,
		 char __user *buf, size_t count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	if (nvgrace_gpu_memregion(index, nvdev))
		return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);

	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
		return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);

	return vfio_pci_core_read(core_vdev, buf, count, ppos);
}
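/*
 * Worked example with hypothetical sizes for the read path above: if
 * memlength were 0x50000000 (1.25 GiB), bar_size would be reported as
 * 0x80000000 (2 GiB). A 16-byte read at offset 0x4ffffff8 then returns 8
 * bytes of device memory followed by 8 bytes of ~0, a read starting at
 * 0x7ffffff0 returns only ~0, and a read starting at or beyond 0x80000000
 * fails with -EINVAL.
 */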
/*
 * Write the data to the device memory (mapped either through ioremap
 * or memremap) from the user buffer.
 */
static int
nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
			  const char __user *buf, size_t mem_count,
			  loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret;

	if (!mem_count)
		return 0;

	ret = nvgrace_gpu_map_device_mem(index, nvdev);
	if (ret)
		return ret;

	if (index == USEMEM_REGION_INDEX) {
		if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos,
				   buf, mem_count))
			return -EFAULT;
	} else {
		/*
		 * The hardware ensures that the system does not crash when
		 * the device memory is accessed with the memory enable
		 * turned off. It drops such writes. So there is no need to
		 * check or support the disablement/enablement of BAR
		 * through PCI_COMMAND config space register. Pass test_mem
		 * flag as false.
		 */
		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
					     nvdev->resmem.ioaddr,
					     (char __user *)buf, pos,
					     mem_count, 0, 0, true);
	}

	return ret;
}

/*
 * Write count bytes to the device memory at a given offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes the
 * size to a power-of-2 (reported) when exposing to a user space driver.
 *
 * Writes extending beyond the reported size are truncated; writes starting
 * beyond the reported size generate -EINVAL.
 */
static ssize_t
nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
		      size_t count, loff_t *ppos, const char __user *buf)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion;
	size_t mem_count;
	int ret = 0;

	/* No need to do NULL check as caller does. */
	memregion = nvgrace_gpu_memregion(index, nvdev);

	if (offset >= memregion->bar_size)
		return -EINVAL;

	/* Clip short the write request beyond reported BAR size */
	count = min(count, memregion->bar_size - (size_t)offset);

	/*
	 * Determine how many bytes to be actually written to the device memory.
	 * Do not write to the offset beyond available size.
	 */
	if (offset >= memregion->memlength)
		goto exitfn;

	/*
	 * Only the device memory present on the hardware is mapped, which may
	 * not be power-of-2 aligned. Drop access outside the available device
	 * memory on the hardware.
	 */
	mem_count = min(count, memregion->memlength - (size_t)offset);

	ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
	if (ret)
		return ret;

exitfn:
	*ppos += count;
	return count;
}

static ssize_t
nvgrace_gpu_write(struct vfio_device *core_vdev,
		  const char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);

	if (nvgrace_gpu_memregion(index, nvdev))
		return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);

	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
		return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);

	return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
	.name		= "nvgrace-gpu-vfio-pci",
	.init		= vfio_pci_core_init_dev,
	.release	= vfio_pci_core_release_dev,
	.open_device	= nvgrace_gpu_open_device,
	.close_device	= nvgrace_gpu_close_device,
	.ioctl		= nvgrace_gpu_ioctl,
	.device_feature	= vfio_pci_core_ioctl_feature,
	.read		= nvgrace_gpu_read,
	.write		= nvgrace_gpu_write,
	.mmap		= nvgrace_gpu_mmap,
	.request	= vfio_pci_core_request,
	.match		= vfio_pci_core_match,
	.bind_iommufd	= vfio_iommufd_physical_bind,
	.unbind_iommufd	= vfio_iommufd_physical_unbind,
	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
};

static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
	.name		= "nvgrace-gpu-vfio-pci-core",
	.init		= vfio_pci_core_init_dev,
	.release	= vfio_pci_core_release_dev,
	.open_device	= nvgrace_gpu_open_device,
	.close_device	= vfio_pci_core_close_device,
	.ioctl		= vfio_pci_core_ioctl,
	.device_feature	= vfio_pci_core_ioctl_feature,
	.read		= vfio_pci_core_read,
	.write		= vfio_pci_core_write,
	.mmap		= vfio_pci_core_mmap,
	.request	= vfio_pci_core_request,
	.match		= vfio_pci_core_match,
	.bind_iommufd	= vfio_iommufd_physical_bind,
	.unbind_iommufd	= vfio_iommufd_physical_unbind,
	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
};

static int
nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
				  u64 *pmemphys, u64 *pmemlength)
{
	int ret;

	/*
	 * The memory information is present in the system ACPI tables as DSD
	 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
	 */
	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa",
				       pmemphys);
	if (ret)
		return ret;

	if (*pmemphys > type_max(phys_addr_t))
		return -EOVERFLOW;

	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
				       pmemlength);
	if (ret)
		return ret;

	if (*pmemlength > type_max(size_t))
		return -EOVERFLOW;

	/*
	 * If the C2C link is not up due to an error, the coherent device
	 * memory size is returned as 0. Fail in such case.
	 */
	if (*pmemlength == 0)
		return -ENOMEM;

	return ret;
}
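/*
 * Illustrative carve-out with hypothetical numbers for the helper below: a
 * reported memlength of 0x17f0000000 (95.75 GiB) yields
 * usemem.memlength = round_down(0x17f0000000 - SZ_1G, SZ_512M) =
 * 0x17a0000000 (94.5 GiB), leaving resmem.memlength = 0x50000000 (1.25 GiB)
 * at the end of device memory; the reported BAR sizes then round up to
 * 128 GiB and 2 GiB respectively.
 */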
static int
nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
			      struct nvgrace_gpu_pci_core_device *nvdev,
			      u64 memphys, u64 memlength)
{
	int ret = 0;

	/*
	 * The VM GPU device driver needs a non-cacheable region to support
	 * the MIG feature. Since the device memory is mapped as NORMAL cached,
	 * carve out a region from the end with a different NORMAL_NC
	 * property (called reserved memory and represented as resmem). This
	 * region is then exposed as a 64b BAR (region 2 and 3) to the VM, while
	 * exposing the rest (termed usable memory and represented using usemem)
	 * as cacheable 64b BAR (region 4 and 5).
	 *
	 *               devmem (memlength)
	 * |-------------------------------------------------|
	 * |                                           |
	 * usemem.memphys                              resmem.memphys
	 */
	nvdev->usemem.memphys = memphys;

	/*
	 * The device memory exposed to the VM is added to the kernel by the
	 * VM driver module in chunks of memory block size. Only the usable
	 * memory (usemem) is added to the kernel for usage by the VM
	 * workloads. Make the usable memory size memblock aligned.
	 */
	if (check_sub_overflow(memlength, RESMEM_SIZE,
			       &nvdev->usemem.memlength)) {
		ret = -EOVERFLOW;
		goto done;
	}

	/*
	 * The USEMEM part of the device memory has to be MEMBLK_SIZE
	 * aligned. This is a hardwired ABI value between the GPU FW and
	 * VFIO driver. The VM device driver is also aware of it and makes
	 * use of the value for its calculation to determine USEMEM size.
	 */
	nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
					     MEMBLK_SIZE);
	if (nvdev->usemem.memlength == 0) {
		ret = -EINVAL;
		goto done;
	}

	if ((check_add_overflow(nvdev->usemem.memphys,
				nvdev->usemem.memlength,
				&nvdev->resmem.memphys)) ||
	    (check_sub_overflow(memlength, nvdev->usemem.memlength,
				&nvdev->resmem.memlength))) {
		ret = -EOVERFLOW;
		goto done;
	}

	/*
	 * The memory regions are exposed as BARs. Calculate and save
	 * the BAR size for them.
	 */
	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
done:
	return ret;
}

static int nvgrace_gpu_probe(struct pci_dev *pdev,
			     const struct pci_device_id *id)
{
	const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
	struct nvgrace_gpu_pci_core_device *nvdev;
	u64 memphys, memlength;
	int ret;

	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
	if (!ret)
		ops = &nvgrace_gpu_pci_ops;

	nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
				  &pdev->dev, ops);
	if (IS_ERR(nvdev))
		return PTR_ERR(nvdev);

	dev_set_drvdata(&pdev->dev, &nvdev->core_device);

	if (ops == &nvgrace_gpu_pci_ops) {
		/*
		 * Device memory properties are identified in the host ACPI
		 * table. Set the nvgrace_gpu_pci_core_device structure.
		 */
		ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
						    memphys, memlength);
		if (ret)
			goto out_put_vdev;
	}

	ret = vfio_pci_core_register_device(&nvdev->core_device);
	if (ret)
		goto out_put_vdev;

	return ret;

out_put_vdev:
	vfio_put_device(&nvdev->core_device.vdev);
	return ret;
}

static void nvgrace_gpu_remove(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	vfio_pci_core_unregister_device(core_device);
	vfio_put_device(&core_device->vdev);
}

static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
	/* GH200 120GB */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
	/* GH200 480GB */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
	{}
};

MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);

static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = nvgrace_gpu_vfio_pci_table,
	.probe = nvgrace_gpu_probe,
	.remove = nvgrace_gpu_remove,
	.err_handler = &vfio_pci_core_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(nvgrace_gpu_vfio_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>");
MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>");
MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");