From 68cbbc3a9d1fc231810b2490bca73b3b444ef542 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 26 Mar 2015 16:42:09 +1100 Subject: drivers/vfio: Support EEH error injection The patch adds one more EEH sub-command (VFIO_EEH_PE_INJECT_ERR) to inject the specified EEH error, which is represented by (struct vfio_eeh_pe_err), to the indicated PE for testing purpose. Signed-off-by: Gavin Shan Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_spapr_eeh.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c index 5fa42db769ee..38edeb4729a9 100644 --- a/drivers/vfio/vfio_spapr_eeh.c +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -85,6 +85,16 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, case VFIO_EEH_PE_CONFIGURE: ret = eeh_pe_configure(pe); break; + case VFIO_EEH_PE_INJECT_ERR: + minsz = offsetofend(struct vfio_eeh_pe_op, err.mask); + if (op.argsz < minsz) + return -EINVAL; + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + + ret = eeh_pe_inject_err(pe, op.err.type, op.err.func, + op.err.addr, op.err.mask); + break; default: ret = -EINVAL; } -- cgit v1.2.3 From 9b14a1ff8657d3ee844f8987482bc367a716848c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:34:58 +1000 Subject: vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver This moves page pinning (get_user_pages_fast()/put_page()) code out of the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs to as the platform code does not deal with page pinning. This makes iommu_take_ownership()/iommu_release_ownership() deal with the IOMMU table bitmap only. This removes page unpinning from iommu_take_ownership() as the actual TCE table might contain garbage and doing put_page() on it is undefined behaviour. Besides the last part, the rest of the patch is mechanical. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 4 -- arch/powerpc/kernel/iommu.c | 55 ------------------------- drivers/vfio/vfio_iommu_spapr_tce.c | 80 +++++++++++++++++++++++++++++++------ 3 files changed, 67 insertions(+), 72 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 8353c865db6f..e94a5e3c5a2f 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -194,10 +194,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long hwaddr, enum dma_data_direction direction); extern unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry); -extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages); -extern int iommu_put_tce_user_mode(struct iommu_table *tbl, - unsigned long entry, unsigned long tce); extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 73eb39a7a9e1..0019c80aa81d 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -986,30 +986,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) } EXPORT_SYMBOL_GPL(iommu_clear_tce); -int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages) -{ - unsigned long oldtce; - struct page *page; - - for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) - continue; - - page = pfn_to_page(oldtce >> PAGE_SHIFT); - WARN_ON(!page); - if (page) { - if (oldtce & TCE_PCI_WRITE) - SetPageDirty(page); - put_page(page); - } - } - - return 0; -} -EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); - /* * hwaddr is a kernel virtual address here (0xc... bazillion), * tce_build converts it to a physical address. @@ -1039,35 +1015,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, } EXPORT_SYMBOL_GPL(iommu_tce_build); -int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, - unsigned long tce) -{ - int ret; - struct page *page = NULL; - unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; - enum dma_data_direction direction = iommu_tce_direction(tce); - - ret = get_user_pages_fast(tce & PAGE_MASK, 1, - direction != DMA_TO_DEVICE, &page); - if (unlikely(ret != 1)) { - /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n", - tce, entry << tbl->it_page_shift, ret); */ - return -EFAULT; - } - hwaddr = (unsigned long) page_address(page) + offset; - - ret = iommu_tce_build(tbl, entry, hwaddr, direction); - if (ret) - put_page(page); - - if (ret < 0) - pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n", - __func__, entry << tbl->it_page_shift, tce, ret); - - return ret; -} -EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); - int iommu_take_ownership(struct iommu_table *tbl) { unsigned long sz = (tbl->it_size + 7) >> 3; @@ -1081,7 +1028,6 @@ int iommu_take_ownership(struct iommu_table *tbl) } memset(tbl->it_map, 0xff, sz); - iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); /* * Disable iommu bypass, otherwise the user can DMA to all of @@ -1099,7 +1045,6 @@ void iommu_release_ownership(struct iommu_table *tbl) { unsigned long sz = (tbl->it_size + 7) >> 3; - iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); memset(tbl->it_map, 0, sz); /* Restore bit#0 set by iommu_init_table() */ diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 730b4ef3e0cc..b95fa2b64680 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -147,6 +147,67 @@ static void tce_iommu_release(void *iommu_data) kfree(container); } +static int tce_iommu_clear(struct tce_container *container, + struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + unsigned long oldtce; + struct page *page; + + for ( ; pages; --pages, ++entry) { + oldtce = iommu_clear_tce(tbl, entry); + if (!oldtce) + continue; + + page = pfn_to_page(oldtce >> PAGE_SHIFT); + WARN_ON(!page); + if (page) { + if (oldtce & TCE_PCI_WRITE) + SetPageDirty(page); + put_page(page); + } + } + + return 0; +} + +static long tce_iommu_build(struct tce_container *container, + struct iommu_table *tbl, + unsigned long entry, unsigned long tce, unsigned long pages) +{ + long i, ret = 0; + struct page *page = NULL; + unsigned long hva; + enum dma_data_direction direction = iommu_tce_direction(tce); + + for (i = 0; i < pages; ++i) { + unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; + + ret = get_user_pages_fast(tce & PAGE_MASK, 1, + direction != DMA_TO_DEVICE, &page); + if (unlikely(ret != 1)) { + ret = -EFAULT; + break; + } + hva = (unsigned long) page_address(page) + offset; + + ret = iommu_tce_build(tbl, entry + i, hva, direction); + if (ret) { + put_page(page); + pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", + __func__, entry << tbl->it_page_shift, + tce, ret); + break; + } + tce += IOMMU_PAGE_SIZE_4K; + } + + if (ret) + tce_iommu_clear(container, tbl, entry, i); + + return ret; +} + static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -195,7 +256,7 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_IOMMU_MAP_DMA: { struct vfio_iommu_type1_dma_map param; struct iommu_table *tbl = container->tbl; - unsigned long tce, i; + unsigned long tce; if (!tbl) return -ENXIO; @@ -229,17 +290,9 @@ static long tce_iommu_ioctl(void *iommu_data, if (ret) return ret; - for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) { - ret = iommu_put_tce_user_mode(tbl, - (param.iova >> IOMMU_PAGE_SHIFT_4K) + i, - tce); - if (ret) - break; - tce += IOMMU_PAGE_SIZE_4K; - } - if (ret) - iommu_clear_tces_and_put_pages(tbl, - param.iova >> IOMMU_PAGE_SHIFT_4K, i); + ret = tce_iommu_build(container, tbl, + param.iova >> IOMMU_PAGE_SHIFT_4K, + tce, param.size >> IOMMU_PAGE_SHIFT_4K); iommu_flush_tce(tbl); @@ -273,7 +326,7 @@ static long tce_iommu_ioctl(void *iommu_data, if (ret) return ret; - ret = iommu_clear_tces_and_put_pages(tbl, + ret = tce_iommu_clear(container, tbl, param.iova >> IOMMU_PAGE_SHIFT_4K, param.size >> IOMMU_PAGE_SHIFT_4K); iommu_flush_tce(tbl); @@ -357,6 +410,7 @@ static void tce_iommu_detach_group(void *iommu_data, /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", iommu_group_id(iommu_group), iommu_group); */ container->tbl = NULL; + tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); iommu_release_ownership(tbl); } mutex_unlock(&container->lock); -- cgit v1.2.3 From e432bc7e15d80288aff0bcccb2ff0f9ae572abdd Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:34:59 +1000 Subject: vfio: powerpc/spapr: Check that IOMMU page is fully contained by system page This checks that the TCE table page size is not bigger that the size of a page we just pinned and going to put its physical address to the table. Otherwise the hardware gets unwanted access to physical memory between the end of the actual page and the end of the aligned up TCE page. Since compound_order() and compound_head() work correctly on non-huge pages, there is no need for additional check whether the page is huge. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index b95fa2b64680..735b308709e5 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -47,6 +47,16 @@ struct tce_container { bool enabled; }; +static bool tce_page_is_contained(struct page *page, unsigned page_shift) +{ + /* + * Check that the TCE table granularity is not bigger than the size of + * a page we just found. Otherwise the hardware can get access to + * a bigger memory chunk that it should. + */ + return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; +} + static int tce_iommu_enable(struct tce_container *container) { int ret = 0; @@ -189,6 +199,12 @@ static long tce_iommu_build(struct tce_container *container, ret = -EFAULT; break; } + + if (!tce_page_is_contained(page, tbl->it_page_shift)) { + ret = -EPERM; + break; + } + hva = (unsigned long) page_address(page) + offset; ret = iommu_tce_build(tbl, entry + i, hva, direction); -- cgit v1.2.3 From 00663d4ee05dafe97033003fd5479cf9efd4bf96 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:00 +1000 Subject: vfio: powerpc/spapr: Use it_page_size This makes use of the it_page_size from the iommu_table struct as page size can differ. This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code as recently introduced IOMMU_PAGE_XXX macros do not include IOMMU_PAGE_SHIFT. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 735b308709e5..64300ccb05db 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -91,7 +91,7 @@ static int tce_iommu_enable(struct tce_container *container) * enforcing the limit based on the max that the guest can map. */ down_write(¤t->mm->mmap_sem); - npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; locked = current->mm->locked_vm + npages; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { @@ -120,7 +120,7 @@ static void tce_iommu_disable(struct tce_container *container) down_write(¤t->mm->mmap_sem); current->mm->locked_vm -= (container->tbl->it_size << - IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + container->tbl->it_page_shift) >> PAGE_SHIFT; up_write(¤t->mm->mmap_sem); } @@ -215,7 +215,7 @@ static long tce_iommu_build(struct tce_container *container, tce, ret); break; } - tce += IOMMU_PAGE_SIZE_4K; + tce += IOMMU_PAGE_SIZE(tbl); } if (ret) @@ -260,8 +260,8 @@ static long tce_iommu_ioctl(void *iommu_data, if (info.argsz < minsz) return -EINVAL; - info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K; - info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K; + info.dma32_window_start = tbl->it_offset << tbl->it_page_shift; + info.dma32_window_size = tbl->it_size << tbl->it_page_shift; info.flags = 0; if (copy_to_user((void __user *)arg, &info, minsz)) @@ -291,8 +291,8 @@ static long tce_iommu_ioctl(void *iommu_data, VFIO_DMA_MAP_FLAG_WRITE)) return -EINVAL; - if ((param.size & ~IOMMU_PAGE_MASK_4K) || - (param.vaddr & ~IOMMU_PAGE_MASK_4K)) + if ((param.size & ~IOMMU_PAGE_MASK(tbl)) || + (param.vaddr & ~IOMMU_PAGE_MASK(tbl))) return -EINVAL; /* iova is checked by the IOMMU API */ @@ -307,8 +307,8 @@ static long tce_iommu_ioctl(void *iommu_data, return ret; ret = tce_iommu_build(container, tbl, - param.iova >> IOMMU_PAGE_SHIFT_4K, - tce, param.size >> IOMMU_PAGE_SHIFT_4K); + param.iova >> tbl->it_page_shift, + tce, param.size >> tbl->it_page_shift); iommu_flush_tce(tbl); @@ -334,17 +334,17 @@ static long tce_iommu_ioctl(void *iommu_data, if (param.flags) return -EINVAL; - if (param.size & ~IOMMU_PAGE_MASK_4K) + if (param.size & ~IOMMU_PAGE_MASK(tbl)) return -EINVAL; ret = iommu_tce_clear_param_check(tbl, param.iova, 0, - param.size >> IOMMU_PAGE_SHIFT_4K); + param.size >> tbl->it_page_shift); if (ret) return ret; ret = tce_iommu_clear(container, tbl, - param.iova >> IOMMU_PAGE_SHIFT_4K, - param.size >> IOMMU_PAGE_SHIFT_4K); + param.iova >> tbl->it_page_shift, + param.size >> tbl->it_page_shift); iommu_flush_tce(tbl); return ret; -- cgit v1.2.3 From 2d270df8f71a0fded9ef01cb9282b46fd3bc0986 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:01 +1000 Subject: vfio: powerpc/spapr: Move locked_vm accounting to helpers There moves locked pages accounting to helpers. Later they will be reused for Dynamic DMA windows (DDW). This reworks debug messages to show the current value and the limit. This stores the locked pages number in the container so when unlocking the iommu table pointer won't be needed. This does not have an effect now but it will with the multiple tables per container as then we will allow attaching/detaching groups on fly and we may end up having a container with no group attached but with the counter incremented. While we are here, update the comment explaining why RLIMIT_MEMLOCK might be required to be bigger than the guest RAM. This also prints pid of the current process in pr_warn/pr_debug. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Reviewed-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 82 ++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 19 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 64300ccb05db..6e2e15fdb0df 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -29,6 +29,51 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); +static long try_increment_locked_vm(long npages) +{ + long ret = 0, locked, lock_limit; + + if (!current || !current->mm) + return -ESRCH; /* process exited */ + + if (!npages) + return 0; + + down_write(¤t->mm->mmap_sem); + locked = current->mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + current->mm->locked_vm += npages; + + pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid, + npages << PAGE_SHIFT, + current->mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK), + ret ? " - exceeded" : ""); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +static void decrement_locked_vm(long npages) +{ + if (!current || !current->mm || !npages) + return; /* process exited */ + + down_write(¤t->mm->mmap_sem); + if (WARN_ON_ONCE(npages > current->mm->locked_vm)) + npages = current->mm->locked_vm; + current->mm->locked_vm -= npages; + pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid, + npages << PAGE_SHIFT, + current->mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK)); + up_write(¤t->mm->mmap_sem); +} + /* * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation * @@ -45,6 +90,7 @@ struct tce_container { struct mutex lock; struct iommu_table *tbl; bool enabled; + unsigned long locked_pages; }; static bool tce_page_is_contained(struct page *page, unsigned page_shift) @@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift) static int tce_iommu_enable(struct tce_container *container) { int ret = 0; - unsigned long locked, lock_limit, npages; + unsigned long locked; struct iommu_table *tbl = container->tbl; if (!container->tbl) @@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container *container) * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, * that would effectively kill the guest at random points, much better * enforcing the limit based on the max that the guest can map. + * + * Unfortunately at the moment it counts whole tables, no matter how + * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups + * each with 2GB DMA window, 8GB will be counted here. The reason for + * this is that we cannot tell here the amount of RAM used by the guest + * as this information is only available from KVM and VFIO is + * KVM agnostic. */ - down_write(¤t->mm->mmap_sem); - npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; - locked = current->mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", - rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - } else { + locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; + ret = try_increment_locked_vm(locked); + if (ret) + return ret; - current->mm->locked_vm += npages; - container->enabled = true; - } - up_write(¤t->mm->mmap_sem); + container->locked_pages = locked; + + container->enabled = true; return ret; } @@ -115,13 +162,10 @@ static void tce_iommu_disable(struct tce_container *container) container->enabled = false; - if (!container->tbl || !current->mm) + if (!current->mm) return; - down_write(¤t->mm->mmap_sem); - current->mm->locked_vm -= (container->tbl->it_size << - container->tbl->it_page_shift) >> PAGE_SHIFT; - up_write(¤t->mm->mmap_sem); + decrement_locked_vm(container->locked_pages); } static void *tce_iommu_open(unsigned long arg) -- cgit v1.2.3 From 3c56e822f8fb0105949d04bda0e549e9d08713cd Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:02 +1000 Subject: vfio: powerpc/spapr: Disable DMA mappings on disabled container At the moment DMA map/unmap requests are handled irrespective to the container's state. This allows the user space to pin memory which it might not be allowed to pin. This adds checks to MAP/UNMAP that the container is enabled, otherwise -EPERM is returned. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 6e2e15fdb0df..5bbdf378fd59 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -318,6 +318,9 @@ static long tce_iommu_ioctl(void *iommu_data, struct iommu_table *tbl = container->tbl; unsigned long tce; + if (!container->enabled) + return -EPERM; + if (!tbl) return -ENXIO; @@ -362,6 +365,9 @@ static long tce_iommu_ioctl(void *iommu_data, struct vfio_iommu_type1_dma_unmap param; struct iommu_table *tbl = container->tbl; + if (!container->enabled) + return -EPERM; + if (WARN_ON(!tbl)) return -ENXIO; -- cgit v1.2.3 From 649354b75dca1ebcb55d8db41e1d6b59ef69ac77 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:03 +1000 Subject: vfio: powerpc/spapr: Moving pinning/unpinning to helpers This is a pretty mechanical patch to make next patches simpler. New tce_iommu_unuse_page() helper does put_page() now but it might skip that after the memory registering patch applied. As we are here, this removes unnecessary checks for a value returned by pfn_to_page() as it cannot possibly return NULL. This moves tce_iommu_disable() later to let tce_iommu_clear() know if the container has been enabled because if it has not been, then put_page() must not be called on TCEs from the TCE table. This situation is not yet possible but it will after KVM acceleration patchset is applied. This changes code to work with physical addresses rather than linear mapping addresses for better code readability. Following patches will add an xchg() callback for an IOMMU table which will accept/return physical addresses (unlike current tce_build()) which will eliminate redundant conversions. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 61 +++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 20 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 5bbdf378fd59..cf5d4a159ce9 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -191,69 +191,90 @@ static void tce_iommu_release(void *iommu_data) struct tce_container *container = iommu_data; WARN_ON(container->tbl && !container->tbl->it_group); - tce_iommu_disable(container); if (container->tbl && container->tbl->it_group) tce_iommu_detach_group(iommu_data, container->tbl->it_group); + tce_iommu_disable(container); mutex_destroy(&container->lock); kfree(container); } +static void tce_iommu_unuse_page(struct tce_container *container, + unsigned long oldtce) +{ + struct page *page; + + if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE))) + return; + + page = pfn_to_page(oldtce >> PAGE_SHIFT); + + if (oldtce & TCE_PCI_WRITE) + SetPageDirty(page); + + put_page(page); +} + static int tce_iommu_clear(struct tce_container *container, struct iommu_table *tbl, unsigned long entry, unsigned long pages) { unsigned long oldtce; - struct page *page; for ( ; pages; --pages, ++entry) { oldtce = iommu_clear_tce(tbl, entry); if (!oldtce) continue; - page = pfn_to_page(oldtce >> PAGE_SHIFT); - WARN_ON(!page); - if (page) { - if (oldtce & TCE_PCI_WRITE) - SetPageDirty(page); - put_page(page); - } + tce_iommu_unuse_page(container, oldtce); } return 0; } +static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) +{ + struct page *page = NULL; + enum dma_data_direction direction = iommu_tce_direction(tce); + + if (get_user_pages_fast(tce & PAGE_MASK, 1, + direction != DMA_TO_DEVICE, &page) != 1) + return -EFAULT; + + *hpa = __pa((unsigned long) page_address(page)); + + return 0; +} + static long tce_iommu_build(struct tce_container *container, struct iommu_table *tbl, unsigned long entry, unsigned long tce, unsigned long pages) { long i, ret = 0; - struct page *page = NULL; - unsigned long hva; + struct page *page; + unsigned long hpa; enum dma_data_direction direction = iommu_tce_direction(tce); for (i = 0; i < pages; ++i) { unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; - ret = get_user_pages_fast(tce & PAGE_MASK, 1, - direction != DMA_TO_DEVICE, &page); - if (unlikely(ret != 1)) { - ret = -EFAULT; + ret = tce_iommu_use_page(tce, &hpa); + if (ret) break; - } + page = pfn_to_page(hpa >> PAGE_SHIFT); if (!tce_page_is_contained(page, tbl->it_page_shift)) { ret = -EPERM; break; } - hva = (unsigned long) page_address(page) + offset; - - ret = iommu_tce_build(tbl, entry + i, hva, direction); + hpa |= offset; + ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa), + direction); if (ret) { - put_page(page); + tce_iommu_unuse_page(container, hpa); pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", __func__, entry << tbl->it_page_shift, tce, ret); -- cgit v1.2.3 From 22af48596e9c92313d475306b684f844301ea4cd Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:04 +1000 Subject: vfio: powerpc/spapr: Rework groups attaching This is to make extended ownership and multiple groups support patches simpler for review. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 40 ++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index cf5d4a159ce9..e65bc73cc8a8 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -460,16 +460,21 @@ static int tce_iommu_attach_group(void *iommu_data, iommu_group_id(container->tbl->it_group), iommu_group_id(iommu_group)); ret = -EBUSY; - } else if (container->enabled) { + goto unlock_exit; + } + + if (container->enabled) { pr_err("tce_vfio: attaching group #%u to enabled container\n", iommu_group_id(iommu_group)); ret = -EBUSY; - } else { - ret = iommu_take_ownership(tbl); - if (!ret) - container->tbl = tbl; + goto unlock_exit; } + ret = iommu_take_ownership(tbl); + if (!ret) + container->tbl = tbl; + +unlock_exit: mutex_unlock(&container->lock); return ret; @@ -487,19 +492,22 @@ static void tce_iommu_detach_group(void *iommu_data, pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n", iommu_group_id(iommu_group), iommu_group_id(tbl->it_group)); - } else { - if (container->enabled) { - pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", - iommu_group_id(tbl->it_group)); - tce_iommu_disable(container); - } + goto unlock_exit; + } - /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", - iommu_group_id(iommu_group), iommu_group); */ - container->tbl = NULL; - tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); - iommu_release_ownership(tbl); + if (container->enabled) { + pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", + iommu_group_id(tbl->it_group)); + tce_iommu_disable(container); } + + /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", + iommu_group_id(iommu_group), iommu_group); */ + container->tbl = NULL; + tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); + iommu_release_ownership(tbl); + +unlock_exit: mutex_unlock(&container->lock); } -- cgit v1.2.3 From b348aa65297659c310943221ac1d3f4b4491ea44 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:08 +1000 Subject: powerpc/spapr: vfio: Replace iommu_table with iommu_table_group Modern IBM POWERPC systems support multiple (currently two) TCE tables per IOMMU group (a.k.a. PE). This adds a iommu_table_group container for TCE tables. Right now just one table is supported. This defines iommu_table_group struct which stores pointers to iommu_group and iommu_table(s). This replaces iommu_table with iommu_table_group where iommu_table was used to identify a group: - iommu_register_group(); - iommudata of generic iommu_group; This removes @data from iommu_table as it_table_group provides same access to pnv_ioda_pe. For IODA, instead of embedding iommu_table, the new iommu_table_group keeps pointers to those. The iommu_table structs are allocated dynamically. For P5IOC2, both iommu_table_group and iommu_table are embedded into PE struct. As there is no EEH and SRIOV support for P5IOC2, iommu_free_table() should not be called on iommu_table struct pointers so we can keep it embedded in pnv_phb::p5ioc2. For pSeries, this replaces multiple calls of kzalloc_node() with a new iommu_pseries_alloc_group() helper and stores the table group struct pointer into the pci_dn struct. For release, a iommu_table_free_group() helper is added. This moves iommu_table struct allocation from SR-IOV code to the generic DMA initialization code in pnv_pci_ioda_setup_dma_pe and pnv_pci_ioda2_setup_dma_pe as this is where DMA is actually initialized. This change is here because those lines had to be changed anyway. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 19 ++--- arch/powerpc/include/asm/pci-bridge.h | 2 +- arch/powerpc/kernel/iommu.c | 17 ++--- arch/powerpc/platforms/powernv/pci-ioda.c | 55 +++++++------- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 18 +++-- arch/powerpc/platforms/powernv/pci.h | 3 +- arch/powerpc/platforms/pseries/iommu.c | 107 +++++++++++++++++++--------- drivers/vfio/vfio_iommu_spapr_tce.c | 23 +++--- 8 files changed, 152 insertions(+), 92 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index e2a45c37dba8..5a7267f47bdb 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -91,14 +91,9 @@ struct iommu_table { struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ -#ifdef CONFIG_IOMMU_API - struct iommu_group *it_group; -#endif + struct iommu_table_group *it_table_group; struct iommu_table_ops *it_ops; void (*set_bypass)(struct iommu_table *tbl, bool enable); -#ifdef CONFIG_PPC_POWERNV - void *data; -#endif }; /* Pure 2^n version of get_order */ @@ -129,14 +124,22 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); +#define IOMMU_TABLE_GROUP_MAX_TABLES 1 + +struct iommu_table_group { + struct iommu_group *group; + struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; +}; + #ifdef CONFIG_IOMMU_API -extern void iommu_register_group(struct iommu_table *tbl, + +extern void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); extern int __init tce_iommu_bus_notifier_init(void); #else -static inline void iommu_register_group(struct iommu_table *tbl, +static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num) { diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index b76cd5682a8c..712add590445 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -199,7 +199,7 @@ struct pci_dn { struct pci_dn *parent; struct pci_controller *phb; /* for pci devices */ - struct iommu_table *iommu_table; /* for phb's or bridges */ + struct iommu_table_group *table_group; /* for phb's or bridges */ struct device_node *node; /* back-pointer to the device_node */ int pci_ext_config_space; /* for pci devices */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c0e67e945c95..719f0485ec79 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -889,11 +889,12 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm); */ static void group_release(void *iommu_data) { - struct iommu_table *tbl = iommu_data; - tbl->it_group = NULL; + struct iommu_table_group *table_group = iommu_data; + + table_group->group = NULL; } -void iommu_register_group(struct iommu_table *tbl, +void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num) { struct iommu_group *grp; @@ -905,8 +906,8 @@ void iommu_register_group(struct iommu_table *tbl, PTR_ERR(grp)); return; } - tbl->it_group = grp; - iommu_group_set_iommudata(grp, tbl, group_release); + table_group->group = grp; + iommu_group_set_iommudata(grp, table_group, group_release); name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", pci_domain_number, pe_num); if (!name) @@ -1094,7 +1095,7 @@ int iommu_add_device(struct device *dev) } tbl = get_iommu_table_base(dev); - if (!tbl || !tbl->it_group) { + if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) { pr_debug("%s: Skipping device %s with no tbl\n", __func__, dev_name(dev)); return 0; @@ -1102,7 +1103,7 @@ int iommu_add_device(struct device *dev) pr_debug("%s: Adding %s to iommu group %d\n", __func__, dev_name(dev), - iommu_group_id(tbl->it_group)); + iommu_group_id(tbl->it_table_group->group)); if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) { pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n", @@ -1111,7 +1112,7 @@ int iommu_add_device(struct device *dev) return -EINVAL; } - return iommu_group_add_device(tbl->it_group, dev); + return iommu_group_add_device(tbl->it_table_group->group, dev); } EXPORT_SYMBOL_GPL(iommu_add_device); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index f710729a4a35..279dadf43d5a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1087,10 +1087,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) return; } - pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), - GFP_KERNEL, hose->node); - pe->tce32_table->data = pe; - /* Associate it with all child devices */ pnv_ioda_setup_same_PE(bus, pe); @@ -1292,11 +1288,12 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe struct iommu_table *tbl; unsigned long addr; int64_t rc; + struct iommu_table_group *table_group; bus = dev->bus; hose = pci_bus_to_host(bus); phb = hose->private_data; - tbl = pe->tce32_table; + tbl = pe->table_group.tables[0]; addr = tbl->it_base; opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, @@ -1311,13 +1308,14 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe if (rc) pe_warn(pe, "OPAL error %ld release DMA window\n", rc); - if (tbl->it_group) { - iommu_group_put(tbl->it_group); - BUG_ON(tbl->it_group); + table_group = tbl->it_table_group; + if (table_group->group) { + iommu_group_put(table_group->group); + BUG_ON(table_group->group); } iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); free_pages(addr, get_order(TCE32_TABLE_SIZE)); - pe->tce32_table = NULL; + pe->table_group.tables[0] = NULL; } static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) @@ -1465,10 +1463,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) continue; } - pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), - GFP_KERNEL, hose->node); - pe->tce32_table->data = pe; - /* Put PE to the list */ mutex_lock(&phb->ioda.pe_list_mutex); list_add_tail(&pe->list, &phb->ioda.pe_list); @@ -1603,7 +1597,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_iommu_table_base(&pdev->dev, pe->tce32_table); + set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); /* * Note: iommu_add_device() will fail here as * for physical PE: the device is already added by now; @@ -1637,7 +1631,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) } else { dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); set_dma_ops(&pdev->dev, &dma_iommu_ops); - set_iommu_table_base(&pdev->dev, pe->tce32_table); + set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); } *pdev->dev.dma_mask = dma_mask; return 0; @@ -1671,7 +1665,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { - set_iommu_table_base(&dev->dev, pe->tce32_table); + set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); iommu_add_device(&dev->dev); if (dev->subordinate) @@ -1682,7 +1676,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { - struct pnv_ioda_pe *pe = tbl->data; + struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct pnv_ioda_pe, table_group); __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->tce_inval_reg_phys : (__be64 __iomem *)tbl->it_index; @@ -1759,7 +1754,8 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { - struct pnv_ioda_pe *pe = tbl->data; + struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct pnv_ioda_pe, table_group); unsigned long start, end, inc; __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->tce_inval_reg_phys : @@ -1835,8 +1831,12 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; - tbl = pe->tce32_table; - iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + phb->hose->node); + tbl->it_table_group = &pe->table_group; + pe->table_group.tables[0] = tbl; + iommu_register_group(&pe->table_group, phb->hose->global_number, + pe->pe_number); /* Grab a 32-bit TCE table */ pe->tce32_seg = base; @@ -1915,7 +1915,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) { - struct pnv_ioda_pe *pe = tbl->data; + struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct pnv_ioda_pe, table_group); uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1949,10 +1950,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pe->tce_bypass_base = 1ull << 59; /* Install set_bypass callback for VFIO */ - pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass; + pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass; /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(pe->tce32_table, true); + pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true); } static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, @@ -1969,8 +1970,12 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; - tbl = pe->tce32_table; - iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + phb->hose->node); + tbl->it_table_group = &pe->table_group; + pe->table_group.tables[0] = tbl; + iommu_register_group(&pe->table_group, phb->hose->global_number, + pe->pe_number); /* The PE will reserve all possible 32-bits space */ pe->tce32_seg = 0; diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 5a387a9132ff..6e00104093d6 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -92,14 +92,16 @@ static struct iommu_table_ops pnv_p5ioc2_iommu_ops = { static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) { - if (phb->p5ioc2.iommu_table.it_map == NULL) { - phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops; - iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node); - iommu_register_group(&phb->p5ioc2.iommu_table, + struct iommu_table *tbl = phb->p5ioc2.table_group.tables[0]; + + if (!tbl->it_map) { + tbl->it_ops = &pnv_p5ioc2_iommu_ops; + iommu_init_table(tbl, phb->hose->node); + iommu_register_group(&phb->p5ioc2.table_group, pci_domain_nr(phb->hose->bus), phb->opal_id); } - set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table); + set_iommu_table_base(&pdev->dev, tbl); iommu_add_device(&pdev->dev); } @@ -188,6 +190,12 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table, tce_mem, tce_size, 0, IOMMU_PAGE_SHIFT_4K); + /* + * We do not allocate iommu_table as we do not support + * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table() + * should not be called for phb->p5ioc2.table_group.tables[0] ever. + */ + phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table; } void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 45a756007ec3..d8ba34d7af1b 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -57,7 +57,7 @@ struct pnv_ioda_pe { /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ int tce32_seg; int tce32_segcount; - struct iommu_table *tce32_table; + struct iommu_table_group table_group; phys_addr_t tce_inval_reg_phys; /* 64-bit TCE bypass region */ @@ -120,6 +120,7 @@ struct pnv_phb { union { struct { struct iommu_table iommu_table; + struct iommu_table_group table_group; } p5ioc2; struct { diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 33f3a855bfd9..307d704ffaa3 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -52,16 +52,51 @@ #include "pseries.h" -static void iommu_pseries_free_table(struct iommu_table *tbl, +static struct iommu_table_group *iommu_pseries_alloc_group(int node) +{ + struct iommu_table_group *table_group = NULL; + struct iommu_table *tbl = NULL; + + table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, + node); + if (!table_group) + goto fail_exit; + + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); + if (!tbl) + goto fail_exit; + + tbl->it_table_group = table_group; + table_group->tables[0] = tbl; + + return table_group; + +fail_exit: + kfree(table_group); + kfree(tbl); + + return NULL; +} + +static void iommu_pseries_free_group(struct iommu_table_group *table_group, const char *node_name) { + struct iommu_table *tbl; + + if (!table_group) + return; + #ifdef CONFIG_IOMMU_API - if (tbl->it_group) { - iommu_group_put(tbl->it_group); - BUG_ON(tbl->it_group); + if (table_group->group) { + iommu_group_put(table_group->group); + BUG_ON(table_group->group); } #endif + + tbl = table_group->tables[0]; iommu_free_table(tbl, node_name); + + kfree(table_group); } static void tce_invalidate_pSeries_sw(struct iommu_table *tbl, @@ -631,13 +666,13 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) pci->phb->dma_window_size = 0x8000000ul; pci->phb->dma_window_base_cur = 0x8000000ul; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - pci->phb->node); + pci->table_group = iommu_pseries_alloc_group(pci->phb->node); + tbl = pci->table_group->tables[0]; iommu_table_setparms(pci->phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - pci->iommu_table = iommu_init_table(tbl, pci->phb->node); - iommu_register_group(tbl, pci_domain_nr(bus), 0); + iommu_init_table(tbl, pci->phb->node); + iommu_register_group(pci->table_group, pci_domain_nr(bus), 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -680,16 +715,17 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) ppci = PCI_DN(pdn); pr_debug(" parent is %s, iommu_table: 0x%p\n", - pdn->full_name, ppci->iommu_table); + pdn->full_name, ppci->table_group); - if (!ppci->iommu_table) { - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - ppci->phb->node); + if (!ppci->table_group) { + ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); + tbl = ppci->table_group->tables[0]; iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); - iommu_register_group(tbl, pci_domain_nr(bus), 0); - pr_debug(" created table: %p\n", ppci->iommu_table); + iommu_init_table(tbl, ppci->phb->node); + iommu_register_group(ppci->table_group, + pci_domain_nr(bus), 0); + pr_debug(" created table: %p\n", ppci->table_group); } } @@ -711,12 +747,13 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) struct pci_controller *phb = PCI_DN(dn)->phb; pr_debug(" --> first child, no bridge. Allocating iommu table.\n"); - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - phb->node); + PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node); + tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); - iommu_register_group(tbl, pci_domain_nr(phb->bus), 0); + iommu_init_table(tbl, phb->node); + iommu_register_group(PCI_DN(dn)->table_group, + pci_domain_nr(phb->bus), 0); set_iommu_table_base(&dev->dev, tbl); iommu_add_device(&dev->dev); return; @@ -726,11 +763,12 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) * an already allocated iommu table is found and use that. */ - while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL) + while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL) dn = dn->parent; if (dn && PCI_DN(dn)) { - set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); + set_iommu_table_base(&dev->dev, + PCI_DN(dn)->table_group->tables[0]); iommu_add_device(&dev->dev); } else printk(KERN_WARNING "iommu: Device %s has no iommu table\n", @@ -1117,7 +1155,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) dn = pci_device_to_OF_node(dev); pr_debug(" node is %s\n", dn->full_name); - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; pdn = pdn->parent) { dma_window = of_get_property(pdn, "ibm,dma-window", NULL); if (dma_window) @@ -1133,19 +1171,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pr_debug(" parent is %s\n", pdn->full_name); pci = PCI_DN(pdn); - if (!pci->iommu_table) { - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - pci->phb->node); + if (!pci->table_group) { + pci->table_group = iommu_pseries_alloc_group(pci->phb->node); + tbl = pci->table_group->tables[0]; iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - pci->iommu_table = iommu_init_table(tbl, pci->phb->node); - iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0); - pr_debug(" created table: %p\n", pci->iommu_table); + iommu_init_table(tbl, pci->phb->node); + iommu_register_group(pci->table_group, + pci_domain_nr(pci->phb->bus), 0); + pr_debug(" created table: %p\n", pci->table_group); } else { - pr_debug(" found DMA window, table: %p\n", pci->iommu_table); + pr_debug(" found DMA window, table: %p\n", pci->table_group); } - set_iommu_table_base(&dev->dev, pci->iommu_table); + set_iommu_table_base(&dev->dev, pci->table_group->tables[0]); iommu_add_device(&dev->dev); } @@ -1176,7 +1215,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) * search upwards in the tree until we either hit a dma-window * property, OR find a parent with a table already allocated. */ - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; pdn = pdn->parent) { dma_window = of_get_property(pdn, "ibm,dma-window", NULL); if (dma_window) @@ -1220,7 +1259,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev) dn = pci_device_to_OF_node(pdev); /* search upwards for ibm,dma-window */ - for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table; + for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group; dn = dn->parent) if (of_get_property(dn, "ibm,dma-window", NULL)) break; @@ -1300,8 +1339,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti * the device node. */ remove_ddw(np, false); - if (pci && pci->iommu_table) - iommu_pseries_free_table(pci->iommu_table, + if (pci && pci->table_group) + iommu_pseries_free_group(pci->table_group, np->full_name); spin_lock(&direct_window_list_lock); diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index e65bc73cc8a8..c4bc345d64d7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -190,10 +190,11 @@ static void tce_iommu_release(void *iommu_data) { struct tce_container *container = iommu_data; - WARN_ON(container->tbl && !container->tbl->it_group); + WARN_ON(container->tbl && !container->tbl->it_table_group->group); - if (container->tbl && container->tbl->it_group) - tce_iommu_detach_group(iommu_data, container->tbl->it_group); + if (container->tbl && container->tbl->it_table_group->group) + tce_iommu_detach_group(iommu_data, + container->tbl->it_table_group->group); tce_iommu_disable(container); mutex_destroy(&container->lock); @@ -345,7 +346,7 @@ static long tce_iommu_ioctl(void *iommu_data, if (!tbl) return -ENXIO; - BUG_ON(!tbl->it_group); + BUG_ON(!tbl->it_table_group->group); minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); @@ -433,11 +434,12 @@ static long tce_iommu_ioctl(void *iommu_data, mutex_unlock(&container->lock); return 0; case VFIO_EEH_PE_OP: - if (!container->tbl || !container->tbl->it_group) + if (!container->tbl || !container->tbl->it_table_group->group) return -ENODEV; - return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group, - cmd, arg); + return vfio_spapr_iommu_eeh_ioctl( + container->tbl->it_table_group->group, + cmd, arg); } return -ENOTTY; @@ -457,7 +459,8 @@ static int tce_iommu_attach_group(void *iommu_data, iommu_group_id(iommu_group), iommu_group); */ if (container->tbl) { pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n", - iommu_group_id(container->tbl->it_group), + iommu_group_id(container->tbl-> + it_table_group->group), iommu_group_id(iommu_group)); ret = -EBUSY; goto unlock_exit; @@ -491,13 +494,13 @@ static void tce_iommu_detach_group(void *iommu_data, if (tbl != container->tbl) { pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n", iommu_group_id(iommu_group), - iommu_group_id(tbl->it_group)); + iommu_group_id(tbl->it_table_group->group)); goto unlock_exit; } if (container->enabled) { pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", - iommu_group_id(tbl->it_group)); + iommu_group_id(tbl->it_table_group->group)); tce_iommu_disable(container); } -- cgit v1.2.3 From 0eaf4defc7c44ed5dd33a03cab12a5f88c9b4b86 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:09 +1000 Subject: powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group So far one TCE table could only be used by one IOMMU group. However IODA2 hardware allows programming the same TCE table address to multiple PE allowing sharing tables. This replaces a single pointer to a group in a iommu_table struct with a linked list of groups which provides the way of invalidating TCE cache for every PE when an actual TCE table is updated. This adds pnv_pci_link_table_and_group() and pnv_pci_unlink_table_and_group() helpers to manage the list. However without VFIO, it is still going to be a single IOMMU group per iommu_table. This changes iommu_add_device() to add a device to a first group from the group list of a table as it is only called from the platform init code or PCI bus notifier and at these moments there is only one group per table. This does not change TCE invalidation code to loop through all attached groups in order to simplify this patch and because it is not really needed in most cases. IODA2 is fixed in a later patch. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 8 +- arch/powerpc/kernel/iommu.c | 14 +++- arch/powerpc/platforms/powernv/pci-ioda.c | 45 ++++++---- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 + arch/powerpc/platforms/powernv/pci.c | 75 +++++++++++++++++ arch/powerpc/platforms/powernv/pci.h | 7 ++ arch/powerpc/platforms/pseries/iommu.c | 27 +++++- drivers/vfio/vfio_iommu_spapr_tce.c | 122 ++++++++++++++++++++-------- 8 files changed, 241 insertions(+), 60 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 5a7267f47bdb..44a20ccf06b4 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -91,7 +91,7 @@ struct iommu_table { struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ - struct iommu_table_group *it_table_group; + struct list_head it_group_list;/* List of iommu_table_group_link */ struct iommu_table_ops *it_ops; void (*set_bypass)(struct iommu_table *tbl, bool enable); }; @@ -126,6 +126,12 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); #define IOMMU_TABLE_GROUP_MAX_TABLES 1 +struct iommu_table_group_link { + struct list_head next; + struct rcu_head rcu; + struct iommu_table_group *table_group; +}; + struct iommu_table_group { struct iommu_group *group; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 719f0485ec79..be258b2ecb10 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1078,6 +1078,7 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership); int iommu_add_device(struct device *dev) { struct iommu_table *tbl; + struct iommu_table_group_link *tgl; /* * The sysfs entries should be populated before @@ -1095,15 +1096,22 @@ int iommu_add_device(struct device *dev) } tbl = get_iommu_table_base(dev); - if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) { + if (!tbl) { pr_debug("%s: Skipping device %s with no tbl\n", __func__, dev_name(dev)); return 0; } + tgl = list_first_entry_or_null(&tbl->it_group_list, + struct iommu_table_group_link, next); + if (!tgl) { + pr_debug("%s: Skipping device %s with no group\n", + __func__, dev_name(dev)); + return 0; + } pr_debug("%s: Adding %s to iommu group %d\n", __func__, dev_name(dev), - iommu_group_id(tbl->it_table_group->group)); + iommu_group_id(tgl->table_group->group)); if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) { pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n", @@ -1112,7 +1120,7 @@ int iommu_add_device(struct device *dev) return -EINVAL; } - return iommu_group_add_device(tbl->it_table_group->group, dev); + return iommu_group_add_device(tgl->table_group->group, dev); } EXPORT_SYMBOL_GPL(iommu_add_device); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 279dadf43d5a..3b4130697b05 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1288,7 +1288,6 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe struct iommu_table *tbl; unsigned long addr; int64_t rc; - struct iommu_table_group *table_group; bus = dev->bus; hose = pci_bus_to_host(bus); @@ -1308,14 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe if (rc) pe_warn(pe, "OPAL error %ld release DMA window\n", rc); - table_group = tbl->it_table_group; - if (table_group->group) { - iommu_group_put(table_group->group); - BUG_ON(table_group->group); + pnv_pci_unlink_table_and_group(tbl, &pe->table_group); + if (pe->table_group.group) { + iommu_group_put(pe->table_group.group); + BUG_ON(pe->table_group.group); } iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); free_pages(addr, get_order(TCE32_TABLE_SIZE)); - pe->table_group.tables[0] = NULL; } static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) @@ -1676,7 +1674,10 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { - struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct iommu_table_group_link *tgl = list_first_entry_or_null( + &tbl->it_group_list, struct iommu_table_group_link, + next); + struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->tce_inval_reg_phys : @@ -1754,7 +1755,10 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { - struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct iommu_table_group_link *tgl = list_first_entry_or_null( + &tbl->it_group_list, struct iommu_table_group_link, + next); + struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); unsigned long start, end, inc; __be64 __iomem *invalidate = rm ? @@ -1831,12 +1835,10 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - phb->hose->node); - tbl->it_table_group = &pe->table_group; - pe->table_group.tables[0] = tbl; + tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); + pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); /* Grab a 32-bit TCE table */ pe->tce32_seg = base; @@ -1911,11 +1913,18 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, pe->tce32_seg = -1; if (tce_mem) __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); + if (tbl) { + pnv_pci_unlink_table_and_group(tbl, &pe->table_group); + iommu_free_table(tbl, "pnv"); + } } static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) { - struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct iommu_table_group_link *tgl = list_first_entry_or_null( + &tbl->it_group_list, struct iommu_table_group_link, + next); + struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1970,12 +1979,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - phb->hose->node); - tbl->it_table_group = &pe->table_group; - pe->table_group.tables[0] = tbl; + tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); + pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); /* The PE will reserve all possible 32-bits space */ pe->tce32_seg = 0; @@ -2048,6 +2055,10 @@ fail: pe->tce32_seg = -1; if (tce_mem) __free_pages(tce_mem, get_order(tce_table_size)); + if (tbl) { + pnv_pci_unlink_table_and_group(tbl, &pe->table_group); + iommu_free_table(tbl, "pnv"); + } } static void pnv_ioda_setup_dma(struct pnv_phb *phb) diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 6e00104093d6..f80e5a1d6117 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -99,6 +99,9 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, iommu_init_table(tbl, phb->hose->node); iommu_register_group(&phb->p5ioc2.table_group, pci_domain_nr(phb->hose->bus), phb->opal_id); + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + pnv_pci_link_table_and_group(phb->hose->node, 0, + tbl, &phb->p5ioc2.table_group); } set_iommu_table_base(&pdev->dev, tbl); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index f72fc6e7d63d..00f1abd34379 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -606,6 +606,81 @@ unsigned long pnv_tce_get(struct iommu_table *tbl, long index) return ((u64 *)tbl->it_base)[index - tbl->it_offset]; } +struct iommu_table *pnv_pci_table_alloc(int nid) +{ + struct iommu_table *tbl; + + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid); + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + + return tbl; +} + +long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group) +{ + struct iommu_table_group_link *tgl = NULL; + + if (WARN_ON(!tbl || !table_group)) + return -EINVAL; + + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, + node); + if (!tgl) + return -ENOMEM; + + tgl->table_group = table_group; + list_add_rcu(&tgl->next, &tbl->it_group_list); + + table_group->tables[num] = tbl; + + return 0; +} + +static void pnv_iommu_table_group_link_free(struct rcu_head *head) +{ + struct iommu_table_group_link *tgl = container_of(head, + struct iommu_table_group_link, rcu); + + kfree(tgl); +} + +void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group) +{ + long i; + bool found; + struct iommu_table_group_link *tgl; + + if (!tbl || !table_group) + return; + + /* Remove link to a group from table's list of attached groups */ + found = false; + list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + if (tgl->table_group == table_group) { + list_del_rcu(&tgl->next); + call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free); + found = true; + break; + } + } + if (WARN_ON(!found)) + return; + + /* Clean a pointer to iommu_table in iommu_table_group::tables[] */ + found = false; + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (table_group->tables[i] == tbl) { + table_group->tables[i] = NULL; + found = true; + break; + } + } + WARN_ON(!found); +} + void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned page_shift) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index d8ba34d7af1b..d37cb4da409f 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -210,6 +210,13 @@ int pnv_pci_cfg_read(struct pci_dn *pdn, int where, int size, u32 *val); int pnv_pci_cfg_write(struct pci_dn *pdn, int where, int size, u32 val); +extern struct iommu_table *pnv_pci_table_alloc(int nid); + +extern long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group); extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned page_shift); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 307d704ffaa3..10510dea16b3 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) { struct iommu_table_group *table_group = NULL; struct iommu_table *tbl = NULL; + struct iommu_table_group_link *tgl = NULL; table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, node); @@ -66,12 +68,21 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) if (!tbl) goto fail_exit; - tbl->it_table_group = table_group; + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, + node); + if (!tgl) + goto fail_exit; + + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + tgl->table_group = table_group; + list_add_rcu(&tgl->next, &tbl->it_group_list); + table_group->tables[0] = tbl; return table_group; fail_exit: + kfree(tgl); kfree(table_group); kfree(tbl); @@ -82,18 +93,28 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group, const char *node_name) { struct iommu_table *tbl; +#ifdef CONFIG_IOMMU_API + struct iommu_table_group_link *tgl; +#endif if (!table_group) return; + tbl = table_group->tables[0]; #ifdef CONFIG_IOMMU_API + tgl = list_first_entry_or_null(&tbl->it_group_list, + struct iommu_table_group_link, next); + + WARN_ON_ONCE(!tgl); + if (tgl) { + list_del_rcu(&tgl->next); + kfree(tgl); + } if (table_group->group) { iommu_group_put(table_group->group); BUG_ON(table_group->group); } #endif - - tbl = table_group->tables[0]; iommu_free_table(tbl, node_name); kfree(table_group); diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index c4bc345d64d7..ffc634a75dba 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -88,7 +88,7 @@ static void decrement_locked_vm(long npages) */ struct tce_container { struct mutex lock; - struct iommu_table *tbl; + struct iommu_group *grp; bool enabled; unsigned long locked_pages; }; @@ -103,13 +103,42 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift) return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; } +static long tce_iommu_find_table(struct tce_container *container, + phys_addr_t ioba, struct iommu_table **ptbl) +{ + long i; + struct iommu_table_group *table_group; + + table_group = iommu_group_get_iommudata(container->grp); + if (!table_group) + return -1; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = table_group->tables[i]; + + if (tbl) { + unsigned long entry = ioba >> tbl->it_page_shift; + unsigned long start = tbl->it_offset; + unsigned long end = start + tbl->it_size; + + if ((start <= entry) && (entry < end)) { + *ptbl = tbl; + return i; + } + } + } + + return -1; +} + static int tce_iommu_enable(struct tce_container *container) { int ret = 0; unsigned long locked; - struct iommu_table *tbl = container->tbl; + struct iommu_table *tbl; + struct iommu_table_group *table_group; - if (!container->tbl) + if (!container->grp) return -ENXIO; if (!current->mm) @@ -143,6 +172,11 @@ static int tce_iommu_enable(struct tce_container *container) * as this information is only available from KVM and VFIO is * KVM agnostic. */ + table_group = iommu_group_get_iommudata(container->grp); + if (!table_group) + return -ENODEV; + + tbl = table_group->tables[0]; locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; ret = try_increment_locked_vm(locked); if (ret) @@ -190,11 +224,10 @@ static void tce_iommu_release(void *iommu_data) { struct tce_container *container = iommu_data; - WARN_ON(container->tbl && !container->tbl->it_table_group->group); + WARN_ON(container->grp); - if (container->tbl && container->tbl->it_table_group->group) - tce_iommu_detach_group(iommu_data, - container->tbl->it_table_group->group); + if (container->grp) + tce_iommu_detach_group(iommu_data, container->grp); tce_iommu_disable(container); mutex_destroy(&container->lock); @@ -312,9 +345,16 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { struct vfio_iommu_spapr_tce_info info; - struct iommu_table *tbl = container->tbl; + struct iommu_table *tbl; + struct iommu_table_group *table_group; + + if (WARN_ON(!container->grp)) + return -ENXIO; + + table_group = iommu_group_get_iommudata(container->grp); - if (WARN_ON(!tbl)) + tbl = table_group->tables[0]; + if (WARN_ON_ONCE(!tbl)) return -ENXIO; minsz = offsetofend(struct vfio_iommu_spapr_tce_info, @@ -337,17 +377,13 @@ static long tce_iommu_ioctl(void *iommu_data, } case VFIO_IOMMU_MAP_DMA: { struct vfio_iommu_type1_dma_map param; - struct iommu_table *tbl = container->tbl; + struct iommu_table *tbl = NULL; unsigned long tce; + long num; if (!container->enabled) return -EPERM; - if (!tbl) - return -ENXIO; - - BUG_ON(!tbl->it_table_group->group); - minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); if (copy_from_user(¶m, (void __user *)arg, minsz)) @@ -360,6 +396,10 @@ static long tce_iommu_ioctl(void *iommu_data, VFIO_DMA_MAP_FLAG_WRITE)) return -EINVAL; + num = tce_iommu_find_table(container, param.iova, &tbl); + if (num < 0) + return -ENXIO; + if ((param.size & ~IOMMU_PAGE_MASK(tbl)) || (param.vaddr & ~IOMMU_PAGE_MASK(tbl))) return -EINVAL; @@ -385,14 +425,12 @@ static long tce_iommu_ioctl(void *iommu_data, } case VFIO_IOMMU_UNMAP_DMA: { struct vfio_iommu_type1_dma_unmap param; - struct iommu_table *tbl = container->tbl; + struct iommu_table *tbl = NULL; + long num; if (!container->enabled) return -EPERM; - if (WARN_ON(!tbl)) - return -ENXIO; - minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); @@ -406,6 +444,10 @@ static long tce_iommu_ioctl(void *iommu_data, if (param.flags) return -EINVAL; + num = tce_iommu_find_table(container, param.iova, &tbl); + if (num < 0) + return -ENXIO; + if (param.size & ~IOMMU_PAGE_MASK(tbl)) return -EINVAL; @@ -434,12 +476,11 @@ static long tce_iommu_ioctl(void *iommu_data, mutex_unlock(&container->lock); return 0; case VFIO_EEH_PE_OP: - if (!container->tbl || !container->tbl->it_table_group->group) + if (!container->grp) return -ENODEV; - return vfio_spapr_iommu_eeh_ioctl( - container->tbl->it_table_group->group, - cmd, arg); + return vfio_spapr_iommu_eeh_ioctl(container->grp, + cmd, arg); } return -ENOTTY; @@ -450,17 +491,15 @@ static int tce_iommu_attach_group(void *iommu_data, { int ret; struct tce_container *container = iommu_data; - struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + struct iommu_table_group *table_group; - BUG_ON(!tbl); mutex_lock(&container->lock); /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", iommu_group_id(iommu_group), iommu_group); */ - if (container->tbl) { + if (container->grp) { pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n", - iommu_group_id(container->tbl-> - it_table_group->group), + iommu_group_id(container->grp), iommu_group_id(iommu_group)); ret = -EBUSY; goto unlock_exit; @@ -473,9 +512,15 @@ static int tce_iommu_attach_group(void *iommu_data, goto unlock_exit; } - ret = iommu_take_ownership(tbl); + table_group = iommu_group_get_iommudata(iommu_group); + if (!table_group) { + ret = -ENXIO; + goto unlock_exit; + } + + ret = iommu_take_ownership(table_group->tables[0]); if (!ret) - container->tbl = tbl; + container->grp = iommu_group; unlock_exit: mutex_unlock(&container->lock); @@ -487,26 +532,31 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group) { struct tce_container *container = iommu_data; - struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + struct iommu_table_group *table_group; + struct iommu_table *tbl; - BUG_ON(!tbl); mutex_lock(&container->lock); - if (tbl != container->tbl) { + if (iommu_group != container->grp) { pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n", iommu_group_id(iommu_group), - iommu_group_id(tbl->it_table_group->group)); + iommu_group_id(container->grp)); goto unlock_exit; } if (container->enabled) { pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", - iommu_group_id(tbl->it_table_group->group)); + iommu_group_id(container->grp)); tce_iommu_disable(container); } /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", iommu_group_id(iommu_group), iommu_group); */ - container->tbl = NULL; + container->grp = NULL; + + table_group = iommu_group_get_iommudata(iommu_group); + BUG_ON(!table_group); + + tbl = table_group->tables[0]; tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); iommu_release_ownership(tbl); -- cgit v1.2.3 From f87a88642e660edd8912ad39fe77848c6f9927a2 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:10 +1000 Subject: vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control This adds tce_iommu_take_ownership() and tce_iommu_release_ownership which call in a loop iommu_take_ownership()/iommu_release_ownership() for every table on the group. As there is just one now, no change in behaviour is expected. At the moment the iommu_table struct has a set_bypass() which enables/ disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code which calls this callback when external IOMMU users such as VFIO are about to get over a PHB. The set_bypass() callback is not really an iommu_table function but IOMMU/PE function. This introduces a iommu_table_group_ops struct and adds take_ownership()/release_ownership() callbacks to it which are called when an external user takes/releases control over the IOMMU. This replaces set_bypass() with ownership callbacks as it is not necessarily just bypass enabling, it can be something else/more so let's give it more generic name. The callbacks is implemented for IODA2 only. Other platforms (P5IOC2, IODA1) will use the old iommu_take_ownership/iommu_release_ownership API. The following patches will replace iommu_take_ownership/ iommu_release_ownership calls in IODA2 with full IOMMU table release/ create. As we here and touching bypass control, this removes pnv_pci_ioda2_setup_bypass_pe() as it does not do much more compared to pnv_pci_ioda2_set_bypass. This moves tce_bypass_base initialization to pnv_pci_ioda2_setup_dma_pe. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 11 ++++- arch/powerpc/kernel/iommu.c | 12 ------ arch/powerpc/platforms/powernv/pci-ioda.c | 43 ++++++++++++------- drivers/vfio/vfio_iommu_spapr_tce.c | 70 ++++++++++++++++++++++++++++--- 4 files changed, 103 insertions(+), 33 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 44a20ccf06b4..489133cf7c5e 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -93,7 +93,6 @@ struct iommu_table { unsigned long it_page_shift;/* table iommu page size */ struct list_head it_group_list;/* List of iommu_table_group_link */ struct iommu_table_ops *it_ops; - void (*set_bypass)(struct iommu_table *tbl, bool enable); }; /* Pure 2^n version of get_order */ @@ -126,6 +125,15 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); #define IOMMU_TABLE_GROUP_MAX_TABLES 1 +struct iommu_table_group; + +struct iommu_table_group_ops { + /* Switch ownership from platform code to external user (e.g. VFIO) */ + void (*take_ownership)(struct iommu_table_group *table_group); + /* Switch ownership from external user (e.g. VFIO) back to core */ + void (*release_ownership)(struct iommu_table_group *table_group); +}; + struct iommu_table_group_link { struct list_head next; struct rcu_head rcu; @@ -135,6 +143,7 @@ struct iommu_table_group_link { struct iommu_table_group { struct iommu_group *group; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; + struct iommu_table_group_ops *ops; }; #ifdef CONFIG_IOMMU_API diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index be258b2ecb10..e7f81b7399ba 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1047,14 +1047,6 @@ int iommu_take_ownership(struct iommu_table *tbl) memset(tbl->it_map, 0xff, sz); - /* - * Disable iommu bypass, otherwise the user can DMA to all of - * our physical memory via the bypass window instead of just - * the pages that has been explicitly mapped into the iommu - */ - if (tbl->set_bypass) - tbl->set_bypass(tbl, false); - return 0; } EXPORT_SYMBOL_GPL(iommu_take_ownership); @@ -1068,10 +1060,6 @@ void iommu_release_ownership(struct iommu_table *tbl) /* Restore bit#0 set by iommu_init_table() */ if (tbl->it_offset == 0) set_bit(0, tbl->it_map); - - /* The kernel owns the device now, we can restore the iommu bypass */ - if (tbl->set_bypass) - tbl->set_bypass(tbl, true); } EXPORT_SYMBOL_GPL(iommu_release_ownership); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 3b4130697b05..17a77522ea0d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1919,13 +1919,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } } -static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { - struct iommu_table_group_link *tgl = list_first_entry_or_null( - &tbl->it_group_list, struct iommu_table_group_link, - next); - struct pnv_ioda_pe *pe = container_of(tgl->table_group, - struct pnv_ioda_pe, table_group); uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1952,19 +1947,31 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) pe->tce_bypass_enabled = enable; } -static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) +#ifdef CONFIG_IOMMU_API +static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) { - /* TVE #1 is selected by PCI address bit 59 */ - pe->tce_bypass_base = 1ull << 59; + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); - /* Install set_bypass callback for VFIO */ - pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass; + iommu_take_ownership(table_group->tables[0]); + pnv_pci_ioda2_set_bypass(pe, false); +} - /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true); +static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + + iommu_release_ownership(table_group->tables[0]); + pnv_pci_ioda2_set_bypass(pe, true); } +static struct iommu_table_group_ops pnv_pci_ioda2_ops = { + .take_ownership = pnv_ioda2_take_ownership, + .release_ownership = pnv_ioda2_release_ownership, +}; +#endif + static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { @@ -1979,6 +1986,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; + /* TVE #1 is selected by PCI address bit 59 */ + pe->tce_bypass_base = 1ull << 59; + tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); @@ -2033,6 +2043,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, } tbl->it_ops = &pnv_ioda2_iommu_ops; iommu_init_table(tbl, phb->hose->node); +#ifdef CONFIG_IOMMU_API + pe->table_group.ops = &pnv_pci_ioda2_ops; +#endif if (pe->flags & PNV_IODA_PE_DEV) { /* @@ -2047,7 +2060,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* Also create a bypass window */ if (!pnv_iommu_bypass_disabled) - pnv_pci_ioda2_setup_bypass_pe(phb, pe); + pnv_pci_ioda2_set_bypass(pe, true); return; fail: diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index ffc634a75dba..9c720de46c33 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -486,6 +486,61 @@ static long tce_iommu_ioctl(void *iommu_data, return -ENOTTY; } +static void tce_iommu_release_ownership(struct tce_container *container, + struct iommu_table_group *table_group) +{ + int i; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = table_group->tables[i]; + + if (!tbl) + continue; + + tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); + if (tbl->it_map) + iommu_release_ownership(tbl); + } +} + +static int tce_iommu_take_ownership(struct tce_container *container, + struct iommu_table_group *table_group) +{ + int i, j, rc = 0; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = table_group->tables[i]; + + if (!tbl || !tbl->it_map) + continue; + + rc = iommu_take_ownership(tbl); + if (rc) { + for (j = 0; j < i; ++j) + iommu_release_ownership( + table_group->tables[j]); + + return rc; + } + } + + return 0; +} + +static void tce_iommu_release_ownership_ddw(struct tce_container *container, + struct iommu_table_group *table_group) +{ + table_group->ops->release_ownership(table_group); +} + +static long tce_iommu_take_ownership_ddw(struct tce_container *container, + struct iommu_table_group *table_group) +{ + table_group->ops->take_ownership(table_group); + + return 0; +} + static int tce_iommu_attach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -518,7 +573,12 @@ static int tce_iommu_attach_group(void *iommu_data, goto unlock_exit; } - ret = iommu_take_ownership(table_group->tables[0]); + if (!table_group->ops || !table_group->ops->take_ownership || + !table_group->ops->release_ownership) + ret = tce_iommu_take_ownership(container, table_group); + else + ret = tce_iommu_take_ownership_ddw(container, table_group); + if (!ret) container->grp = iommu_group; @@ -533,7 +593,6 @@ static void tce_iommu_detach_group(void *iommu_data, { struct tce_container *container = iommu_data; struct iommu_table_group *table_group; - struct iommu_table *tbl; mutex_lock(&container->lock); if (iommu_group != container->grp) { @@ -556,9 +615,10 @@ static void tce_iommu_detach_group(void *iommu_data, table_group = iommu_group_get_iommudata(iommu_group); BUG_ON(!table_group); - tbl = table_group->tables[0]; - tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); - iommu_release_ownership(tbl); + if (!table_group->ops || !table_group->ops->release_ownership) + tce_iommu_release_ownership(container, table_group); + else + tce_iommu_release_ownership_ddw(container, table_group); unlock_exit: mutex_unlock(&container->lock); -- cgit v1.2.3 From 05c6cfb9dce0d13d37e9d007ee6a4af36f1c0a58 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:15 +1000 Subject: powerpc/iommu/powernv: Release replaced TCE At the moment writing new TCE value to the IOMMU table fails with EBUSY if there is a valid entry already. However PAPR specification allows the guest to write new TCE value without clearing it first. Another problem this patch is addressing is the use of pool locks for external IOMMU users such as VFIO. The pool locks are to protect DMA page allocator rather than entries and since the host kernel does not control what pages are in use, there is no point in pool locks and exchange()+put_page(oldtce) is sufficient to avoid possible races. This adds an exchange() callback to iommu_table_ops which does the same thing as set() plus it returns replaced TCE and DMA direction so the caller can release the pages afterwards. The exchange() receives a physical address unlike set() which receives linear mapping address; and returns a physical address as the clear() does. This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement for a platform to have exchange() implemented in order to support VFIO. This replaces iommu_tce_build() and iommu_clear_tce() with a single iommu_tce_xchg(). This makes sure that TCE permission bits are not set in TCE passed to IOMMU API as those are to be calculated by platform code from DMA direction. This moves SetPageDirty() to the IOMMU code to make it work for both VFIO ioctl interface in in-kernel TCE acceleration (when it becomes available later). Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 22 ++++++++-- arch/powerpc/kernel/iommu.c | 59 +++++++++------------------ arch/powerpc/platforms/powernv/pci-ioda.c | 34 ++++++++++++++++ arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 ++ arch/powerpc/platforms/powernv/pci.c | 18 +++++++++ arch/powerpc/platforms/powernv/pci.h | 2 + drivers/vfio/vfio_iommu_spapr_tce.c | 63 +++++++++++++++++------------ 7 files changed, 132 insertions(+), 69 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 489133cf7c5e..4636734604d7 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -45,13 +45,29 @@ extern int iommu_is_off; extern int iommu_force_on; struct iommu_table_ops { + /* + * When called with direction==DMA_NONE, it is equal to clear(). + * uaddr is a linear map address. + */ int (*set)(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs); +#ifdef CONFIG_IOMMU_API + /* + * Exchanges existing TCE with new TCE plus direction bits; + * returns old TCE and DMA direction mask. + * @tce is a physical address. + */ + int (*exchange)(struct iommu_table *tbl, + long index, + unsigned long *hpa, + enum dma_data_direction *direction); +#endif void (*clear)(struct iommu_table *tbl, long index, long npages); + /* get() returns a physical address */ unsigned long (*get)(struct iommu_table *tbl, long index); void (*flush)(struct iommu_table *tbl); }; @@ -153,6 +169,8 @@ extern void iommu_register_group(struct iommu_table_group *table_group, extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); extern int __init tce_iommu_bus_notifier_init(void); +extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, + unsigned long *hpa, enum dma_data_direction *direction); #else static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, @@ -225,10 +243,6 @@ extern int iommu_tce_clear_param_check(struct iommu_table *tbl, unsigned long npages); extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); -extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction); -extern unsigned long iommu_clear_tce(struct iommu_table *tbl, - unsigned long entry); extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 0fb88005c3c5..a8e3490b54e3 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -965,10 +965,7 @@ EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check); int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce) { - if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ))) - return -EINVAL; - - if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ)) + if (tce & ~IOMMU_PAGE_MASK(tbl)) return -EINVAL; if (ioba & ~IOMMU_PAGE_MASK(tbl)) @@ -985,44 +982,16 @@ int iommu_tce_put_param_check(struct iommu_table *tbl, } EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) +long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, + unsigned long *hpa, enum dma_data_direction *direction) { - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); + long ret; - spin_lock(&(pool->lock)); + ret = tbl->it_ops->exchange(tbl, entry, hpa, direction); - oldtce = tbl->it_ops->get(tbl, entry); - if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) - tbl->it_ops->clear(tbl, entry, 1); - else - oldtce = 0; - - spin_unlock(&(pool->lock)); - - return oldtce; -} -EXPORT_SYMBOL_GPL(iommu_clear_tce); - -/* - * hwaddr is a kernel virtual address here (0xc... bazillion), - * tce_build converts it to a physical address. - */ -int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction) -{ - int ret = -EBUSY; - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - - spin_lock(&(pool->lock)); - - oldtce = tbl->it_ops->get(tbl, entry); - /* Add new entry if it is not busy */ - if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))) - ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL); - - spin_unlock(&(pool->lock)); + if (!ret && ((*direction == DMA_FROM_DEVICE) || + (*direction == DMA_BIDIRECTIONAL))) + SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); /* if (unlikely(ret)) pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", @@ -1031,13 +1000,23 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, return ret; } -EXPORT_SYMBOL_GPL(iommu_tce_build); +EXPORT_SYMBOL_GPL(iommu_tce_xchg); int iommu_take_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; int ret = 0; + /* + * VFIO does not control TCE entries allocation and the guest + * can write new TCEs on top of existing ones so iommu_tce_build() + * must be able to release old pages. This functionality + * requires exchange() callback defined so if it is not + * implemented, we disallow taking ownership over the table. + */ + if (!tbl->it_ops->exchange) + return -EINVAL; + spin_lock_irqsave(&tbl->large_pool.lock, flags); for (i = 0; i < tbl->nr_pools; i++) spin_lock(&tbl->pools[i].lock); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 390863608569..ecbc071a143e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1738,6 +1738,20 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, return ret; } +#ifdef CONFIG_IOMMU_API +static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret && (tbl->it_type & + (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE))) + pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false); + + return ret; +} +#endif + static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, long npages) { @@ -1749,6 +1763,9 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, static struct iommu_table_ops pnv_ioda1_iommu_ops = { .set = pnv_ioda1_tce_build, +#ifdef CONFIG_IOMMU_API + .exchange = pnv_ioda1_tce_xchg, +#endif .clear = pnv_ioda1_tce_free, .get = pnv_tce_get, }; @@ -1824,6 +1841,20 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, return ret; } +#ifdef CONFIG_IOMMU_API +static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret && (tbl->it_type & + (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE))) + pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); + + return ret; +} +#endif + static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, long npages) { @@ -1835,6 +1866,9 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, +#ifdef CONFIG_IOMMU_API + .exchange = pnv_ioda2_tce_xchg, +#endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, }; diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index f80e5a1d6117..eaec85b63b34 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -85,6 +85,9 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { } static struct iommu_table_ops pnv_p5ioc2_iommu_ops = { .set = pnv_tce_build, +#ifdef CONFIG_IOMMU_API + .exchange = pnv_tce_xchg, +#endif .clear = pnv_tce_free, .get = pnv_tce_get, }; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index a07b83283ccc..d465b9c32388 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -598,6 +598,24 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, return 0; } +#ifdef CONFIG_IOMMU_API +int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + u64 proto_tce = iommu_direction_to_tce_perm(*direction); + unsigned long newtce = *hpa | proto_tce, oldtce; + unsigned long idx = index - tbl->it_offset; + + BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); + + oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce)); + *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE); + *direction = iommu_tce_direction(oldtce); + + return 0; +} +#endif + void pnv_tce_free(struct iommu_table *tbl, long index, long npages) { long i; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 792a723c36ec..8ef2d28aded0 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -207,6 +207,8 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs); extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); +extern int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 9c720de46c33..a9e2d13c03c0 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -236,18 +236,11 @@ static void tce_iommu_release(void *iommu_data) } static void tce_iommu_unuse_page(struct tce_container *container, - unsigned long oldtce) + unsigned long hpa) { struct page *page; - if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE))) - return; - - page = pfn_to_page(oldtce >> PAGE_SHIFT); - - if (oldtce & TCE_PCI_WRITE) - SetPageDirty(page); - + page = pfn_to_page(hpa >> PAGE_SHIFT); put_page(page); } @@ -255,14 +248,21 @@ static int tce_iommu_clear(struct tce_container *container, struct iommu_table *tbl, unsigned long entry, unsigned long pages) { - unsigned long oldtce; + unsigned long oldhpa; + long ret; + enum dma_data_direction direction; for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) + direction = DMA_NONE; + oldhpa = 0; + ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction); + if (ret) + continue; + + if (direction == DMA_NONE) continue; - tce_iommu_unuse_page(container, oldtce); + tce_iommu_unuse_page(container, oldhpa); } return 0; @@ -284,12 +284,13 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) static long tce_iommu_build(struct tce_container *container, struct iommu_table *tbl, - unsigned long entry, unsigned long tce, unsigned long pages) + unsigned long entry, unsigned long tce, unsigned long pages, + enum dma_data_direction direction) { long i, ret = 0; struct page *page; unsigned long hpa; - enum dma_data_direction direction = iommu_tce_direction(tce); + enum dma_data_direction dirtmp; for (i = 0; i < pages; ++i) { unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; @@ -305,8 +306,8 @@ static long tce_iommu_build(struct tce_container *container, } hpa |= offset; - ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa), - direction); + dirtmp = direction; + ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); if (ret) { tce_iommu_unuse_page(container, hpa); pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", @@ -314,6 +315,10 @@ static long tce_iommu_build(struct tce_container *container, tce, ret); break; } + + if (dirtmp != DMA_NONE) + tce_iommu_unuse_page(container, hpa); + tce += IOMMU_PAGE_SIZE(tbl); } @@ -378,8 +383,8 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_IOMMU_MAP_DMA: { struct vfio_iommu_type1_dma_map param; struct iommu_table *tbl = NULL; - unsigned long tce; long num; + enum dma_data_direction direction; if (!container->enabled) return -EPERM; @@ -405,19 +410,27 @@ static long tce_iommu_ioctl(void *iommu_data, return -EINVAL; /* iova is checked by the IOMMU API */ - tce = param.vaddr; - if (param.flags & VFIO_DMA_MAP_FLAG_READ) - tce |= TCE_PCI_READ; - if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) - tce |= TCE_PCI_WRITE; + if (param.flags & VFIO_DMA_MAP_FLAG_READ) { + if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) + direction = DMA_BIDIRECTIONAL; + else + direction = DMA_TO_DEVICE; + } else { + if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) + direction = DMA_FROM_DEVICE; + else + return -EINVAL; + } - ret = iommu_tce_put_param_check(tbl, param.iova, tce); + ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr); if (ret) return ret; ret = tce_iommu_build(container, tbl, param.iova >> tbl->it_page_shift, - tce, param.size >> tbl->it_page_shift); + param.vaddr, + param.size >> tbl->it_page_shift, + direction); iommu_flush_tce(tbl); -- cgit v1.2.3 From 4793d65d1ac056d92b594d05c6aab3c040d913dd Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:20 +1000 Subject: vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API This extends iommu_table_group_ops by a set of callbacks to support dynamic DMA windows management. create_table() creates a TCE table with specific parameters. it receives iommu_table_group to know nodeid in order to allocate TCE table memory closer to the PHB. The exact format of allocated multi-level table might be also specific to the PHB model (not the case now though). This callback calculated the DMA window offset on a PCI bus from @num and stores it in a just created table. set_window() sets the window at specified TVT index + @num on PHB. unset_window() unsets the window from specified TVT. This adds a free() callback to iommu_table_ops to free the memory (potentially a tree of tables) allocated for the TCE table. create_table() and free() are supposed to be called once per VFIO container and set_window()/unset_window() are supposed to be called for every group in a container. This adds IOMMU capabilities to iommu_table_group such as default 32bit window parameters and others. This makes use of new values in vfio_iommu_spapr_tce. IODA1/P5IOC2 do not support DDW so they do not advertise pagemasks to the userspace. Signed-off-by: Alexey Kardashevskiy Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 19 ++++++ arch/powerpc/platforms/powernv/pci-ioda.c | 96 ++++++++++++++++++++++++++--- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 7 ++- drivers/vfio/vfio_iommu_spapr_tce.c | 19 +++--- 4 files changed, 124 insertions(+), 17 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 706cfc0b1190..e5541759a37d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -70,6 +70,7 @@ struct iommu_table_ops { /* get() returns a physical address */ unsigned long (*get)(struct iommu_table *tbl, long index); void (*flush)(struct iommu_table *tbl); + void (*free)(struct iommu_table *tbl); }; /* These are used by VIO */ @@ -146,6 +147,17 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, struct iommu_table_group; struct iommu_table_group_ops { + long (*create_table)(struct iommu_table_group *table_group, + int num, + __u32 page_shift, + __u64 window_size, + __u32 levels, + struct iommu_table **ptbl); + long (*set_window)(struct iommu_table_group *table_group, + int num, + struct iommu_table *tblnew); + long (*unset_window)(struct iommu_table_group *table_group, + int num); /* Switch ownership from platform code to external user (e.g. VFIO) */ void (*take_ownership)(struct iommu_table_group *table_group); /* Switch ownership from external user (e.g. VFIO) back to core */ @@ -159,6 +171,13 @@ struct iommu_table_group_link { }; struct iommu_table_group { + /* IOMMU properties */ + __u32 tce32_start; + __u32 tce32_size; + __u64 pgsizes; /* Bitmap of supported page sizes */ + __u32 max_dynamic_windows_supported; + __u32 max_levels; + struct iommu_group *group; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; struct iommu_table_group_ops *ops; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 21d8c61f0b03..ee3e643c16bb 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -1870,6 +1871,12 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); } +static void pnv_ioda2_table_free(struct iommu_table *tbl) +{ + pnv_pci_ioda2_table_free_pages(tbl); + iommu_free_table(tbl, "pnv"); +} + static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API @@ -1877,6 +1884,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, + .free = pnv_ioda2_table_free, }; static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, @@ -1947,6 +1955,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, TCE_PCI_SWINV_PAIR); tbl->it_ops = &pnv_ioda1_iommu_ops; + pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; + pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; iommu_init_table(tbl, phb->hose->node); if (pe->flags & PNV_IODA_PE_DEV) { @@ -1985,7 +1995,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; const __u64 win_size = tbl->it_size << tbl->it_page_shift; - pe_info(pe, "Setting up window %llx..%llx pg=%x\n", + pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num, start_addr, start_addr + win_size - 1, IOMMU_PAGE_SIZE(tbl)); @@ -1995,7 +2005,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, */ rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - pe->pe_number << 1, + (pe->pe_number << 1) + num, tbl->it_indirect_levels + 1, __pa(tbl->it_base), size << 3, @@ -2040,7 +2050,67 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) pe->tce_bypass_enabled = enable; } +static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table *tbl); + +static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, + int num, __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + int nid = pe->phb->hose->node; + __u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start; + long ret; + struct iommu_table *tbl; + + tbl = pnv_pci_table_alloc(nid); + if (!tbl) + return -ENOMEM; + + ret = pnv_pci_ioda2_table_alloc_pages(nid, + bus_offset, page_shift, window_size, + levels, tbl); + if (ret) { + iommu_free_table(tbl, "pnv"); + return ret; + } + + tbl->it_ops = &pnv_ioda2_iommu_ops; + if (pe->phb->ioda.tce_inval_reg) + tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + + *ptbl = tbl; + + return 0; +} + #ifdef CONFIG_IOMMU_API +static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, + int num) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + struct pnv_phb *phb = pe->phb; + long ret; + + pe_info(pe, "Removing DMA window #%d\n", num); + + ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + (pe->pe_number << 1) + num, + 0/* levels */, 0/* table address */, + 0/* table size */, 0/* page size */); + if (ret) + pe_warn(pe, "Unmapping failed, ret = %ld\n", ret); + else + pnv_pci_ioda2_tce_invalidate_entire(pe); + + pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); + + return ret; +} + static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, @@ -2060,6 +2130,9 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) } static struct iommu_table_group_ops pnv_pci_ioda2_ops = { + .create_table = pnv_pci_ioda2_create_table, + .set_window = pnv_pci_ioda2_set_window, + .unset_window = pnv_pci_ioda2_unset_window, .take_ownership = pnv_ioda2_take_ownership, .release_ownership = pnv_ioda2_release_ownership, }; @@ -2214,7 +2287,7 @@ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - struct iommu_table *tbl; + struct iommu_table *tbl = NULL; int64_t rc; /* We shouldn't already have a 32-bit DMA associated */ @@ -2224,10 +2297,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; - tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); - pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); /* The PE will reserve all possible 32-bits space */ pe->tce32_seg = 0; @@ -2235,13 +2306,22 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, phb->ioda.m32_pci_base); /* Setup linux iommu table */ - rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node, - 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, - POWERNV_IOMMU_DEFAULT_LEVELS, tbl); + pe->table_group.tce32_start = 0; + pe->table_group.tce32_size = phb->ioda.m32_pci_base; + pe->table_group.max_dynamic_windows_supported = + IOMMU_TABLE_GROUP_MAX_TABLES; + pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS; + pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M; + + rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, + IOMMU_PAGE_SHIFT_4K, + pe->table_group.tce32_size, + POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); if (rc) { pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); goto fail; } + pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); tbl->it_ops = &pnv_ioda2_iommu_ops; iommu_init_table(tbl, phb->hose->node); diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index eaec85b63b34..f2bdfea3b68d 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -127,6 +127,8 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, u64 phb_id; int64_t rc; static int primary = 1; + struct iommu_table_group *table_group; + struct iommu_table *tbl; pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name); @@ -201,7 +203,10 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table() * should not be called for phb->p5ioc2.table_group.tables[0] ever. */ - phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table; + tbl = phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table; + table_group = &phb->p5ioc2.table_group; + table_group->tce32_start = tbl->it_offset << tbl->it_page_shift; + table_group->tce32_size = tbl->it_size << tbl->it_page_shift; } void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index a9e2d13c03c0..6d919eb4251f 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -135,7 +135,6 @@ static int tce_iommu_enable(struct tce_container *container) { int ret = 0; unsigned long locked; - struct iommu_table *tbl; struct iommu_table_group *table_group; if (!container->grp) @@ -171,13 +170,19 @@ static int tce_iommu_enable(struct tce_container *container) * this is that we cannot tell here the amount of RAM used by the guest * as this information is only available from KVM and VFIO is * KVM agnostic. + * + * So we do not allow enabling a container without a group attached + * as there is no way to know how much we should increment + * the locked_vm counter. */ table_group = iommu_group_get_iommudata(container->grp); if (!table_group) return -ENODEV; - tbl = table_group->tables[0]; - locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; + if (!table_group->tce32_size) + return -EPERM; + + locked = table_group->tce32_size >> PAGE_SHIFT; ret = try_increment_locked_vm(locked); if (ret) return ret; @@ -350,7 +355,6 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { struct vfio_iommu_spapr_tce_info info; - struct iommu_table *tbl; struct iommu_table_group *table_group; if (WARN_ON(!container->grp)) @@ -358,8 +362,7 @@ static long tce_iommu_ioctl(void *iommu_data, table_group = iommu_group_get_iommudata(container->grp); - tbl = table_group->tables[0]; - if (WARN_ON_ONCE(!tbl)) + if (!table_group) return -ENXIO; minsz = offsetofend(struct vfio_iommu_spapr_tce_info, @@ -371,8 +374,8 @@ static long tce_iommu_ioctl(void *iommu_data, if (info.argsz < minsz) return -EINVAL; - info.dma32_window_start = tbl->it_offset << tbl->it_page_shift; - info.dma32_window_size = tbl->it_size << tbl->it_page_shift; + info.dma32_window_start = table_group->tce32_start; + info.dma32_window_size = table_group->tce32_size; info.flags = 0; if (copy_to_user((void __user *)arg, &info, minsz)) -- cgit v1.2.3 From 46d3e1e16294c587a74093b1f5474c1b33b72381 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:23 +1000 Subject: vfio: powerpc/spapr: powerpc/powernv/ioda2: Use DMA windows API in ownership control Before the IOMMU user (VFIO) would take control over the IOMMU table belonging to a specific IOMMU group. This approach did not allow sharing tables between IOMMU groups attached to the same container. This introduces a new IOMMU ownership flavour when the user can not just control the existing IOMMU table but remove/create tables on demand. If an IOMMU implements take/release_ownership() callbacks, this lets the user have full control over the IOMMU group. When the ownership is taken, the platform code removes all the windows so the caller must create them. Before returning the ownership back to the platform code, VFIO unprograms and removes all the tables it created. This changes IODA2's onwership handler to remove the existing table rather than manipulating with the existing one. From now on, iommu_take_ownership() and iommu_release_ownership() are only called from the vfio_iommu_spapr_tce driver. Old-style ownership is still supported allowing VFIO to run on older P5IOC2 and IODA IO controllers. No change in userspace-visible behaviour is expected. Since it recreates TCE tables on each ownership change, related kernel traces will appear more often. This adds a pnv_pci_ioda2_setup_default_config() which is called when PE is being configured at boot time and when the ownership is passed from VFIO to the platform code. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 101 ++++++++++++++++-------------- drivers/vfio/vfio_iommu_spapr_tce.c | 88 +++++++++++++++++++++++++- 2 files changed, 141 insertions(+), 48 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index a7e098dba23d..b9f0f430e249 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2073,6 +2073,49 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, return 0; } +static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) +{ + struct iommu_table *tbl = NULL; + long rc; + + rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, + IOMMU_PAGE_SHIFT_4K, + pe->table_group.tce32_size, + POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); + if (rc) { + pe_err(pe, "Failed to create 32-bit TCE table, err %ld", + rc); + return rc; + } + + iommu_init_table(tbl, pe->phb->hose->node); + + rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + if (rc) { + pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", + rc); + pnv_ioda2_table_free(tbl); + return rc; + } + + if (!pnv_iommu_bypass_disabled) + pnv_pci_ioda2_set_bypass(pe, true); + + /* OPAL variant of PHB3 invalidated TCEs */ + if (pe->phb->ioda.tce_inval_reg) + tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + + /* + * Setting table base here only for carrying iommu_group + * further down to let iommu_add_device() do the job. + * pnv_pci_ioda_dma_dev_setup will override it later anyway. + */ + if (pe->flags & PNV_IODA_PE_DEV) + set_iommu_table_base(&pe->pdev->dev, tbl); + + return 0; +} + #ifdef CONFIG_IOMMU_API static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, __u64 window_size, __u32 levels) @@ -2134,9 +2177,12 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); + /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */ + struct iommu_table *tbl = pe->table_group.tables[0]; - iommu_take_ownership(table_group->tables[0]); pnv_pci_ioda2_set_bypass(pe, false); + pnv_pci_ioda2_unset_window(&pe->table_group, 0); + pnv_ioda2_table_free(tbl); } static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) @@ -2144,8 +2190,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); - iommu_release_ownership(table_group->tables[0]); - pnv_pci_ioda2_set_bypass(pe, true); + pnv_pci_ioda2_setup_default_config(pe); } static struct iommu_table_group_ops pnv_pci_ioda2_ops = { @@ -2308,7 +2353,6 @@ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - struct iommu_table *tbl = NULL; int64_t rc; /* We shouldn't already have a 32-bit DMA associated */ @@ -2333,58 +2377,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, IOMMU_TABLE_GROUP_MAX_TABLES; pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS; pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M; - - rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, - IOMMU_PAGE_SHIFT_4K, - pe->table_group.tce32_size, - POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); - if (rc) { - pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); - goto fail; - } - pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); - - tbl->it_ops = &pnv_ioda2_iommu_ops; - iommu_init_table(tbl, phb->hose->node); #ifdef CONFIG_IOMMU_API pe->table_group.ops = &pnv_pci_ioda2_ops; #endif - rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + rc = pnv_pci_ioda2_setup_default_config(pe); if (rc) { - pe_err(pe, "Failed to configure 32-bit TCE table," - " err %ld\n", rc); - goto fail; + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + return; } - /* OPAL variant of PHB3 invalidated TCEs */ - if (phb->ioda.tce_inval_reg) - tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); - - if (pe->flags & PNV_IODA_PE_DEV) { - /* - * Setting table base here only for carrying iommu_group - * further down to let iommu_add_device() do the job. - * pnv_pci_ioda_dma_dev_setup will override it later anyway. - */ - set_iommu_table_base(&pe->pdev->dev, tbl); + if (pe->flags & PNV_IODA_PE_DEV) iommu_add_device(&pe->pdev->dev); - } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) + else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus); - - /* Also create a bypass window */ - if (!pnv_iommu_bypass_disabled) - pnv_pci_ioda2_set_bypass(pe, true); - - return; -fail: - if (pe->tce32_seg >= 0) - pe->tce32_seg = -1; - if (tbl) { - pnv_pci_ioda2_table_free_pages(tbl); - pnv_pci_unlink_table_and_group(tbl, &pe->table_group); - iommu_free_table(tbl, "pnv"); - } } static void pnv_ioda_setup_dma(struct pnv_phb *phb) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 6d919eb4251f..203caacf2242 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -333,6 +333,45 @@ static long tce_iommu_build(struct tce_container *container, return ret; } +static long tce_iommu_create_table(struct tce_container *container, + struct iommu_table_group *table_group, + int num, + __u32 page_shift, + __u64 window_size, + __u32 levels, + struct iommu_table **ptbl) +{ + long ret, table_size; + + table_size = table_group->ops->get_table_size(page_shift, window_size, + levels); + if (!table_size) + return -EINVAL; + + ret = try_increment_locked_vm(table_size >> PAGE_SHIFT); + if (ret) + return ret; + + ret = table_group->ops->create_table(table_group, num, + page_shift, window_size, levels, ptbl); + + WARN_ON(!ret && !(*ptbl)->it_ops->free); + WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size)); + + if (ret) + decrement_locked_vm(table_size >> PAGE_SHIFT); + + return ret; +} + +static void tce_iommu_free_table(struct iommu_table *tbl) +{ + unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; + + tbl->it_ops->free(tbl); + decrement_locked_vm(pages); +} + static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -546,15 +585,62 @@ static int tce_iommu_take_ownership(struct tce_container *container, static void tce_iommu_release_ownership_ddw(struct tce_container *container, struct iommu_table_group *table_group) { + long i; + + if (!table_group->ops->unset_window) { + WARN_ON_ONCE(1); + return; + } + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + /* Store table pointer as unset_window resets it */ + struct iommu_table *tbl = table_group->tables[i]; + + if (!tbl) + continue; + + table_group->ops->unset_window(table_group, i); + tce_iommu_clear(container, tbl, + tbl->it_offset, tbl->it_size); + tce_iommu_free_table(tbl); + } + table_group->ops->release_ownership(table_group); } static long tce_iommu_take_ownership_ddw(struct tce_container *container, struct iommu_table_group *table_group) { + long ret; + struct iommu_table *tbl = NULL; + + if (!table_group->ops->create_table || !table_group->ops->set_window || + !table_group->ops->release_ownership) { + WARN_ON_ONCE(1); + return -EFAULT; + } + table_group->ops->take_ownership(table_group); - return 0; + ret = tce_iommu_create_table(container, + table_group, + 0, /* window number */ + IOMMU_PAGE_SHIFT_4K, + table_group->tce32_size, + 1, /* default levels */ + &tbl); + if (!ret) { + ret = table_group->ops->set_window(table_group, 0, tbl); + if (ret) + tce_iommu_free_table(tbl); + else + table_group->tables[0] = tbl; + } + + if (ret) + table_group->ops->release_ownership(table_group); + + return ret; } static int tce_iommu_attach_group(void *iommu_data, -- cgit v1.2.3 From 2157e7b82f3b81f57bd80cd67cef09ef26e5f74c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:25 +1000 Subject: vfio: powerpc/spapr: Register memory and define IOMMU v2 The existing implementation accounts the whole DMA window in the locked_vm counter. This is going to be worse with multiple containers and huge DMA windows. Also, real-time accounting would requite additional tracking of accounted pages due to the page size difference - IOMMU uses 4K pages and system uses 4K or 64K pages. Another issue is that actual pages pinning/unpinning happens on every DMA map/unmap request. This does not affect the performance much now as we spend way too much time now on switching context between guest/userspace/host but this will start to matter when we add in-kernel DMA map/unmap acceleration. This introduces a new IOMMU type for SPAPR - VFIO_SPAPR_TCE_v2_IOMMU. New IOMMU deprecates VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE and introduces 2 new ioctls to register/unregister DMA memory - VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - which receive user space address and size of a memory region which needs to be pinned/unpinned and counted in locked_vm. New IOMMU splits physical pages pinning and TCE table update into 2 different operations. It requires: 1) guest pages to be registered first 2) consequent map/unmap requests to work only with pre-registered memory. For the default single window case this means that the entire guest (instead of 2GB) needs to be pinned before using VFIO. When a huge DMA window is added, no additional pinning will be required, otherwise it would be guest RAM + 2GB. The new memory registration ioctls are not supported by VFIO_SPAPR_TCE_IOMMU. Dynamic DMA window and in-kernel acceleration will require memory to be preregistered in order to work. The accounting is done per the user process. This advertises v2 SPAPR TCE IOMMU and restricts what the userspace can do with v1 or v2 IOMMUs. In order to support memory pre-registration, we need a way to track the use of every registered memory region and only allow unregistration if a region is not in use anymore. So we need a way to tell from what region the just cleared TCE was from. This adds a userspace view of the TCE table into iommu_table struct. It contains userspace address, one per TCE entry. The table is only allocated when the ownership over an IOMMU group is taken which means it is only used from outside of the powernv code (such as VFIO). As v2 IOMMU supports IODA2 and pre-IODA2 IOMMUs (which do not support DDW API), this creates a default DMA window for IODA2 for consistency. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- Documentation/vfio.txt | 31 ++- arch/powerpc/include/asm/iommu.h | 6 + drivers/vfio/vfio_iommu_spapr_tce.c | 501 ++++++++++++++++++++++++++++++------ include/uapi/linux/vfio.h | 27 ++ 4 files changed, 482 insertions(+), 83 deletions(-) (limited to 'drivers/vfio') diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index 4c746a7e717a..dcc37e109c68 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -289,10 +289,12 @@ PPC64 sPAPR implementation note This implementation has some specifics: -1) Only one IOMMU group per container is supported as an IOMMU group -represents the minimal entity which isolation can be guaranteed for and -groups are allocated statically, one per a Partitionable Endpoint (PE) +1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per +container is supported as an IOMMU table is allocated at the boot time, +one table per a IOMMU group which is a Partitionable Endpoint (PE) (PE is often a PCI domain but not always). +Newer systems (POWER8 with IODA2) have improved hardware design which allows +to remove this limitation and have multiple IOMMU groups per a VFIO container. 2) The hardware supports so called DMA windows - the PCI address range within which DMA transfer is allowed, any attempt to access address space @@ -439,6 +441,29 @@ The code flow from the example above should be slightly changed: .... +5) There is v2 of SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/ +VFIO_IOMMU_DISABLE and implements 2 new ioctls: +VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY +(which are unsupported in v1 IOMMU). + +PPC64 paravirtualized guests generate a lot of map/unmap requests, +and the handling of those includes pinning/unpinning pages and updating +mm::locked_vm counter to make sure we do not exceed the rlimit. +The v2 IOMMU splits accounting and pinning into separate operations: + +- VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls +receive a user space address and size of the block to be pinned. +Bisecting is not supported and VFIO_IOMMU_UNREGISTER_MEMORY is expected to +be called with the exact address and size used for registering +the memory block. The userspace is not expected to call these often. +The ranges are stored in a linked list in a VFIO container. + +- VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual +IOMMU table and do not do pinning; instead these check that the userspace +address is from pre-registered range. + +This separation helps in optimizing DMA for guests. + ------------------------------------------------------------------------------- [1] VFIO was originally an acronym for "Virtual Function I/O" in its diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 9d3749287689..f9957eb4c659 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -112,9 +112,15 @@ struct iommu_table { unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ struct list_head it_group_list;/* List of iommu_table_group_link */ + unsigned long *it_userspace; /* userspace view of the table */ struct iommu_table_ops *it_ops; }; +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ + ((tbl)->it_userspace ? \ + &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \ + NULL) + /* Pure 2^n version of get_order */ static inline __attribute_const__ int get_iommu_order(unsigned long size, struct iommu_table *tbl) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 203caacf2242..91a32239bd0a 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -19,8 +19,10 @@ #include #include #include +#include #include #include +#include #define DRIVER_VERSION "0.1" #define DRIVER_AUTHOR "aik@ozlabs.ru" @@ -81,6 +83,11 @@ static void decrement_locked_vm(long npages) * into DMA'ble space using the IOMMU */ +struct tce_iommu_group { + struct list_head next; + struct iommu_group *grp; +}; + /* * The container descriptor supports only a single group per container. * Required by the API as the container is not supplied with the IOMMU group @@ -88,11 +95,84 @@ static void decrement_locked_vm(long npages) */ struct tce_container { struct mutex lock; - struct iommu_group *grp; bool enabled; + bool v2; unsigned long locked_pages; + struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; + struct list_head group_list; }; +static long tce_iommu_unregister_pages(struct tce_container *container, + __u64 vaddr, __u64 size) +{ + struct mm_iommu_table_group_mem_t *mem; + + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) + return -EINVAL; + + mem = mm_iommu_find(vaddr, size >> PAGE_SHIFT); + if (!mem) + return -ENOENT; + + return mm_iommu_put(mem); +} + +static long tce_iommu_register_pages(struct tce_container *container, + __u64 vaddr, __u64 size) +{ + long ret = 0; + struct mm_iommu_table_group_mem_t *mem = NULL; + unsigned long entries = size >> PAGE_SHIFT; + + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) || + ((vaddr + size) < vaddr)) + return -EINVAL; + + ret = mm_iommu_get(vaddr, entries, &mem); + if (ret) + return ret; + + container->enabled = true; + + return 0; +} + +static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl) +{ + unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * + tbl->it_size, PAGE_SIZE); + unsigned long *uas; + long ret; + + BUG_ON(tbl->it_userspace); + + ret = try_increment_locked_vm(cb >> PAGE_SHIFT); + if (ret) + return ret; + + uas = vzalloc(cb); + if (!uas) { + decrement_locked_vm(cb >> PAGE_SHIFT); + return -ENOMEM; + } + tbl->it_userspace = uas; + + return 0; +} + +static void tce_iommu_userspace_view_free(struct iommu_table *tbl) +{ + unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * + tbl->it_size, PAGE_SIZE); + + if (!tbl->it_userspace) + return; + + vfree(tbl->it_userspace); + tbl->it_userspace = NULL; + decrement_locked_vm(cb >> PAGE_SHIFT); +} + static bool tce_page_is_contained(struct page *page, unsigned page_shift) { /* @@ -103,18 +183,18 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift) return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; } +static inline bool tce_groups_attached(struct tce_container *container) +{ + return !list_empty(&container->group_list); +} + static long tce_iommu_find_table(struct tce_container *container, phys_addr_t ioba, struct iommu_table **ptbl) { long i; - struct iommu_table_group *table_group; - - table_group = iommu_group_get_iommudata(container->grp); - if (!table_group) - return -1; for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { - struct iommu_table *tbl = table_group->tables[i]; + struct iommu_table *tbl = container->tables[i]; if (tbl) { unsigned long entry = ioba >> tbl->it_page_shift; @@ -136,9 +216,7 @@ static int tce_iommu_enable(struct tce_container *container) int ret = 0; unsigned long locked; struct iommu_table_group *table_group; - - if (!container->grp) - return -ENXIO; + struct tce_iommu_group *tcegrp; if (!current->mm) return -ESRCH; /* process exited */ @@ -175,7 +253,12 @@ static int tce_iommu_enable(struct tce_container *container) * as there is no way to know how much we should increment * the locked_vm counter. */ - table_group = iommu_group_get_iommudata(container->grp); + if (!tce_groups_attached(container)) + return -ENODEV; + + tcegrp = list_first_entry(&container->group_list, + struct tce_iommu_group, next); + table_group = iommu_group_get_iommudata(tcegrp->grp); if (!table_group) return -ENODEV; @@ -211,7 +294,7 @@ static void *tce_iommu_open(unsigned long arg) { struct tce_container *container; - if (arg != VFIO_SPAPR_TCE_IOMMU) { + if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) { pr_err("tce_vfio: Wrong IOMMU type\n"); return ERR_PTR(-EINVAL); } @@ -221,18 +304,45 @@ static void *tce_iommu_open(unsigned long arg) return ERR_PTR(-ENOMEM); mutex_init(&container->lock); + INIT_LIST_HEAD_RCU(&container->group_list); + + container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU; return container; } +static int tce_iommu_clear(struct tce_container *container, + struct iommu_table *tbl, + unsigned long entry, unsigned long pages); +static void tce_iommu_free_table(struct iommu_table *tbl); + static void tce_iommu_release(void *iommu_data) { struct tce_container *container = iommu_data; + struct iommu_table_group *table_group; + struct tce_iommu_group *tcegrp; + long i; - WARN_ON(container->grp); + while (tce_groups_attached(container)) { + tcegrp = list_first_entry(&container->group_list, + struct tce_iommu_group, next); + table_group = iommu_group_get_iommudata(tcegrp->grp); + tce_iommu_detach_group(iommu_data, tcegrp->grp); + } - if (container->grp) - tce_iommu_detach_group(iommu_data, container->grp); + /* + * If VFIO created a table, it was not disposed + * by tce_iommu_detach_group() so do it now. + */ + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = container->tables[i]; + + if (!tbl) + continue; + + tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); + tce_iommu_free_table(tbl); + } tce_iommu_disable(container); mutex_destroy(&container->lock); @@ -249,6 +359,47 @@ static void tce_iommu_unuse_page(struct tce_container *container, put_page(page); } +static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size, + unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem) +{ + long ret = 0; + struct mm_iommu_table_group_mem_t *mem; + + mem = mm_iommu_lookup(tce, size); + if (!mem) + return -EINVAL; + + ret = mm_iommu_ua_to_hpa(mem, tce, phpa); + if (ret) + return -EINVAL; + + *pmem = mem; + + return 0; +} + +static void tce_iommu_unuse_page_v2(struct iommu_table *tbl, + unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + int ret; + unsigned long hpa = 0; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua || !current || !current->mm) + return; + + ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl), + &hpa, &mem); + if (ret) + pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n", + __func__, *pua, entry, ret); + if (mem) + mm_iommu_mapped_dec(mem); + + *pua = 0; +} + static int tce_iommu_clear(struct tce_container *container, struct iommu_table *tbl, unsigned long entry, unsigned long pages) @@ -267,6 +418,11 @@ static int tce_iommu_clear(struct tce_container *container, if (direction == DMA_NONE) continue; + if (container->v2) { + tce_iommu_unuse_page_v2(tbl, entry); + continue; + } + tce_iommu_unuse_page(container, oldhpa); } @@ -333,6 +489,64 @@ static long tce_iommu_build(struct tce_container *container, return ret; } +static long tce_iommu_build_v2(struct tce_container *container, + struct iommu_table *tbl, + unsigned long entry, unsigned long tce, unsigned long pages, + enum dma_data_direction direction) +{ + long i, ret = 0; + struct page *page; + unsigned long hpa; + enum dma_data_direction dirtmp; + + for (i = 0; i < pages; ++i) { + struct mm_iommu_table_group_mem_t *mem = NULL; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, + entry + i); + + ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl), + &hpa, &mem); + if (ret) + break; + + page = pfn_to_page(hpa >> PAGE_SHIFT); + if (!tce_page_is_contained(page, tbl->it_page_shift)) { + ret = -EPERM; + break; + } + + /* Preserve offset within IOMMU page */ + hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; + dirtmp = direction; + + /* The registered region is being unregistered */ + if (mm_iommu_mapped_inc(mem)) + break; + + ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); + if (ret) { + /* dirtmp cannot be DMA_NONE here */ + tce_iommu_unuse_page_v2(tbl, entry + i); + pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", + __func__, entry << tbl->it_page_shift, + tce, ret); + break; + } + + if (dirtmp != DMA_NONE) + tce_iommu_unuse_page_v2(tbl, entry + i); + + *pua = tce; + + tce += IOMMU_PAGE_SIZE(tbl); + } + + if (ret) + tce_iommu_clear(container, tbl, entry, i); + + return ret; +} + static long tce_iommu_create_table(struct tce_container *container, struct iommu_table_group *table_group, int num, @@ -358,6 +572,12 @@ static long tce_iommu_create_table(struct tce_container *container, WARN_ON(!ret && !(*ptbl)->it_ops->free); WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size)); + if (!ret && container->v2) { + ret = tce_iommu_userspace_view_alloc(*ptbl); + if (ret) + (*ptbl)->it_ops->free(*ptbl); + } + if (ret) decrement_locked_vm(table_size >> PAGE_SHIFT); @@ -368,6 +588,7 @@ static void tce_iommu_free_table(struct iommu_table *tbl) { unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; + tce_iommu_userspace_view_free(tbl); tbl->it_ops->free(tbl); decrement_locked_vm(pages); } @@ -383,6 +604,7 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_CHECK_EXTENSION: switch (arg) { case VFIO_SPAPR_TCE_IOMMU: + case VFIO_SPAPR_TCE_v2_IOMMU: ret = 1; break; default: @@ -394,12 +616,15 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { struct vfio_iommu_spapr_tce_info info; + struct tce_iommu_group *tcegrp; struct iommu_table_group *table_group; - if (WARN_ON(!container->grp)) + if (!tce_groups_attached(container)) return -ENXIO; - table_group = iommu_group_get_iommudata(container->grp); + tcegrp = list_first_entry(&container->group_list, + struct tce_iommu_group, next); + table_group = iommu_group_get_iommudata(tcegrp->grp); if (!table_group) return -ENXIO; @@ -468,11 +693,18 @@ static long tce_iommu_ioctl(void *iommu_data, if (ret) return ret; - ret = tce_iommu_build(container, tbl, - param.iova >> tbl->it_page_shift, - param.vaddr, - param.size >> tbl->it_page_shift, - direction); + if (container->v2) + ret = tce_iommu_build_v2(container, tbl, + param.iova >> tbl->it_page_shift, + param.vaddr, + param.size >> tbl->it_page_shift, + direction); + else + ret = tce_iommu_build(container, tbl, + param.iova >> tbl->it_page_shift, + param.vaddr, + param.size >> tbl->it_page_shift, + direction); iommu_flush_tce(tbl); @@ -518,7 +750,62 @@ static long tce_iommu_ioctl(void *iommu_data, return ret; } + case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: { + struct vfio_iommu_spapr_register_memory param; + + if (!container->v2) + break; + + minsz = offsetofend(struct vfio_iommu_spapr_register_memory, + size); + + if (copy_from_user(¶m, (void __user *)arg, minsz)) + return -EFAULT; + + if (param.argsz < minsz) + return -EINVAL; + + /* No flag is supported now */ + if (param.flags) + return -EINVAL; + + mutex_lock(&container->lock); + ret = tce_iommu_register_pages(container, param.vaddr, + param.size); + mutex_unlock(&container->lock); + + return ret; + } + case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: { + struct vfio_iommu_spapr_register_memory param; + + if (!container->v2) + break; + + minsz = offsetofend(struct vfio_iommu_spapr_register_memory, + size); + + if (copy_from_user(¶m, (void __user *)arg, minsz)) + return -EFAULT; + + if (param.argsz < minsz) + return -EINVAL; + + /* No flag is supported now */ + if (param.flags) + return -EINVAL; + + mutex_lock(&container->lock); + ret = tce_iommu_unregister_pages(container, param.vaddr, + param.size); + mutex_unlock(&container->lock); + + return ret; + } case VFIO_IOMMU_ENABLE: + if (container->v2) + break; + mutex_lock(&container->lock); ret = tce_iommu_enable(container); mutex_unlock(&container->lock); @@ -526,16 +813,27 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_IOMMU_DISABLE: + if (container->v2) + break; + mutex_lock(&container->lock); tce_iommu_disable(container); mutex_unlock(&container->lock); return 0; - case VFIO_EEH_PE_OP: - if (!container->grp) - return -ENODEV; - return vfio_spapr_iommu_eeh_ioctl(container->grp, - cmd, arg); + case VFIO_EEH_PE_OP: { + struct tce_iommu_group *tcegrp; + + ret = 0; + list_for_each_entry(tcegrp, &container->group_list, next) { + ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp, + cmd, arg); + if (ret) + return ret; + } + return ret; + } + } return -ENOTTY; @@ -547,14 +845,17 @@ static void tce_iommu_release_ownership(struct tce_container *container, int i; for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { - struct iommu_table *tbl = table_group->tables[i]; + struct iommu_table *tbl = container->tables[i]; if (!tbl) continue; tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); + tce_iommu_userspace_view_free(tbl); if (tbl->it_map) iommu_release_ownership(tbl); + + container->tables[i] = NULL; } } @@ -569,7 +870,10 @@ static int tce_iommu_take_ownership(struct tce_container *container, if (!tbl || !tbl->it_map) continue; - rc = iommu_take_ownership(tbl); + rc = tce_iommu_userspace_view_alloc(tbl); + if (!rc) + rc = iommu_take_ownership(tbl); + if (rc) { for (j = 0; j < i; ++j) iommu_release_ownership( @@ -579,6 +883,9 @@ static int tce_iommu_take_ownership(struct tce_container *container, } } + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) + container->tables[i] = table_group->tables[i]; + return 0; } @@ -592,18 +899,8 @@ static void tce_iommu_release_ownership_ddw(struct tce_container *container, return; } - for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { - /* Store table pointer as unset_window resets it */ - struct iommu_table *tbl = table_group->tables[i]; - - if (!tbl) - continue; - + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) table_group->ops->unset_window(table_group, i); - tce_iommu_clear(container, tbl, - tbl->it_offset, tbl->it_size); - tce_iommu_free_table(tbl); - } table_group->ops->release_ownership(table_group); } @@ -611,7 +908,7 @@ static void tce_iommu_release_ownership_ddw(struct tce_container *container, static long tce_iommu_take_ownership_ddw(struct tce_container *container, struct iommu_table_group *table_group) { - long ret; + long i, ret = 0; struct iommu_table *tbl = NULL; if (!table_group->ops->create_table || !table_group->ops->set_window || @@ -622,23 +919,45 @@ static long tce_iommu_take_ownership_ddw(struct tce_container *container, table_group->ops->take_ownership(table_group); - ret = tce_iommu_create_table(container, - table_group, - 0, /* window number */ - IOMMU_PAGE_SHIFT_4K, - table_group->tce32_size, - 1, /* default levels */ - &tbl); - if (!ret) { - ret = table_group->ops->set_window(table_group, 0, tbl); + /* + * If it the first group attached, check if there is + * a default DMA window and create one if none as + * the userspace expects it to exist. + */ + if (!tce_groups_attached(container) && !container->tables[0]) { + ret = tce_iommu_create_table(container, + table_group, + 0, /* window number */ + IOMMU_PAGE_SHIFT_4K, + table_group->tce32_size, + 1, /* default levels */ + &tbl); if (ret) - tce_iommu_free_table(tbl); + goto release_exit; else - table_group->tables[0] = tbl; + container->tables[0] = tbl; } - if (ret) - table_group->ops->release_ownership(table_group); + /* Set all windows to the new group */ + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + tbl = container->tables[i]; + + if (!tbl) + continue; + + /* Set the default window to a new group */ + ret = table_group->ops->set_window(table_group, i, tbl); + if (ret) + goto release_exit; + } + + return 0; + +release_exit: + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) + table_group->ops->unset_window(table_group, i); + + table_group->ops->release_ownership(table_group); return ret; } @@ -649,29 +968,44 @@ static int tce_iommu_attach_group(void *iommu_data, int ret; struct tce_container *container = iommu_data; struct iommu_table_group *table_group; + struct tce_iommu_group *tcegrp = NULL; mutex_lock(&container->lock); /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", iommu_group_id(iommu_group), iommu_group); */ - if (container->grp) { - pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n", - iommu_group_id(container->grp), - iommu_group_id(iommu_group)); + table_group = iommu_group_get_iommudata(iommu_group); + + if (tce_groups_attached(container) && (!table_group->ops || + !table_group->ops->take_ownership || + !table_group->ops->release_ownership)) { ret = -EBUSY; goto unlock_exit; } - if (container->enabled) { - pr_err("tce_vfio: attaching group #%u to enabled container\n", - iommu_group_id(iommu_group)); - ret = -EBUSY; - goto unlock_exit; + /* Check if new group has the same iommu_ops (i.e. compatible) */ + list_for_each_entry(tcegrp, &container->group_list, next) { + struct iommu_table_group *table_group_tmp; + + if (tcegrp->grp == iommu_group) { + pr_warn("tce_vfio: Group %d is already attached\n", + iommu_group_id(iommu_group)); + ret = -EBUSY; + goto unlock_exit; + } + table_group_tmp = iommu_group_get_iommudata(tcegrp->grp); + if (table_group_tmp->ops != table_group->ops) { + pr_warn("tce_vfio: Group %d is incompatible with group %d\n", + iommu_group_id(iommu_group), + iommu_group_id(tcegrp->grp)); + ret = -EPERM; + goto unlock_exit; + } } - table_group = iommu_group_get_iommudata(iommu_group); - if (!table_group) { - ret = -ENXIO; + tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL); + if (!tcegrp) { + ret = -ENOMEM; goto unlock_exit; } @@ -681,10 +1015,15 @@ static int tce_iommu_attach_group(void *iommu_data, else ret = tce_iommu_take_ownership_ddw(container, table_group); - if (!ret) - container->grp = iommu_group; + if (!ret) { + tcegrp->grp = iommu_group; + list_add(&tcegrp->next, &container->group_list); + } unlock_exit: + if (ret && tcegrp) + kfree(tcegrp); + mutex_unlock(&container->lock); return ret; @@ -695,24 +1034,26 @@ static void tce_iommu_detach_group(void *iommu_data, { struct tce_container *container = iommu_data; struct iommu_table_group *table_group; + bool found = false; + struct tce_iommu_group *tcegrp; mutex_lock(&container->lock); - if (iommu_group != container->grp) { - pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n", - iommu_group_id(iommu_group), - iommu_group_id(container->grp)); - goto unlock_exit; + + list_for_each_entry(tcegrp, &container->group_list, next) { + if (tcegrp->grp == iommu_group) { + found = true; + break; + } } - if (container->enabled) { - pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", - iommu_group_id(container->grp)); - tce_iommu_disable(container); + if (!found) { + pr_warn("tce_vfio: detaching unattached group #%u\n", + iommu_group_id(iommu_group)); + goto unlock_exit; } - /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", - iommu_group_id(iommu_group), iommu_group); */ - container->grp = NULL; + list_del(&tcegrp->next); + kfree(tcegrp); table_group = iommu_group_get_iommudata(iommu_group); BUG_ON(!table_group); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index e4fa1995f613..fa84391a0d00 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -36,6 +36,8 @@ /* Two-stage IOMMU */ #define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ +#define VFIO_SPAPR_TCE_v2_IOMMU 7 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -507,6 +509,31 @@ struct vfio_eeh_pe_op { #define VFIO_EEH_PE_OP _IO(VFIO_TYPE, VFIO_BASE + 21) +/** + * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory) + * + * Registers user space memory where DMA is allowed. It pins + * user pages and does the locked memory accounting so + * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls + * get faster. + */ +struct vfio_iommu_spapr_register_memory { + __u32 argsz; + __u32 flags; + __u64 vaddr; /* Process virtual address */ + __u64 size; /* Size of mapping (bytes) */ +}; +#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) + +/** + * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory) + * + * Unregisters user space memory registered with + * VFIO_IOMMU_SPAPR_REGISTER_MEMORY. + * Uses vfio_iommu_spapr_register_memory for parameters. + */ +#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) + /* ***************************************************************** */ #endif /* _UAPIVFIO_H */ -- cgit v1.2.3 From e633bc86a922468a82300eef5b9802e17be5e23d Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:26 +1000 Subject: vfio: powerpc/spapr: Support Dynamic DMA windows This adds create/remove window ioctls to create and remove DMA windows. sPAPR defines a Dynamic DMA windows capability which allows para-virtualized guests to create additional DMA windows on a PCI bus. The existing linux kernels use this new window to map the entire guest memory and switch to the direct DMA operations saving time on map/unmap requests which would normally happen in a big amounts. This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows. Up to 2 windows are supported now by the hardware and by this driver. This changes VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional information such as a number of supported windows and maximum number levels of TCE tables. DDW is added as a capability, not as a SPAPR TCE IOMMU v2 unique feature as we still want to support v2 on platforms which cannot do DDW for the sake of TCE acceleration in KVM (coming soon). Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- Documentation/vfio.txt | 19 ++++ arch/powerpc/include/asm/iommu.h | 2 +- drivers/vfio/vfio_iommu_spapr_tce.c | 196 +++++++++++++++++++++++++++++++++++- include/uapi/linux/vfio.h | 61 ++++++++++- 4 files changed, 273 insertions(+), 5 deletions(-) (limited to 'drivers/vfio') diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index dcc37e109c68..1dd3fddfd3a1 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -464,6 +464,25 @@ address is from pre-registered range. This separation helps in optimizing DMA for guests. +6) sPAPR specification allows guests to have an additional DMA window(s) on +a PCI bus with a variable page size. Two ioctls have been added to support +this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE. +The platform has to support the functionality or error will be returned to +the userspace. The existing hardware supports up to 2 DMA windows, one is +2GB long, uses 4K pages and called "default 32bit window"; the other can +be as big as entire RAM, use different page size, it is optional - guests +create those in run-time if the guest driver supports 64bit DMA. + +VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and +a number of TCE table levels (if a TCE table is going to be big enough and +the kernel may not be able to allocate enough of physically contiguous memory). +It creates a new window in the available slot and returns the bus address where +the new window starts. Due to hardware limitation, the user space cannot choose +the location of DMA windows. + +VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window +and removes it. + ------------------------------------------------------------------------------- [1] VFIO was originally an acronym for "Virtual Function I/O" in its diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index f9957eb4c659..ca18cff90900 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -149,7 +149,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); -#define IOMMU_TABLE_GROUP_MAX_TABLES 1 +#define IOMMU_TABLE_GROUP_MAX_TABLES 2 struct iommu_table_group; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 91a32239bd0a..0582b72ef377 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -211,6 +211,18 @@ static long tce_iommu_find_table(struct tce_container *container, return -1; } +static int tce_iommu_find_free_table(struct tce_container *container) +{ + int i; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (!container->tables[i]) + return i; + } + + return -ENOSPC; +} + static int tce_iommu_enable(struct tce_container *container) { int ret = 0; @@ -593,11 +605,115 @@ static void tce_iommu_free_table(struct iommu_table *tbl) decrement_locked_vm(pages); } +static long tce_iommu_create_window(struct tce_container *container, + __u32 page_shift, __u64 window_size, __u32 levels, + __u64 *start_addr) +{ + struct tce_iommu_group *tcegrp; + struct iommu_table_group *table_group; + struct iommu_table *tbl = NULL; + long ret, num; + + num = tce_iommu_find_free_table(container); + if (num < 0) + return num; + + /* Get the first group for ops::create_table */ + tcegrp = list_first_entry(&container->group_list, + struct tce_iommu_group, next); + table_group = iommu_group_get_iommudata(tcegrp->grp); + if (!table_group) + return -EFAULT; + + if (!(table_group->pgsizes & (1ULL << page_shift))) + return -EINVAL; + + if (!table_group->ops->set_window || !table_group->ops->unset_window || + !table_group->ops->get_table_size || + !table_group->ops->create_table) + return -EPERM; + + /* Create TCE table */ + ret = tce_iommu_create_table(container, table_group, num, + page_shift, window_size, levels, &tbl); + if (ret) + return ret; + + BUG_ON(!tbl->it_ops->free); + + /* + * Program the table to every group. + * Groups have been tested for compatibility at the attach time. + */ + list_for_each_entry(tcegrp, &container->group_list, next) { + table_group = iommu_group_get_iommudata(tcegrp->grp); + + ret = table_group->ops->set_window(table_group, num, tbl); + if (ret) + goto unset_exit; + } + + container->tables[num] = tbl; + + /* Return start address assigned by platform in create_table() */ + *start_addr = tbl->it_offset << tbl->it_page_shift; + + return 0; + +unset_exit: + list_for_each_entry(tcegrp, &container->group_list, next) { + table_group = iommu_group_get_iommudata(tcegrp->grp); + table_group->ops->unset_window(table_group, num); + } + tce_iommu_free_table(tbl); + + return ret; +} + +static long tce_iommu_remove_window(struct tce_container *container, + __u64 start_addr) +{ + struct iommu_table_group *table_group = NULL; + struct iommu_table *tbl; + struct tce_iommu_group *tcegrp; + int num; + + num = tce_iommu_find_table(container, start_addr, &tbl); + if (num < 0) + return -EINVAL; + + BUG_ON(!tbl->it_size); + + /* Detach groups from IOMMUs */ + list_for_each_entry(tcegrp, &container->group_list, next) { + table_group = iommu_group_get_iommudata(tcegrp->grp); + + /* + * SPAPR TCE IOMMU exposes the default DMA window to + * the guest via dma32_window_start/size of + * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow + * the userspace to remove this window, some do not so + * here we check for the platform capability. + */ + if (!table_group->ops || !table_group->ops->unset_window) + return -EPERM; + + table_group->ops->unset_window(table_group, num); + } + + /* Free table */ + tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); + tce_iommu_free_table(tbl); + container->tables[num] = NULL; + + return 0; +} + static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { struct tce_container *container = iommu_data; - unsigned long minsz; + unsigned long minsz, ddwsz; long ret; switch (cmd) { @@ -641,6 +757,21 @@ static long tce_iommu_ioctl(void *iommu_data, info.dma32_window_start = table_group->tce32_start; info.dma32_window_size = table_group->tce32_size; info.flags = 0; + memset(&info.ddw, 0, sizeof(info.ddw)); + + if (table_group->max_dynamic_windows_supported && + container->v2) { + info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW; + info.ddw.pgsizes = table_group->pgsizes; + info.ddw.max_dynamic_windows_supported = + table_group->max_dynamic_windows_supported; + info.ddw.levels = table_group->max_levels; + } + + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw); + + if (info.argsz >= ddwsz) + minsz = ddwsz; if (copy_to_user((void __user *)arg, &info, minsz)) return -EFAULT; @@ -834,6 +965,69 @@ static long tce_iommu_ioctl(void *iommu_data, return ret; } + case VFIO_IOMMU_SPAPR_TCE_CREATE: { + struct vfio_iommu_spapr_tce_create create; + + if (!container->v2) + break; + + if (!tce_groups_attached(container)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, + start_addr); + + if (copy_from_user(&create, (void __user *)arg, minsz)) + return -EFAULT; + + if (create.argsz < minsz) + return -EINVAL; + + if (create.flags) + return -EINVAL; + + mutex_lock(&container->lock); + + ret = tce_iommu_create_window(container, create.page_shift, + create.window_size, create.levels, + &create.start_addr); + + mutex_unlock(&container->lock); + + if (!ret && copy_to_user((void __user *)arg, &create, minsz)) + ret = -EFAULT; + + return ret; + } + case VFIO_IOMMU_SPAPR_TCE_REMOVE: { + struct vfio_iommu_spapr_tce_remove remove; + + if (!container->v2) + break; + + if (!tce_groups_attached(container)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_spapr_tce_remove, + start_addr); + + if (copy_from_user(&remove, (void __user *)arg, minsz)) + return -EFAULT; + + if (remove.argsz < minsz) + return -EINVAL; + + if (remove.flags) + return -EINVAL; + + mutex_lock(&container->lock); + + ret = tce_iommu_remove_window(container, remove.start_addr); + + mutex_unlock(&container->lock); + + return ret; + } } return -ENOTTY; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index fa84391a0d00..9fd7b5d8df2f 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -444,6 +444,23 @@ struct vfio_iommu_type1_dma_unmap { /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ +/* + * The SPAPR TCE DDW info struct provides the information about + * the details of Dynamic DMA window capability. + * + * @pgsizes contains a page size bitmask, 4K/64K/16M are supported. + * @max_dynamic_windows_supported tells the maximum number of windows + * which the platform can create. + * @levels tells the maximum number of levels in multi-level IOMMU tables; + * this allows splitting a table into smaller chunks which reduces + * the amount of physically contiguous memory required for the table. + */ +struct vfio_iommu_spapr_tce_ddw_info { + __u64 pgsizes; /* Bitmap of supported page sizes */ + __u32 max_dynamic_windows_supported; + __u32 levels; +}; + /* * The SPAPR TCE info struct provides the information about the PCI bus * address ranges available for DMA, these values are programmed into @@ -454,14 +471,17 @@ struct vfio_iommu_type1_dma_unmap { * addresses too so the window works as a filter rather than an offset * for IOVA addresses. * - * A flag will need to be added if other page sizes are supported, - * so as defined here, it is always 4k. + * Flags supported: + * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows + * (DDW) support is present. @ddw is only supported when DDW is present. */ struct vfio_iommu_spapr_tce_info { __u32 argsz; - __u32 flags; /* reserved for future use */ + __u32 flags; +#define VFIO_IOMMU_SPAPR_INFO_DDW (1 << 0) /* DDW supported */ __u32 dma32_window_start; /* 32 bit window start (bytes) */ __u32 dma32_window_size; /* 32 bit window size (bytes) */ + struct vfio_iommu_spapr_tce_ddw_info ddw; }; #define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) @@ -534,6 +554,41 @@ struct vfio_iommu_spapr_register_memory { */ #define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) +/** + * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, struct vfio_iommu_spapr_tce_create) + * + * Creates an additional TCE table and programs it (sets a new DMA window) + * to every IOMMU group in the container. It receives page shift, window + * size and number of levels in the TCE table being created. + * + * It allocates and returns an offset on a PCI bus of the new DMA window. + */ +struct vfio_iommu_spapr_tce_create { + __u32 argsz; + __u32 flags; + /* in */ + __u32 page_shift; + __u64 window_size; + __u32 levels; + /* out */ + __u64 start_addr; +}; +#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) + +/** + * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20, struct vfio_iommu_spapr_tce_remove) + * + * Unprograms a TCE table from all groups in the container and destroys it. + * It receives a PCI bus offset as a window id. + */ +struct vfio_iommu_spapr_tce_remove { + __u32 argsz; + __u32 flags; + /* in */ + __u64 start_addr; +}; +#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) + /* ***************************************************************** */ #endif /* _UAPIVFIO_H */ -- cgit v1.2.3