From d7baee6901b34c4895eb78efdbf13a49079d7404 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 30 Nov 2016 17:52:00 +1100 Subject: powerpc/iommu: Stop using @current in mm_iommu_xxx This changes mm_iommu_xxx helpers to take mm_struct as a parameter instead of getting it from @current which in some situations may not have a valid reference to mm. This changes helpers to receive @mm and moves all references to @current to the caller, including checks for !current and !current->mm; checks in mm_iommu_preregistered() are removed as there is no caller yet. This moves the mm_iommu_adjust_locked_vm() call to the caller as it receives mm_iommu_table_group_mem_t but it needs mm. This should cause no behavioral change. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 80378ddadc5c..d0c38b201267 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -107,14 +107,17 @@ static long tce_iommu_unregister_pages(struct tce_container *container, { struct mm_iommu_table_group_mem_t *mem; + if (!current || !current->mm) + return -ESRCH; /* process exited */ + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) return -EINVAL; - mem = mm_iommu_find(vaddr, size >> PAGE_SHIFT); + mem = mm_iommu_find(current->mm, vaddr, size >> PAGE_SHIFT); if (!mem) return -ENOENT; - return mm_iommu_put(mem); + return mm_iommu_put(current->mm, mem); } static long tce_iommu_register_pages(struct tce_container *container, @@ -124,11 +127,14 @@ static long tce_iommu_register_pages(struct tce_container *container, struct mm_iommu_table_group_mem_t *mem = NULL; unsigned long entries = size >> PAGE_SHIFT; + if (!current || !current->mm) + return -ESRCH; /* process exited */ + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) || ((vaddr + size) < vaddr)) return -EINVAL; - ret = mm_iommu_get(vaddr, entries, &mem); + ret = mm_iommu_get(current->mm, vaddr, entries, &mem); if (ret) return ret; @@ -375,7 +381,7 @@ static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size, long ret = 0; struct mm_iommu_table_group_mem_t *mem; - mem = mm_iommu_lookup(tce, size); + mem = mm_iommu_lookup(current->mm, tce, size); if (!mem) return -EINVAL; -- cgit v1.2.3 From 39701e56f5f16ea0cf8fc9e8472e645f8de91d23 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 30 Nov 2016 17:52:01 +1100 Subject: vfio/spapr: Postpone allocation of userspace version of TCE table The iommu_table struct manages a hardware TCE table and a vmalloc'd table with corresponding userspace addresses. Both are allocated when the default DMA window is created and this happens when the very first group is attached to a container. As we are going to allow the userspace to configure container in one memory context and pas container fd to another, we have to postpones such allocations till a container fd is passed to the destination user process so we would account locked memory limit against the actual container user constrainsts. This postpones the it_userspace array allocation till it is used first time for mapping. The unmapping patch already checks if the array is allocated. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index d0c38b201267..4efd2b20c35c 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -515,6 +515,12 @@ static long tce_iommu_build_v2(struct tce_container *container, unsigned long hpa; enum dma_data_direction dirtmp; + if (!tbl->it_userspace) { + ret = tce_iommu_userspace_view_alloc(tbl); + if (ret) + return ret; + } + for (i = 0; i < pages; ++i) { struct mm_iommu_table_group_mem_t *mem = NULL; unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, @@ -588,15 +594,6 @@ static long tce_iommu_create_table(struct tce_container *container, WARN_ON(!ret && !(*ptbl)->it_ops->free); WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size)); - if (!ret && container->v2) { - ret = tce_iommu_userspace_view_alloc(*ptbl); - if (ret) - (*ptbl)->it_ops->free(*ptbl); - } - - if (ret) - decrement_locked_vm(table_size >> PAGE_SHIFT); - return ret; } @@ -1068,10 +1065,7 @@ static int tce_iommu_take_ownership(struct tce_container *container, if (!tbl || !tbl->it_map) continue; - rc = tce_iommu_userspace_view_alloc(tbl); - if (!rc) - rc = iommu_take_ownership(tbl); - + rc = iommu_take_ownership(tbl); if (rc) { for (j = 0; j < i; ++j) iommu_release_ownership( -- cgit v1.2.3 From 6f01cc692a16405235d5c34056455b182682123c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 30 Nov 2016 17:52:02 +1100 Subject: vfio/spapr: Add a helper to create default DMA window There is already a helper to create a DMA window which does allocate a table and programs it to the IOMMU group. However tce_iommu_take_ownership_ddw() did not use it and did these 2 calls itself to simplify error path. Since we are going to delay the default window creation till the default window is accessed/removed or new window is added, we need a helper to create a default window from all these cases. This adds tce_iommu_create_default_window(). Since it relies on a VFIO container to have at least one IOMMU group (for future use), this changes tce_iommu_attach_group() to add a group to the container first and then call the new helper. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 87 ++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 45 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 4efd2b20c35c..a67bbfdd86d5 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -710,6 +710,29 @@ static long tce_iommu_remove_window(struct tce_container *container, return 0; } +static long tce_iommu_create_default_window(struct tce_container *container) +{ + long ret; + __u64 start_addr = 0; + struct tce_iommu_group *tcegrp; + struct iommu_table_group *table_group; + + if (!tce_groups_attached(container)) + return -ENODEV; + + tcegrp = list_first_entry(&container->group_list, + struct tce_iommu_group, next); + table_group = iommu_group_get_iommudata(tcegrp->grp); + if (!table_group) + return -ENODEV; + + ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K, + table_group->tce32_size, 1, &start_addr); + WARN_ON_ONCE(!ret && start_addr); + + return ret; +} + static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -1100,9 +1123,6 @@ static void tce_iommu_release_ownership_ddw(struct tce_container *container, static long tce_iommu_take_ownership_ddw(struct tce_container *container, struct iommu_table_group *table_group) { - long i, ret = 0; - struct iommu_table *tbl = NULL; - if (!table_group->ops->create_table || !table_group->ops->set_window || !table_group->ops->release_ownership) { WARN_ON_ONCE(1); @@ -1111,47 +1131,7 @@ static long tce_iommu_take_ownership_ddw(struct tce_container *container, table_group->ops->take_ownership(table_group); - /* - * If it the first group attached, check if there is - * a default DMA window and create one if none as - * the userspace expects it to exist. - */ - if (!tce_groups_attached(container) && !container->tables[0]) { - ret = tce_iommu_create_table(container, - table_group, - 0, /* window number */ - IOMMU_PAGE_SHIFT_4K, - table_group->tce32_size, - 1, /* default levels */ - &tbl); - if (ret) - goto release_exit; - else - container->tables[0] = tbl; - } - - /* Set all windows to the new group */ - for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { - tbl = container->tables[i]; - - if (!tbl) - continue; - - /* Set the default window to a new group */ - ret = table_group->ops->set_window(table_group, i, tbl); - if (ret) - goto release_exit; - } - return 0; - -release_exit: - for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) - table_group->ops->unset_window(table_group, i); - - table_group->ops->release_ownership(table_group); - - return ret; } static int tce_iommu_attach_group(void *iommu_data, @@ -1161,6 +1141,7 @@ static int tce_iommu_attach_group(void *iommu_data, struct tce_container *container = iommu_data; struct iommu_table_group *table_group; struct tce_iommu_group *tcegrp = NULL; + bool create_default_window = false; mutex_lock(&container->lock); @@ -1203,14 +1184,30 @@ static int tce_iommu_attach_group(void *iommu_data, } if (!table_group->ops || !table_group->ops->take_ownership || - !table_group->ops->release_ownership) + !table_group->ops->release_ownership) { ret = tce_iommu_take_ownership(container, table_group); - else + } else { ret = tce_iommu_take_ownership_ddw(container, table_group); + if (!tce_groups_attached(container) && !container->tables[0]) + create_default_window = true; + } if (!ret) { tcegrp->grp = iommu_group; list_add(&tcegrp->next, &container->group_list); + /* + * If it the first group attached, check if there is + * a default DMA window and create one if none as + * the userspace expects it to exist. + */ + if (create_default_window) { + ret = tce_iommu_create_default_window(container); + if (ret) { + list_del(&tcegrp->next); + tce_iommu_release_ownership_ddw(container, + table_group); + } + } } unlock_exit: -- cgit v1.2.3 From d9c728949ddc9de5734bf3b12ea906ca8a77f2a0 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 30 Nov 2016 17:52:03 +1100 Subject: vfio/spapr: Postpone default window creation We are going to allow the userspace to configure container in one memory context and pass container fd to another so we are postponing memory allocations accounted against the locked memory limit. One of previous patches took care of it_userspace. At the moment we create the default DMA window when the first group is attached to a container; this is done for the userspace which is not DDW-aware but familiar with the SPAPR TCE IOMMU v2 in the part of memory pre-registration - such client expects the default DMA window to exist. This postpones the default DMA window allocation till one of the folliwing happens: 1. first map/unmap request arrives; 2. new window is requested; This adds noop for the case when the userspace requested removal of the default window which has not been created yet. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 40 +++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index a67bbfdd86d5..88622be0d6b5 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -97,6 +97,7 @@ struct tce_container { struct mutex lock; bool enabled; bool v2; + bool def_window_pending; unsigned long locked_pages; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; struct list_head group_list; @@ -717,6 +718,9 @@ static long tce_iommu_create_default_window(struct tce_container *container) struct tce_iommu_group *tcegrp; struct iommu_table_group *table_group; + if (!container->def_window_pending) + return 0; + if (!tce_groups_attached(container)) return -ENODEV; @@ -730,6 +734,9 @@ static long tce_iommu_create_default_window(struct tce_container *container) table_group->tce32_size, 1, &start_addr); WARN_ON_ONCE(!ret && start_addr); + if (!ret) + container->def_window_pending = false; + return ret; } @@ -823,6 +830,10 @@ static long tce_iommu_ioctl(void *iommu_data, VFIO_DMA_MAP_FLAG_WRITE)) return -EINVAL; + ret = tce_iommu_create_default_window(container); + if (ret) + return ret; + num = tce_iommu_find_table(container, param.iova, &tbl); if (num < 0) return -ENXIO; @@ -886,6 +897,10 @@ static long tce_iommu_ioctl(void *iommu_data, if (param.flags) return -EINVAL; + ret = tce_iommu_create_default_window(container); + if (ret) + return ret; + num = tce_iommu_find_table(container, param.iova, &tbl); if (num < 0) return -ENXIO; @@ -1012,6 +1027,10 @@ static long tce_iommu_ioctl(void *iommu_data, mutex_lock(&container->lock); + ret = tce_iommu_create_default_window(container); + if (ret) + return ret; + ret = tce_iommu_create_window(container, create.page_shift, create.window_size, create.levels, &create.start_addr); @@ -1044,6 +1063,11 @@ static long tce_iommu_ioctl(void *iommu_data, if (remove.flags) return -EINVAL; + if (container->def_window_pending && !remove.start_addr) { + container->def_window_pending = false; + return 0; + } + mutex_lock(&container->lock); ret = tce_iommu_remove_window(container, remove.start_addr); @@ -1141,7 +1165,6 @@ static int tce_iommu_attach_group(void *iommu_data, struct tce_container *container = iommu_data; struct iommu_table_group *table_group; struct tce_iommu_group *tcegrp = NULL; - bool create_default_window = false; mutex_lock(&container->lock); @@ -1189,25 +1212,12 @@ static int tce_iommu_attach_group(void *iommu_data, } else { ret = tce_iommu_take_ownership_ddw(container, table_group); if (!tce_groups_attached(container) && !container->tables[0]) - create_default_window = true; + container->def_window_pending = true; } if (!ret) { tcegrp->grp = iommu_group; list_add(&tcegrp->next, &container->group_list); - /* - * If it the first group attached, check if there is - * a default DMA window and create one if none as - * the userspace expects it to exist. - */ - if (create_default_window) { - ret = tce_iommu_create_default_window(container); - if (ret) { - list_del(&tcegrp->next); - tce_iommu_release_ownership_ddw(container, - table_group); - } - } } unlock_exit: -- cgit v1.2.3 From bc82d122ae4a0e9f971f13403995898fcfa0c09e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 30 Nov 2016 17:52:04 +1100 Subject: vfio/spapr: Reference mm in tce_container In some situations the userspace memory context may live longer than the userspace process itself so if we need to do proper memory context cleanup, we better have tce_container take a reference to mm_struct and use it later when the process is gone (@current or @current->mm is NULL). This references mm and stores the pointer in the container; this is done in a new helper - tce_iommu_mm_set() - when one of the following happens: - a container is enabled (IOMMU v1); - a first attempt to pre-register memory is made (IOMMU v2); - a DMA window is created (IOMMU v2). The @mm stays referenced till the container is destroyed. This replaces current->mm with container->mm everywhere except debug prints. This adds a check that current->mm is the same as the one stored in the container to prevent userspace from making changes to a memory context of other processes. DMA map/unmap ioctls() do not check for @mm as they already check for @enabled which is set after tce_iommu_mm_set() is called. This does not reference a task as multiple threads within the same mm are allowed to ioctl() to vfio and supposedly they will have same limits and capabilities and if they do not, we'll just fail with no harm made. Signed-off-by: Alexey Kardashevskiy Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 160 ++++++++++++++++++++++-------------- 1 file changed, 100 insertions(+), 60 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 88622be0d6b5..4c03c8525c26 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -31,49 +31,49 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); -static long try_increment_locked_vm(long npages) +static long try_increment_locked_vm(struct mm_struct *mm, long npages) { long ret = 0, locked, lock_limit; - if (!current || !current->mm) - return -ESRCH; /* process exited */ + if (WARN_ON_ONCE(!mm)) + return -EPERM; if (!npages) return 0; - down_write(¤t->mm->mmap_sem); - locked = current->mm->locked_vm + npages; + down_write(&mm->mmap_sem); + locked = mm->locked_vm + npages; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) ret = -ENOMEM; else - current->mm->locked_vm += npages; + mm->locked_vm += npages; pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid, npages << PAGE_SHIFT, - current->mm->locked_vm << PAGE_SHIFT, + mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : ""); - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); return ret; } -static void decrement_locked_vm(long npages) +static void decrement_locked_vm(struct mm_struct *mm, long npages) { - if (!current || !current->mm || !npages) - return; /* process exited */ + if (!mm || !npages) + return; - down_write(¤t->mm->mmap_sem); - if (WARN_ON_ONCE(npages > current->mm->locked_vm)) - npages = current->mm->locked_vm; - current->mm->locked_vm -= npages; + down_write(&mm->mmap_sem); + if (WARN_ON_ONCE(npages > mm->locked_vm)) + npages = mm->locked_vm; + mm->locked_vm -= npages; pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid, npages << PAGE_SHIFT, - current->mm->locked_vm << PAGE_SHIFT, + mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK)); - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); } /* @@ -99,26 +99,38 @@ struct tce_container { bool v2; bool def_window_pending; unsigned long locked_pages; + struct mm_struct *mm; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; struct list_head group_list; }; +static long tce_iommu_mm_set(struct tce_container *container) +{ + if (container->mm) { + if (container->mm == current->mm) + return 0; + return -EPERM; + } + BUG_ON(!current->mm); + container->mm = current->mm; + atomic_inc(&container->mm->mm_count); + + return 0; +} + static long tce_iommu_unregister_pages(struct tce_container *container, __u64 vaddr, __u64 size) { struct mm_iommu_table_group_mem_t *mem; - if (!current || !current->mm) - return -ESRCH; /* process exited */ - if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) return -EINVAL; - mem = mm_iommu_find(current->mm, vaddr, size >> PAGE_SHIFT); + mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT); if (!mem) return -ENOENT; - return mm_iommu_put(current->mm, mem); + return mm_iommu_put(container->mm, mem); } static long tce_iommu_register_pages(struct tce_container *container, @@ -128,14 +140,11 @@ static long tce_iommu_register_pages(struct tce_container *container, struct mm_iommu_table_group_mem_t *mem = NULL; unsigned long entries = size >> PAGE_SHIFT; - if (!current || !current->mm) - return -ESRCH; /* process exited */ - if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) || ((vaddr + size) < vaddr)) return -EINVAL; - ret = mm_iommu_get(current->mm, vaddr, entries, &mem); + ret = mm_iommu_get(container->mm, vaddr, entries, &mem); if (ret) return ret; @@ -144,7 +153,8 @@ static long tce_iommu_register_pages(struct tce_container *container, return 0; } -static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl) +static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl, + struct mm_struct *mm) { unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * tbl->it_size, PAGE_SIZE); @@ -153,13 +163,13 @@ static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl) BUG_ON(tbl->it_userspace); - ret = try_increment_locked_vm(cb >> PAGE_SHIFT); + ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT); if (ret) return ret; uas = vzalloc(cb); if (!uas) { - decrement_locked_vm(cb >> PAGE_SHIFT); + decrement_locked_vm(mm, cb >> PAGE_SHIFT); return -ENOMEM; } tbl->it_userspace = uas; @@ -167,7 +177,8 @@ static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl) return 0; } -static void tce_iommu_userspace_view_free(struct iommu_table *tbl) +static void tce_iommu_userspace_view_free(struct iommu_table *tbl, + struct mm_struct *mm) { unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * tbl->it_size, PAGE_SIZE); @@ -177,7 +188,7 @@ static void tce_iommu_userspace_view_free(struct iommu_table *tbl) vfree(tbl->it_userspace); tbl->it_userspace = NULL; - decrement_locked_vm(cb >> PAGE_SHIFT); + decrement_locked_vm(mm, cb >> PAGE_SHIFT); } static bool tce_page_is_contained(struct page *page, unsigned page_shift) @@ -237,9 +248,6 @@ static int tce_iommu_enable(struct tce_container *container) struct iommu_table_group *table_group; struct tce_iommu_group *tcegrp; - if (!current->mm) - return -ESRCH; /* process exited */ - if (container->enabled) return -EBUSY; @@ -284,8 +292,12 @@ static int tce_iommu_enable(struct tce_container *container) if (!table_group->tce32_size) return -EPERM; + ret = tce_iommu_mm_set(container); + if (ret) + return ret; + locked = table_group->tce32_size >> PAGE_SHIFT; - ret = try_increment_locked_vm(locked); + ret = try_increment_locked_vm(container->mm, locked); if (ret) return ret; @@ -303,10 +315,8 @@ static void tce_iommu_disable(struct tce_container *container) container->enabled = false; - if (!current->mm) - return; - - decrement_locked_vm(container->locked_pages); + BUG_ON(!container->mm); + decrement_locked_vm(container->mm, container->locked_pages); } static void *tce_iommu_open(unsigned long arg) @@ -333,7 +343,8 @@ static void *tce_iommu_open(unsigned long arg) static int tce_iommu_clear(struct tce_container *container, struct iommu_table *tbl, unsigned long entry, unsigned long pages); -static void tce_iommu_free_table(struct iommu_table *tbl); +static void tce_iommu_free_table(struct tce_container *container, + struct iommu_table *tbl); static void tce_iommu_release(void *iommu_data) { @@ -358,10 +369,12 @@ static void tce_iommu_release(void *iommu_data) continue; tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); - tce_iommu_free_table(tbl); + tce_iommu_free_table(container, tbl); } tce_iommu_disable(container); + if (container->mm) + mmdrop(container->mm); mutex_destroy(&container->lock); kfree(container); @@ -376,13 +389,14 @@ static void tce_iommu_unuse_page(struct tce_container *container, put_page(page); } -static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size, +static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container, + unsigned long tce, unsigned long size, unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem) { long ret = 0; struct mm_iommu_table_group_mem_t *mem; - mem = mm_iommu_lookup(current->mm, tce, size); + mem = mm_iommu_lookup(container->mm, tce, size); if (!mem) return -EINVAL; @@ -395,18 +409,18 @@ static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size, return 0; } -static void tce_iommu_unuse_page_v2(struct iommu_table *tbl, - unsigned long entry) +static void tce_iommu_unuse_page_v2(struct tce_container *container, + struct iommu_table *tbl, unsigned long entry) { struct mm_iommu_table_group_mem_t *mem = NULL; int ret; unsigned long hpa = 0; unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); - if (!pua || !current || !current->mm) + if (!pua) return; - ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl), + ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl), &hpa, &mem); if (ret) pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n", @@ -436,7 +450,7 @@ static int tce_iommu_clear(struct tce_container *container, continue; if (container->v2) { - tce_iommu_unuse_page_v2(tbl, entry); + tce_iommu_unuse_page_v2(container, tbl, entry); continue; } @@ -517,7 +531,7 @@ static long tce_iommu_build_v2(struct tce_container *container, enum dma_data_direction dirtmp; if (!tbl->it_userspace) { - ret = tce_iommu_userspace_view_alloc(tbl); + ret = tce_iommu_userspace_view_alloc(tbl, container->mm); if (ret) return ret; } @@ -527,8 +541,8 @@ static long tce_iommu_build_v2(struct tce_container *container, unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i); - ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl), - &hpa, &mem); + ret = tce_iommu_prereg_ua_to_hpa(container, + tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem); if (ret) break; @@ -549,7 +563,7 @@ static long tce_iommu_build_v2(struct tce_container *container, ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); if (ret) { /* dirtmp cannot be DMA_NONE here */ - tce_iommu_unuse_page_v2(tbl, entry + i); + tce_iommu_unuse_page_v2(container, tbl, entry + i); pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", __func__, entry << tbl->it_page_shift, tce, ret); @@ -557,7 +571,7 @@ static long tce_iommu_build_v2(struct tce_container *container, } if (dirtmp != DMA_NONE) - tce_iommu_unuse_page_v2(tbl, entry + i); + tce_iommu_unuse_page_v2(container, tbl, entry + i); *pua = tce; @@ -585,7 +599,7 @@ static long tce_iommu_create_table(struct tce_container *container, if (!table_size) return -EINVAL; - ret = try_increment_locked_vm(table_size >> PAGE_SHIFT); + ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT); if (ret) return ret; @@ -598,13 +612,14 @@ static long tce_iommu_create_table(struct tce_container *container, return ret; } -static void tce_iommu_free_table(struct iommu_table *tbl) +static void tce_iommu_free_table(struct tce_container *container, + struct iommu_table *tbl) { unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; - tce_iommu_userspace_view_free(tbl); + tce_iommu_userspace_view_free(tbl, container->mm); tbl->it_ops->free(tbl); - decrement_locked_vm(pages); + decrement_locked_vm(container->mm, pages); } static long tce_iommu_create_window(struct tce_container *container, @@ -667,7 +682,7 @@ unset_exit: table_group = iommu_group_get_iommudata(tcegrp->grp); table_group->ops->unset_window(table_group, num); } - tce_iommu_free_table(tbl); + tce_iommu_free_table(container, tbl); return ret; } @@ -705,7 +720,7 @@ static long tce_iommu_remove_window(struct tce_container *container, /* Free table */ tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); - tce_iommu_free_table(tbl); + tce_iommu_free_table(container, tbl); container->tables[num] = NULL; return 0; @@ -760,7 +775,17 @@ static long tce_iommu_ioctl(void *iommu_data, } return (ret < 0) ? 0 : ret; + } + + /* + * Sanity check to prevent one userspace from manipulating + * another userspace mm. + */ + BUG_ON(!container); + if (container->mm && container->mm != current->mm) + return -EPERM; + switch (cmd) { case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { struct vfio_iommu_spapr_tce_info info; struct tce_iommu_group *tcegrp; @@ -929,6 +954,10 @@ static long tce_iommu_ioctl(void *iommu_data, minsz = offsetofend(struct vfio_iommu_spapr_register_memory, size); + ret = tce_iommu_mm_set(container); + if (ret) + return ret; + if (copy_from_user(¶m, (void __user *)arg, minsz)) return -EFAULT; @@ -952,6 +981,9 @@ static long tce_iommu_ioctl(void *iommu_data, if (!container->v2) break; + if (!container->mm) + return -EPERM; + minsz = offsetofend(struct vfio_iommu_spapr_register_memory, size); @@ -1010,6 +1042,10 @@ static long tce_iommu_ioctl(void *iommu_data, if (!container->v2) break; + ret = tce_iommu_mm_set(container); + if (ret) + return ret; + if (!tce_groups_attached(container)) return -ENXIO; @@ -1048,6 +1084,10 @@ static long tce_iommu_ioctl(void *iommu_data, if (!container->v2) break; + ret = tce_iommu_mm_set(container); + if (ret) + return ret; + if (!tce_groups_attached(container)) return -ENXIO; @@ -1093,7 +1133,7 @@ static void tce_iommu_release_ownership(struct tce_container *container, continue; tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); - tce_iommu_userspace_view_free(tbl); + tce_iommu_userspace_view_free(tbl, container->mm); if (tbl->it_map) iommu_release_ownership(tbl); -- cgit v1.2.3 From 4b6fad7097f883335b6d9627c883cb7f276d94c9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 30 Nov 2016 17:52:05 +1100 Subject: powerpc/mm/iommu, vfio/spapr: Put pages on VFIO container shutdown At the moment the userspace tool is expected to request pinning of the entire guest RAM when VFIO IOMMU SPAPR v2 driver is present. When the userspace process finishes, all the pinned pages need to be put; this is done as a part of the userspace memory context (MM) destruction which happens on the very last mmdrop(). This approach has a problem that a MM of the userspace process may live longer than the userspace process itself as kernel threads use userspace process MMs which was runnning on a CPU where the kernel thread was scheduled to. If this happened, the MM remains referenced until this exact kernel thread wakes up again and releases the very last reference to the MM, on an idle system this can take even hours. This moves preregistered regions tracking from MM to VFIO; insteads of using mm_iommu_table_group_mem_t::used, tce_container::prereg_list is added so each container releases regions which it has pre-registered. This changes the userspace interface to return EBUSY if a memory region is already registered in a container. However it should not have any practical effect as the only userspace tool available now does register memory region once per container anyway. As tce_iommu_register_pages/tce_iommu_unregister_pages are called under container->lock, this does not need additional locking. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Nicholas Piggin Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/mm/mmu_context_book3s64.c | 4 +-- arch/powerpc/mm/mmu_context_iommu.c | 11 ------ drivers/vfio/vfio_iommu_spapr_tce.c | 61 +++++++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 15 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index ad8273590975..73bf6e14c3aa 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -156,13 +156,11 @@ static inline void destroy_pagetable_page(struct mm_struct *mm) } #endif - void destroy_context(struct mm_struct *mm) { #ifdef CONFIG_SPAPR_TCE_IOMMU - mm_iommu_cleanup(mm); + WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif - #ifdef CONFIG_PPC_ICSWX drop_cop(mm->context.acop, mm); kfree(mm->context.cop_lockp); diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 4c6db09e77ad..104bad029ce9 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -365,14 +365,3 @@ void mm_iommu_init(struct mm_struct *mm) { INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list); } - -void mm_iommu_cleanup(struct mm_struct *mm) -{ - struct mm_iommu_table_group_mem_t *mem, *tmp; - - list_for_each_entry_safe(mem, tmp, &mm->context.iommu_group_mem_list, - next) { - list_del_rcu(&mem->next); - mm_iommu_do_free(mem); - } -} diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 4c03c8525c26..c8823578a1b2 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -88,6 +88,15 @@ struct tce_iommu_group { struct iommu_group *grp; }; +/* + * A container needs to remember which preregistered region it has + * referenced to do proper cleanup at the userspace process exit. + */ +struct tce_iommu_prereg { + struct list_head next; + struct mm_iommu_table_group_mem_t *mem; +}; + /* * The container descriptor supports only a single group per container. * Required by the API as the container is not supplied with the IOMMU group @@ -102,6 +111,7 @@ struct tce_container { struct mm_struct *mm; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; struct list_head group_list; + struct list_head prereg_list; }; static long tce_iommu_mm_set(struct tce_container *container) @@ -118,10 +128,27 @@ static long tce_iommu_mm_set(struct tce_container *container) return 0; } +static long tce_iommu_prereg_free(struct tce_container *container, + struct tce_iommu_prereg *tcemem) +{ + long ret; + + ret = mm_iommu_put(container->mm, tcemem->mem); + if (ret) + return ret; + + list_del(&tcemem->next); + kfree(tcemem); + + return 0; +} + static long tce_iommu_unregister_pages(struct tce_container *container, __u64 vaddr, __u64 size) { struct mm_iommu_table_group_mem_t *mem; + struct tce_iommu_prereg *tcemem; + bool found = false; if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) return -EINVAL; @@ -130,7 +157,17 @@ static long tce_iommu_unregister_pages(struct tce_container *container, if (!mem) return -ENOENT; - return mm_iommu_put(container->mm, mem); + list_for_each_entry(tcemem, &container->prereg_list, next) { + if (tcemem->mem == mem) { + found = true; + break; + } + } + + if (!found) + return -ENOENT; + + return tce_iommu_prereg_free(container, tcemem); } static long tce_iommu_register_pages(struct tce_container *container, @@ -138,16 +175,29 @@ static long tce_iommu_register_pages(struct tce_container *container, { long ret = 0; struct mm_iommu_table_group_mem_t *mem = NULL; + struct tce_iommu_prereg *tcemem; unsigned long entries = size >> PAGE_SHIFT; if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) || ((vaddr + size) < vaddr)) return -EINVAL; + mem = mm_iommu_find(container->mm, vaddr, entries); + if (mem) { + list_for_each_entry(tcemem, &container->prereg_list, next) { + if (tcemem->mem == mem) + return -EBUSY; + } + } + ret = mm_iommu_get(container->mm, vaddr, entries, &mem); if (ret) return ret; + tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL); + tcemem->mem = mem; + list_add(&tcemem->next, &container->prereg_list); + container->enabled = true; return 0; @@ -334,6 +384,7 @@ static void *tce_iommu_open(unsigned long arg) mutex_init(&container->lock); INIT_LIST_HEAD_RCU(&container->group_list); + INIT_LIST_HEAD_RCU(&container->prereg_list); container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU; @@ -372,6 +423,14 @@ static void tce_iommu_release(void *iommu_data) tce_iommu_free_table(container, tbl); } + while (!list_empty(&container->prereg_list)) { + struct tce_iommu_prereg *tcemem; + + tcemem = list_first_entry(&container->prereg_list, + struct tce_iommu_prereg, next); + WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem)); + } + tce_iommu_disable(container); if (container->mm) mmdrop(container->mm); -- cgit v1.2.3