From 4cbf38511a007867def958872203ae8adb8e2351 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 29 Apr 2020 15:36:40 +0200 Subject: iommu: Add def_domain_type() callback in iommu_ops Some devices are reqired to use a specific type (identity or dma) of default domain when they are used with a vendor iommu. When the system level default domain type is different from it, the vendor iommu driver has to request a new default domain with iommu_request_dma_domain_for_dev() and iommu_request_dm_for_dev() in the add_dev() callback. Unfortunately, these two helpers only work when the group hasn't been assigned to any other devices, hence, some vendor iommu driver has to use a private domain if it fails to request a new default one. This adds def_domain_type() callback in the iommu_ops, so that any special requirement of default domain for a device could be aware by the iommu generic layer. Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Lu Baolu [ jroedel@suse.de: Added iommu_get_def_domain_type() function and use it to allocate the default domain ] Co-developed-by: Joerg Roedel Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-3-joro@8bytes.org Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ef8b0bda695..1f027b07e499 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -248,6 +248,10 @@ struct iommu_iotlb_gather { * @cache_invalidate: invalidate translation caches * @sva_bind_gpasid: bind guest pasid and mm * @sva_unbind_gpasid: unbind guest pasid and mm + * @def_domain_type: device default domain type, return value: + * - IOMMU_DOMAIN_IDENTITY: must use an identity domain + * - IOMMU_DOMAIN_DMA: must use a dma domain + * - 0: use the default setting * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops */ @@ -318,6 +322,8 @@ struct iommu_ops { int (*sva_unbind_gpasid)(struct device *dev, int pasid); + int (*def_domain_type)(struct device *dev); + unsigned long pgsize_bitmap; struct module *owner; }; -- cgit v1.2.3 From a6a4c7e2c5b8b981d1c546a393ff21f2112468c3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:45 +0200 Subject: iommu: Add probe_device() and release_device() call-backs Add call-backs to 'struct iommu_ops' as an alternative to the add_device() and remove_device() call-backs, which will be removed when all drivers are converted. The new call-backs will not setup IOMMU groups and domains anymore, so also add a probe_finalize() call-back where the IOMMU driver can do per-device setup work which require the device to be set up with a group and a domain. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-8-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++----- include/linux/iommu.h | 9 ++++++++ 2 files changed, 66 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5877abd9b693..6cfe7799dc8c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -174,6 +174,36 @@ static void dev_iommu_free(struct device *dev) dev->iommu = NULL; } +static int __iommu_probe_device(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct iommu_device *iommu_dev; + struct iommu_group *group; + int ret; + + iommu_dev = ops->probe_device(dev); + if (IS_ERR(iommu_dev)) + return PTR_ERR(iommu_dev); + + dev->iommu->iommu_dev = iommu_dev; + + group = iommu_group_get_for_dev(dev); + if (!IS_ERR(group)) { + ret = PTR_ERR(group); + goto out_release; + } + iommu_group_put(group); + + iommu_device_link(iommu_dev, dev); + + return 0; + +out_release: + ops->release_device(dev); + + return ret; +} + int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -191,10 +221,17 @@ int iommu_probe_device(struct device *dev) goto err_free_dev_param; } - ret = ops->add_device(dev); + if (ops->probe_device) + ret = __iommu_probe_device(dev); + else + ret = ops->add_device(dev); + if (ret) goto err_module_put; + if (ops->probe_finalize) + ops->probe_finalize(dev); + return 0; err_module_put: @@ -204,17 +241,31 @@ err_free_dev_param: return ret; } +static void __iommu_release_device(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + + iommu_device_unlink(dev->iommu->iommu_dev, dev); + + iommu_group_remove_device(dev); + + ops->release_device(dev); +} + void iommu_release_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; - if (dev->iommu_group) + if (!dev->iommu) + return; + + if (ops->release_device) + __iommu_release_device(dev); + else if (dev->iommu_group) ops->remove_device(dev); - if (dev->iommu) { - module_put(ops->owner); - dev_iommu_free(dev); - } + module_put(ops->owner); + dev_iommu_free(dev); } static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1f027b07e499..30170d191e5e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -225,6 +225,10 @@ struct iommu_iotlb_gather { * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping + * @probe_device: Add device to iommu driver handling + * @release_device: Remove device from iommu driver handling + * @probe_finalize: Do final setup work after the device is added to an IOMMU + * group and attached to the groups domain * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes @@ -275,6 +279,9 @@ struct iommu_ops { phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); int (*add_device)(struct device *dev); void (*remove_device)(struct device *dev); + struct iommu_device *(*probe_device)(struct device *dev); + void (*release_device)(struct device *dev); + void (*probe_finalize)(struct device *dev); struct iommu_group *(*device_group)(struct device *dev); int (*domain_get_attr)(struct iommu_domain *domain, enum iommu_attr attr, void *data); @@ -375,6 +382,7 @@ struct iommu_fault_param { * * @fault_param: IOMMU detected device fault reporting data * @fwspec: IOMMU fwspec data + * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. @@ -384,6 +392,7 @@ struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; struct iommu_fwspec *fwspec; + struct iommu_device *iommu_dev; void *priv; }; -- cgit v1.2.3 From 5012c3968537e2ffecbdb2eba3479bf9fb9e5597 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:51 +0200 Subject: iommu: Export bus_iommu_probe() and make is safe for re-probing Add a check to the bus_iommu_probe() call-path to make sure it ignores devices which have already been successfully probed. Then export the bus_iommu_probe() function so it can be used by IOMMU drivers. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-14-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 10 +++++++++- include/linux/iommu.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 834a45da0ed0..397fd4fd0c32 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1610,11 +1610,19 @@ static int probe_iommu_group(struct device *dev, void *data) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct list_head *group_list = data; + struct iommu_group *group; int ret; if (!dev_iommu_get(dev)) return -ENOMEM; + /* Device is probed already if in a group */ + group = iommu_group_get(dev); + if (group) { + iommu_group_put(group); + return 0; + } + if (!try_module_get(ops->owner)) { ret = -EINVAL; goto err_free_dev_iommu; @@ -1783,7 +1791,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group) iommu_do_create_direct_mappings); } -static int bus_iommu_probe(struct bus_type *bus) +int bus_iommu_probe(struct bus_type *bus) { const struct iommu_ops *ops = bus->iommu_ops; int ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 30170d191e5e..fea1622408ad 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -445,6 +445,7 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) #define IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER 6 /* Post Driver unbind */ extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); +extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap); extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus); -- cgit v1.2.3 From 3eeeb45c6d0444b368cdeba9bdafa8bbcf5370d1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:10 +0200 Subject: iommu: Remove add_device()/remove_device() code-paths All drivers are converted to use the probe/release_device() call-backs, so the add_device/remove_device() pointers are unused and the code using them can be removed. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-33-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 158 ++++++++++++-------------------------------------- include/linux/iommu.h | 4 -- 2 files changed, 38 insertions(+), 124 deletions(-) (limited to 'include') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 397fd4fd0c32..7f99e5ae432c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -220,12 +220,20 @@ out_release: return ret; } -static int __iommu_probe_device_helper(struct device *dev) +int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_group *group; int ret; + if (!dev_iommu_get(dev)) + return -ENOMEM; + + if (!try_module_get(ops->owner)) { + ret = -EINVAL; + goto err_out; + } + ret = __iommu_probe_device(dev, NULL); if (ret) goto err_out; @@ -259,75 +267,23 @@ static int __iommu_probe_device_helper(struct device *dev) err_release: iommu_release_device(dev); + err_out: return ret; } -int iommu_probe_device(struct device *dev) +void iommu_release_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; - struct iommu_group *group; - int ret; - - WARN_ON(dev->iommu_group); - - if (!ops) - return -EINVAL; - - if (!dev_iommu_get(dev)) - return -ENOMEM; - - if (!try_module_get(ops->owner)) { - ret = -EINVAL; - goto err_free_dev_param; - } - - if (ops->probe_device) - return __iommu_probe_device_helper(dev); - - ret = ops->add_device(dev); - if (ret) - goto err_module_put; - group = iommu_group_get(dev); - iommu_create_device_direct_mappings(group, dev); - iommu_group_put(group); - - if (ops->probe_finalize) - ops->probe_finalize(dev); - - return 0; - -err_module_put: - module_put(ops->owner); -err_free_dev_param: - dev_iommu_free(dev); - return ret; -} - -static void __iommu_release_device(struct device *dev) -{ - const struct iommu_ops *ops = dev->bus->iommu_ops; + if (!dev->iommu) + return; iommu_device_unlink(dev->iommu->iommu_dev, dev); - iommu_group_remove_device(dev); ops->release_device(dev); -} - -void iommu_release_device(struct device *dev) -{ - const struct iommu_ops *ops = dev->bus->iommu_ops; - - if (!dev->iommu) - return; - - if (ops->release_device) - __iommu_release_device(dev); - else if (dev->iommu_group) - ops->remove_device(dev); module_put(ops->owner); dev_iommu_free(dev); @@ -1560,23 +1516,6 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) if (ret) goto out_put_group; - /* - * Try to allocate a default domain - needs support from the - * IOMMU driver. There are still some drivers which don't support - * default domains, so the return value is not yet checked. Only - * allocate the domain here when the driver still has the - * add_device/remove_device call-backs implemented. - */ - if (!ops->probe_device) { - iommu_alloc_default_domain(dev); - - if (group->default_domain) - ret = __iommu_attach_device(group->default_domain, dev); - - if (ret) - goto out_put_group; - } - return group; out_put_group: @@ -1591,21 +1530,6 @@ struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) return group->default_domain; } -static int add_iommu_group(struct device *dev, void *data) -{ - int ret = iommu_probe_device(dev); - - /* - * We ignore -ENODEV errors for now, as they just mean that the - * device is not translated by an IOMMU. We still care about - * other errors and fail to initialize when they happen. - */ - if (ret == -ENODEV) - ret = 0; - - return ret; -} - static int probe_iommu_group(struct device *dev, void *data) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -1793,47 +1717,41 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group) int bus_iommu_probe(struct bus_type *bus) { - const struct iommu_ops *ops = bus->iommu_ops; + struct iommu_group *group, *next; + LIST_HEAD(group_list); int ret; - if (ops->probe_device) { - struct iommu_group *group, *next; - LIST_HEAD(group_list); - - /* - * This code-path does not allocate the default domain when - * creating the iommu group, so do it after the groups are - * created. - */ - ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); - if (ret) - return ret; + /* + * This code-path does not allocate the default domain when + * creating the iommu group, so do it after the groups are + * created. + */ + ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); + if (ret) + return ret; - list_for_each_entry_safe(group, next, &group_list, entry) { - /* Remove item from the list */ - list_del_init(&group->entry); + list_for_each_entry_safe(group, next, &group_list, entry) { + /* Remove item from the list */ + list_del_init(&group->entry); - mutex_lock(&group->mutex); + mutex_lock(&group->mutex); - /* Try to allocate default domain */ - probe_alloc_default_domain(bus, group); + /* Try to allocate default domain */ + probe_alloc_default_domain(bus, group); - if (!group->default_domain) { - mutex_unlock(&group->mutex); - continue; - } + if (!group->default_domain) { + mutex_unlock(&group->mutex); + continue; + } - iommu_group_create_direct_mappings(group); + iommu_group_create_direct_mappings(group); - ret = __iommu_group_dma_attach(group); + ret = __iommu_group_dma_attach(group); - mutex_unlock(&group->mutex); + mutex_unlock(&group->mutex); - if (ret) - break; - } - } else { - ret = bus_for_each_dev(bus, NULL, NULL, add_iommu_group); + if (ret) + break; } return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index fea1622408ad..dd076366383f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,8 +223,6 @@ struct iommu_iotlb_gather { * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue * @iova_to_phys: translate iova to physical address - * @add_device: add device to iommu grouping - * @remove_device: remove device from iommu grouping * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -277,8 +275,6 @@ struct iommu_ops { void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); - int (*add_device)(struct device *dev); - void (*remove_device)(struct device *dev); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); void (*probe_finalize)(struct device *dev); -- cgit v1.2.3 From 1b032ec1ecbce6047af7d11c9db432e237cb17d8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:12 +0200 Subject: iommu: Unexport iommu_group_get_for_dev() The function is now only used in IOMMU core code and shouldn't be used outside of it anyway, so remove the export for it. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-35-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 4 ++-- include/linux/iommu.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 48a95f7d7999..a9e5618cde80 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -91,6 +91,7 @@ static void __iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group); static int iommu_create_device_direct_mappings(struct iommu_group *group, struct device *dev); +static struct iommu_group *iommu_group_get_for_dev(struct device *dev); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ @@ -1500,7 +1501,7 @@ static int iommu_alloc_default_domain(struct device *dev) * to the returned IOMMU group, which will already include the provided * device. The reference should be released with iommu_group_put(). */ -struct iommu_group *iommu_group_get_for_dev(struct device *dev) +static struct iommu_group *iommu_group_get_for_dev(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_group *group; @@ -1531,7 +1532,6 @@ out_put_group: return ERR_PTR(ret); } -EXPORT_SYMBOL(iommu_group_get_for_dev); struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index dd076366383f..7cfd2dddb49d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -527,7 +527,6 @@ extern int iommu_page_response(struct device *dev, struct iommu_page_response *msg); extern int iommu_group_id(struct iommu_group *group); -extern struct iommu_group *iommu_group_get_for_dev(struct device *dev); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, -- cgit v1.2.3 From 69cf449166987d9a041020be6422ee7bf94a7228 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 13 May 2020 15:47:21 -0700 Subject: iommu: Remove functions that support private domain After moving iommu_group setup to iommu core code [1][2] and removing private domain support in vt-d [3], there are no users for functions such as iommu_request_dm_for_dev(), iommu_request_dma_domain_for_dev() and request_default_domain_for_dev(). So, remove these functions. [1] commit dce8d6964ebd ("iommu/amd: Convert to probe/release_device() call-backs") [2] commit e5d1841f18b2 ("iommu/vt-d: Convert to probe/release_device() call-backs") [3] commit 327d5b2fee91 ("iommu/vt-d: Allow 32bit devices to uses DMA domain") Signed-off-by: Sai Praneeth Prakhya Cc: Joerg Roedel Cc: Lu Baolu Link: https://lore.kernel.org/r/20200513224721.20504-1-sai.praneeth.prakhya@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 65 --------------------------------------------------- include/linux/iommu.h | 12 ---------- 2 files changed, 77 deletions(-) (limited to 'include') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 4050569188be..374b34fd6fac 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2536,71 +2536,6 @@ struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, } EXPORT_SYMBOL_GPL(iommu_alloc_resv_region); -static int -request_default_domain_for_dev(struct device *dev, unsigned long type) -{ - struct iommu_domain *domain; - struct iommu_group *group; - int ret; - - /* Device must already be in a group before calling this function */ - group = iommu_group_get(dev); - if (!group) - return -EINVAL; - - mutex_lock(&group->mutex); - - ret = 0; - if (group->default_domain && group->default_domain->type == type) - goto out; - - /* Don't change mappings of existing devices */ - ret = -EBUSY; - if (iommu_group_device_count(group) != 1) - goto out; - - ret = -ENOMEM; - domain = __iommu_domain_alloc(dev->bus, type); - if (!domain) - goto out; - - /* Attach the device to the domain */ - ret = __iommu_attach_group(domain, group); - if (ret) { - iommu_domain_free(domain); - goto out; - } - - /* Make the domain the default for this group */ - if (group->default_domain) - iommu_domain_free(group->default_domain); - group->default_domain = domain; - - iommu_create_device_direct_mappings(group, dev); - - dev_info(dev, "Using iommu %s mapping\n", - type == IOMMU_DOMAIN_DMA ? "dma" : "direct"); - - ret = 0; -out: - mutex_unlock(&group->mutex); - iommu_group_put(group); - - return ret; -} - -/* Request that a device is direct mapped by the IOMMU */ -int iommu_request_dm_for_dev(struct device *dev) -{ - return request_default_domain_for_dev(dev, IOMMU_DOMAIN_IDENTITY); -} - -/* Request that a device can't be direct mapped by the IOMMU */ -int iommu_request_dma_domain_for_dev(struct device *dev) -{ - return request_default_domain_for_dev(dev, IOMMU_DOMAIN_DMA); -} - void iommu_set_default_passthrough(bool cmd_line) { if (cmd_line) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7cfd2dddb49d..78a26ae5c2b6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -482,8 +482,6 @@ extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); extern void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list); -extern int iommu_request_dm_for_dev(struct device *dev); -extern int iommu_request_dma_domain_for_dev(struct device *dev); extern void iommu_set_default_passthrough(bool cmd_line); extern void iommu_set_default_translated(bool cmd_line); extern bool iommu_default_passthrough(void); @@ -804,16 +802,6 @@ static inline int iommu_get_group_resv_regions(struct iommu_group *group, return -ENODEV; } -static inline int iommu_request_dm_for_dev(struct device *dev) -{ - return -ENODEV; -} - -static inline int iommu_request_dma_domain_for_dev(struct device *dev) -{ - return -ENODEV; -} - static inline void iommu_set_default_passthrough(bool cmd_line) { } -- cgit v1.2.3 From 3db9983e4327f773c490de2a8c66d6000561d88a Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:44 +0800 Subject: iommu/vt-d: Move domain helper to header Move domain helper to header to be used by SVA code. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 6 ------ include/linux/intel-iommu.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 2ff8d69ce4f8..8027f21073eb 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -441,12 +441,6 @@ static void init_translation_status(struct intel_iommu *iommu) iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; } -/* Convert generic 'struct iommu_domain to private struct dmar_domain */ -static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) -{ - return container_of(dom, struct dmar_domain, domain); -} - static int __init intel_iommu_setup(char *str) { if (!str) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 980234ae0312..ed7171d2ae1f 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -595,6 +595,12 @@ static inline void __iommu_flush_cache( clflush_cache_range(addr, size); } +/* Convert generic struct iommu_domain to private struct dmar_domain */ +static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) +{ + return container_of(dom, struct dmar_domain, domain); +} + /* * 0: readable * 1: writable -- cgit v1.2.3 From b0d1f8741b812352fe0e5f3b2381427085f23e19 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:46 +0800 Subject: iommu/vt-d: Add nested translation helper function Nested translation mode is supported in VT-d 3.0 Spec.CH 3.8. With PASID granular translation type set to 0x11b, translation result from the first level(FL) also subject to a second level(SL) page table translation. This mode is used for SVA virtualization, where FL performs guest virtual to guest physical translation and SL performs guest physical to host physical translation. This patch adds a helper function for setting up nested translation where second level comes from a domain and first level comes from a guest PGD. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 25 ------- drivers/iommu/intel-pasid.c | 174 +++++++++++++++++++++++++++++++++++++++++++- drivers/iommu/intel-pasid.h | 10 +++ include/linux/intel-iommu.h | 20 +++++ include/uapi/linux/iommu.h | 5 ++ 5 files changed, 206 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 8027f21073eb..7e85c09eec71 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -296,31 +296,6 @@ static inline void context_clear_entry(struct context_entry *context) static struct dmar_domain *si_domain; static int hw_pass_through = 1; -/* si_domain contains mulitple devices */ -#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) - -/* - * This is a DMA domain allocated through the iommu domain allocation - * interface. But one or more devices belonging to this domain have - * been chosen to use a private domain. We should avoid to use the - * map/unmap/iova_to_phys APIs on it. - */ -#define DOMAIN_FLAG_LOSE_CHILDREN BIT(1) - -/* - * When VT-d works in the scalable mode, it allows DMA translation to - * happen through either first level or second level page table. This - * bit marks that the DMA translation for the domain goes through the - * first level page table, otherwise, it goes through the second level. - */ -#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2) - -/* - * Domain represents a virtual machine which demands iommu nested - * translation mode support. - */ -#define DOMAIN_FLAG_NESTING_MODE BIT(3) - #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ if (domain->iommu_refcnt[idx]) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index d9cea3011b58..c7fa1b79eaf7 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -359,6 +359,16 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value) pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); } +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, int pasid) @@ -492,7 +502,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, 1); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); pasid_set_present(pte); pasid_flush_caches(iommu, pte, pasid, did); @@ -561,7 +571,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); pasid_set_address_width(pte, agaw); - pasid_set_translation_type(pte, 2); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -595,7 +605,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, 4); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -609,3 +619,161 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } + +static int +intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte, + struct iommu_gpasid_bind_data_vtd *pasid_data) +{ + /* + * Not all guest PASID table entry fields are passed down during bind, + * here we only set up the ones that are dependent on guest settings. + * Execution related bits such as NXE, SMEP are not supported. + * Other fields, such as snoop related, are set based on host needs + * regardless of guest settings. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_SRE) { + if (!ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_sre(pte); + } + + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) { + if (!ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_eafe(pte); + } + + /* + * Memory type is only applicable to devices inside processor coherent + * domain. Will add MTS support once coherent devices are available. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_MTS_MASK) { + pr_warn_ratelimited("No memory type support %s\n", + iommu->name); + return -EINVAL; + } + + return 0; +} + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * This could be used for guest shared virtual address. In this case, the + * first level page tables are used for GVA-GPA translation in the guest, + * second level page tables are used for GPA-HPA translation. + * + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @gpgd: FLPTPTR: First Level Page translation pointer in GPA + * @pasid: PASID to be programmed in the device PASID table + * @pasid_data: Additional PASID info from the guest bind request + * @domain: Domain info for setting up second level page tables + * @addr_width: Address width of the first level (guest) + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + pgd_t *gpgd, int pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width) +{ + struct pasid_entry *pte; + struct dma_pte *pgd; + int ret = 0; + u64 pgd_val; + int agaw; + u16 did; + + if (!ecap_nest(iommu->ecap)) { + pr_err_ratelimited("IOMMU: %s: No nested translation support\n", + iommu->name); + return -EINVAL; + } + + if (!(domain->flags & DOMAIN_FLAG_NESTING_MODE)) { + pr_err_ratelimited("Domain is not in nesting mode, %x\n", + domain->flags); + return -EINVAL; + } + + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) + return -EINVAL; + + /* + * Caller must ensure PASID entry is not in use, i.e. not bind the + * same PASID to the same device twice. + */ + if (pasid_pte_is_present(pte)) + return -EBUSY; + + pasid_clear_entry(pte); + + /* Sanity checking performed by caller to make sure address + * width matching in two dimensions: + * 1. CPU vs. IOMMU + * 2. Guest vs. Host. + */ + switch (addr_width) { +#ifdef CONFIG_X86 + case ADDR_WIDTH_5LEVEL: + if (!cpu_feature_enabled(X86_FEATURE_LA57) || + !cap_5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + + pasid_set_flpm(pte, 1); + break; +#endif + case ADDR_WIDTH_4LEVEL: + pasid_set_flpm(pte, 0); + break; + default: + dev_err_ratelimited(dev, "Invalid guest address width %d\n", + addr_width); + return -EINVAL; + } + + /* First level PGD is in GPA, must be supported by the second level */ + if ((unsigned long long)gpgd > domain->max_addr) { + dev_err_ratelimited(dev, + "Guest PGD %llx not supported, max %llx\n", + (unsigned long long)gpgd, domain->max_addr); + return -EINVAL; + } + pasid_set_flptr(pte, (u64)gpgd); + + ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data); + if (ret) + return ret; + + /* Setup the second level based on the given domain */ + pgd = domain->pgd; + + agaw = iommu_skip_agaw(domain, iommu, &pgd); + if (agaw < 0) { + dev_err_ratelimited(dev, "Invalid domain page table\n"); + return -EINVAL; + } + pgd_val = virt_to_phys(pgd); + pasid_set_slptr(pte, pgd_val); + pasid_set_fault_enable(pte); + + did = domain->iommu_did[iommu->seq_id]; + pasid_set_domain_id(pte, did); + + pasid_set_address_width(pte, agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); + pasid_flush_caches(iommu, pte, pasid, did); + + return ret; +} diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index 92de6df24ccb..ccd50c2ae75c 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -36,6 +36,7 @@ * to vmalloc or even module mappings. */ #define PASID_FLAG_SUPERVISOR_MODE BIT(0) +#define PASID_FLAG_NESTED BIT(1) /* * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- @@ -51,6 +52,11 @@ struct pasid_entry { u64 val[8]; }; +#define PASID_ENTRY_PGTT_FL_ONLY (1) +#define PASID_ENTRY_PGTT_SL_ONLY (2) +#define PASID_ENTRY_PGTT_NESTED (3) +#define PASID_ENTRY_PGTT_PT (4) + /* The representative of a PASID table */ struct pasid_table { void *table; /* pasid table pointer */ @@ -99,6 +105,10 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, int pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, int pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, int pasid); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index ed7171d2ae1f..e0d1fed7cbe4 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -42,6 +42,9 @@ #define DMA_FL_PTE_PRESENT BIT_ULL(0) #define DMA_FL_PTE_XD BIT_ULL(63) +#define ADDR_WIDTH_5LEVEL (57) +#define ADDR_WIDTH_4LEVEL (48) + #define CONTEXT_TT_MULTI_LEVEL 0 #define CONTEXT_TT_DEV_IOTLB 1 #define CONTEXT_TT_PASS_THROUGH 2 @@ -480,6 +483,23 @@ struct context_entry { u64 hi; }; +/* si_domain contains mulitple devices */ +#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) + +/* + * When VT-d works in the scalable mode, it allows DMA translation to + * happen through either first level or second level page table. This + * bit marks that the DMA translation for the domain goes through the + * first level page table, otherwise, it goes through the second level. + */ +#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(1) + +/* + * Domain represents a virtual machine which demands iommu nested + * translation mode support. + */ +#define DOMAIN_FLAG_NESTING_MODE BIT(2) + struct dmar_domain { int nid; /* node id */ diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 4ad3496e5c43..e907b7091a46 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -285,6 +285,11 @@ struct iommu_gpasid_bind_data_vtd { __u32 emt; }; +#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \ + IOMMU_SVA_VTD_GPASID_EMTE | \ + IOMMU_SVA_VTD_GPASID_PCD | \ + IOMMU_SVA_VTD_GPASID_PWT) + /** * struct iommu_gpasid_bind_data - Information about device and guest PASID binding * @version: Version of this data structure -- cgit v1.2.3 From 56722a4398a306585ca3ed39ff54fc907af98618 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:47 +0800 Subject: iommu/vt-d: Add bind guest PASID support When supporting guest SVA with emulated IOMMU, the guest PASID table is shadowed in VMM. Updates to guest vIOMMU PASID table will result in PASID cache flush which will be passed down to the host as bind guest PASID calls. For the SL page tables, it will be harvested from device's default domain (request w/o PASID), or aux domain in case of mediated device. .-------------. .---------------------------. | vIOMMU | | Guest process CR3, FL only| | | '---------------------------' .----------------/ | PASID Entry |--- PASID cache flush - '-------------' | | | V | | CR3 in GPA '-------------' Guest ------| Shadow |--------------------------|-------- v v v Host .-------------. .----------------------. | pIOMMU | | Bind FL for GVA-GPA | | | '----------------------' .----------------/ | | PASID Entry | V (Nested xlate) '----------------\.------------------------------. | | |SL for GPA-HPA, default domain| | | '------------------------------' '-------------' Where: - FL = First level/stage one page tables - SL = Second level/stage two page tables Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 4 + drivers/iommu/intel-svm.c | 200 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 6 +- include/linux/intel-svm.h | 12 +++ 4 files changed, 221 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 7e85c09eec71..f42c548f8421 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5780,6 +5780,10 @@ const struct iommu_ops intel_iommu_ops = { .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, .pgsize_bitmap = INTEL_IOMMU_PGSIZES, +#ifdef CONFIG_INTEL_IOMMU_SVM + .sva_bind_gpasid = intel_svm_bind_gpasid, + .sva_unbind_gpasid = intel_svm_unbind_gpasid, +#endif }; static void quirk_iommu_igfx(struct pci_dev *dev) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 2998418f0a38..7d3405c5a198 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -226,6 +226,206 @@ static LIST_HEAD(global_svm_list); list_for_each_entry((sdev), &(svm)->devs, list) \ if ((d) != (sdev)->dev) {} else +int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, + struct iommu_gpasid_bind_data *data) +{ + struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct dmar_domain *dmar_domain; + struct intel_svm_dev *sdev; + struct intel_svm *svm; + int ret = 0; + + if (WARN_ON(!iommu) || !data) + return -EINVAL; + + if (data->version != IOMMU_GPASID_BIND_VERSION_1 || + data->format != IOMMU_PASID_FORMAT_INTEL_VTD) + return -EINVAL; + + if (!dev_is_pci(dev)) + return -ENOTSUPP; + + /* VT-d supports devices with full 20 bit PASIDs only */ + if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX) + return -EINVAL; + + /* + * We only check host PASID range, we have no knowledge to check + * guest PASID range. + */ + if (data->hpasid <= 0 || data->hpasid >= PASID_MAX) + return -EINVAL; + + dmar_domain = to_dmar_domain(domain); + + mutex_lock(&pasid_mutex); + svm = ioasid_find(NULL, data->hpasid, NULL); + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } + + if (svm) { + /* + * If we found svm for the PASID, there must be at + * least one device bond, otherwise svm should be freed. + */ + if (WARN_ON(list_empty(&svm->devs))) { + ret = -EINVAL; + goto out; + } + + for_each_svm_dev(sdev, svm, dev) { + /* + * For devices with aux domains, we should allow + * multiple bind calls with the same PASID and pdev. + */ + if (iommu_dev_feature_enabled(dev, + IOMMU_DEV_FEAT_AUX)) { + sdev->users++; + } else { + dev_warn_ratelimited(dev, + "Already bound with PASID %u\n", + svm->pasid); + ret = -EBUSY; + } + goto out; + } + } else { + /* We come here when PASID has never been bond to a device. */ + svm = kzalloc(sizeof(*svm), GFP_KERNEL); + if (!svm) { + ret = -ENOMEM; + goto out; + } + /* REVISIT: upper layer/VFIO can track host process that bind + * the PASID. ioasid_set = mm might be sufficient for vfio to + * check pasid VMM ownership. We can drop the following line + * once VFIO and IOASID set check is in place. + */ + svm->mm = get_task_mm(current); + svm->pasid = data->hpasid; + if (data->flags & IOMMU_SVA_GPASID_VAL) { + svm->gpasid = data->gpasid; + svm->flags |= SVM_FLAG_GUEST_PASID; + } + ioasid_set_data(data->hpasid, svm); + INIT_LIST_HEAD_RCU(&svm->devs); + mmput(svm->mm); + } + sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!sdev) { + ret = -ENOMEM; + goto out; + } + sdev->dev = dev; + + /* Only count users if device has aux domains */ + if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) + sdev->users = 1; + + /* Set up device context entry for PASID if not enabled already */ + ret = intel_iommu_enable_pasid(iommu, sdev->dev); + if (ret) { + dev_err_ratelimited(dev, "Failed to enable PASID capability\n"); + kfree(sdev); + goto out; + } + + /* + * PASID table is per device for better security. Therefore, for + * each bind of a new device even with an existing PASID, we need to + * call the nested mode setup function here. + */ + spin_lock(&iommu->lock); + ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)data->gpgd, + data->hpasid, &data->vtd, dmar_domain, + data->addr_width); + spin_unlock(&iommu->lock); + if (ret) { + dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n", + data->hpasid, ret); + /* + * PASID entry should be in cleared state if nested mode + * set up failed. So we only need to clear IOASID tracking + * data such that free call will succeed. + */ + kfree(sdev); + goto out; + } + + svm->flags |= SVM_FLAG_GUEST_MODE; + + init_rcu_head(&sdev->rcu); + list_add_rcu(&sdev->list, &svm->devs); + out: + if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) { + ioasid_set_data(data->hpasid, NULL); + kfree(svm); + } + + mutex_unlock(&pasid_mutex); + return ret; +} + +int intel_svm_unbind_gpasid(struct device *dev, int pasid) +{ + struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct intel_svm_dev *sdev; + struct intel_svm *svm; + int ret = -EINVAL; + + if (WARN_ON(!iommu)) + return -EINVAL; + + mutex_lock(&pasid_mutex); + svm = ioasid_find(NULL, pasid, NULL); + if (!svm) { + ret = -EINVAL; + goto out; + } + + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } + + for_each_svm_dev(sdev, svm, dev) { + ret = 0; + if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) + sdev->users--; + if (!sdev->users) { + list_del_rcu(&sdev->list); + intel_pasid_tear_down_entry(iommu, dev, svm->pasid); + intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); + /* TODO: Drain in flight PRQ for the PASID since it + * may get reused soon, we don't want to + * confuse with its previous life. + * intel_svm_drain_prq(dev, pasid); + */ + kfree_rcu(sdev, rcu); + + if (list_empty(&svm->devs)) { + /* + * We do not free the IOASID here in that + * IOMMU driver did not allocate it. + * Unlike native SVM, IOASID for guest use was + * allocated prior to the bind call. + * In any case, if the free call comes before + * the unbind, IOMMU driver will get notified + * and perform cleanup. + */ + ioasid_set_data(pasid, NULL); + kfree(svm); + } + } + break; + } +out: + mutex_unlock(&pasid_mutex); + return ret; +} + int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e0d1fed7cbe4..3dfd426dfb03 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -698,7 +698,9 @@ struct dmar_domain *find_domain(struct device *dev); extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); - +int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, + struct iommu_gpasid_bind_data *data); +int intel_svm_unbind_gpasid(struct device *dev, int pasid); struct svm_dev_ops; struct intel_svm_dev { @@ -715,9 +717,11 @@ struct intel_svm_dev { struct intel_svm { struct mmu_notifier notifier; struct mm_struct *mm; + struct intel_iommu *iommu; int flags; int pasid; + int gpasid; /* In case that guest PASID is different from host PASID */ struct list_head devs; struct list_head list; }; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index d7c403d0dd27..1b47ca46373e 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -44,6 +44,18 @@ struct svm_dev_ops { * do such IOTLB flushes automatically. */ #define SVM_FLAG_SUPERVISOR_MODE (1<<1) +/* + * The SVM_FLAG_GUEST_MODE flag is used when a PASID bind is for guest + * processes. Compared to the host bind, the primary differences are: + * 1. mm life cycle management + * 2. fault reporting + */ +#define SVM_FLAG_GUEST_MODE (1<<2) +/* + * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space, + * which requires guest and host PASID translation at both directions. + */ +#define SVM_FLAG_GUEST_PASID (1<<3) #ifdef CONFIG_INTEL_IOMMU_SVM -- cgit v1.2.3 From 61a06a16e36d830f7811fbf931668d87197d95b7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:48 +0800 Subject: iommu/vt-d: Support flushing more translation cache types When Shared Virtual Memory is exposed to a guest via vIOMMU, scalable IOTLB invalidation may be passed down from outside IOMMU subsystems. This patch adds invalidation functions that can be used for additional translation cache types. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/iommu/intel-pasid.c | 3 ++- include/linux/intel-iommu.h | 21 +++++++++++++++++---- 3 files changed, 58 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index f77dae7ba7d4..34ee8f28555f 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1421,6 +1421,45 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, qi_submit_sync(&desc, iommu); } +/* PASID-based device IOTLB Invalidate */ +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, u64 granu) +{ + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(pfsid); + desc.qw1 = QI_DEV_EIOTLB_GLOB(granu); + + /* + * If S bit is 0, we only flush a single page. If S bit is set, + * The least significant zero bit indicates the invalidation address + * range. VT-d spec 6.5.2.6. + * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. + * size order = 0 is PAGE_SIZE 4KB + * Max Invs Pending (MIP) is set to 0 for now until we have DIT in + * ECAP. + */ + desc.qw1 |= addr & ~mask; + if (size_order) + desc.qw1 |= QI_DEV_EIOTLB_SIZE; + + qi_submit_sync(&desc, iommu); +} + +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, + u64 granu, int pasid) +{ + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | + QI_PC_GRAN(granu) | QI_PC_TYPE; + qi_submit_sync(&desc, iommu); +} + /* * Disable Queued Invalidation interface. */ diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index c7fa1b79eaf7..5d9d9ff49334 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -375,7 +375,8 @@ pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, { struct qi_desc desc; - desc.qw0 = QI_PC_DID(did) | QI_PC_PASID_SEL | QI_PC_PASID(pasid); + desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) | + QI_PC_PASID(pasid) | QI_PC_TYPE; desc.qw1 = 0; desc.qw2 = 0; desc.qw3 = 0; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 3dfd426dfb03..a9c984b29a72 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -334,7 +334,7 @@ enum { #define QI_IOTLB_GRAN(gran) (((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4)) #define QI_IOTLB_ADDR(addr) (((u64)addr) & VTD_PAGE_MASK) #define QI_IOTLB_IH(ih) (((u64)ih) << 6) -#define QI_IOTLB_AM(am) (((u8)am)) +#define QI_IOTLB_AM(am) (((u8)am) & 0x3f) #define QI_CC_FM(fm) (((u64)fm) << 48) #define QI_CC_SID(sid) (((u64)sid) << 32) @@ -353,16 +353,21 @@ enum { #define QI_PC_DID(did) (((u64)did) << 16) #define QI_PC_GRAN(gran) (((u64)gran) << 4) -#define QI_PC_ALL_PASIDS (QI_PC_TYPE | QI_PC_GRAN(0)) -#define QI_PC_PASID_SEL (QI_PC_TYPE | QI_PC_GRAN(1)) +/* PASID cache invalidation granu */ +#define QI_PC_ALL_PASIDS 0 +#define QI_PC_PASID_SEL 1 #define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) #define QI_EIOTLB_IH(ih) (((u64)ih) << 6) -#define QI_EIOTLB_AM(am) (((u64)am)) +#define QI_EIOTLB_AM(am) (((u64)am) & 0x3f) #define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32) #define QI_EIOTLB_DID(did) (((u64)did) << 16) #define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) +/* QI Dev-IOTLB inv granu */ +#define QI_DEV_IOTLB_GRAN_ALL 1 +#define QI_DEV_IOTLB_GRAN_PASID_SEL 0 + #define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) #define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) #define QI_DEV_EIOTLB_GLOB(g) ((u64)g) @@ -679,8 +684,16 @@ extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, u16 qdep, u64 addr, unsigned mask); + void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, unsigned long npages, bool ih); + +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, u64 granu); +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, + int pasid); + extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); -- cgit v1.2.3 From 24f27d32ab6b71dedcbbeeab8f9bdc143b539ac0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:50 +0800 Subject: iommu/vt-d: Enlightened PASID allocation Enabling IOMMU in a guest requires communication with the host driver for certain aspects. Use of PASID ID to enable Shared Virtual Addressing (SVA) requires managing PASID's in the host. VT-d 3.0 spec provides a Virtual Command Register (VCMD) to facilitate this. Writes to this register in the guest are trapped by vIOMMU which proxies the call to the host driver. This virtual command interface consists of a capability register, a virtual command register, and a virtual response register. Refer to section 10.4.42, 10.4.43, 10.4.44 for more information. This patch adds the enlightened PASID allocation/free interfaces via the virtual command interface. Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-pasid.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ drivers/iommu/intel-pasid.h | 13 ++++++++++- include/linux/intel-iommu.h | 1 + 3 files changed, 70 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 5d9d9ff49334..ea8f4ef4e295 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -27,6 +27,63 @@ static DEFINE_SPINLOCK(pasid_lock); u32 intel_pasid_max_id = PASID_MAX; +int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid) +{ + unsigned long flags; + u8 status_code; + int ret = 0; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + *pasid = VCMD_VRSP_RESULT_PASID(res); + break; + case VCMD_VRSP_SC_NO_PASID_AVAIL: + pr_info("IOMMU: %s: No PASID available\n", iommu->name); + ret = -ENOSPC; + break; + default: + ret = -ENODEV; + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } + + return ret; +} + +void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid) +{ + unsigned long flags; + u8 status_code; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, + VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + break; + case VCMD_VRSP_SC_INVALID_PASID: + pr_info("IOMMU: %s: Invalid PASID\n", iommu->name); + break; + default: + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } +} + /* * Per device pasid table management: */ diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index ccd50c2ae75c..a41b09b3ffde 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -23,6 +23,16 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) +/* Virtual command interface for enlightened pasid management. */ +#define VCMD_CMD_ALLOC 0x1 +#define VCMD_CMD_FREE 0x2 +#define VCMD_VRSP_IP 0x1 +#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3) +#define VCMD_VRSP_SC_SUCCESS 0 +#define VCMD_VRSP_SC_NO_PASID_AVAIL 1 +#define VCMD_VRSP_SC_INVALID_PASID 1 +#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff) +#define VCMD_CMD_OPERAND(e) ((e) << 8) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. @@ -111,5 +121,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, int pasid); - +int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid); +void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid); #endif /* __INTEL_PASID_H */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a9c984b29a72..addb310b4ded 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -169,6 +169,7 @@ #define ecap_smpwc(e) (((e) >> 48) & 0x1) #define ecap_flts(e) (((e) >> 47) & 0x1) #define ecap_slts(e) (((e) >> 46) & 0x1) +#define ecap_vcs(e) (((e) >> 44) & 0x1) #define ecap_smts(e) (((e) >> 43) & 0x1) #define ecap_dit(e) ((e >> 41) & 0x1) #define ecap_pasid(e) ((e >> 40) & 0x1) -- cgit v1.2.3 From 3375303e82877552f3b2b42309e8233fe715fd9f Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:51 +0800 Subject: iommu/vt-d: Add custom allocator for IOASID When VT-d driver runs in the guest, PASID allocation must be performed via virtual command interface. This patch registers a custom IOASID allocator which takes precedence over the default XArray based allocator. The resulting IOASID allocation will always come from the host. This ensures that PASID namespace is system- wide. Virtual command registers are used in the guest only, to prevent vmexit cost, we cache the capability and store it during initialization. Signed-off-by: Liu, Yi L Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 1 + drivers/iommu/intel-iommu.c | 85 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 7 ++++ 3 files changed, 93 insertions(+) (limited to 'include') diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 34ee8f28555f..66af08ad10fb 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -963,6 +963,7 @@ static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) warn_invalid_dmar(phys_addr, " returns all ones"); goto unmap; } + iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG); /* the registers might be more than one page */ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 627bb5093317..80d0bd561bdd 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1726,6 +1726,9 @@ static void free_dmar_iommu(struct intel_iommu *iommu) if (ecap_prs(iommu->ecap)) intel_svm_finish_prq(iommu); } + if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap)) + ioasid_unregister_allocator(&iommu->pasid_allocator); + #endif } @@ -3038,6 +3041,85 @@ out_unmap: return ret; } +#ifdef CONFIG_INTEL_IOMMU_SVM +static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) +{ + struct intel_iommu *iommu = data; + ioasid_t ioasid; + + if (!iommu) + return INVALID_IOASID; + /* + * VT-d virtual command interface always uses the full 20 bit + * PASID range. Host can partition guest PASID range based on + * policies but it is out of guest's control. + */ + if (min < PASID_MIN || max > intel_pasid_max_id) + return INVALID_IOASID; + + if (vcmd_alloc_pasid(iommu, &ioasid)) + return INVALID_IOASID; + + return ioasid; +} + +static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) +{ + struct intel_iommu *iommu = data; + + if (!iommu) + return; + /* + * Sanity check the ioasid owner is done at upper layer, e.g. VFIO + * We can only free the PASID when all the devices are unbound. + */ + if (ioasid_find(NULL, ioasid, NULL)) { + pr_alert("Cannot free active IOASID %d\n", ioasid); + return; + } + vcmd_free_pasid(iommu, ioasid); +} + +static void register_pasid_allocator(struct intel_iommu *iommu) +{ + /* + * If we are running in the host, no need for custom allocator + * in that PASIDs are allocated from the host system-wide. + */ + if (!cap_caching_mode(iommu->cap)) + return; + + if (!sm_supported(iommu)) { + pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); + return; + } + + /* + * Register a custom PASID allocator if we are running in a guest, + * guest PASID must be obtained via virtual command interface. + * There can be multiple vIOMMUs in each guest but only one allocator + * is active. All vIOMMU allocators will eventually be calling the same + * host allocator. + */ + if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap)) + return; + + pr_info("Register custom PASID allocator\n"); + iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; + iommu->pasid_allocator.free = intel_vcmd_ioasid_free; + iommu->pasid_allocator.pdata = (void *)iommu; + if (ioasid_register_allocator(&iommu->pasid_allocator)) { + pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); + /* + * Disable scalable mode on this IOMMU if there + * is no custom allocator. Mixing SM capable vIOMMU + * and non-SM vIOMMU are not supported. + */ + intel_iommu_sm = 0; + } +} +#endif + static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; @@ -3155,6 +3237,9 @@ static int __init init_dmars(void) */ for_each_active_iommu(iommu, drhd) { iommu_flush_write_buffer(iommu); +#ifdef CONFIG_INTEL_IOMMU_SVM + register_pasid_allocator(iommu); +#endif iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index addb310b4ded..e14124f74b3a 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -195,6 +196,9 @@ #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) #define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ +/* Virtual command interface capability */ +#define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */ + /* IOTLB_REG */ #define DMA_TLB_FLUSH_GRANU_OFFSET 60 #define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) @@ -288,6 +292,7 @@ /* PRS_REG */ #define DMA_PRS_PPR ((u32)1) +#define DMA_VCS_PAS ((u64)1) #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ do { \ @@ -555,6 +560,7 @@ struct intel_iommu { u64 reg_size; /* size of hw register set */ u64 cap; u64 ecap; + u64 vccap; u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */ raw_spinlock_t register_lock; /* protect register handling */ int seq_id; /* sequence id of the iommu */ @@ -575,6 +581,7 @@ struct intel_iommu { #ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ + struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ #endif struct q_inval *qi; /* Queued invalidation info */ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ -- cgit v1.2.3 From e85bb99b79ca5ad2681612a7bb22f94cc2c71866 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:52 +0800 Subject: iommu/vt-d: Add get_domain_info() helper Add a get_domain_info() helper to retrieve the valid per-device iommu private data. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 40 +++++++++++++++++++++++++++------------- drivers/iommu/intel-pasid.c | 12 ++++++------ drivers/iommu/intel-svm.c | 2 +- include/linux/intel-iommu.h | 1 + 4 files changed, 35 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 80d0bd561bdd..a13b723ca38d 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -365,6 +365,21 @@ EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) +struct device_domain_info *get_domain_info(struct device *dev) +{ + struct device_domain_info *info; + + if (!dev) + return NULL; + + info = dev->archdata.iommu; + if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO || + info == DEFER_DEVICE_DOMAIN_INFO)) + return NULL; + + return info; +} + DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); @@ -2429,7 +2444,7 @@ struct dmar_domain *find_domain(struct device *dev) dev = &pci_real_dma_dev(to_pci_dev(dev))->dev; /* No lock here, assumes no domain exit in normal case */ - info = dev->archdata.iommu; + info = get_domain_info(dev); if (likely(info)) return info->domain; @@ -5012,9 +5027,8 @@ static void dmar_remove_one_dev_info(struct device *dev) unsigned long flags; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; - if (info && info != DEFER_DEVICE_DOMAIN_INFO - && info != DUMMY_DEVICE_DOMAIN_INFO) + info = get_domain_info(dev); + if (info) __dmar_remove_one_dev_info(info); spin_unlock_irqrestore(&device_domain_lock, flags); } @@ -5104,7 +5118,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) static inline bool is_aux_domain(struct device *dev, struct iommu_domain *domain) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); return info && info->auxd_enabled && domain->type == IOMMU_DOMAIN_UNMANAGED; @@ -5113,7 +5127,7 @@ is_aux_domain(struct device *dev, struct iommu_domain *domain) static void auxiliary_link_device(struct dmar_domain *domain, struct device *dev) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) @@ -5126,7 +5140,7 @@ static void auxiliary_link_device(struct dmar_domain *domain, static void auxiliary_unlink_device(struct dmar_domain *domain, struct device *dev) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) @@ -5214,7 +5228,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, return; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); iommu = info->iommu; auxiliary_unlink_device(domain, dev); @@ -5404,7 +5418,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, spin_lock_irqsave(&device_domain_lock, flags); spin_lock(&iommu->lock); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info) { ret = -EINVAL; goto out_unlock; @@ -5768,7 +5782,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) spin_lock(&iommu->lock); ret = -EINVAL; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_supported) goto out; @@ -5864,7 +5878,7 @@ static int intel_iommu_enable_auxd(struct device *dev) return -ENODEV; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); info->auxd_enabled = 1; spin_unlock_irqrestore(&device_domain_lock, flags); @@ -5877,7 +5891,7 @@ static int intel_iommu_disable_auxd(struct device *dev) unsigned long flags; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!WARN_ON(!info)) info->auxd_enabled = 0; spin_unlock_irqrestore(&device_domain_lock, flags); @@ -5954,7 +5968,7 @@ intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) static bool intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); if (feat == IOMMU_DEV_FEAT_AUX) return scalable_mode_support() && info && info->auxd_enabled; diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index ea8f4ef4e295..c46a068142b9 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -151,7 +151,7 @@ int intel_pasid_alloc_table(struct device *dev) int size; might_sleep(); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) return -EINVAL; @@ -198,7 +198,7 @@ void intel_pasid_free_table(struct device *dev) struct pasid_entry *table; int i, max_pde; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !dev_is_pci(dev) || !info->pasid_table) return; @@ -224,7 +224,7 @@ struct pasid_table *intel_pasid_get_table(struct device *dev) { struct device_domain_info *info; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info) return NULL; @@ -235,7 +235,7 @@ int intel_pasid_get_dev_max_id(struct device *dev) { struct device_domain_info *info; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_table) return 0; @@ -256,7 +256,7 @@ struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid) return NULL; dir = pasid_table->table; - info = dev->archdata.iommu; + info = get_domain_info(dev); dir_index = pasid >> PASID_PDE_SHIFT; index = pasid & PASID_PTE_MASK; @@ -462,7 +462,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, struct device_domain_info *info; u16 sid, qdep, pfsid; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->ats_enabled) return; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 7d3405c5a198..75a1ba4439f7 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -503,7 +503,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ goto out; } - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_supported) { kfree(sdev); goto out; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e14124f74b3a..caa179e806fc 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -714,6 +714,7 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info, void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); struct dmar_domain *find_domain(struct device *dev); +struct device_domain_info *get_domain_info(struct device *dev); #ifdef CONFIG_INTEL_IOMMU_SVM extern void intel_svm_check(struct intel_iommu *iommu); -- cgit v1.2.3 From 064a57d7ddfc46ada02b477b91c478001b03bfa3 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:54 +0800 Subject: iommu/vt-d: Replace intel SVM APIs with generic SVA APIs This patch is an initial step to replace Intel SVM code with the following IOMMU SVA ops: intel_svm_bind_mm() => iommu_sva_bind_device() intel_svm_unbind_mm() => iommu_sva_unbind_device() intel_svm_is_pasid_valid() => iommu_sva_get_pasid() The features below will continue to work but are not included in this patch in that they are handled mostly within the IOMMU subsystem. - IO page fault - mmu notifier Consolidation of the above will come after merging generic IOMMU sva code[1]. There should not be any changes needed for SVA users such as accelerator device drivers during this time. [1] http://jpbrucker.net/sva/ Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 3 ++ drivers/iommu/intel-svm.c | 124 ++++++++++++++++++++++++-------------------- include/linux/intel-iommu.h | 6 +++ include/linux/intel-svm.h | 86 ------------------------------ 4 files changed, 78 insertions(+), 141 deletions(-) (limited to 'include') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ed7de7420b3c..7d28ef2e6fe2 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -6071,6 +6071,9 @@ const struct iommu_ops intel_iommu_ops = { .cache_invalidate = intel_iommu_sva_invalidate, .sva_bind_gpasid = intel_svm_bind_gpasid, .sva_unbind_gpasid = intel_svm_unbind_gpasid, + .sva_bind = intel_svm_bind, + .sva_unbind = intel_svm_unbind, + .sva_get_pasid = intel_svm_get_pasid, #endif }; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 75a1ba4439f7..8b66bf45477e 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -426,13 +426,15 @@ out: return ret; } -int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) +/* Caller must hold pasid_mutex, mm reference */ +static int +intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, + struct mm_struct *mm, struct intel_svm_dev **sd) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); struct device_domain_info *info; struct intel_svm_dev *sdev; struct intel_svm *svm = NULL; - struct mm_struct *mm = NULL; int pasid_max; int ret; @@ -449,16 +451,15 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } else pasid_max = 1 << 20; + /* Bind supervisor PASID shuld have mm = NULL */ if (flags & SVM_FLAG_SUPERVISOR_MODE) { - if (!ecap_srs(iommu->ecap)) + if (!ecap_srs(iommu->ecap) || mm) { + pr_err("Supervisor PASID with user provided mm.\n"); return -EINVAL; - } else if (pasid) { - mm = get_task_mm(current); - BUG_ON(!mm); + } } - mutex_lock(&pasid_mutex); - if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) { + if (!(flags & SVM_FLAG_PRIVATE_PASID)) { struct intel_svm *t; list_for_each_entry(t, &global_svm_list, list) { @@ -496,9 +497,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ sdev->dev = dev; ret = intel_iommu_enable_pasid(iommu, dev); - if (ret || !pasid) { - /* If they don't actually want to assign a PASID, this is - * just an enabling check/preparation. */ + if (ret) { kfree(sdev); goto out; } @@ -597,18 +596,17 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } } list_add_rcu(&sdev->list, &svm->devs); - - success: - *pasid = svm->pasid; +success: + sdev->pasid = svm->pasid; + sdev->sva.dev = dev; + if (sd) + *sd = sdev; ret = 0; out: - mutex_unlock(&pasid_mutex); - if (mm) - mmput(mm); return ret; } -EXPORT_SYMBOL_GPL(intel_svm_bind_mm); +/* Caller must hold pasid_mutex */ int intel_svm_unbind_mm(struct device *dev, int pasid) { struct intel_svm_dev *sdev; @@ -616,7 +614,6 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) struct intel_svm *svm; int ret = -EINVAL; - mutex_lock(&pasid_mutex); iommu = intel_svm_device_to_iommu(dev); if (!iommu) goto out; @@ -662,45 +659,9 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) break; } out: - mutex_unlock(&pasid_mutex); return ret; } -EXPORT_SYMBOL_GPL(intel_svm_unbind_mm); - -int intel_svm_is_pasid_valid(struct device *dev, int pasid) -{ - struct intel_iommu *iommu; - struct intel_svm *svm; - int ret = -EINVAL; - - mutex_lock(&pasid_mutex); - iommu = intel_svm_device_to_iommu(dev); - if (!iommu) - goto out; - - svm = ioasid_find(NULL, pasid, NULL); - if (!svm) - goto out; - - if (IS_ERR(svm)) { - ret = PTR_ERR(svm); - goto out; - } - /* init_mm is used in this case */ - if (!svm->mm) - ret = 1; - else if (atomic_read(&svm->mm->mm_users) > 0) - ret = 1; - else - ret = 0; - - out: - mutex_unlock(&pasid_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid); /* Page request queue descriptor */ struct page_req_dsc { @@ -894,3 +855,56 @@ static irqreturn_t prq_event_thread(int irq, void *d) return IRQ_RETVAL(handled); } + +#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) +struct iommu_sva * +intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +{ + struct iommu_sva *sva = ERR_PTR(-EINVAL); + struct intel_svm_dev *sdev = NULL; + int flags = 0; + int ret; + + /* + * TODO: Consolidate with generic iommu-sva bind after it is merged. + * It will require shared SVM data structures, i.e. combine io_mm + * and intel_svm etc. + */ + if (drvdata) + flags = *(int *)drvdata; + mutex_lock(&pasid_mutex); + ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev); + if (ret) + sva = ERR_PTR(ret); + else if (sdev) + sva = &sdev->sva; + else + WARN(!sdev, "SVM bind succeeded with no sdev!\n"); + + mutex_unlock(&pasid_mutex); + + return sva; +} + +void intel_svm_unbind(struct iommu_sva *sva) +{ + struct intel_svm_dev *sdev; + + mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); + intel_svm_unbind_mm(sdev->dev, sdev->pasid); + mutex_unlock(&pasid_mutex); +} + +int intel_svm_get_pasid(struct iommu_sva *sva) +{ + struct intel_svm_dev *sdev; + int pasid; + + mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); + pasid = sdev->pasid; + mutex_unlock(&pasid_mutex); + + return pasid; +} diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index caa179e806fc..42245e1e1b48 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -723,6 +723,10 @@ extern int intel_svm_finish_prq(struct intel_iommu *iommu); int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct iommu_gpasid_bind_data *data); int intel_svm_unbind_gpasid(struct device *dev, int pasid); +struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, + void *drvdata); +void intel_svm_unbind(struct iommu_sva *handle); +int intel_svm_get_pasid(struct iommu_sva *handle); struct svm_dev_ops; struct intel_svm_dev { @@ -730,6 +734,8 @@ struct intel_svm_dev { struct rcu_head rcu; struct device *dev; struct svm_dev_ops *ops; + struct iommu_sva sva; + int pasid; int users; u16 did; u16 dev_iotlb:1; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 1b47ca46373e..c9e7e601950d 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -21,7 +21,6 @@ struct svm_dev_ops { #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) - /* * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main" * PASID for the current process. Even if a PASID already exists, a new one @@ -57,89 +56,4 @@ struct svm_dev_ops { */ #define SVM_FLAG_GUEST_PASID (1<<3) -#ifdef CONFIG_INTEL_IOMMU_SVM - -/** - * intel_svm_bind_mm() - Bind the current process to a PASID - * @dev: Device to be granted access - * @pasid: Address for allocated PASID - * @flags: Flags. Later for requesting supervisor mode, etc. - * @ops: Callbacks to device driver - * - * This function attempts to enable PASID support for the given device. - * If the @pasid argument is non-%NULL, a PASID is allocated for access - * to the MM of the current process. - * - * By using a %NULL value for the @pasid argument, this function can - * be used to simply validate that PASID support is available for the - * given device — i.e. that it is behind an IOMMU which has the - * requisite support, and is enabled. - * - * Page faults are handled transparently by the IOMMU code, and there - * should be no need for the device driver to be involved. If a page - * fault cannot be handled (i.e. is an invalid address rather than - * just needs paging in), then the page request will be completed by - * the core IOMMU code with appropriate status, and the device itself - * can then report the resulting fault to its driver via whatever - * mechanism is appropriate. - * - * Multiple calls from the same process may result in the same PASID - * being re-used. A reference count is kept. - */ -extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, - struct svm_dev_ops *ops); - -/** - * intel_svm_unbind_mm() - Unbind a specified PASID - * @dev: Device for which PASID was allocated - * @pasid: PASID value to be unbound - * - * This function allows a PASID to be retired when the device no - * longer requires access to the address space of a given process. - * - * If the use count for the PASID in question reaches zero, the - * PASID is revoked and may no longer be used by hardware. - * - * Device drivers are required to ensure that no access (including - * page requests) is currently outstanding for the PASID in question, - * before calling this function. - */ -extern int intel_svm_unbind_mm(struct device *dev, int pasid); - -/** - * intel_svm_is_pasid_valid() - check if pasid is valid - * @dev: Device for which PASID was allocated - * @pasid: PASID value to be checked - * - * This function checks if the specified pasid is still valid. A - * valid pasid means the backing mm is still having a valid user. - * For kernel callers init_mm is always valid. for other mm, if mm->mm_users - * is non-zero, it is valid. - * - * returns -EINVAL if invalid pasid, 0 if pasid ref count is invalid - * 1 if pasid is valid. - */ -extern int intel_svm_is_pasid_valid(struct device *dev, int pasid); - -#else /* CONFIG_INTEL_IOMMU_SVM */ - -static inline int intel_svm_bind_mm(struct device *dev, int *pasid, - int flags, struct svm_dev_ops *ops) -{ - return -ENOSYS; -} - -static inline int intel_svm_unbind_mm(struct device *dev, int pasid) -{ - BUG(); -} - -static inline int intel_svm_is_pasid_valid(struct device *dev, int pasid) -{ - return -EINVAL; -} -#endif /* CONFIG_INTEL_IOMMU_SVM */ - -#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0, NULL)) - #endif /* __INTEL_SVM_H__ */ -- cgit v1.2.3 From 8a1d824625402b3ef3c3e5965663354ff0394d86 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:55 +0800 Subject: iommu/vt-d: Multiple descriptors per qi_submit_sync() Current qi_submit_sync() only supports single invalidation descriptor per submission and appends wait descriptor after each submission to poll the hardware completion. This extends the qi_submit_sync() helper to support multiple descriptors, and add an option so that the caller could specify the Page-request Drain (PD) bit in the wait descriptor. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 63 ++++++++++++++++++++++--------------- drivers/iommu/intel-pasid.c | 4 +-- drivers/iommu/intel-svm.c | 6 ++-- drivers/iommu/intel_irq_remapping.c | 2 +- include/linux/intel-iommu.h | 9 +++++- 5 files changed, 52 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 66af08ad10fb..60a2970c37ff 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1157,12 +1157,11 @@ static inline void reclaim_free_desc(struct q_inval *qi) } } -static int qi_check_fault(struct intel_iommu *iommu, int index) +static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) { u32 fault; int head, tail; struct q_inval *qi = iommu->qi; - int wait_index = (index + 1) % QI_LENGTH; int shift = qi_shift(iommu); if (qi->desc_status[wait_index] == QI_ABORT) @@ -1225,17 +1224,21 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) } /* - * Submit the queued invalidation descriptor to the remapping - * hardware unit and wait for its completion. + * Function to submit invalidation descriptors of all types to the queued + * invalidation interface(QI). Multiple descriptors can be submitted at a + * time, a wait descriptor will be appended to each submission to ensure + * hardware has completed the invalidation before return. Wait descriptors + * can be part of the submission but it will not be polled for completion. */ -int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options) { - int rc; struct q_inval *qi = iommu->qi; - int offset, shift, length; struct qi_desc wait_desc; int wait_index, index; unsigned long flags; + int offset, shift; + int rc, i; if (!qi) return 0; @@ -1244,32 +1247,41 @@ restart: rc = 0; raw_spin_lock_irqsave(&qi->q_lock, flags); - while (qi->free_cnt < 3) { + /* + * Check if we have enough empty slots in the queue to submit, + * the calculation is based on: + * # of desc + 1 wait desc + 1 space between head and tail + */ + while (qi->free_cnt < count + 2) { raw_spin_unlock_irqrestore(&qi->q_lock, flags); cpu_relax(); raw_spin_lock_irqsave(&qi->q_lock, flags); } index = qi->free_head; - wait_index = (index + 1) % QI_LENGTH; + wait_index = (index + count) % QI_LENGTH; shift = qi_shift(iommu); - length = 1 << shift; - qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE; + for (i = 0; i < count; i++) { + offset = ((index + i) % QI_LENGTH) << shift; + memcpy(qi->desc + offset, &desc[i], 1 << shift); + qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE; + } + qi->desc_status[wait_index] = QI_IN_USE; - offset = index << shift; - memcpy(qi->desc + offset, desc, length); wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) | QI_IWD_STATUS_WRITE | QI_IWD_TYPE; + if (options & QI_OPT_WAIT_DRAIN) + wait_desc.qw0 |= QI_IWD_PRQ_DRAIN; wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]); wait_desc.qw2 = 0; wait_desc.qw3 = 0; offset = wait_index << shift; - memcpy(qi->desc + offset, &wait_desc, length); + memcpy(qi->desc + offset, &wait_desc, 1 << shift); - qi->free_head = (qi->free_head + 2) % QI_LENGTH; - qi->free_cnt -= 2; + qi->free_head = (qi->free_head + count + 1) % QI_LENGTH; + qi->free_cnt -= count + 1; /* * update the HW tail register indicating the presence of @@ -1285,7 +1297,7 @@ restart: * a deadlock where the interrupt context can wait indefinitely * for free slots in the queue. */ - rc = qi_check_fault(iommu, index); + rc = qi_check_fault(iommu, index, wait_index); if (rc) break; @@ -1294,7 +1306,8 @@ restart: raw_spin_lock(&qi->q_lock); } - qi->desc_status[index] = QI_DONE; + for (i = 0; i < count; i++) + qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE; reclaim_free_desc(qi); raw_spin_unlock_irqrestore(&qi->q_lock, flags); @@ -1318,7 +1331,7 @@ void qi_global_iec(struct intel_iommu *iommu) desc.qw3 = 0; /* should never fail */ - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, @@ -1332,7 +1345,7 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, @@ -1356,7 +1369,7 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, @@ -1378,7 +1391,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* PASID-based IOTLB invalidation */ @@ -1419,7 +1432,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, QI_EIOTLB_AM(mask); } - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* PASID-based device IOTLB Invalidate */ @@ -1448,7 +1461,7 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (size_order) desc.qw1 |= QI_DEV_EIOTLB_SIZE; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, @@ -1458,7 +1471,7 @@ void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_TYPE; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index c46a068142b9..45e9b5b291bc 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -438,7 +438,7 @@ pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } static void @@ -452,7 +452,7 @@ iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } static void diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 8b66bf45477e..5133b2d4428f 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -138,7 +138,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, svm->iommu); + qi_submit_sync(svm->iommu, &desc, 1, 0); if (sdev->dev_iotlb) { desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) | @@ -162,7 +162,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, svm->iommu); + qi_submit_sync(svm->iommu, &desc, 1, 0); } } @@ -846,7 +846,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) sizeof(req->priv_data)); resp.qw2 = 0; resp.qw3 = 0; - qi_submit_sync(&resp, iommu); + qi_submit_sync(iommu, &resp, 1, 0); } head = (head + sizeof(*req)) & PRQ_RING_MASK; } diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 81e43c1df7ec..a042f123b091 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -151,7 +151,7 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) desc.qw2 = 0; desc.qw3 = 0; - return qi_submit_sync(&desc, iommu); + return qi_submit_sync(iommu, &desc, 1, 0); } static int modify_irte(struct irq_2_iommu *irq_iommu, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 42245e1e1b48..677dee59e3c0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -333,6 +333,7 @@ enum { #define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) #define QI_IWD_STATUS_WRITE (((u64)1) << 5) +#define QI_IWD_PRQ_DRAIN (((u64)1) << 7) #define QI_IOTLB_DID(did) (((u64)did) << 16) #define QI_IOTLB_DR(dr) (((u64)dr) << 7) @@ -702,7 +703,13 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, int pasid); -extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options); +/* + * Options used in qi_submit_sync: + * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. + */ +#define QI_OPT_WAIT_DRAIN BIT(0) extern int dmar_ir_support(void); -- cgit v1.2.3 From 66ac4db36f4c12a3b02db864ccb0801cd938b6de Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:58 +0800 Subject: iommu/vt-d: Add page request draining support When a PASID is stopped or terminated, there can be pending PRQs (requests that haven't received responses) in remapping hardware. This adds the interface to drain page requests and call it when a PASID is terminated. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-16-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-svm.c | 107 +++++++++++++++++++++++++++++++++++++++++--- include/linux/intel-iommu.h | 4 ++ 2 files changed, 106 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 960a3610e852..5ab71107afd5 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -23,6 +23,7 @@ #include "intel-pasid.h" static irqreturn_t prq_event_thread(int irq, void *d); +static void intel_svm_drain_prq(struct device *dev, int pasid); #define PRQ_ORDER 0 @@ -66,6 +67,8 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + init_completion(&iommu->prq_complete); + return 0; } @@ -399,12 +402,8 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) list_del_rcu(&sdev->list); intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); + intel_svm_drain_prq(dev, svm->pasid); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); - /* TODO: Drain in flight PRQ for the PASID since it - * may get reused soon, we don't want to - * confuse with its previous life. - * intel_svm_drain_prq(dev, pasid); - */ kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { @@ -643,6 +642,7 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) * hard to be as defensive as we might like. */ intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); + intel_svm_drain_prq(dev, svm->pasid); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); kfree_rcu(sdev, rcu); @@ -721,6 +721,93 @@ static bool is_canonical_address(u64 addr) return (((saddr << shift) >> shift) == saddr); } +/** + * intel_svm_drain_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. + * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +static void intel_svm_drain_prq(struct device *dev, int pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + struct pci_dev *pdev; + int head, tail; + u16 sid, did; + int qdep; + + info = get_domain_info(dev); + if (WARN_ON(!info || !dev_is_pci(dev))) + return; + + if (!info->pri_enabled) + return; + + iommu = info->iommu; + domain = info->domain; + pdev = to_pci_dev(dev); + sid = PCI_DEVID(info->bus, info->devfn); + did = domain->iommu_did[iommu->seq_id]; + qdep = pci_ats_queue_depth(pdev); + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. + */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (!req->pasid_present || req->pasid != pasid) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. + */ + memset(desc, 0, sizeof(desc)); + desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_FENCE | + QI_IWD_TYPE; + desc[1].qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | + QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | + QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(info->pfsid); +qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +} + static irqreturn_t prq_event_thread(int irq, void *d) { struct intel_iommu *iommu = d; @@ -856,6 +943,16 @@ static irqreturn_t prq_event_thread(int irq, void *d) dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. + */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + + if (!completion_done(&iommu->prq_complete)) + complete(&iommu->prq_complete); + return IRQ_RETVAL(handled); } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 677dee59e3c0..21633cee6331 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -292,6 +292,8 @@ /* PRS_REG */ #define DMA_PRS_PPR ((u32)1) +#define DMA_PRS_PRO ((u32)2) + #define DMA_VCS_PAS ((u64)1) #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ @@ -333,6 +335,7 @@ enum { #define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) #define QI_IWD_STATUS_WRITE (((u64)1) << 5) +#define QI_IWD_FENCE (((u64)1) << 6) #define QI_IWD_PRQ_DRAIN (((u64)1) << 7) #define QI_IOTLB_DID(did) (((u64)did) << 16) @@ -582,6 +585,7 @@ struct intel_iommu { #ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ + struct completion prq_complete; struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ #endif struct q_inval *qi; /* Queued invalidation info */ -- cgit v1.2.3 From 521376741b2c26fe53a1ec24d02da24d477eb739 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 20 May 2020 17:22:00 +0200 Subject: PCI/ATS: Only enable ATS for trusted devices Add pci_ats_supported(), which checks whether a device has an ATS capability, and whether it is trusted. A device is untrusted if it is plugged into an external-facing port such as Thunderbolt and could be spoofing an existing device to exploit weaknesses in the IOMMU configuration. PCIe ATS is one such weaknesses since it allows endpoints to cache IOMMU translations and emit transactions with 'Translated' Address Type (10b) that partially bypass the IOMMU translation. The SMMUv3 and VT-d IOMMU drivers already disallow ATS and transactions with 'Translated' Address Type for untrusted devices. Add the check to pci_enable_ats() to let other drivers (AMD IOMMU for now) benefit from it. By checking ats_cap, the pci_ats_supported() helper also returns whether ATS was globally disabled with pci=noats, and could later include more things, for example whether the whole PCIe hierarchy down to the endpoint supports ATS. Signed-off-by: Jean-Philippe Brucker Reviewed-by: Joerg Roedel Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20200520152201.3309416-2-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/pci/ats.c | 18 +++++++++++++++++- include/linux/pci-ats.h | 3 +++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 390e92f2d8d1..b761c1f72f67 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -30,6 +30,22 @@ void pci_ats_init(struct pci_dev *dev) dev->ats_cap = pos; } +/** + * pci_ats_supported - check if the device can use ATS + * @dev: the PCI device + * + * Returns true if the device supports ATS and is allowed to use it, false + * otherwise. + */ +bool pci_ats_supported(struct pci_dev *dev) +{ + if (!dev->ats_cap) + return false; + + return (dev->untrusted == 0); +} +EXPORT_SYMBOL_GPL(pci_ats_supported); + /** * pci_enable_ats - enable the ATS capability * @dev: the PCI device @@ -42,7 +58,7 @@ int pci_enable_ats(struct pci_dev *dev, int ps) u16 ctrl; struct pci_dev *pdev; - if (!dev->ats_cap) + if (!pci_ats_supported(dev)) return -EINVAL; if (WARN_ON(dev->ats_enabled)) diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index d08f0869f121..f75c307f346d 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -6,11 +6,14 @@ #ifdef CONFIG_PCI_ATS /* Address Translation Service */ +bool pci_ats_supported(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); int pci_ats_page_aligned(struct pci_dev *dev); #else /* CONFIG_PCI_ATS */ +static inline bool pci_ats_supported(struct pci_dev *d) +{ return false; } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } -- cgit v1.2.3 From fb01562e5a8a731bb1807eba0a9fadb355ca2277 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 23 Apr 2020 14:53:28 +0200 Subject: uacce: Remove mm_exit() op The mm_exit() op will be removed from the SVA API. When a process dies and its mm goes away, the IOMMU driver won't notify device drivers anymore. Drivers should expect to handle a lot more aborted DMA. On the upside, it does greatly simplify the queue management. The uacce_mm struct, that tracks all queues bound to an mm, was only used by the mm_exit() callback. Remove it. Signed-off-by: Jean-Philippe Brucker Acked-by: Jacob Pan Acked-by: Lu Baolu Acked-by: Zhangfei Gao Link: https://lore.kernel.org/r/20200423125329.782066-2-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/misc/uacce/uacce.c | 172 +++++++++++---------------------------------- include/linux/uacce.h | 34 +++------ 2 files changed, 51 insertions(+), 155 deletions(-) (limited to 'include') diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index d39307f060bd..107028e77ca3 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -90,109 +90,39 @@ static long uacce_fops_compat_ioctl(struct file *filep, } #endif -static int uacce_sva_exit(struct device *dev, struct iommu_sva *handle, - void *data) +static int uacce_bind_queue(struct uacce_device *uacce, struct uacce_queue *q) { - struct uacce_mm *uacce_mm = data; - struct uacce_queue *q; - - /* - * No new queue can be added concurrently because no caller can have a - * reference to this mm. But there may be concurrent calls to - * uacce_mm_put(), so we need the lock. - */ - mutex_lock(&uacce_mm->lock); - list_for_each_entry(q, &uacce_mm->queues, list) - uacce_put_queue(q); - uacce_mm->mm = NULL; - mutex_unlock(&uacce_mm->lock); + int pasid; + struct iommu_sva *handle; - return 0; -} - -static struct iommu_sva_ops uacce_sva_ops = { - .mm_exit = uacce_sva_exit, -}; - -static struct uacce_mm *uacce_mm_get(struct uacce_device *uacce, - struct uacce_queue *q, - struct mm_struct *mm) -{ - struct uacce_mm *uacce_mm = NULL; - struct iommu_sva *handle = NULL; - int ret; - - lockdep_assert_held(&uacce->mm_lock); - - list_for_each_entry(uacce_mm, &uacce->mm_list, list) { - if (uacce_mm->mm == mm) { - mutex_lock(&uacce_mm->lock); - list_add(&q->list, &uacce_mm->queues); - mutex_unlock(&uacce_mm->lock); - return uacce_mm; - } - } - - uacce_mm = kzalloc(sizeof(*uacce_mm), GFP_KERNEL); - if (!uacce_mm) - return NULL; + if (!(uacce->flags & UACCE_DEV_SVA)) + return 0; - if (uacce->flags & UACCE_DEV_SVA) { - /* - * Safe to pass an incomplete uacce_mm, since mm_exit cannot - * fire while we hold a reference to the mm. - */ - handle = iommu_sva_bind_device(uacce->parent, mm, uacce_mm); - if (IS_ERR(handle)) - goto err_free; + handle = iommu_sva_bind_device(uacce->parent, current->mm, NULL); + if (IS_ERR(handle)) + return PTR_ERR(handle); - ret = iommu_sva_set_ops(handle, &uacce_sva_ops); - if (ret) - goto err_unbind; - - uacce_mm->pasid = iommu_sva_get_pasid(handle); - if (uacce_mm->pasid == IOMMU_PASID_INVALID) - goto err_unbind; + pasid = iommu_sva_get_pasid(handle); + if (pasid == IOMMU_PASID_INVALID) { + iommu_sva_unbind_device(handle); + return -ENODEV; } - uacce_mm->mm = mm; - uacce_mm->handle = handle; - INIT_LIST_HEAD(&uacce_mm->queues); - mutex_init(&uacce_mm->lock); - list_add(&q->list, &uacce_mm->queues); - list_add(&uacce_mm->list, &uacce->mm_list); - - return uacce_mm; - -err_unbind: - if (handle) - iommu_sva_unbind_device(handle); -err_free: - kfree(uacce_mm); - return NULL; + q->handle = handle; + q->pasid = pasid; + return 0; } -static void uacce_mm_put(struct uacce_queue *q) +static void uacce_unbind_queue(struct uacce_queue *q) { - struct uacce_mm *uacce_mm = q->uacce_mm; - - lockdep_assert_held(&q->uacce->mm_lock); - - mutex_lock(&uacce_mm->lock); - list_del(&q->list); - mutex_unlock(&uacce_mm->lock); - - if (list_empty(&uacce_mm->queues)) { - if (uacce_mm->handle) - iommu_sva_unbind_device(uacce_mm->handle); - list_del(&uacce_mm->list); - kfree(uacce_mm); - } + if (!q->handle) + return; + iommu_sva_unbind_device(q->handle); + q->handle = NULL; } static int uacce_fops_open(struct inode *inode, struct file *filep) { - struct uacce_mm *uacce_mm = NULL; struct uacce_device *uacce; struct uacce_queue *q; int ret = 0; @@ -205,21 +135,16 @@ static int uacce_fops_open(struct inode *inode, struct file *filep) if (!q) return -ENOMEM; - mutex_lock(&uacce->mm_lock); - uacce_mm = uacce_mm_get(uacce, q, current->mm); - mutex_unlock(&uacce->mm_lock); - if (!uacce_mm) { - ret = -ENOMEM; + ret = uacce_bind_queue(uacce, q); + if (ret) goto out_with_mem; - } q->uacce = uacce; - q->uacce_mm = uacce_mm; if (uacce->ops->get_queue) { - ret = uacce->ops->get_queue(uacce, uacce_mm->pasid, q); + ret = uacce->ops->get_queue(uacce, q->pasid, q); if (ret < 0) - goto out_with_mm; + goto out_with_bond; } init_waitqueue_head(&q->wait); @@ -227,12 +152,14 @@ static int uacce_fops_open(struct inode *inode, struct file *filep) uacce->inode = inode; q->state = UACCE_Q_INIT; + mutex_lock(&uacce->queues_lock); + list_add(&q->list, &uacce->queues); + mutex_unlock(&uacce->queues_lock); + return 0; -out_with_mm: - mutex_lock(&uacce->mm_lock); - uacce_mm_put(q); - mutex_unlock(&uacce->mm_lock); +out_with_bond: + uacce_unbind_queue(q); out_with_mem: kfree(q); return ret; @@ -241,14 +168,12 @@ out_with_mem: static int uacce_fops_release(struct inode *inode, struct file *filep) { struct uacce_queue *q = filep->private_data; - struct uacce_device *uacce = q->uacce; + mutex_lock(&q->uacce->queues_lock); + list_del(&q->list); + mutex_unlock(&q->uacce->queues_lock); uacce_put_queue(q); - - mutex_lock(&uacce->mm_lock); - uacce_mm_put(q); - mutex_unlock(&uacce->mm_lock); - + uacce_unbind_queue(q); kfree(q); return 0; @@ -513,8 +438,8 @@ struct uacce_device *uacce_alloc(struct device *parent, if (ret < 0) goto err_with_uacce; - INIT_LIST_HEAD(&uacce->mm_list); - mutex_init(&uacce->mm_lock); + INIT_LIST_HEAD(&uacce->queues); + mutex_init(&uacce->queues_lock); device_initialize(&uacce->dev); uacce->dev.devt = MKDEV(MAJOR(uacce_devt), uacce->dev_id); uacce->dev.class = uacce_class; @@ -561,8 +486,7 @@ EXPORT_SYMBOL_GPL(uacce_register); */ void uacce_remove(struct uacce_device *uacce) { - struct uacce_mm *uacce_mm; - struct uacce_queue *q; + struct uacce_queue *q, *next_q; if (!uacce) return; @@ -574,24 +498,12 @@ void uacce_remove(struct uacce_device *uacce) unmap_mapping_range(uacce->inode->i_mapping, 0, 0, 1); /* ensure no open queue remains */ - mutex_lock(&uacce->mm_lock); - list_for_each_entry(uacce_mm, &uacce->mm_list, list) { - /* - * We don't take the uacce_mm->lock here. Since we hold the - * device's mm_lock, no queue can be added to or removed from - * this uacce_mm. We may run concurrently with mm_exit, but - * uacce_put_queue() is serialized and iommu_sva_unbind_device() - * waits for the lock that mm_exit is holding. - */ - list_for_each_entry(q, &uacce_mm->queues, list) - uacce_put_queue(q); - - if (uacce->flags & UACCE_DEV_SVA) { - iommu_sva_unbind_device(uacce_mm->handle); - uacce_mm->handle = NULL; - } + mutex_lock(&uacce->queues_lock); + list_for_each_entry_safe(q, next_q, &uacce->queues, list) { + uacce_put_queue(q); + uacce_unbind_queue(q); } - mutex_unlock(&uacce->mm_lock); + mutex_unlock(&uacce->queues_lock); /* disable sva now since no opened queues */ if (uacce->flags & UACCE_DEV_SVA) diff --git a/include/linux/uacce.h b/include/linux/uacce.h index 0e215e6d0534..454c2f6672d7 100644 --- a/include/linux/uacce.h +++ b/include/linux/uacce.h @@ -68,19 +68,21 @@ enum uacce_q_state { * @uacce: pointer to uacce * @priv: private pointer * @wait: wait queue head - * @list: index into uacce_mm - * @uacce_mm: the corresponding mm + * @list: index into uacce queues list * @qfrs: pointer of qfr regions * @state: queue state machine + * @pasid: pasid associated to the mm + * @handle: iommu_sva handle returned by iommu_sva_bind_device() */ struct uacce_queue { struct uacce_device *uacce; void *priv; wait_queue_head_t wait; struct list_head list; - struct uacce_mm *uacce_mm; struct uacce_qfile_region *qfrs[UACCE_MAX_REGION]; enum uacce_q_state state; + int pasid; + struct iommu_sva *handle; }; /** @@ -96,8 +98,8 @@ struct uacce_queue { * @cdev: cdev of the uacce * @dev: dev of the uacce * @priv: private pointer of the uacce - * @mm_list: list head of uacce_mm->list - * @mm_lock: lock for mm_list + * @queues: list of queues + * @queues_lock: lock for queues list * @inode: core vfs */ struct uacce_device { @@ -112,27 +114,9 @@ struct uacce_device { struct cdev *cdev; struct device dev; void *priv; - struct list_head mm_list; - struct mutex mm_lock; - struct inode *inode; -}; - -/** - * struct uacce_mm - keep track of queues bound to a process - * @list: index into uacce_device - * @queues: list of queues - * @mm: the mm struct - * @lock: protects the list of queues - * @pasid: pasid of the uacce_mm - * @handle: iommu_sva handle return from iommu_sva_bind_device - */ -struct uacce_mm { - struct list_head list; struct list_head queues; - struct mm_struct *mm; - struct mutex lock; - int pasid; - struct iommu_sva *handle; + struct mutex queues_lock; + struct inode *inode; }; #if IS_ENABLED(CONFIG_UACCE) -- cgit v1.2.3 From edcc40d2ab5f47f205c2dd2a9aeedd8c77de050a Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 23 Apr 2020 14:53:30 +0200 Subject: iommu: Remove iommu_sva_ops::mm_exit() After binding a device to an mm, device drivers currently need to register a mm_exit handler. This function is called when the mm exits, to gracefully stop DMA targeting the address space and flush page faults to the IOMMU. This is deemed too complex for the MMU release() notifier, which may be triggered by any mmput() invocation, from about 120 callsites [1]. The upcoming SVA module has an example of such complexity: the I/O Page Fault handler would need to call mmput_async() instead of mmput() after handling an IOPF, to avoid triggering the release() notifier which would in turn drain the IOPF queue and lock up. Another concern is the DMA stop function taking too long, up to several minutes [2]. For some mmput() callers this may disturb other users. For example, if the OOM killer picks the mm bound to a device as the victim and that mm's memory is locked, if the release() takes too long, it might choose additional innocent victims to kill. To simplify the MMU release notifier, don't forward the notification to device drivers. Since they don't stop DMA on mm exit anymore, the PASID lifetime is extended: (1) The device driver calls bind(). A PASID is allocated. Here any DMA fault is handled by mm, and on error we don't print anything to dmesg. Userspace can easily trigger errors by issuing DMA on unmapped buffers. (2) exit_mmap(), for example the process took a SIGKILL. This step doesn't happen during normal operations. Remove the pgd from the PASID table, since the page tables are about to be freed. Invalidate the IOTLBs. Here the device may still perform DMA on the address space. Incoming transactions are aborted but faults aren't printed out. ATS Translation Requests return Successful Translation Completions with R=W=0. PRI Page Requests return with Invalid Request. (3) The device driver stops DMA, possibly following release of a fd, and calls unbind(). PASID table is cleared, IOTLB invalidated if necessary. The page fault queues are drained, and the PASID is freed. If DMA for that PASID is still running here, something went seriously wrong and errors should be reported. For now remove iommu_sva_ops entirely. We might need to re-introduce them at some point, for example to notify device drivers of unhandled IOPF. [1] https://lore.kernel.org/linux-iommu/20200306174239.GM31668@ziepe.ca/ [2] https://lore.kernel.org/linux-iommu/4d68da96-0ad5-b412-5987-2f7a6aa796c3@amd.com/ Signed-off-by: Jean-Philippe Brucker Acked-by: Jacob Pan Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200423125329.782066-3-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 11 ----------- include/linux/iommu.h | 30 ------------------------------ 2 files changed, 41 deletions(-) (limited to 'include') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 298397721144..abcd19118169 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2883,17 +2883,6 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) } EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); -int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *sva_ops) -{ - if (handle->ops && handle->ops != sva_ops) - return -EEXIST; - - handle->ops = sva_ops; - return 0; -} -EXPORT_SYMBOL_GPL(iommu_sva_set_ops); - int iommu_sva_get_pasid(struct iommu_sva *handle) { const struct iommu_ops *ops = handle->dev->bus->iommu_ops; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7cfd2dddb49d..08d3506172a1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -53,8 +53,6 @@ struct iommu_fault_event; typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); -typedef int (*iommu_mm_exit_handler_t)(struct device *dev, struct iommu_sva *, - void *); typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault *, void *); struct iommu_domain_geometry { @@ -171,25 +169,6 @@ enum iommu_dev_features { #define IOMMU_PASID_INVALID (-1U) -/** - * struct iommu_sva_ops - device driver callbacks for an SVA context - * - * @mm_exit: called when the mm is about to be torn down by exit_mmap. After - * @mm_exit returns, the device must not issue any more transaction - * with the PASID given as argument. - * - * The @mm_exit handler is allowed to sleep. Be careful about the - * locks taken in @mm_exit, because they might lead to deadlocks if - * they are also held when dropping references to the mm. Consider the - * following call chain: - * mutex_lock(A); mmput(mm) -> exit_mm() -> @mm_exit() -> mutex_lock(A) - * Using mmput_async() prevents this scenario. - * - */ -struct iommu_sva_ops { - iommu_mm_exit_handler_t mm_exit; -}; - #ifdef CONFIG_IOMMU_API /** @@ -616,7 +595,6 @@ struct iommu_fwspec { */ struct iommu_sva { struct device *dev; - const struct iommu_sva_ops *ops; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, @@ -664,8 +642,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata); void iommu_sva_unbind_device(struct iommu_sva *handle); -int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *ops); int iommu_sva_get_pasid(struct iommu_sva *handle); #else /* CONFIG_IOMMU_API */ @@ -1069,12 +1045,6 @@ static inline void iommu_sva_unbind_device(struct iommu_sva *handle) { } -static inline int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *ops) -{ - return -EINVAL; -} - static inline int iommu_sva_get_pasid(struct iommu_sva *handle) { return IOMMU_PASID_INVALID; -- cgit v1.2.3 From 4fda230ecddc2573ed88632e98b69b0b9b68c0ad Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 27 May 2020 10:56:16 -0600 Subject: iommu/vt-d: Allocate domain info for real DMA sub-devices Sub-devices of a real DMA device might exist on a separate segment than the real DMA device and its IOMMU. These devices should still have a valid device_domain_info, but the current dma alias model won't allocate info for the subdevice. This patch adds a segment member to struct device_domain_info and uses the sub-device's BDF so that these sub-devices won't alias to other devices. Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping") Cc: stable@vger.kernel.org # v5.6+ Signed-off-by: Jon Derrick Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200527165617.297470-3-jonathan.derrick@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 19 +++++++++++++++---- include/linux/intel-iommu.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 1ff45b2d03ab..6d39b9bd89a6 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2463,7 +2463,7 @@ dmar_search_domain_by_dev_info(int segment, int bus, int devfn) struct device_domain_info *info; list_for_each_entry(info, &device_domain_list, global) - if (info->iommu->segment == segment && info->bus == bus && + if (info->segment == segment && info->bus == bus && info->devfn == devfn) return info; @@ -2520,8 +2520,18 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (!info) return NULL; - info->bus = bus; - info->devfn = devfn; + if (!dev_is_real_dma_subdevice(dev)) { + info->bus = bus; + info->devfn = devfn; + info->segment = iommu->segment; + } else { + struct pci_dev *pdev = to_pci_dev(dev); + + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->segment = pci_domain_nr(pdev->bus); + } + info->ats_supported = info->pasid_supported = info->pri_supported = 0; info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; info->ats_qdep = 0; @@ -2561,7 +2571,8 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (!found) { struct device_domain_info *info2; - info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); + info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, + info->devfn); if (info2) { found = info2->domain; info2->dev = dev; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 21633cee6331..4100bd224f5c 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -609,6 +609,7 @@ struct device_domain_info { struct list_head auxiliary_domains; /* auxiliary domains * attached to this device */ + u32 segment; /* PCI segment number */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ u16 pfsid; /* SRIOV physical function source ID */ -- cgit v1.2.3