-rw-r--r--  drivers/iommu/Kconfig                                            4
-rw-r--r--  drivers/iommu/amd/Kconfig                                        1
-rw-r--r--  drivers/iommu/amd/amd_iommu_types.h                             12
-rw-r--r--  drivers/iommu/amd/io_pgtable.c                                  68
-rw-r--r--  drivers/iommu/amd/iommu.c                                      147
-rw-r--r--  drivers/iommu/intel/Kconfig                                      1
-rw-r--r--  drivers/iommu/intel/Makefile                                     2
-rw-r--r--  drivers/iommu/intel/iommu.c                                    156
-rw-r--r--  drivers/iommu/intel/iommu.h                                     64
-rw-r--r--  drivers/iommu/intel/nested.c                                   117
-rw-r--r--  drivers/iommu/intel/pasid.c                                    221
-rw-r--r--  drivers/iommu/intel/pasid.h                                      6
-rw-r--r--  drivers/iommu/iommufd/Makefile                                   1
-rw-r--r--  drivers/iommu/iommufd/device.c                                 174
-rw-r--r--  drivers/iommu/iommufd/hw_pagetable.c                           304
-rw-r--r--  drivers/iommu/iommufd/io_pagetable.c                           200
-rw-r--r--  drivers/iommu/iommufd/iommufd_private.h                         84
-rw-r--r--  drivers/iommu/iommufd/iommufd_test.h                            39
-rw-r--r--  drivers/iommu/iommufd/iova_bitmap.c (renamed from drivers/vfio/iova_bitmap.c)  5
-rw-r--r--  drivers/iommu/iommufd/main.c                                    17
-rw-r--r--  drivers/iommu/iommufd/pages.c                                    2
-rw-r--r--  drivers/iommu/iommufd/selftest.c                               326
-rw-r--r--  drivers/iommu/iommufd/vfio_compat.c                              6
-rw-r--r--  drivers/vfio/Makefile                                            3
-rw-r--r--  drivers/vfio/pci/mlx5/Kconfig                                    1
-rw-r--r--  drivers/vfio/pci/mlx5/main.c                                     1
-rw-r--r--  drivers/vfio/pci/pds/Kconfig                                     1
-rw-r--r--  drivers/vfio/pci/pds/pci_drv.c                                   1
-rw-r--r--  drivers/vfio/vfio_main.c                                         1
-rw-r--r--  include/linux/io-pgtable.h                                       4
-rw-r--r--  include/linux/iommu.h                                          146
-rw-r--r--  include/linux/iova_bitmap.h                                     26
-rw-r--r--  include/uapi/linux/iommufd.h                                   180
-rw-r--r--  tools/testing/selftests/iommu/iommufd.c                        379
-rw-r--r--  tools/testing/selftests/iommu/iommufd_fail_nth.c                 7
-rw-r--r--  tools/testing/selftests/iommu/iommufd_utils.h                  233
36 files changed, 2722 insertions, 218 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 7f04491ca5f0..ee9e2a2edbf5 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -7,6 +7,10 @@ config IOMMU_IOVA
config IOMMU_API
bool
+config IOMMUFD_DRIVER
+ bool
+ default n
+
menuconfig IOMMU_SUPPORT
bool "IOMMU Hardware Support"
depends on MMU
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index 9b5fc3356bf2..8bd4c3b183ec 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -10,6 +10,7 @@ config AMD_IOMMU
select IOMMU_API
select IOMMU_IOVA
select IOMMU_IO_PGTABLE
+ select IOMMUFD_DRIVER if IOMMUFD
depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
help
With this option you can enable support for AMD IOMMU hardware in
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 7dc30c2b56b3..dec4e5c2b66b 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -97,7 +97,9 @@
#define FEATURE_GATS_MASK (3ULL)
#define FEATURE_GAM_VAPIC BIT_ULL(21)
#define FEATURE_GIOSUP BIT_ULL(48)
+#define FEATURE_HASUP BIT_ULL(49)
#define FEATURE_EPHSUP BIT_ULL(50)
+#define FEATURE_HDSUP BIT_ULL(52)
#define FEATURE_SNP BIT_ULL(63)
#define FEATURE_PASID_SHIFT 32
@@ -212,6 +214,7 @@
/* macros and definitions for device table entries */
#define DEV_ENTRY_VALID 0x00
#define DEV_ENTRY_TRANSLATION 0x01
+#define DEV_ENTRY_HAD 0x07
#define DEV_ENTRY_PPR 0x34
#define DEV_ENTRY_IR 0x3d
#define DEV_ENTRY_IW 0x3e
@@ -371,9 +374,15 @@
(1ULL << (12 + (9 * (level))))
/*
+ * The IOPTE dirty bit
+ */
+#define IOMMU_PTE_HD_BIT (6)
+
+/*
* Bit value definition for I/O PTE fields
*/
#define IOMMU_PTE_PR BIT_ULL(0)
+#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
#define IOMMU_PTE_U BIT_ULL(59)
#define IOMMU_PTE_FC BIT_ULL(60)
#define IOMMU_PTE_IR BIT_ULL(61)
@@ -384,6 +393,7 @@
*/
#define DTE_FLAG_V BIT_ULL(0)
#define DTE_FLAG_TV BIT_ULL(1)
+#define DTE_FLAG_HAD (3ULL << 7)
#define DTE_FLAG_GIOV BIT_ULL(54)
#define DTE_FLAG_GV BIT_ULL(55)
#define DTE_GLX_SHIFT (56)
@@ -413,6 +423,7 @@
#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
+#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
@@ -563,6 +574,7 @@ struct protection_domain {
int nid; /* Node ID */
u64 *gcr3_tbl; /* Guest CR3 table */
unsigned long flags; /* flags to find out type of domain */
+ bool dirty_tracking; /* dirty tracking is enabled in the domain */
unsigned dev_cnt; /* devices assigned to this domain */
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
};
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index 2892aa1b4dc1..6c0621f6f572 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
return (__pte & ~offset_mask) | (iova & offset_mask);
}
+static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
+ unsigned long flags)
+{
+ bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
+ bool dirty = false;
+ int i, count;
+
+ /*
+ * 2.2.3.2 Host Dirty Support
+ * When a non-default page size is used, software must OR the
+ * Dirty bits in all of the replicated host PTEs used to map
+ * the page. The IOMMU does not guarantee the Dirty bits are
+ * set in all of the replicated PTEs. Any portion of the page
+ * may have been written even if the Dirty bit is set in only
+ * one of the replicated PTEs.
+ */
+ count = PAGE_SIZE_PTE_COUNT(size);
+ for (i = 0; i < count && test_only; i++) {
+ if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
+ dirty = true;
+ break;
+ }
+ }
+
+ for (i = 0; i < count && !test_only; i++) {
+ if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
+ (unsigned long *)&ptep[i])) {
+ dirty = true;
+ }
+ }
+
+ return dirty;
+}
+
+static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
+ unsigned long end = iova + size - 1;
+
+ do {
+ unsigned long pgsize = 0;
+ u64 *ptep, pte;
+
+ ptep = fetch_pte(pgtable, iova, &pgsize);
+ if (ptep)
+ pte = READ_ONCE(*ptep);
+ if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
+ pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
+ iova += pgsize;
+ continue;
+ }
+
+ /*
+ * Mark the whole IOVA range as dirty even if only one of
+ * the replicated PTEs were marked dirty.
+ */
+ if (pte_test_and_clear_dirty(ptep, pgsize, flags))
+ iommu_dirty_bitmap_record(dirty, iova, pgsize);
+ iova += pgsize;
+ } while (iova < end);
+
+ return 0;
+}
+
/*
* ----------------------------------------------------
*/
@@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
pgtable->iop.ops.map_pages = iommu_v1_map_pages;
pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages;
pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
+ pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
return &pgtable->iop;
}
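
The §2.2.3.2 comment above is the subtle part of this walker: for a non-default page size the IOMMU may set the Host Dirty bit in any one of the replicated leaf PTEs, so software has to OR them all. A minimal user-space model of that aggregation (hypothetical code, not from this patch; only the bit position mirrors IOMMU_PTE_HD_BIT):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_HD (UINT64_C(1) << 6)        /* mirrors IOMMU_PTE_HD_BIT */

/* OR the Host Dirty bits of all replicated PTEs backing one mapping. */
static bool replicated_ptes_dirty(const uint64_t *ptep, size_t count)
{
        for (size_t i = 0; i < count; i++)
                if (ptep[i] & PTE_HD)
                        return true;
        return false;
}

int main(void)
{
        /* A 2 MiB mapping built from 512 replicated 4 KiB PTEs: hardware
         * may mark any single replica dirty for a write anywhere in it. */
        uint64_t ptes[512] = { 0 };

        ptes[137] |= PTE_HD;
        printf("dirty=%d\n", replicated_ptes_dirty(ptes, 512));
        return 0;
}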
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 95bd7c25ba6f..b399c5741378 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -37,6 +37,7 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/dma.h>
+#include <uapi/linux/iommufd.h>
#include "amd_iommu.h"
#include "../dma-iommu.h"
@@ -65,6 +66,7 @@ LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);
const struct iommu_ops amd_iommu_ops;
+const struct iommu_dirty_ops amd_dirty_ops;
static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
int amd_iommu_max_glx_val = -1;
@@ -1610,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
pte_root |= 1ULL << DEV_ENTRY_PPR;
}
+ if (domain->dirty_tracking)
+ pte_root |= DTE_FLAG_HAD;
+
if (domain->flags & PD_IOMMUV2_MASK) {
u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
u64 glx = domain->glx;
@@ -2155,28 +2160,79 @@ static inline u64 dma_max_address(void)
return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
}
-static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
+static bool amd_iommu_hd_support(struct amd_iommu *iommu)
{
+ return iommu && (iommu->features & FEATURE_HDSUP);
+}
+
+static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
+ struct device *dev, u32 flags)
+{
+ bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
struct protection_domain *domain;
+ struct amd_iommu *iommu = NULL;
+
+ if (dev) {
+ iommu = rlookup_amd_iommu(dev);
+ if (!iommu)
+ return ERR_PTR(-ENODEV);
+ }
/*
* Since DTE[Mode]=0 is prohibited on SNP-enabled systems,
* default to IOMMU_DOMAIN_DMA[_FQ].
*/
if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
- return NULL;
+ return ERR_PTR(-EINVAL);
+
+ if (dirty_tracking && !amd_iommu_hd_support(iommu))
+ return ERR_PTR(-EOPNOTSUPP);
domain = protection_domain_alloc(type);
if (!domain)
- return NULL;
+ return ERR_PTR(-ENOMEM);
domain->domain.geometry.aperture_start = 0;
domain->domain.geometry.aperture_end = dma_max_address();
domain->domain.geometry.force_aperture = true;
+ if (iommu) {
+ domain->domain.type = type;
+ domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
+ domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+
+ if (dirty_tracking)
+ domain->domain.dirty_ops = &amd_dirty_ops;
+ }
+
return &domain->domain;
}
+static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
+{
+ struct iommu_domain *domain;
+
+ domain = do_iommu_domain_alloc(type, NULL, 0);
+ if (IS_ERR(domain))
+ return NULL;
+
+ return domain;
+}
+
+static struct iommu_domain *
+amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ unsigned int type = IOMMU_DOMAIN_UNMANAGED;
+
+ if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return do_iommu_domain_alloc(type, dev, flags);
+}
+
static void amd_iommu_domain_free(struct iommu_domain *dom)
{
struct protection_domain *domain;
@@ -2214,6 +2270,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
dev_data->defer_attach = false;
+ /*
+ * Restrict to devices with compatible IOMMU hardware support
+ * when enforcement of dirty tracking is enabled.
+ */
+ if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
+ return -EINVAL;
+
if (dev_data->domain)
detach_device(dev);
@@ -2332,6 +2395,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return true;
case IOMMU_CAP_DEFERRED_FLUSH:
return true;
+ case IOMMU_CAP_DIRTY_TRACKING: {
+ struct amd_iommu *iommu = rlookup_amd_iommu(dev);
+
+ return amd_iommu_hd_support(iommu);
+ }
default:
break;
}
@@ -2339,6 +2407,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return false;
}
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable)
+{
+ struct protection_domain *pdomain = to_pdomain(domain);
+ struct dev_table_entry *dev_table;
+ struct iommu_dev_data *dev_data;
+ bool domain_flush = false;
+ struct amd_iommu *iommu;
+ unsigned long flags;
+ u64 pte_root;
+
+ spin_lock_irqsave(&pdomain->lock, flags);
+ if (!(pdomain->dirty_tracking ^ enable)) {
+ spin_unlock_irqrestore(&pdomain->lock, flags);
+ return 0;
+ }
+
+ list_for_each_entry(dev_data, &pdomain->dev_list, list) {
+ iommu = rlookup_amd_iommu(dev_data->dev);
+ if (!iommu)
+ continue;
+
+ dev_table = get_dev_table(iommu);
+ pte_root = dev_table[dev_data->devid].data[0];
+
+ pte_root = (enable ? pte_root | DTE_FLAG_HAD :
+ pte_root & ~DTE_FLAG_HAD);
+
+ /* Flush device DTE */
+ dev_table[dev_data->devid].data[0] = pte_root;
+ device_flush_dte(dev_data);
+ domain_flush = true;
+ }
+
+ /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
+ if (domain_flush) {
+ amd_iommu_domain_flush_tlb_pde(pdomain);
+ amd_iommu_domain_flush_complete(pdomain);
+ }
+ pdomain->dirty_tracking = enable;
+ spin_unlock_irqrestore(&pdomain->lock, flags);
+
+ return 0;
+}
+
+static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct protection_domain *pdomain = to_pdomain(domain);
+ struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
+ unsigned long lflags;
+
+ if (!ops || !ops->read_and_clear_dirty)
+ return -EOPNOTSUPP;
+
+ spin_lock_irqsave(&pdomain->lock, lflags);
+ if (!pdomain->dirty_tracking && dirty->bitmap) {
+ spin_unlock_irqrestore(&pdomain->lock, lflags);
+ return -EINVAL;
+ }
+ spin_unlock_irqrestore(&pdomain->lock, lflags);
+
+ return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
+}
+
static void amd_iommu_get_resv_regions(struct device *dev,
struct list_head *head)
{
@@ -2461,9 +2596,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
return true;
}
+const struct iommu_dirty_ops amd_dirty_ops = {
+ .set_dirty_tracking = amd_iommu_set_dirty_tracking,
+ .read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
+};
+
const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
.domain_alloc = amd_iommu_domain_alloc,
+ .domain_alloc_user = amd_iommu_domain_alloc_user,
.probe_device = amd_iommu_probe_device,
.release_device = amd_iommu_release_device,
.probe_finalize = amd_iommu_probe_finalize,
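
With .domain_alloc_user wired into amd_iommu_ops, user space can ask for dirty tracking to be enforced at HWPT allocation time rather than probing after attach. A hypothetical sketch of that allocation through iommufd (struct and flag names follow this series' UAPI; treat the exact layout as an assumption):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Hypothetical sketch: allocate a paging HWPT from an IOAS with dirty
 * tracking enforced (IOMMU_HWPT_ALLOC_DIRTY_TRACKING, added here). */
static int alloc_dirty_hwpt(int iommufd, uint32_t dev_id, uint32_t ioas_id,
                            uint32_t *out_hwpt_id)
{
        struct iommu_hwpt_alloc cmd = {
                .size = sizeof(cmd),
                .flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
                .dev_id = dev_id,
                .pt_id = ioas_id,       /* an IOAS id yields a paging HWPT */
                .data_type = IOMMU_HWPT_DATA_NONE,
        };

        if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
                return -1;
        *out_hwpt_id = cmd.out_hwpt_id;
        return 0;
}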
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 119d2c57a48e..012cd2541a68 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -15,6 +15,7 @@ config INTEL_IOMMU
select DMA_OPS
select IOMMU_API
select IOMMU_IOVA
+ select IOMMUFD_DRIVER if IOMMUFD
select NEED_DMA_MAP_STATE
select DMAR_TABLE
select SWIOTLB
diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
index 7af3b8a4f2a0..5dabf081a779 100644
--- a/drivers/iommu/intel/Makefile
+++ b/drivers/iommu/intel/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_DMAR_TABLE) += dmar.o
-obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o
+obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o
obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o
obj-$(CONFIG_DMAR_PERF) += perf.o
obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 3685ba90ec88..d1037280abf7 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -282,7 +282,6 @@ static LIST_HEAD(dmar_satc_units);
#define for_each_rmrr_units(rmrr) \
list_for_each_entry(rmrr, &dmar_rmrr_units, list)
-static void device_block_translation(struct device *dev);
static void intel_iommu_domain_free(struct iommu_domain *domain);
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
@@ -300,6 +299,7 @@ static int iommu_skip_te_disable;
#define IDENTMAP_AZALIA 4
const struct iommu_ops intel_iommu_ops;
+const struct iommu_dirty_ops intel_dirty_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
@@ -560,7 +560,7 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
}
/* Some capabilities may be different across iommus */
-static void domain_update_iommu_cap(struct dmar_domain *domain)
+void domain_update_iommu_cap(struct dmar_domain *domain)
{
domain_update_iommu_coherency(domain);
domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
@@ -1778,8 +1778,7 @@ static struct dmar_domain *alloc_domain(unsigned int type)
return domain;
}
-static int domain_attach_iommu(struct dmar_domain *domain,
- struct intel_iommu *iommu)
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
struct iommu_domain_info *info, *curr;
unsigned long ndomains;
@@ -1828,8 +1827,7 @@ err_unlock:
return ret;
}
-static void domain_detach_iommu(struct dmar_domain *domain,
- struct intel_iommu *iommu)
+void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
struct iommu_domain_info *info;
@@ -2196,6 +2194,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
return -EINVAL;
+ if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
+ pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+ return -EINVAL;
+ }
+
attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
attr |= DMA_FL_PTE_PRESENT;
if (domain->use_first_level) {
@@ -3958,7 +3961,7 @@ static void dmar_remove_one_dev_info(struct device *dev)
* all DMA requests without PASID from the device are blocked. If the page
* table has been set, clean up the data structures.
*/
-static void device_block_translation(struct device *dev)
+void device_block_translation(struct device *dev)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
@@ -4058,14 +4061,62 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
return NULL;
}
+static struct iommu_domain *
+intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
+ struct intel_iommu *iommu = info->iommu;
+ struct iommu_domain *domain;
+
+ /* Must be NESTING domain */
+ if (parent) {
+ if (!nested_supported(iommu) || flags)
+ return ERR_PTR(-EOPNOTSUPP);
+ return intel_nested_domain_alloc(parent, user_data);
+ }
+
+ if (flags &
+ (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (nested_parent && !nested_supported(iommu))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (user_data || (dirty_tracking && !ssads_supported(iommu)))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * domain_alloc_user op needs to fully initialize a domain before
+ * return, so use iommu_domain_alloc() here for simplicity.
+ */
+ domain = iommu_domain_alloc(dev->bus);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ if (nested_parent)
+ to_dmar_domain(domain)->nested_parent = true;
+
+ if (dirty_tracking) {
+ if (to_dmar_domain(domain)->use_first_level) {
+ iommu_domain_free(domain);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ domain->dirty_ops = &intel_dirty_ops;
+ }
+
+ return domain;
+}
+
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
if (domain != &si_domain->domain && domain != &blocking_domain)
domain_exit(to_dmar_domain(domain));
}
-static int prepare_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+int prepare_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu;
@@ -4078,6 +4129,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain,
if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
return -EINVAL;
+ if (domain->dirty_ops && !ssads_supported(iommu))
+ return -EINVAL;
+
/* check if this iommu agaw is sufficient for max mapped address */
addr_width = agaw_to_width(iommu->agaw);
if (addr_width > cap_mgaw(iommu->cap))
@@ -4332,6 +4386,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
return dmar_platform_optin();
case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
return ecap_sc_support(info->iommu->ecap);
+ case IOMMU_CAP_DIRTY_TRACKING:
+ return ssads_supported(info->iommu);
default:
return false;
}
@@ -4729,6 +4785,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
return -EOPNOTSUPP;
+ if (domain->dirty_ops)
+ return -EINVAL;
+
if (context_copied(iommu, info->bus, info->devfn))
return -EBUSY;
@@ -4780,6 +4839,7 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
if (!vtd)
return ERR_PTR(-ENOMEM);
+ vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
vtd->cap_reg = iommu->cap;
vtd->ecap_reg = iommu->ecap;
*length = sizeof(*vtd);
@@ -4787,10 +4847,88 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
return vtd;
}
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ int ret;
+
+ spin_lock(&dmar_domain->lock);
+ if (dmar_domain->dirty_tracking == enable)
+ goto out_unlock;
+
+ list_for_each_entry(info, &dmar_domain->devices, link) {
+ ret = intel_pasid_setup_dirty_tracking(info->iommu,
+ info->domain, info->dev,
+ IOMMU_NO_PASID, enable);
+ if (ret)
+ goto err_unwind;
+ }
+
+ dmar_domain->dirty_tracking = enable;
+out_unlock:
+ spin_unlock(&dmar_domain->lock);
+
+ return 0;
+
+err_unwind:
+ list_for_each_entry(info, &dmar_domain->devices, link)
+ intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
+ info->dev, IOMMU_NO_PASID,
+ dmar_domain->dirty_tracking);
+ spin_unlock(&dmar_domain->lock);
+ return ret;
+}
+
+static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ unsigned long end = iova + size - 1;
+ unsigned long pgsize;
+
+ /*
+ * IOMMUFD core calls into a dirty tracking disabled domain without an
+ * IOVA bitmap set in order to clear dirty bits in any PTEs that may
+ * have been dirtied while tracking was being stopped. This ensures
+ * that we never inherit dirty bits from a previous cycle.
+ */
+ if (!dmar_domain->dirty_tracking && dirty->bitmap)
+ return -EINVAL;
+
+ do {
+ struct dma_pte *pte;
+ int lvl = 0;
+
+ pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
+ GFP_ATOMIC);
+ pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
+ if (!pte || !dma_pte_present(pte)) {
+ iova += pgsize;
+ continue;
+ }
+
+ if (dma_sl_pte_test_and_clear_dirty(pte, flags))
+ iommu_dirty_bitmap_record(dirty, iova, pgsize);
+ iova += pgsize;
+ } while (iova < end);
+
+ return 0;
+}
+
+const struct iommu_dirty_ops intel_dirty_ops = {
+ .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+ .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
+};
+
const struct iommu_ops intel_iommu_ops = {
.capable = intel_iommu_capable,
.hw_info = intel_iommu_hw_info,
.domain_alloc = intel_iommu_domain_alloc,
+ .domain_alloc_user = intel_iommu_domain_alloc_user,
.probe_device = intel_iommu_probe_device,
.probe_finalize = intel_iommu_probe_finalize,
.release_device = intel_iommu_release_device,
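
The comment inside intel_iommu_read_and_clear_dirty() above defines a contract worth making explicit: after tracking is disabled, the core may still walk the domain with a NULL bitmap purely to scrub stale dirty bits. A minimal sketch of that stop-side sweep against the iommu_dirty_ops added here (hypothetical helper, not part of the patch):

#include <linux/iommu.h>

/*
 * Minimal sketch (hypothetical helper, not kernel code) of the
 * stop-side sweep: disable tracking, then walk with a NULL bitmap so
 * PTE dirty bits are cleared but nothing is recorded.
 */
static int stop_dirty_tracking(struct iommu_domain *domain,
                               unsigned long iova, size_t size)
{
        struct iommu_dirty_bitmap dirty = { .bitmap = NULL };
        int ret;

        ret = domain->dirty_ops->set_dirty_tracking(domain, false);
        if (ret)
                return ret;

        /* Clear-only pass; a domain with tracking off accepts this. */
        return domain->dirty_ops->read_and_clear_dirty(domain, iova, size,
                                                       0, &dirty);
}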
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 7dac94f62b4e..d796d0d9b114 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -25,6 +25,7 @@
#include <asm/cacheflush.h>
#include <asm/iommu.h>
+#include <uapi/linux/iommufd.h>
/*
* VT-d hardware uses 4KiB page size regardless of host page size.
@@ -48,6 +49,9 @@
#define DMA_FL_PTE_DIRTY BIT_ULL(6)
#define DMA_FL_PTE_XD BIT_ULL(63)
+#define DMA_SL_PTE_DIRTY_BIT 9
+#define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT)
+
#define ADDR_WIDTH_5LEVEL (57)
#define ADDR_WIDTH_4LEVEL (48)
@@ -539,6 +543,10 @@ enum {
#define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
#define pasid_supported(iommu) (sm_supported(iommu) && \
ecap_pasid((iommu)->ecap))
+#define ssads_supported(iommu) (sm_supported(iommu) && \
+ ecap_slads((iommu)->ecap))
+#define nested_supported(iommu) (sm_supported(iommu) && \
+ ecap_nest((iommu)->ecap))
struct pasid_entry;
struct pasid_state_entry;
@@ -592,20 +600,45 @@ struct dmar_domain {
* otherwise, goes through the second
* level.
*/
+ u8 dirty_tracking:1; /* Dirty tracking is enabled */
+ u8 nested_parent:1; /* Has other domains nested on it */
spinlock_t lock; /* Protect device tracking lists */
struct list_head devices; /* all devices' list */
struct list_head dev_pasids; /* all attached pasids */
- struct dma_pte *pgd; /* virtual address */
- int gaw; /* max guest address width */
-
- /* adjusted guest address width, 0 is level 2 30-bit */
- int agaw;
int iommu_superpage;/* Level of superpages supported:
0 == 4KiB (no superpages), 1 == 2MiB,
2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
- u64 max_addr; /* maximum mapped address */
+ union {
+ /* DMA remapping domain */
+ struct {
+ /* virtual address */
+ struct dma_pte *pgd;
+ /* max guest address width */
+ int gaw;
+ /*
+ * adjusted guest address width:
+ * 0: level 2 30-bit
+ * 1: level 3 39-bit
+ * 2: level 4 48-bit
+ * 3: level 5 57-bit
+ */
+ int agaw;
+ /* maximum mapped address */
+ u64 max_addr;
+ };
+
+ /* Nested user domain */
+ struct {
+ /* parent page table which the user domain is nested on */
+ struct dmar_domain *s2_domain;
+ /* user page table pointer (in GPA) */
+ unsigned long s1_pgtbl;
+ /* page table attributes */
+ struct iommu_hwpt_vtd_s1 s1_cfg;
+ };
+ };
struct iommu_domain domain; /* generic domain data structure for
iommu core */
@@ -781,6 +814,16 @@ static inline bool dma_pte_present(struct dma_pte *pte)
return (pte->val & 3) != 0;
}
+static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
+ unsigned long flags)
+{
+ if (flags & IOMMU_DIRTY_NO_CLEAR)
+ return (pte->val & DMA_SL_PTE_DIRTY) != 0;
+
+ return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
+ (unsigned long *)&pte->val);
+}
+
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
return (pte->val & DMA_PTE_LARGE_PAGE);
@@ -836,12 +879,21 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
*/
#define QI_OPT_WAIT_DRAIN BIT(0)
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+void device_block_translation(struct device *dev);
+int prepare_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev);
+void domain_update_iommu_cap(struct dmar_domain *domain);
+
int dmar_ir_support(void);
void *alloc_pgtable_page(int node, gfp_t gfp);
void free_pgtable_page(void *vaddr);
void iommu_flush_write_buffer(struct intel_iommu *iommu);
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
+struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
+ const struct iommu_user_data *user_data);
#ifdef CONFIG_INTEL_IOMMU_SVM
void intel_svm_check(struct intel_iommu *iommu);
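
dma_sl_pte_test_and_clear_dirty() above has two modes keyed off IOMMU_DIRTY_NO_CLEAR: a read-only peek versus an atomic test-and-clear. A tiny user-space model of those semantics (hypothetical; the bit position mirrors DMA_SL_PTE_DIRTY_BIT and the clear is non-atomic here):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SL_DIRTY_BIT 9                   /* mirrors DMA_SL_PTE_DIRTY_BIT */
#define SL_DIRTY (UINT64_C(1) << SL_DIRTY_BIT)
#define DIRTY_NO_CLEAR (1UL << 0)        /* model of IOMMU_DIRTY_NO_CLEAR */

/* Read the dirty bit; clear it unless the caller asked for NO_CLEAR. */
static bool sl_pte_test_and_clear_dirty(uint64_t *pte, unsigned long flags)
{
        bool dirty = *pte & SL_DIRTY;

        if (!(flags & DIRTY_NO_CLEAR))
                *pte &= ~SL_DIRTY;       /* the kernel does this atomically */
        return dirty;
}

int main(void)
{
        uint64_t pte = SL_DIRTY;

        printf("%d", sl_pte_test_and_clear_dirty(&pte, DIRTY_NO_CLEAR));
        printf("%d", sl_pte_test_and_clear_dirty(&pte, 0));
        printf("%d\n", sl_pte_test_and_clear_dirty(&pte, 0)); /* prints 110 */
        return 0;
}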
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
new file mode 100644
index 000000000000..b5a5563ab32c
--- /dev/null
+++ b/drivers/iommu/intel/nested.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * nested.c - nested mode translation support
+ *
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ * Jacob Pan <jacob.jun.pan@linux.intel.com>
+ * Yi Liu <yi.l.liu@intel.com>
+ */
+
+#define pr_fmt(fmt) "DMAR: " fmt
+
+#include <linux/iommu.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+
+#include "iommu.h"
+#include "pasid.h"
+
+static int intel_nested_attach_dev(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu = info->iommu;
+ unsigned long flags;
+ int ret = 0;
+
+ if (info->domain)
+ device_block_translation(dev);
+
+ if (iommu->agaw < dmar_domain->s2_domain->agaw) {
+ dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
+ return -ENODEV;
+ }
+
+ /*
+ * A stage-1 domain cannot work alone; it is nested on an s2_domain.
+ * The s2_domain will be used in nested translation, so we need to
+ * ensure that it is compatible with this IOMMU.
+ */
+ ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev);
+ if (ret) {
+ dev_err_ratelimited(dev, "s2 domain is not compatible\n");
+ return ret;
+ }
+
+ ret = domain_attach_iommu(dmar_domain, iommu);
+ if (ret) {
+ dev_err_ratelimited(dev, "Failed to attach domain to iommu\n");
+ return ret;
+ }
+
+ ret = intel_pasid_setup_nested(iommu, dev,
+ IOMMU_NO_PASID, dmar_domain);
+ if (ret) {
+ domain_detach_iommu(dmar_domain, iommu);
+ dev_err_ratelimited(dev, "Failed to setup pasid entry\n");
+ return ret;
+ }
+
+ info->domain = dmar_domain;
+ spin_lock_irqsave(&dmar_domain->lock, flags);
+ list_add(&info->link, &dmar_domain->devices);
+ spin_unlock_irqrestore(&dmar_domain->lock, flags);
+
+ return 0;
+}
+
+static void intel_nested_domain_free(struct iommu_domain *domain)
+{
+ kfree(to_dmar_domain(domain));
+}
+
+static const struct iommu_domain_ops intel_nested_domain_ops = {
+ .attach_dev = intel_nested_attach_dev,
+ .free = intel_nested_domain_free,
+};
+
+struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ struct dmar_domain *s2_domain = to_dmar_domain(parent);
+ struct iommu_hwpt_vtd_s1 vtd;
+ struct dmar_domain *domain;
+ int ret;
+
+ /* Must be nested domain */
+ if (user_data->type != IOMMU_HWPT_DATA_VTD_S1)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (parent->ops != intel_iommu_ops.default_domain_ops ||
+ !s2_domain->nested_parent)
+ return ERR_PTR(-EINVAL);
+
+ ret = iommu_copy_struct_from_user(&vtd, user_data,
+ IOMMU_HWPT_DATA_VTD_S1, __reserved);
+ if (ret)
+ return ERR_PTR(ret);
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL_ACCOUNT);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ domain->use_first_level = true;
+ domain->s2_domain = s2_domain;
+ domain->s1_pgtbl = vtd.pgtbl_addr;
+ domain->s1_cfg = vtd;
+ domain->domain.ops = &intel_nested_domain_ops;
+ domain->domain.type = IOMMU_DOMAIN_NESTED;
+ INIT_LIST_HEAD(&domain->devices);
+ INIT_LIST_HEAD(&domain->dev_pasids);
+ spin_lock_init(&domain->lock);
+ xa_init(&domain->iommu_array);
+
+ return &domain->domain;
+}
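
From user space, such a nested domain is created by passing the stage-1 configuration as the driver-specific data of the HWPT-alloc ioctl, with a NEST_PARENT paging HWPT as the pt_id. A hypothetical sketch (field names match struct iommu_hwpt_vtd_s1 as used above; the surrounding UAPI layout is an assumption):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Hypothetical sketch: allocate a VT-d stage-1 nested HWPT on top of a
 * paging HWPT that was created with IOMMU_HWPT_ALLOC_NEST_PARENT. */
static int alloc_vtd_nested_hwpt(int iommufd, uint32_t dev_id,
                                 uint32_t parent_hwpt_id,
                                 uint64_t s1_pgtbl_gpa, uint32_t *out_id)
{
        struct iommu_hwpt_vtd_s1 s1 = {
                .flags = 0,                     /* e.g. IOMMU_VTD_S1_SRE */
                .pgtbl_addr = s1_pgtbl_gpa,     /* guest stage-1 PGD (GPA) */
                .addr_width = 48,               /* ADDR_WIDTH_4LEVEL */
        };
        struct iommu_hwpt_alloc cmd = {
                .size = sizeof(cmd),
                .dev_id = dev_id,
                .pt_id = parent_hwpt_id,
                .data_type = IOMMU_HWPT_DATA_VTD_S1,
                .data_len = sizeof(s1),
                .data_uptr = (uintptr_t)&s1,
        };

        if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd))
                return -1;
        *out_id = cmd.out_hwpt_id;
        return 0;
}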
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 8f92b92f3d2a..74e8e4c17e81 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
WRITE_ONCE(*ptr, (old & ~mask) | bits);
}
+static inline u64 pasid_get_bits(u64 *ptr)
+{
+ return READ_ONCE(*ptr);
+}
+
/*
* Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
* PASID entry.
@@ -336,6 +341,45 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe)
}
/*
+ * Enable second level A/D bits by setting the SLADE (Second Level
+ * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_set_ssade(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9);
+}
+
+/*
+ * Disable second level A/D bits by clearing the SLADE (Second Level
+ * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_clear_ssade(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[0], 1 << 9, 0);
+}
+
+/*
+ * Check whether the SLADE (Second Level Access Dirty Enable) field
+ * (Bit 9) of a scalable mode PASID entry is set.
+ */
+static inline bool pasid_get_ssade(struct pasid_entry *pe)
+{
+ return pasid_get_bits(&pe->val[0]) & (1 << 9);
+}
+
+/*
+ * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_sre(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[2], 1 << 0, 1);
+}
+
+/*
* Setup the WPE(Write Protect Enable) field (Bit 132) of a
* scalable mode PASID entry.
*/
@@ -402,6 +446,15 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value)
pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
}
+/*
+ * Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
+ * of a scalable mode PASID entry.
+ */
+static inline void pasid_set_eafe(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
+}
+
static void
pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
u16 did, u32 pasid)
@@ -627,6 +680,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
pasid_set_fault_enable(pte);
pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ if (domain->dirty_tracking)
+ pasid_set_ssade(pte);
pasid_set_present(pte);
spin_unlock(&iommu->lock);
@@ -637,6 +692,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
}
/*
+ * Set up dirty tracking on a second-level-only or nested translation type.
+ */
+int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u32 pasid,
+ bool enabled)
+{
+ struct pasid_entry *pte;
+ u16 did, pgtt;
+
+ spin_lock(&iommu->lock);
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ spin_unlock(&iommu->lock);
+ dev_err_ratelimited(
+ dev, "Failed to get pasid entry of PASID %d\n", pasid);
+ return -ENODEV;
+ }
+
+ did = domain_id_iommu(domain, iommu);
+ pgtt = pasid_pte_get_pgtt(pte);
+ if (pgtt != PASID_ENTRY_PGTT_SL_ONLY &&
+ pgtt != PASID_ENTRY_PGTT_NESTED) {
+ spin_unlock(&iommu->lock);
+ dev_err_ratelimited(
+ dev,
+ "Dirty tracking not supported on translation type %d\n",
+ pgtt);
+ return -EOPNOTSUPP;
+ }
+
+ if (pasid_get_ssade(pte) == enabled) {
+ spin_unlock(&iommu->lock);
+ return 0;
+ }
+
+ if (enabled)
+ pasid_set_ssade(pte);
+ else
+ pasid_clear_ssade(pte);
+ spin_unlock(&iommu->lock);
+
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(pte, sizeof(*pte));
+
+ /*
+ * From VT-d spec table 25 "Guidance to Software for Invalidations":
+ *
+ * - PASID-selective-within-Domain PASID-cache invalidation
+ * If (PGTT=SS or Nested)
+ * - Domain-selective IOTLB invalidation
+ * Else
+ * - PASID-selective PASID-based IOTLB invalidation
+ * - If (pasid is RID_PASID)
+ * - Global Device-TLB invalidation to affected functions
+ * Else
+ * - PASID-based Device-TLB invalidation (with S=1 and
+ * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
+ */
+ pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+ /* Device IOTLB doesn't need to be flushed in caching mode. */
+ if (!cap_caching_mode(iommu->cap))
+ devtlb_invalidation_with_pasid(iommu, dev, pasid);
+
+ return 0;
+}
+
+/*
* Set up the scalable mode pasid entry for passthrough translation type.
*/
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
@@ -713,3 +840,97 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
if (!cap_caching_mode(iommu->cap))
devtlb_invalidation_with_pasid(iommu, dev, pasid);
}
+
+/**
+ * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
+ * @iommu: IOMMU which the device belongs to
+ * @dev: Device to be set up for translation
+ * @pasid: PASID to be programmed in the device PASID table
+ * @domain: User stage-1 domain nested on a stage-2 domain
+ *
+ * This is used for nested translation. The input domain should be of
+ * nested type and nested on a parent domain with the 'nested_parent'
+ * flag set.
+ */
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+ u32 pasid, struct dmar_domain *domain)
+{
+ struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
+ pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
+ struct dmar_domain *s2_domain = domain->s2_domain;
+ u16 did = domain_id_iommu(domain, iommu);
+ struct dma_pte *pgd = s2_domain->pgd;
+ struct pasid_entry *pte;
+
+ /* Address width should match the address width supported by hardware */
+ switch (s1_cfg->addr_width) {
+ case ADDR_WIDTH_4LEVEL:
+ break;
+ case ADDR_WIDTH_5LEVEL:
+ if (!cap_fl5lp_support(iommu->cap)) {
+ dev_err_ratelimited(dev,
+ "5-level paging not supported\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n",
+ s1_cfg->addr_width);
+ return -EINVAL;
+ }
+
+ if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) {
+ pr_err_ratelimited("No supervisor request support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) {
+ pr_err_ratelimited("No extended access flag support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ spin_lock(&iommu->lock);
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ spin_unlock(&iommu->lock);
+ return -ENODEV;
+ }
+ if (pasid_pte_is_present(pte)) {
+ spin_unlock(&iommu->lock);
+ return -EBUSY;
+ }
+
+ pasid_clear_entry(pte);
+
+ if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
+ pasid_set_flpm(pte, 1);
+
+ pasid_set_flptr(pte, (uintptr_t)s1_gpgd);
+
+ if (s1_cfg->flags & IOMMU_VTD_S1_SRE) {
+ pasid_set_sre(pte);
+ if (s1_cfg->flags & IOMMU_VTD_S1_WPE)
+ pasid_set_wpe(pte);
+ }
+
+ if (s1_cfg->flags & IOMMU_VTD_S1_EAFE)
+ pasid_set_eafe(pte);
+
+ if (s2_domain->force_snooping)
+ pasid_set_pgsnp(pte);
+
+ pasid_set_slptr(pte, virt_to_phys(pgd));
+ pasid_set_fault_enable(pte);
+ pasid_set_domain_id(pte, did);
+ pasid_set_address_width(pte, s2_domain->agaw);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
+ pasid_set_present(pte);
+ spin_unlock(&iommu->lock);
+
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return 0;
+}
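
The Table 25 guidance quoted in intel_pasid_setup_dirty_tracking() is a decision tree; since that path only ever sees PGTT=SS or Nested, it can issue the domain-selective IOTLB flush unconditionally. A runnable model of the full tree from the comment (hypothetical, encoding the spec text rather than any kernel API):

#include <stdbool.h>
#include <stdio.h>

enum pgtt { PGTT_FL_ONLY, PGTT_SL_ONLY, PGTT_NESTED, PGTT_PT };

static void inv(const char *what) { printf("%s\n", what); }

/* Hypothetical model of the Table 25 guidance quoted above. */
static void invalidate_after_pasid_change(enum pgtt pgtt, bool is_rid_pasid)
{
        inv("PASID-selective-within-Domain PASID-cache invalidation");

        if (pgtt == PGTT_SL_ONLY || pgtt == PGTT_NESTED)
                inv("domain-selective IOTLB invalidation");
        else
                inv("PASID-selective PASID-based IOTLB invalidation");

        if (is_rid_pasid)
                inv("global device-TLB invalidation");
        else
                inv("PASID-based device-TLB invalidation");
}

int main(void)
{
        invalidate_after_pasid_change(PGTT_SL_ONLY, true);
        return 0;
}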
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index 4e9e68c3c388..dd37611175cc 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -106,9 +106,15 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid);
+int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u32 pasid,
+ bool enabled);
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid);
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+ u32 pasid, struct dmar_domain *domain);
void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
struct device *dev, u32 pasid,
bool fault_ignore);
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 8aeba81800c5..34b446146961 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -11,3 +11,4 @@ iommufd-y := \
iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
obj-$(CONFIG_IOMMUFD) += iommufd.o
+obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index ce78c3671539..59d3a07300d9 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -293,7 +293,7 @@ u32 iommufd_device_to_id(struct iommufd_device *idev)
EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD);
static int iommufd_group_setup_msi(struct iommufd_group *igroup,
- struct iommufd_hw_pagetable *hwpt)
+ struct iommufd_hwpt_paging *hwpt_paging)
{
phys_addr_t sw_msi_start = igroup->sw_msi_start;
int rc;
@@ -311,8 +311,9 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup,
* matches what the IRQ layer actually expects in a newly created
* domain.
*/
- if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) {
- rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
+ if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) {
+ rc = iommu_get_msi_cookie(hwpt_paging->common.domain,
+ sw_msi_start);
if (rc)
return rc;
@@ -320,7 +321,31 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup,
* iommu_get_msi_cookie() can only be called once per domain,
* it returns -EBUSY on later calls.
*/
- hwpt->msi_cookie = true;
+ hwpt_paging->msi_cookie = true;
+ }
+ return 0;
+}
+
+static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging,
+ struct iommufd_device *idev)
+{
+ int rc;
+
+ lockdep_assert_held(&idev->igroup->lock);
+
+ rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
+ idev->dev,
+ &idev->igroup->sw_msi_start);
+ if (rc)
+ return rc;
+
+ if (list_empty(&idev->igroup->device_list)) {
+ rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging);
+ if (rc) {
+ iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt,
+ idev->dev);
+ return rc;
+ }
}
return 0;
}
@@ -337,18 +362,12 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
goto err_unlock;
}
- /* Try to upgrade the domain we have */
- if (idev->enforce_cache_coherency) {
- rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+ if (hwpt_is_paging(hwpt)) {
+ rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev);
if (rc)
goto err_unlock;
}
- rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev,
- &idev->igroup->sw_msi_start);
- if (rc)
- goto err_unlock;
-
/*
* Only attach to the group once for the first device that is in the
* group. All the other devices will follow this attachment. The user
@@ -357,10 +376,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
* attachment.
*/
if (list_empty(&idev->igroup->device_list)) {
- rc = iommufd_group_setup_msi(idev->igroup, hwpt);
- if (rc)
- goto err_unresv;
-
rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
if (rc)
goto err_unresv;
@@ -371,7 +386,9 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
mutex_unlock(&idev->igroup->lock);
return 0;
err_unresv:
- iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+ if (hwpt_is_paging(hwpt))
+ iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt,
+ idev->dev);
err_unlock:
mutex_unlock(&idev->igroup->lock);
return rc;
@@ -388,7 +405,9 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev)
iommu_detach_group(hwpt->domain, idev->igroup->group);
idev->igroup->hwpt = NULL;
}
- iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+ if (hwpt_is_paging(hwpt))
+ iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt,
+ idev->dev);
mutex_unlock(&idev->igroup->lock);
/* Caller must destroy hwpt */
@@ -407,14 +426,55 @@ iommufd_device_do_attach(struct iommufd_device *idev,
return NULL;
}
+static void
+iommufd_group_remove_reserved_iova(struct iommufd_group *igroup,
+ struct iommufd_hwpt_paging *hwpt_paging)
+{
+ struct iommufd_device *cur;
+
+ lockdep_assert_held(&igroup->lock);
+
+ list_for_each_entry(cur, &igroup->device_list, group_item)
+ iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev);
+}
+
+static int
+iommufd_group_do_replace_paging(struct iommufd_group *igroup,
+ struct iommufd_hwpt_paging *hwpt_paging)
+{
+ struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt;
+ struct iommufd_device *cur;
+ int rc;
+
+ lockdep_assert_held(&igroup->lock);
+
+ if (!hwpt_is_paging(old_hwpt) ||
+ hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) {
+ list_for_each_entry(cur, &igroup->device_list, group_item) {
+ rc = iopt_table_enforce_dev_resv_regions(
+ &hwpt_paging->ioas->iopt, cur->dev, NULL);
+ if (rc)
+ goto err_unresv;
+ }
+ }
+
+ rc = iommufd_group_setup_msi(igroup, hwpt_paging);
+ if (rc)
+ goto err_unresv;
+ return 0;
+
+err_unresv:
+ iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
+ return rc;
+}
+
static struct iommufd_hw_pagetable *
iommufd_device_do_replace(struct iommufd_device *idev,
struct iommufd_hw_pagetable *hwpt)
{
struct iommufd_group *igroup = idev->igroup;
struct iommufd_hw_pagetable *old_hwpt;
- unsigned int num_devices = 0;
- struct iommufd_device *cur;
+ unsigned int num_devices;
int rc;
mutex_lock(&idev->igroup->lock);
@@ -429,42 +489,27 @@ iommufd_device_do_replace(struct iommufd_device *idev,
return NULL;
}
- /* Try to upgrade the domain we have */
- list_for_each_entry(cur, &igroup->device_list, group_item) {
- num_devices++;
- if (cur->enforce_cache_coherency) {
- rc = iommufd_hw_pagetable_enforce_cc(hwpt);
- if (rc)
- goto err_unlock;
- }
- }
-
old_hwpt = igroup->hwpt;
- if (hwpt->ioas != old_hwpt->ioas) {
- list_for_each_entry(cur, &igroup->device_list, group_item) {
- rc = iopt_table_enforce_dev_resv_regions(
- &hwpt->ioas->iopt, cur->dev, NULL);
- if (rc)
- goto err_unresv;
- }
+ if (hwpt_is_paging(hwpt)) {
+ rc = iommufd_group_do_replace_paging(igroup,
+ to_hwpt_paging(hwpt));
+ if (rc)
+ goto err_unlock;
}
- rc = iommufd_group_setup_msi(idev->igroup, hwpt);
- if (rc)
- goto err_unresv;
-
rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
if (rc)
goto err_unresv;
- if (hwpt->ioas != old_hwpt->ioas) {
- list_for_each_entry(cur, &igroup->device_list, group_item)
- iopt_remove_reserved_iova(&old_hwpt->ioas->iopt,
- cur->dev);
- }
+ if (hwpt_is_paging(old_hwpt) &&
+ (!hwpt_is_paging(hwpt) ||
+ to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas))
+ iommufd_group_remove_reserved_iova(igroup,
+ to_hwpt_paging(old_hwpt));
igroup->hwpt = hwpt;
+ num_devices = list_count_nodes(&igroup->device_list);
/*
* Move the refcounts held by the device_list to the new hwpt. Retain a
* refcount for this thread as the caller will free it.
@@ -478,8 +523,9 @@ iommufd_device_do_replace(struct iommufd_device *idev,
/* Caller must destroy old_hwpt */
return old_hwpt;
err_unresv:
- list_for_each_entry(cur, &igroup->device_list, group_item)
- iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev);
+ if (hwpt_is_paging(hwpt))
+ iommufd_group_remove_reserved_iova(igroup,
+ to_hwpt_paging(hwpt));
err_unlock:
mutex_unlock(&idev->igroup->lock);
return ERR_PTR(rc);
@@ -507,6 +553,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
*/
bool immediate_attach = do_attach == iommufd_device_do_attach;
struct iommufd_hw_pagetable *destroy_hwpt;
+ struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_hw_pagetable *hwpt;
/*
@@ -515,10 +562,11 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
* other.
*/
mutex_lock(&ioas->mutex);
- list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
- if (!hwpt->auto_domain)
+ list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt_paging->auto_domain)
continue;
+ hwpt = &hwpt_paging->common;
if (!iommufd_lock_obj(&hwpt->obj))
continue;
destroy_hwpt = (*do_attach)(idev, hwpt);
@@ -539,12 +587,13 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
goto out_unlock;
}
- hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev,
- immediate_attach);
- if (IS_ERR(hwpt)) {
- destroy_hwpt = ERR_CAST(hwpt);
+ hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0,
+ immediate_attach, NULL);
+ if (IS_ERR(hwpt_paging)) {
+ destroy_hwpt = ERR_CAST(hwpt_paging);
goto out_unlock;
}
+ hwpt = &hwpt_paging->common;
if (!immediate_attach) {
destroy_hwpt = (*do_attach)(idev, hwpt);
@@ -554,7 +603,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
destroy_hwpt = NULL;
}
- hwpt->auto_domain = true;
+ hwpt_paging->auto_domain = true;
*pt_id = hwpt->obj.id;
iommufd_object_finalize(idev->ictx, &hwpt->obj);
@@ -579,7 +628,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id,
return PTR_ERR(pt_obj);
switch (pt_obj->type) {
- case IOMMUFD_OBJ_HW_PAGETABLE: {
+ case IOMMUFD_OBJ_HWPT_NESTED:
+ case IOMMUFD_OBJ_HWPT_PAGING: {
struct iommufd_hw_pagetable *hwpt =
container_of(pt_obj, struct iommufd_hw_pagetable, obj);
@@ -617,8 +667,8 @@ out_put_pt_obj:
/**
* iommufd_device_attach - Connect a device to an iommu_domain
* @idev: device to attach
- * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
- * Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
+ * Output the IOMMUFD_OBJ_HWPT_PAGING ID
*
* This connects the device to an iommu_domain, either automatically or manually
* selected. Once this completes the device could do DMA.
@@ -646,8 +696,8 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
/**
* iommufd_device_replace - Change the device's iommu_domain
* @idev: device to change
- * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
- * Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
+ * Output the IOMMUFD_OBJ_HWPT_PAGING ID
*
* This is the same as::
*
@@ -1185,6 +1235,10 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
*/
cmd->data_len = data_len;
+ cmd->out_capabilities = 0;
+ if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING))
+ cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING;
+
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_free:
kfree(data);
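
The IOMMU_HW_CAP_DIRTY_TRACKING bit exported just above is what user space checks before driving the two new ioctls handled in hw_pagetable.c below. A hypothetical round-trip sketch (struct layouts follow this series' UAPI; sizing the bitmap for [iova, iova + length) at page_size granularity is the caller's job):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Hypothetical sketch: enable dirty tracking on a paging HWPT, let the
 * device run, then harvest the per-page dirty bitmap. 'bitmap' must
 * hold one bit per page_size unit of [iova, iova + length). */
static int harvest_dirty(int iommufd, uint32_t hwpt_id, uint64_t iova,
                         uint64_t length, uint64_t *bitmap)
{
        struct iommu_hwpt_set_dirty_tracking set = {
                .size = sizeof(set),
                .flags = IOMMU_HWPT_DIRTY_TRACKING_ENABLE,
                .hwpt_id = hwpt_id,
        };
        struct iommu_hwpt_get_dirty_bitmap get = {
                .size = sizeof(get),
                .hwpt_id = hwpt_id,
                .iova = iova,
                .length = length,
                .page_size = 4096,
                .data = (uintptr_t)bitmap,
        };

        if (ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set))
                return -1;
        /* ... device DMA happens here ... */
        return ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get);
}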
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index cf2c1504e20d..2abbeafdbd22 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -5,62 +5,87 @@
#include <linux/iommu.h>
#include <uapi/linux/iommufd.h>
+#include "../iommu-priv.h"
#include "iommufd_private.h"
-void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
+void iommufd_hwpt_paging_destroy(struct iommufd_object *obj)
{
- struct iommufd_hw_pagetable *hwpt =
- container_of(obj, struct iommufd_hw_pagetable, obj);
+ struct iommufd_hwpt_paging *hwpt_paging =
+ container_of(obj, struct iommufd_hwpt_paging, common.obj);
- if (!list_empty(&hwpt->hwpt_item)) {
- mutex_lock(&hwpt->ioas->mutex);
- list_del(&hwpt->hwpt_item);
- mutex_unlock(&hwpt->ioas->mutex);
+ if (!list_empty(&hwpt_paging->hwpt_item)) {
+ mutex_lock(&hwpt_paging->ioas->mutex);
+ list_del(&hwpt_paging->hwpt_item);
+ mutex_unlock(&hwpt_paging->ioas->mutex);
- iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+ iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
+ hwpt_paging->common.domain);
}
- if (hwpt->domain)
- iommu_domain_free(hwpt->domain);
+ if (hwpt_paging->common.domain)
+ iommu_domain_free(hwpt_paging->common.domain);
- refcount_dec(&hwpt->ioas->obj.users);
+ refcount_dec(&hwpt_paging->ioas->obj.users);
}
-void iommufd_hw_pagetable_abort(struct iommufd_object *obj)
+void iommufd_hwpt_paging_abort(struct iommufd_object *obj)
{
- struct iommufd_hw_pagetable *hwpt =
- container_of(obj, struct iommufd_hw_pagetable, obj);
+ struct iommufd_hwpt_paging *hwpt_paging =
+ container_of(obj, struct iommufd_hwpt_paging, common.obj);
/* The ioas->mutex must be held until finalize is called. */
- lockdep_assert_held(&hwpt->ioas->mutex);
+ lockdep_assert_held(&hwpt_paging->ioas->mutex);
- if (!list_empty(&hwpt->hwpt_item)) {
- list_del_init(&hwpt->hwpt_item);
- iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+ if (!list_empty(&hwpt_paging->hwpt_item)) {
+ list_del_init(&hwpt_paging->hwpt_item);
+ iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
+ hwpt_paging->common.domain);
}
- iommufd_hw_pagetable_destroy(obj);
+ iommufd_hwpt_paging_destroy(obj);
}
-int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt)
+void iommufd_hwpt_nested_destroy(struct iommufd_object *obj)
{
- if (hwpt->enforce_cache_coherency)
+ struct iommufd_hwpt_nested *hwpt_nested =
+ container_of(obj, struct iommufd_hwpt_nested, common.obj);
+
+ if (hwpt_nested->common.domain)
+ iommu_domain_free(hwpt_nested->common.domain);
+
+ refcount_dec(&hwpt_nested->parent->common.obj.users);
+}
+
+void iommufd_hwpt_nested_abort(struct iommufd_object *obj)
+{
+ iommufd_hwpt_nested_destroy(obj);
+}
+
+static int
+iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging)
+{
+ struct iommu_domain *paging_domain = hwpt_paging->common.domain;
+
+ if (hwpt_paging->enforce_cache_coherency)
return 0;
- if (hwpt->domain->ops->enforce_cache_coherency)
- hwpt->enforce_cache_coherency =
- hwpt->domain->ops->enforce_cache_coherency(
- hwpt->domain);
- if (!hwpt->enforce_cache_coherency)
+ if (paging_domain->ops->enforce_cache_coherency)
+ hwpt_paging->enforce_cache_coherency =
+ paging_domain->ops->enforce_cache_coherency(
+ paging_domain);
+ if (!hwpt_paging->enforce_cache_coherency)
return -EINVAL;
return 0;
}
/**
- * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device
+ * iommufd_hwpt_paging_alloc() - Get a PAGING iommu_domain for a device
* @ictx: iommufd context
* @ioas: IOAS to associate the domain with
* @idev: Device to get an iommu_domain for
+ * @flags: Flags from userspace
* @immediate_attach: True if idev should be attached to the hwpt
+ * @user_data: The user-provided driver-specific data describing the domain
+ *             to create
*
* Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT
* will be linked to the given ioas and upon return the underlying iommu_domain
@@ -70,28 +95,52 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt)
* iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on
* the returned hwpt.
*/
-struct iommufd_hw_pagetable *
-iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
- struct iommufd_device *idev, bool immediate_attach)
+struct iommufd_hwpt_paging *
+iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+ struct iommufd_device *idev, u32 flags,
+ bool immediate_attach,
+ const struct iommu_user_data *user_data)
{
+ const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT |
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const struct iommu_ops *ops = dev_iommu_ops(idev->dev);
+ struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_hw_pagetable *hwpt;
int rc;
lockdep_assert_held(&ioas->mutex);
- hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE);
- if (IS_ERR(hwpt))
- return hwpt;
+ if ((flags || user_data) && !ops->domain_alloc_user)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (flags & ~valid_flags)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ hwpt_paging = __iommufd_object_alloc(
+ ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj);
+ if (IS_ERR(hwpt_paging))
+ return ERR_CAST(hwpt_paging);
+ hwpt = &hwpt_paging->common;
- INIT_LIST_HEAD(&hwpt->hwpt_item);
+ INIT_LIST_HEAD(&hwpt_paging->hwpt_item);
/* Pairs with iommufd_hw_pagetable_destroy() */
refcount_inc(&ioas->obj.users);
- hwpt->ioas = ioas;
+ hwpt_paging->ioas = ioas;
+ hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
- hwpt->domain = iommu_domain_alloc(idev->dev->bus);
- if (!hwpt->domain) {
- rc = -ENOMEM;
- goto out_abort;
+ if (ops->domain_alloc_user) {
+ hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL,
+ user_data);
+ if (IS_ERR(hwpt->domain)) {
+ rc = PTR_ERR(hwpt->domain);
+ hwpt->domain = NULL;
+ goto out_abort;
+ }
+ } else {
+ hwpt->domain = iommu_domain_alloc(idev->dev->bus);
+ if (!hwpt->domain) {
+ rc = -ENOMEM;
+ goto out_abort;
+ }
}
/*
@@ -100,9 +149,16 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
* doing any maps. It is an iommu driver bug to report
* IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on
* a new domain.
+ *
+ * The cache coherency mode must be configured here and must remain
+ * unchanged afterwards. Note that a HWPT (non-CC) created for a
+ * device (non-CC) can later be reused by another device (either
+ * non-CC or CC). However, a HWPT (CC) created for a CC device cannot
+ * be reused by a non-CC device, only by other CC devices; user space
+ * would instead need to allocate a separate HWPT (non-CC).
*/
if (idev->enforce_cache_coherency) {
- rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+ rc = iommufd_hwpt_paging_enforce_cc(hwpt_paging);
if (WARN_ON(rc))
goto out_abort;
}
@@ -119,11 +175,11 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
goto out_abort;
}
- rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain);
+ rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain);
if (rc)
goto out_detach;
- list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
- return hwpt;
+ list_add_tail(&hwpt_paging->hwpt_item, &ioas->hwpt_list);
+ return hwpt_paging;
out_detach:
if (immediate_attach)
@@ -133,32 +189,120 @@ out_abort:
return ERR_PTR(rc);
}
+/**
+ * iommufd_hwpt_nested_alloc() - Get a NESTED iommu_domain for a device
+ * @ictx: iommufd context
+ * @parent: Parent PAGING-type hwpt to associate the domain with
+ * @idev: Device to get an iommu_domain for
+ * @flags: Flags from userspace
+ * @user_data: user_data pointer. Must be valid
+ *
+ * Allocate a new iommu_domain (must be IOMMU_DOMAIN_NESTED) and return it as
+ * a NESTED hw_pagetable. The given parent PAGING-type hwpt must be capable of
+ * being a parent.
+ */
+static struct iommufd_hwpt_nested *
+iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
+ struct iommufd_hwpt_paging *parent,
+ struct iommufd_device *idev, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ const struct iommu_ops *ops = dev_iommu_ops(idev->dev);
+ struct iommufd_hwpt_nested *hwpt_nested;
+ struct iommufd_hw_pagetable *hwpt;
+ int rc;
+
+ if (flags || !user_data->len || !ops->domain_alloc_user)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (parent->auto_domain || !parent->nest_parent)
+ return ERR_PTR(-EINVAL);
+
+ hwpt_nested = __iommufd_object_alloc(
+ ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj);
+ if (IS_ERR(hwpt_nested))
+ return ERR_CAST(hwpt_nested);
+ hwpt = &hwpt_nested->common;
+
+ refcount_inc(&parent->common.obj.users);
+ hwpt_nested->parent = parent;
+
+ hwpt->domain = ops->domain_alloc_user(idev->dev, flags,
+ parent->common.domain, user_data);
+ if (IS_ERR(hwpt->domain)) {
+ rc = PTR_ERR(hwpt->domain);
+ hwpt->domain = NULL;
+ goto out_abort;
+ }
+
+ if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
+ rc = -EINVAL;
+ goto out_abort;
+ }
+ return hwpt_nested;
+
+out_abort:
+ iommufd_object_abort_and_destroy(ictx, &hwpt->obj);
+ return ERR_PTR(rc);
+}
+
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
{
struct iommu_hwpt_alloc *cmd = ucmd->cmd;
+ const struct iommu_user_data user_data = {
+ .type = cmd->data_type,
+ .uptr = u64_to_user_ptr(cmd->data_uptr),
+ .len = cmd->data_len,
+ };
struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_ioas *ioas = NULL;
+ struct iommufd_object *pt_obj;
struct iommufd_device *idev;
- struct iommufd_ioas *ioas;
int rc;
- if (cmd->flags || cmd->__reserved)
+ if (cmd->__reserved)
return -EOPNOTSUPP;
+ if (cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len)
+ return -EINVAL;
idev = iommufd_get_device(ucmd, cmd->dev_id);
if (IS_ERR(idev))
return PTR_ERR(idev);
- ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id);
- if (IS_ERR(ioas)) {
- rc = PTR_ERR(ioas);
+ pt_obj = iommufd_get_object(ucmd->ictx, cmd->pt_id, IOMMUFD_OBJ_ANY);
+ if (IS_ERR(pt_obj)) {
+ rc = -EINVAL;
goto out_put_idev;
}
- mutex_lock(&ioas->mutex);
- hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false);
- if (IS_ERR(hwpt)) {
- rc = PTR_ERR(hwpt);
- goto out_unlock;
+ if (pt_obj->type == IOMMUFD_OBJ_IOAS) {
+ struct iommufd_hwpt_paging *hwpt_paging;
+
+ ioas = container_of(pt_obj, struct iommufd_ioas, obj);
+ mutex_lock(&ioas->mutex);
+ hwpt_paging = iommufd_hwpt_paging_alloc(
+ ucmd->ictx, ioas, idev, cmd->flags, false,
+ user_data.len ? &user_data : NULL);
+ if (IS_ERR(hwpt_paging)) {
+ rc = PTR_ERR(hwpt_paging);
+ goto out_unlock;
+ }
+ hwpt = &hwpt_paging->common;
+ } else if (pt_obj->type == IOMMUFD_OBJ_HWPT_PAGING) {
+ struct iommufd_hwpt_nested *hwpt_nested;
+
+ hwpt_nested = iommufd_hwpt_nested_alloc(
+ ucmd->ictx,
+ container_of(pt_obj, struct iommufd_hwpt_paging,
+ common.obj),
+ idev, cmd->flags, &user_data);
+ if (IS_ERR(hwpt_nested)) {
+ rc = PTR_ERR(hwpt_nested);
+ goto out_unlock;
+ }
+ hwpt = &hwpt_nested->common;
+ } else {
+ rc = -EINVAL;
+ goto out_put_pt;
}
cmd->out_hwpt_id = hwpt->obj.id;
@@ -171,9 +315,59 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
out_hwpt:
iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj);
out_unlock:
- mutex_unlock(&ioas->mutex);
- iommufd_put_object(&ioas->obj);
+ if (ioas)
+ mutex_unlock(&ioas->mutex);
+out_put_pt:
+ iommufd_put_object(pt_obj);
out_put_idev:
iommufd_put_object(&idev->obj);
return rc;
}
+
+int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd;
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_ioas *ioas;
+ int rc = -EOPNOTSUPP;
+ bool enable;
+
+ if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE)
+ return rc;
+
+ hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+ if (IS_ERR(hwpt_paging))
+ return PTR_ERR(hwpt_paging);
+
+ ioas = hwpt_paging->ioas;
+ enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE;
+
+ rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain,
+ enable);
+
+ iommufd_put_object(&hwpt_paging->common.obj);
+ return rc;
+}
+
+int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd;
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_ioas *ioas;
+ int rc = -EOPNOTSUPP;
+
+ if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) ||
+ cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+ if (IS_ERR(hwpt_paging))
+ return PTR_ERR(hwpt_paging);
+
+ ioas = hwpt_paging->ioas;
+ rc = iopt_read_and_clear_dirty_data(
+ &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd);
+
+ iommufd_put_object(&hwpt_paging->common.obj);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 3a598182b761..504ac1b01b2d 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -15,6 +15,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>
+#include <uapi/linux/iommufd.h>
#include "io_pagetable.h"
#include "double_span.h"
@@ -221,6 +222,18 @@ static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
return 0;
}
+static struct iopt_area *iopt_area_alloc(void)
+{
+ struct iopt_area *area;
+
+ area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ if (!area)
+ return NULL;
+ RB_CLEAR_NODE(&area->node.rb);
+ RB_CLEAR_NODE(&area->pages_node.rb);
+ return area;
+}
+
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
struct list_head *pages_list,
unsigned long length, unsigned long *dst_iova,
@@ -231,7 +244,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
int rc = 0;
list_for_each_entry(elm, pages_list, next) {
- elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
+ elm->area = iopt_area_alloc();
if (!elm->area)
return -ENOMEM;
}
@@ -412,6 +425,177 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
return 0;
}
+struct iova_bitmap_fn_arg {
+ unsigned long flags;
+ struct io_pagetable *iopt;
+ struct iommu_domain *domain;
+ struct iommu_dirty_bitmap *dirty;
+};
+
+static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
+ unsigned long iova, size_t length,
+ void *opaque)
+{
+ struct iopt_area *area;
+ struct iopt_area_contig_iter iter;
+ struct iova_bitmap_fn_arg *arg = opaque;
+ struct iommu_domain *domain = arg->domain;
+ struct iommu_dirty_bitmap *dirty = arg->dirty;
+ const struct iommu_dirty_ops *ops = domain->dirty_ops;
+ unsigned long last_iova = iova + length - 1;
+ unsigned long flags = arg->flags;
+ int ret;
+
+ iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
+ unsigned long last = min(last_iova, iopt_area_last_iova(area));
+
+ ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
+ last - iter.cur_iova + 1, flags,
+ dirty);
+ if (ret)
+ return ret;
+ }
+
+ if (!iopt_area_contig_done(&iter))
+ return -EINVAL;
+ return 0;
+}
+
+static int
+iommu_read_and_clear_dirty(struct iommu_domain *domain,
+ struct io_pagetable *iopt, unsigned long flags,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+ const struct iommu_dirty_ops *ops = domain->dirty_ops;
+ struct iommu_iotlb_gather gather;
+ struct iommu_dirty_bitmap dirty;
+ struct iova_bitmap_fn_arg arg;
+ struct iova_bitmap *iter;
+ int ret = 0;
+
+ if (!ops || !ops->read_and_clear_dirty)
+ return -EOPNOTSUPP;
+
+ iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
+ bitmap->page_size,
+ u64_to_user_ptr(bitmap->data));
+ if (IS_ERR(iter))
+ return -ENOMEM;
+
+ iommu_dirty_bitmap_init(&dirty, iter, &gather);
+
+ arg.flags = flags;
+ arg.iopt = iopt;
+ arg.domain = domain;
+ arg.dirty = &dirty;
+ iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
+
+ if (!(flags & IOMMU_DIRTY_NO_CLEAR))
+ iommu_iotlb_sync(domain, &gather);
+
+ iova_bitmap_free(iter);
+
+ return ret;
+}
+
+int iommufd_check_iova_range(struct io_pagetable *iopt,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+ size_t iommu_pgsize = iopt->iova_alignment;
+ u64 last_iova;
+
+ if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
+ return -EOVERFLOW;
+
+ if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
+ return -EOVERFLOW;
+
+ if ((bitmap->iova & (iommu_pgsize - 1)) ||
+ ((last_iova + 1) & (iommu_pgsize - 1)))
+ return -EINVAL;
+
+ if (!bitmap->page_size)
+ return -EINVAL;
+
+ if ((bitmap->iova & (bitmap->page_size - 1)) ||
+ ((last_iova + 1) & (bitmap->page_size - 1)))
+ return -EINVAL;
+
+ return 0;
+}
+
+int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
+ struct iommu_domain *domain,
+ unsigned long flags,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+ int ret;
+
+ ret = iommufd_check_iova_range(iopt, bitmap);
+ if (ret)
+ return ret;
+
+ down_read(&iopt->iova_rwsem);
+ ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
+ up_read(&iopt->iova_rwsem);
+
+ return ret;
+}
+
+static int iopt_clear_dirty_data(struct io_pagetable *iopt,
+ struct iommu_domain *domain)
+{
+ const struct iommu_dirty_ops *ops = domain->dirty_ops;
+ struct iommu_iotlb_gather gather;
+ struct iommu_dirty_bitmap dirty;
+ struct iopt_area *area;
+ int ret = 0;
+
+ lockdep_assert_held_read(&iopt->iova_rwsem);
+
+ iommu_dirty_bitmap_init(&dirty, NULL, &gather);
+
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ if (!area->pages)
+ continue;
+
+ ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
+ iopt_area_length(area), 0,
+ &dirty);
+ if (ret)
+ break;
+ }
+
+ iommu_iotlb_sync(domain, &gather);
+ return ret;
+}
+
+int iopt_set_dirty_tracking(struct io_pagetable *iopt,
+ struct iommu_domain *domain, bool enable)
+{
+ const struct iommu_dirty_ops *ops = domain->dirty_ops;
+ int ret = 0;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ down_read(&iopt->iova_rwsem);
+
+ /* Clear dirty bits from PTEs to ensure a clean snapshot */
+ if (enable) {
+ ret = iopt_clear_dirty_data(iopt, domain);
+ if (ret)
+ goto out_unlock;
+ }
+
+ ret = ops->set_dirty_tracking(domain, enable);
+
+out_unlock:
+ up_read(&iopt->iova_rwsem);
+ return ret;
+}
+
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
unsigned long length, struct list_head *pages_list)
{
@@ -1005,11 +1189,11 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
iopt_area_start_byte(area, new_start) & (alignment - 1))
return -EINVAL;
- lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ lhs = iopt_area_alloc();
if (!lhs)
return -ENOMEM;
- rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ rhs = iopt_area_alloc();
if (!rhs) {
rc = -ENOMEM;
goto err_free_lhs;
@@ -1048,6 +1232,16 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
if (WARN_ON(rc))
goto err_remove_lhs;
+ /*
+ * If the original area has filled a domain, domains_itree has to be
+ * updated.
+ */
+ if (area->storage_domain) {
+ interval_tree_remove(&area->pages_node, &pages->domains_itree);
+ interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
+ interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
+ }
+
lhs->storage_domain = area->storage_domain;
lhs->pages = area->pages;
rhs->storage_domain = area->storage_domain;
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 2c58670011fe..a74cfefffbc6 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -8,6 +8,9 @@
#include <linux/xarray.h>
#include <linux/refcount.h>
#include <linux/uaccess.h>
+#include <linux/iommu.h>
+#include <linux/iova_bitmap.h>
+#include <uapi/linux/iommufd.h>
struct iommu_domain;
struct iommu_group;
@@ -70,6 +73,13 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
unsigned long length, unsigned long *unmapped);
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);
+int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
+ struct iommu_domain *domain,
+ unsigned long flags,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap);
+int iopt_set_dirty_tracking(struct io_pagetable *iopt,
+ struct iommu_domain *domain, bool enable);
+
void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
unsigned long length);
int iopt_table_add_domain(struct io_pagetable *iopt,
@@ -113,7 +123,8 @@ enum iommufd_object_type {
IOMMUFD_OBJ_NONE,
IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
IOMMUFD_OBJ_DEVICE,
- IOMMUFD_OBJ_HW_PAGETABLE,
+ IOMMUFD_OBJ_HWPT_PAGING,
+ IOMMUFD_OBJ_HWPT_NESTED,
IOMMUFD_OBJ_IOAS,
IOMMUFD_OBJ_ACCESS,
#ifdef CONFIG_IOMMUFD_TEST
@@ -171,7 +182,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
size_t size,
enum iommufd_object_type type);
-#define iommufd_object_alloc(ictx, ptr, type) \
+#define __iommufd_object_alloc(ictx, ptr, type, obj) \
container_of(_iommufd_object_alloc( \
ictx, \
sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \
@@ -180,6 +191,9 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
type), \
typeof(*(ptr)), obj)
+#define iommufd_object_alloc(ictx, ptr, type) \
+ __iommufd_object_alloc(ictx, ptr, type, obj)
+
/*
* The IO Address Space (IOAS) pagetable is a virtual page table backed by the
* io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
@@ -222,6 +236,8 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx);
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
+int iommufd_check_iova_range(struct io_pagetable *iopt,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap);
/*
* A HW pagetable is called an iommu_domain inside the kernel. This user object
@@ -231,35 +247,75 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
*/
struct iommufd_hw_pagetable {
struct iommufd_object obj;
- struct iommufd_ioas *ioas;
struct iommu_domain *domain;
+};
+
+struct iommufd_hwpt_paging {
+ struct iommufd_hw_pagetable common;
+ struct iommufd_ioas *ioas;
bool auto_domain : 1;
bool enforce_cache_coherency : 1;
bool msi_cookie : 1;
+ bool nest_parent : 1;
/* Head at iommufd_ioas::hwpt_list */
struct list_head hwpt_item;
};
-struct iommufd_hw_pagetable *
-iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
- struct iommufd_device *idev, bool immediate_attach);
-int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt);
+struct iommufd_hwpt_nested {
+ struct iommufd_hw_pagetable common;
+ struct iommufd_hwpt_paging *parent;
+};
+
+static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
+{
+ return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING;
+}
+
+static inline struct iommufd_hwpt_paging *
+to_hwpt_paging(struct iommufd_hw_pagetable *hwpt)
+{
+ return container_of(hwpt, struct iommufd_hwpt_paging, common);
+}
+
+static inline struct iommufd_hwpt_paging *
+iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_HWPT_PAGING),
+ struct iommufd_hwpt_paging, common.obj);
+}
+int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd);
+int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd);
+
+struct iommufd_hwpt_paging *
+iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+ struct iommufd_device *idev, u32 flags,
+ bool immediate_attach,
+ const struct iommu_user_data *user_data);
int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
struct iommufd_device *idev);
struct iommufd_hw_pagetable *
iommufd_hw_pagetable_detach(struct iommufd_device *idev);
-void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
-void iommufd_hw_pagetable_abort(struct iommufd_object *obj);
+void iommufd_hwpt_paging_destroy(struct iommufd_object *obj);
+void iommufd_hwpt_paging_abort(struct iommufd_object *obj);
+void iommufd_hwpt_nested_destroy(struct iommufd_object *obj);
+void iommufd_hwpt_nested_abort(struct iommufd_object *obj);
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd);
static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
struct iommufd_hw_pagetable *hwpt)
{
- lockdep_assert_not_held(&hwpt->ioas->mutex);
- if (hwpt->auto_domain)
- iommufd_object_deref_user(ictx, &hwpt->obj);
- else
- refcount_dec(&hwpt->obj.users);
+ if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) {
+ struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt);
+
+ lockdep_assert_not_held(&hwpt_paging->ioas->mutex);
+
+ if (hwpt_paging->auto_domain) {
+ iommufd_object_deref_user(ictx, &hwpt->obj);
+ return;
+ }
+ }
+ refcount_dec(&hwpt->obj.users);
}
struct iommufd_group {
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index 3f3644375bf1..7910fbe1962d 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -19,6 +19,8 @@ enum {
IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT,
IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE,
IOMMU_TEST_OP_ACCESS_REPLACE_IOAS,
+ IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS,
+ IOMMU_TEST_OP_DIRTY,
};
enum {
@@ -40,6 +42,15 @@ enum {
MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0,
};
+enum {
+ MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0,
+};
+
+enum {
+ MOCK_NESTED_DOMAIN_IOTLB_ID_MAX = 3,
+ MOCK_NESTED_DOMAIN_IOTLB_NUM = 4,
+};
+
struct iommu_test_cmd {
__u32 size;
__u32 op;
@@ -57,6 +68,13 @@ struct iommu_test_cmd {
__u32 out_idev_id;
} mock_domain;
struct {
+ __u32 out_stdev_id;
+ __u32 out_hwpt_id;
+ __u32 out_idev_id;
+ /* Expand mock_domain to set mock device flags */
+ __u32 dev_flags;
+ } mock_domain_flags;
+ struct {
__u32 pt_id;
} mock_domain_replace;
struct {
@@ -95,6 +113,14 @@ struct iommu_test_cmd {
struct {
__u32 ioas_id;
} access_replace_ioas;
+ struct {
+ __u32 flags;
+ __aligned_u64 iova;
+ __aligned_u64 length;
+ __aligned_u64 page_size;
+ __aligned_u64 uptr;
+ __aligned_u64 out_nr_dirty;
+ } dirty;
};
__u32 last;
};
@@ -109,4 +135,17 @@ struct iommu_test_hw_info {
__u32 test_reg;
};
+/* Should not be equal to any defined value in enum iommu_hwpt_data_type */
+#define IOMMU_HWPT_DATA_SELFTEST 0xdead
+#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef
+
+/**
+ * struct iommu_hwpt_selftest
+ *
+ * @iotlb: default mock iotlb value, IOMMU_TEST_IOTLB_DEFAULT
+ */
+struct iommu_hwpt_selftest {
+ __u32 iotlb;
+};
+
#endif
diff --git a/drivers/vfio/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c
index 0848f920efb7..0a92c9eeaf7f 100644
--- a/drivers/vfio/iova_bitmap.c
+++ b/drivers/iommu/iommufd/iova_bitmap.c
@@ -268,6 +268,7 @@ err:
iova_bitmap_free(bitmap);
return ERR_PTR(rc);
}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD);
/**
* iova_bitmap_free() - Frees an IOVA bitmap object
@@ -289,6 +290,7 @@ void iova_bitmap_free(struct iova_bitmap *bitmap)
kfree(bitmap);
}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD);
/*
* Returns the remaining bitmap indexes from mapped_total_index to process for
@@ -387,6 +389,7 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
return ret;
}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD);
/**
* iova_bitmap_set() - Records an IOVA range in bitmap
@@ -420,4 +423,4 @@ void iova_bitmap_set(struct iova_bitmap *bitmap,
cur_bit += nbits;
} while (cur_bit <= last_bit);
}
-EXPORT_SYMBOL_GPL(iova_bitmap_set);
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index e71523cbd0de..45b9d40773b1 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -307,6 +307,8 @@ union ucmd_buffer {
struct iommu_destroy destroy;
struct iommu_hw_info info;
struct iommu_hwpt_alloc hwpt;
+ struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
+ struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
struct iommu_ioas_alloc alloc;
struct iommu_ioas_allow_iovas allow_iovas;
struct iommu_ioas_copy ioas_copy;
@@ -342,6 +344,10 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
__reserved),
IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
__reserved),
+ IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
+ struct iommu_hwpt_get_dirty_bitmap, data),
+ IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
+ struct iommu_hwpt_set_dirty_tracking, __reserved),
IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
struct iommu_ioas_alloc, out_ioas_id),
IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
@@ -482,9 +488,13 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_IOAS] = {
.destroy = iommufd_ioas_destroy,
},
- [IOMMUFD_OBJ_HW_PAGETABLE] = {
- .destroy = iommufd_hw_pagetable_destroy,
- .abort = iommufd_hw_pagetable_abort,
+ [IOMMUFD_OBJ_HWPT_PAGING] = {
+ .destroy = iommufd_hwpt_paging_destroy,
+ .abort = iommufd_hwpt_paging_abort,
+ },
+ [IOMMUFD_OBJ_HWPT_NESTED] = {
+ .destroy = iommufd_hwpt_nested_destroy,
+ .abort = iommufd_hwpt_nested_abort,
},
#ifdef CONFIG_IOMMUFD_TEST
[IOMMUFD_OBJ_SELFTEST] = {
@@ -552,5 +562,6 @@ MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
+MODULE_IMPORT_NS(IOMMUFD);
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 8d9aa297c117..528f356238b3 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -1507,6 +1507,8 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
area, domain, iopt_area_index(area),
iopt_area_last_index(area));
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb));
interval_tree_remove(&area->pages_node, &pages->domains_itree);
iopt_area_unfill_domain(area, pages, area->storage_domain);
area->storage_domain = NULL;
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 56506d5753f1..d43a87737c1e 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -20,10 +20,13 @@
static DECLARE_FAULT_ATTR(fail_iommufd);
static struct dentry *dbgfs_root;
static struct platform_device *selftest_iommu_dev;
+static const struct iommu_ops mock_ops;
+static struct iommu_domain_ops domain_nested_ops;
size_t iommufd_test_memory_limit = 65536;
enum {
+ MOCK_DIRTY_TRACK = 1,
MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
/*
@@ -36,6 +39,7 @@ enum {
_MOCK_PFN_START = MOCK_PFN_MASK + 1,
MOCK_PFN_START_IOVA = _MOCK_PFN_START,
MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
+ MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1,
};
/*
@@ -86,16 +90,24 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
}
struct mock_iommu_domain {
+ unsigned long flags;
struct iommu_domain domain;
struct xarray pfns;
};
+struct mock_iommu_domain_nested {
+ struct iommu_domain domain;
+ struct mock_iommu_domain *parent;
+ u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM];
+};
+
enum selftest_obj_type {
TYPE_IDEV,
};
struct mock_dev {
struct device dev;
+ unsigned long flags;
};
struct selftest_obj {
@@ -118,6 +130,11 @@ static void mock_domain_blocking_free(struct iommu_domain *domain)
static int mock_domain_nop_attach(struct iommu_domain *domain,
struct device *dev)
{
+ struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+
+ if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY))
+ return -EINVAL;
+
return 0;
}
@@ -146,15 +163,70 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
return info;
}
-static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
+static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable)
{
- struct mock_iommu_domain *mock;
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ unsigned long flags = mock->flags;
- if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED)
- return &mock_blocking_domain;
+ if (enable && !domain->dirty_ops)
+ return -EINVAL;
- if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED)
- return NULL;
+ /* No change? */
+ if (!(enable ^ !!(flags & MOCK_DIRTY_TRACK)))
+ return 0;
+
+ flags = (enable ? flags | MOCK_DIRTY_TRACK : flags & ~MOCK_DIRTY_TRACK);
+
+ mock->flags = flags;
+ return 0;
+}
+
+static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ unsigned long i, max = size / MOCK_IO_PAGE_SIZE;
+ void *ent, *old;
+
+ if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap)
+ return -EINVAL;
+
+ for (i = 0; i < max; i++) {
+ unsigned long cur = iova + i * MOCK_IO_PAGE_SIZE;
+
+ ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE);
+ if (ent && (xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) {
+ /* Clear dirty */
+ if (!(flags & IOMMU_DIRTY_NO_CLEAR)) {
+ unsigned long val;
+
+ val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA;
+ old = xa_store(&mock->pfns,
+ cur / MOCK_IO_PAGE_SIZE,
+ xa_mk_value(val), GFP_KERNEL);
+ WARN_ON_ONCE(ent != old);
+ }
+ iommu_dirty_bitmap_record(dirty, cur,
+ MOCK_IO_PAGE_SIZE);
+ }
+ }
+
+ return 0;
+}
+
+const struct iommu_dirty_ops dirty_ops = {
+ .set_dirty_tracking = mock_domain_set_dirty_tracking,
+ .read_and_clear_dirty = mock_domain_read_and_clear_dirty,
+};
+
+static struct iommu_domain *mock_domain_alloc_paging(struct device *dev)
+{
+ struct mock_iommu_domain *mock;
mock = kzalloc(sizeof(*mock), GFP_KERNEL);
if (!mock)
@@ -162,10 +234,87 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
+ mock->domain.ops = mock_ops.default_domain_ops;
+ mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
xa_init(&mock->pfns);
return &mock->domain;
}
+static struct iommu_domain *
+__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent,
+ const struct iommu_hwpt_selftest *user_cfg)
+{
+ struct mock_iommu_domain_nested *mock_nested;
+ int i;
+
+ mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL);
+ if (!mock_nested)
+ return ERR_PTR(-ENOMEM);
+ mock_nested->parent = mock_parent;
+ mock_nested->domain.ops = &domain_nested_ops;
+ mock_nested->domain.type = IOMMU_DOMAIN_NESTED;
+ for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++)
+ mock_nested->iotlb[i] = user_cfg->iotlb;
+ return &mock_nested->domain;
+}
+
+static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
+{
+ if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED)
+ return &mock_blocking_domain;
+ if (iommu_domain_type == IOMMU_DOMAIN_UNMANAGED)
+ return mock_domain_alloc_paging(NULL);
+ return NULL;
+}
+
+static struct iommu_domain *
+mock_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ struct mock_iommu_domain *mock_parent;
+ struct iommu_hwpt_selftest user_cfg;
+ int rc;
+
+ /* must be mock_domain */
+ if (!parent) {
+ struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
+ struct iommu_domain *domain;
+
+ if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT |
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (user_data || (has_dirty_flag && no_dirty_ops))
+ return ERR_PTR(-EOPNOTSUPP);
+ domain = mock_domain_alloc_paging(NULL);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+ if (has_dirty_flag)
+ container_of(domain, struct mock_iommu_domain, domain)
+ ->domain.dirty_ops = &dirty_ops;
+ return domain;
+ }
+
+ /* must be mock_domain_nested */
+ if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (parent->ops != mock_ops.default_domain_ops)
+ return ERR_PTR(-EINVAL);
+
+ mock_parent = container_of(parent, struct mock_iommu_domain, domain);
+
+ rc = iommu_copy_struct_from_user(&user_cfg, user_data,
+ IOMMU_HWPT_DATA_SELFTEST, iotlb);
+ if (rc)
+ return ERR_PTR(rc);
+
+ return __mock_domain_alloc_nested(mock_parent, &user_cfg);
+}
+
static void mock_domain_free(struct iommu_domain *domain)
{
struct mock_iommu_domain *mock =
@@ -243,7 +392,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- WARN_ON(!ent);
+
/*
* iommufd generates unmaps that must be a strict
* superset of the maps performed. So every starting
* and ending IOVA will have been a starting/ending IOVA
* passed to map_pages.
*/
if (first) {
- WARN_ON(!(xa_to_value(ent) &
- MOCK_PFN_START_IOVA));
+ WARN_ON(ent && !(xa_to_value(ent) &
+ MOCK_PFN_START_IOVA));
first = false;
}
if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
- WARN_ON(!(xa_to_value(ent) &
- MOCK_PFN_LAST_IOVA));
+ WARN_ON(ent && !(xa_to_value(ent) &
+ MOCK_PFN_LAST_IOVA));
iova += MOCK_IO_PAGE_SIZE;
ret += MOCK_IO_PAGE_SIZE;
@@ -283,7 +432,18 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
{
- return cap == IOMMU_CAP_CACHE_COHERENCY;
+ struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ return true;
+ case IOMMU_CAP_DIRTY_TRACKING:
+ return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY);
+ default:
+ break;
+ }
+
+ return false;
}
static void mock_domain_set_plaform_dma_ops(struct device *dev)
@@ -307,6 +467,7 @@ static const struct iommu_ops mock_ops = {
.pgsize_bitmap = MOCK_IO_PAGE_SIZE,
.hw_info = mock_domain_hw_info,
.domain_alloc = mock_domain_alloc,
+ .domain_alloc_user = mock_domain_alloc_user,
.capable = mock_domain_capable,
.set_platform_dma_ops = mock_domain_set_plaform_dma_ops,
.device_group = generic_device_group,
@@ -321,19 +482,41 @@ static const struct iommu_ops mock_ops = {
},
};
+static void mock_domain_free_nested(struct iommu_domain *domain)
+{
+ struct mock_iommu_domain_nested *mock_nested =
+ container_of(domain, struct mock_iommu_domain_nested, domain);
+
+ kfree(mock_nested);
+}
+
+static struct iommu_domain_ops domain_nested_ops = {
+ .free = mock_domain_free_nested,
+ .attach_dev = mock_domain_nop_attach,
+};
+
static inline struct iommufd_hw_pagetable *
-get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
- struct mock_iommu_domain **mock)
+__get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, u32 hwpt_type)
{
- struct iommufd_hw_pagetable *hwpt;
struct iommufd_object *obj;
- obj = iommufd_get_object(ucmd->ictx, mockpt_id,
- IOMMUFD_OBJ_HW_PAGETABLE);
+ obj = iommufd_get_object(ucmd->ictx, mockpt_id, hwpt_type);
if (IS_ERR(obj))
return ERR_CAST(obj);
- hwpt = container_of(obj, struct iommufd_hw_pagetable, obj);
- if (hwpt->domain->ops != mock_ops.default_domain_ops) {
+ return container_of(obj, struct iommufd_hw_pagetable, obj);
+}
+
+static inline struct iommufd_hw_pagetable *
+get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+ struct mock_iommu_domain **mock)
+{
+ struct iommufd_hw_pagetable *hwpt;
+
+ hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_PAGING);
+ if (IS_ERR(hwpt))
+ return hwpt;
+ if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED ||
+ hwpt->domain->ops != mock_ops.default_domain_ops) {
iommufd_put_object(&hwpt->obj);
return ERR_PTR(-EINVAL);
}
@@ -341,6 +524,25 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
return hwpt;
}
+static inline struct iommufd_hw_pagetable *
+get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+ struct mock_iommu_domain_nested **mock_nested)
+{
+ struct iommufd_hw_pagetable *hwpt;
+
+ hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_NESTED);
+ if (IS_ERR(hwpt))
+ return hwpt;
+ if (hwpt->domain->type != IOMMU_DOMAIN_NESTED ||
+ hwpt->domain->ops != &domain_nested_ops) {
+ iommufd_put_object(&hwpt->obj);
+ return ERR_PTR(-EINVAL);
+ }
+ *mock_nested = container_of(hwpt->domain,
+ struct mock_iommu_domain_nested, domain);
+ return hwpt;
+}
+
struct mock_bus_type {
struct bus_type bus;
struct notifier_block nb;
@@ -362,16 +564,20 @@ static void mock_dev_release(struct device *dev)
kfree(mdev);
}
-static struct mock_dev *mock_dev_create(void)
+static struct mock_dev *mock_dev_create(unsigned long dev_flags)
{
struct mock_dev *mdev;
int rc;
+ if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY))
+ return ERR_PTR(-EINVAL);
+
mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
if (!mdev)
return ERR_PTR(-ENOMEM);
device_initialize(&mdev->dev);
+ mdev->flags = dev_flags;
mdev->dev.release = mock_dev_release;
mdev->dev.bus = &iommufd_mock_bus_type.bus;
@@ -407,6 +613,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
struct iommufd_device *idev;
struct selftest_obj *sobj;
u32 pt_id = cmd->id;
+ u32 dev_flags = 0;
u32 idev_id;
int rc;
@@ -417,7 +624,10 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
sobj->idev.ictx = ucmd->ictx;
sobj->type = TYPE_IDEV;
- sobj->idev.mock_dev = mock_dev_create();
+ if (cmd->op == IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS)
+ dev_flags = cmd->mock_domain_flags.dev_flags;
+
+ sobj->idev.mock_dev = mock_dev_create(dev_flags);
if (IS_ERR(sobj->idev.mock_dev)) {
rc = PTR_ERR(sobj->idev.mock_dev);
goto out_sobj;
@@ -977,6 +1187,73 @@ static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE);
static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH ==
__IOMMUFD_ACCESS_RW_SLOW_PATH);
+static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
+ unsigned long iova, size_t length,
+ unsigned long page_size, void __user *uptr,
+ u32 flags)
+{
+ unsigned long bitmap_size, i, max;
+ struct iommu_test_cmd *cmd = ucmd->cmd;
+ struct iommufd_hw_pagetable *hwpt;
+ struct mock_iommu_domain *mock;
+ int rc, count = 0;
+ void *tmp;
+
+ if (!page_size || !length || iova % page_size || length % page_size ||
+ !uptr)
+ return -EINVAL;
+
+ hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
+ if (IS_ERR(hwpt))
+ return PTR_ERR(hwpt);
+
+ if (!(mock->flags & MOCK_DIRTY_TRACK)) {
+ rc = -EINVAL;
+ goto out_put;
+ }
+
+ max = length / page_size;
+ bitmap_size = DIV_ROUND_UP(max, BITS_PER_BYTE);
+
+ tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT);
+ if (!tmp) {
+ rc = -ENOMEM;
+ goto out_put;
+ }
+
+ if (copy_from_user(tmp, uptr, bitmap_size)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+
+ for (i = 0; i < max; i++) {
+ unsigned long cur = iova + i * page_size;
+ void *ent, *old;
+
+ if (!test_bit(i, (unsigned long *)tmp))
+ continue;
+
+ ent = xa_load(&mock->pfns, cur / page_size);
+ if (ent) {
+ unsigned long val;
+
+ val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA;
+ old = xa_store(&mock->pfns, cur / page_size,
+ xa_mk_value(val), GFP_KERNEL);
+ WARN_ON_ONCE(ent != old);
+ count++;
+ }
+ }
+
+ cmd->dirty.out_nr_dirty = count;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_free:
+ kvfree(tmp);
+out_put:
+ iommufd_put_object(&hwpt->obj);
+ return rc;
+}
+
void iommufd_selftest_destroy(struct iommufd_object *obj)
{
struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj);
@@ -1000,6 +1277,7 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
cmd->add_reserved.start,
cmd->add_reserved.length);
case IOMMU_TEST_OP_MOCK_DOMAIN:
+ case IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS:
return iommufd_test_mock_domain(ucmd, cmd);
case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE:
return iommufd_test_mock_domain_replace(
@@ -1041,6 +1319,12 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
return -EINVAL;
iommufd_test_memory_limit = cmd->memory_limit.limit;
return 0;
+ case IOMMU_TEST_OP_DIRTY:
+ return iommufd_test_dirty(ucmd, cmd->id, cmd->dirty.iova,
+ cmd->dirty.length,
+ cmd->dirty.page_size,
+ u64_to_user_ptr(cmd->dirty.uptr),
+ cmd->dirty.flags);
default:
return -EOPNOTSUPP;
}
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index 6c810bf80f99..538fbf76354d 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -255,7 +255,7 @@ err_put:
static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
- struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_ioas *ioas;
int rc = 1;
@@ -264,8 +264,8 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
return PTR_ERR(ioas);
mutex_lock(&ioas->mutex);
- list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
- if (!hwpt->enforce_cache_coherency) {
+ list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt_paging->enforce_cache_coherency) {
rc = 0;
break;
}
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index c82ea032d352..68c05705200f 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,8 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_VFIO) += vfio.o
-vfio-y += vfio_main.o \
- iova_bitmap.o
+vfio-y += vfio_main.o
vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o
vfio-$(CONFIG_VFIO_GROUP) += group.o
vfio-$(CONFIG_IOMMUFD) += iommufd.o
diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig
index 7088edc4fb28..c3ced56b7787 100644
--- a/drivers/vfio/pci/mlx5/Kconfig
+++ b/drivers/vfio/pci/mlx5/Kconfig
@@ -3,6 +3,7 @@ config MLX5_VFIO_PCI
tristate "VFIO support for MLX5 PCI devices"
depends on MLX5_CORE
select VFIO_PCI_CORE
+ select IOMMUFD_DRIVER
help
This provides migration support for MLX5 devices using the VFIO
framework.
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index b6ac66c5008d..fe09a8c8af95 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -1517,6 +1517,7 @@ static struct pci_driver mlx5vf_pci_driver = {
module_pci_driver(mlx5vf_pci_driver);
+MODULE_IMPORT_NS(IOMMUFD);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig
index 6eceef7b028a..fec9b167c7b9 100644
--- a/drivers/vfio/pci/pds/Kconfig
+++ b/drivers/vfio/pci/pds/Kconfig
@@ -5,6 +5,7 @@ config PDS_VFIO_PCI
tristate "VFIO support for PDS PCI devices"
depends on PDS_CORE && PCI_IOV
select VFIO_PCI_CORE
+ select IOMMUFD_DRIVER
help
This provides generic PCI support for PDS devices using the VFIO
framework.
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
index ab4b5958e413..dd8c00c895a2 100644
--- a/drivers/vfio/pci/pds/pci_drv.c
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -204,6 +204,7 @@ static struct pci_driver pds_vfio_pci_driver = {
module_pci_driver(pds_vfio_pci_driver);
+MODULE_IMPORT_NS(IOMMUFD);
MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index e31e1952d7b8..8d4995ada74a 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1703,6 +1703,7 @@ static void __exit vfio_cleanup(void)
module_init(vfio_init);
module_exit(vfio_cleanup);
+MODULE_IMPORT_NS(IOMMUFD);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 1b7a44b35616..25142a0e2fc2 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -166,6 +166,10 @@ struct io_pgtable_ops {
struct iommu_iotlb_gather *gather);
phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
unsigned long iova);
+ int (*read_and_clear_dirty)(struct io_pgtable_ops *ops,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty);
};
/**
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c50a769d569a..8fb1b41b4d15 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -13,6 +13,7 @@
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/of.h>
+#include <linux/iova_bitmap.h>
#include <uapi/linux/iommu.h>
#define IOMMU_READ (1 << 0)
@@ -37,6 +38,7 @@ struct bus_type;
struct device;
struct iommu_domain;
struct iommu_domain_ops;
+struct iommu_dirty_ops;
struct notifier_block;
struct iommu_sva;
struct iommu_fault_event;
@@ -65,6 +67,9 @@ struct iommu_domain_geometry {
#define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */
+#define __IOMMU_DOMAIN_NESTED (1U << 6) /* User-managed address space nested
+ on a stage-2 translation */
+
#define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ
/*
* This are the possible domain-types
@@ -91,10 +96,13 @@ struct iommu_domain_geometry {
__IOMMU_DOMAIN_DMA_API | \
__IOMMU_DOMAIN_DMA_FQ)
#define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA)
+#define IOMMU_DOMAIN_NESTED (__IOMMU_DOMAIN_NESTED)
struct iommu_domain {
unsigned type;
const struct iommu_domain_ops *ops;
+ const struct iommu_dirty_ops *dirty_ops;
+
unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */
struct iommu_domain_geometry geometry;
struct iommu_dma_cookie *iova_cookie;
@@ -133,6 +141,7 @@ enum iommu_cap {
* usefully support the non-strict DMA flush queue.
*/
IOMMU_CAP_DEFERRED_FLUSH,
+ IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */
};
/* These are the possible reserved region types */
@@ -228,13 +237,109 @@ struct iommu_iotlb_gather {
};
/**
+ * struct iommu_dirty_bitmap - Dirty IOVA bitmap state
+ * @bitmap: IOVA bitmap
+ * @gather: Range information for a pending IOTLB flush
+ */
+struct iommu_dirty_bitmap {
+ struct iova_bitmap *bitmap;
+ struct iommu_iotlb_gather *gather;
+};
+
+/* Read but do not clear any dirty bits */
+#define IOMMU_DIRTY_NO_CLEAR (1 << 0)
+
+/**
+ * struct iommu_dirty_ops - domain specific dirty tracking operations
+ * @set_dirty_tracking: Enable or disable dirty tracking on the iommu domain
+ * @read_and_clear_dirty: Walk IOMMU page tables for dirtied PTEs marshalled
+ * into a bitmap, with each bit representing a page.
+ * Reads the dirty PTE bits and clears them from the
+ * IO pagetables.
+ */
+struct iommu_dirty_ops {
+ int (*set_dirty_tracking)(struct iommu_domain *domain, bool enabled);
+ int (*read_and_clear_dirty)(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty);
+};
+
+/**
+ * struct iommu_user_data - iommu driver specific user space data info
+ * @type: The data type of the user buffer
+ * @uptr: Pointer to the user buffer for copy_from_user()
+ * @len: The length of the user buffer in bytes
+ *
+ * The user buffer is laid out per a uAPI structure defined in
+ * include/uapi/linux/iommufd.h. @type, @uptr and @len should simply be
+ * copied from the corresponding fields of an iommufd core uAPI struct.
+ */
+struct iommu_user_data {
+ unsigned int type;
+ void __user *uptr;
+ size_t len;
+};
+
+/**
+ * __iommu_copy_struct_from_user - Copy iommu driver specific user space data
+ * @dst_data: Pointer to an iommu driver specific user data that is defined in
+ * include/uapi/linux/iommufd.h
+ * @src_data: Pointer to a struct iommu_user_data for user space data info
+ * @data_type: The data type of the @dst_data. Must match with @src_data.type
+ * @data_len: Length of current user data structure, i.e. sizeof(struct _dst)
+ * @min_len: Initial length of user data structure for backward compatibility.
+ * This should be offsetofend using the last member in the user data
+ * struct that was initially added to include/uapi/linux/iommufd.h
+ */
+static inline int __iommu_copy_struct_from_user(
+ void *dst_data, const struct iommu_user_data *src_data,
+ unsigned int data_type, size_t data_len, size_t min_len)
+{
+ if (WARN_ON(!dst_data || !src_data))
+ return -EINVAL;
+ if (src_data->type != data_type)
+ return -EINVAL;
+ if (src_data->len < min_len || data_len < src_data->len)
+ return -EINVAL;
+ return copy_struct_from_user(dst_data, data_len, src_data->uptr,
+ src_data->len);
+}
+
+/**
+ * iommu_copy_struct_from_user - Copy iommu driver specific user space data
+ * @kdst: Pointer to an iommu driver specific user data that is defined in
+ * include/uapi/linux/iommufd.h
+ * @user_data: Pointer to a struct iommu_user_data for user space data info
+ * @data_type: The data type of the @kdst. Must match with @user_data->type
+ * @min_last: The last member of the data structure @kdst points to in the
+ * initial version.
+ * Return: 0 on success, otherwise -error.
+ */
+#define iommu_copy_struct_from_user(kdst, user_data, data_type, min_last) \
+ __iommu_copy_struct_from_user(kdst, user_data, data_type, \
+ sizeof(*kdst), \
+ offsetofend(typeof(*kdst), min_last))
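
For illustration only (not part of this patch): inside a driver's domain_alloc_user op, the macro is used exactly the way the mock driver later in this series uses it. A minimal sketch with the selftest's struct iommu_hwpt_selftest, whose last initial member is iotlb:

	struct iommu_hwpt_selftest user_cfg;
	int rc;

	/* Fails on a type mismatch or an out-of-range user buffer length */
	rc = iommu_copy_struct_from_user(&user_cfg, user_data,
					 IOMMU_HWPT_DATA_SELFTEST, iotlb);
	if (rc)
		return ERR_PTR(rc);
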
+
+/**
* struct iommu_ops - iommu ops and capabilities
* @capable: check capability
* @hw_info: report iommu hardware information. The data buffer returned by this
* op is allocated in the iommu driver and freed by the caller after
* use. The information type is one of enum iommu_hw_info_type defined
* in include/uapi/linux/iommufd.h.
- * @domain_alloc: allocate iommu domain
+ * @domain_alloc: allocate and return an iommu domain on success, otherwise
+ * return NULL. The domain is not fully initialized until
+ * the caller, iommu_domain_alloc(), returns.
+ * @domain_alloc_user: Allocate an iommu domain corresponding to the input
+ * parameters as defined in include/uapi/linux/iommufd.h.
+ * Unlike @domain_alloc, it is called only by IOMMUFD and
+ * must fully initialize the new domain before return.
+ * Upon success, if the @user_data is valid and the @parent
+ * points to a kernel-managed domain, the new domain must be
+ * of type IOMMU_DOMAIN_NESTED; otherwise, the @parent must be
+ * NULL, the @user_data is optional, and the
+ * new domain must support __IOMMU_DOMAIN_PAGING.
+ * Upon failure, ERR_PTR must be returned.
* @probe_device: Add device to iommu driver handling
* @release_device: Remove device from iommu driver handling
* @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -267,6 +372,9 @@ struct iommu_ops {
/* Domain allocation and freeing by the iommu driver */
struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
+ struct iommu_domain *(*domain_alloc_user)(
+ struct device *dev, u32 flags, struct iommu_domain *parent,
+ const struct iommu_user_data *user_data);
struct iommu_device *(*probe_device)(struct device *dev);
void (*release_device)(struct device *dev);
@@ -632,6 +740,28 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
return gather && gather->queued;
}
+static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty,
+ struct iova_bitmap *bitmap,
+ struct iommu_iotlb_gather *gather)
+{
+ if (gather)
+ iommu_iotlb_gather_init(gather);
+
+ dirty->bitmap = bitmap;
+ dirty->gather = gather;
+}
+
+static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
+ unsigned long iova,
+ unsigned long length)
+{
+ if (dirty->bitmap)
+ iova_bitmap_set(dirty->bitmap, iova, length);
+
+ if (dirty->gather)
+ iommu_iotlb_gather_add_range(dirty->gather, iova, length);
+}
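
To illustrate how the two helpers above cooperate (illustration only, not part of this patch): a driver's read_and_clear_dirty op typically records each dirty page it visits, so the bit is set in the bitmap and the IOVA is gathered for a later IOTLB flush. A rough sketch, where EX_PAGE_SIZE and ex_pte_test_and_clear_dirty() are hypothetical stand-ins for the driver's own page-table walk:

static int ex_read_and_clear_dirty(struct iommu_domain *domain,
				   unsigned long iova, size_t size,
				   unsigned long flags,
				   struct iommu_dirty_bitmap *dirty)
{
	unsigned long end = iova + size;

	for (; iova < end; iova += EX_PAGE_SIZE) {
		/* ex_pte_test_and_clear_dirty() is hypothetical; it keeps
		 * the PTE dirty bit intact when IOMMU_DIRTY_NO_CLEAR is set.
		 */
		if (!ex_pte_test_and_clear_dirty(domain, iova,
						 flags & IOMMU_DIRTY_NO_CLEAR))
			continue;
		/* Sets the bitmap bit and gathers the IOVA for a flush */
		iommu_dirty_bitmap_record(dirty, iova, EX_PAGE_SIZE);
	}
	return 0;
}
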
+
/* PCI device grouping function */
extern struct iommu_group *pci_device_group(struct device *dev);
/* Generic device grouping function */
@@ -737,6 +867,8 @@ struct iommu_fwspec {};
struct iommu_device {};
struct iommu_fault_param {};
struct iommu_iotlb_gather {};
+struct iommu_dirty_bitmap {};
+struct iommu_dirty_ops {};
static inline bool iommu_present(const struct bus_type *bus)
{
@@ -969,6 +1101,18 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
return false;
}
+static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty,
+ struct iova_bitmap *bitmap,
+ struct iommu_iotlb_gather *gather)
+{
+}
+
+static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
+ unsigned long iova,
+ unsigned long length)
+{
+}
+
static inline void iommu_device_unregister(struct iommu_device *iommu)
{
}
diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h
index c006cf0a25f3..1c338f5e5b7a 100644
--- a/include/linux/iova_bitmap.h
+++ b/include/linux/iova_bitmap.h
@@ -7,6 +7,7 @@
#define _IOVA_BITMAP_H_
#include <linux/types.h>
+#include <linux/errno.h>
struct iova_bitmap;
@@ -14,6 +15,7 @@ typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap,
unsigned long iova, size_t length,
void *opaque);
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER)
struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
unsigned long page_size,
u64 __user *data);
@@ -22,5 +24,29 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
iova_bitmap_fn_t fn);
void iova_bitmap_set(struct iova_bitmap *bitmap,
unsigned long iova, size_t length);
+#else
+static inline struct iova_bitmap *iova_bitmap_alloc(unsigned long iova,
+ size_t length,
+ unsigned long page_size,
+ u64 __user *data)
+{
+ return NULL;
+}
+
+static inline void iova_bitmap_free(struct iova_bitmap *bitmap)
+{
+}
+
+static inline int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
+ iova_bitmap_fn_t fn)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void iova_bitmap_set(struct iova_bitmap *bitmap,
+ unsigned long iova, size_t length)
+{
+}
+#endif
#endif
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index b4ba0c0cbab6..0b2bc6252e2c 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -47,6 +47,8 @@ enum {
IOMMUFD_CMD_VFIO_IOAS,
IOMMUFD_CMD_HWPT_ALLOC,
IOMMUFD_CMD_GET_HW_INFO,
+ IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING,
+ IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP,
};
/**
@@ -348,19 +350,85 @@ struct iommu_vfio_ioas {
#define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS)
/**
+ * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation
+ * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as
+ * the parent HWPT in a nesting configuration.
+ * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is
+ * enforced on device attachment
+ */
+enum iommufd_hwpt_alloc_flags {
+ IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0,
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1,
+};
+
+/**
+ * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table
+ * entry attributes
+ * @IOMMU_VTD_S1_SRE: Supervisor request
+ * @IOMMU_VTD_S1_EAFE: Extended access enable
+ * @IOMMU_VTD_S1_WPE: Write protect enable
+ */
+enum iommu_hwpt_vtd_s1_flags {
+ IOMMU_VTD_S1_SRE = 1 << 0,
+ IOMMU_VTD_S1_EAFE = 1 << 1,
+ IOMMU_VTD_S1_WPE = 1 << 2,
+};
+
+/**
+ * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table
+ * info (IOMMU_HWPT_DATA_VTD_S1)
+ * @flags: Combination of enum iommu_hwpt_vtd_s1_flags
+ * @pgtbl_addr: The base address of the stage-1 page table.
+ * @addr_width: The address width of the stage-1 page table
+ * @__reserved: Must be 0
+ */
+struct iommu_hwpt_vtd_s1 {
+ __aligned_u64 flags;
+ __aligned_u64 pgtbl_addr;
+ __u32 addr_width;
+ __u32 __reserved;
+};
+
+/**
+ * enum iommu_hwpt_data_type - IOMMU HWPT Data Type
+ * @IOMMU_HWPT_DATA_NONE: no data
+ * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
+ */
+enum iommu_hwpt_data_type {
+ IOMMU_HWPT_DATA_NONE,
+ IOMMU_HWPT_DATA_VTD_S1,
+};
+
+/**
* struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC)
* @size: sizeof(struct iommu_hwpt_alloc)
- * @flags: Must be 0
+ * @flags: Combination of enum iommufd_hwpt_alloc_flags
* @dev_id: The device to allocate this HWPT for
- * @pt_id: The IOAS to connect this HWPT to
+ * @pt_id: The IOAS or HWPT to connect this HWPT to
* @out_hwpt_id: The ID of the new HWPT
* @__reserved: Must be 0
+ * @data_type: One of enum iommu_hwpt_data_type
+ * @data_len: Length of the type specific data
+ * @data_uptr: User pointer to the type specific data
*
* Explicitly allocate a hardware page table object. This is the same object
* type that is returned by iommufd_device_attach() and represents the
* underlying iommu driver's iommu_domain kernel object.
*
- * A HWPT will be created with the IOVA mappings from the given IOAS.
+ * A kernel-managed HWPT will be created with the mappings from the given
+ * IOAS via the @pt_id. The @data_type for this allocation must be set to
+ * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a
+ * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags.
+ *
+ * A user-managed nested HWPT will be created from a given parent HWPT via
+ * @pt_id, where the parent HWPT must have been allocated previously via
+ * this same ioctl from an IOAS. In this case, the @data_type
+ * must be set to a pre-defined type corresponding to an I/O page table
+ * type supported by the underlying IOMMU hardware.
+ *
+ * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and
+ * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr
+ * must be given.
*/
struct iommu_hwpt_alloc {
__u32 size;
@@ -369,13 +437,26 @@ struct iommu_hwpt_alloc {
__u32 pt_id;
__u32 out_hwpt_id;
__u32 __reserved;
+ __u32 data_type;
+ __u32 data_len;
+ __aligned_u64 data_uptr;
};
#define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC)
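
As a usage illustration (not part of this patch), the two-step nesting allocation described above could look like this from user space. A minimal sketch assuming fd is an open iommufd and dev_id/ioas_id were obtained earlier; the VT-d stage-1 payload values are placeholders that would really come from the guest:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int alloc_nested(int fd, __u32 dev_id, __u32 ioas_id,
			__u32 *parent_id, __u32 *nested_id)
{
	/* Placeholder stage-1 payload */
	struct iommu_hwpt_vtd_s1 vtd = { .addr_width = 48 };
	struct iommu_hwpt_alloc cmd = {
		.size = sizeof(cmd),
		.flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
		.dev_id = dev_id,
		.pt_id = ioas_id,
		.data_type = IOMMU_HWPT_DATA_NONE,
	};

	/* Step 1: kernel-managed parent HWPT from the IOAS */
	if (ioctl(fd, IOMMU_HWPT_ALLOC, &cmd))
		return -1;
	*parent_id = cmd.out_hwpt_id;

	/* Step 2: user-managed nested HWPT on top of the parent */
	memset(&cmd, 0, sizeof(cmd));
	cmd.size = sizeof(cmd);
	cmd.dev_id = dev_id;
	cmd.pt_id = *parent_id;
	cmd.data_type = IOMMU_HWPT_DATA_VTD_S1;
	cmd.data_len = sizeof(vtd);
	cmd.data_uptr = (__u64)(uintptr_t)&vtd;
	if (ioctl(fd, IOMMU_HWPT_ALLOC, &cmd))
		return -1;
	*nested_id = cmd.out_hwpt_id;
	return 0;
}
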
/**
+ * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info
+ * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings
+ * on a nested_parent domain.
+ * https://www.intel.com/content/www/us/en/content-details/772415/content-details.html
+ */
+enum iommu_hw_info_vtd_flags {
+ IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0,
+};
+
+/**
* struct iommu_hw_info_vtd - Intel VT-d hardware information
*
- * @flags: Must be 0
+ * @flags: Combination of enum iommu_hw_info_vtd_flags
* @__reserved: Must be 0
*
* @cap_reg: Value of Intel VT-d capability register defined in VT-d spec
@@ -405,6 +486,20 @@ enum iommu_hw_info_type {
};
/**
+ * enum iommufd_hw_capabilities
+ * @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking
+ * If available, it means the following APIs
+ * are supported:
+ *
+ * IOMMU_HWPT_GET_DIRTY_BITMAP
+ * IOMMU_HWPT_SET_DIRTY_TRACKING
+ *
+ */
+enum iommufd_hw_capabilities {
+ IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0,
+};
+
+/**
* struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO)
* @size: sizeof(struct iommu_hw_info)
* @flags: Must be 0
@@ -415,6 +510,8 @@ enum iommu_hw_info_type {
* the iommu type specific hardware information data
* @out_data_type: Output the iommu hardware info type as defined in the enum
* iommu_hw_info_type.
+ * @out_capabilities: Output the generic iommu capability info type as defined
+ * in the enum iommu_hw_capabilities.
* @__reserved: Must be 0
*
* Query an iommu type specific hardware information data from an iommu behind
@@ -439,6 +536,81 @@ struct iommu_hw_info {
__aligned_u64 data_uptr;
__u32 out_data_type;
__u32 __reserved;
+ __aligned_u64 out_capabilities;
};
#define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO)
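
A user can check @out_capabilities before attempting the two dirty-tracking ioctls added below. A minimal sketch (illustration only, not part of this patch), assuming fd and dev_id:

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static bool supports_dirty_tracking(int fd, __u32 dev_id)
{
	struct iommu_hw_info cmd = {
		.size = sizeof(cmd),
		.dev_id = dev_id,
		/* .data_len left 0: skip driver data, only fetch capabilities */
	};

	if (ioctl(fd, IOMMU_GET_HW_INFO, &cmd))
		return false;
	return cmd.out_capabilities & IOMMU_HW_CAP_DIRTY_TRACKING;
}
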
+
+/**
+ * enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty
+ * tracking
+ * @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking
+ */
+enum iommufd_hwpt_set_dirty_tracking_flags {
+ IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1,
+};
+
+/**
+ * struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING)
+ * @size: sizeof(struct iommu_hwpt_set_dirty_tracking)
+ * @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags
+ * @hwpt_id: HW pagetable ID that represents the IOMMU domain
+ * @__reserved: Must be 0
+ *
+ * Toggle dirty tracking on an HW pagetable.
+ */
+struct iommu_hwpt_set_dirty_tracking {
+ __u32 size;
+ __u32 flags;
+ __u32 hwpt_id;
+ __u32 __reserved;
+};
+#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \
+ IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING)
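
For example (illustration only, not part of this patch), a VMM would enable tracking at the start of migration pre-copy and disable it afterwards. A minimal sketch assuming fd is an open iommufd and hwpt_id names a paging HWPT allocated with IOMMU_HWPT_ALLOC_DIRTY_TRACKING:

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int hwpt_set_dirty(int fd, __u32 hwpt_id, bool enable)
{
	struct iommu_hwpt_set_dirty_tracking cmd = {
		.size = sizeof(cmd),
		.flags = enable ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0,
		.hwpt_id = hwpt_id,
	};

	/* Enabling first clears all PTE dirty bits for a clean snapshot */
	return ioctl(fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &cmd);
}
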
+
+/**
+ * enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits
+ * @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing
+ * any dirty bits metadata. This flag
+ * can be passed when the caller
+ * expects the next operation on the
+ * same IOVA range to be an unmap.
+ *
+ */
+enum iommufd_hwpt_get_dirty_bitmap_flags {
+ IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1,
+};
+
+/**
+ * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP)
+ * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap)
+ * @hwpt_id: HW pagetable ID that represents the IOMMU domain
+ * @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags
+ * @__reserved: Must be 0
+ * @iova: base IOVA of the bitmap first bit
+ * @length: IOVA range size
+ * @page_size: page size granularity of each bit in the bitmap
+ * @data: bitmap in which to set the dirty bits. Each bit
+ * represents a page_size-sized range, offset from @iova.
+ *
+ * Checking a given IOVA is dirty:
+ *
+ * data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64))
+ *
+ * Walk the IOMMU pagetables for a given IOVA range to return a bitmap
+ * with the dirty IOVAs. In doing so it will also by default clear any
+ * dirty bit metadata set in the IOPTE.
+ */
+struct iommu_hwpt_get_dirty_bitmap {
+ __u32 size;
+ __u32 hwpt_id;
+ __u32 flags;
+ __u32 __reserved;
+ __aligned_u64 iova;
+ __aligned_u64 length;
+ __aligned_u64 page_size;
+ __aligned_u64 data;
+};
+#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
+ IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
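
Putting the layout rules above together, a fetch-and-test helper might look like this; a sketch (illustration only, not part of this patch) assuming fd and hwpt_id, with the bit index taken relative to the requested base @iova:

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Returns 1 if probe_iova was dirtied, 0 if clean, -1 on error */
static int iova_is_dirty(int fd, __u32 hwpt_id, __u64 iova, __u64 length,
			 __u64 page_size, __u64 probe_iova)
{
	__u64 nbits = length / page_size;
	__u64 *data = calloc((nbits + 63) / 64, sizeof(*data));
	struct iommu_hwpt_get_dirty_bitmap cmd = {
		.size = sizeof(cmd),
		.hwpt_id = hwpt_id,
		.iova = iova,
		.length = length,
		.page_size = page_size,
		.data = (__u64)(uintptr_t)data,
	};
	int ret = -1;

	if (data && !ioctl(fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &cmd)) {
		__u64 bit = (probe_iova - iova) / page_size;

		ret = !!(data[bit / 64] & (1ULL << (bit % 64)));
	}
	free(data);
	return ret;
}
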
+
#endif
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 33d08600be13..6ed328c863c4 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -86,12 +86,13 @@ TEST_F(iommufd, cmd_fail)
TEST_F(iommufd, cmd_length)
{
-#define TEST_LENGTH(_struct, _ioctl) \
+#define TEST_LENGTH(_struct, _ioctl, _last) \
{ \
+ size_t min_size = offsetofend(struct _struct, _last); \
struct { \
struct _struct cmd; \
uint8_t extra; \
- } cmd = { .cmd = { .size = sizeof(struct _struct) - 1 }, \
+ } cmd = { .cmd = { .size = min_size - 1 }, \
.extra = UINT8_MAX }; \
int old_errno; \
int rc; \
@@ -112,16 +113,19 @@ TEST_F(iommufd, cmd_length)
} \
}
- TEST_LENGTH(iommu_destroy, IOMMU_DESTROY);
- TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO);
- TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC);
- TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES);
- TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS);
- TEST_LENGTH(iommu_ioas_map, IOMMU_IOAS_MAP);
- TEST_LENGTH(iommu_ioas_copy, IOMMU_IOAS_COPY);
- TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP);
- TEST_LENGTH(iommu_option, IOMMU_OPTION);
- TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS);
+ TEST_LENGTH(iommu_destroy, IOMMU_DESTROY, id);
+ TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO, __reserved);
+ TEST_LENGTH(iommu_hwpt_alloc, IOMMU_HWPT_ALLOC, __reserved);
+ TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC, out_ioas_id);
+ TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES,
+ out_iova_alignment);
+ TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS,
+ allowed_iovas);
+ TEST_LENGTH(iommu_ioas_map, IOMMU_IOAS_MAP, iova);
+ TEST_LENGTH(iommu_ioas_copy, IOMMU_IOAS_COPY, src_iova);
+ TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP, length);
+ TEST_LENGTH(iommu_option, IOMMU_OPTION, val64);
+ TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved);
#undef TEST_LENGTH
}
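
The TEST_LENGTH() change above keys the minimum accepted command size off
offsetofend() rather than sizeof(), so trailing struct extensions stay
backward compatible. A minimal sketch of why, using a hypothetical struct
(offsetofend() shown as the kernel defines it):

	#include <stddef.h>	/* offsetof */

	#define offsetofend(TYPE, MEMBER) \
		(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

	struct example {
		unsigned int size;
		unsigned int id;
		unsigned long long later_addition;	/* appended later */
	};

	/* offsetofend(struct example, id) == 8: a legacy caller passing
	 * size == 8 keeps working after later_addition was appended; the
	 * test only needs size == min_size - 1 to be rejected. */
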
@@ -260,6 +264,121 @@ TEST_F(iommufd_ioas, ioas_destroy)
}
}
+TEST_F(iommufd_ioas, alloc_hwpt_nested)
+{
+ const uint32_t min_data_len =
+ offsetofend(struct iommu_hwpt_selftest, iotlb);
+ struct iommu_hwpt_selftest data = {
+ .iotlb = IOMMU_TEST_IOTLB_DEFAULT,
+ };
+ uint32_t nested_hwpt_id[2] = {};
+ uint32_t parent_hwpt_id = 0;
+ uint32_t parent_hwpt_id_not_work = 0;
+ uint32_t test_hwpt_id = 0;
+
+ if (self->device_id) {
+ /* Negative tests */
+ test_err_hwpt_alloc(ENOENT, self->ioas_id, self->device_id, 0,
+ &test_hwpt_id);
+ test_err_hwpt_alloc(EINVAL, self->device_id, self->device_id, 0,
+ &test_hwpt_id);
+
+ test_cmd_hwpt_alloc(self->device_id, self->ioas_id,
+ IOMMU_HWPT_ALLOC_NEST_PARENT,
+ &parent_hwpt_id);
+
+ test_cmd_hwpt_alloc(self->device_id, self->ioas_id, 0,
+ &parent_hwpt_id_not_work);
+
+ /* Negative nested tests */
+ test_err_hwpt_alloc_nested(EINVAL, self->device_id,
+ parent_hwpt_id, 0,
+ &nested_hwpt_id[0],
+ IOMMU_HWPT_DATA_NONE, &data,
+ sizeof(data));
+ test_err_hwpt_alloc_nested(EOPNOTSUPP, self->device_id,
+ parent_hwpt_id, 0,
+ &nested_hwpt_id[0],
+ IOMMU_HWPT_DATA_SELFTEST + 1, &data,
+ sizeof(data));
+ test_err_hwpt_alloc_nested(EINVAL, self->device_id,
+ parent_hwpt_id, 0,
+ &nested_hwpt_id[0],
+ IOMMU_HWPT_DATA_SELFTEST, &data,
+ min_data_len - 1);
+ test_err_hwpt_alloc_nested(EFAULT, self->device_id,
+ parent_hwpt_id, 0,
+ &nested_hwpt_id[0],
+ IOMMU_HWPT_DATA_SELFTEST, NULL,
+ sizeof(data));
+ test_err_hwpt_alloc_nested(
+ EOPNOTSUPP, self->device_id, parent_hwpt_id,
+ IOMMU_HWPT_ALLOC_NEST_PARENT, &nested_hwpt_id[0],
+ IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data));
+ test_err_hwpt_alloc_nested(EINVAL, self->device_id,
+ parent_hwpt_id_not_work, 0,
+ &nested_hwpt_id[0],
+ IOMMU_HWPT_DATA_SELFTEST, &data,
+ sizeof(data));
+
+ /* Allocate two nested hwpts sharing one common parent hwpt */
+ test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0,
+ &nested_hwpt_id[0],
+ IOMMU_HWPT_DATA_SELFTEST, &data,
+ sizeof(data));
+ test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0,
+ &nested_hwpt_id[1],
+ IOMMU_HWPT_DATA_SELFTEST, &data,
+ sizeof(data));
+
+ /* Negative test: a nested hwpt on top of a nested hwpt */
+ test_err_hwpt_alloc_nested(EINVAL, self->device_id,
+ nested_hwpt_id[0], 0, &test_hwpt_id,
+ IOMMU_HWPT_DATA_SELFTEST, &data,
+ sizeof(data));
+ /* Negative test: parent hwpt now cannot be freed */
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, parent_hwpt_id));
+
+ /* Attach device to nested_hwpt_id[0] that then will be busy */
+ test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[0]);
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, nested_hwpt_id[0]));
+
+ /* Switch from nested_hwpt_id[0] to nested_hwpt_id[1] */
+ test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[1]);
+ EXPECT_ERRNO(EBUSY,
+ _test_ioctl_destroy(self->fd, nested_hwpt_id[1]));
+ test_ioctl_destroy(nested_hwpt_id[0]);
+
+ /* Detach from nested_hwpt_id[1] and destroy it */
+ test_cmd_mock_domain_replace(self->stdev_id, parent_hwpt_id);
+ test_ioctl_destroy(nested_hwpt_id[1]);
+
+ /* Detach from the parent hw_pagetable and destroy it */
+ test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id);
+ test_ioctl_destroy(parent_hwpt_id);
+ test_ioctl_destroy(parent_hwpt_id_not_work);
+ } else {
+ test_err_hwpt_alloc(ENOENT, self->device_id, self->ioas_id, 0,
+ &parent_hwpt_id);
+ test_err_hwpt_alloc_nested(ENOENT, self->device_id,
+ parent_hwpt_id, 0,
+ &nested_hwpt_id[0],
+ IOMMU_HWPT_DATA_SELFTEST, &data,
+ sizeof(data));
+ test_err_hwpt_alloc_nested(ENOENT, self->device_id,
+ parent_hwpt_id, 0,
+ &nested_hwpt_id[1],
+ IOMMU_HWPT_DATA_SELFTEST, &data,
+ sizeof(data));
+ test_err_mock_domain_replace(ENOENT, self->stdev_id,
+ nested_hwpt_id[0]);
+ test_err_mock_domain_replace(ENOENT, self->stdev_id,
+ nested_hwpt_id[1]);
+ }
+}
+
TEST_F(iommufd_ioas, hwpt_attach)
{
/* Create a device attached directly to a hwpt */
@@ -1404,16 +1523,242 @@ TEST_F(iommufd_mock_domain, alloc_hwpt)
int i;
for (i = 0; i != variant->mock_domains; i++) {
+ uint32_t hwpt_id[2];
uint32_t stddev_id;
- uint32_t hwpt_id;
- test_cmd_hwpt_alloc(self->idev_ids[0], self->ioas_id, &hwpt_id);
- test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+ test_err_hwpt_alloc(EOPNOTSUPP,
+ self->idev_ids[i], self->ioas_id,
+ ~IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id[0]);
+ test_cmd_hwpt_alloc(self->idev_ids[i], self->ioas_id,
+ 0, &hwpt_id[0]);
+ test_cmd_hwpt_alloc(self->idev_ids[i], self->ioas_id,
+ IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id[1]);
+
+ /* Do a hw_pagetable rotation test */
+ test_cmd_mock_domain_replace(self->stdev_ids[i], hwpt_id[0]);
+ EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, hwpt_id[0]));
+ test_cmd_mock_domain_replace(self->stdev_ids[i], hwpt_id[1]);
+ EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, hwpt_id[1]));
+ test_cmd_mock_domain_replace(self->stdev_ids[i], self->ioas_id);
+ test_ioctl_destroy(hwpt_id[1]);
+
+ test_cmd_mock_domain(hwpt_id[0], &stddev_id, NULL, NULL);
test_ioctl_destroy(stddev_id);
- test_ioctl_destroy(hwpt_id);
+ test_ioctl_destroy(hwpt_id[0]);
}
}
+FIXTURE(iommufd_dirty_tracking)
+{
+ int fd;
+ uint32_t ioas_id;
+ uint32_t hwpt_id;
+ uint32_t stdev_id;
+ uint32_t idev_id;
+ unsigned long page_size;
+ unsigned long bitmap_size;
+ void *bitmap;
+ void *buffer;
+};
+
+FIXTURE_VARIANT(iommufd_dirty_tracking)
+{
+ unsigned long buffer_size;
+};
+
+FIXTURE_SETUP(iommufd_dirty_tracking)
+{
+ void *vrc;
+ int rc;
+
+ self->fd = open("/dev/iommu", O_RDWR);
+ ASSERT_NE(-1, self->fd);
+
+ rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
+ if (rc || !self->buffer) {
+ SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
+ variant->buffer_size, rc);
+ }
+
+ assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
+ vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ assert(vrc == self->buffer);
+
+ self->page_size = MOCK_PAGE_SIZE;
+ self->bitmap_size =
+ variant->buffer_size / self->page_size / BITS_PER_BYTE;
+
+ /* Provision with an extra (MOCK_PAGE_SIZE) for the unaligned case */
+ rc = posix_memalign(&self->bitmap, PAGE_SIZE,
+ self->bitmap_size + MOCK_PAGE_SIZE);
+ assert(!rc);
+ assert(self->bitmap);
+ assert((uintptr_t)self->bitmap % PAGE_SIZE == 0);
+
+ test_ioctl_ioas_alloc(&self->ioas_id);
+ test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id,
+ &self->idev_id);
+}
+
+FIXTURE_TEARDOWN(iommufd_dirty_tracking)
+{
+ munmap(self->buffer, variant->buffer_size);
+ free(self->bitmap);
+ teardown_iommufd(self->fd, _metadata);
+}
+
+FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128k)
+{
+ /* one u32 index bitmap */
+ .buffer_size = 128UL * 1024UL,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256k)
+{
+ /* one u64 index bitmap */
+ .buffer_size = 256UL * 1024UL,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty640k)
+{
+ /* two u64 index and trailing end bitmap */
+ .buffer_size = 640UL * 1024UL,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M)
+{
+ /* 4K bitmap (128M IOVA range) */
+ .buffer_size = 128UL * 1024UL * 1024UL,
+};
+
+FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M)
+{
+ /* 8K bitmap (256M IOVA range) */
+ .buffer_size = 256UL * 1024UL * 1024UL,
+};
+
+TEST_F(iommufd_dirty_tracking, enforce_dirty)
+{
+ uint32_t ioas_id, stddev_id, idev_id;
+ uint32_t hwpt_id, _hwpt_id;
+ uint32_t dev_flags;
+
+ /* Regular case */
+ dev_flags = MOCK_FLAGS_DEVICE_NO_DIRTY;
+ test_cmd_hwpt_alloc(self->idev_id, self->ioas_id,
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+ test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+ test_err_mock_domain_flags(EINVAL, hwpt_id, dev_flags, &stddev_id,
+ NULL);
+ test_ioctl_destroy(stddev_id);
+ test_ioctl_destroy(hwpt_id);
+
+ /* IOMMU device does not support dirty tracking */
+ test_ioctl_ioas_alloc(&ioas_id);
+ test_cmd_mock_domain_flags(ioas_id, dev_flags, &stddev_id, &_hwpt_id,
+ &idev_id);
+ test_err_hwpt_alloc(EOPNOTSUPP, idev_id, ioas_id,
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+ test_ioctl_destroy(stddev_id);
+}
+
+TEST_F(iommufd_dirty_tracking, set_dirty_tracking)
+{
+ uint32_t stddev_id;
+ uint32_t hwpt_id;
+
+ test_cmd_hwpt_alloc(self->idev_id, self->ioas_id,
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+ test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+ test_cmd_set_dirty_tracking(hwpt_id, true);
+ test_cmd_set_dirty_tracking(hwpt_id, false);
+
+ test_ioctl_destroy(stddev_id);
+ test_ioctl_destroy(hwpt_id);
+}
+
+TEST_F(iommufd_dirty_tracking, device_dirty_capability)
+{
+ uint32_t caps = 0;
+ uint32_t stddev_id;
+ uint32_t hwpt_id;
+
+ test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, 0, &hwpt_id);
+ test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+ test_cmd_get_hw_capabilities(self->idev_id, caps,
+ IOMMU_HW_CAP_DIRTY_TRACKING);
+ ASSERT_EQ(IOMMU_HW_CAP_DIRTY_TRACKING,
+ caps & IOMMU_HW_CAP_DIRTY_TRACKING);
+
+ test_ioctl_destroy(stddev_id);
+ test_ioctl_destroy(hwpt_id);
+}
+
+TEST_F(iommufd_dirty_tracking, get_dirty_bitmap)
+{
+ uint32_t stddev_id;
+ uint32_t hwpt_id;
+ uint32_t ioas_id;
+
+ test_ioctl_ioas_alloc(&ioas_id);
+ test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer,
+ variant->buffer_size, MOCK_APERTURE_START);
+
+ test_cmd_hwpt_alloc(self->idev_id, ioas_id,
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+ test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+
+ test_cmd_set_dirty_tracking(hwpt_id, true);
+
+ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size,
+ MOCK_APERTURE_START, self->page_size,
+ self->bitmap, self->bitmap_size, 0, _metadata);
+
+ /* PAGE_SIZE unaligned bitmap */
+ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size,
+ MOCK_APERTURE_START, self->page_size,
+ self->bitmap + MOCK_PAGE_SIZE,
+ self->bitmap_size, 0, _metadata);
+
+ test_ioctl_destroy(stddev_id);
+ test_ioctl_destroy(hwpt_id);
+}
+
+TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear)
+{
+ uint32_t stddev_id;
+ uint32_t hwpt_id;
+ uint32_t ioas_id;
+
+ test_ioctl_ioas_alloc(&ioas_id);
+ test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer,
+ variant->buffer_size, MOCK_APERTURE_START);
+
+ test_cmd_hwpt_alloc(self->idev_id, ioas_id,
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+ test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL);
+
+ test_cmd_set_dirty_tracking(hwpt_id, true);
+
+ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size,
+ MOCK_APERTURE_START, self->page_size,
+ self->bitmap, self->bitmap_size,
+ IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR,
+ _metadata);
+
+ /* Unaligned bitmap */
+ test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size,
+ MOCK_APERTURE_START, self->page_size,
+ self->bitmap + MOCK_PAGE_SIZE,
+ self->bitmap_size,
+ IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR,
+ _metadata);
+
+ test_ioctl_destroy(stddev_id);
+ test_ioctl_destroy(hwpt_id);
+}
+
/* VFIO compatibility IOCTLs */
TEST_F(iommufd, simple_ioctls)
@@ -1729,7 +2074,7 @@ TEST_F(vfio_compat_mock_domain, map)
ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size);
- /* UNMAP_FLAG_ALL requres 0 iova/size */
+ /* UNMAP_FLAG_ALL requires 0 iova/size */
ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd));
unmap_cmd.flags = VFIO_DMA_UNMAP_FLAG_ALL;
EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd));
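
Stripped of its wrappers, the alloc_hwpt_nested test earlier in this file
follows the two-step flow a VMM would use. A minimal sketch, assuming
iommufd/dev_id/ioas_id from earlier binding and reusing the mock driver's
selftest data (alloc_nested() is a hypothetical helper; a real driver takes
its own data_type and per-driver struct, and errors are ignored here):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/iommufd.h>
	#include "iommufd_test.h"	/* selftest-only data_type/struct */

	static uint32_t alloc_nested(int iommufd, uint32_t dev_id,
				     uint32_t ioas_id)
	{
		/* Stage-2 (parent) domain, backed by an IOAS */
		struct iommu_hwpt_alloc parent = {
			.size = sizeof(parent),
			.flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
			.dev_id = dev_id,
			.pt_id = ioas_id,
		};
		/* Stage-1 (nested) domain, parented by the stage-2 domain */
		struct iommu_hwpt_selftest user_data = {
			.iotlb = IOMMU_TEST_IOTLB_DEFAULT,
		};
		struct iommu_hwpt_alloc nested = {
			.size = sizeof(nested),
			.dev_id = dev_id,
			.data_type = IOMMU_HWPT_DATA_SELFTEST,
			.data_len = sizeof(user_data),
			.data_uptr = (uintptr_t)&user_data,
		};

		ioctl(iommufd, IOMMU_HWPT_ALLOC, &parent);
		nested.pt_id = parent.out_hwpt_id;
		ioctl(iommufd, IOMMU_HWPT_ALLOC, &nested);
		return nested.out_hwpt_id;
	}
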
diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c
index a220ca2a689d..f590417cd67a 100644
--- a/tools/testing/selftests/iommu/iommufd_fail_nth.c
+++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c
@@ -105,7 +105,7 @@ static bool fail_nth_next(struct __test_metadata *_metadata,
/*
* This is just an arbitrary limit based on the current kernel
- * situation. Changes in the kernel can dramtically change the number of
+ * situation. Changes in the kernel can dramatically change the number of
* required fault injection sites, so if this hits it doesn't
* necessarily mean a test failure, just that the limit has to be made
* bigger.
@@ -612,10 +612,11 @@ TEST_FAIL_NTH(basic_fail_nth, device)
&idev_id))
return -1;
- if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info)))
+ if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL))
return -1;
- if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, &hwpt_id))
+ if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id,
+ IOMMU_HWPT_DATA_NONE, 0, 0))
return -1;
if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL))
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index e0753d03ecaa..050e9751321c 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -16,6 +16,25 @@
/* Hack to make assertions more readable */
#define _IOMMU_TEST_CMD(x) IOMMU_TEST_CMD
+/* Imported from include/asm-generic/bitops/generic-non-atomic.h */
+#define BITS_PER_BYTE 8
+#define BITS_PER_LONG __BITS_PER_LONG
+#define BIT_MASK(nr) (1UL << ((nr) % __BITS_PER_LONG))
+#define BIT_WORD(nr) ((nr) / __BITS_PER_LONG)
+
+static inline void set_bit(unsigned int nr, unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+ *p |= mask;
+}
+
+static inline bool test_bit(unsigned int nr, unsigned long *addr)
+{
+ return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG - 1)));
+}
+
static void *buffer;
static unsigned long BUFFER_SIZE;
@@ -74,6 +93,38 @@ static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id,
EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \
stdev_id, hwpt_id, NULL))
+static int _test_cmd_mock_domain_flags(int fd, unsigned int ioas_id,
+ __u32 stdev_flags, __u32 *stdev_id,
+ __u32 *hwpt_id, __u32 *idev_id)
+{
+ struct iommu_test_cmd cmd = {
+ .size = sizeof(cmd),
+ .op = IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS,
+ .id = ioas_id,
+ .mock_domain_flags = { .dev_flags = stdev_flags },
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_TEST_CMD, &cmd);
+ if (ret)
+ return ret;
+ if (stdev_id)
+ *stdev_id = cmd.mock_domain_flags.out_stdev_id;
+ assert(cmd.id != 0);
+ if (hwpt_id)
+ *hwpt_id = cmd.mock_domain_flags.out_hwpt_id;
+ if (idev_id)
+ *idev_id = cmd.mock_domain_flags.out_idev_id;
+ return 0;
+}
+#define test_cmd_mock_domain_flags(ioas_id, flags, stdev_id, hwpt_id, idev_id) \
+ ASSERT_EQ(0, _test_cmd_mock_domain_flags(self->fd, ioas_id, flags, \
+ stdev_id, hwpt_id, idev_id))
+#define test_err_mock_domain_flags(_errno, ioas_id, flags, stdev_id, hwpt_id) \
+ EXPECT_ERRNO(_errno, \
+ _test_cmd_mock_domain_flags(self->fd, ioas_id, flags, \
+ stdev_id, hwpt_id, NULL))
+
static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id,
__u32 *hwpt_id)
{
@@ -103,12 +154,17 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id,
pt_id, NULL))
static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id,
- __u32 *hwpt_id)
+ __u32 flags, __u32 *hwpt_id, __u32 data_type,
+ void *data, size_t data_len)
{
struct iommu_hwpt_alloc cmd = {
.size = sizeof(cmd),
+ .flags = flags,
.dev_id = device_id,
.pt_id = pt_id,
+ .data_type = data_type,
+ .data_len = data_len,
+ .data_uptr = (uint64_t)data,
};
int ret;
@@ -120,8 +176,24 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id,
return 0;
}
-#define test_cmd_hwpt_alloc(device_id, pt_id, hwpt_id) \
- ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, hwpt_id))
+#define test_cmd_hwpt_alloc(device_id, pt_id, flags, hwpt_id) \
+ ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \
+ hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \
+ 0))
+#define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \
+ EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \
+ self->fd, device_id, pt_id, flags, \
+ hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, 0))
+
+#define test_cmd_hwpt_alloc_nested(device_id, pt_id, flags, hwpt_id, \
+ data_type, data, data_len) \
+ ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \
+ hwpt_id, data_type, data, data_len))
+#define test_err_hwpt_alloc_nested(_errno, device_id, pt_id, flags, hwpt_id, \
+ data_type, data, data_len) \
+ EXPECT_ERRNO(_errno, \
+ _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \
+ hwpt_id, data_type, data, data_len))
static int _test_cmd_access_replace_ioas(int fd, __u32 access_id,
unsigned int ioas_id)
@@ -142,6 +214,126 @@ static int _test_cmd_access_replace_ioas(int fd, __u32 access_id,
#define test_cmd_access_replace_ioas(access_id, ioas_id) \
ASSERT_EQ(0, _test_cmd_access_replace_ioas(self->fd, access_id, ioas_id))
+static int _test_cmd_set_dirty_tracking(int fd, __u32 hwpt_id, bool enabled)
+{
+ struct iommu_hwpt_set_dirty_tracking cmd = {
+ .size = sizeof(cmd),
+ .flags = enabled ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0,
+ .hwpt_id = hwpt_id,
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &cmd);
+ if (ret)
+ return -errno;
+ return 0;
+}
+#define test_cmd_set_dirty_tracking(hwpt_id, enabled) \
+ ASSERT_EQ(0, _test_cmd_set_dirty_tracking(self->fd, hwpt_id, enabled))
+
+static int _test_cmd_get_dirty_bitmap(int fd, __u32 hwpt_id, size_t length,
+ __u64 iova, size_t page_size,
+ __u64 *bitmap, __u32 flags)
+{
+ struct iommu_hwpt_get_dirty_bitmap cmd = {
+ .size = sizeof(cmd),
+ .hwpt_id = hwpt_id,
+ .flags = flags,
+ .iova = iova,
+ .length = length,
+ .page_size = page_size,
+ .data = (uintptr_t)bitmap,
+ };
+ int ret;
+
+ ret = ioctl(fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &cmd);
+ if (ret)
+ return ret;
+ return 0;
+}
+
+#define test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, \
+ bitmap, flags) \
+ ASSERT_EQ(0, _test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, \
+ page_size, bitmap, flags))
+
+static int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, size_t length,
+ __u64 iova, size_t page_size,
+ __u64 *bitmap, __u64 *dirty)
+{
+ struct iommu_test_cmd cmd = {
+ .size = sizeof(cmd),
+ .op = IOMMU_TEST_OP_DIRTY,
+ .id = hwpt_id,
+ .dirty = {
+ .iova = iova,
+ .length = length,
+ .page_size = page_size,
+ .uptr = (uintptr_t)bitmap,
+ }
+ };
+ int ret;
+
+ ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_DIRTY), &cmd);
+ if (ret)
+ return -ret;
+ if (dirty)
+ *dirty = cmd.dirty.out_nr_dirty;
+ return 0;
+}
+
+#define test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size, \
+ bitmap, nr) \
+ ASSERT_EQ(0, \
+ _test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, \
+ page_size, bitmap, nr))
+
+static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length,
+ __u64 iova, size_t page_size, __u64 *bitmap,
+ __u64 bitmap_size, __u32 flags,
+ struct __test_metadata *_metadata)
+{
+ unsigned long i, count, nbits = bitmap_size * BITS_PER_BYTE;
+ unsigned long nr = nbits / 2;
+ __u64 out_dirty = 0;
+
+ /* Mark all even bits as dirty in the mock domain */
+ for (count = 0, i = 0; i < nbits; count += !(i % 2), i++)
+ if (!(i % 2))
+ set_bit(i, (unsigned long *)bitmap);
+ ASSERT_EQ(nr, count);
+
+ test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size,
+ bitmap, &out_dirty);
+ ASSERT_EQ(nr, out_dirty);
+
+ /* Expect all even bits as dirty in the user bitmap */
+ memset(bitmap, 0, bitmap_size);
+ test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap,
+ flags);
+ for (count = 0, i = 0; i < nbits; count += !(i % 2), i++)
+ ASSERT_EQ(!(i % 2), test_bit(i, (unsigned long *)bitmap));
+ ASSERT_EQ(count, out_dirty);
+
+ memset(bitmap, 0, bitmap_size);
+ test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap,
+ flags);
+
+ /* It was read already -- expect all zeroes */
+ for (i = 0; i < nbits; i++) {
+ ASSERT_EQ(!(i % 2) && (flags &
+ IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR),
+ test_bit(i, (unsigned long *)bitmap));
+ }
+
+ return 0;
+}
+#define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, bitmap, \
+ bitmap_size, flags, _metadata) \
+ ASSERT_EQ(0, _test_mock_dirty_bitmaps(self->fd, hwpt_id, length, iova, \
+ page_size, bitmap, bitmap_size, \
+ flags, _metadata))
+
static int _test_cmd_create_access(int fd, unsigned int ioas_id,
__u32 *access_id, unsigned int flags)
{
@@ -266,6 +458,17 @@ static int _test_ioctl_ioas_map(int fd, unsigned int ioas_id, void *buffer,
IOMMU_IOAS_MAP_READABLE)); \
})
+#define test_ioctl_ioas_map_fixed_id(ioas_id, buffer, length, iova) \
+ ({ \
+ __u64 __iova = iova; \
+ ASSERT_EQ(0, \
+ _test_ioctl_ioas_map( \
+ self->fd, ioas_id, buffer, length, &__iova, \
+ IOMMU_IOAS_MAP_FIXED_IOVA | \
+ IOMMU_IOAS_MAP_WRITEABLE | \
+ IOMMU_IOAS_MAP_READABLE)); \
+ })
+
#define test_err_ioctl_ioas_map_fixed(_errno, buffer, length, iova) \
({ \
__u64 __iova = iova; \
@@ -354,8 +557,8 @@ static void teardown_iommufd(int fd, struct __test_metadata *_metadata)
#endif
/* @data can be NULL */
-static int _test_cmd_get_hw_info(int fd, __u32 device_id,
- void *data, size_t data_len)
+static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data,
+ size_t data_len, uint32_t *capabilities)
{
struct iommu_test_hw_info *info = (struct iommu_test_hw_info *)data;
struct iommu_hw_info cmd = {
@@ -363,6 +566,7 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id,
.dev_id = device_id,
.data_len = data_len,
.data_uptr = (uint64_t)data,
+ .out_capabilities = 0,
};
int ret;
@@ -399,14 +603,19 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id,
assert(!info->flags);
}
+ if (capabilities)
+ *capabilities = cmd.out_capabilities;
+
return 0;
}
-#define test_cmd_get_hw_info(device_id, data, data_len) \
- ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, \
- data, data_len))
+#define test_cmd_get_hw_info(device_id, data, data_len) \
+ ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data, \
+ data_len, NULL))
+
+#define test_err_get_hw_info(_errno, device_id, data, data_len) \
+ EXPECT_ERRNO(_errno, _test_cmd_get_hw_info(self->fd, device_id, data, \
+ data_len, NULL))
-#define test_err_get_hw_info(_errno, device_id, data, data_len) \
- EXPECT_ERRNO(_errno, \
- _test_cmd_get_hw_info(self->fd, device_id, \
- data, data_len))
+#define test_cmd_get_hw_capabilities(device_id, caps, mask) \
+ ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, &caps))
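
Tying the capability bits together: userspace is expected to probe
out_capabilities before requesting an enforcing domain, which is what the
enforce_dirty and device_dirty_capability tests model. A minimal sketch
under those assumptions (alloc_enforced() is a hypothetical helper):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/iommufd.h>

	static int alloc_enforced(int iommufd, uint32_t idev_id,
				  uint32_t ioas_id, uint32_t *out_hwpt_id)
	{
		struct iommu_hw_info info = {
			.size = sizeof(info),
			.dev_id = idev_id,	/* data_len == 0: caps only */
		};
		struct iommu_hwpt_alloc alloc = {
			.size = sizeof(alloc),
			.flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
			.dev_id = idev_id,
			.pt_id = ioas_id,
		};

		if (ioctl(iommufd, IOMMU_GET_HW_INFO, &info))
			return -1;
		if (!(info.out_capabilities & IOMMU_HW_CAP_DIRTY_TRACKING))
			return -1;	/* alloc would fail with EOPNOTSUPP */
		if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &alloc))
			return -1;
		*out_hwpt_id = alloc.out_hwpt_id;
		return 0;
	}
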