summaryrefslogtreecommitdiff
path: root/drivers/iommu
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-11-02 05:44:56 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2023-11-02 05:44:56 +0300
commit463f46e114f74465cf8d01b124e7b74ad1ce2afd (patch)
tree67ba1f82d7a3baba926eab2a896b0f4c4f3aced2 /drivers/iommu
parentff269e2cd5adce4ae14f883fc9c8803bc43ee1e9 (diff)
parentb2b67c997bf74453f3469d8b54e4859f190943bd (diff)
downloadlinux-463f46e114f74465cf8d01b124e7b74ad1ce2afd.tar.xz
Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd
Pull iommufd updates from Jason Gunthorpe: "This brings three new iommufd capabilities: - Dirty tracking for DMA. AMD/ARM/Intel CPUs can now record if a DMA writes to a page in the IOPTEs within the IO page table. This can be used to generate a record of what memory is being dirtied by DMA activities during a VM migration process. A VMM like qemu will combine the IOMMU dirty bits with the CPU's dirty log to determine what memory to transfer. VFIO already has a DMA dirty tracking framework that requires PCI devices to implement tracking HW internally. The iommufd version provides an alternative that the VMM can select, if available. The two are designed to have very similar APIs. - Userspace controlled attributes for hardware page tables (HWPT/iommu_domain). There are currently a few generic attributes for HWPTs (support dirty tracking, and parent of a nest). This is an entry point for the userspace iommu driver to control the HW in detail. - Nested translation support for HWPTs. This is a 2D translation scheme similar to the CPU where a DMA goes through a first stage to determine an intermediate address which is then translated trough a second stage to a physical address. Like for CPU translation the first stage table would exist in VM controlled memory and the second stage is in the kernel and matches the VM's guest to physical map. As every IOMMU has a unique set of parameter to describe the S1 IO page table and its associated parameters the userspace IOMMU driver has to marshal the information into the correct format. This is 1/3 of the feature, it allows creating the nested translation and binding it to VFIO devices, however the API to support IOTLB and ATC invalidation of the stage 1 io page table, and forwarding of IO faults are still in progress. The series includes AMD and Intel support for dirty tracking. Intel support for nested translation. Along the way are a number of internal items: - New iommu core items: ops->domain_alloc_user(), ops->set_dirty_tracking, ops->read_and_clear_dirty(), IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user - UAF fix in iopt_area_split() - Spelling fixes and some test suite improvement" * tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits) iommufd: Organize the mock domain alloc functions closer to Joerg's tree iommufd/selftest: Fix page-size check in iommufd_test_dirty() iommufd: Add iopt_area_alloc() iommufd: Fix missing update of domains_itree after splitting iopt_area iommu/vt-d: Disallow read-only mappings to nest parent domain iommu/vt-d: Add nested domain allocation iommu/vt-d: Set the nested domain to a device iommu/vt-d: Make domain attach helpers to be extern iommu/vt-d: Add helper to setup pasid nested translation iommu/vt-d: Add helper for nested domain allocation iommu/vt-d: Extend dmar_domain to support nested domain iommufd: Add data structure for Intel VT-d stage-1 domain allocation iommu/vt-d: Enhance capability check for nested parent domain allocation iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs iommufd/selftest: Add nested domain allocation for mock domain iommu: Add iommu_copy_struct_from_user helper iommufd: Add a nested HW pagetable object iommu: Pass in parent domain with user_data to domain_alloc_user op iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable ...
Diffstat (limited to 'drivers/iommu')
-rw-r--r--drivers/iommu/Kconfig4
-rw-r--r--drivers/iommu/amd/Kconfig1
-rw-r--r--drivers/iommu/amd/amd_iommu_types.h12
-rw-r--r--drivers/iommu/amd/io_pgtable.c68
-rw-r--r--drivers/iommu/amd/iommu.c147
-rw-r--r--drivers/iommu/intel/Kconfig1
-rw-r--r--drivers/iommu/intel/Makefile2
-rw-r--r--drivers/iommu/intel/iommu.c156
-rw-r--r--drivers/iommu/intel/iommu.h64
-rw-r--r--drivers/iommu/intel/nested.c117
-rw-r--r--drivers/iommu/intel/pasid.c221
-rw-r--r--drivers/iommu/intel/pasid.h6
-rw-r--r--drivers/iommu/iommufd/Makefile1
-rw-r--r--drivers/iommu/iommufd/device.c174
-rw-r--r--drivers/iommu/iommufd/hw_pagetable.c304
-rw-r--r--drivers/iommu/iommufd/io_pagetable.c200
-rw-r--r--drivers/iommu/iommufd/iommufd_private.h84
-rw-r--r--drivers/iommu/iommufd/iommufd_test.h39
-rw-r--r--drivers/iommu/iommufd/iova_bitmap.c426
-rw-r--r--drivers/iommu/iommufd/main.c17
-rw-r--r--drivers/iommu/iommufd/pages.c2
-rw-r--r--drivers/iommu/iommufd/selftest.c326
-rw-r--r--drivers/iommu/iommufd/vfio_compat.c6
23 files changed, 2200 insertions, 178 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 7f04491ca5f0..ee9e2a2edbf5 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -7,6 +7,10 @@ config IOMMU_IOVA
config IOMMU_API
bool
+config IOMMUFD_DRIVER
+ bool
+ default n
+
menuconfig IOMMU_SUPPORT
bool "IOMMU Hardware Support"
depends on MMU
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index 9b5fc3356bf2..8bd4c3b183ec 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -10,6 +10,7 @@ config AMD_IOMMU
select IOMMU_API
select IOMMU_IOVA
select IOMMU_IO_PGTABLE
+ select IOMMUFD_DRIVER if IOMMUFD
depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
help
With this option you can enable support for AMD IOMMU hardware in
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 7dc30c2b56b3..dec4e5c2b66b 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -97,7 +97,9 @@
#define FEATURE_GATS_MASK (3ULL)
#define FEATURE_GAM_VAPIC BIT_ULL(21)
#define FEATURE_GIOSUP BIT_ULL(48)
+#define FEATURE_HASUP BIT_ULL(49)
#define FEATURE_EPHSUP BIT_ULL(50)
+#define FEATURE_HDSUP BIT_ULL(52)
#define FEATURE_SNP BIT_ULL(63)
#define FEATURE_PASID_SHIFT 32
@@ -212,6 +214,7 @@
/* macros and definitions for device table entries */
#define DEV_ENTRY_VALID 0x00
#define DEV_ENTRY_TRANSLATION 0x01
+#define DEV_ENTRY_HAD 0x07
#define DEV_ENTRY_PPR 0x34
#define DEV_ENTRY_IR 0x3d
#define DEV_ENTRY_IW 0x3e
@@ -371,9 +374,15 @@
(1ULL << (12 + (9 * (level))))
/*
+ * The IOPTE dirty bit
+ */
+#define IOMMU_PTE_HD_BIT (6)
+
+/*
* Bit value definition for I/O PTE fields
*/
#define IOMMU_PTE_PR BIT_ULL(0)
+#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
#define IOMMU_PTE_U BIT_ULL(59)
#define IOMMU_PTE_FC BIT_ULL(60)
#define IOMMU_PTE_IR BIT_ULL(61)
@@ -384,6 +393,7 @@
*/
#define DTE_FLAG_V BIT_ULL(0)
#define DTE_FLAG_TV BIT_ULL(1)
+#define DTE_FLAG_HAD (3ULL << 7)
#define DTE_FLAG_GIOV BIT_ULL(54)
#define DTE_FLAG_GV BIT_ULL(55)
#define DTE_GLX_SHIFT (56)
@@ -413,6 +423,7 @@
#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
+#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
@@ -563,6 +574,7 @@ struct protection_domain {
int nid; /* Node ID */
u64 *gcr3_tbl; /* Guest CR3 table */
unsigned long flags; /* flags to find out type of domain */
+ bool dirty_tracking; /* dirty tracking is enabled in the domain */
unsigned dev_cnt; /* devices assigned to this domain */
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
};
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index 2892aa1b4dc1..6c0621f6f572 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
return (__pte & ~offset_mask) | (iova & offset_mask);
}
+static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
+ unsigned long flags)
+{
+ bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
+ bool dirty = false;
+ int i, count;
+
+ /*
+ * 2.2.3.2 Host Dirty Support
+ * When a non-default page size is used , software must OR the
+ * Dirty bits in all of the replicated host PTEs used to map
+ * the page. The IOMMU does not guarantee the Dirty bits are
+ * set in all of the replicated PTEs. Any portion of the page
+ * may have been written even if the Dirty bit is set in only
+ * one of the replicated PTEs.
+ */
+ count = PAGE_SIZE_PTE_COUNT(size);
+ for (i = 0; i < count && test_only; i++) {
+ if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
+ dirty = true;
+ break;
+ }
+ }
+
+ for (i = 0; i < count && !test_only; i++) {
+ if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
+ (unsigned long *)&ptep[i])) {
+ dirty = true;
+ }
+ }
+
+ return dirty;
+}
+
+static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
+ unsigned long end = iova + size - 1;
+
+ do {
+ unsigned long pgsize = 0;
+ u64 *ptep, pte;
+
+ ptep = fetch_pte(pgtable, iova, &pgsize);
+ if (ptep)
+ pte = READ_ONCE(*ptep);
+ if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
+ pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
+ iova += pgsize;
+ continue;
+ }
+
+ /*
+ * Mark the whole IOVA range as dirty even if only one of
+ * the replicated PTEs were marked dirty.
+ */
+ if (pte_test_and_clear_dirty(ptep, pgsize, flags))
+ iommu_dirty_bitmap_record(dirty, iova, pgsize);
+ iova += pgsize;
+ } while (iova < end);
+
+ return 0;
+}
+
/*
* ----------------------------------------------------
*/
@@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
pgtable->iop.ops.map_pages = iommu_v1_map_pages;
pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages;
pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
+ pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
return &pgtable->iop;
}
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 95bd7c25ba6f..b399c5741378 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -37,6 +37,7 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/dma.h>
+#include <uapi/linux/iommufd.h>
#include "amd_iommu.h"
#include "../dma-iommu.h"
@@ -65,6 +66,7 @@ LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);
const struct iommu_ops amd_iommu_ops;
+const struct iommu_dirty_ops amd_dirty_ops;
static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
int amd_iommu_max_glx_val = -1;
@@ -1610,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
pte_root |= 1ULL << DEV_ENTRY_PPR;
}
+ if (domain->dirty_tracking)
+ pte_root |= DTE_FLAG_HAD;
+
if (domain->flags & PD_IOMMUV2_MASK) {
u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
u64 glx = domain->glx;
@@ -2155,28 +2160,79 @@ static inline u64 dma_max_address(void)
return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
}
-static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
+static bool amd_iommu_hd_support(struct amd_iommu *iommu)
{
+ return iommu && (iommu->features & FEATURE_HDSUP);
+}
+
+static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
+ struct device *dev, u32 flags)
+{
+ bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
struct protection_domain *domain;
+ struct amd_iommu *iommu = NULL;
+
+ if (dev) {
+ iommu = rlookup_amd_iommu(dev);
+ if (!iommu)
+ return ERR_PTR(-ENODEV);
+ }
/*
* Since DTE[Mode]=0 is prohibited on SNP-enabled system,
* default to use IOMMU_DOMAIN_DMA[_FQ].
*/
if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
- return NULL;
+ return ERR_PTR(-EINVAL);
+
+ if (dirty_tracking && !amd_iommu_hd_support(iommu))
+ return ERR_PTR(-EOPNOTSUPP);
domain = protection_domain_alloc(type);
if (!domain)
- return NULL;
+ return ERR_PTR(-ENOMEM);
domain->domain.geometry.aperture_start = 0;
domain->domain.geometry.aperture_end = dma_max_address();
domain->domain.geometry.force_aperture = true;
+ if (iommu) {
+ domain->domain.type = type;
+ domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
+ domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+
+ if (dirty_tracking)
+ domain->domain.dirty_ops = &amd_dirty_ops;
+ }
+
return &domain->domain;
}
+static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
+{
+ struct iommu_domain *domain;
+
+ domain = do_iommu_domain_alloc(type, NULL, 0);
+ if (IS_ERR(domain))
+ return NULL;
+
+ return domain;
+}
+
+static struct iommu_domain *
+amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+
+{
+ unsigned int type = IOMMU_DOMAIN_UNMANAGED;
+
+ if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return do_iommu_domain_alloc(type, dev, flags);
+}
+
static void amd_iommu_domain_free(struct iommu_domain *dom)
{
struct protection_domain *domain;
@@ -2214,6 +2270,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
dev_data->defer_attach = false;
+ /*
+ * Restrict to devices with compatible IOMMU hardware support
+ * when enforcement of dirty tracking is enabled.
+ */
+ if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
+ return -EINVAL;
+
if (dev_data->domain)
detach_device(dev);
@@ -2332,6 +2395,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return true;
case IOMMU_CAP_DEFERRED_FLUSH:
return true;
+ case IOMMU_CAP_DIRTY_TRACKING: {
+ struct amd_iommu *iommu = rlookup_amd_iommu(dev);
+
+ return amd_iommu_hd_support(iommu);
+ }
default:
break;
}
@@ -2339,6 +2407,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return false;
}
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable)
+{
+ struct protection_domain *pdomain = to_pdomain(domain);
+ struct dev_table_entry *dev_table;
+ struct iommu_dev_data *dev_data;
+ bool domain_flush = false;
+ struct amd_iommu *iommu;
+ unsigned long flags;
+ u64 pte_root;
+
+ spin_lock_irqsave(&pdomain->lock, flags);
+ if (!(pdomain->dirty_tracking ^ enable)) {
+ spin_unlock_irqrestore(&pdomain->lock, flags);
+ return 0;
+ }
+
+ list_for_each_entry(dev_data, &pdomain->dev_list, list) {
+ iommu = rlookup_amd_iommu(dev_data->dev);
+ if (!iommu)
+ continue;
+
+ dev_table = get_dev_table(iommu);
+ pte_root = dev_table[dev_data->devid].data[0];
+
+ pte_root = (enable ? pte_root | DTE_FLAG_HAD :
+ pte_root & ~DTE_FLAG_HAD);
+
+ /* Flush device DTE */
+ dev_table[dev_data->devid].data[0] = pte_root;
+ device_flush_dte(dev_data);
+ domain_flush = true;
+ }
+
+ /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
+ if (domain_flush) {
+ amd_iommu_domain_flush_tlb_pde(pdomain);
+ amd_iommu_domain_flush_complete(pdomain);
+ }
+ pdomain->dirty_tracking = enable;
+ spin_unlock_irqrestore(&pdomain->lock, flags);
+
+ return 0;
+}
+
+static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct protection_domain *pdomain = to_pdomain(domain);
+ struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
+ unsigned long lflags;
+
+ if (!ops || !ops->read_and_clear_dirty)
+ return -EOPNOTSUPP;
+
+ spin_lock_irqsave(&pdomain->lock, lflags);
+ if (!pdomain->dirty_tracking && dirty->bitmap) {
+ spin_unlock_irqrestore(&pdomain->lock, lflags);
+ return -EINVAL;
+ }
+ spin_unlock_irqrestore(&pdomain->lock, lflags);
+
+ return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
+}
+
static void amd_iommu_get_resv_regions(struct device *dev,
struct list_head *head)
{
@@ -2461,9 +2596,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
return true;
}
+const struct iommu_dirty_ops amd_dirty_ops = {
+ .set_dirty_tracking = amd_iommu_set_dirty_tracking,
+ .read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
+};
+
const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
.domain_alloc = amd_iommu_domain_alloc,
+ .domain_alloc_user = amd_iommu_domain_alloc_user,
.probe_device = amd_iommu_probe_device,
.release_device = amd_iommu_release_device,
.probe_finalize = amd_iommu_probe_finalize,
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 119d2c57a48e..012cd2541a68 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -15,6 +15,7 @@ config INTEL_IOMMU
select DMA_OPS
select IOMMU_API
select IOMMU_IOVA
+ select IOMMUFD_DRIVER if IOMMUFD
select NEED_DMA_MAP_STATE
select DMAR_TABLE
select SWIOTLB
diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
index 7af3b8a4f2a0..5dabf081a779 100644
--- a/drivers/iommu/intel/Makefile
+++ b/drivers/iommu/intel/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_DMAR_TABLE) += dmar.o
-obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o
+obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o
obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o
obj-$(CONFIG_DMAR_PERF) += perf.o
obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 3685ba90ec88..d1037280abf7 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -282,7 +282,6 @@ static LIST_HEAD(dmar_satc_units);
#define for_each_rmrr_units(rmrr) \
list_for_each_entry(rmrr, &dmar_rmrr_units, list)
-static void device_block_translation(struct device *dev);
static void intel_iommu_domain_free(struct iommu_domain *domain);
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
@@ -300,6 +299,7 @@ static int iommu_skip_te_disable;
#define IDENTMAP_AZALIA 4
const struct iommu_ops intel_iommu_ops;
+const struct iommu_dirty_ops intel_dirty_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
@@ -560,7 +560,7 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
}
/* Some capabilities may be different across iommus */
-static void domain_update_iommu_cap(struct dmar_domain *domain)
+void domain_update_iommu_cap(struct dmar_domain *domain)
{
domain_update_iommu_coherency(domain);
domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
@@ -1778,8 +1778,7 @@ static struct dmar_domain *alloc_domain(unsigned int type)
return domain;
}
-static int domain_attach_iommu(struct dmar_domain *domain,
- struct intel_iommu *iommu)
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
struct iommu_domain_info *info, *curr;
unsigned long ndomains;
@@ -1828,8 +1827,7 @@ err_unlock:
return ret;
}
-static void domain_detach_iommu(struct dmar_domain *domain,
- struct intel_iommu *iommu)
+void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
struct iommu_domain_info *info;
@@ -2196,6 +2194,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
return -EINVAL;
+ if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
+ pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+ return -EINVAL;
+ }
+
attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
attr |= DMA_FL_PTE_PRESENT;
if (domain->use_first_level) {
@@ -3958,7 +3961,7 @@ static void dmar_remove_one_dev_info(struct device *dev)
* all DMA requests without PASID from the device are blocked. If the page
* table has been set, clean up the data structures.
*/
-static void device_block_translation(struct device *dev)
+void device_block_translation(struct device *dev)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
@@ -4058,14 +4061,62 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
return NULL;
}
+static struct iommu_domain *
+intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
+ struct intel_iommu *iommu = info->iommu;
+ struct iommu_domain *domain;
+
+ /* Must be NESTING domain */
+ if (parent) {
+ if (!nested_supported(iommu) || flags)
+ return ERR_PTR(-EOPNOTSUPP);
+ return intel_nested_domain_alloc(parent, user_data);
+ }
+
+ if (flags &
+ (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (nested_parent && !nested_supported(iommu))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (user_data || (dirty_tracking && !ssads_supported(iommu)))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * domain_alloc_user op needs to fully initialize a domain before
+ * return, so uses iommu_domain_alloc() here for simple.
+ */
+ domain = iommu_domain_alloc(dev->bus);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ if (nested_parent)
+ to_dmar_domain(domain)->nested_parent = true;
+
+ if (dirty_tracking) {
+ if (to_dmar_domain(domain)->use_first_level) {
+ iommu_domain_free(domain);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ domain->dirty_ops = &intel_dirty_ops;
+ }
+
+ return domain;
+}
+
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
if (domain != &si_domain->domain && domain != &blocking_domain)
domain_exit(to_dmar_domain(domain));
}
-static int prepare_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+int prepare_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu;
@@ -4078,6 +4129,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain,
if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
return -EINVAL;
+ if (domain->dirty_ops && !ssads_supported(iommu))
+ return -EINVAL;
+
/* check if this iommu agaw is sufficient for max mapped address */
addr_width = agaw_to_width(iommu->agaw);
if (addr_width > cap_mgaw(iommu->cap))
@@ -4332,6 +4386,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
return dmar_platform_optin();
case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
return ecap_sc_support(info->iommu->ecap);
+ case IOMMU_CAP_DIRTY_TRACKING:
+ return ssads_supported(info->iommu);
default:
return false;
}
@@ -4729,6 +4785,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
return -EOPNOTSUPP;
+ if (domain->dirty_ops)
+ return -EINVAL;
+
if (context_copied(iommu, info->bus, info->devfn))
return -EBUSY;
@@ -4780,6 +4839,7 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
if (!vtd)
return ERR_PTR(-ENOMEM);
+ vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
vtd->cap_reg = iommu->cap;
vtd->ecap_reg = iommu->ecap;
*length = sizeof(*vtd);
@@ -4787,10 +4847,88 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
return vtd;
}
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ int ret;
+
+ spin_lock(&dmar_domain->lock);
+ if (dmar_domain->dirty_tracking == enable)
+ goto out_unlock;
+
+ list_for_each_entry(info, &dmar_domain->devices, link) {
+ ret = intel_pasid_setup_dirty_tracking(info->iommu,
+ info->domain, info->dev,
+ IOMMU_NO_PASID, enable);
+ if (ret)
+ goto err_unwind;
+ }
+
+ dmar_domain->dirty_tracking = enable;
+out_unlock:
+ spin_unlock(&dmar_domain->lock);
+
+ return 0;
+
+err_unwind:
+ list_for_each_entry(info, &dmar_domain->devices, link)
+ intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
+ info->dev, IOMMU_NO_PASID,
+ dmar_domain->dirty_tracking);
+ spin_unlock(&dmar_domain->lock);
+ return ret;
+}
+
+static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ unsigned long end = iova + size - 1;
+ unsigned long pgsize;
+
+ /*
+ * IOMMUFD core calls into a dirty tracking disabled domain without an
+ * IOVA bitmap set in order to clean dirty bits in all PTEs that might
+ * have occurred when we stopped dirty tracking. This ensures that we
+ * never inherit dirtied bits from a previous cycle.
+ */
+ if (!dmar_domain->dirty_tracking && dirty->bitmap)
+ return -EINVAL;
+
+ do {
+ struct dma_pte *pte;
+ int lvl = 0;
+
+ pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
+ GFP_ATOMIC);
+ pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
+ if (!pte || !dma_pte_present(pte)) {
+ iova += pgsize;
+ continue;
+ }
+
+ if (dma_sl_pte_test_and_clear_dirty(pte, flags))
+ iommu_dirty_bitmap_record(dirty, iova, pgsize);
+ iova += pgsize;
+ } while (iova < end);
+
+ return 0;
+}
+
+const struct iommu_dirty_ops intel_dirty_ops = {
+ .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+ .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
+};
+
const struct iommu_ops intel_iommu_ops = {
.capable = intel_iommu_capable,
.hw_info = intel_iommu_hw_info,
.domain_alloc = intel_iommu_domain_alloc,
+ .domain_alloc_user = intel_iommu_domain_alloc_user,
.probe_device = intel_iommu_probe_device,
.probe_finalize = intel_iommu_probe_finalize,
.release_device = intel_iommu_release_device,
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 7dac94f62b4e..d796d0d9b114 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -25,6 +25,7 @@
#include <asm/cacheflush.h>
#include <asm/iommu.h>
+#include <uapi/linux/iommufd.h>
/*
* VT-d hardware uses 4KiB page size regardless of host page size.
@@ -48,6 +49,9 @@
#define DMA_FL_PTE_DIRTY BIT_ULL(6)
#define DMA_FL_PTE_XD BIT_ULL(63)
+#define DMA_SL_PTE_DIRTY_BIT 9
+#define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT)
+
#define ADDR_WIDTH_5LEVEL (57)
#define ADDR_WIDTH_4LEVEL (48)
@@ -539,6 +543,10 @@ enum {
#define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
#define pasid_supported(iommu) (sm_supported(iommu) && \
ecap_pasid((iommu)->ecap))
+#define ssads_supported(iommu) (sm_supported(iommu) && \
+ ecap_slads((iommu)->ecap))
+#define nested_supported(iommu) (sm_supported(iommu) && \
+ ecap_nest((iommu)->ecap))
struct pasid_entry;
struct pasid_state_entry;
@@ -592,20 +600,45 @@ struct dmar_domain {
* otherwise, goes through the second
* level.
*/
+ u8 dirty_tracking:1; /* Dirty tracking is enabled */
+ u8 nested_parent:1; /* Has other domains nested on it */
spinlock_t lock; /* Protect device tracking lists */
struct list_head devices; /* all devices' list */
struct list_head dev_pasids; /* all attached pasids */
- struct dma_pte *pgd; /* virtual address */
- int gaw; /* max guest address width */
-
- /* adjusted guest address width, 0 is level 2 30-bit */
- int agaw;
int iommu_superpage;/* Level of superpages supported:
0 == 4KiB (no superpages), 1 == 2MiB,
2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
- u64 max_addr; /* maximum mapped address */
+ union {
+ /* DMA remapping domain */
+ struct {
+ /* virtual address */
+ struct dma_pte *pgd;
+ /* max guest address width */
+ int gaw;
+ /*
+ * adjusted guest address width:
+ * 0: level 2 30-bit
+ * 1: level 3 39-bit
+ * 2: level 4 48-bit
+ * 3: level 5 57-bit
+ */
+ int agaw;
+ /* maximum mapped address */
+ u64 max_addr;
+ };
+
+ /* Nested user domain */
+ struct {
+ /* parent page table which the user domain is nested on */
+ struct dmar_domain *s2_domain;
+ /* user page table pointer (in GPA) */
+ unsigned long s1_pgtbl;
+ /* page table attributes */
+ struct iommu_hwpt_vtd_s1 s1_cfg;
+ };
+ };
struct iommu_domain domain; /* generic domain data structure for
iommu core */
@@ -781,6 +814,16 @@ static inline bool dma_pte_present(struct dma_pte *pte)
return (pte->val & 3) != 0;
}
+static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
+ unsigned long flags)
+{
+ if (flags & IOMMU_DIRTY_NO_CLEAR)
+ return (pte->val & DMA_SL_PTE_DIRTY) != 0;
+
+ return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
+ (unsigned long *)&pte->val);
+}
+
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
return (pte->val & DMA_PTE_LARGE_PAGE);
@@ -836,12 +879,21 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
*/
#define QI_OPT_WAIT_DRAIN BIT(0)
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+void device_block_translation(struct device *dev);
+int prepare_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev);
+void domain_update_iommu_cap(struct dmar_domain *domain);
+
int dmar_ir_support(void);
void *alloc_pgtable_page(int node, gfp_t gfp);
void free_pgtable_page(void *vaddr);
void iommu_flush_write_buffer(struct intel_iommu *iommu);
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
+struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
+ const struct iommu_user_data *user_data);
#ifdef CONFIG_INTEL_IOMMU_SVM
void intel_svm_check(struct intel_iommu *iommu);
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
new file mode 100644
index 000000000000..b5a5563ab32c
--- /dev/null
+++ b/drivers/iommu/intel/nested.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * nested.c - nested mode translation support
+ *
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ * Jacob Pan <jacob.jun.pan@linux.intel.com>
+ * Yi Liu <yi.l.liu@intel.com>
+ */
+
+#define pr_fmt(fmt) "DMAR: " fmt
+
+#include <linux/iommu.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+
+#include "iommu.h"
+#include "pasid.h"
+
+static int intel_nested_attach_dev(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu = info->iommu;
+ unsigned long flags;
+ int ret = 0;
+
+ if (info->domain)
+ device_block_translation(dev);
+
+ if (iommu->agaw < dmar_domain->s2_domain->agaw) {
+ dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
+ return -ENODEV;
+ }
+
+ /*
+ * Stage-1 domain cannot work alone, it is nested on a s2_domain.
+ * The s2_domain will be used in nested translation, hence needs
+ * to ensure the s2_domain is compatible with this IOMMU.
+ */
+ ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev);
+ if (ret) {
+ dev_err_ratelimited(dev, "s2 domain is not compatible\n");
+ return ret;
+ }
+
+ ret = domain_attach_iommu(dmar_domain, iommu);
+ if (ret) {
+ dev_err_ratelimited(dev, "Failed to attach domain to iommu\n");
+ return ret;
+ }
+
+ ret = intel_pasid_setup_nested(iommu, dev,
+ IOMMU_NO_PASID, dmar_domain);
+ if (ret) {
+ domain_detach_iommu(dmar_domain, iommu);
+ dev_err_ratelimited(dev, "Failed to setup pasid entry\n");
+ return ret;
+ }
+
+ info->domain = dmar_domain;
+ spin_lock_irqsave(&dmar_domain->lock, flags);
+ list_add(&info->link, &dmar_domain->devices);
+ spin_unlock_irqrestore(&dmar_domain->lock, flags);
+
+ return 0;
+}
+
+static void intel_nested_domain_free(struct iommu_domain *domain)
+{
+ kfree(to_dmar_domain(domain));
+}
+
+static const struct iommu_domain_ops intel_nested_domain_ops = {
+ .attach_dev = intel_nested_attach_dev,
+ .free = intel_nested_domain_free,
+};
+
+struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ struct dmar_domain *s2_domain = to_dmar_domain(parent);
+ struct iommu_hwpt_vtd_s1 vtd;
+ struct dmar_domain *domain;
+ int ret;
+
+ /* Must be nested domain */
+ if (user_data->type != IOMMU_HWPT_DATA_VTD_S1)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (parent->ops != intel_iommu_ops.default_domain_ops ||
+ !s2_domain->nested_parent)
+ return ERR_PTR(-EINVAL);
+
+ ret = iommu_copy_struct_from_user(&vtd, user_data,
+ IOMMU_HWPT_DATA_VTD_S1, __reserved);
+ if (ret)
+ return ERR_PTR(ret);
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL_ACCOUNT);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ domain->use_first_level = true;
+ domain->s2_domain = s2_domain;
+ domain->s1_pgtbl = vtd.pgtbl_addr;
+ domain->s1_cfg = vtd;
+ domain->domain.ops = &intel_nested_domain_ops;
+ domain->domain.type = IOMMU_DOMAIN_NESTED;
+ INIT_LIST_HEAD(&domain->devices);
+ INIT_LIST_HEAD(&domain->dev_pasids);
+ spin_lock_init(&domain->lock);
+ xa_init(&domain->iommu_array);
+
+ return &domain->domain;
+}
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 8f92b92f3d2a..74e8e4c17e81 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
WRITE_ONCE(*ptr, (old & ~mask) | bits);
}
+static inline u64 pasid_get_bits(u64 *ptr)
+{
+ return READ_ONCE(*ptr);
+}
+
/*
* Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
* PASID entry.
@@ -336,6 +341,45 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe)
}
/*
+ * Enable second level A/D bits by setting the SLADE (Second Level
+ * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_set_ssade(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9);
+}
+
+/*
+ * Disable second level A/D bits by clearing the SLADE (Second Level
+ * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_clear_ssade(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[0], 1 << 9, 0);
+}
+
+/*
+ * Checks if second level A/D bits specifically the SLADE (Second Level
+ * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
+ * entry is set.
+ */
+static inline bool pasid_get_ssade(struct pasid_entry *pe)
+{
+ return pasid_get_bits(&pe->val[0]) & (1 << 9);
+}
+
+/*
+ * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_sre(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[2], 1 << 0, 1);
+}
+
+/*
* Setup the WPE(Write Protect Enable) field (Bit 132) of a
* scalable mode PASID entry.
*/
@@ -402,6 +446,15 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value)
pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
}
+/*
+ * Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
+ * of a scalable mode PASID entry.
+ */
+static inline void pasid_set_eafe(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
+}
+
static void
pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
u16 did, u32 pasid)
@@ -627,6 +680,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
pasid_set_fault_enable(pte);
pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ if (domain->dirty_tracking)
+ pasid_set_ssade(pte);
pasid_set_present(pte);
spin_unlock(&iommu->lock);
@@ -637,6 +692,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
}
/*
+ * Set up dirty tracking on a second only or nested translation type.
+ */
+int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u32 pasid,
+ bool enabled)
+{
+ struct pasid_entry *pte;
+ u16 did, pgtt;
+
+ spin_lock(&iommu->lock);
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ spin_unlock(&iommu->lock);
+ dev_err_ratelimited(
+ dev, "Failed to get pasid entry of PASID %d\n", pasid);
+ return -ENODEV;
+ }
+
+ did = domain_id_iommu(domain, iommu);
+ pgtt = pasid_pte_get_pgtt(pte);
+ if (pgtt != PASID_ENTRY_PGTT_SL_ONLY &&
+ pgtt != PASID_ENTRY_PGTT_NESTED) {
+ spin_unlock(&iommu->lock);
+ dev_err_ratelimited(
+ dev,
+ "Dirty tracking not supported on translation type %d\n",
+ pgtt);
+ return -EOPNOTSUPP;
+ }
+
+ if (pasid_get_ssade(pte) == enabled) {
+ spin_unlock(&iommu->lock);
+ return 0;
+ }
+
+ if (enabled)
+ pasid_set_ssade(pte);
+ else
+ pasid_clear_ssade(pte);
+ spin_unlock(&iommu->lock);
+
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(pte, sizeof(*pte));
+
+ /*
+ * From VT-d spec table 25 "Guidance to Software for Invalidations":
+ *
+ * - PASID-selective-within-Domain PASID-cache invalidation
+ * If (PGTT=SS or Nested)
+ * - Domain-selective IOTLB invalidation
+ * Else
+ * - PASID-selective PASID-based IOTLB invalidation
+ * - If (pasid is RID_PASID)
+ * - Global Device-TLB invalidation to affected functions
+ * Else
+ * - PASID-based Device-TLB invalidation (with S=1 and
+ * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
+ */
+ pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+ /* Device IOTLB doesn't need to be flushed in caching mode. */
+ if (!cap_caching_mode(iommu->cap))
+ devtlb_invalidation_with_pasid(iommu, dev, pasid);
+
+ return 0;
+}
+
+/*
* Set up the scalable mode pasid entry for passthrough translation type.
*/
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
@@ -713,3 +840,97 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
if (!cap_caching_mode(iommu->cap))
devtlb_invalidation_with_pasid(iommu, dev, pasid);
}
+
+/**
+ * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
+ * @iommu: IOMMU which the device belong to
+ * @dev: Device to be set up for translation
+ * @pasid: PASID to be programmed in the device PASID table
+ * @domain: User stage-1 domain nested on a stage-2 domain
+ *
+ * This is used for nested translation. The input domain should be
+ * nested type and nested on a parent with 'is_nested_parent' flag
+ * set.
+ */
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+ u32 pasid, struct dmar_domain *domain)
+{
+ struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
+ pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
+ struct dmar_domain *s2_domain = domain->s2_domain;
+ u16 did = domain_id_iommu(domain, iommu);
+ struct dma_pte *pgd = s2_domain->pgd;
+ struct pasid_entry *pte;
+
+ /* Address width should match the address width supported by hardware */
+ switch (s1_cfg->addr_width) {
+ case ADDR_WIDTH_4LEVEL:
+ break;
+ case ADDR_WIDTH_5LEVEL:
+ if (!cap_fl5lp_support(iommu->cap)) {
+ dev_err_ratelimited(dev,
+ "5-level paging not supported\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n",
+ s1_cfg->addr_width);
+ return -EINVAL;
+ }
+
+ if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) {
+ pr_err_ratelimited("No supervisor request support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) {
+ pr_err_ratelimited("No extended access flag support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ spin_lock(&iommu->lock);
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ spin_unlock(&iommu->lock);
+ return -ENODEV;
+ }
+ if (pasid_pte_is_present(pte)) {
+ spin_unlock(&iommu->lock);
+ return -EBUSY;
+ }
+
+ pasid_clear_entry(pte);
+
+ if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
+ pasid_set_flpm(pte, 1);
+
+ pasid_set_flptr(pte, (uintptr_t)s1_gpgd);
+
+ if (s1_cfg->flags & IOMMU_VTD_S1_SRE) {
+ pasid_set_sre(pte);
+ if (s1_cfg->flags & IOMMU_VTD_S1_WPE)
+ pasid_set_wpe(pte);
+ }
+
+ if (s1_cfg->flags & IOMMU_VTD_S1_EAFE)
+ pasid_set_eafe(pte);
+
+ if (s2_domain->force_snooping)
+ pasid_set_pgsnp(pte);
+
+ pasid_set_slptr(pte, virt_to_phys(pgd));
+ pasid_set_fault_enable(pte);
+ pasid_set_domain_id(pte, did);
+ pasid_set_address_width(pte, s2_domain->agaw);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
+ pasid_set_present(pte);
+ spin_unlock(&iommu->lock);
+
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return 0;
+}
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index 4e9e68c3c388..dd37611175cc 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -106,9 +106,15 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid);
+int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u32 pasid,
+ bool enabled);
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid);
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+ u32 pasid, struct dmar_domain *domain);
void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
struct device *dev, u32 pasid,
bool fault_ignore);
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 8aeba81800c5..34b446146961 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -11,3 +11,4 @@ iommufd-y := \
iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
obj-$(CONFIG_IOMMUFD) += iommufd.o
+obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index ce78c3671539..59d3a07300d9 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -293,7 +293,7 @@ u32 iommufd_device_to_id(struct iommufd_device *idev)
EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD);
static int iommufd_group_setup_msi(struct iommufd_group *igroup,
- struct iommufd_hw_pagetable *hwpt)
+ struct iommufd_hwpt_paging *hwpt_paging)
{
phys_addr_t sw_msi_start = igroup->sw_msi_start;
int rc;
@@ -311,8 +311,9 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup,
* matches what the IRQ layer actually expects in a newly created
* domain.
*/
- if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) {
- rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
+ if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) {
+ rc = iommu_get_msi_cookie(hwpt_paging->common.domain,
+ sw_msi_start);
if (rc)
return rc;
@@ -320,7 +321,31 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup,
* iommu_get_msi_cookie() can only be called once per domain,
* it returns -EBUSY on later calls.
*/
- hwpt->msi_cookie = true;
+ hwpt_paging->msi_cookie = true;
+ }
+ return 0;
+}
+
+static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging,
+ struct iommufd_device *idev)
+{
+ int rc;
+
+ lockdep_assert_held(&idev->igroup->lock);
+
+ rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
+ idev->dev,
+ &idev->igroup->sw_msi_start);
+ if (rc)
+ return rc;
+
+ if (list_empty(&idev->igroup->device_list)) {
+ rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging);
+ if (rc) {
+ iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt,
+ idev->dev);
+ return rc;
+ }
}
return 0;
}
@@ -337,18 +362,12 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
goto err_unlock;
}
- /* Try to upgrade the domain we have */
- if (idev->enforce_cache_coherency) {
- rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+ if (hwpt_is_paging(hwpt)) {
+ rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev);
if (rc)
goto err_unlock;
}
- rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev,
- &idev->igroup->sw_msi_start);
- if (rc)
- goto err_unlock;
-
/*
* Only attach to the group once for the first device that is in the
* group. All the other devices will follow this attachment. The user
@@ -357,10 +376,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
* attachment.
*/
if (list_empty(&idev->igroup->device_list)) {
- rc = iommufd_group_setup_msi(idev->igroup, hwpt);
- if (rc)
- goto err_unresv;
-
rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
if (rc)
goto err_unresv;
@@ -371,7 +386,9 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
mutex_unlock(&idev->igroup->lock);
return 0;
err_unresv:
- iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+ if (hwpt_is_paging(hwpt))
+ iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt,
+ idev->dev);
err_unlock:
mutex_unlock(&idev->igroup->lock);
return rc;
@@ -388,7 +405,9 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev)
iommu_detach_group(hwpt->domain, idev->igroup->group);
idev->igroup->hwpt = NULL;
}
- iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+ if (hwpt_is_paging(hwpt))
+ iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt,
+ idev->dev);
mutex_unlock(&idev->igroup->lock);
/* Caller must destroy hwpt */
@@ -407,14 +426,55 @@ iommufd_device_do_attach(struct iommufd_device *idev,
return NULL;
}
+static void
+iommufd_group_remove_reserved_iova(struct iommufd_group *igroup,
+ struct iommufd_hwpt_paging *hwpt_paging)
+{
+ struct iommufd_device *cur;
+
+ lockdep_assert_held(&igroup->lock);
+
+ list_for_each_entry(cur, &igroup->device_list, group_item)
+ iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev);
+}
+
+static int
+iommufd_group_do_replace_paging(struct iommufd_group *igroup,
+ struct iommufd_hwpt_paging *hwpt_paging)
+{
+ struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt;
+ struct iommufd_device *cur;
+ int rc;
+
+ lockdep_assert_held(&igroup->lock);
+
+ if (!hwpt_is_paging(old_hwpt) ||
+ hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) {
+ list_for_each_entry(cur, &igroup->device_list, group_item) {
+ rc = iopt_table_enforce_dev_resv_regions(
+ &hwpt_paging->ioas->iopt, cur->dev, NULL);
+ if (rc)
+ goto err_unresv;
+ }
+ }
+
+ rc = iommufd_group_setup_msi(igroup, hwpt_paging);
+ if (rc)
+ goto err_unresv;
+ return 0;
+
+err_unresv:
+ iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
+ return rc;
+}
+
static struct iommufd_hw_pagetable *
iommufd_device_do_replace(struct iommufd_device *idev,
struct iommufd_hw_pagetable *hwpt)
{
struct iommufd_group *igroup = idev->igroup;
struct iommufd_hw_pagetable *old_hwpt;
- unsigned int num_devices = 0;
- struct iommufd_device *cur;
+ unsigned int num_devices;
int rc;
mutex_lock(&idev->igroup->lock);
@@ -429,42 +489,27 @@ iommufd_device_do_replace(struct iommufd_device *idev,
return NULL;
}
- /* Try to upgrade the domain we have */
- list_for_each_entry(cur, &igroup->device_list, group_item) {
- num_devices++;
- if (cur->enforce_cache_coherency) {
- rc = iommufd_hw_pagetable_enforce_cc(hwpt);
- if (rc)
- goto err_unlock;
- }
- }
-
old_hwpt = igroup->hwpt;
- if (hwpt->ioas != old_hwpt->ioas) {
- list_for_each_entry(cur, &igroup->device_list, group_item) {
- rc = iopt_table_enforce_dev_resv_regions(
- &hwpt->ioas->iopt, cur->dev, NULL);
- if (rc)
- goto err_unresv;
- }
+ if (hwpt_is_paging(hwpt)) {
+ rc = iommufd_group_do_replace_paging(igroup,
+ to_hwpt_paging(hwpt));
+ if (rc)
+ goto err_unlock;
}
- rc = iommufd_group_setup_msi(idev->igroup, hwpt);
- if (rc)
- goto err_unresv;
-
rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
if (rc)
goto err_unresv;
- if (hwpt->ioas != old_hwpt->ioas) {
- list_for_each_entry(cur, &igroup->device_list, group_item)
- iopt_remove_reserved_iova(&old_hwpt->ioas->iopt,
- cur->dev);
- }
+ if (hwpt_is_paging(old_hwpt) &&
+ (!hwpt_is_paging(hwpt) ||
+ to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas))
+ iommufd_group_remove_reserved_iova(igroup,
+ to_hwpt_paging(old_hwpt));
igroup->hwpt = hwpt;
+ num_devices = list_count_nodes(&igroup->device_list);
/*
* Move the refcounts held by the device_list to the new hwpt. Retain a
* refcount for this thread as the caller will free it.
@@ -478,8 +523,9 @@ iommufd_device_do_replace(struct iommufd_device *idev,
/* Caller must destroy old_hwpt */
return old_hwpt;
err_unresv:
- list_for_each_entry(cur, &igroup->device_list, group_item)
- iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev);
+ if (hwpt_is_paging(hwpt))
+ iommufd_group_remove_reserved_iova(igroup,
+ to_hwpt_paging(old_hwpt));
err_unlock:
mutex_unlock(&idev->igroup->lock);
return ERR_PTR(rc);
@@ -507,6 +553,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
*/
bool immediate_attach = do_attach == iommufd_device_do_attach;
struct iommufd_hw_pagetable *destroy_hwpt;
+ struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_hw_pagetable *hwpt;
/*
@@ -515,10 +562,11 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
* other.
*/
mutex_lock(&ioas->mutex);
- list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
- if (!hwpt->auto_domain)
+ list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt_paging->auto_domain)
continue;
+ hwpt = &hwpt_paging->common;
if (!iommufd_lock_obj(&hwpt->obj))
continue;
destroy_hwpt = (*do_attach)(idev, hwpt);
@@ -539,12 +587,13 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
goto out_unlock;
}
- hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev,
- immediate_attach);
- if (IS_ERR(hwpt)) {
- destroy_hwpt = ERR_CAST(hwpt);
+ hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0,
+ immediate_attach, NULL);
+ if (IS_ERR(hwpt_paging)) {
+ destroy_hwpt = ERR_CAST(hwpt_paging);
goto out_unlock;
}
+ hwpt = &hwpt_paging->common;
if (!immediate_attach) {
destroy_hwpt = (*do_attach)(idev, hwpt);
@@ -554,7 +603,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
destroy_hwpt = NULL;
}
- hwpt->auto_domain = true;
+ hwpt_paging->auto_domain = true;
*pt_id = hwpt->obj.id;
iommufd_object_finalize(idev->ictx, &hwpt->obj);
@@ -579,7 +628,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id,
return PTR_ERR(pt_obj);
switch (pt_obj->type) {
- case IOMMUFD_OBJ_HW_PAGETABLE: {
+ case IOMMUFD_OBJ_HWPT_NESTED:
+ case IOMMUFD_OBJ_HWPT_PAGING: {
struct iommufd_hw_pagetable *hwpt =
container_of(pt_obj, struct iommufd_hw_pagetable, obj);
@@ -617,8 +667,8 @@ out_put_pt_obj:
/**
* iommufd_device_attach - Connect a device to an iommu_domain
* @idev: device to attach
- * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
- * Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
+ * Output the IOMMUFD_OBJ_HWPT_PAGING ID
*
* This connects the device to an iommu_domain, either automatically or manually
* selected. Once this completes the device could do DMA.
@@ -646,8 +696,8 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
/**
* iommufd_device_replace - Change the device's iommu_domain
* @idev: device to change
- * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
- * Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
+ * Output the IOMMUFD_OBJ_HWPT_PAGING ID
*
* This is the same as::
*
@@ -1185,6 +1235,10 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
*/
cmd->data_len = data_len;
+ cmd->out_capabilities = 0;
+ if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING))
+ cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING;
+
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_free:
kfree(data);
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index cf2c1504e20d..2abbeafdbd22 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -5,62 +5,87 @@
#include <linux/iommu.h>
#include <uapi/linux/iommufd.h>
+#include "../iommu-priv.h"
#include "iommufd_private.h"
-void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
+void iommufd_hwpt_paging_destroy(struct iommufd_object *obj)
{
- struct iommufd_hw_pagetable *hwpt =
- container_of(obj, struct iommufd_hw_pagetable, obj);
+ struct iommufd_hwpt_paging *hwpt_paging =
+ container_of(obj, struct iommufd_hwpt_paging, common.obj);
- if (!list_empty(&hwpt->hwpt_item)) {
- mutex_lock(&hwpt->ioas->mutex);
- list_del(&hwpt->hwpt_item);
- mutex_unlock(&hwpt->ioas->mutex);
+ if (!list_empty(&hwpt_paging->hwpt_item)) {
+ mutex_lock(&hwpt_paging->ioas->mutex);
+ list_del(&hwpt_paging->hwpt_item);
+ mutex_unlock(&hwpt_paging->ioas->mutex);
- iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+ iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
+ hwpt_paging->common.domain);
}
- if (hwpt->domain)
- iommu_domain_free(hwpt->domain);
+ if (hwpt_paging->common.domain)
+ iommu_domain_free(hwpt_paging->common.domain);
- refcount_dec(&hwpt->ioas->obj.users);
+ refcount_dec(&hwpt_paging->ioas->obj.users);
}
-void iommufd_hw_pagetable_abort(struct iommufd_object *obj)
+void iommufd_hwpt_paging_abort(struct iommufd_object *obj)
{
- struct iommufd_hw_pagetable *hwpt =
- container_of(obj, struct iommufd_hw_pagetable, obj);
+ struct iommufd_hwpt_paging *hwpt_paging =
+ container_of(obj, struct iommufd_hwpt_paging, common.obj);
/* The ioas->mutex must be held until finalize is called. */
- lockdep_assert_held(&hwpt->ioas->mutex);
+ lockdep_assert_held(&hwpt_paging->ioas->mutex);
- if (!list_empty(&hwpt->hwpt_item)) {
- list_del_init(&hwpt->hwpt_item);
- iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+ if (!list_empty(&hwpt_paging->hwpt_item)) {
+ list_del_init(&hwpt_paging->hwpt_item);
+ iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
+ hwpt_paging->common.domain);
}
- iommufd_hw_pagetable_destroy(obj);
+ iommufd_hwpt_paging_destroy(obj);
}
-int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt)
+void iommufd_hwpt_nested_destroy(struct iommufd_object *obj)
{
- if (hwpt->enforce_cache_coherency)
+ struct iommufd_hwpt_nested *hwpt_nested =
+ container_of(obj, struct iommufd_hwpt_nested, common.obj);
+
+ if (hwpt_nested->common.domain)
+ iommu_domain_free(hwpt_nested->common.domain);
+
+ refcount_dec(&hwpt_nested->parent->common.obj.users);
+}
+
+void iommufd_hwpt_nested_abort(struct iommufd_object *obj)
+{
+ iommufd_hwpt_nested_destroy(obj);
+}
+
+static int
+iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging)
+{
+ struct iommu_domain *paging_domain = hwpt_paging->common.domain;
+
+ if (hwpt_paging->enforce_cache_coherency)
return 0;
- if (hwpt->domain->ops->enforce_cache_coherency)
- hwpt->enforce_cache_coherency =
- hwpt->domain->ops->enforce_cache_coherency(
- hwpt->domain);
- if (!hwpt->enforce_cache_coherency)
+ if (paging_domain->ops->enforce_cache_coherency)
+ hwpt_paging->enforce_cache_coherency =
+ paging_domain->ops->enforce_cache_coherency(
+ paging_domain);
+ if (!hwpt_paging->enforce_cache_coherency)
return -EINVAL;
return 0;
}
/**
- * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device
+ * iommufd_hwpt_paging_alloc() - Get a PAGING iommu_domain for a device
* @ictx: iommufd context
* @ioas: IOAS to associate the domain with
* @idev: Device to get an iommu_domain for
+ * @flags: Flags from userspace
* @immediate_attach: True if idev should be attached to the hwpt
+ * @user_data: The user provided driver specific data describing the domain to
+ * create
*
* Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT
* will be linked to the given ioas and upon return the underlying iommu_domain
@@ -70,28 +95,52 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt)
* iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on
* the returned hwpt.
*/
-struct iommufd_hw_pagetable *
-iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
- struct iommufd_device *idev, bool immediate_attach)
+struct iommufd_hwpt_paging *
+iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+ struct iommufd_device *idev, u32 flags,
+ bool immediate_attach,
+ const struct iommu_user_data *user_data)
{
+ const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT |
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const struct iommu_ops *ops = dev_iommu_ops(idev->dev);
+ struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_hw_pagetable *hwpt;
int rc;
lockdep_assert_held(&ioas->mutex);
- hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE);
- if (IS_ERR(hwpt))
- return hwpt;
+ if ((flags || user_data) && !ops->domain_alloc_user)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (flags & ~valid_flags)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ hwpt_paging = __iommufd_object_alloc(
+ ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj);
+ if (IS_ERR(hwpt_paging))
+ return ERR_CAST(hwpt_paging);
+ hwpt = &hwpt_paging->common;
- INIT_LIST_HEAD(&hwpt->hwpt_item);
+ INIT_LIST_HEAD(&hwpt_paging->hwpt_item);
/* Pairs with iommufd_hw_pagetable_destroy() */
refcount_inc(&ioas->obj.users);
- hwpt->ioas = ioas;
+ hwpt_paging->ioas = ioas;
+ hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
- hwpt->domain = iommu_domain_alloc(idev->dev->bus);
- if (!hwpt->domain) {
- rc = -ENOMEM;
- goto out_abort;
+ if (ops->domain_alloc_user) {
+ hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL,
+ user_data);
+ if (IS_ERR(hwpt->domain)) {
+ rc = PTR_ERR(hwpt->domain);
+ hwpt->domain = NULL;
+ goto out_abort;
+ }
+ } else {
+ hwpt->domain = iommu_domain_alloc(idev->dev->bus);
+ if (!hwpt->domain) {
+ rc = -ENOMEM;
+ goto out_abort;
+ }
}
/*
@@ -100,9 +149,16 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
* doing any maps. It is an iommu driver bug to report
* IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on
* a new domain.
+ *
+ * The cache coherency mode must be configured here and unchanged later.
+ * Note that a HWPT (non-CC) created for a device (non-CC) can be later
+ * reused by another device (either non-CC or CC). However, A HWPT (CC)
+ * created for a device (CC) cannot be reused by another device (non-CC)
+ * but only devices (CC). Instead user space in this case would need to
+ * allocate a separate HWPT (non-CC).
*/
if (idev->enforce_cache_coherency) {
- rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+ rc = iommufd_hwpt_paging_enforce_cc(hwpt_paging);
if (WARN_ON(rc))
goto out_abort;
}
@@ -119,11 +175,11 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
goto out_abort;
}
- rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain);
+ rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain);
if (rc)
goto out_detach;
- list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
- return hwpt;
+ list_add_tail(&hwpt_paging->hwpt_item, &ioas->hwpt_list);
+ return hwpt_paging;
out_detach:
if (immediate_attach)
@@ -133,32 +189,120 @@ out_abort:
return ERR_PTR(rc);
}
+/**
+ * iommufd_hwpt_nested_alloc() - Get a NESTED iommu_domain for a device
+ * @ictx: iommufd context
+ * @parent: Parent PAGING-type hwpt to associate the domain with
+ * @idev: Device to get an iommu_domain for
+ * @flags: Flags from userspace
+ * @user_data: user_data pointer. Must be valid
+ *
+ * Allocate a new iommu_domain (must be IOMMU_DOMAIN_NESTED) and return it as
+ * a NESTED hw_pagetable. The given parent PAGING-type hwpt must be capable of
+ * being a parent.
+ */
+static struct iommufd_hwpt_nested *
+iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
+ struct iommufd_hwpt_paging *parent,
+ struct iommufd_device *idev, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ const struct iommu_ops *ops = dev_iommu_ops(idev->dev);
+ struct iommufd_hwpt_nested *hwpt_nested;
+ struct iommufd_hw_pagetable *hwpt;
+ int rc;
+
+ if (flags || !user_data->len || !ops->domain_alloc_user)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (parent->auto_domain || !parent->nest_parent)
+ return ERR_PTR(-EINVAL);
+
+ hwpt_nested = __iommufd_object_alloc(
+ ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj);
+ if (IS_ERR(hwpt_nested))
+ return ERR_CAST(hwpt_nested);
+ hwpt = &hwpt_nested->common;
+
+ refcount_inc(&parent->common.obj.users);
+ hwpt_nested->parent = parent;
+
+ hwpt->domain = ops->domain_alloc_user(idev->dev, flags,
+ parent->common.domain, user_data);
+ if (IS_ERR(hwpt->domain)) {
+ rc = PTR_ERR(hwpt->domain);
+ hwpt->domain = NULL;
+ goto out_abort;
+ }
+
+ if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
+ rc = -EINVAL;
+ goto out_abort;
+ }
+ return hwpt_nested;
+
+out_abort:
+ iommufd_object_abort_and_destroy(ictx, &hwpt->obj);
+ return ERR_PTR(rc);
+}
+
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
{
struct iommu_hwpt_alloc *cmd = ucmd->cmd;
+ const struct iommu_user_data user_data = {
+ .type = cmd->data_type,
+ .uptr = u64_to_user_ptr(cmd->data_uptr),
+ .len = cmd->data_len,
+ };
struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_ioas *ioas = NULL;
+ struct iommufd_object *pt_obj;
struct iommufd_device *idev;
- struct iommufd_ioas *ioas;
int rc;
- if (cmd->flags || cmd->__reserved)
+ if (cmd->__reserved)
return -EOPNOTSUPP;
+ if (cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len)
+ return -EINVAL;
idev = iommufd_get_device(ucmd, cmd->dev_id);
if (IS_ERR(idev))
return PTR_ERR(idev);
- ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id);
- if (IS_ERR(ioas)) {
- rc = PTR_ERR(ioas);
+ pt_obj = iommufd_get_object(ucmd->ictx, cmd->pt_id, IOMMUFD_OBJ_ANY);
+ if (IS_ERR(pt_obj)) {
+ rc = -EINVAL;
goto out_put_idev;
}
- mutex_lock(&ioas->mutex);
- hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false);
- if (IS_ERR(hwpt)) {
- rc = PTR_ERR(hwpt);
- goto out_unlock;
+ if (pt_obj->type == IOMMUFD_OBJ_IOAS) {
+ struct iommufd_hwpt_paging *hwpt_paging;
+
+ ioas = container_of(pt_obj, struct iommufd_ioas, obj);
+ mutex_lock(&ioas->mutex);
+ hwpt_paging = iommufd_hwpt_paging_alloc(
+ ucmd->ictx, ioas, idev, cmd->flags, false,
+ user_data.len ? &user_data : NULL);
+ if (IS_ERR(hwpt_paging)) {
+ rc = PTR_ERR(hwpt_paging);
+ goto out_unlock;
+ }
+ hwpt = &hwpt_paging->common;
+ } else if (pt_obj->type == IOMMUFD_OBJ_HWPT_PAGING) {
+ struct iommufd_hwpt_nested *hwpt_nested;
+
+ hwpt_nested = iommufd_hwpt_nested_alloc(
+ ucmd->ictx,
+ container_of(pt_obj, struct iommufd_hwpt_paging,
+ common.obj),
+ idev, cmd->flags, &user_data);
+ if (IS_ERR(hwpt_nested)) {
+ rc = PTR_ERR(hwpt_nested);
+ goto out_unlock;
+ }
+ hwpt = &hwpt_nested->common;
+ } else {
+ rc = -EINVAL;
+ goto out_put_pt;
}
cmd->out_hwpt_id = hwpt->obj.id;
@@ -171,9 +315,59 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
out_hwpt:
iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj);
out_unlock:
- mutex_unlock(&ioas->mutex);
- iommufd_put_object(&ioas->obj);
+ if (ioas)
+ mutex_unlock(&ioas->mutex);
+out_put_pt:
+ iommufd_put_object(pt_obj);
out_put_idev:
iommufd_put_object(&idev->obj);
return rc;
}
+
+int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd;
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_ioas *ioas;
+ int rc = -EOPNOTSUPP;
+ bool enable;
+
+ if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE)
+ return rc;
+
+ hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+ if (IS_ERR(hwpt_paging))
+ return PTR_ERR(hwpt_paging);
+
+ ioas = hwpt_paging->ioas;
+ enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE;
+
+ rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain,
+ enable);
+
+ iommufd_put_object(&hwpt_paging->common.obj);
+ return rc;
+}
+
+int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd;
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_ioas *ioas;
+ int rc = -EOPNOTSUPP;
+
+ if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) ||
+ cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+ if (IS_ERR(hwpt_paging))
+ return PTR_ERR(hwpt_paging);
+
+ ioas = hwpt_paging->ioas;
+ rc = iopt_read_and_clear_dirty_data(
+ &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd);
+
+ iommufd_put_object(&hwpt_paging->common.obj);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 3a598182b761..504ac1b01b2d 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -15,6 +15,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>
+#include <uapi/linux/iommufd.h>
#include "io_pagetable.h"
#include "double_span.h"
@@ -221,6 +222,18 @@ static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
return 0;
}
+static struct iopt_area *iopt_area_alloc(void)
+{
+ struct iopt_area *area;
+
+ area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ if (!area)
+ return NULL;
+ RB_CLEAR_NODE(&area->node.rb);
+ RB_CLEAR_NODE(&area->pages_node.rb);
+ return area;
+}
+
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
struct list_head *pages_list,
unsigned long length, unsigned long *dst_iova,
@@ -231,7 +244,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
int rc = 0;
list_for_each_entry(elm, pages_list, next) {
- elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
+ elm->area = iopt_area_alloc();
if (!elm->area)
return -ENOMEM;
}
@@ -412,6 +425,177 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
return 0;
}
+struct iova_bitmap_fn_arg {
+ unsigned long flags;
+ struct io_pagetable *iopt;
+ struct iommu_domain *domain;
+ struct iommu_dirty_bitmap *dirty;
+};
+
+static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
+ unsigned long iova, size_t length,
+ void *opaque)
+{
+ struct iopt_area *area;
+ struct iopt_area_contig_iter iter;
+ struct iova_bitmap_fn_arg *arg = opaque;
+ struct iommu_domain *domain = arg->domain;
+ struct iommu_dirty_bitmap *dirty = arg->dirty;
+ const struct iommu_dirty_ops *ops = domain->dirty_ops;
+ unsigned long last_iova = iova + length - 1;
+ unsigned long flags = arg->flags;
+ int ret;
+
+ iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
+ unsigned long last = min(last_iova, iopt_area_last_iova(area));
+
+ ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
+ last - iter.cur_iova + 1, flags,
+ dirty);
+ if (ret)
+ return ret;
+ }
+
+ if (!iopt_area_contig_done(&iter))
+ return -EINVAL;
+ return 0;
+}
+
+static int
+iommu_read_and_clear_dirty(struct iommu_domain *domain,
+ struct io_pagetable *iopt, unsigned long flags,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+ const struct iommu_dirty_ops *ops = domain->dirty_ops;
+ struct iommu_iotlb_gather gather;
+ struct iommu_dirty_bitmap dirty;
+ struct iova_bitmap_fn_arg arg;
+ struct iova_bitmap *iter;
+ int ret = 0;
+
+ if (!ops || !ops->read_and_clear_dirty)
+ return -EOPNOTSUPP;
+
+ iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
+ bitmap->page_size,
+ u64_to_user_ptr(bitmap->data));
+ if (IS_ERR(iter))
+ return -ENOMEM;
+
+ iommu_dirty_bitmap_init(&dirty, iter, &gather);
+
+ arg.flags = flags;
+ arg.iopt = iopt;
+ arg.domain = domain;
+ arg.dirty = &dirty;
+ iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
+
+ if (!(flags & IOMMU_DIRTY_NO_CLEAR))
+ iommu_iotlb_sync(domain, &gather);
+
+ iova_bitmap_free(iter);
+
+ return ret;
+}
+
+int iommufd_check_iova_range(struct io_pagetable *iopt,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+ size_t iommu_pgsize = iopt->iova_alignment;
+ u64 last_iova;
+
+ if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
+ return -EOVERFLOW;
+
+ if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
+ return -EOVERFLOW;
+
+ if ((bitmap->iova & (iommu_pgsize - 1)) ||
+ ((last_iova + 1) & (iommu_pgsize - 1)))
+ return -EINVAL;
+
+ if (!bitmap->page_size)
+ return -EINVAL;
+
+ if ((bitmap->iova & (bitmap->page_size - 1)) ||
+ ((last_iova + 1) & (bitmap->page_size - 1)))
+ return -EINVAL;
+
+ return 0;
+}
+
+int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
+ struct iommu_domain *domain,
+ unsigned long flags,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+ int ret;
+
+ ret = iommufd_check_iova_range(iopt, bitmap);
+ if (ret)
+ return ret;
+
+ down_read(&iopt->iova_rwsem);
+ ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
+ up_read(&iopt->iova_rwsem);
+
+ return ret;
+}
+
+static int iopt_clear_dirty_data(struct io_pagetable *iopt,
+ struct iommu_domain *domain)
+{
+ const struct iommu_dirty_ops *ops = domain->dirty_ops;
+ struct iommu_iotlb_gather gather;
+ struct iommu_dirty_bitmap dirty;
+ struct iopt_area *area;
+ int ret = 0;
+
+ lockdep_assert_held_read(&iopt->iova_rwsem);
+
+ iommu_dirty_bitmap_init(&dirty, NULL, &gather);
+
+ for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
+ area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+ if (!area->pages)
+ continue;
+
+ ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
+ iopt_area_length(area), 0,
+ &dirty);
+ if (ret)
+ break;
+ }
+
+ iommu_iotlb_sync(domain, &gather);
+ return ret;
+}
+
+int iopt_set_dirty_tracking(struct io_pagetable *iopt,
+ struct iommu_domain *domain, bool enable)
+{
+ const struct iommu_dirty_ops *ops = domain->dirty_ops;
+ int ret = 0;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ down_read(&iopt->iova_rwsem);
+
+ /* Clear dirty bits from PTEs to ensure a clean snapshot */
+ if (enable) {
+ ret = iopt_clear_dirty_data(iopt, domain);
+ if (ret)
+ goto out_unlock;
+ }
+
+ ret = ops->set_dirty_tracking(domain, enable);
+
+out_unlock:
+ up_read(&iopt->iova_rwsem);
+ return ret;
+}
+
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
unsigned long length, struct list_head *pages_list)
{
@@ -1005,11 +1189,11 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
iopt_area_start_byte(area, new_start) & (alignment - 1))
return -EINVAL;
- lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ lhs = iopt_area_alloc();
if (!lhs)
return -ENOMEM;
- rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ rhs = iopt_area_alloc();
if (!rhs) {
rc = -ENOMEM;
goto err_free_lhs;
@@ -1048,6 +1232,16 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
if (WARN_ON(rc))
goto err_remove_lhs;
+ /*
+ * If the original area has filled a domain, domains_itree has to be
+ * updated.
+ */
+ if (area->storage_domain) {
+ interval_tree_remove(&area->pages_node, &pages->domains_itree);
+ interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
+ interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
+ }
+
lhs->storage_domain = area->storage_domain;
lhs->pages = area->pages;
rhs->storage_domain = area->storage_domain;
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 2c58670011fe..a74cfefffbc6 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -8,6 +8,9 @@
#include <linux/xarray.h>
#include <linux/refcount.h>
#include <linux/uaccess.h>
+#include <linux/iommu.h>
+#include <linux/iova_bitmap.h>
+#include <uapi/linux/iommufd.h>
struct iommu_domain;
struct iommu_group;
@@ -70,6 +73,13 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
unsigned long length, unsigned long *unmapped);
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);
+int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
+ struct iommu_domain *domain,
+ unsigned long flags,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap);
+int iopt_set_dirty_tracking(struct io_pagetable *iopt,
+ struct iommu_domain *domain, bool enable);
+
void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
unsigned long length);
int iopt_table_add_domain(struct io_pagetable *iopt,
@@ -113,7 +123,8 @@ enum iommufd_object_type {
IOMMUFD_OBJ_NONE,
IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
IOMMUFD_OBJ_DEVICE,
- IOMMUFD_OBJ_HW_PAGETABLE,
+ IOMMUFD_OBJ_HWPT_PAGING,
+ IOMMUFD_OBJ_HWPT_NESTED,
IOMMUFD_OBJ_IOAS,
IOMMUFD_OBJ_ACCESS,
#ifdef CONFIG_IOMMUFD_TEST
@@ -171,7 +182,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
size_t size,
enum iommufd_object_type type);
-#define iommufd_object_alloc(ictx, ptr, type) \
+#define __iommufd_object_alloc(ictx, ptr, type, obj) \
container_of(_iommufd_object_alloc( \
ictx, \
sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \
@@ -180,6 +191,9 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
type), \
typeof(*(ptr)), obj)
+#define iommufd_object_alloc(ictx, ptr, type) \
+ __iommufd_object_alloc(ictx, ptr, type, obj)
+
/*
* The IO Address Space (IOAS) pagetable is a virtual page table backed by the
* io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
@@ -222,6 +236,8 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx);
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
+int iommufd_check_iova_range(struct io_pagetable *iopt,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap);
/*
* A HW pagetable is called an iommu_domain inside the kernel. This user object
@@ -231,35 +247,75 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
*/
struct iommufd_hw_pagetable {
struct iommufd_object obj;
- struct iommufd_ioas *ioas;
struct iommu_domain *domain;
+};
+
+struct iommufd_hwpt_paging {
+ struct iommufd_hw_pagetable common;
+ struct iommufd_ioas *ioas;
bool auto_domain : 1;
bool enforce_cache_coherency : 1;
bool msi_cookie : 1;
+ bool nest_parent : 1;
/* Head at iommufd_ioas::hwpt_list */
struct list_head hwpt_item;
};
-struct iommufd_hw_pagetable *
-iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
- struct iommufd_device *idev, bool immediate_attach);
-int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt);
+struct iommufd_hwpt_nested {
+ struct iommufd_hw_pagetable common;
+ struct iommufd_hwpt_paging *parent;
+};
+
+static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
+{
+ return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING;
+}
+
+static inline struct iommufd_hwpt_paging *
+to_hwpt_paging(struct iommufd_hw_pagetable *hwpt)
+{
+ return container_of(hwpt, struct iommufd_hwpt_paging, common);
+}
+
+static inline struct iommufd_hwpt_paging *
+iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_HWPT_PAGING),
+ struct iommufd_hwpt_paging, common.obj);
+}
+int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd);
+int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd);
+
+struct iommufd_hwpt_paging *
+iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+ struct iommufd_device *idev, u32 flags,
+ bool immediate_attach,
+ const struct iommu_user_data *user_data);
int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
struct iommufd_device *idev);
struct iommufd_hw_pagetable *
iommufd_hw_pagetable_detach(struct iommufd_device *idev);
-void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
-void iommufd_hw_pagetable_abort(struct iommufd_object *obj);
+void iommufd_hwpt_paging_destroy(struct iommufd_object *obj);
+void iommufd_hwpt_paging_abort(struct iommufd_object *obj);
+void iommufd_hwpt_nested_destroy(struct iommufd_object *obj);
+void iommufd_hwpt_nested_abort(struct iommufd_object *obj);
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd);
static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
struct iommufd_hw_pagetable *hwpt)
{
- lockdep_assert_not_held(&hwpt->ioas->mutex);
- if (hwpt->auto_domain)
- iommufd_object_deref_user(ictx, &hwpt->obj);
- else
- refcount_dec(&hwpt->obj.users);
+ if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) {
+ struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt);
+
+ lockdep_assert_not_held(&hwpt_paging->ioas->mutex);
+
+ if (hwpt_paging->auto_domain) {
+ iommufd_object_deref_user(ictx, &hwpt->obj);
+ return;
+ }
+ }
+ refcount_dec(&hwpt->obj.users);
}
struct iommufd_group {
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index 3f3644375bf1..7910fbe1962d 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -19,6 +19,8 @@ enum {
IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT,
IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE,
IOMMU_TEST_OP_ACCESS_REPLACE_IOAS,
+ IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS,
+ IOMMU_TEST_OP_DIRTY,
};
enum {
@@ -40,6 +42,15 @@ enum {
MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0,
};
+enum {
+ MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0,
+};
+
+enum {
+ MOCK_NESTED_DOMAIN_IOTLB_ID_MAX = 3,
+ MOCK_NESTED_DOMAIN_IOTLB_NUM = 4,
+};
+
struct iommu_test_cmd {
__u32 size;
__u32 op;
@@ -57,6 +68,13 @@ struct iommu_test_cmd {
__u32 out_idev_id;
} mock_domain;
struct {
+ __u32 out_stdev_id;
+ __u32 out_hwpt_id;
+ __u32 out_idev_id;
+ /* Expand mock_domain to set mock device flags */
+ __u32 dev_flags;
+ } mock_domain_flags;
+ struct {
__u32 pt_id;
} mock_domain_replace;
struct {
@@ -95,6 +113,14 @@ struct iommu_test_cmd {
struct {
__u32 ioas_id;
} access_replace_ioas;
+ struct {
+ __u32 flags;
+ __aligned_u64 iova;
+ __aligned_u64 length;
+ __aligned_u64 page_size;
+ __aligned_u64 uptr;
+ __aligned_u64 out_nr_dirty;
+ } dirty;
};
__u32 last;
};
@@ -109,4 +135,17 @@ struct iommu_test_hw_info {
__u32 test_reg;
};
+/* Should not be equal to any defined value in enum iommu_hwpt_data_type */
+#define IOMMU_HWPT_DATA_SELFTEST 0xdead
+#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef
+
+/**
+ * struct iommu_hwpt_selftest
+ *
+ * @iotlb: default mock iotlb value, IOMMU_TEST_IOTLB_DEFAULT
+ */
+struct iommu_hwpt_selftest {
+ __u32 iotlb;
+};
+
#endif
diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c
new file mode 100644
index 000000000000..0a92c9eeaf7f
--- /dev/null
+++ b/drivers/iommu/iommufd/iova_bitmap.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates.
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+#include <linux/iova_bitmap.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
+
+/*
+ * struct iova_bitmap_map - A bitmap representing an IOVA range
+ *
+ * Main data structure for tracking mapped user pages of bitmap data.
+ *
+ * For example, for something recording dirty IOVAs, it will be provided a
+ * struct iova_bitmap structure, as a general structure for iterating the
+ * total IOVA range. The struct iova_bitmap_map, though, represents the
+ * subset of said IOVA space that is pinned by its parent structure (struct
+ * iova_bitmap).
+ *
+ * The user does not need to exact location of the bits in the bitmap.
+ * From user perspective the only API available is iova_bitmap_set() which
+ * records the IOVA *range* in the bitmap by setting the corresponding
+ * bits.
+ *
+ * The bitmap is an array of u64 whereas each bit represents an IOVA of
+ * range of (1 << pgshift). Thus formula for the bitmap data to be set is:
+ *
+ * data[(iova / page_size) / 64] & (1ULL << (iova % 64))
+ */
+struct iova_bitmap_map {
+ /* base IOVA representing bit 0 of the first page */
+ unsigned long iova;
+
+ /* page size order that each bit granules to */
+ unsigned long pgshift;
+
+ /* page offset of the first user page pinned */
+ unsigned long pgoff;
+
+ /* number of pages pinned */
+ unsigned long npages;
+
+ /* pinned pages representing the bitmap data */
+ struct page **pages;
+};
+
+/*
+ * struct iova_bitmap - The IOVA bitmap object
+ *
+ * Main data structure for iterating over the bitmap data.
+ *
+ * Abstracts the pinning work and iterates in IOVA ranges.
+ * It uses a windowing scheme and pins the bitmap in relatively
+ * big ranges e.g.
+ *
+ * The bitmap object uses one base page to store all the pinned pages
+ * pointers related to the bitmap. For sizeof(struct page*) == 8 it stores
+ * 512 struct page pointers which, if the base page size is 4K, it means
+ * 2M of bitmap data is pinned at a time. If the iova_bitmap page size is
+ * also 4K then the range window to iterate is 64G.
+ *
+ * For example iterating on a total IOVA range of 4G..128G, it will walk
+ * through this set of ranges:
+ *
+ * 4G - 68G-1 (64G)
+ * 68G - 128G-1 (64G)
+ *
+ * An example of the APIs on how to use/iterate over the IOVA bitmap:
+ *
+ * bitmap = iova_bitmap_alloc(iova, length, page_size, data);
+ * if (IS_ERR(bitmap))
+ * return PTR_ERR(bitmap);
+ *
+ * ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn);
+ *
+ * iova_bitmap_free(bitmap);
+ *
+ * Each iteration of the @dirty_reporter_fn is called with a unique @iova
+ * and @length argument, indicating the current range available through the
+ * iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty
+ * areas (@iova_length) within that provided range, as following:
+ *
+ * iova_bitmap_set(bitmap, iova, iova_length);
+ *
+ * The internals of the object uses an index @mapped_base_index that indexes
+ * which u64 word of the bitmap is mapped, up to @mapped_total_index.
+ * Those keep being incremented until @mapped_total_index is reached while
+ * mapping up to PAGE_SIZE / sizeof(struct page*) maximum of pages.
+ *
+ * The IOVA bitmap is usually located on what tracks DMA mapped ranges or
+ * some form of IOVA range tracking that co-relates to the user passed
+ * bitmap.
+ */
+struct iova_bitmap {
+ /* IOVA range representing the currently mapped bitmap data */
+ struct iova_bitmap_map mapped;
+
+ /* userspace address of the bitmap */
+ u64 __user *bitmap;
+
+ /* u64 index that @mapped points to */
+ unsigned long mapped_base_index;
+
+ /* how many u64 can we walk in total */
+ unsigned long mapped_total_index;
+
+ /* base IOVA of the whole bitmap */
+ unsigned long iova;
+
+ /* length of the IOVA range for the whole bitmap */
+ size_t length;
+};
+
+/*
+ * Converts a relative IOVA to a bitmap index.
+ * This function provides the index into the u64 array (bitmap::bitmap)
+ * for a given IOVA offset.
+ * Relative IOVA means relative to the bitmap::mapped base IOVA
+ * (stored in mapped::iova). All computations in this file are done using
+ * relative IOVAs and thus avoid an extra subtraction against mapped::iova.
+ * The user API iova_bitmap_set() always uses a regular absolute IOVAs.
+ */
+static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap,
+ unsigned long iova)
+{
+ unsigned long pgsize = 1 << bitmap->mapped.pgshift;
+
+ return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize);
+}
+
+/*
+ * Converts a bitmap index to a *relative* IOVA.
+ */
+static unsigned long iova_bitmap_index_to_offset(struct iova_bitmap *bitmap,
+ unsigned long index)
+{
+ unsigned long pgshift = bitmap->mapped.pgshift;
+
+ return (index * BITS_PER_TYPE(*bitmap->bitmap)) << pgshift;
+}
+
+/*
+ * Returns the base IOVA of the mapped range.
+ */
+static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap)
+{
+ unsigned long skip = bitmap->mapped_base_index;
+
+ return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip);
+}
+
+/*
+ * Pins the bitmap user pages for the current range window.
+ * This is internal to IOVA bitmap and called when advancing the
+ * index (@mapped_base_index) or allocating the bitmap.
+ */
+static int iova_bitmap_get(struct iova_bitmap *bitmap)
+{
+ struct iova_bitmap_map *mapped = &bitmap->mapped;
+ unsigned long npages;
+ u64 __user *addr;
+ long ret;
+
+ /*
+ * @mapped_base_index is the index of the currently mapped u64 words
+ * that we have access. Anything before @mapped_base_index is not
+ * mapped. The range @mapped_base_index .. @mapped_total_index-1 is
+ * mapped but capped at a maximum number of pages.
+ */
+ npages = DIV_ROUND_UP((bitmap->mapped_total_index -
+ bitmap->mapped_base_index) *
+ sizeof(*bitmap->bitmap), PAGE_SIZE);
+
+ /*
+ * We always cap at max number of 'struct page' a base page can fit.
+ * This is, for example, on x86 means 2M of bitmap data max.
+ */
+ npages = min(npages, PAGE_SIZE / sizeof(struct page *));
+
+ /*
+ * Bitmap address to be pinned is calculated via pointer arithmetic
+ * with bitmap u64 word index.
+ */
+ addr = bitmap->bitmap + bitmap->mapped_base_index;
+
+ ret = pin_user_pages_fast((unsigned long)addr, npages,
+ FOLL_WRITE, mapped->pages);
+ if (ret <= 0)
+ return -EFAULT;
+
+ mapped->npages = (unsigned long)ret;
+ /* Base IOVA where @pages point to i.e. bit 0 of the first page */
+ mapped->iova = iova_bitmap_mapped_iova(bitmap);
+
+ /*
+ * offset of the page where pinned pages bit 0 is located.
+ * This handles the case where the bitmap is not PAGE_SIZE
+ * aligned.
+ */
+ mapped->pgoff = offset_in_page(addr);
+ return 0;
+}
+
+/*
+ * Unpins the bitmap user pages and clears @npages
+ * (un)pinning is abstracted from API user and it's done when advancing
+ * the index or freeing the bitmap.
+ */
+static void iova_bitmap_put(struct iova_bitmap *bitmap)
+{
+ struct iova_bitmap_map *mapped = &bitmap->mapped;
+
+ if (mapped->npages) {
+ unpin_user_pages(mapped->pages, mapped->npages);
+ mapped->npages = 0;
+ }
+}
+
+/**
+ * iova_bitmap_alloc() - Allocates an IOVA bitmap object
+ * @iova: Start address of the IOVA range
+ * @length: Length of the IOVA range
+ * @page_size: Page size of the IOVA bitmap. It defines what each bit
+ * granularity represents
+ * @data: Userspace address of the bitmap
+ *
+ * Allocates an IOVA object and initializes all its fields including the
+ * first user pages of @data.
+ *
+ * Return: A pointer to a newly allocated struct iova_bitmap
+ * or ERR_PTR() on error.
+ */
+struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
+ unsigned long page_size, u64 __user *data)
+{
+ struct iova_bitmap_map *mapped;
+ struct iova_bitmap *bitmap;
+ int rc;
+
+ bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
+ if (!bitmap)
+ return ERR_PTR(-ENOMEM);
+
+ mapped = &bitmap->mapped;
+ mapped->pgshift = __ffs(page_size);
+ bitmap->bitmap = data;
+ bitmap->mapped_total_index =
+ iova_bitmap_offset_to_index(bitmap, length - 1) + 1;
+ bitmap->iova = iova;
+ bitmap->length = length;
+ mapped->iova = iova;
+ mapped->pages = (struct page **)__get_free_page(GFP_KERNEL);
+ if (!mapped->pages) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ rc = iova_bitmap_get(bitmap);
+ if (rc)
+ goto err;
+ return bitmap;
+
+err:
+ iova_bitmap_free(bitmap);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD);
+
+/**
+ * iova_bitmap_free() - Frees an IOVA bitmap object
+ * @bitmap: IOVA bitmap to free
+ *
+ * It unpins and releases pages array memory and clears any leftover
+ * state.
+ */
+void iova_bitmap_free(struct iova_bitmap *bitmap)
+{
+ struct iova_bitmap_map *mapped = &bitmap->mapped;
+
+ iova_bitmap_put(bitmap);
+
+ if (mapped->pages) {
+ free_page((unsigned long)mapped->pages);
+ mapped->pages = NULL;
+ }
+
+ kfree(bitmap);
+}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD);
+
+/*
+ * Returns the remaining bitmap indexes from mapped_total_index to process for
+ * the currently pinned bitmap pages.
+ */
+static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap)
+{
+ unsigned long remaining, bytes;
+
+ bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff;
+
+ remaining = bitmap->mapped_total_index - bitmap->mapped_base_index;
+ remaining = min_t(unsigned long, remaining,
+ bytes / sizeof(*bitmap->bitmap));
+
+ return remaining;
+}
+
+/*
+ * Returns the length of the mapped IOVA range.
+ */
+static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap)
+{
+ unsigned long max_iova = bitmap->iova + bitmap->length - 1;
+ unsigned long iova = iova_bitmap_mapped_iova(bitmap);
+ unsigned long remaining;
+
+ /*
+ * iova_bitmap_mapped_remaining() returns a number of indexes which
+ * when converted to IOVA gives us a max length that the bitmap
+ * pinned data can cover. Afterwards, that is capped to
+ * only cover the IOVA range in @bitmap::iova .. @bitmap::length.
+ */
+ remaining = iova_bitmap_index_to_offset(bitmap,
+ iova_bitmap_mapped_remaining(bitmap));
+
+ if (iova + remaining - 1 > max_iova)
+ remaining -= ((iova + remaining - 1) - max_iova);
+
+ return remaining;
+}
+
+/*
+ * Returns true if there's not more data to iterate.
+ */
+static bool iova_bitmap_done(struct iova_bitmap *bitmap)
+{
+ return bitmap->mapped_base_index >= bitmap->mapped_total_index;
+}
+
+/*
+ * Advances to the next range, releases the current pinned
+ * pages and pins the next set of bitmap pages.
+ * Returns 0 on success or otherwise errno.
+ */
+static int iova_bitmap_advance(struct iova_bitmap *bitmap)
+{
+ unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1;
+ unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1;
+
+ bitmap->mapped_base_index += count;
+
+ iova_bitmap_put(bitmap);
+ if (iova_bitmap_done(bitmap))
+ return 0;
+
+ /* When advancing the index we pin the next set of bitmap pages */
+ return iova_bitmap_get(bitmap);
+}
+
+/**
+ * iova_bitmap_for_each() - Iterates over the bitmap
+ * @bitmap: IOVA bitmap to iterate
+ * @opaque: Additional argument to pass to the callback
+ * @fn: Function that gets called for each IOVA range
+ *
+ * Helper function to iterate over bitmap data representing a portion of IOVA
+ * space. It hides the complexity of iterating bitmaps and translating the
+ * mapped bitmap user pages into IOVA ranges to process.
+ *
+ * Return: 0 on success, and an error on failure either upon
+ * iteration or when the callback returns an error.
+ */
+int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
+ iova_bitmap_fn_t fn)
+{
+ int ret = 0;
+
+ for (; !iova_bitmap_done(bitmap) && !ret;
+ ret = iova_bitmap_advance(bitmap)) {
+ ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap),
+ iova_bitmap_mapped_length(bitmap), opaque);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD);
+
+/**
+ * iova_bitmap_set() - Records an IOVA range in bitmap
+ * @bitmap: IOVA bitmap
+ * @iova: IOVA to start
+ * @length: IOVA range length
+ *
+ * Set the bits corresponding to the range [iova .. iova+length-1] in
+ * the user bitmap.
+ *
+ */
+void iova_bitmap_set(struct iova_bitmap *bitmap,
+ unsigned long iova, size_t length)
+{
+ struct iova_bitmap_map *mapped = &bitmap->mapped;
+ unsigned long cur_bit = ((iova - mapped->iova) >>
+ mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
+ unsigned long last_bit = (((iova + length - 1) - mapped->iova) >>
+ mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
+
+ do {
+ unsigned int page_idx = cur_bit / BITS_PER_PAGE;
+ unsigned int offset = cur_bit % BITS_PER_PAGE;
+ unsigned int nbits = min(BITS_PER_PAGE - offset,
+ last_bit - cur_bit + 1);
+ void *kaddr;
+
+ kaddr = kmap_local_page(mapped->pages[page_idx]);
+ bitmap_set(kaddr, offset, nbits);
+ kunmap_local(kaddr);
+ cur_bit += nbits;
+ } while (cur_bit <= last_bit);
+}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index e71523cbd0de..45b9d40773b1 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -307,6 +307,8 @@ union ucmd_buffer {
struct iommu_destroy destroy;
struct iommu_hw_info info;
struct iommu_hwpt_alloc hwpt;
+ struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
+ struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
struct iommu_ioas_alloc alloc;
struct iommu_ioas_allow_iovas allow_iovas;
struct iommu_ioas_copy ioas_copy;
@@ -342,6 +344,10 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
__reserved),
IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
__reserved),
+ IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
+ struct iommu_hwpt_get_dirty_bitmap, data),
+ IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
+ struct iommu_hwpt_set_dirty_tracking, __reserved),
IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
struct iommu_ioas_alloc, out_ioas_id),
IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
@@ -482,9 +488,13 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_IOAS] = {
.destroy = iommufd_ioas_destroy,
},
- [IOMMUFD_OBJ_HW_PAGETABLE] = {
- .destroy = iommufd_hw_pagetable_destroy,
- .abort = iommufd_hw_pagetable_abort,
+ [IOMMUFD_OBJ_HWPT_PAGING] = {
+ .destroy = iommufd_hwpt_paging_destroy,
+ .abort = iommufd_hwpt_paging_abort,
+ },
+ [IOMMUFD_OBJ_HWPT_NESTED] = {
+ .destroy = iommufd_hwpt_nested_destroy,
+ .abort = iommufd_hwpt_nested_abort,
},
#ifdef CONFIG_IOMMUFD_TEST
[IOMMUFD_OBJ_SELFTEST] = {
@@ -552,5 +562,6 @@ MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
+MODULE_IMPORT_NS(IOMMUFD);
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 8d9aa297c117..528f356238b3 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -1507,6 +1507,8 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
area, domain, iopt_area_index(area),
iopt_area_last_index(area));
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb));
interval_tree_remove(&area->pages_node, &pages->domains_itree);
iopt_area_unfill_domain(area, pages, area->storage_domain);
area->storage_domain = NULL;
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 56506d5753f1..d43a87737c1e 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -20,10 +20,13 @@
static DECLARE_FAULT_ATTR(fail_iommufd);
static struct dentry *dbgfs_root;
static struct platform_device *selftest_iommu_dev;
+static const struct iommu_ops mock_ops;
+static struct iommu_domain_ops domain_nested_ops;
size_t iommufd_test_memory_limit = 65536;
enum {
+ MOCK_DIRTY_TRACK = 1,
MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
/*
@@ -36,6 +39,7 @@ enum {
_MOCK_PFN_START = MOCK_PFN_MASK + 1,
MOCK_PFN_START_IOVA = _MOCK_PFN_START,
MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
+ MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1,
};
/*
@@ -86,16 +90,24 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
}
struct mock_iommu_domain {
+ unsigned long flags;
struct iommu_domain domain;
struct xarray pfns;
};
+struct mock_iommu_domain_nested {
+ struct iommu_domain domain;
+ struct mock_iommu_domain *parent;
+ u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM];
+};
+
enum selftest_obj_type {
TYPE_IDEV,
};
struct mock_dev {
struct device dev;
+ unsigned long flags;
};
struct selftest_obj {
@@ -118,6 +130,11 @@ static void mock_domain_blocking_free(struct iommu_domain *domain)
static int mock_domain_nop_attach(struct iommu_domain *domain,
struct device *dev)
{
+ struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+
+ if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY))
+ return -EINVAL;
+
return 0;
}
@@ -146,15 +163,70 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
return info;
}
-static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
+static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable)
{
- struct mock_iommu_domain *mock;
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ unsigned long flags = mock->flags;
- if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED)
- return &mock_blocking_domain;
+ if (enable && !domain->dirty_ops)
+ return -EINVAL;
- if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED)
- return NULL;
+ /* No change? */
+ if (!(enable ^ !!(flags & MOCK_DIRTY_TRACK)))
+ return 0;
+
+ flags = (enable ? flags | MOCK_DIRTY_TRACK : flags & ~MOCK_DIRTY_TRACK);
+
+ mock->flags = flags;
+ return 0;
+}
+
+static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ unsigned long i, max = size / MOCK_IO_PAGE_SIZE;
+ void *ent, *old;
+
+ if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap)
+ return -EINVAL;
+
+ for (i = 0; i < max; i++) {
+ unsigned long cur = iova + i * MOCK_IO_PAGE_SIZE;
+
+ ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE);
+ if (ent && (xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) {
+ /* Clear dirty */
+ if (!(flags & IOMMU_DIRTY_NO_CLEAR)) {
+ unsigned long val;
+
+ val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA;
+ old = xa_store(&mock->pfns,
+ cur / MOCK_IO_PAGE_SIZE,
+ xa_mk_value(val), GFP_KERNEL);
+ WARN_ON_ONCE(ent != old);
+ }
+ iommu_dirty_bitmap_record(dirty, cur,
+ MOCK_IO_PAGE_SIZE);
+ }
+ }
+
+ return 0;
+}
+
+const struct iommu_dirty_ops dirty_ops = {
+ .set_dirty_tracking = mock_domain_set_dirty_tracking,
+ .read_and_clear_dirty = mock_domain_read_and_clear_dirty,
+};
+
+static struct iommu_domain *mock_domain_alloc_paging(struct device *dev)
+{
+ struct mock_iommu_domain *mock;
mock = kzalloc(sizeof(*mock), GFP_KERNEL);
if (!mock)
@@ -162,10 +234,87 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
+ mock->domain.ops = mock_ops.default_domain_ops;
+ mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
xa_init(&mock->pfns);
return &mock->domain;
}
+static struct iommu_domain *
+__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent,
+ const struct iommu_hwpt_selftest *user_cfg)
+{
+ struct mock_iommu_domain_nested *mock_nested;
+ int i;
+
+ mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL);
+ if (!mock_nested)
+ return ERR_PTR(-ENOMEM);
+ mock_nested->parent = mock_parent;
+ mock_nested->domain.ops = &domain_nested_ops;
+ mock_nested->domain.type = IOMMU_DOMAIN_NESTED;
+ for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++)
+ mock_nested->iotlb[i] = user_cfg->iotlb;
+ return &mock_nested->domain;
+}
+
+static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
+{
+ if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED)
+ return &mock_blocking_domain;
+ if (iommu_domain_type == IOMMU_DOMAIN_UNMANAGED)
+ return mock_domain_alloc_paging(NULL);
+ return NULL;
+}
+
+static struct iommu_domain *
+mock_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ struct mock_iommu_domain *mock_parent;
+ struct iommu_hwpt_selftest user_cfg;
+ int rc;
+
+ /* must be mock_domain */
+ if (!parent) {
+ struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
+ struct iommu_domain *domain;
+
+ if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT |
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (user_data || (has_dirty_flag && no_dirty_ops))
+ return ERR_PTR(-EOPNOTSUPP);
+ domain = mock_domain_alloc_paging(NULL);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+ if (has_dirty_flag)
+ container_of(domain, struct mock_iommu_domain, domain)
+ ->domain.dirty_ops = &dirty_ops;
+ return domain;
+ }
+
+ /* must be mock_domain_nested */
+ if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (!parent || parent->ops != mock_ops.default_domain_ops)
+ return ERR_PTR(-EINVAL);
+
+ mock_parent = container_of(parent, struct mock_iommu_domain, domain);
+ if (!mock_parent)
+ return ERR_PTR(-EINVAL);
+
+ rc = iommu_copy_struct_from_user(&user_cfg, user_data,
+ IOMMU_HWPT_DATA_SELFTEST, iotlb);
+ if (rc)
+ return ERR_PTR(rc);
+
+ return __mock_domain_alloc_nested(mock_parent, &user_cfg);
+}
+
static void mock_domain_free(struct iommu_domain *domain)
{
struct mock_iommu_domain *mock =
@@ -243,7 +392,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- WARN_ON(!ent);
+
/*
* iommufd generates unmaps that must be a strict
* superset of the map's performend So every starting
@@ -253,13 +402,13 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
* passed to map_pages
*/
if (first) {
- WARN_ON(!(xa_to_value(ent) &
- MOCK_PFN_START_IOVA));
+ WARN_ON(ent && !(xa_to_value(ent) &
+ MOCK_PFN_START_IOVA));
first = false;
}
if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
- WARN_ON(!(xa_to_value(ent) &
- MOCK_PFN_LAST_IOVA));
+ WARN_ON(ent && !(xa_to_value(ent) &
+ MOCK_PFN_LAST_IOVA));
iova += MOCK_IO_PAGE_SIZE;
ret += MOCK_IO_PAGE_SIZE;
@@ -283,7 +432,18 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
{
- return cap == IOMMU_CAP_CACHE_COHERENCY;
+ struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ return true;
+ case IOMMU_CAP_DIRTY_TRACKING:
+ return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY);
+ default:
+ break;
+ }
+
+ return false;
}
static void mock_domain_set_plaform_dma_ops(struct device *dev)
@@ -307,6 +467,7 @@ static const struct iommu_ops mock_ops = {
.pgsize_bitmap = MOCK_IO_PAGE_SIZE,
.hw_info = mock_domain_hw_info,
.domain_alloc = mock_domain_alloc,
+ .domain_alloc_user = mock_domain_alloc_user,
.capable = mock_domain_capable,
.set_platform_dma_ops = mock_domain_set_plaform_dma_ops,
.device_group = generic_device_group,
@@ -321,19 +482,41 @@ static const struct iommu_ops mock_ops = {
},
};
+static void mock_domain_free_nested(struct iommu_domain *domain)
+{
+ struct mock_iommu_domain_nested *mock_nested =
+ container_of(domain, struct mock_iommu_domain_nested, domain);
+
+ kfree(mock_nested);
+}
+
+static struct iommu_domain_ops domain_nested_ops = {
+ .free = mock_domain_free_nested,
+ .attach_dev = mock_domain_nop_attach,
+};
+
static inline struct iommufd_hw_pagetable *
-get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
- struct mock_iommu_domain **mock)
+__get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, u32 hwpt_type)
{
- struct iommufd_hw_pagetable *hwpt;
struct iommufd_object *obj;
- obj = iommufd_get_object(ucmd->ictx, mockpt_id,
- IOMMUFD_OBJ_HW_PAGETABLE);
+ obj = iommufd_get_object(ucmd->ictx, mockpt_id, hwpt_type);
if (IS_ERR(obj))
return ERR_CAST(obj);
- hwpt = container_of(obj, struct iommufd_hw_pagetable, obj);
- if (hwpt->domain->ops != mock_ops.default_domain_ops) {
+ return container_of(obj, struct iommufd_hw_pagetable, obj);
+}
+
+static inline struct iommufd_hw_pagetable *
+get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+ struct mock_iommu_domain **mock)
+{
+ struct iommufd_hw_pagetable *hwpt;
+
+ hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_PAGING);
+ if (IS_ERR(hwpt))
+ return hwpt;
+ if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED ||
+ hwpt->domain->ops != mock_ops.default_domain_ops) {
iommufd_put_object(&hwpt->obj);
return ERR_PTR(-EINVAL);
}
@@ -341,6 +524,25 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
return hwpt;
}
+static inline struct iommufd_hw_pagetable *
+get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+ struct mock_iommu_domain_nested **mock_nested)
+{
+ struct iommufd_hw_pagetable *hwpt;
+
+ hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_NESTED);
+ if (IS_ERR(hwpt))
+ return hwpt;
+ if (hwpt->domain->type != IOMMU_DOMAIN_NESTED ||
+ hwpt->domain->ops != &domain_nested_ops) {
+ iommufd_put_object(&hwpt->obj);
+ return ERR_PTR(-EINVAL);
+ }
+ *mock_nested = container_of(hwpt->domain,
+ struct mock_iommu_domain_nested, domain);
+ return hwpt;
+}
+
struct mock_bus_type {
struct bus_type bus;
struct notifier_block nb;
@@ -362,16 +564,20 @@ static void mock_dev_release(struct device *dev)
kfree(mdev);
}
-static struct mock_dev *mock_dev_create(void)
+static struct mock_dev *mock_dev_create(unsigned long dev_flags)
{
struct mock_dev *mdev;
int rc;
+ if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY))
+ return ERR_PTR(-EINVAL);
+
mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
if (!mdev)
return ERR_PTR(-ENOMEM);
device_initialize(&mdev->dev);
+ mdev->flags = dev_flags;
mdev->dev.release = mock_dev_release;
mdev->dev.bus = &iommufd_mock_bus_type.bus;
@@ -407,6 +613,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
struct iommufd_device *idev;
struct selftest_obj *sobj;
u32 pt_id = cmd->id;
+ u32 dev_flags = 0;
u32 idev_id;
int rc;
@@ -417,7 +624,10 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
sobj->idev.ictx = ucmd->ictx;
sobj->type = TYPE_IDEV;
- sobj->idev.mock_dev = mock_dev_create();
+ if (cmd->op == IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS)
+ dev_flags = cmd->mock_domain_flags.dev_flags;
+
+ sobj->idev.mock_dev = mock_dev_create(dev_flags);
if (IS_ERR(sobj->idev.mock_dev)) {
rc = PTR_ERR(sobj->idev.mock_dev);
goto out_sobj;
@@ -977,6 +1187,73 @@ static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE);
static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH ==
__IOMMUFD_ACCESS_RW_SLOW_PATH);
+static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
+ unsigned long iova, size_t length,
+ unsigned long page_size, void __user *uptr,
+ u32 flags)
+{
+ unsigned long bitmap_size, i, max;
+ struct iommu_test_cmd *cmd = ucmd->cmd;
+ struct iommufd_hw_pagetable *hwpt;
+ struct mock_iommu_domain *mock;
+ int rc, count = 0;
+ void *tmp;
+
+ if (!page_size || !length || iova % page_size || length % page_size ||
+ !uptr)
+ return -EINVAL;
+
+ hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
+ if (IS_ERR(hwpt))
+ return PTR_ERR(hwpt);
+
+ if (!(mock->flags & MOCK_DIRTY_TRACK)) {
+ rc = -EINVAL;
+ goto out_put;
+ }
+
+ max = length / page_size;
+ bitmap_size = max / BITS_PER_BYTE;
+
+ tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT);
+ if (!tmp) {
+ rc = -ENOMEM;
+ goto out_put;
+ }
+
+ if (copy_from_user(tmp, uptr, bitmap_size)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+
+ for (i = 0; i < max; i++) {
+ unsigned long cur = iova + i * page_size;
+ void *ent, *old;
+
+ if (!test_bit(i, (unsigned long *)tmp))
+ continue;
+
+ ent = xa_load(&mock->pfns, cur / page_size);
+ if (ent) {
+ unsigned long val;
+
+ val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA;
+ old = xa_store(&mock->pfns, cur / page_size,
+ xa_mk_value(val), GFP_KERNEL);
+ WARN_ON_ONCE(ent != old);
+ count++;
+ }
+ }
+
+ cmd->dirty.out_nr_dirty = count;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_free:
+ kvfree(tmp);
+out_put:
+ iommufd_put_object(&hwpt->obj);
+ return rc;
+}
+
void iommufd_selftest_destroy(struct iommufd_object *obj)
{
struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj);
@@ -1000,6 +1277,7 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
cmd->add_reserved.start,
cmd->add_reserved.length);
case IOMMU_TEST_OP_MOCK_DOMAIN:
+ case IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS:
return iommufd_test_mock_domain(ucmd, cmd);
case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE:
return iommufd_test_mock_domain_replace(
@@ -1041,6 +1319,12 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
return -EINVAL;
iommufd_test_memory_limit = cmd->memory_limit.limit;
return 0;
+ case IOMMU_TEST_OP_DIRTY:
+ return iommufd_test_dirty(ucmd, cmd->id, cmd->dirty.iova,
+ cmd->dirty.length,
+ cmd->dirty.page_size,
+ u64_to_user_ptr(cmd->dirty.uptr),
+ cmd->dirty.flags);
default:
return -EOPNOTSUPP;
}
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index 6c810bf80f99..538fbf76354d 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -255,7 +255,7 @@ err_put:
static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
- struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_ioas *ioas;
int rc = 1;
@@ -264,8 +264,8 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
return PTR_ERR(ioas);
mutex_lock(&ioas->mutex);
- list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
- if (!hwpt->enforce_cache_coherency) {
+ list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt_paging->enforce_cache_coherency) {
rc = 0;
break;
}