author     Jason Gunthorpe <jgg@nvidia.com>    2022-11-29 23:29:32 +0300
committer  Jason Gunthorpe <jgg@nvidia.com>    2022-12-01 03:16:49 +0300
commit     8d160cd4d5066f864ec0f2c981470e55ac03ac27 (patch)
tree       f6ece76b34c49b2337d877b609f5dc1e02a8f452 /drivers/iommu/iommufd/io_pagetable.h
parent     f394576eb11dbcd3a740fa41e577b97f0720d26e (diff)
download   linux-8d160cd4d5066f864ec0f2c981470e55ac03ac27.tar.xz
iommufd: Algorithms for PFN storage
The iopt_pages represents a logical linear list of full PFNs held in different storage tiers. Each area points to a slice of exactly one iopt_pages, and each iopt_pages can have multiple areas and accesses.

The three storage tiers are managed to meet these objectives:

 - If no iommu_domain or in-kernel access exists then minimal memory
   should be consumed by iommufd
 - If a page has been pinned then an iopt_pages will not pin it again
 - If an in-kernel access exists then the xarray must provide the backing
   storage to avoid allocations on domain removals
 - Otherwise any iommu_domain will be used for storage

In a common configuration with only an iommu_domain the iopt_pages does not allocate significant memory itself.

The external interface for pages has several logical operations:

  iopt_area_fill_domain() will load the PFNs from storage into a single
  domain. This is used when attaching a new domain to an existing IOAS.

  iopt_area_fill_domains() will load the PFNs from storage into multiple
  domains. This is used when creating a new IOVA map in an existing IOAS.

  iopt_pages_add_access() creates an iopt_pages_access that tracks an
  in-kernel access of PFNs. This is some external driver that might be
  accessing the IOVA using the CPU, or programming PFNs with the DMA API,
  i.e. a VFIO mdev.

  iopt_pages_rw_access() directly performs a memcpy on the PFNs, without
  the overhead of iopt_pages_add_access().

  iopt_pages_fill_xarray() will load PFNs into the xarray and return a
  'struct page *' array. It is used by iopt_pages_accesses to extract PFNs
  for in-kernel use. iopt_pages_fill_from_xarray() is a fast path when it
  is known the xarray is already filled.

As an iopt_pages can be referred to in slices by many areas and accesses, it uses interval trees to keep track of which storage tiers currently hold the PFNs. On a page-by-page basis, any request for a PFN is satisfied from one of the storage tiers and the PFN copied to the target domain/array. Unfill actions are similar: on a page-by-page basis domains are unmapped, xarray entries freed, or struct pages fully put back.

Significant complexity is required to fully optimize all of these data motions. The implementation calculates the largest consecutive range of same-storage indexes and operates in blocks. The accumulation of PFNs always generates the largest contiguous PFN range possible to optimize, and this gathering can cross storage tier boundaries. For cases like 'fill domains' care is taken to avoid duplicated work: PFNs are read once and pushed into all domains.

The map/unmap interaction with the iommu_domain always works in contiguous PFN blocks. The implementation does not require or benefit from any split/merge optimization in the iommu_domain driver.

This design suggests several possible improvements in the IOMMU API that would greatly help performance, particularly a way for the driver to map and read the PFN lists instead of working with one driver call per page to read, and one driver call per contiguous range to store.

Link: https://lore.kernel.org/r/9-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Yi Liu <yi.l.liu@intel.com>
Tested-by: Lixiao Yang <lixiao.yang@intel.com>
Tested-by: Matthew Rosato <mjrosato@linux.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
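
To make the flow concrete, here is a minimal sketch (not part of the patch) of how the fill/unfill entry points compose when a new iommu_domain is attached to an IOAS. It uses the iopt_area_iter_first()/iopt_area_iter_next() helpers generated by the __make_iopt_iter() macro in the diff below; the unwind shape, the area->pages member, and the assumption that the caller holds iopt->iova_rwsem are simplifications for illustration:

	/*
	 * Hedged sketch: populate a newly attached domain from every area in
	 * the io_pagetable, unwinding the already-filled areas on failure.
	 * Caller is assumed to hold iopt->iova_rwsem; the iterator's lockdep
	 * assertion requires it.
	 */
	static int example_fill_new_domain(struct io_pagetable *iopt,
					   struct iommu_domain *domain)
	{
		struct iopt_area *area, *end;
		int rc;

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			rc = iopt_area_fill_domain(area, domain);
			if (rc)
				goto out_unfill;
		}
		return 0;

	out_unfill:
		/* Unfill only the areas that were filled before the failure */
		end = area;
		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX);
		     area != end;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX))
			iopt_area_unfill_domain(area, area->pages, domain);
		return rc;
	}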
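
The access path can be pictured the same way: a hedged sketch of an in-kernel user (an mdev-style driver, say) pinning a range of page indexes with iopt_area_add_access(), consuming the returned struct page pointers, and dropping the pin. The index math, the flags value of 0, and the kcalloc sizing are illustrative assumptions:

	/*
	 * Hedged sketch: pin page indexes [index, index + npages - 1] of an
	 * area for CPU or DMA-API use, then release the pin. Assumes kernel
	 * context with <linux/slab.h> available; flags = 0 is a placeholder.
	 */
	static int example_access_pages(struct iopt_area *area,
					unsigned long index,
					unsigned long npages)
	{
		struct page **out_pages;
		int rc;

		out_pages = kcalloc(npages, sizeof(*out_pages), GFP_KERNEL);
		if (!out_pages)
			return -ENOMEM;

		rc = iopt_area_add_access(area, index, index + npages - 1,
					  out_pages, 0);
		if (rc)
			goto out_free;

		/* ... access out_pages[] via the CPU or the DMA API ... */

		iopt_area_remove_access(area, index, index + npages - 1);
	out_free:
		kfree(out_pages);
		return rc;
	}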
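
The "largest contiguous PFN range" accumulation described above can be sketched independently of the iommufd internals. Everything here, including the batch_map() callback, is a hypothetical stand-in for the real batch machinery; it only shows the technique of flushing each maximal run of consecutive PFNs in a single call:

	/*
	 * Hedged sketch of contiguous-PFN batching: extend the current run
	 * while PFNs stay consecutive, otherwise flush it through a single
	 * mapping call. batch_map() is hypothetical, not an IOMMU API.
	 */
	struct example_batch {
		unsigned long start_pfn;
		unsigned long npfns;
	};

	static void example_batch_flush(struct example_batch *b,
					void (*batch_map)(unsigned long pfn,
							  unsigned long npfns))
	{
		if (b->npfns)
			batch_map(b->start_pfn, b->npfns);
		b->npfns = 0;
	}

	static void example_batch_add(struct example_batch *b,
				      unsigned long pfn,
				      void (*batch_map)(unsigned long pfn,
							unsigned long npfns))
	{
		/* Physically consecutive: grow the run instead of mapping */
		if (b->npfns && pfn == b->start_pfn + b->npfns) {
			b->npfns++;
			return;
		}
		example_batch_flush(b, batch_map);
		b->start_pfn = pfn;
		b->npfns = 1;
	}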
Diffstat (limited to 'drivers/iommu/iommufd/io_pagetable.h')
-rw-r--r--  drivers/iommu/iommufd/io_pagetable.h  74
1 file changed, 74 insertions(+), 0 deletions(-)
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index b74bf01ffc52..a2b724175057 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -49,6 +49,15 @@ struct iopt_area {
unsigned int num_accesses;
};
+int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages);
+void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages);
+
+int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain);
+void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
+ struct iommu_domain *domain);
+void iopt_area_unmap_domain(struct iopt_area *area,
+ struct iommu_domain *domain);
+
static inline unsigned long iopt_area_index(struct iopt_area *area)
{
return area->pages_node.start;
@@ -69,6 +78,39 @@ static inline unsigned long iopt_area_last_iova(struct iopt_area *area)
return area->node.last;
}
+static inline size_t iopt_area_length(struct iopt_area *area)
+{
+ return (area->node.last - area->node.start) + 1;
+}
+
+#define __make_iopt_iter(name) \
+ static inline struct iopt_##name *iopt_##name##_iter_first( \
+ struct io_pagetable *iopt, unsigned long start, \
+ unsigned long last) \
+ { \
+ struct interval_tree_node *node; \
+ \
+ lockdep_assert_held(&iopt->iova_rwsem); \
+ node = interval_tree_iter_first(&iopt->name##_itree, start, \
+ last); \
+ if (!node) \
+ return NULL; \
+ return container_of(node, struct iopt_##name, node); \
+ } \
+ static inline struct iopt_##name *iopt_##name##_iter_next( \
+ struct iopt_##name *last_node, unsigned long start, \
+ unsigned long last) \
+ { \
+ struct interval_tree_node *node; \
+ \
+ node = interval_tree_iter_next(&last_node->node, start, last); \
+ if (!node) \
+ return NULL; \
+ return container_of(node, struct iopt_##name, node); \
+ }
+
+__make_iopt_iter(area)
+
enum {
IOPT_PAGES_ACCOUNT_NONE = 0,
IOPT_PAGES_ACCOUNT_USER = 1,
@@ -106,4 +148,36 @@ struct iopt_pages {
struct rb_root_cached domains_itree;
};
+struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
+ bool writable);
+void iopt_release_pages(struct kref *kref);
+static inline void iopt_put_pages(struct iopt_pages *pages)
+{
+ kref_put(&pages->kref, iopt_release_pages);
+}
+
+void iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start,
+ unsigned long last, struct page **out_pages);
+int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start,
+ unsigned long last, struct page **out_pages);
+void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start,
+ unsigned long last);
+
+int iopt_area_add_access(struct iopt_area *area, unsigned long start,
+ unsigned long last, struct page **out_pages,
+ unsigned int flags);
+void iopt_area_remove_access(struct iopt_area *area, unsigned long start,
+ unsigned long last);
+int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
+ void *data, unsigned long length, unsigned int flags);
+
+/*
+ * Each interval represents an active iopt_access_pages(), it acts as an
+ * interval lock that keeps the PFNs pinned and stored in the xarray.
+ */
+struct iopt_pages_access {
+ struct interval_tree_node node;
+ unsigned int users;
+};
+
#endif
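
As a closing note on the __make_iopt_iter(area) instantiation above: the generated iopt_area_iter_first()/iopt_area_iter_next() pair is meant to be consumed as a range-restricted walk over the areas interval tree. A hedged usage sketch, with the byte-count purpose chosen purely for illustration:

	/*
	 * Hedged sketch: sum the byte length of every area intersecting
	 * [start, last]. The lockdep assertion inside iopt_area_iter_first()
	 * requires the caller to hold iopt->iova_rwsem.
	 */
	static size_t example_sum_area_bytes(struct io_pagetable *iopt,
					     unsigned long start,
					     unsigned long last)
	{
		struct iopt_area *area;
		size_t total = 0;

		for (area = iopt_area_iter_first(iopt, start, last); area;
		     area = iopt_area_iter_next(area, start, last))
			total += iopt_area_length(area);
		return total;
	}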