diff options
Diffstat (limited to 'drivers/iommu')
-rw-r--r-- | drivers/iommu/Kconfig | 19 | ||||
-rw-r--r-- | drivers/iommu/amd_iommu.c | 12 | ||||
-rw-r--r-- | drivers/iommu/amd_iommu_init.c | 79 | ||||
-rw-r--r-- | drivers/iommu/amd_iommu_types.h | 7 | ||||
-rw-r--r-- | drivers/iommu/arm-smmu-v3.c | 11 | ||||
-rw-r--r-- | drivers/iommu/arm-smmu.c | 11 | ||||
-rw-r--r-- | drivers/iommu/dma-iommu.c | 3 | ||||
-rw-r--r-- | drivers/iommu/dmar.c | 44 | ||||
-rw-r--r-- | drivers/iommu/intel-iommu-debugfs.c | 75 | ||||
-rw-r--r-- | drivers/iommu/intel-iommu.c | 391 | ||||
-rw-r--r-- | drivers/iommu/intel-pasid.c | 97 | ||||
-rw-r--r-- | drivers/iommu/intel-pasid.h | 6 | ||||
-rw-r--r-- | drivers/iommu/intel-svm.c | 171 | ||||
-rw-r--r-- | drivers/iommu/iommu.c | 20 | ||||
-rw-r--r-- | drivers/iommu/iova.c | 2 | ||||
-rw-r--r-- | drivers/iommu/virtio-iommu.c | 14 |
16 files changed, 633 insertions, 329 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index d66ace717cf4..d2fade984999 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -82,7 +82,7 @@ config IOMMU_DEBUGFS config IOMMU_DEFAULT_PASSTHROUGH bool "IOMMU passthrough by default" depends on IOMMU_API - help + help Enable passthrough by default, removing the need to pass in iommu.passthrough=on or iommu=pt through command line. If this is enabled, you can still disable with iommu.passthrough=off @@ -91,8 +91,8 @@ config IOMMU_DEFAULT_PASSTHROUGH If unsure, say N here. config OF_IOMMU - def_bool y - depends on OF && IOMMU_API + def_bool y + depends on OF && IOMMU_API # IOMMU-agnostic DMA-mapping layer config IOMMU_DMA @@ -214,6 +214,7 @@ config INTEL_IOMMU_SVM select PCI_PASID select PCI_PRI select MMU_NOTIFIER + select IOASID help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by @@ -248,6 +249,18 @@ config INTEL_IOMMU_FLOPPY_WA workaround will setup a 1:1 mapping for the first 16MiB to make floppy (an ISA device) work. +config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON + bool "Enable Intel IOMMU scalable mode by default" + depends on INTEL_IOMMU + help + Selecting this option will enable by default the scalable mode if + hardware presents the capability. The scalable mode is defined in + VT-d 3.0. The scalable mode capability could be checked by reading + /sys/devices/virtual/iommu/dmar*/intel-iommu/ecap. If this option + is not selected, scalable mode support could also be enabled by + passing intel_iommu=sm_on to the kernel. If not sure, please use + the default value. + config IRQ_REMAP bool "Support for Interrupt Remapping" depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index bd25674ee4db..60538e9df8a6 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2297,7 +2297,6 @@ int __init amd_iommu_init_api(void) int __init amd_iommu_init_dma_ops(void) { swiotlb = (iommu_default_passthrough() || sme_me_mask) ? 1 : 0; - iommu_detected = 1; if (amd_iommu_unmap_flush) pr_info("IO/TLB flush on unmap enabled\n"); @@ -2641,15 +2640,6 @@ static void amd_iommu_get_resv_regions(struct device *dev, list_add_tail(®ion->list, head); } -static void amd_iommu_put_resv_regions(struct device *dev, - struct list_head *head) -{ - struct iommu_resv_region *entry, *next; - - list_for_each_entry_safe(entry, next, head, list) - kfree(entry); -} - static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, struct device *dev) { @@ -2688,7 +2678,7 @@ const struct iommu_ops amd_iommu_ops = { .device_group = amd_iommu_device_group, .domain_get_attr = amd_iommu_domain_get_attr, .get_resv_regions = amd_iommu_get_resv_regions, - .put_resv_regions = amd_iommu_put_resv_regions, + .put_resv_regions = generic_iommu_put_resv_regions, .is_attach_deferred = amd_iommu_is_attach_deferred, .pgsize_bitmap = AMD_IOMMU_PGSIZES, .flush_iotlb_all = amd_iommu_flush_iotlb_all, diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 568c52317757..587ed1323c86 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -71,6 +71,8 @@ #define IVHD_FLAG_ISOC_EN_MASK 0x08 #define IVMD_FLAG_EXCL_RANGE 0x08 +#define IVMD_FLAG_IW 0x04 +#define IVMD_FLAG_IR 0x02 #define IVMD_FLAG_UNITY_MAP 0x01 #define ACPI_DEVFLAG_INITPASS 0x01 @@ -147,7 +149,7 @@ bool amd_iommu_dump; bool amd_iommu_irq_remap __read_mostly; int amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC; -static int amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE; +static int amd_iommu_xt_mode = IRQ_REMAP_XAPIC_MODE; static bool amd_iommu_detected; static bool __initdata amd_iommu_disabled; @@ -714,7 +716,7 @@ static void iommu_enable_ppr_log(struct amd_iommu *iommu) writel(0x00, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); writel(0x00, iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); - iommu_feature_enable(iommu, CONTROL_PPFLOG_EN); + iommu_feature_enable(iommu, CONTROL_PPRLOG_EN); iommu_feature_enable(iommu, CONTROL_PPR_EN); } @@ -1116,21 +1118,17 @@ static int __init add_early_maps(void) */ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) { - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - if (!(m->flags & IVMD_FLAG_EXCL_RANGE)) return; - if (iommu) { - /* - * We only can configure exclusion ranges per IOMMU, not - * per device. But we can enable the exclusion range per - * device. This is done here - */ - set_dev_entry_bit(devid, DEV_ENTRY_EX); - iommu->exclusion_start = m->range_start; - iommu->exclusion_length = m->range_length; - } + /* + * Treat per-device exclusion ranges as r/w unity-mapped regions + * since some buggy BIOSes might lead to the overwritten exclusion + * range (exclusion_start and exclusion_length members). This + * happens when there are multiple exclusion ranges (IVMD entries) + * defined in ACPI table. + */ + m->flags = (IVMD_FLAG_IW | IVMD_FLAG_IR | IVMD_FLAG_UNITY_MAP); } /* @@ -1523,8 +1521,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET; if (((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0)) amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; - if (((h->efr_attr & (0x1 << IOMMU_FEAT_XTSUP_SHIFT)) == 0)) - amd_iommu_xt_mode = IRQ_REMAP_XAPIC_MODE; break; case 0x11: case 0x40: @@ -1534,8 +1530,15 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET; if (((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0)) amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; - if (((h->efr_reg & (0x1 << IOMMU_EFR_XTSUP_SHIFT)) == 0)) - amd_iommu_xt_mode = IRQ_REMAP_XAPIC_MODE; + /* + * Note: Since iommu_update_intcapxt() leverages + * the IOMMU MMIO access to MSI capability block registers + * for MSI address lo/hi/data, we need to check both + * EFR[XtSup] and EFR[MsiCapMmioSup] for x2APIC support. + */ + if ((h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT)) && + (h->efr_reg & BIT(IOMMU_EFR_MSICAPMMIOSUP_SHIFT))) + amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE; break; default: return -EINVAL; @@ -1655,27 +1658,39 @@ static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, static void init_iommu_perf_ctr(struct amd_iommu *iommu) { struct pci_dev *pdev = iommu->dev; - u64 val = 0xabcd, val2 = 0; + u64 val = 0xabcd, val2 = 0, save_reg = 0; if (!iommu_feature(iommu, FEATURE_PC)) return; amd_iommu_pc_present = true; + /* save the value to restore, if writable */ + if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &save_reg, false)) + goto pc_false; + /* Check if the performance counters can be written to */ if ((iommu_pc_get_set_reg(iommu, 0, 0, 0, &val, true)) || (iommu_pc_get_set_reg(iommu, 0, 0, 0, &val2, false)) || - (val != val2)) { - pci_err(pdev, "Unable to write to IOMMU perf counter.\n"); - amd_iommu_pc_present = false; - return; - } + (val != val2)) + goto pc_false; + + /* restore */ + if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &save_reg, true)) + goto pc_false; pci_info(pdev, "IOMMU performance counters supported\n"); val = readl(iommu->mmio_base + MMIO_CNTR_CONF_OFFSET); iommu->max_banks = (u8) ((val >> 12) & 0x3f); iommu->max_counters = (u8) ((val >> 7) & 0xf); + + return; + +pc_false: + pci_err(pdev, "Unable to read/write to IOMMU perf counter.\n"); + amd_iommu_pc_present = false; + return; } static ssize_t amd_iommu_show_cap(struct device *dev, @@ -1715,7 +1730,6 @@ static const struct attribute_group *amd_iommu_groups[] = { static int __init iommu_init_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; - u32 range, misc, low, high; int ret; iommu->dev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(iommu->devid), @@ -1728,19 +1742,12 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, &iommu->cap); - pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET, - &range); - pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET, - &misc); if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB))) amd_iommu_iotlb_sup = false; /* read extended feature bits */ - low = readl(iommu->mmio_base + MMIO_EXT_FEATURES); - high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4); - - iommu->features = ((u64)high << 32) | low; + iommu->features = readq(iommu->mmio_base + MMIO_EXT_FEATURES); if (iommu_feature(iommu, FEATURE_GT)) { int glxval; @@ -1984,8 +1991,8 @@ static int iommu_init_intcapxt(struct amd_iommu *iommu) struct irq_affinity_notify *notify = &iommu->intcapxt_notify; /** - * IntCapXT requires XTSup=1, which can be inferred - * amd_iommu_xt_mode. + * IntCapXT requires XTSup=1 and MsiCapMmioSup=1, + * which can be inferred from amd_iommu_xt_mode. */ if (amd_iommu_xt_mode != IRQ_REMAP_X2APIC_MODE) return 0; @@ -2032,7 +2039,7 @@ enable_faults: iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); if (iommu->ppr_log != NULL) - iommu_feature_enable(iommu, CONTROL_PPFINT_EN); + iommu_feature_enable(iommu, CONTROL_PPRINT_EN); iommu_ga_log_enable(iommu); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index f52f59d5c6bd..f8d01d6b00da 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -147,8 +147,8 @@ #define CONTROL_COHERENT_EN 0x0aULL #define CONTROL_ISOC_EN 0x0bULL #define CONTROL_CMDBUF_EN 0x0cULL -#define CONTROL_PPFLOG_EN 0x0dULL -#define CONTROL_PPFINT_EN 0x0eULL +#define CONTROL_PPRLOG_EN 0x0dULL +#define CONTROL_PPRINT_EN 0x0eULL #define CONTROL_PPR_EN 0x0fULL #define CONTROL_GT_EN 0x10ULL #define CONTROL_GA_EN 0x11ULL @@ -377,12 +377,12 @@ #define IOMMU_CAP_EFR 27 /* IOMMU Feature Reporting Field (for IVHD type 10h */ -#define IOMMU_FEAT_XTSUP_SHIFT 0 #define IOMMU_FEAT_GASUP_SHIFT 6 /* IOMMU Extended Feature Register (EFR) */ #define IOMMU_EFR_XTSUP_SHIFT 2 #define IOMMU_EFR_GASUP_SHIFT 7 +#define IOMMU_EFR_MSICAPMMIOSUP_SHIFT 46 #define MAX_DOMAIN_ID 65536 @@ -463,7 +463,6 @@ struct amd_irte_ops; * independent of their use. */ struct protection_domain { - struct list_head list; /* for list of all protection domains */ struct list_head dev_list; /* List of all devices in this domain */ struct iommu_domain domain; /* generic domain handle used by iommu core code */ diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 5e04c1f3992a..aa3ac2a03807 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2985,15 +2985,6 @@ static void arm_smmu_get_resv_regions(struct device *dev, iommu_dma_get_resv_regions(dev, head); } -static void arm_smmu_put_resv_regions(struct device *dev, - struct list_head *head) -{ - struct iommu_resv_region *entry, *next; - - list_for_each_entry_safe(entry, next, head, list) - kfree(entry); -} - static struct iommu_ops arm_smmu_ops = { .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, @@ -3011,7 +3002,7 @@ static struct iommu_ops arm_smmu_ops = { .domain_set_attr = arm_smmu_domain_set_attr, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, - .put_resv_regions = arm_smmu_put_resv_regions, + .put_resv_regions = generic_iommu_put_resv_regions, .pgsize_bitmap = -1UL, /* Restricted during device attach */ }; diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index cca94c30b301..16c4b87af42b 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1615,15 +1615,6 @@ static void arm_smmu_get_resv_regions(struct device *dev, iommu_dma_get_resv_regions(dev, head); } -static void arm_smmu_put_resv_regions(struct device *dev, - struct list_head *head) -{ - struct iommu_resv_region *entry, *next; - - list_for_each_entry_safe(entry, next, head, list) - kfree(entry); -} - static struct iommu_ops arm_smmu_ops = { .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, @@ -1641,7 +1632,7 @@ static struct iommu_ops arm_smmu_ops = { .domain_set_attr = arm_smmu_domain_set_attr, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, - .put_resv_regions = arm_smmu_put_resv_regions, + .put_resv_regions = generic_iommu_put_resv_regions, .pgsize_bitmap = -1UL, /* Restricted during device attach */ }; diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index c363294b3bb9..a2e96a5fd9a7 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1203,7 +1203,6 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct iommu_dma_cookie *cookie; struct iommu_dma_msi_page *msi_page; static DEFINE_MUTEX(msi_prepare_lock); /* see below */ @@ -1212,8 +1211,6 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) return 0; } - cookie = domain->iova_cookie; - /* * In fact the whole prepare operation should already be serialised by * irq_domain_mutex further up the callchain, but that's pretty subtle diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 3acfa6a25fa2..071bb42bbbc5 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -244,7 +244,7 @@ int dmar_insert_dev_scope(struct dmar_pci_notify_info *info, info->dev->hdr_type != PCI_HEADER_TYPE_NORMAL) || (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE && (info->dev->hdr_type == PCI_HEADER_TYPE_NORMAL && - info->dev->class >> 8 != PCI_CLASS_BRIDGE_OTHER))) { + info->dev->class >> 16 != PCI_BASE_CLASS_BRIDGE))) { pr_warn("Device scope type does not match for %s\n", pci_name(info->dev)); return -EINVAL; @@ -1354,7 +1354,6 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, struct qi_desc desc; if (mask) { - WARN_ON_ONCE(addr & ((1ULL << (VTD_PAGE_SHIFT + mask)) - 1)); addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; } else @@ -1371,6 +1370,47 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, qi_submit_sync(&desc, iommu); } +/* PASID-based IOTLB invalidation */ +void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih) +{ + struct qi_desc desc = {.qw2 = 0, .qw3 = 0}; + + /* + * npages == -1 means a PASID-selective invalidation, otherwise, + * a positive value for Page-selective-within-PASID invalidation. + * 0 is not a valid input. + */ + if (WARN_ON(!npages)) { + pr_err("Invalid input npages = %ld\n", npages); + return; + } + + if (npages == -1) { + desc.qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc.qw1 = 0; + } else { + int mask = ilog2(__roundup_pow_of_two(npages)); + unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); + + if (WARN_ON_ONCE(!ALIGN(addr, align))) + addr &= ~(align - 1); + + desc.qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | + QI_EIOTLB_TYPE; + desc.qw1 = QI_EIOTLB_ADDR(addr) | + QI_EIOTLB_IH(ih) | + QI_EIOTLB_AM(mask); + } + + qi_submit_sync(&desc, iommu); +} + /* * Disable Queued Invalidation interface. */ diff --git a/drivers/iommu/intel-iommu-debugfs.c b/drivers/iommu/intel-iommu-debugfs.c index 471f05d452e0..c1257bef553c 100644 --- a/drivers/iommu/intel-iommu-debugfs.c +++ b/drivers/iommu/intel-iommu-debugfs.c @@ -5,6 +5,7 @@ * Authors: Gayatri Kammela <gayatri.kammela@intel.com> * Sohil Mehta <sohil.mehta@intel.com> * Jacob Pan <jacob.jun.pan@linux.intel.com> + * Lu Baolu <baolu.lu@linux.intel.com> */ #include <linux/debugfs.h> @@ -283,6 +284,77 @@ static int dmar_translation_struct_show(struct seq_file *m, void *unused) } DEFINE_SHOW_ATTRIBUTE(dmar_translation_struct); +static inline unsigned long level_to_directory_size(int level) +{ + return BIT_ULL(VTD_PAGE_SHIFT + VTD_STRIDE_SHIFT * (level - 1)); +} + +static inline void +dump_page_info(struct seq_file *m, unsigned long iova, u64 *path) +{ + seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\t0x%016llx\n", + iova >> VTD_PAGE_SHIFT, path[5], path[4], + path[3], path[2], path[1]); +} + +static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, + int level, unsigned long start, + u64 *path) +{ + int i; + + if (level > 5 || level < 1) + return; + + for (i = 0; i < BIT_ULL(VTD_STRIDE_SHIFT); + i++, pde++, start += level_to_directory_size(level)) { + if (!dma_pte_present(pde)) + continue; + + path[level] = pde->val; + if (dma_pte_superpage(pde) || level == 1) + dump_page_info(m, start, path); + else + pgtable_walk_level(m, phys_to_virt(dma_pte_addr(pde)), + level - 1, start, path); + path[level] = 0; + } +} + +static int show_device_domain_translation(struct device *dev, void *data) +{ + struct dmar_domain *domain = find_domain(dev); + struct seq_file *m = data; + u64 path[6] = { 0 }; + + if (!domain) + return 0; + + seq_printf(m, "Device %s with pasid %d @0x%llx\n", + dev_name(dev), domain->default_pasid, + (u64)virt_to_phys(domain->pgd)); + seq_puts(m, "IOVA_PFN\t\tPML5E\t\t\tPML4E\t\t\tPDPE\t\t\tPDE\t\t\tPTE\n"); + + pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); + seq_putc(m, '\n'); + + return 0; +} + +static int domain_translation_struct_show(struct seq_file *m, void *unused) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&device_domain_lock, flags); + ret = bus_for_each_dev(&pci_bus_type, NULL, m, + show_device_domain_translation); + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; +} +DEFINE_SHOW_ATTRIBUTE(domain_translation_struct); + #ifdef CONFIG_IRQ_REMAP static void ir_tbl_remap_entry_show(struct seq_file *m, struct intel_iommu *iommu) @@ -396,6 +468,9 @@ void __init intel_iommu_debugfs_init(void) &iommu_regset_fops); debugfs_create_file("dmar_translation_struct", 0444, intel_iommu_debug, NULL, &dmar_translation_struct_fops); + debugfs_create_file("domain_translation_struct", 0444, + intel_iommu_debug, NULL, + &domain_translation_struct_fops); #ifdef CONFIG_IRQ_REMAP debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug, NULL, &ir_translation_struct_fops); diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 42966611a192..64ddccf1d5fe 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -307,6 +307,20 @@ static int hw_pass_through = 1; */ #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1) +/* + * When VT-d works in the scalable mode, it allows DMA translation to + * happen through either first level or second level page table. This + * bit marks that the DMA translation for the domain goes through the + * first level page table, otherwise, it goes through the second level. + */ +#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2) + +/* + * Domain represents a virtual machine which demands iommu nested + * translation mode support. + */ +#define DOMAIN_FLAG_NESTING_MODE BIT(3) + #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ if (domain->iommu_refcnt[idx]) @@ -355,9 +369,14 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, int dmar_disabled = 0; #else int dmar_disabled = 1; -#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/ +#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */ +#ifdef INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON +int intel_iommu_sm = 1; +#else int intel_iommu_sm; +#endif /* INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */ + int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); @@ -368,7 +387,6 @@ static int intel_iommu_superpage = 1; static int iommu_identity_mapping; static int intel_no_bounce; -#define IDENTMAP_ALL 1 #define IDENTMAP_GFX 2 #define IDENTMAP_AZALIA 4 @@ -377,7 +395,7 @@ EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) -static DEFINE_SPINLOCK(device_domain_lock); +DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \ @@ -552,6 +570,11 @@ static inline int domain_type_is_si(struct dmar_domain *domain) return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY; } +static inline bool domain_use_first_level(struct dmar_domain *domain) +{ + return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL; +} + static inline int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) { @@ -661,11 +684,12 @@ static int domain_update_iommu_snooping(struct intel_iommu *skip) return ret; } -static int domain_update_iommu_superpage(struct intel_iommu *skip) +static int domain_update_iommu_superpage(struct dmar_domain *domain, + struct intel_iommu *skip) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; - int mask = 0xf; + int mask = 0x3; if (!intel_iommu_superpage) { return 0; @@ -675,7 +699,13 @@ static int domain_update_iommu_superpage(struct intel_iommu *skip) rcu_read_lock(); for_each_active_iommu(iommu, drhd) { if (iommu != skip) { - mask &= cap_super_page_val(iommu->cap); + if (domain && domain_use_first_level(domain)) { + if (!cap_fl1gp_support(iommu->cap)) + mask = 0x1; + } else { + mask &= cap_super_page_val(iommu->cap); + } + if (!mask) break; } @@ -690,7 +720,7 @@ static void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); domain->iommu_snooping = domain_update_iommu_snooping(NULL); - domain->iommu_superpage = domain_update_iommu_superpage(NULL); + domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, @@ -913,6 +943,8 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; + if (domain_use_first_level(domain)) + pteval |= DMA_FL_PTE_XD; if (cmpxchg64(&pte->val, 0ULL, pteval)) /* Someone else set it while we were thinking; use theirs. */ free_pgtable_page(tmp_page); @@ -1483,6 +1515,20 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain, spin_unlock_irqrestore(&device_domain_lock, flags); } +static void domain_flush_piotlb(struct intel_iommu *iommu, + struct dmar_domain *domain, + u64 addr, unsigned long npages, bool ih) +{ + u16 did = domain->iommu_did[iommu->seq_id]; + + if (domain->default_pasid) + qi_flush_piotlb(iommu, did, domain->default_pasid, + addr, npages, ih); + + if (!list_empty(&domain->devices)) + qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih); +} + static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, struct dmar_domain *domain, unsigned long pfn, unsigned int pages, @@ -1496,18 +1542,23 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, if (ih) ih = 1 << 6; - /* - * Fallback to domain selective flush if no PSI support or the size is - * too big. - * PSI requires page size to be 2 ^ x, and the base address is naturally - * aligned to the size - */ - if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, - DMA_TLB_PSI_FLUSH); + + if (domain_use_first_level(domain)) { + domain_flush_piotlb(iommu, domain, addr, pages, ih); + } else { + /* + * Fallback to domain selective flush if no PSI support or + * the size is too big. PSI requires page size to be 2 ^ x, + * and the base address is naturally aligned to the size. + */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, did, 0, 0, + DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, did, addr | ih, mask, + DMA_TLB_PSI_FLUSH); + } /* * In caching mode, changes of pages from non-present to present require @@ -1522,8 +1573,11 @@ static inline void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain, unsigned long pfn, unsigned int pages) { - /* It's a non-present to present mapping. Only flush if caching mode */ - if (cap_caching_mode(iommu->cap)) + /* + * It's a non-present to present mapping. Only flush if caching mode + * and second level. + */ + if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); else iommu_flush_write_buffer(iommu); @@ -1540,7 +1594,11 @@ static void iommu_flush_iova(struct iova_domain *iovad) struct intel_iommu *iommu = g_iommus[idx]; u16 did = domain->iommu_did[iommu->seq_id]; - iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + if (domain_use_first_level(domain)) + domain_flush_piotlb(iommu, domain, 0, -1, 0); + else + iommu->flush.flush_iotlb(iommu, did, 0, 0, + DMA_TLB_DSI_FLUSH); if (!cap_caching_mode(iommu->cap)) iommu_flush_dev_iotlb(get_iommu_domain(iommu, did), @@ -1709,6 +1767,33 @@ static void free_dmar_iommu(struct intel_iommu *iommu) #endif } +/* + * Check and return whether first level is used by default for + * DMA translation. + */ +static bool first_level_by_default(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + static int first_level_support = -1; + + if (likely(first_level_support != -1)) + return first_level_support; + + first_level_support = 1; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) { + first_level_support = 0; + break; + } + } + rcu_read_unlock(); + + return first_level_support; +} + static struct dmar_domain *alloc_domain(int flags) { struct dmar_domain *domain; @@ -1720,6 +1805,8 @@ static struct dmar_domain *alloc_domain(int flags) memset(domain, 0, sizeof(*domain)); domain->nid = NUMA_NO_NODE; domain->flags = flags; + if (first_level_by_default()) + domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); @@ -1849,14 +1936,16 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, { int adjust_width, agaw; unsigned long sagaw; - int err; + int ret; init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); - err = init_iova_flush_queue(&domain->iovad, - iommu_flush_iova, iova_entry_free); - if (err) - return err; + if (!intel_iommu_strict) { + ret = init_iova_flush_queue(&domain->iovad, + iommu_flush_iova, iova_entry_free); + if (ret) + pr_info("iova flush queue initialization failed\n"); + } domain_reserve_special_ranges(domain); @@ -2229,17 +2318,20 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long sg_res = 0; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; + u64 attr; BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)); if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; - prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP; + attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); + if (domain_use_first_level(domain)) + attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD; if (!sg) { sg_res = nr_pages; - pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot; + pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; } while (nr_pages > 0) { @@ -2251,7 +2343,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, sg_res = aligned_nrpages(sg->offset, sg->length); sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff; sg->dma_length = sg->length; - pteval = (sg_phys(sg) - pgoff) | prot; + pteval = (sg_phys(sg) - pgoff) | attr; phys_pfn = pteval >> VTD_PAGE_SHIFT; } @@ -2420,7 +2512,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags); } -static struct dmar_domain *find_domain(struct device *dev) +struct dmar_domain *find_domain(struct device *dev) { struct device_domain_info *info; @@ -2463,6 +2555,36 @@ dmar_search_domain_by_dev_info(int segment, int bus, int devfn) return NULL; } +static int domain_setup_first_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, + int pasid) +{ + int flags = PASID_FLAG_SUPERVISOR_MODE; + struct dma_pte *pgd = domain->pgd; + int agaw, level; + + /* + * Skip top levels of page tables for iommu which has + * less agaw than default. Unnecessary for PT mode. + */ + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + pgd = phys_to_virt(dma_pte_addr(pgd)); + if (!dma_pte_present(pgd)) + return -ENOMEM; + } + + level = agaw_to_level(agaw); + if (level != 4 && level != 5) + return -EINVAL; + + flags |= (level == 5) ? PASID_FLAG_FL5LP : 0; + + return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, + domain->iommu_did[iommu->seq_id], + flags); +} + static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, int bus, int devfn, struct device *dev, @@ -2562,6 +2684,9 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (hw_pass_through && domain_type_is_si(domain)) ret = intel_pasid_setup_pass_through(iommu, domain, dev, PASID_RID2PASID); + else if (domain_use_first_level(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + PASID_RID2PASID); else ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); @@ -2767,10 +2892,8 @@ static int __init si_domain_init(int hw) } /* - * Normally we use DMA domains for devices which have RMRRs. But we - * loose this requirement for graphic and usb devices. Identity map - * the RMRRs for graphic and USB devices so that they could use the - * si_domain. + * Identity map the RMRRs so that devices with RMRRs could also use + * the si_domain. */ for_each_rmrr_units(rmrr) { for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, @@ -2778,9 +2901,6 @@ static int __init si_domain_init(int hw) unsigned long long start = rmrr->base_address; unsigned long long end = rmrr->end_address; - if (device_is_rmrr_locked(dev)) - continue; - if (WARN_ON(end < start || end >> agaw_to_width(si_domain->agaw))) continue; @@ -2919,9 +3039,6 @@ static int device_def_domain_type(struct device *dev) if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); - if (device_is_rmrr_locked(dev)) - return IOMMU_DOMAIN_DMA; - /* * Prevent any device marked as untrusted from getting * placed into the statically identity mapping domain. @@ -2959,13 +3076,9 @@ static int device_def_domain_type(struct device *dev) return IOMMU_DOMAIN_DMA; } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE) return IOMMU_DOMAIN_DMA; - } else { - if (device_has_rmrr(dev)) - return IOMMU_DOMAIN_DMA; } - return (iommu_identity_mapping & IDENTMAP_ALL) ? - IOMMU_DOMAIN_IDENTITY : 0; + return 0; } static void intel_iommu_init_qi(struct intel_iommu *iommu) @@ -3294,10 +3407,7 @@ static int __init init_dmars(void) if (!ecap_pass_through(iommu->ecap)) hw_pass_through = 0; -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu)) - intel_svm_init(iommu); -#endif + intel_svm_check(iommu); } /* @@ -3312,9 +3422,6 @@ static int __init init_dmars(void) iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } - if (iommu_default_passthrough()) - iommu_identity_mapping |= IDENTMAP_ALL; - #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA dmar_map_gfx = 0; #endif @@ -3387,8 +3494,21 @@ static unsigned long intel_alloc_iova(struct device *dev, { unsigned long iova_pfn; - /* Restrict dma_mask to the width that the iommu can handle */ - dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask); + /* + * Restrict dma_mask to the width that the iommu can handle. + * First-level translation restricts the input-address to a + * canonical address (i.e., address bits 63:N have the same + * value as address bit [N-1], where N is 48-bits with 4-level + * paging and 57-bits with 5-level paging). Hence, skip bit + * [N-1]. + */ + if (domain_use_first_level(domain)) + dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1), + dma_mask); + else + dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), + dma_mask); + /* Ensure we reserve the whole size-aligned region */ nrpages = __roundup_pow_of_two(nrpages); @@ -3406,7 +3526,8 @@ static unsigned long intel_alloc_iova(struct device *dev, iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask), true); if (unlikely(!iova_pfn)) { - dev_err(dev, "Allocating %ld-page iova failed", nrpages); + dev_err_once(dev, "Allocating %ld-page iova failed\n", + nrpages); return 0; } @@ -3774,8 +3895,8 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele return 0; } - trace_map_sg(dev, iova_pfn << PAGE_SHIFT, - sg_phys(sglist), size << VTD_PAGE_SHIFT); + for_each_sg(sglist, sg, nelems, i) + trace_map_sg(dev, i + 1, nelems, sg); return nelems; } @@ -3987,6 +4108,9 @@ bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, sg_dma_len(sg) = sg->length; } + for_each_sg(sglist, sg, nelems, i) + trace_bounce_map_sg(dev, i + 1, nelems, sg); + return nelems; out_unmap: @@ -4315,16 +4439,31 @@ static void __init init_iommu_pm_ops(void) static inline void init_iommu_pm_ops(void) {} #endif /* CONFIG_PM */ +static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) +{ + if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || + !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || + rmrr->end_address <= rmrr->base_address || + arch_rmrr_sanity_check(rmrr)) + return -EINVAL; + + return 0; +} + int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) { struct acpi_dmar_reserved_memory *rmrr; struct dmar_rmrr_unit *rmrru; - int ret; rmrr = (struct acpi_dmar_reserved_memory *)header; - ret = arch_rmrr_sanity_check(rmrr); - if (ret) - return ret; + if (rmrr_sanity_check(rmrr)) + WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND, + "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + rmrr->base_address, rmrr->end_address, + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); if (!rmrru) @@ -4470,7 +4609,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) iommu->name); return -ENXIO; } - sp = domain_update_iommu_superpage(iommu) - 1; + sp = domain_update_iommu_superpage(NULL, iommu) - 1; if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { pr_warn("%s: Doesn't support large page.\n", iommu->name); @@ -4490,10 +4629,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) if (ret) goto out; -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu)) - intel_svm_init(iommu); -#endif + intel_svm_check(iommu); if (dmaru->ignored) { /* @@ -4898,7 +5034,7 @@ static int __init platform_optin_force_iommu(void) * map for all devices except those marked as being untrusted. */ if (dmar_disabled) - iommu_identity_mapping |= IDENTMAP_ALL; + iommu_set_default_passthrough(false); dmar_disabled = 0; no_iommu = 0; @@ -5163,7 +5299,8 @@ static void dmar_remove_one_dev_info(struct device *dev) spin_lock_irqsave(&device_domain_lock, flags); info = dev->archdata.iommu; - if (info) + if (info && info != DEFER_DEVICE_DOMAIN_INFO + && info != DUMMY_DEVICE_DOMAIN_INFO) __dmar_remove_one_dev_info(info); spin_unlock_irqrestore(&device_domain_lock, flags); } @@ -5197,6 +5334,7 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) { struct dmar_domain *dmar_domain; struct iommu_domain *domain; + int ret; switch (type) { case IOMMU_DOMAIN_DMA: @@ -5213,11 +5351,12 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return NULL; } - if (type == IOMMU_DOMAIN_DMA && - init_iova_flush_queue(&dmar_domain->iovad, - iommu_flush_iova, iova_entry_free)) { - pr_warn("iova flush queue initialization failed\n"); - intel_iommu_strict = 1; + if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) { + ret = init_iova_flush_queue(&dmar_domain->iovad, + iommu_flush_iova, + iova_entry_free); + if (ret) + pr_info("iova flush queue initialization failed\n"); } domain_update_iommu_cap(dmar_domain); @@ -5283,7 +5422,7 @@ static void auxiliary_unlink_device(struct dmar_domain *domain, domain->auxd_refcnt--; if (!domain->auxd_refcnt && domain->default_pasid > 0) - intel_pasid_free_id(domain->default_pasid); + ioasid_free(domain->default_pasid); } static int aux_domain_add_dev(struct dmar_domain *domain, @@ -5301,10 +5440,11 @@ static int aux_domain_add_dev(struct dmar_domain *domain, if (domain->default_pasid <= 0) { int pasid; - pasid = intel_pasid_alloc_id(domain, PASID_MIN, - pci_max_pasids(to_pci_dev(dev)), - GFP_KERNEL); - if (pasid <= 0) { + /* No private data needed for the default pasid */ + pasid = ioasid_alloc(NULL, PASID_MIN, + pci_max_pasids(to_pci_dev(dev)) - 1, + NULL); + if (pasid == INVALID_IOASID) { pr_err("Can't allocate default pasid\n"); return -ENODEV; } @@ -5322,8 +5462,12 @@ static int aux_domain_add_dev(struct dmar_domain *domain, goto attach_failed; /* Setup the PASID entry for mediated devices: */ - ret = intel_pasid_setup_second_level(iommu, domain, dev, - domain->default_pasid); + if (domain_use_first_level(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + domain->default_pasid); + else + ret = intel_pasid_setup_second_level(iommu, domain, dev, + domain->default_pasid); if (ret) goto table_failed; spin_unlock(&iommu->lock); @@ -5340,7 +5484,7 @@ attach_failed: spin_unlock(&iommu->lock); spin_unlock_irqrestore(&device_domain_lock, flags); if (!domain->auxd_refcnt && domain->default_pasid > 0) - intel_pasid_free_id(domain->default_pasid); + ioasid_free(domain->default_pasid); return ret; } @@ -5594,6 +5738,24 @@ static inline bool iommu_pasid_support(void) return ret; } +static inline bool nested_mode_support(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + bool ret = true; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) { + ret = false; + break; + } + } + rcu_read_unlock(); + + return ret; +} + static bool intel_iommu_capable(enum iommu_cap cap) { if (cap == IOMMU_CAP_CACHE_COHERENCY) @@ -5624,8 +5786,10 @@ static int intel_iommu_add_device(struct device *dev) group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return PTR_ERR(group); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + goto unlink; + } iommu_group_put(group); @@ -5651,7 +5815,8 @@ static int intel_iommu_add_device(struct device *dev) if (!get_private_domain_for_dev(dev)) { dev_warn(dev, "Failed to get a private domain.\n"); - return -ENOMEM; + ret = -ENOMEM; + goto unlink; } dev_info(dev, @@ -5666,6 +5831,10 @@ static int intel_iommu_add_device(struct device *dev) } return 0; + +unlink: + iommu_device_unlink(&iommu->iommu, dev); + return ret; } static void intel_iommu_remove_device(struct device *dev) @@ -5744,15 +5913,6 @@ static void intel_iommu_get_resv_regions(struct device *device, list_add_tail(®->list, head); } -static void intel_iommu_put_resv_regions(struct device *dev, - struct list_head *head) -{ - struct iommu_resv_region *entry, *next; - - list_for_each_entry_safe(entry, next, head, list) - kfree(entry); -} - int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) { struct device_domain_info *info; @@ -5817,6 +5977,13 @@ static void intel_iommu_apply_resv_region(struct device *dev, WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end)); } +static struct iommu_group *intel_iommu_device_group(struct device *dev) +{ + if (dev_is_pci(dev)) + return pci_device_group(dev); + return generic_device_group(dev); +} + #ifdef CONFIG_INTEL_IOMMU_SVM struct intel_iommu *intel_svm_device_to_iommu(struct device *dev) { @@ -5972,10 +6139,42 @@ static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO; } +static int +intel_iommu_domain_set_attr(struct iommu_domain *domain, + enum iommu_attr attr, void *data) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long flags; + int ret = 0; + + if (domain->type != IOMMU_DOMAIN_UNMANAGED) + return -EINVAL; + + switch (attr) { + case DOMAIN_ATTR_NESTING: + spin_lock_irqsave(&device_domain_lock, flags); + if (nested_mode_support() && + list_empty(&dmar_domain->devices)) { + dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; + dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; + } else { + ret = -ENODEV; + } + spin_unlock_irqrestore(&device_domain_lock, flags); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, .domain_free = intel_iommu_domain_free, + .domain_set_attr = intel_iommu_domain_set_attr, .attach_dev = intel_iommu_attach_device, .detach_dev = intel_iommu_detach_device, .aux_attach_dev = intel_iommu_aux_attach_device, @@ -5987,9 +6186,9 @@ const struct iommu_ops intel_iommu_ops = { .add_device = intel_iommu_add_device, .remove_device = intel_iommu_remove_device, .get_resv_regions = intel_iommu_get_resv_regions, - .put_resv_regions = intel_iommu_put_resv_regions, + .put_resv_regions = generic_iommu_put_resv_regions, .apply_resv_region = intel_iommu_apply_resv_region, - .device_group = pci_device_group, + .device_group = intel_iommu_device_group, .dev_has_feat = intel_iommu_dev_has_feat, .dev_feat_enabled = intel_iommu_dev_feat_enabled, .dev_enable_feat = intel_iommu_dev_enable_feat, diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 040a445be300..22b30f10b396 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -26,42 +26,6 @@ */ static DEFINE_SPINLOCK(pasid_lock); u32 intel_pasid_max_id = PASID_MAX; -static DEFINE_IDR(pasid_idr); - -int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp) -{ - int ret, min, max; - - min = max_t(int, start, PASID_MIN); - max = min_t(int, end, intel_pasid_max_id); - - WARN_ON(in_interrupt()); - idr_preload(gfp); - spin_lock(&pasid_lock); - ret = idr_alloc(&pasid_idr, ptr, min, max, GFP_ATOMIC); - spin_unlock(&pasid_lock); - idr_preload_end(); - - return ret; -} - -void intel_pasid_free_id(int pasid) -{ - spin_lock(&pasid_lock); - idr_remove(&pasid_idr, pasid); - spin_unlock(&pasid_lock); -} - -void *intel_pasid_lookup_id(int pasid) -{ - void *p; - - spin_lock(&pasid_lock); - p = idr_find(&pasid_idr, pasid); - spin_unlock(&pasid_lock); - - return p; -} /* * Per device pasid table management: @@ -465,6 +429,21 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, devtlb_invalidation_with_pasid(iommu, dev, pasid); } +static void pasid_flush_caches(struct intel_iommu *iommu, + struct pasid_entry *pte, + int pasid, u16 did) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + if (cap_caching_mode(iommu->cap)) { + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + iotlb_invalidation_with_pasid(iommu, did, pasid); + } else { + iommu_flush_write_buffer(iommu); + } +} + /* * Set up the scalable mode pasid table entry for first only * translation type. @@ -498,10 +477,15 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, pasid_set_sre(pte); } -#ifdef CONFIG_X86 - if (cpu_feature_enabled(X86_FEATURE_LA57)) - pasid_set_flpm(pte, 1); -#endif /* CONFIG_X86 */ + if (flags & PASID_FLAG_FL5LP) { + if (cap_5lp_support(iommu->cap)) { + pasid_set_flpm(pte, 1); + } else { + pr_err("No 5-level paging support for first-level\n"); + pasid_clear_entry(pte); + return -EINVAL; + } + } pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); @@ -510,16 +494,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, /* Setup Present and PASID Granular Transfer Type: */ pasid_set_translation_type(pte, 1); pasid_set_present(pte); - - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); - - if (cap_caching_mode(iommu->cap)) { - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); - } else { - iommu_flush_write_buffer(iommu); - } + pasid_flush_caches(iommu, pte, pasid, did); return 0; } @@ -583,16 +558,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, */ pasid_set_sre(pte); pasid_set_present(pte); - - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); - - if (cap_caching_mode(iommu->cap)) { - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); - } else { - iommu_flush_write_buffer(iommu); - } + pasid_flush_caches(iommu, pte, pasid, did); return 0; } @@ -626,16 +592,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, */ pasid_set_sre(pte); pasid_set_present(pte); - - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); - - if (cap_caching_mode(iommu->cap)) { - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - iotlb_invalidation_with_pasid(iommu, did, pasid); - } else { - iommu_flush_write_buffer(iommu); - } + pasid_flush_caches(iommu, pte, pasid, did); return 0; } diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index fc8cd8f17de1..92de6df24ccb 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -37,6 +37,12 @@ */ #define PASID_FLAG_SUPERVISOR_MODE BIT(0) +/* + * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- + * level translation, otherwise, 4-level paging will be used. + */ +#define PASID_FLAG_FL5LP BIT(1) + struct pasid_dir_entry { u64 val; }; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index dca88f9fdf29..d7f2a5358900 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -17,25 +17,13 @@ #include <linux/dmar.h> #include <linux/interrupt.h> #include <linux/mm_types.h> +#include <linux/ioasid.h> #include <asm/page.h> #include "intel-pasid.h" static irqreturn_t prq_event_thread(int irq, void *d); -int intel_svm_init(struct intel_iommu *iommu) -{ - if (cpu_feature_enabled(X86_FEATURE_GBPAGES) && - !cap_fl1gp_support(iommu->cap)) - return -EINVAL; - - if (cpu_feature_enabled(X86_FEATURE_LA57) && - !cap_5lp_support(iommu->cap)) - return -EINVAL; - - return 0; -} - #define PRQ_ORDER 0 int intel_svm_enable_prq(struct intel_iommu *iommu) @@ -99,6 +87,33 @@ int intel_svm_finish_prq(struct intel_iommu *iommu) return 0; } +static inline bool intel_svm_capable(struct intel_iommu *iommu) +{ + return iommu->flags & VTD_FLAG_SVM_CAPABLE; +} + +void intel_svm_check(struct intel_iommu *iommu) +{ + if (!pasid_supported(iommu)) + return; + + if (cpu_feature_enabled(X86_FEATURE_GBPAGES) && + !cap_fl1gp_support(iommu->cap)) { + pr_err("%s SVM disabled, incompatible 1GB page capability\n", + iommu->name); + return; + } + + if (cpu_feature_enabled(X86_FEATURE_LA57) && + !cap_5lp_support(iommu->cap)) { + pr_err("%s SVM disabled, incompatible paging mode\n", + iommu->name); + return; + } + + iommu->flags |= VTD_FLAG_SVM_CAPABLE; +} + static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev, unsigned long address, unsigned long pages, int ih) { @@ -207,6 +222,10 @@ static const struct mmu_notifier_ops intel_mmuops = { static DEFINE_MUTEX(pasid_mutex); static LIST_HEAD(global_svm_list); +#define for_each_svm_dev(sdev, svm, d) \ + list_for_each_entry((sdev), &(svm)->devs, list) \ + if ((d) != (sdev)->dev) {} else + int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); @@ -220,6 +239,9 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ if (!iommu || dmar_disabled) return -EINVAL; + if (!intel_svm_capable(iommu)) + return -ENOTSUPP; + if (dev_is_pci(dev)) { pasid_max = pci_max_pasids(to_pci_dev(dev)); if (pasid_max < 0) @@ -252,15 +274,14 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ goto out; } - list_for_each_entry(sdev, &svm->devs, list) { - if (dev == sdev->dev) { - if (sdev->ops != ops) { - ret = -EBUSY; - goto out; - } - sdev->users++; - goto success; + /* Find the matching device in svm list */ + for_each_svm_dev(sdev, svm, dev) { + if (sdev->ops != ops) { + ret = -EBUSY; + goto out; } + sdev->users++; + goto success; } break; @@ -314,16 +335,15 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ if (pasid_max > intel_pasid_max_id) pasid_max = intel_pasid_max_id; - /* Do not use PASID 0 in caching mode (virtualised IOMMU) */ - ret = intel_pasid_alloc_id(svm, - !!cap_caching_mode(iommu->cap), - pasid_max - 1, GFP_KERNEL); - if (ret < 0) { + /* Do not use PASID 0, reserved for RID to PASID */ + svm->pasid = ioasid_alloc(NULL, PASID_MIN, + pasid_max - 1, svm); + if (svm->pasid == INVALID_IOASID) { kfree(svm); kfree(sdev); + ret = -ENOSPC; goto out; } - svm->pasid = ret; svm->notifier.ops = &intel_mmuops; svm->mm = mm; svm->flags = flags; @@ -333,7 +353,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ if (mm) { ret = mmu_notifier_register(&svm->notifier, mm); if (ret) { - intel_pasid_free_id(svm->pasid); + ioasid_free(svm->pasid); kfree(svm); kfree(sdev); goto out; @@ -344,12 +364,14 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ ret = intel_pasid_setup_first_level(iommu, dev, mm ? mm->pgd : init_mm.pgd, svm->pasid, FLPT_DEFAULT_DID, - mm ? 0 : PASID_FLAG_SUPERVISOR_MODE); + (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | + (cpu_feature_enabled(X86_FEATURE_LA57) ? + PASID_FLAG_FL5LP : 0)); spin_unlock(&iommu->lock); if (ret) { if (mm) mmu_notifier_unregister(&svm->notifier, mm); - intel_pasid_free_id(svm->pasid); + ioasid_free(svm->pasid); kfree(svm); kfree(sdev); goto out; @@ -365,7 +387,9 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ ret = intel_pasid_setup_first_level(iommu, dev, mm ? mm->pgd : init_mm.pgd, svm->pasid, FLPT_DEFAULT_DID, - mm ? 0 : PASID_FLAG_SUPERVISOR_MODE); + (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | + (cpu_feature_enabled(X86_FEATURE_LA57) ? + PASID_FLAG_FL5LP : 0)); spin_unlock(&iommu->lock); if (ret) { kfree(sdev); @@ -397,44 +421,45 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) if (!iommu) goto out; - svm = intel_pasid_lookup_id(pasid); + svm = ioasid_find(NULL, pasid, NULL); if (!svm) goto out; - list_for_each_entry(sdev, &svm->devs, list) { - if (dev == sdev->dev) { - ret = 0; - sdev->users--; - if (!sdev->users) { - list_del_rcu(&sdev->list); - /* Flush the PASID cache and IOTLB for this device. - * Note that we do depend on the hardware *not* using - * the PASID any more. Just as we depend on other - * devices never using PASIDs that they have no right - * to use. We have a *shared* PASID table, because it's - * large and has to be physically contiguous. So it's - * hard to be as defensive as we might like. */ - intel_pasid_tear_down_entry(iommu, dev, svm->pasid); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); - kfree_rcu(sdev, rcu); - - if (list_empty(&svm->devs)) { - intel_pasid_free_id(svm->pasid); - if (svm->mm) - mmu_notifier_unregister(&svm->notifier, svm->mm); - - list_del(&svm->list); - - /* We mandate that no page faults may be outstanding - * for the PASID when intel_svm_unbind_mm() is called. - * If that is not obeyed, subtle errors will happen. - * Let's make them less subtle... */ - memset(svm, 0x6b, sizeof(*svm)); - kfree(svm); - } + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } + + for_each_svm_dev(sdev, svm, dev) { + ret = 0; + sdev->users--; + if (!sdev->users) { + list_del_rcu(&sdev->list); + /* Flush the PASID cache and IOTLB for this device. + * Note that we do depend on the hardware *not* using + * the PASID any more. Just as we depend on other + * devices never using PASIDs that they have no right + * to use. We have a *shared* PASID table, because it's + * large and has to be physically contiguous. So it's + * hard to be as defensive as we might like. */ + intel_pasid_tear_down_entry(iommu, dev, svm->pasid); + intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); + kfree_rcu(sdev, rcu); + + if (list_empty(&svm->devs)) { + ioasid_free(svm->pasid); + if (svm->mm) + mmu_notifier_unregister(&svm->notifier, svm->mm); + list_del(&svm->list); + /* We mandate that no page faults may be outstanding + * for the PASID when intel_svm_unbind_mm() is called. + * If that is not obeyed, subtle errors will happen. + * Let's make them less subtle... */ + memset(svm, 0x6b, sizeof(*svm)); + kfree(svm); } - break; } + break; } out: mutex_unlock(&pasid_mutex); @@ -454,10 +479,14 @@ int intel_svm_is_pasid_valid(struct device *dev, int pasid) if (!iommu) goto out; - svm = intel_pasid_lookup_id(pasid); + svm = ioasid_find(NULL, pasid, NULL); if (!svm) goto out; + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } /* init_mm is used in this case */ if (!svm->mm) ret = 1; @@ -564,13 +593,12 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (!svm || svm->pasid != req->pasid) { rcu_read_lock(); - svm = intel_pasid_lookup_id(req->pasid); + svm = ioasid_find(NULL, req->pasid, NULL); /* It *can't* go away, because the driver is not permitted * to unbind the mm while any page faults are outstanding. * So we only need RCU to protect the internal idr code. */ rcu_read_unlock(); - - if (!svm) { + if (IS_ERR_OR_NULL(svm)) { pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n", iommu->name, req->pasid, ((unsigned long long *)req)[0], ((unsigned long long *)req)[1]); @@ -654,11 +682,10 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (req->priv_data_present) memcpy(&resp.qw2, req->priv_data, sizeof(req->priv_data)); + resp.qw2 = 0; + resp.qw3 = 0; + qi_submit_sync(&resp, iommu); } - resp.qw2 = 0; - resp.qw3 = 0; - qi_submit_sync(&resp, iommu); - head = (head + sizeof(*req)) & PRQ_RING_MASK; } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ffe6f685ceae..3e3528436e0b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -768,6 +768,7 @@ err_put_group: mutex_unlock(&group->mutex); dev->iommu_group = NULL; kobject_put(group->devices_kobj); + sysfs_remove_link(group->devices_kobj, device->name); err_free_name: kfree(device->name); err_remove_link: @@ -2256,6 +2257,25 @@ void iommu_put_resv_regions(struct device *dev, struct list_head *list) ops->put_resv_regions(dev, list); } +/** + * generic_iommu_put_resv_regions - Reserved region driver helper + * @dev: device for which to free reserved regions + * @list: reserved region list for device + * + * IOMMU drivers can use this to implement their .put_resv_regions() callback + * for simple reservations. Memory allocated for each reserved region will be + * freed. If an IOMMU driver allocates additional resources per region, it is + * going to have to implement a custom callback. + */ +void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list) +{ + struct iommu_resv_region *entry, *next; + + list_for_each_entry_safe(entry, next, list, list) + kfree(entry); +} +EXPORT_SYMBOL(generic_iommu_put_resv_regions); + struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, enum iommu_resv_type type) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index c7a914b9bbbc..0e6a9536eca6 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -233,7 +233,7 @@ static DEFINE_MUTEX(iova_cache_mutex); struct iova *alloc_iova_mem(void) { - return kmem_cache_zalloc(iova_cache, GFP_ATOMIC); + return kmem_cache_zalloc(iova_cache, GFP_ATOMIC | __GFP_NOWARN); } EXPORT_SYMBOL(alloc_iova_mem); diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 315c7cc4f99d..cce329d71fba 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -837,14 +837,6 @@ static void viommu_get_resv_regions(struct device *dev, struct list_head *head) iommu_dma_get_resv_regions(dev, head); } -static void viommu_put_resv_regions(struct device *dev, struct list_head *head) -{ - struct iommu_resv_region *entry, *next; - - list_for_each_entry_safe(entry, next, head, list) - kfree(entry); -} - static struct iommu_ops viommu_ops; static struct virtio_driver virtio_iommu_drv; @@ -914,7 +906,7 @@ static int viommu_add_device(struct device *dev) err_unlink_dev: iommu_device_unlink(&viommu->iommu, dev); err_free_dev: - viommu_put_resv_regions(dev, &vdev->resv_regions); + generic_iommu_put_resv_regions(dev, &vdev->resv_regions); kfree(vdev); return ret; @@ -932,7 +924,7 @@ static void viommu_remove_device(struct device *dev) iommu_group_remove_device(dev); iommu_device_unlink(&vdev->viommu->iommu, dev); - viommu_put_resv_regions(dev, &vdev->resv_regions); + generic_iommu_put_resv_regions(dev, &vdev->resv_regions); kfree(vdev); } @@ -961,7 +953,7 @@ static struct iommu_ops viommu_ops = { .remove_device = viommu_remove_device, .device_group = viommu_device_group, .get_resv_regions = viommu_get_resv_regions, - .put_resv_regions = viommu_put_resv_regions, + .put_resv_regions = generic_iommu_put_resv_regions, .of_xlate = viommu_of_xlate, }; |