diff options
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r-- | arch/powerpc/platforms/powernv/Makefile | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/idle.c | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/npu-dma.c | 117 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-fadump.c | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal.c | 4 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda-tce.c | 28 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda.c | 299 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.c | 20 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.h | 28 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/vas-api.c | 278 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/vas-debug.c | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/vas-fault.c | 382 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/vas-window.c | 238 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/vas.c | 85 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/vas.h | 59 |
15 files changed, 1230 insertions, 316 deletions
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index c0f8120045c3..fe3f0fb5aeca 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -17,7 +17,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o -obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o +obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o vas-fault.o vas-api.o obj-$(CONFIG_OCXL_BASE) += ocxl.o obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o obj-$(CONFIG_PPC_SECURE_BOOT) += opal-secvar.o diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 78599bca66c2..2dd467383a88 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -1270,7 +1270,7 @@ static int pnv_parse_cpuidle_dt(void) /* Read residencies */ if (of_property_read_u32_array(np, "ibm,cpu-idle-state-residency-ns", temp_u32, nr_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-latencies-ns in DT\n"); + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n"); rc = -EINVAL; goto out; } diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index b95b9e3c4c98..abeaa533b976 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -15,6 +15,7 @@ #include <asm/debugfs.h> #include <asm/powernv.h> +#include <asm/ppc-pci.h> #include <asm/opal.h> #include "pci.h" @@ -425,9 +426,10 @@ static void pnv_comp_attach_table_group(struct npu_comp *npucomp, ++npucomp->pe_num; } -struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) +static struct iommu_table_group * + pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) { - struct iommu_table_group *table_group; + struct iommu_table_group *compound_group; struct npu_comp *npucomp; struct pci_dev *gpdev = NULL; struct pci_controller *hose; @@ -446,39 +448,52 @@ struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) hose = pci_bus_to_host(npdev->bus); if (hose->npu) { - table_group = &hose->npu->npucomp.table_group; - - if (!table_group->group) { - table_group->ops = &pnv_npu_peers_ops; - iommu_register_group(table_group, - hose->global_number, - pe->pe_number); - } + /* P9 case: compound group is per-NPU (all gpus, all links) */ + npucomp = &hose->npu->npucomp; } else { - /* Create a group for 1 GPU and attached NPUs for POWER8 */ - pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL); - table_group = &pe->npucomp->table_group; - table_group->ops = &pnv_npu_peers_ops; - iommu_register_group(table_group, hose->global_number, - pe->pe_number); + /* P8 case: Compound group is per-GPU (1 gpu, 2 links) */ + npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL); } - /* Steal capabilities from a GPU PE */ - table_group->max_dynamic_windows_supported = - pe->table_group.max_dynamic_windows_supported; - table_group->tce32_start = pe->table_group.tce32_start; - table_group->tce32_size = pe->table_group.tce32_size; - table_group->max_levels = pe->table_group.max_levels; - if (!table_group->pgsizes) - table_group->pgsizes = pe->table_group.pgsizes; + compound_group = &npucomp->table_group; + if (!compound_group->group) { + compound_group->ops = &pnv_npu_peers_ops; + iommu_register_group(compound_group, hose->global_number, + pe->pe_number); - npucomp = container_of(table_group, struct npu_comp, table_group); + /* Steal capabilities from a GPU PE */ + compound_group->max_dynamic_windows_supported = + pe->table_group.max_dynamic_windows_supported; + compound_group->tce32_start = pe->table_group.tce32_start; + compound_group->tce32_size = pe->table_group.tce32_size; + compound_group->max_levels = pe->table_group.max_levels; + if (!compound_group->pgsizes) + compound_group->pgsizes = pe->table_group.pgsizes; + } + + /* + * The gpu would have been added to the iommu group that's created + * for the PE. Pull it out now. + */ + iommu_del_device(&gpdev->dev); + + /* + * I'm not sure this is strictly required, but it's probably a good idea + * since the table_group for the PE is going to be attached to the + * compound table group. If we leave the PE's iommu group active then + * we might have the same table_group being modifiable via two sepeate + * iommu groups. + */ + iommu_group_put(pe->table_group.group); + + /* now put the GPU into the compound group */ pnv_comp_attach_table_group(npucomp, pe); + iommu_add_device(compound_group, &gpdev->dev); - return table_group; + return compound_group; } -struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) +static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) { struct iommu_table_group *table_group; struct npu_comp *npucomp; @@ -521,6 +536,54 @@ struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) return table_group; } + +void pnv_pci_npu_setup_iommu_groups(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; + + /* + * For non-nvlink devices the IOMMU group is registered when the PE is + * configured and devices are added to the group when the per-device + * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is + * only initialise for "normal" IODA PHBs. + * + * For NVLink devices we need to ensure the NVLinks and the GPU end up + * in the same IOMMU group, so that's handled here. + */ + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->type == PNV_PHB_IODA2) + list_for_each_entry(pe, &phb->ioda.pe_list, list) + pnv_try_setup_npu_table_group(pe); + } + + /* + * Now we have all PHBs discovered, time to add NPU devices to + * the corresponding IOMMU groups. + */ + list_for_each_entry(hose, &hose_list, list_node) { + unsigned long pgsizes; + + phb = hose->private_data; + + if (phb->type != PNV_PHB_NPU_NVLINK) + continue; + + pgsizes = pnv_ioda_parse_tce_sizes(phb); + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + /* + * IODA2 bridges get this set up from + * pci_controller_ops::setup_bridge but NPU bridges + * do not have this hook defined so we do it here. + */ + pe->table_group.pgsizes = pgsizes; + pnv_npu_compound_attach(pe); + } + } +} #endif /* CONFIG_IOMMU_API */ int pnv_npu2_init(struct pci_controller *hose) diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index d361d37d975f..9a360ced663b 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -671,7 +671,7 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) * Firmware supports 32-bit field for size. Align it to PAGE_SIZE * and request firmware to copy multiple kernel boot memory regions. */ - fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE); + fadump_conf->max_copy_size = ALIGN_DOWN(U32_MAX, PAGE_SIZE); /* * Check if dump has been initiated on last reboot. diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 2b3dfd0b6cdd..d95954ad4c0a 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -811,6 +811,10 @@ static int opal_add_one_export(struct kobject *parent, const char *export_name, goto out; attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) { + rc = -ENOMEM; + goto out; + } name = kstrdup(export_name, GFP_KERNEL); if (!name) { rc = -ENOMEM; diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 5dc6847d5f4c..f923359d8afc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -17,6 +17,34 @@ #include <asm/tce.h> #include "pci.h" +unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) +{ + struct pci_controller *hose = phb->hose; + struct device_node *dn = hose->dn; + unsigned long mask = 0; + int i, rc, count; + u32 val; + + count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes"); + if (count <= 0) { + mask = SZ_4K | SZ_64K; + /* Add 16M for POWER8 by default */ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + !cpu_has_feature(CPU_FTR_ARCH_300)) + mask |= SZ_16M | SZ_256M; + return mask; + } + + for (i = 0; i < count; i++) { + rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes", + i, &val); + if (rc == 0) + mask |= 1ULL << val; + } + + return mask; +} + void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned int page_shift) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 57d3a6af1d52..73a63efcf855 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -51,6 +51,7 @@ static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK", "NPU_OCAPI" }; static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +static void pnv_pci_configure_bus(struct pci_bus *bus); void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) @@ -264,8 +265,8 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, if (!r->parent || !pnv_pci_is_m64(phb, r)) continue; - start = _ALIGN_DOWN(r->start - base, sgsz); - end = _ALIGN_UP(r->end - base, sgsz); + start = ALIGN_DOWN(r->start - base, sgsz); + end = ALIGN(r->end - base, sgsz); for (segno = start / sgsz; segno < end / sgsz; segno++) { if (pe_bitmap) set_bit(segno, pe_bitmap); @@ -361,7 +362,7 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) return NULL; /* Allocate bitmap */ - size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); + size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); pe_alloc = kzalloc(size, GFP_KERNEL); if (!pe_alloc) { pr_warn("%s: Out of memory !\n", @@ -660,6 +661,16 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) return state; } +struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn) +{ + int pe_number = phb->ioda.pe_rmap[bdfn]; + + if (pe_number == IODA_INVALID_PE) + return NULL; + + return &phb->ioda.pe_array[pe_number]; +} + struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); @@ -1110,34 +1121,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) return pe; } -static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - struct pci_dn *pdn = pci_get_pdn(dev); - - if (pdn == NULL) { - pr_warn("%s: No device node associated with device !\n", - pci_name(dev)); - continue; - } - - /* - * In partial hotplug case, the PCI device might be still - * associated with the PE and needn't attach it to the PE - * again. - */ - if (pdn->pe_number != IODA_INVALID_PE) - continue; - - pe->device_count++; - pdn->pe_number = pe->pe_number; - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_same_PE(dev->subordinate, pe); - } -} - /* * There're 2 types of PCI bus sensitive PEs: One that is compromised of * single PCI bus. Another one that contains the primary PCI bus and its @@ -1156,15 +1139,13 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) * We should reuse it instead of allocating a new one. */ pe_num = phb->ioda.pe_rmap[bus->number << 8]; - if (pe_num != IODA_INVALID_PE) { + if (WARN_ON(pe_num != IODA_INVALID_PE)) { pe = &phb->ioda.pe_array[pe_num]; - pnv_ioda_setup_same_PE(bus, pe); return NULL; } /* PE number for root bus should have been reserved */ - if (pci_is_root_bus(bus) && - phb->ioda.root_pe_idx != IODA_INVALID_PE) + if (pci_is_root_bus(bus)) pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx]; /* Check if PE is determined by M64 */ @@ -1202,9 +1183,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) return NULL; } - /* Associate it with all child devices */ - pnv_ioda_setup_same_PE(bus, pe); - /* Put PE to the list */ list_add_tail(&pe->list, &phb->ioda.pe_list); @@ -1288,7 +1266,7 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus) pnv_ioda_setup_npu_PE(pdev); } -static void pnv_pci_ioda_setup_PEs(void) +static void pnv_pci_ioda_setup_nvlink(void) { struct pci_controller *hose; struct pnv_phb *phb; @@ -1312,6 +1290,11 @@ static void pnv_pci_ioda_setup_PEs(void) list_for_each_entry(pe, &phb->ioda.pe_list, list) pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV); } + +#ifdef CONFIG_IOMMU_API + /* setup iommu groups so we can do nvlink pass-thru */ + pnv_pci_npu_setup_iommu_groups(); +#endif } #ifdef CONFIG_PCI_IOV @@ -1550,11 +1533,6 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); -#ifdef CONFIG_IOMMU_API -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe, - struct iommu_table_group *table_group, struct pci_bus *bus); - -#endif static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) { struct pci_bus *bus; @@ -1619,11 +1597,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) } pnv_pci_ioda2_setup_dma_pe(phb, pe); -#ifdef CONFIG_IOMMU_API - iommu_register_group(&pe->table_group, - pe->phb->hose->global_number, pe->pe_number); - pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL); -#endif } } @@ -1767,24 +1740,39 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev) struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; - /* - * The function can be called while the PE# - * hasn't been assigned. Do nothing for the - * case. - */ - if (!pdn || pdn->pe_number == IODA_INVALID_PE) - return; + /* Check if the BDFN for this device is associated with a PE yet */ + pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8)); + if (!pe) { + /* VF PEs should be pre-configured in pnv_pci_sriov_enable() */ + if (WARN_ON(pdev->is_virtfn)) + return; + + pnv_pci_configure_bus(pdev->bus); + pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8)); + pci_info(pdev, "Configured PE#%x\n", pe ? pe->pe_number : 0xfffff); + + + /* + * If we can't setup the IODA PE something has gone horribly + * wrong and we can't enable DMA for the device. + */ + if (WARN_ON(!pe)) + return; + } else { + pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number); + } + + if (pdn) + pdn->pe_number = pe->pe_number; + pe->device_count++; - pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); pdev->dev.archdata.dma_offset = pe->tce_bypass_base; set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); - /* - * Note: iommu_add_device() will fail here as - * for physical PE: the device is already added by now; - * for virtual PE: sysfs entries are not ready yet and - * tce_iommu_bus_notifier will add the device to a group later. - */ + + /* PEs with a DMA weight of zero won't have a group */ + if (pe->table_group.group) + iommu_add_device(&pe->table_group, &pdev->dev); } /* @@ -2297,9 +2285,6 @@ found: pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; iommu_init_table(tbl, phb->hose->node, 0, 0); - if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus); - return; fail: /* XXX Failure: Try to fallback to 64-bit only ? */ @@ -2537,7 +2522,7 @@ unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, direct_table_size = 1UL << table_shift; for ( ; levels; --levels) { - bytes += _ALIGN_UP(tce_table_size, direct_table_size); + bytes += ALIGN(tce_table_size, direct_table_size); tce_table_size /= direct_table_size; tce_table_size <<= 3; @@ -2596,137 +2581,8 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .take_ownership = pnv_ioda2_take_ownership, .release_ownership = pnv_ioda2_release_ownership, }; - -static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe, - struct iommu_table_group *table_group, - struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - iommu_add_device(table_group, &dev->dev); - - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_bus_iommu_group_add_devices(pe, - table_group, dev->subordinate); - } -} - -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe, - struct iommu_table_group *table_group, struct pci_bus *bus) -{ - - if (pe->flags & PNV_IODA_PE_DEV) - iommu_add_device(table_group, &pe->pdev->dev); - - if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus) - pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group, - bus); -} - -static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); - -static void pnv_pci_ioda_setup_iommu_api(void) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - - /* - * There are 4 types of PEs: - * - PNV_IODA_PE_BUS: a downstream port with an adapter, - * created from pnv_pci_setup_bridge(); - * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it, - * created from pnv_pci_setup_bridge(); - * - PNV_IODA_PE_VF: a SRIOV virtual function, - * created from pnv_pcibios_sriov_enable(); - * - PNV_IODA_PE_DEV: an NPU or OCAPI device, - * created from pnv_pci_ioda_fixup(). - * - * Normally a PE is represented by an IOMMU group, however for - * devices with side channels the groups need to be more strict. - */ - list_for_each_entry(hose, &hose_list, list_node) { - phb = hose->private_data; - - if (phb->type == PNV_PHB_NPU_NVLINK || - phb->type == PNV_PHB_NPU_OCAPI) - continue; - - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - struct iommu_table_group *table_group; - - table_group = pnv_try_setup_npu_table_group(pe); - if (!table_group) { - if (!pnv_pci_ioda_pe_dma_weight(pe)) - continue; - - table_group = &pe->table_group; - iommu_register_group(&pe->table_group, - pe->phb->hose->global_number, - pe->pe_number); - } - pnv_ioda_setup_bus_iommu_group(pe, table_group, - pe->pbus); - } - } - - /* - * Now we have all PHBs discovered, time to add NPU devices to - * the corresponding IOMMU groups. - */ - list_for_each_entry(hose, &hose_list, list_node) { - unsigned long pgsizes; - - phb = hose->private_data; - - if (phb->type != PNV_PHB_NPU_NVLINK) - continue; - - pgsizes = pnv_ioda_parse_tce_sizes(phb); - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - /* - * IODA2 bridges get this set up from - * pci_controller_ops::setup_bridge but NPU bridges - * do not have this hook defined so we do it here. - */ - pe->table_group.pgsizes = pgsizes; - pnv_npu_compound_attach(pe); - } - } -} -#else /* !CONFIG_IOMMU_API */ -static void pnv_pci_ioda_setup_iommu_api(void) { }; #endif -static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) -{ - struct pci_controller *hose = phb->hose; - struct device_node *dn = hose->dn; - unsigned long mask = 0; - int i, rc, count; - u32 val; - - count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes"); - if (count <= 0) { - mask = SZ_4K | SZ_64K; - /* Add 16M for POWER8 by default */ - if (cpu_has_feature(CPU_FTR_ARCH_207S) && - !cpu_has_feature(CPU_FTR_ARCH_300)) - mask |= SZ_16M | SZ_256M; - return mask; - } - - for (i = 0; i < count; i++) { - rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes", - i, &val); - if (rc == 0) - mask |= 1ULL << val; - } - - return mask; -} - static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { @@ -2749,16 +2605,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, IOMMU_TABLE_GROUP_MAX_TABLES; pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS; pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb); -#ifdef CONFIG_IOMMU_API - pe->table_group.ops = &pnv_pci_ioda2_ops; -#endif rc = pnv_pci_ioda2_setup_default_config(pe); if (rc) return; - if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus); +#ifdef CONFIG_IOMMU_API + pe->table_group.ops = &pnv_pci_ioda2_ops; + iommu_register_group(&pe->table_group, phb->hose->global_number, + pe->pe_number); +#endif } int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) @@ -3220,8 +3076,7 @@ static void pnv_pci_enable_bridges(void) static void pnv_pci_ioda_fixup(void) { - pnv_pci_ioda_setup_PEs(); - pnv_pci_ioda_setup_iommu_api(); + pnv_pci_ioda_setup_nvlink(); pnv_pci_ioda_create_dbgfs(); pnv_pci_enable_bridges(); @@ -3333,28 +3188,18 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus, } } -static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type) +static void pnv_pci_configure_bus(struct pci_bus *bus) { struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; struct pci_dev *bridge = bus->self; struct pnv_ioda_pe *pe; - bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE); - - /* Extend bridge's windows if necessary */ - pnv_pci_fixup_bridge_resources(bus, type); + bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE); - /* The PE for root bus should be realized before any one else */ - if (!phb->ioda.root_pe_populated) { - pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false); - if (pe) { - phb->ioda.root_pe_idx = pe->pe_number; - phb->ioda.root_pe_populated = true; - } - } + dev_info(&bus->dev, "Configuring PE for bus\n"); /* Don't assign PE to PCI bus, which doesn't have subordinate devices */ - if (list_empty(&bus->devices)) + if (WARN_ON(list_empty(&bus->devices))) return; /* Reserve PEs according to used M64 resources */ @@ -3599,6 +3444,8 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) struct pnv_phb *phb = pe->phb; struct pnv_ioda_pe *slave, *tmp; + pe_info(pe, "Releasing PE\n"); + mutex_lock(&phb->ioda.pe_list_mutex); list_del(&pe->list); mutex_unlock(&phb->ioda.pe_list_mutex); @@ -3633,11 +3480,10 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) * that it can be populated again in PCI hot add path. The PE * shouldn't be destroyed as it's the global reserved resource. */ - if (phb->ioda.root_pe_populated && - phb->ioda.root_pe_idx == pe->pe_number) - phb->ioda.root_pe_populated = false; - else - pnv_ioda_free_pe(pe); + if (phb->ioda.root_pe_idx == pe->pe_number) + return; + + pnv_ioda_free_pe(pe); } static void pnv_pci_release_device(struct pci_dev *pdev) @@ -3715,7 +3561,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .enable_device_hook = pnv_pci_enable_device_hook, .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, - .setup_bridge = pnv_pci_setup_bridge, + .setup_bridge = pnv_pci_fixup_bridge_resources, .reset_secondary_bus = pnv_pci_reset_secondary_bus, .shutdown = pnv_pci_ioda_shutdown, }; @@ -3745,6 +3591,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, struct pnv_phb *phb; unsigned long size, m64map_off, m32map_off, pemap_off; unsigned long iomap_off = 0, dma32map_off = 0; + struct pnv_ioda_pe *root_pe; struct resource r; const __be64 *prop64; const __be32 *prop32; @@ -3863,7 +3710,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, PNV_IODA1_DMA32_SEGSIZE; /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ - size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8, + size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8, sizeof(unsigned long)); m64map_off = size; size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]); @@ -3912,7 +3759,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1; pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx); } else { - phb->ioda.root_pe_idx = IODA_INVALID_PE; + /* otherwise just allocate one */ + root_pe = pnv_ioda_alloc_pe(phb); + phb->ioda.root_pe_idx = root_pe->pe_number; } INIT_LIST_HEAD(&phb->ioda.pe_list); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 5bf818246339..091fe1cf386b 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -955,28 +955,8 @@ static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct pci_dev *pdev; - struct pci_dn *pdn; - struct pnv_ioda_pe *pe; - struct pci_controller *hose; - struct pnv_phb *phb; switch (action) { - case BUS_NOTIFY_ADD_DEVICE: - pdev = to_pci_dev(dev); - pdn = pci_get_pdn(pdev); - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - - WARN_ON_ONCE(!phb); - if (!pdn || pdn->pe_number == IODA_INVALID_PE || !phb) - return 0; - - pe = &phb->ioda.pe_array[pdn->pe_number]; - if (!pe->table_group.group) - return 0; - iommu_add_device(&pe->table_group, dev); - return 0; case BUS_NOTIFY_DEL_DEVICE: iommu_del_device(dev); return 0; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index d3bbdeab3a32..51c254f2f3cb 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -33,6 +33,24 @@ enum pnv_phb_model { #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ +/* + * A brief note on PNV_IODA_PE_BUS_ALL + * + * This is needed because of the behaviour of PCIe-to-PCI bridges. The PHB uses + * the Requester ID field of the PCIe request header to determine the device + * (and PE) that initiated a DMA. In legacy PCI individual memory read/write + * requests aren't tagged with the RID. To work around this the PCIe-to-PCI + * bridge will use (secondary_bus_no << 8) | 0x00 as the RID on the PCIe side. + * + * PCIe-to-X bridges have a similar issue even though PCI-X requests also have + * a RID in the transaction header. The PCIe-to-X bridge is permitted to "take + * ownership" of a transaction by a PCI-X device when forwarding it to the PCIe + * side of the bridge. + * + * To work around these problems we use the BUS_ALL flag since every subordinate + * bus of the bridge should go into the same PE. + */ + /* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. */ #define PNV_IODA_STOPPED_STATE 0x8000000000000000 @@ -118,7 +136,6 @@ struct pnv_phb { unsigned int total_pe_num; unsigned int reserved_pe_idx; unsigned int root_pe_idx; - bool root_pe_populated; /* 32-bit MMIO window */ unsigned int m32_size; @@ -190,6 +207,7 @@ extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); +extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, @@ -209,11 +227,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, /* Nvlink functions */ extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); -extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); -extern struct iommu_table_group *pnv_try_setup_npu_table_group( - struct pnv_ioda_pe *pe); -extern struct iommu_table_group *pnv_npu_compound_attach( - struct pnv_ioda_pe *pe); +extern void pnv_pci_npu_setup_iommu_groups(void); /* pci-ioda-tce.c */ #define POWERNV_IOMMU_DEFAULT_LEVELS 2 @@ -244,4 +258,6 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned int page_shift); +extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); + #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/vas-api.c b/arch/powerpc/platforms/powernv/vas-api.c new file mode 100644 index 000000000000..98ed5d8c5441 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-api.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * VAS user space API for its accelerators (Only NX-GZIP is supported now) + * Copyright (C) 2019 Haren Myneni, IBM Corp + */ + +#include <linux/kernel.h> +#include <linux/device.h> +#include <linux/cdev.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <asm/vas.h> +#include <uapi/asm/vas-api.h> +#include "vas.h" + +/* + * The driver creates the device node that can be used as follows: + * For NX-GZIP + * + * fd = open("/dev/crypto/nx-gzip", O_RDWR); + * rc = ioctl(fd, VAS_TX_WIN_OPEN, &attr); + * paste_addr = mmap(NULL, PAGE_SIZE, prot, MAP_SHARED, fd, 0ULL). + * vas_copy(&crb, 0, 1); + * vas_paste(paste_addr, 0, 1); + * close(fd) or exit process to close window. + * + * where "vas_copy" and "vas_paste" are defined in copy-paste.h. + * copy/paste returns to the user space directly. So refer NX hardware + * documententation for exact copy/paste usage and completion / error + * conditions. + */ + +/* + * Wrapper object for the nx-gzip device - there is just one instance of + * this node for the whole system. + */ +static struct coproc_dev { + struct cdev cdev; + struct device *device; + char *name; + dev_t devt; + struct class *class; + enum vas_cop_type cop_type; +} coproc_device; + +struct coproc_instance { + struct coproc_dev *coproc; + struct vas_window *txwin; +}; + +static char *coproc_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "crypto/%s", dev_name(dev)); +} + +static int coproc_open(struct inode *inode, struct file *fp) +{ + struct coproc_instance *cp_inst; + + cp_inst = kzalloc(sizeof(*cp_inst), GFP_KERNEL); + if (!cp_inst) + return -ENOMEM; + + cp_inst->coproc = container_of(inode->i_cdev, struct coproc_dev, + cdev); + fp->private_data = cp_inst; + + return 0; +} + +static int coproc_ioc_tx_win_open(struct file *fp, unsigned long arg) +{ + void __user *uptr = (void __user *)arg; + struct vas_tx_win_attr txattr = {}; + struct vas_tx_win_open_attr uattr; + struct coproc_instance *cp_inst; + struct vas_window *txwin; + int rc, vasid; + + cp_inst = fp->private_data; + + /* + * One window for file descriptor + */ + if (cp_inst->txwin) + return -EEXIST; + + rc = copy_from_user(&uattr, uptr, sizeof(uattr)); + if (rc) { + pr_err("%s(): copy_from_user() returns %d\n", __func__, rc); + return -EFAULT; + } + + if (uattr.version != 1) { + pr_err("Invalid version\n"); + return -EINVAL; + } + + vasid = uattr.vas_id; + + vas_init_tx_win_attr(&txattr, cp_inst->coproc->cop_type); + + txattr.lpid = mfspr(SPRN_LPID); + txattr.pidr = mfspr(SPRN_PID); + txattr.user_win = true; + txattr.rsvd_txbuf_count = false; + txattr.pswid = false; + + pr_devel("Pid %d: Opening txwin, PIDR %ld\n", txattr.pidr, + mfspr(SPRN_PID)); + + txwin = vas_tx_win_open(vasid, cp_inst->coproc->cop_type, &txattr); + if (IS_ERR(txwin)) { + pr_err("%s() vas_tx_win_open() failed, %ld\n", __func__, + PTR_ERR(txwin)); + return PTR_ERR(txwin); + } + + cp_inst->txwin = txwin; + + return 0; +} + +static int coproc_release(struct inode *inode, struct file *fp) +{ + struct coproc_instance *cp_inst = fp->private_data; + + if (cp_inst->txwin) { + vas_win_close(cp_inst->txwin); + cp_inst->txwin = NULL; + } + + kfree(cp_inst); + fp->private_data = NULL; + + /* + * We don't know here if user has other receive windows + * open, so we can't really call clear_thread_tidr(). + * So, once the process calls set_thread_tidr(), the + * TIDR value sticks around until process exits, resulting + * in an extra copy in restore_sprs(). + */ + + return 0; +} + +static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct coproc_instance *cp_inst = fp->private_data; + struct vas_window *txwin; + unsigned long pfn; + u64 paste_addr; + pgprot_t prot; + int rc; + + txwin = cp_inst->txwin; + + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { + pr_debug("%s(): size 0x%zx, PAGE_SIZE 0x%zx\n", __func__, + (vma->vm_end - vma->vm_start), PAGE_SIZE); + return -EINVAL; + } + + /* Ensure instance has an open send window */ + if (!txwin) { + pr_err("%s(): No send window open?\n", __func__); + return -EINVAL; + } + + vas_win_paste_addr(txwin, &paste_addr, NULL); + pfn = paste_addr >> PAGE_SHIFT; + + /* flags, page_prot from cxl_mmap(), except we want cachable */ + vma->vm_flags |= VM_IO | VM_PFNMAP; + vma->vm_page_prot = pgprot_cached(vma->vm_page_prot); + + prot = __pgprot(pgprot_val(vma->vm_page_prot) | _PAGE_DIRTY); + + rc = remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + vma->vm_end - vma->vm_start, prot); + + pr_devel("%s(): paste addr %llx at %lx, rc %d\n", __func__, + paste_addr, vma->vm_start, rc); + + return rc; +} + +static long coproc_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case VAS_TX_WIN_OPEN: + return coproc_ioc_tx_win_open(fp, arg); + default: + return -EINVAL; + } +} + +static struct file_operations coproc_fops = { + .open = coproc_open, + .release = coproc_release, + .mmap = coproc_mmap, + .unlocked_ioctl = coproc_ioctl, +}; + +/* + * Supporting only nx-gzip coprocessor type now, but this API code + * extended to other coprocessor types later. + */ +int vas_register_coproc_api(struct module *mod, enum vas_cop_type cop_type, + const char *name) +{ + int rc = -EINVAL; + dev_t devno; + + rc = alloc_chrdev_region(&coproc_device.devt, 1, 1, name); + if (rc) { + pr_err("Unable to allocate coproc major number: %i\n", rc); + return rc; + } + + pr_devel("%s device allocated, dev [%i,%i]\n", name, + MAJOR(coproc_device.devt), MINOR(coproc_device.devt)); + + coproc_device.class = class_create(mod, name); + if (IS_ERR(coproc_device.class)) { + rc = PTR_ERR(coproc_device.class); + pr_err("Unable to create %s class %d\n", name, rc); + goto err_class; + } + coproc_device.class->devnode = coproc_devnode; + coproc_device.cop_type = cop_type; + + coproc_fops.owner = mod; + cdev_init(&coproc_device.cdev, &coproc_fops); + + devno = MKDEV(MAJOR(coproc_device.devt), 0); + rc = cdev_add(&coproc_device.cdev, devno, 1); + if (rc) { + pr_err("cdev_add() failed %d\n", rc); + goto err_cdev; + } + + coproc_device.device = device_create(coproc_device.class, NULL, + devno, NULL, name, MINOR(devno)); + if (IS_ERR(coproc_device.device)) { + rc = PTR_ERR(coproc_device.device); + pr_err("Unable to create coproc-%d %d\n", MINOR(devno), rc); + goto err; + } + + pr_devel("%s: Added dev [%d,%d]\n", __func__, MAJOR(devno), + MINOR(devno)); + + return 0; + +err: + cdev_del(&coproc_device.cdev); +err_cdev: + class_destroy(coproc_device.class); +err_class: + unregister_chrdev_region(coproc_device.devt, 1); + return rc; +} +EXPORT_SYMBOL_GPL(vas_register_coproc_api); + +void vas_unregister_coproc_api(void) +{ + dev_t devno; + + cdev_del(&coproc_device.cdev); + devno = MKDEV(MAJOR(coproc_device.devt), 0); + device_destroy(coproc_device.class, devno); + + class_destroy(coproc_device.class); + unregister_chrdev_region(coproc_device.devt, 1); +} +EXPORT_SYMBOL_GPL(vas_unregister_coproc_api); diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c index 44035a3d6414..41fa90d2f4ab 100644 --- a/arch/powerpc/platforms/powernv/vas-debug.c +++ b/arch/powerpc/platforms/powernv/vas-debug.c @@ -38,7 +38,7 @@ static int info_show(struct seq_file *s, void *private) seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop), window->tx_win ? "Send" : "Receive"); - seq_printf(s, "Pid : %d\n", window->pid); + seq_printf(s, "Pid : %d\n", vas_window_pid(window)); unlock: mutex_unlock(&vas_mutex); diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c new file mode 100644 index 000000000000..25db70be4c9c --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-fault.c @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * VAS Fault handling. + * Copyright 2019, IBM Corporation + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/kthread.h> +#include <linux/sched/signal.h> +#include <linux/mmu_context.h> +#include <asm/icswx.h> + +#include "vas.h" + +/* + * The maximum FIFO size for fault window can be 8MB + * (VAS_RX_FIFO_SIZE_MAX). Using 4MB FIFO since each VAS + * instance will be having fault window. + * 8MB FIFO can be used if expects more faults for each VAS + * instance. + */ +#define VAS_FAULT_WIN_FIFO_SIZE (4 << 20) + +static void dump_crb(struct coprocessor_request_block *crb) +{ + struct data_descriptor_entry *dde; + struct nx_fault_stamp *nx; + + dde = &crb->source; + pr_devel("SrcDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n", + be64_to_cpu(dde->address), be32_to_cpu(dde->length), + dde->count, dde->index, dde->flags); + + dde = &crb->target; + pr_devel("TgtDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n", + be64_to_cpu(dde->address), be32_to_cpu(dde->length), + dde->count, dde->index, dde->flags); + + nx = &crb->stamp.nx; + pr_devel("NX Stamp: PSWID 0x%x, FSA 0x%llx, flags 0x%x, FS 0x%x\n", + be32_to_cpu(nx->pswid), + be64_to_cpu(crb->stamp.nx.fault_storage_addr), + nx->flags, nx->fault_status); +} + +/* + * Update the CSB to indicate a translation error. + * + * User space will be polling on CSB after the request is issued. + * If NX can handle the request without any issues, it updates CSB. + * Whereas if NX encounters page fault, the kernel will handle the + * fault and update CSB with translation error. + * + * If we are unable to update the CSB means copy_to_user failed due to + * invalid csb_addr, send a signal to the process. + */ +static void update_csb(struct vas_window *window, + struct coprocessor_request_block *crb) +{ + struct coprocessor_status_block csb; + struct kernel_siginfo info; + struct task_struct *tsk; + void __user *csb_addr; + struct pid *pid; + int rc; + + /* + * NX user space windows can not be opened for task->mm=NULL + * and faults will not be generated for kernel requests. + */ + if (WARN_ON_ONCE(!window->mm || !window->user_win)) + return; + + csb_addr = (void __user *)be64_to_cpu(crb->csb_addr); + + memset(&csb, 0, sizeof(csb)); + csb.cc = CSB_CC_TRANSLATION; + csb.ce = CSB_CE_TERMINATION; + csb.cs = 0; + csb.count = 0; + + /* + * NX operates and returns in BE format as defined CRB struct. + * So saves fault_storage_addr in BE as NX pastes in FIFO and + * expects user space to convert to CPU format. + */ + csb.address = crb->stamp.nx.fault_storage_addr; + csb.flags = 0; + + pid = window->pid; + tsk = get_pid_task(pid, PIDTYPE_PID); + /* + * Process closes send window after all pending NX requests are + * completed. In multi-thread applications, a child thread can + * open a window and can exit without closing it. May be some + * requests are pending or this window can be used by other + * threads later. We should handle faults if NX encounters + * pages faults on these requests. Update CSB with translation + * error and fault address. If csb_addr passed by user space is + * invalid, send SEGV signal to pid saved in window. If the + * child thread is not running, send the signal to tgid. + * Parent thread (tgid) will close this window upon its exit. + * + * pid and mm references are taken when window is opened by + * process (pid). So tgid is used only when child thread opens + * a window and exits without closing it. + */ + if (!tsk) { + pid = window->tgid; + tsk = get_pid_task(pid, PIDTYPE_PID); + /* + * Parent thread (tgid) will be closing window when it + * exits. So should not get here. + */ + if (WARN_ON_ONCE(!tsk)) + return; + } + + /* Return if the task is exiting. */ + if (tsk->flags & PF_EXITING) { + put_task_struct(tsk); + return; + } + + use_mm(window->mm); + rc = copy_to_user(csb_addr, &csb, sizeof(csb)); + /* + * User space polls on csb.flags (first byte). So add barrier + * then copy first byte with csb flags update. + */ + if (!rc) { + csb.flags = CSB_V; + /* Make sure update to csb.flags is visible now */ + smp_mb(); + rc = copy_to_user(csb_addr, &csb, sizeof(u8)); + } + unuse_mm(window->mm); + put_task_struct(tsk); + + /* Success */ + if (!rc) + return; + + pr_debug("Invalid CSB address 0x%p signalling pid(%d)\n", + csb_addr, pid_vnr(pid)); + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_errno = EFAULT; + info.si_code = SEGV_MAPERR; + info.si_addr = csb_addr; + + /* + * process will be polling on csb.flags after request is sent to + * NX. So generally CSB update should not fail except when an + * application passes invalid csb_addr. So an error message will + * be displayed and leave it to user space whether to ignore or + * handle this signal. + */ + rcu_read_lock(); + rc = kill_pid_info(SIGSEGV, &info, pid); + rcu_read_unlock(); + + pr_devel("%s(): pid %d kill_proc_info() rc %d\n", __func__, + pid_vnr(pid), rc); +} + +static void dump_fifo(struct vas_instance *vinst, void *entry) +{ + unsigned long *end = vinst->fault_fifo + vinst->fault_fifo_size; + unsigned long *fifo = entry; + int i; + + pr_err("Fault fifo size %d, Max crbs %d\n", vinst->fault_fifo_size, + vinst->fault_fifo_size / CRB_SIZE); + + /* Dump 10 CRB entries or until end of FIFO */ + pr_err("Fault FIFO Dump:\n"); + for (i = 0; i < 10*(CRB_SIZE/8) && fifo < end; i += 4, fifo += 4) { + pr_err("[%.3d, %p]: 0x%.16lx 0x%.16lx 0x%.16lx 0x%.16lx\n", + i, fifo, *fifo, *(fifo+1), *(fifo+2), *(fifo+3)); + } +} + +/* + * Process valid CRBs in fault FIFO. + * NX process user space requests, return credit and update the status + * in CRB. If it encounters transalation error when accessing CRB or + * request buffers, raises interrupt on the CPU to handle the fault. + * It takes credit on fault window, updates nx_fault_stamp in CRB with + * the following information and pastes CRB in fault FIFO. + * + * pswid - window ID of the window on which the request is sent. + * fault_storage_addr - fault address + * + * It can raise a single interrupt for multiple faults. Expects OS to + * process all valid faults and return credit for each fault on user + * space and fault windows. This fault FIFO control will be done with + * credit mechanism. NX can continuously paste CRBs until credits are not + * available on fault window. Otherwise, returns with RMA_reject. + * + * Total credits available on fault window: FIFO_SIZE(4MB)/CRBS_SIZE(128) + * + */ +irqreturn_t vas_fault_thread_fn(int irq, void *data) +{ + struct vas_instance *vinst = data; + struct coprocessor_request_block *crb, *entry; + struct coprocessor_request_block buf; + struct vas_window *window; + unsigned long flags; + void *fifo; + + crb = &buf; + + /* + * VAS can interrupt with multiple page faults. So process all + * valid CRBs within fault FIFO until reaches invalid CRB. + * We use CCW[0] and pswid to validate validate CRBs: + * + * CCW[0] Reserved bit. When NX pastes CRB, CCW[0]=0 + * OS sets this bit to 1 after reading CRB. + * pswid NX assigns window ID. Set pswid to -1 after + * reading CRB from fault FIFO. + * + * We exit this function if no valid CRBs are available to process. + * So acquire fault_lock and reset fifo_in_progress to 0 before + * exit. + * In case kernel receives another interrupt with different page + * fault, interrupt handler returns with IRQ_HANDLED if + * fifo_in_progress is set. Means these new faults will be + * handled by the current thread. Otherwise set fifo_in_progress + * and return IRQ_WAKE_THREAD to wake up thread. + */ + while (true) { + spin_lock_irqsave(&vinst->fault_lock, flags); + /* + * Advance the fault fifo pointer to next CRB. + * Use CRB_SIZE rather than sizeof(*crb) since the latter is + * aligned to CRB_ALIGN (256) but the CRB written to by VAS is + * only CRB_SIZE in len. + */ + fifo = vinst->fault_fifo + (vinst->fault_crbs * CRB_SIZE); + entry = fifo; + + if ((entry->stamp.nx.pswid == cpu_to_be32(FIFO_INVALID_ENTRY)) + || (entry->ccw & cpu_to_be32(CCW0_INVALID))) { + vinst->fifo_in_progress = 0; + spin_unlock_irqrestore(&vinst->fault_lock, flags); + return IRQ_HANDLED; + } + + spin_unlock_irqrestore(&vinst->fault_lock, flags); + vinst->fault_crbs++; + if (vinst->fault_crbs == (vinst->fault_fifo_size / CRB_SIZE)) + vinst->fault_crbs = 0; + + memcpy(crb, fifo, CRB_SIZE); + entry->stamp.nx.pswid = cpu_to_be32(FIFO_INVALID_ENTRY); + entry->ccw |= cpu_to_be32(CCW0_INVALID); + /* + * Return credit for the fault window. + */ + vas_return_credit(vinst->fault_win, false); + + pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d\n", + vinst->vas_id, vinst->fault_fifo, fifo, + vinst->fault_crbs); + + dump_crb(crb); + window = vas_pswid_to_window(vinst, + be32_to_cpu(crb->stamp.nx.pswid)); + + if (IS_ERR(window)) { + /* + * We got an interrupt about a specific send + * window but we can't find that window and we can't + * even clean it up (return credit on user space + * window). + * But we should not get here. + * TODO: Disable IRQ. + */ + dump_fifo(vinst, (void *)entry); + pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, fault_crbs %d bad CRB?\n", + vinst->vas_id, vinst->fault_fifo, fifo, + be32_to_cpu(crb->stamp.nx.pswid), + vinst->fault_crbs); + + WARN_ON_ONCE(1); + } else { + update_csb(window, crb); + /* + * Return credit for send window after processing + * fault CRB. + */ + vas_return_credit(window, true); + } + } +} + +irqreturn_t vas_fault_handler(int irq, void *dev_id) +{ + struct vas_instance *vinst = dev_id; + irqreturn_t ret = IRQ_WAKE_THREAD; + unsigned long flags; + + /* + * NX can generate an interrupt for multiple faults. So the + * fault handler thread process all CRBs until finds invalid + * entry. In case if NX sees continuous faults, it is possible + * that the thread function entered with the first interrupt + * can execute and process all valid CRBs. + * So wake up thread only if the fault thread is not in progress. + */ + spin_lock_irqsave(&vinst->fault_lock, flags); + + if (vinst->fifo_in_progress) + ret = IRQ_HANDLED; + else + vinst->fifo_in_progress = 1; + + spin_unlock_irqrestore(&vinst->fault_lock, flags); + + return ret; +} + +/* + * Fault window is opened per VAS instance. NX pastes fault CRB in fault + * FIFO upon page faults. + */ +int vas_setup_fault_window(struct vas_instance *vinst) +{ + struct vas_rx_win_attr attr; + + vinst->fault_fifo_size = VAS_FAULT_WIN_FIFO_SIZE; + vinst->fault_fifo = kzalloc(vinst->fault_fifo_size, GFP_KERNEL); + if (!vinst->fault_fifo) { + pr_err("Unable to alloc %d bytes for fault_fifo\n", + vinst->fault_fifo_size); + return -ENOMEM; + } + + /* + * Invalidate all CRB entries. NX pastes valid entry for each fault. + */ + memset(vinst->fault_fifo, FIFO_INVALID_ENTRY, vinst->fault_fifo_size); + vas_init_rx_win_attr(&attr, VAS_COP_TYPE_FAULT); + + attr.rx_fifo_size = vinst->fault_fifo_size; + attr.rx_fifo = vinst->fault_fifo; + + /* + * Max creds is based on number of CRBs can fit in the FIFO. + * (fault_fifo_size/CRB_SIZE). If 8MB FIFO is used, max creds + * will be 0xffff since the receive creds field is 16bits wide. + */ + attr.wcreds_max = vinst->fault_fifo_size / CRB_SIZE; + attr.lnotify_lpid = 0; + attr.lnotify_pid = mfspr(SPRN_PID); + attr.lnotify_tid = mfspr(SPRN_PID); + + vinst->fault_win = vas_rx_win_open(vinst->vas_id, VAS_COP_TYPE_FAULT, + &attr); + + if (IS_ERR(vinst->fault_win)) { + pr_err("VAS: Error %ld opening FaultWin\n", + PTR_ERR(vinst->fault_win)); + kfree(vinst->fault_fifo); + return PTR_ERR(vinst->fault_win); + } + + pr_devel("VAS: Created FaultWin %d, LPID/PID/TID [%d/%d/%d]\n", + vinst->fault_win->winid, attr.lnotify_lpid, + attr.lnotify_pid, attr.lnotify_tid); + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 0c0d27d17976..6434f9cb5aed 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -12,6 +12,8 @@ #include <linux/log2.h> #include <linux/rcupdate.h> #include <linux/cred.h> +#include <linux/sched/mm.h> +#include <linux/mmu_context.h> #include <asm/switch_to.h> #include <asm/ppc-opcode.h> #include "vas.h" @@ -24,7 +26,7 @@ * Compute the paste address region for the window @window using the * ->paste_base_addr and ->paste_win_id_shift we got from device tree. */ -static void compute_paste_address(struct vas_window *window, u64 *addr, int *len) +void vas_win_paste_addr(struct vas_window *window, u64 *addr, int *len) { int winid; u64 base, shift; @@ -78,7 +80,7 @@ static void *map_paste_region(struct vas_window *txwin) goto free_name; txwin->paste_addr_name = name; - compute_paste_address(txwin, &start, &len); + vas_win_paste_addr(txwin, &start, &len); if (!request_mem_region(start, len, name)) { pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n", @@ -136,7 +138,7 @@ static void unmap_paste_region(struct vas_window *window) u64 busaddr_start; if (window->paste_kaddr) { - compute_paste_address(window, &busaddr_start, &len); + vas_win_paste_addr(window, &busaddr_start, &len); unmap_region(window->paste_kaddr, busaddr_start, len); window->paste_kaddr = NULL; kfree(window->paste_addr_name); @@ -373,7 +375,7 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) init_xlate_regs(window, winctx->user_win); val = 0ULL; - val = SET_FIELD(VAS_FAULT_TX_WIN, val, 0); + val = SET_FIELD(VAS_FAULT_TX_WIN, val, winctx->fault_win_id); write_hvwc_reg(window, VREG(FAULT_TX_WIN), val); /* In PowerNV, interrupts go to HV. */ @@ -748,6 +750,8 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->min_scope = VAS_SCOPE_LOCAL; winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; + if (rxwin->vinst->virq) + winctx->irq_port = rxwin->vinst->irq_port; } static bool rx_win_args_valid(enum vas_cop_type cop, @@ -768,7 +772,7 @@ static bool rx_win_args_valid(enum vas_cop_type cop, if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX) return false; - if (attr->wcreds_max > VAS_RX_WCREDS_MAX) + if (!attr->wcreds_max) return false; if (attr->nx_win) { @@ -813,7 +817,8 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop) { memset(rxattr, 0, sizeof(*rxattr)); - if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) { + if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI || + cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) { rxattr->pin_win = true; rxattr->nx_win = true; rxattr->fault_win = false; @@ -827,9 +832,9 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop) rxattr->fault_win = true; rxattr->notify_disable = true; rxattr->rx_wcred_mode = true; - rxattr->tx_wcred_mode = true; rxattr->rx_win_ord_mode = true; - rxattr->tx_win_ord_mode = true; + rxattr->rej_no_credit = true; + rxattr->tc_mode = VAS_THRESH_DISABLED; } else if (cop == VAS_COP_TYPE_FTW) { rxattr->user_win = true; rxattr->intr_disable = true; @@ -873,9 +878,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, rxwin->nx_win = rxattr->nx_win; rxwin->user_win = rxattr->user_win; rxwin->cop = cop; - rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; - if (rxattr->user_win) - rxwin->pid = task_pid_vnr(current); + rxwin->wcreds_max = rxattr->wcreds_max; init_winctx_for_rxwin(rxwin, rxattr, &winctx); init_winctx_regs(rxwin, &winctx); @@ -890,7 +893,8 @@ void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop) { memset(txattr, 0, sizeof(*txattr)); - if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) { + if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI || + cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) { txattr->rej_no_credit = false; txattr->rx_wcred_mode = true; txattr->tx_wcred_mode = true; @@ -944,13 +948,22 @@ static void init_winctx_for_txwin(struct vas_window *txwin, winctx->lpid = txattr->lpid; winctx->pidr = txattr->pidr; winctx->rx_win_id = txwin->rxwin->winid; + /* + * IRQ and fault window setup is successful. Set fault window + * for the send window so that ready to handle faults. + */ + if (txwin->vinst->virq) + winctx->fault_win_id = txwin->vinst->fault_win->winid; winctx->dma_type = VAS_DMA_TYPE_INJECT; winctx->tc_mode = txattr->tc_mode; winctx->min_scope = VAS_SCOPE_LOCAL; winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; + if (txwin->vinst->virq) + winctx->irq_port = txwin->vinst->irq_port; - winctx->pswid = 0; + winctx->pswid = txattr->pswid ? txattr->pswid : + encode_pswid(txwin->vinst->vas_id, txwin->winid); } static bool tx_win_args_valid(enum vas_cop_type cop, @@ -965,9 +978,14 @@ static bool tx_win_args_valid(enum vas_cop_type cop, if (attr->wcreds_max > VAS_TX_WCREDS_MAX) return false; - if (attr->user_win && - (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count)) - return false; + if (attr->user_win) { + if (attr->rsvd_txbuf_count) + return false; + + if (cop != VAS_COP_TYPE_FTW && cop != VAS_COP_TYPE_GZIP && + cop != VAS_COP_TYPE_GZIP_HIPRI) + return false; + } return true; } @@ -1016,7 +1034,6 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, txwin->tx_win = 1; txwin->rxwin = rxwin; txwin->nx_win = txwin->rxwin->nx_win; - txwin->pid = attr->pid; txwin->user_win = attr->user_win; txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT; @@ -1040,12 +1057,59 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, } } else { /* - * A user mapping must ensure that context switch issues - * CP_ABORT for this thread. + * Interrupt hanlder or fault window setup failed. Means + * NX can not generate fault for page fault. So not + * opening for user space tx window. */ - rc = set_thread_uses_vas(); - if (rc) + if (!vinst->virq) { + rc = -ENODEV; goto free_window; + } + + /* + * Window opened by a child thread may not be closed when + * it exits. So take reference to its pid and release it + * when the window is free by parent thread. + * Acquire a reference to the task's pid to make sure + * pid will not be re-used - needed only for multithread + * applications. + */ + txwin->pid = get_task_pid(current, PIDTYPE_PID); + /* + * Acquire a reference to the task's mm. + */ + txwin->mm = get_task_mm(current); + + if (!txwin->mm) { + put_pid(txwin->pid); + pr_err("VAS: pid(%d): mm_struct is not found\n", + current->pid); + rc = -EPERM; + goto free_window; + } + + mmgrab(txwin->mm); + mmput(txwin->mm); + mm_context_add_vas_window(txwin->mm); + /* + * Process closes window during exit. In the case of + * multithread application, the child thread can open + * window and can exit without closing it. Expects parent + * thread to use and close the window. So do not need + * to take pid reference for parent thread. + */ + txwin->tgid = find_get_pid(task_tgid_vnr(current)); + /* + * Even a process that has no foreign real address mapping can + * use an unpaired COPY instruction (to no real effect). Issue + * CP_ABORT to clear any pending COPY and prevent a covert + * channel. + * + * __switch_to() will issue CP_ABORT on future context switches + * if process / thread has any open VAS window (Use + * current->mm->context.vas_windows). + */ + asm volatile(PPC_CP_ABORT); } set_vinst_win(vinst, txwin); @@ -1128,6 +1192,7 @@ static void poll_window_credits(struct vas_window *window) { u64 val; int creds, mode; + int count = 0; val = read_hvwc_reg(window, VREG(WINCTL)); if (window->tx_win) @@ -1146,10 +1211,27 @@ retry: creds = GET_FIELD(VAS_LRX_WCRED, val); } + /* + * Takes around few milliseconds to complete all pending requests + * and return credits. + * TODO: Scan fault FIFO and invalidate CRBs points to this window + * and issue CRB Kill to stop all pending requests. Need only + * if there is a bug in NX or fault handling in kernel. + */ if (creds < window->wcreds_max) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(msecs_to_jiffies(10)); + count++; + /* + * Process can not close send window until all credits are + * returned. + */ + if (!(count % 1000)) + pr_warn_ratelimited("VAS: pid %d stuck. Waiting for credits returned for Window(%d). creds %d, Retries %d\n", + vas_window_pid(window), window->winid, + creds, count); + goto retry; } } @@ -1163,6 +1245,7 @@ static void poll_window_busy_state(struct vas_window *window) { int busy; u64 val; + int count = 0; retry: val = read_hvwc_reg(window, VREG(WIN_STATUS)); @@ -1170,7 +1253,16 @@ retry: if (busy) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(msecs_to_jiffies(5)); + schedule_timeout(msecs_to_jiffies(10)); + count++; + /* + * Takes around few milliseconds to process all pending + * requests. + */ + if (!(count % 1000)) + pr_warn_ratelimited("VAS: pid %d stuck. Window (ID=%d) is in busy state. Retries %d\n", + vas_window_pid(window), window->winid, count); + goto retry; } } @@ -1235,22 +1327,118 @@ int vas_win_close(struct vas_window *window) unmap_paste_region(window); - clear_vinst_win(window); - poll_window_busy_state(window); unpin_close_window(window); poll_window_credits(window); + clear_vinst_win(window); + poll_window_castout(window); /* if send window, drop reference to matching receive window */ - if (window->tx_win) + if (window->tx_win) { + if (window->user_win) { + /* Drop references to pid and mm */ + put_pid(window->pid); + if (window->mm) { + mm_context_remove_vas_window(window->mm); + mmdrop(window->mm); + } + } put_rx_win(window->rxwin); + } vas_window_free(window); return 0; } EXPORT_SYMBOL_GPL(vas_win_close); + +/* + * Return credit for the given window. + * Send windows and fault window uses credit mechanism as follows: + * + * Send windows: + * - The default number of credits available for each send window is + * 1024. It means 1024 requests can be issued asynchronously at the + * same time. If the credit is not available, that request will be + * returned with RMA_Busy. + * - One credit is taken when NX request is issued. + * - This credit is returned after NX processed that request. + * - If NX encounters translation error, kernel will return the + * credit on the specific send window after processing the fault CRB. + * + * Fault window: + * - The total number credits available is FIFO_SIZE/CRB_SIZE. + * Means 4MB/128 in the current implementation. If credit is not + * available, RMA_Reject is returned. + * - A credit is taken when NX pastes CRB in fault FIFO. + * - The kernel with return credit on fault window after reading entry + * from fault FIFO. + */ +void vas_return_credit(struct vas_window *window, bool tx) +{ + uint64_t val; + + val = 0ULL; + if (tx) { /* send window */ + val = SET_FIELD(VAS_TX_WCRED, val, 1); + write_hvwc_reg(window, VREG(TX_WCRED_ADDER), val); + } else { + val = SET_FIELD(VAS_LRX_WCRED, val, 1); + write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), val); + } +} + +struct vas_window *vas_pswid_to_window(struct vas_instance *vinst, + uint32_t pswid) +{ + struct vas_window *window; + int winid; + + if (!pswid) { + pr_devel("%s: called for pswid 0!\n", __func__); + return ERR_PTR(-ESRCH); + } + + decode_pswid(pswid, NULL, &winid); + + if (winid >= VAS_WINDOWS_PER_CHIP) + return ERR_PTR(-ESRCH); + + /* + * If application closes the window before the hardware + * returns the fault CRB, we should wait in vas_win_close() + * for the pending requests. so the window must be active + * and the process alive. + * + * If its a kernel process, we should not get any faults and + * should not get here. + */ + window = vinst->windows[winid]; + + if (!window) { + pr_err("PSWID decode: Could not find window for winid %d pswid %d vinst 0x%p\n", + winid, pswid, vinst); + return NULL; + } + + /* + * Do some sanity checks on the decoded window. Window should be + * NX GZIP user send window. FTW windows should not incur faults + * since their CRBs are ignored (not queued on FIFO or processed + * by NX). + */ + if (!window->tx_win || !window->user_win || !window->nx_win || + window->cop == VAS_COP_TYPE_FAULT || + window->cop == VAS_COP_TYPE_FTW) { + pr_err("PSWID decode: id %d, tx %d, user %d, nx %d, cop %d\n", + winid, window->tx_win, window->user_win, + window->nx_win, window->cop); + WARN_ON(1); + } + + return window; +} diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index ed9cc6df329a..598e4cd563fb 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -14,7 +14,10 @@ #include <linux/of_platform.h> #include <linux/of_address.h> #include <linux/of.h> +#include <linux/irqdomain.h> +#include <linux/interrupt.h> #include <asm/prom.h> +#include <asm/xive.h> #include "vas.h" @@ -23,12 +26,37 @@ static LIST_HEAD(vas_instances); static DEFINE_PER_CPU(int, cpu_vas_id); +static int vas_irq_fault_window_setup(struct vas_instance *vinst) +{ + char devname[64]; + int rc = 0; + + snprintf(devname, sizeof(devname), "vas-%d", vinst->vas_id); + rc = request_threaded_irq(vinst->virq, vas_fault_handler, + vas_fault_thread_fn, 0, devname, vinst); + + if (rc) { + pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n", + vinst->vas_id, vinst->virq, rc); + goto out; + } + + rc = vas_setup_fault_window(vinst); + if (rc) + free_irq(vinst->virq, vinst); + +out: + return rc; +} + static int init_vas_instance(struct platform_device *pdev) { - int rc, cpu, vasid; - struct resource *res; - struct vas_instance *vinst; struct device_node *dn = pdev->dev.of_node; + struct vas_instance *vinst; + struct xive_irq_data *xd; + uint32_t chipid, hwirq; + struct resource *res; + int rc, cpu, vasid; rc = of_property_read_u32(dn, "ibm,vas-id", &vasid); if (rc) { @@ -36,6 +64,12 @@ static int init_vas_instance(struct platform_device *pdev) return -ENODEV; } + rc = of_property_read_u32(dn, "ibm,chip-id", &chipid); + if (rc) { + pr_err("No ibm,chip-id property for %s?\n", pdev->name); + return -ENODEV; + } + if (pdev->num_resources != 4) { pr_err("Unexpected DT configuration for [%s, %d]\n", pdev->name, vasid); @@ -69,9 +103,32 @@ static int init_vas_instance(struct platform_device *pdev) vinst->paste_win_id_shift = 63 - res->end; - pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, " - "paste_win_id_shift 0x%llx\n", pdev->name, vasid, - vinst->paste_base_addr, vinst->paste_win_id_shift); + hwirq = xive_native_alloc_irq_on_chip(chipid); + if (!hwirq) { + pr_err("Inst%d: Unable to allocate global irq for chip %d\n", + vinst->vas_id, chipid); + return -ENOENT; + } + + vinst->virq = irq_create_mapping(NULL, hwirq); + if (!vinst->virq) { + pr_err("Inst%d: Unable to map global irq %d\n", + vinst->vas_id, hwirq); + return -EINVAL; + } + + xd = irq_get_handler_data(vinst->virq); + if (!xd) { + pr_err("Inst%d: Invalid virq %d\n", + vinst->vas_id, vinst->virq); + return -EINVAL; + } + + vinst->irq_port = xd->trig_page; + pr_devel("Initialized instance [%s, %d] paste_base 0x%llx paste_win_id_shift 0x%llx IRQ %d Port 0x%llx\n", + pdev->name, vasid, vinst->paste_base_addr, + vinst->paste_win_id_shift, vinst->virq, + vinst->irq_port); for_each_possible_cpu(cpu) { if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn)) @@ -82,6 +139,22 @@ static int init_vas_instance(struct platform_device *pdev) list_add(&vinst->node, &vas_instances); mutex_unlock(&vas_mutex); + spin_lock_init(&vinst->fault_lock); + /* + * IRQ and fault handling setup is needed only for user space + * send windows. + */ + if (vinst->virq) { + rc = vas_irq_fault_window_setup(vinst); + /* + * Fault window is used only for user space send windows. + * So if vinst->virq is NULL, tx_win_open returns -ENODEV + * for user space. + */ + if (rc) + vinst->virq = 0; + } + vas_instance_init_dbgdir(vinst); dev_set_drvdata(&pdev->dev, vinst); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 5574aec9ee88..70f793e8f6cc 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -101,11 +101,9 @@ /* * Initial per-process credits. * Max send window credits: 4K-1 (12-bits in VAS_TX_WCRED) - * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED) * * TODO: Needs tuning for per-process credits */ -#define VAS_RX_WCREDS_MAX ((64 << 10) - 1) #define VAS_TX_WCREDS_MAX ((4 << 10) - 1) #define VAS_WCREDS_DEFAULT (1 << 10) @@ -296,6 +294,22 @@ enum vas_notify_after_count { }; /* + * NX can generate an interrupt for multiple faults and expects kernel + * to process all of them. So read all valid CRB entries until find the + * invalid one. So use pswid which is pasted by NX and ccw[0] (reserved + * bit in BE) to check valid CRB. CCW[0] will not be touched by user + * space. Application gets CRB formt error if it updates this bit. + * + * Invalidate FIFO during allocation and process all entries from last + * successful read until finds invalid pswid and ccw[0] values. + * After reading each CRB entry from fault FIFO, the kernel invalidate + * it by updating pswid with FIFO_INVALID_ENTRY and CCW[0] with + * CCW0_INVALID. + */ +#define FIFO_INVALID_ENTRY 0xffffffff +#define CCW0_INVALID 1 + +/* * One per instance of VAS. Each instance will have a separate set of * receive windows, one per coprocessor type. * @@ -313,6 +327,15 @@ struct vas_instance { u64 paste_base_addr; u64 paste_win_id_shift; + u64 irq_port; + int virq; + int fault_crbs; + int fault_fifo_size; + int fifo_in_progress; /* To wake up thread or return IRQ_HANDLED */ + spinlock_t fault_lock; /* Protects fifo_in_progress update */ + void *fault_fifo; + struct vas_window *fault_win; /* Fault window */ + struct mutex mutex; struct vas_window *rxwin[VAS_COP_TYPE_MAX]; struct vas_window *windows[VAS_WINDOWS_PER_CHIP]; @@ -333,7 +356,9 @@ struct vas_window { bool user_win; /* True if user space window */ void *hvwc_map; /* HV window context */ void *uwc_map; /* OS/User window context */ - pid_t pid; /* Linux process id of owner */ + struct pid *pid; /* Linux process id of owner */ + struct pid *tgid; /* Thread group ID of owner */ + struct mm_struct *mm; /* Linux process mm_struct */ int wcreds_max; /* Window credits */ char *dbgname; @@ -406,6 +431,19 @@ extern void vas_init_dbgdir(void); extern void vas_instance_init_dbgdir(struct vas_instance *vinst); extern void vas_window_init_dbgdir(struct vas_window *win); extern void vas_window_free_dbgdir(struct vas_window *win); +extern int vas_setup_fault_window(struct vas_instance *vinst); +extern irqreturn_t vas_fault_thread_fn(int irq, void *data); +extern irqreturn_t vas_fault_handler(int irq, void *dev_id); +extern void vas_return_credit(struct vas_window *window, bool tx); +extern struct vas_window *vas_pswid_to_window(struct vas_instance *vinst, + uint32_t pswid); +extern void vas_win_paste_addr(struct vas_window *window, u64 *addr, + int *len); + +static inline int vas_window_pid(struct vas_window *window) +{ + return pid_vnr(window->pid); +} static inline void vas_log_write(struct vas_window *win, char *name, void *regptr, u64 val) @@ -444,6 +482,21 @@ static inline u64 read_hvwc_reg(struct vas_window *win, return in_be64(win->hvwc_map+reg); } +/* + * Encode/decode the Partition Send Window ID (PSWID) for a window in + * a way that we can uniquely identify any window in the system. i.e. + * we should be able to locate the 'struct vas_window' given the PSWID. + * + * Bits Usage + * 0:7 VAS id (8 bits) + * 8:15 Unused, 0 (3 bits) + * 16:31 Window id (16 bits) + */ +static inline u32 encode_pswid(int vasid, int winid) +{ + return ((u32)winid | (vasid << (31 - 7))); +} + static inline void decode_pswid(u32 pswid, int *vasid, int *winid) { if (vasid) |