diff options
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r-- | arch/powerpc/platforms/powernv/Makefile | 1 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/eeh-powernv.c | 114 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/idle.c | 48 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-async.c | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda-tce.c | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda.c | 923 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-sriov.c | 766 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.c | 14 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.h | 103 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/setup.c | 10 |
10 files changed, 1103 insertions, 880 deletions
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index fe3f0fb5aeca..2eb6ae150d1f 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_FA_DUMP) += opal-fadump.o obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o obj-$(CONFIG_OPAL_CORE) += opal-core.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o +obj-$(CONFIG_PCI_IOV) += pci-sriov.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 79409e005fcd..9af8c3b98853 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -338,6 +338,28 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap) return 0; } +static struct eeh_pe *pnv_eeh_get_upstream_pe(struct pci_dev *pdev) +{ + struct pci_controller *hose = pdev->bus->sysdata; + struct pnv_phb *phb = hose->private_data; + struct pci_dev *parent = pdev->bus->self; + +#ifdef CONFIG_PCI_IOV + /* for VFs we use the PF's PE as the upstream PE */ + if (pdev->is_virtfn) + parent = pdev->physfn; +#endif + + /* otherwise use the PE of our parent bridge */ + if (parent) { + struct pnv_ioda_pe *ioda_pe = pnv_ioda_get_pe(parent); + + return eeh_pe_get(phb->hose, ioda_pe->pe_number, 0); + } + + return NULL; +} + /** * pnv_eeh_probe - Do probe on PCI device * @pdev: pci_dev to probe @@ -350,6 +372,7 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev) struct pci_controller *hose = pdn->phb; struct pnv_phb *phb = hose->private_data; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + struct eeh_pe *upstream_pe; uint32_t pcie_flags; int ret; int config_addr = (pdn->busno << 8) | (pdn->devfn); @@ -372,19 +395,18 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev) } /* Skip for PCI-ISA bridge */ - if 
((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; eeh_edev_dbg(edev, "Probing device\n"); /* Initialize eeh device */ - edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX); edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP); edev->af_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF); edev->aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR); - if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { edev->mode |= EEH_DEV_BRIDGE; if (edev->pcie_cap) { pnv_pci_cfg_read(pdn, edev->pcie_cap + PCI_EXP_FLAGS, @@ -399,8 +421,10 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev) edev->pe_config_addr = phb->ioda.pe_rmap[config_addr]; + upstream_pe = pnv_eeh_get_upstream_pe(pdev); + /* Create PE */ - ret = eeh_add_to_parent_pe(edev); + ret = eeh_pe_tree_insert(edev, upstream_pe); if (ret) { eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret); return NULL; @@ -535,18 +559,6 @@ static int pnv_eeh_set_option(struct eeh_pe *pe, int option) return 0; } -/** - * pnv_eeh_get_pe_addr - Retrieve PE address - * @pe: EEH PE - * - * Retrieve the PE address according to the given tranditional - * PCI BDF (Bus/Device/Function) address. 
- */ -static int pnv_eeh_get_pe_addr(struct eeh_pe *pe) -{ - return pe->addr; -} - static void pnv_eeh_get_phb_diag(struct eeh_pe *pe) { struct pnv_phb *phb = pe->phb->private_data; @@ -850,32 +862,32 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option) case EEH_RESET_HOT: /* Don't report linkDown event */ if (aer) { - eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, &ctrl); ctrl |= PCI_ERR_UNC_SURPDN; - eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, ctrl); } - eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl); + eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl); ctrl |= PCI_BRIDGE_CTL_BUS_RESET; - eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl); + eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl); msleep(EEH_PE_RST_HOLD_TIME); break; case EEH_RESET_DEACTIVATE: - eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl); + eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl); ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; - eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl); + eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl); msleep(EEH_PE_RST_SETTLE_TIME); /* Continue reporting linkDown event */ if (aer) { - eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, &ctrl); ctrl &= ~PCI_ERR_UNC_SURPDN; - eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, ctrl); } @@ -944,11 +956,12 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev) static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type, int pos, u16 mask) { + struct eeh_dev *edev = pdn->edev; int i, status = 0; /* Wait for Transaction Pending bit to be cleared */ for (i = 0; i < 4; i++) { - eeh_ops->read_config(pdn, pos, 2, &status); + eeh_ops->read_config(edev, pos, 2, &status); if (!(status & mask)) return; 
@@ -969,7 +982,7 @@ static int pnv_eeh_do_flr(struct pci_dn *pdn, int option) if (WARN_ON(!edev->pcie_cap)) return -ENOTTY; - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg); + eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg); if (!(reg & PCI_EXP_DEVCAP_FLR)) return -ENOTTY; @@ -979,18 +992,18 @@ static int pnv_eeh_do_flr(struct pci_dn *pdn, int option) pnv_eeh_wait_for_pending(pdn, "", edev->pcie_cap + PCI_EXP_DEVSTA, PCI_EXP_DEVSTA_TRPND); - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, &reg); reg |= PCI_EXP_DEVCTL_BCR_FLR; - eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, reg); msleep(EEH_PE_RST_HOLD_TIME); break; case EEH_RESET_DEACTIVATE: - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, &reg); reg &= ~PCI_EXP_DEVCTL_BCR_FLR; - eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, reg); msleep(EEH_PE_RST_SETTLE_TIME); break; @@ -1007,7 +1020,7 @@ static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option) if (WARN_ON(!edev->af_cap)) return -ENOTTY; - eeh_ops->read_config(pdn, edev->af_cap + PCI_AF_CAP, 1, &cap); + eeh_ops->read_config(edev, edev->af_cap + PCI_AF_CAP, 1, &cap); if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR)) return -ENOTTY; @@ -1022,12 +1035,12 @@ static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option) pnv_eeh_wait_for_pending(pdn, "AF", edev->af_cap + PCI_AF_CTRL, PCI_AF_STATUS_TP << 8); - eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, + eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL, 1, PCI_AF_CTRL_FLR); msleep(EEH_PE_RST_HOLD_TIME); break; case EEH_RESET_DEACTIVATE: - eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, 1, 0); + eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL, 
1, 0); msleep(EEH_PE_RST_SETTLE_TIME); break; } @@ -1261,9 +1274,11 @@ static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn) return false; } -static int pnv_eeh_read_config(struct pci_dn *pdn, +static int pnv_eeh_read_config(struct eeh_dev *edev, int where, int size, u32 *val) { + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + if (!pdn) return PCIBIOS_DEVICE_NOT_FOUND; @@ -1275,9 +1290,11 @@ static int pnv_eeh_read_config(struct pci_dn *pdn, return pnv_pci_cfg_read(pdn, where, size, val); } -static int pnv_eeh_write_config(struct pci_dn *pdn, +static int pnv_eeh_write_config(struct eeh_dev *edev, int where, int size, u32 val) { + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + if (!pdn) return PCIBIOS_DEVICE_NOT_FOUND; @@ -1631,34 +1648,24 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) return ret; } -static int pnv_eeh_restore_config(struct pci_dn *pdn) +static int pnv_eeh_restore_config(struct eeh_dev *edev) { - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); struct pnv_phb *phb; s64 ret = 0; - int config_addr = (pdn->busno << 8) | (pdn->devfn); if (!edev) return -EEXIST; - /* - * We have to restore the PCI config space after reset since the - * firmware can't see SRIOV VFs. - * - * FIXME: The MPS, error routing rules, timeout setting are worthy - * to be exported by firmware in extendible way. 
- */ - if (edev->physfn) { - ret = eeh_restore_vf_config(pdn); - } else { - phb = pdn->phb->private_data; - ret = opal_pci_reinit(phb->opal_id, - OPAL_REINIT_PCI_DEV, config_addr); - } + if (edev->physfn) + return 0; + + phb = edev->controller->private_data; + ret = opal_pci_reinit(phb->opal_id, + OPAL_REINIT_PCI_DEV, edev->bdfn); if (ret) { pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", - __func__, config_addr, ret); + __func__, edev->bdfn, ret); return -EIO; } @@ -1670,7 +1677,6 @@ static struct eeh_ops pnv_eeh_ops = { .init = pnv_eeh_init, .probe = pnv_eeh_probe, .set_option = pnv_eeh_set_option, - .get_pe_addr = pnv_eeh_get_pe_addr, .get_state = pnv_eeh_get_state, .reset = pnv_eeh_reset, .get_log = pnv_eeh_get_log, diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 2dd467383a88..77513a80cef9 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -48,7 +48,7 @@ static bool default_stop_found; * First stop state levels when SPR and TB loss can occur. */ static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1; -static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1; +static u64 deep_spr_loss_state = MAX_STOP_STATE + 1; /* * psscr value and mask of the deepest stop idle state. 
@@ -73,9 +73,6 @@ static int pnv_save_sprs_for_deep_states(void) */ uint64_t lpcr_val = mfspr(SPRN_LPCR); uint64_t hid0_val = mfspr(SPRN_HID0); - uint64_t hid1_val = mfspr(SPRN_HID1); - uint64_t hid4_val = mfspr(SPRN_HID4); - uint64_t hid5_val = mfspr(SPRN_HID5); uint64_t hmeer_val = mfspr(SPRN_HMEER); uint64_t msr_val = MSR_IDLE; uint64_t psscr_val = pnv_deepest_stop_psscr_val; @@ -117,6 +114,9 @@ static int pnv_save_sprs_for_deep_states(void) /* Only p8 needs to set extra HID regiters */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + uint64_t hid1_val = mfspr(SPRN_HID1); + uint64_t hid4_val = mfspr(SPRN_HID4); + uint64_t hid5_val = mfspr(SPRN_HID5); rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); if (rc != 0) @@ -611,6 +611,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) unsigned long srr1; unsigned long pls; unsigned long mmcr0 = 0; + unsigned long mmcra = 0; struct p9_sprs sprs = {}; /* avoid false used-uninitialised */ bool sprs_saved = false; @@ -657,7 +658,22 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) */ mmcr0 = mfspr(SPRN_MMCR0); } - if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) { + + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + /* + * POWER10 uses MMCRA (BHRBRD) as BHRB disable bit. + * If the user hasn't asked for the BHRB to be + * written, the value of MMCRA[BHRBRD] is 1. + * On wakeup from stop, MMCRA[BHRBD] will be 0, + * since it is previleged resource and will be lost. + * Thus, if we do not save and restore the MMCRA[BHRBD], + * hardware will be needlessly writing to the BHRB + * in problem mode. 
+ */ + mmcra = mfspr(SPRN_MMCRA); + } + + if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) { sprs.lpcr = mfspr(SPRN_LPCR); sprs.hfscr = mfspr(SPRN_HFSCR); sprs.fscr = mfspr(SPRN_FSCR); @@ -700,8 +716,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) { - unsigned long mmcra; - /* * We don't need an isync after the mtsprs here because the * upcoming mtmsrd is execution synchronizing. @@ -721,6 +735,10 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) mtspr(SPRN_MMCR0, mmcr0); } + /* Reload MMCRA to restore BHRB disable bit for POWER10 */ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + mtspr(SPRN_MMCRA, mmcra); + /* * DD2.2 and earlier need to set then clear bit 60 in MMCRA * to ensure the PMU starts running. @@ -741,7 +759,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) * just always test PSSCR for SPR/TB state loss. */ pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT; - if (likely(pls < pnv_first_spr_loss_level)) { + if (likely(pls < deep_spr_loss_state)) { if (sprs_saved) atomic_stop_thread_idle(); goto out; @@ -1088,7 +1106,7 @@ static void __init pnv_power9_idle_init(void) * the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state. 
*/ pnv_first_tb_loss_level = MAX_STOP_STATE + 1; - pnv_first_spr_loss_level = MAX_STOP_STATE + 1; + deep_spr_loss_state = MAX_STOP_STATE + 1; for (i = 0; i < nr_pnv_idle_states; i++) { int err; struct pnv_idle_states_t *state = &pnv_idle_states[i]; @@ -1099,8 +1117,8 @@ static void __init pnv_power9_idle_init(void) pnv_first_tb_loss_level = psscr_rl; if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) && - (pnv_first_spr_loss_level > psscr_rl)) - pnv_first_spr_loss_level = psscr_rl; + (deep_spr_loss_state > psscr_rl)) + deep_spr_loss_state = psscr_rl; /* * The idle code does not deal with TB loss occurring @@ -1111,8 +1129,8 @@ static void __init pnv_power9_idle_init(void) * compatibility. */ if ((state->flags & OPAL_PM_TIMEBASE_STOP) && - (pnv_first_spr_loss_level > psscr_rl)) - pnv_first_spr_loss_level = psscr_rl; + (deep_spr_loss_state > psscr_rl)) + deep_spr_loss_state = psscr_rl; err = validate_psscr_val_mask(&state->psscr_val, &state->psscr_mask, @@ -1158,7 +1176,7 @@ static void __init pnv_power9_idle_init(void) } pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n", - pnv_first_spr_loss_level); + deep_spr_loss_state); pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n", pnv_first_tb_loss_level); @@ -1205,7 +1223,7 @@ static void __init pnv_probe_idle_states(void) return; } - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (pvr_version_is(PVR_POWER9)) pnv_power9_idle_init(); for (i = 0; i < nr_pnv_idle_states; i++) diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 1656e8965d6b..c094fdf5825c 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -104,7 +104,7 @@ static int __opal_async_release_token(int token) */ case ASYNC_TOKEN_DISPATCHED: opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED; - /* Fall through */ + fallthrough; default: rc = 1; } diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c 
b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index f923359d8afc..5218f5da2737 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -166,7 +166,7 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index, if (!ptce) { ptce = pnv_tce(tbl, false, idx, alloc); if (!ptce) - return alloc ? H_HARDWARE : H_TOO_HARD; + return -ENOMEM; } if (newtce & TCE_PCI_WRITE) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 73a63efcf855..c9c25fb0783c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -115,32 +115,13 @@ static int __init pci_reset_phbs_setup(char *str) early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup); -static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r) -{ - /* - * WARNING: We cannot rely on the resource flags. The Linux PCI - * allocation code sometimes decides to put a 64-bit prefetchable - * BAR in the 32-bit window, so we have to compare the addresses. - * - * For simplicity we only test resource start. 
- */ - return (r->start >= phb->ioda.m64_base && - r->start < (phb->ioda.m64_base + phb->ioda.m64_size)); -} - -static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags) -{ - unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); - - return (resource_flags & flags) == flags; -} - static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) { s64 rc; phb->ioda.pe_array[pe_no].phb = phb; phb->ioda.pe_array[pe_no].pe_number = pe_no; + phb->ioda.pe_array[pe_no].dma_setup_done = false; /* * Clear the PE frozen state as it might be put into frozen state @@ -164,26 +145,48 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) return; } + mutex_lock(&phb->ioda.pe_alloc_mutex); if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) pr_debug("%s: PE %x was reserved on PHB#%x\n", __func__, pe_no, phb->hose->global_number); + mutex_unlock(&phb->ioda.pe_alloc_mutex); pnv_ioda_init_pe(phb, pe_no); } -static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count) { - long pe; + struct pnv_ioda_pe *ret = NULL; + int run = 0, pe, i; + + mutex_lock(&phb->ioda.pe_alloc_mutex); + /* scan backwards for a run of @count cleared bits */ for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) { - if (!test_and_set_bit(pe, phb->ioda.pe_alloc)) - return pnv_ioda_init_pe(phb, pe); + if (test_bit(pe, phb->ioda.pe_alloc)) { + run = 0; + continue; + } + + run++; + if (run == count) + break; + } + if (run != count) + goto out; + + for (i = pe; i < pe + count; i++) { + set_bit(i, phb->ioda.pe_alloc); + pnv_ioda_init_pe(phb, i); } + ret = &phb->ioda.pe_array[pe]; - return NULL; +out: + mutex_unlock(&phb->ioda.pe_alloc_mutex); + return ret; } -static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) +void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) { struct pnv_phb *phb = pe->phb; unsigned int pe_num = pe->pe_number; @@ -192,7 +195,10 @@ static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) 
WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */ kfree(pe->npucomp); memset(pe, 0, sizeof(struct pnv_ioda_pe)); + + mutex_lock(&phb->ioda.pe_alloc_mutex); clear_bit(pe_num, phb->ioda.pe_alloc); + mutex_unlock(&phb->ioda.pe_alloc_mutex); } /* The default M64 BAR is shared by all PEs */ @@ -252,8 +258,7 @@ fail: static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, unsigned long *pe_bitmap) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct resource *r; resource_size_t base, sgsz, start, end; int segno, i; @@ -311,6 +316,28 @@ static int pnv_ioda1_init_m64(struct pnv_phb *phb) } } + for (index = 0; index < phb->ioda.total_pe_num; index++) { + int64_t rc; + + /* + * P7IOC supports M64DT, which helps mapping M64 segment + * to one particular PE#. However, PHB3 has fixed mapping + * between M64 segment and PE#. In order to have same logic + * for P7IOC and PHB3, we enforce fixed mapping between M64 + * segment and PE# on P7IOC. + */ + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + index, OPAL_M64_WINDOW_TYPE, + index / PNV_IODA1_M64_SEGS, + index % PNV_IODA1_M64_SEGS); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n", + __func__, rc, phb->hose->global_number, + index); + goto fail; + } + } + /* * Exclude the segments for reserved and root bus PE, which * are first or last two PEs. 
@@ -351,8 +378,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *master_pe, *pe; unsigned long size, *pe_alloc; int i; @@ -403,26 +429,6 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) pe->master = master_pe; list_add_tail(&pe->list, &master_pe->slaves); } - - /* - * P7IOC supports M64DT, which helps mapping M64 segment - * to one particular PE#. However, PHB3 has fixed mapping - * between M64 segment and PE#. In order to have same logic - * for P7IOC and PHB3, we enforce fixed mapping between M64 - * segment and PE# on P7IOC. - */ - if (phb->type == PNV_PHB_IODA1) { - int64_t rc; - - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe->pe_number, OPAL_M64_WINDOW_TYPE, - pe->pe_number / PNV_IODA1_M64_SEGS, - pe->pe_number % PNV_IODA1_M64_SEGS); - if (rc != OPAL_SUCCESS) - pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n", - __func__, rc, phb->hose->global_number, - pe->pe_number); - } } kfree(pe_alloc); @@ -673,8 +679,7 @@ struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn) struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); if (!pdn) @@ -816,7 +821,7 @@ static void pnv_ioda_unset_peltv(struct pnv_phb *phb, pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc); } -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct pci_dev *parent; uint8_t bcomp, dcomp, fcomp; @@ -887,7 +892,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct 
pnv_ioda_pe *pe) return 0; } -static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct pci_dev *parent; uint8_t bcomp, dcomp, fcomp; @@ -982,95 +987,9 @@ out: return 0; } -#ifdef CONFIG_PCI_IOV -static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) -{ - struct pci_dn *pdn = pci_get_pdn(dev); - int i; - struct resource *res, res2; - resource_size_t size; - u16 num_vfs; - - if (!dev->is_physfn) - return -EINVAL; - - /* - * "offset" is in VFs. The M64 windows are sized so that when they - * are segmented, each segment is the same size as the IOV BAR. - * Each segment is in a separate PE, and the high order bits of the - * address are the PE number. Therefore, each VF's BAR is in a - * separate PE, and changing the IOV BAR start address changes the - * range of PEs the VFs are in. - */ - num_vfs = pdn->num_vfs; - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &dev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - /* - * The actual IOV BAR range is determined by the start address - * and the actual size for num_vfs VFs BAR. This check is to - * make sure that after shifting, the range will not overlap - * with another device. - */ - size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); - res2.flags = res->flags; - res2.start = res->start + (size * offset); - res2.end = res2.start + (size * num_vfs) - 1; - - if (res2.end > res->end) { - dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", - i, &res2, res, num_vfs, offset); - return -EBUSY; - } - } - - /* - * Since M64 BAR shares segments among all possible 256 PEs, - * we have to shift the beginning of PF IOV BAR to make it start from - * the segment which belongs to the PE number assigned to the first VF. 
- * This creates a "hole" in the /proc/iomem which could be used for - * allocating other resources so we reserve this area below and - * release when IOV is released. - */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &dev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); - res2 = *res; - res->start += size * offset; - - dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", - i, &res2, res, (offset > 0) ? "En" : "Dis", - num_vfs, offset); - - if (offset < 0) { - devm_release_resource(&dev->dev, &pdn->holes[i]); - memset(&pdn->holes[i], 0, sizeof(pdn->holes[i])); - } - - pci_update_resource(dev, i + PCI_IOV_RESOURCES); - - if (offset > 0) { - pdn->holes[i].start = res2.start; - pdn->holes[i].end = res2.start + size * offset - 1; - pdn->holes[i].flags = IORESOURCE_BUS; - pdn->holes[i].name = "pnv_iov_reserved"; - devm_request_resource(&dev->dev, res->parent, - &pdn->holes[i]); - } - } - return 0; -} -#endif /* CONFIG_PCI_IOV */ - static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); struct pnv_ioda_pe *pe; @@ -1082,7 +1001,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pdn->pe_number != IODA_INVALID_PE) return NULL; - pe = pnv_ioda_alloc_pe(phb); + pe = pnv_ioda_alloc_pe(phb, 1); if (!pe) { pr_warn("%s: Not enough PE# available, disabling device\n", pci_name(dev)); @@ -1129,8 +1048,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) */ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct 
pnv_ioda_pe *pe = NULL; unsigned int pe_num; @@ -1154,7 +1072,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) /* The PE number isn't pinned by M64 */ if (!pe) - pe = pnv_ioda_alloc_pe(phb); + pe = pnv_ioda_alloc_pe(phb, 1); if (!pe) { pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n", @@ -1196,8 +1114,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) struct pnv_ioda_pe *pe; struct pci_dev *gpu_pdev; struct pci_dn *npu_pdn; - struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus); /* * Intentionally leak a reference on the npu device (for @@ -1297,446 +1214,12 @@ static void pnv_pci_ioda_setup_nvlink(void) #endif } -#ifdef CONFIG_PCI_IOV -static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pci_dn *pdn; - int i, j; - int m64_bars; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (pdn->m64_single_mode) - m64_bars = num_vfs; - else - m64_bars = 1; - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) - for (j = 0; j < m64_bars; j++) { - if (pdn->m64_map[j][i] == IODA_INVALID_M64) - continue; - opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0); - clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc); - pdn->m64_map[j][i] = IODA_INVALID_M64; - } - - kfree(pdn->m64_map); - return 0; -} - -static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pci_dn *pdn; - unsigned int win; - struct resource *res; - int i, j; - int64_t rc; - int total_vfs; - resource_size_t size, start; - int pe_num; - int m64_bars; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = 
pci_get_pdn(pdev); - total_vfs = pci_sriov_get_totalvfs(pdev); - - if (pdn->m64_single_mode) - m64_bars = num_vfs; - else - m64_bars = 1; - - pdn->m64_map = kmalloc_array(m64_bars, - sizeof(*pdn->m64_map), - GFP_KERNEL); - if (!pdn->m64_map) - return -ENOMEM; - /* Initialize the m64_map to IODA_INVALID_M64 */ - for (i = 0; i < m64_bars ; i++) - for (j = 0; j < PCI_SRIOV_NUM_BARS; j++) - pdn->m64_map[i][j] = IODA_INVALID_M64; - - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - for (j = 0; j < m64_bars; j++) { - do { - win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, - phb->ioda.m64_bar_idx + 1, 0); - - if (win >= phb->ioda.m64_bar_idx + 1) - goto m64_failed; - } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); - - pdn->m64_map[j][i] = win; - - if (pdn->m64_single_mode) { - size = pci_iov_resource_size(pdev, - PCI_IOV_RESOURCES + i); - start = res->start + size * j; - } else { - size = resource_size(res); - start = res->start; - } - - /* Map the M64 here */ - if (pdn->m64_single_mode) { - pe_num = pdn->pe_num_map[j]; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe_num, OPAL_M64_WINDOW_TYPE, - pdn->m64_map[j][i], 0); - } - - rc = opal_pci_set_phb_mem_window(phb->opal_id, - OPAL_M64_WINDOW_TYPE, - pdn->m64_map[j][i], - start, - 0, /* unused */ - size); - - - if (rc != OPAL_SUCCESS) { - dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n", - win, rc); - goto m64_failed; - } - - if (pdn->m64_single_mode) - rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2); - else - rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1); - - if (rc != OPAL_SUCCESS) { - dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", - win, rc); - goto m64_failed; - } - } - } - return 0; - -m64_failed: - pnv_pci_vf_release_m64(pdev, num_vfs); - return -EBUSY; -} - -static long 
pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, - int num); - -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) -{ - struct iommu_table *tbl; - int64_t rc; - - tbl = pe->table_group.tables[0]; - rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); - if (rc) - pe_warn(pe, "OPAL error %lld release DMA window\n", rc); - - pnv_pci_ioda2_set_bypass(pe, false); - if (pe->table_group.group) { - iommu_group_put(pe->table_group.group); - BUG_ON(pe->table_group.group); - } - iommu_tce_table_put(tbl); -} - -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe, *pe_n; - struct pci_dn *pdn; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (!pdev->is_physfn) - return; - - list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { - if (pe->parent_dev != pdev) - continue; - - pnv_pci_ioda2_release_dma_pe(pdev, pe); - - /* Remove from list */ - mutex_lock(&phb->ioda.pe_list_mutex); - list_del(&pe->list); - mutex_unlock(&phb->ioda.pe_list_mutex); - - pnv_ioda_deconfigure_pe(phb, pe); - - pnv_ioda_free_pe(pe); - } -} - -void pnv_pci_sriov_disable(struct pci_dev *pdev) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - u16 num_vfs, i; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - num_vfs = pdn->num_vfs; - - /* Release VF PEs */ - pnv_ioda_release_vf_PE(pdev); - - if (phb->type == PNV_PHB_IODA2) { - if (!pdn->m64_single_mode) - pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map); - - /* Release M64 windows */ - pnv_pci_vf_release_m64(pdev, num_vfs); - - /* Release PE numbers */ - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] == IODA_INVALID_PE) - continue; - - pe = 
&phb->ioda.pe_array[pdn->pe_num_map[i]]; - pnv_ioda_free_pe(pe); - } - } else - bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - /* Releasing pe_num_map */ - kfree(pdn->pe_num_map); - } -} - -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, +static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); -static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - int pe_num; - u16 vf_index; - struct pci_dn *pdn; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (!pdev->is_physfn) - return; - - /* Reserve PE for each VF */ - for (vf_index = 0; vf_index < num_vfs; vf_index++) { - int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index); - int vf_bus = pci_iov_virtfn_bus(pdev, vf_index); - struct pci_dn *vf_pdn; - - if (pdn->m64_single_mode) - pe_num = pdn->pe_num_map[vf_index]; - else - pe_num = *pdn->pe_num_map + vf_index; - - pe = &phb->ioda.pe_array[pe_num]; - pe->pe_number = pe_num; - pe->phb = phb; - pe->flags = PNV_IODA_PE_VF; - pe->pbus = NULL; - pe->parent_dev = pdev; - pe->mve_number = -1; - pe->rid = (vf_bus << 8) | vf_devfn; - - pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n", - hose->global_number, pdev->bus->number, - PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num); - - if (pnv_ioda_configure_pe(phb, pe)) { - /* XXX What do we do here ? 
*/ - pnv_ioda_free_pe(pe); - pe->pdev = NULL; - continue; - } - - /* Put PE to the list */ - mutex_lock(&phb->ioda.pe_list_mutex); - list_add_tail(&pe->list, &phb->ioda.pe_list); - mutex_unlock(&phb->ioda.pe_list_mutex); - - /* associate this pe to it's pdn */ - list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) { - if (vf_pdn->busno == vf_bus && - vf_pdn->devfn == vf_devfn) { - vf_pdn->pe_number = pe_num; - break; - } - } - - pnv_pci_ioda2_setup_dma_pe(phb, pe); - } -} - -int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - int ret; - u16 i; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (phb->type == PNV_PHB_IODA2) { - if (!pdn->vfs_expanded) { - dev_info(&pdev->dev, "don't support this SRIOV device" - " with non 64bit-prefetchable IOV BAR\n"); - return -ENOSPC; - } - - /* - * When M64 BARs functions in Single PE mode, the number of VFs - * could be enabled must be less than the number of M64 BARs. 
- */ - if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) { - dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n"); - return -EBUSY; - } - - /* Allocating pe_num_map */ - if (pdn->m64_single_mode) - pdn->pe_num_map = kmalloc_array(num_vfs, - sizeof(*pdn->pe_num_map), - GFP_KERNEL); - else - pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL); - - if (!pdn->pe_num_map) - return -ENOMEM; - - if (pdn->m64_single_mode) - for (i = 0; i < num_vfs; i++) - pdn->pe_num_map[i] = IODA_INVALID_PE; - - /* Calculate available PE for required VFs */ - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - pe = pnv_ioda_alloc_pe(phb); - if (!pe) { - ret = -EBUSY; - goto m64_failed; - } - - pdn->pe_num_map[i] = pe->pe_number; - } - } else { - mutex_lock(&phb->ioda.pe_alloc_mutex); - *pdn->pe_num_map = bitmap_find_next_zero_area( - phb->ioda.pe_alloc, phb->ioda.total_pe_num, - 0, num_vfs, 0); - if (*pdn->pe_num_map >= phb->ioda.total_pe_num) { - mutex_unlock(&phb->ioda.pe_alloc_mutex); - dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); - kfree(pdn->pe_num_map); - return -EBUSY; - } - bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - mutex_unlock(&phb->ioda.pe_alloc_mutex); - } - pdn->num_vfs = num_vfs; - - /* Assign M64 window accordingly */ - ret = pnv_pci_vf_assign_m64(pdev, num_vfs); - if (ret) { - dev_info(&pdev->dev, "Not enough M64 window resources\n"); - goto m64_failed; - } - - /* - * When using one M64 BAR to map one IOV BAR, we need to shift - * the IOV BAR according to the PE# allocated to the VFs. - * Otherwise, the PE# for the VF will conflict with others. 
- */ - if (!pdn->m64_single_mode) { - ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map); - if (ret) - goto m64_failed; - } - } - - /* Setup VF PEs */ - pnv_ioda_setup_vf_PE(pdev, num_vfs); - - return 0; - -m64_failed: - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] == IODA_INVALID_PE) - continue; - - pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; - pnv_ioda_free_pe(pe); - } - } else - bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - - /* Releasing pe_num_map */ - kfree(pdn->pe_num_map); - - return ret; -} - -int pnv_pcibios_sriov_disable(struct pci_dev *pdev) -{ - pnv_pci_sriov_disable(pdev); - - /* Release PCI data */ - remove_sriov_vf_pdns(pdev); - return 0; -} - -int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) -{ - /* Allocate PCI data */ - add_sriov_vf_pdns(pdev); - - return pnv_pci_sriov_enable(pdev, num_vfs); -} -#endif /* CONFIG_PCI_IOV */ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; @@ -1762,6 +1245,24 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev) pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number); } + /* + * We assume that bridges *probably* don't need to do any DMA so we can + * skip allocating a TCE table, etc unless we get a non-bridge device. 
+ */ + if (!pe->dma_setup_done && !pci_is_bridge(pdev)) { + switch (phb->type) { + case PNV_PHB_IODA1: + pnv_pci_ioda1_setup_dma_pe(phb, pe); + break; + case PNV_PHB_IODA2: + pnv_pci_ioda2_setup_dma_pe(phb, pe); + break; + default: + pr_warn("%s: No DMA for PHB#%x (type %d)\n", + __func__, phb->hose->global_number, phb->type); + } + } + if (pdn) pdn->pe_number = pe->pe_number; pe->device_count++; @@ -1847,8 +1348,7 @@ err: static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, u64 dma_mask) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; @@ -1885,19 +1385,6 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, return false; } -static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); - dev->dev.archdata.dma_offset = pe->tce_bypass_base; - - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_bus_dma(pe, dev->subordinate); - } -} - static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb, bool real_mode) { @@ -2285,6 +1772,7 @@ found: pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; iommu_init_table(tbl, phb->hose->node, 0, 0); + pe->dma_setup_done = true; return; fail: /* XXX Failure: Try to fallback to 64-bit only ? 
*/ @@ -2474,7 +1962,6 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) return 0; } -#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV) static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, int num) { @@ -2498,7 +1985,6 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, return ret; } -#endif #ifdef CONFIG_IOMMU_API unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, @@ -2547,6 +2033,19 @@ static long pnv_pci_ioda2_create_table_userspace( return ret; } +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); + dev->dev.archdata.dma_offset = pe->tce_bypass_base; + + if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) + pnv_ioda_setup_bus_dma(pe, dev->subordinate); + } +} + static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, @@ -2583,14 +2082,11 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { }; #endif -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) +void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) { int64_t rc; - if (!pnv_pci_ioda_pe_dma_weight(pe)) - return; - /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; @@ -2615,6 +2111,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); #endif + pe->dma_setup_done = true; } int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) @@ -2763,118 +2260,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) count, phb->msi_base); } -#ifdef CONFIG_PCI_IOV -static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) -{ - struct pci_controller 
*hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - const resource_size_t gate = phb->ioda.m64_segsize >> 2; - struct resource *res; - int i; - resource_size_t size, total_vf_bar_sz; - struct pci_dn *pdn; - int mul, total_vfs; - - pdn = pci_get_pdn(pdev); - pdn->vfs_expanded = 0; - pdn->m64_single_mode = false; - - total_vfs = pci_sriov_get_totalvfs(pdev); - mul = phb->ioda.total_pe_num; - total_vf_bar_sz = 0; - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || res->parent) - continue; - if (!pnv_pci_is_m64_flags(res->flags)) { - dev_warn(&pdev->dev, "Don't support SR-IOV with" - " non M64 VF BAR%d: %pR. \n", - i, res); - goto truncate_iov; - } - - total_vf_bar_sz += pci_iov_resource_size(pdev, - i + PCI_IOV_RESOURCES); - - /* - * If bigger than quarter of M64 segment size, just round up - * power of two. - * - * Generally, one M64 BAR maps one IOV BAR. To avoid conflict - * with other devices, IOV BAR size is expanded to be - * (total_pe * VF_BAR_size). When VF_BAR_size is half of M64 - * segment size , the expanded size would equal to half of the - * whole M64 space size, which will exhaust the M64 Space and - * limit the system flexibility. This is a design decision to - * set the boundary to quarter of the M64 segment size. - */ - if (total_vf_bar_sz > gate) { - mul = roundup_pow_of_two(total_vfs); - dev_info(&pdev->dev, - "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n", - total_vf_bar_sz, gate, mul); - pdn->m64_single_mode = true; - break; - } - } - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || res->parent) - continue; - - size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); - /* - * On PHB3, the minimum size alignment of M64 BAR in single - * mode is 32MB. 
- */ - if (pdn->m64_single_mode && (size < SZ_32M)) - goto truncate_iov; - dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); - res->end = res->start + size * mul - 1; - dev_dbg(&pdev->dev, " %pR\n", res); - dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", - i, res, mul); - } - pdn->vfs_expanded = mul; - - return; - -truncate_iov: - /* To save MMIO space, IOV BAR is truncated. */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - res->flags = 0; - res->end = res->start - 1; - } -} - -static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev) -{ - if (WARN_ON(pci_dev_is_added(pdev))) - return; - - if (pdev->is_virtfn) { - struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); - - /* - * VF PEs are single-device PEs so their pdev pointer needs to - * be set. The pdev doesn't exist when the PE is allocated (in - * (pcibios_sriov_enable()) so we fix it up here. - */ - pe->pdev = pdev; - WARN_ON(!(pe->flags & PNV_IODA_PE_VF)); - } else if (pdev->is_physfn) { - /* - * For PFs adjust their allocated IOV resources to match what - * the PHB can support using it's M64 BAR table. 
- */ - pnv_pci_ioda_fixup_iov_resources(pdev); - } -} -#endif /* CONFIG_PCI_IOV */ - static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, struct resource *res) { @@ -3101,10 +2486,9 @@ static void pnv_pci_ioda_fixup(void) static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, unsigned long type) { - struct pci_dev *bridge; - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); int num_pci_bridges = 0; + struct pci_dev *bridge; bridge = bus->self; while (bridge) { @@ -3190,8 +2574,6 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus, static void pnv_pci_configure_bus(struct pci_bus *bus) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; struct pci_dev *bridge = bus->self; struct pnv_ioda_pe *pe; bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE); @@ -3215,17 +2597,6 @@ static void pnv_pci_configure_bus(struct pci_bus *bus) return; pnv_ioda_setup_pe_seg(pe); - switch (phb->type) { - case PNV_PHB_IODA1: - pnv_pci_ioda1_setup_dma_pe(phb, pe); - break; - case PNV_PHB_IODA2: - pnv_pci_ioda2_setup_dma_pe(phb, pe); - break; - default: - pr_warn("%s: No DMA for PHB#%x (type %d)\n", - __func__, phb->hose->global_number, phb->type); - } } static resource_size_t pnv_pci_default_alignment(void) @@ -3233,49 +2604,12 @@ static resource_size_t pnv_pci_default_alignment(void) return PAGE_SIZE; } -#ifdef CONFIG_PCI_IOV -static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, - int resno) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(pdev); - resource_size_t align; - - /* - * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the - * SR-IOV. While from hardware perspective, the range mapped by M64 - * BAR should be size aligned. 
- * - * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra - * powernv-specific hardware restriction is gone. But if just use the - * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with - * in one segment of M64 #15, which introduces the PE conflict between - * PF and VF. Based on this, the minimum alignment of an IOV BAR is - * m64_segsize. - * - * This function returns the total IOV BAR size if M64 BAR is in - * Shared PE mode or just VF BAR size if not. - * If the M64 BAR is in Single PE mode, return the VF BAR size or - * M64 segment size if IOV BAR size is less. - */ - align = pci_iov_resource_size(pdev, resno); - if (!pdn->vfs_expanded) - return align; - if (pdn->m64_single_mode) - return max(align, (resource_size_t)phb->ioda.m64_segsize); - - return pdn->vfs_expanded * align; -} -#endif /* CONFIG_PCI_IOV */ - /* Prevent enabling devices for which we couldn't properly * assign a PE */ static bool pnv_pci_enable_device_hook(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn; /* The function is probably called while the PEs have @@ -3346,11 +2680,10 @@ static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group, static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe) { - unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe); struct iommu_table *tbl = pe->table_group.tables[0]; int64_t rc; - if (!weight) + if (!pe->dma_setup_done) return; rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0); @@ -3367,22 +2700,17 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe) iommu_tce_table_put(tbl); } -static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) +void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) { struct iommu_table *tbl = pe->table_group.tables[0]; - unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe); -#ifdef CONFIG_IOMMU_API 
int64_t rc; -#endif - if (!weight) + if (!pe->dma_setup_done) /* DMA was never set up for this PE: nothing to release */ return; -#ifdef CONFIG_IOMMU_API rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (rc) pe_warn(pe, "OPAL error %lld release DMA window\n", rc); -#endif pnv_pci_ioda2_set_bypass(pe, false); if (pe->table_group.group) { @@ -3405,14 +2733,8 @@ static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, if (map[idx] != pe->pe_number) continue; - if (win == OPAL_M64_WINDOW_TYPE) - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - phb->ioda.reserved_pe_idx, win, - idx / PNV_IODA1_M64_SEGS, - idx % PNV_IODA1_M64_SEGS); - else - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - phb->ioda.reserved_pe_idx, win, 0, idx); + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + phb->ioda.reserved_pe_idx, win, 0, idx); if (rc != OPAL_SUCCESS) pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n", @@ -3431,8 +2753,7 @@ static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe) phb->ioda.io_segmap); pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE, phb->ioda.m32_segmap); - pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE, - phb->ioda.m64_segmap); + /* M64 is pre-configured by pnv_ioda1_init_m64() */ } else if (phb->type == PNV_PHB_IODA2) { pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE, phb->ioda.m32_segmap); @@ -3488,17 +2809,27 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) static void pnv_pci_release_device(struct pci_dev *pdev) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; + /* The VF PE state is torn down when sriov_disable() is called */ if (pdev->is_virtfn) return; if (!pdn || pdn->pe_number == IODA_INVALID_PE) return; +#ifdef CONFIG_PCI_IOV + /* + * FIXME: Try move this to sriov_disable(). It's here since we allocate + * the iov state at probe time since we need to fiddle with the IOV + * resources.
+ */ + if (pdev->is_physfn) + kfree(pdev->dev.archdata.iov_data); +#endif + /* * PCI hotplug can happen as part of EEH error recovery. The @pdn * isn't removed and added afterwards in this scenario. We should @@ -3534,8 +2865,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus) { - struct pci_controller *hose = bus->sysdata; - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *pe; list_for_each_entry(pe, &phb->ioda.pe_list, list) { @@ -3760,7 +3090,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx); } else { /* otherwise just allocate one */ - root_pe = pnv_ioda_alloc_pe(phb); + root_pe = pnv_ioda_alloc_pe(phb, 1); phb->ioda.root_pe_idx = root_pe->pe_number; } @@ -3873,8 +3203,7 @@ void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np) static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); if (!machine_is(powernv)) return; diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c new file mode 100644 index 000000000000..c4434f20f42f --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-sriov.c @@ -0,0 +1,766 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/kernel.h> +#include <linux/ioport.h> +#include <linux/bitmap.h> +#include <linux/pci.h> + +#include <asm/opal.h> + +#include "pci.h" + +/* for pci_dev_is_added() */ +#include "../../../../drivers/pci/pci.h" + +/* + * The majority of the complexity in supporting SR-IOV on PowerNV comes from + * the need to put the MMIO space for each VF into a separate PE. Internally + * the PHB maps MMIO addresses to a specific PE using the "Memory BAR Table". 
+ * The MBT historically only applied to the 64bit MMIO window of the PHB + * so it's common to see it referred to as the "M64BT". + * + * An MBT entry stores the mapped range as a <base>,<mask> pair. This forces + * the address range that we want to map to be power-of-two sized and aligned. + * For conventional PCI devices this isn't really an issue since PCI device BARs + * have the same requirement. + * + * For a SR-IOV BAR things are a little more awkward since size and alignment + * are not coupled. The alignment is set based on the per-VF BAR size, but + * the total BAR area is: number-of-vfs * per-vf-size. The number of VFs + * isn't necessarily a power of two, so neither is the total size. To fix that + * we need to finesse (read: hack) the Linux BAR allocator so that it will + * allocate the SR-IOV BARs in a way that lets us map them using the MBT. + * + * The changes to size and alignment that we need to do depend on the "mode" + * of MBT entry that we use. We only support SR-IOV on PHB3 (IODA2) and above, + * so as a baseline we can assume that we have the following BAR modes + * available: + * + * NB: $PE_COUNT is the number of PEs that the PHB supports. + * + * a) A segmented BAR that splits the mapped range into $PE_COUNT equally sized + * segments. The n'th segment is mapped to the n'th PE. + * b) An un-segmented BAR that maps the whole address range to a specific PE. + * + * + * We prefer to use mode a) since it only requires one MBT entry per SR-IOV BAR. + * For comparison, b) requires one entry per-VF per-BAR, or: + * (num-vfs * num-sriov-bars) in total. To use a) we need the size of each segment + * to equal the size of the per-VF BAR area. So: + * + * new_size = per-vf-size * number-of-PEs + * + * The alignment for the SR-IOV BAR also needs to be changed from per-vf-size + * to "new_size", calculated above. Implementing this is a convoluted process + * which requires several hooks in the PCI core: + * + * 1.
In pcibios_add_device() we call pnv_pci_ioda_fixup_iov(). + * + * At this point the device has been probed and the device's BARs are sized, + * but no resource allocations have been done. The SR-IOV BARs are sized + * based on the maximum number of VFs supported by the device and we need + * to increase that to new_size. + * + * 2. Later, when Linux actually assigns resources it tries to make the resource + * allocations for each PCI bus as compact as possible. As a part of that it + * sorts the BARs on a bus by their required alignment, which is calculated + * using pci_resource_alignment(). + * + * For IOV resources this goes: + * pci_resource_alignment() + * pci_sriov_resource_alignment() + * pcibios_sriov_resource_alignment() + * pnv_pci_iov_resource_alignment() + * + * Our hook overrides the default alignment, equal to the per-vf-size, with + * new_size computed above. + * + * 3. When userspace enables VFs for a device: + * + * sriov_enable() + * pcibios_sriov_enable() + * pnv_pcibios_sriov_enable() + * + * This is where we actually allocate PE numbers for each VF and set up the + * MBT mapping for each SR-IOV BAR. In steps 1) and 2) we set up an "arena" + * where each MBT segment is equal in size to the VF BAR so we can shift + * around the actual SR-IOV BAR location within this arena. We need this + * ability because the PE space is shared by all devices on the same PHB. + * When using mode a) described above, segment 0 maps to PE#0, which might + * already be in use by another device on the PHB. + * + * As a result we need to allocate a contiguous range of PE numbers, then shift + * the address programmed into the SR-IOV BAR of the PF so that the address + * of VF0 matches up with the segment corresponding to the first allocated + * PE number. This is handled in pnv_pci_vf_resource_shift(). + * + * Once all that is done we return to the PCI core which then enables VFs, + * scans them and creates pci_devs for each.
The init process for a VF is + * largely the same as a normal device, but the VF is inserted into the IODA + * PE that we allocated for it rather than the PE associated with the bus. + * + * 4. When userspace disables VFs we unwind the above in + * pnv_pcibios_sriov_disable(). Fortunately this is relatively simple since + * we don't need to validate anything, just tear down the mappings and + * move the SR-IOV resource back to its "proper" location. + * + * That's how mode a) works. In theory mode b) (single PE mapping) is less work + * since we can map each individual VF with a separate BAR. However, there are a + * few limitations: + * + * 1) For IODA2 mode b) has a minimum alignment requirement of 32MB. This makes + * it only usable for devices with very large per-VF BARs. Such devices are + * similar to Big Foot. They definitely exist, but I've never seen one. + * + * 2) The number of MBT entries that we have is limited. PHB3 and PHB4 only have + * 16 in total and some are needed for other purposes. Most SR-IOV capable network cards can support + * more than 16 VFs on each port. + * + * We use b) when using a) would use more than 1/4 of the entire 64 bit MMIO + * window of the PHB. + * + * + * + * PHB4 (IODA3) added a few new features that would be useful for SR-IOV. It + * allowed the MBT to map 32bit MMIO space in addition to 64bit which allows + * us to support SR-IOV BARs in the 32bit MMIO window. This is useful since + * the Linux BAR allocation will place any BAR marked as non-prefetchable into + * the non-prefetchable bridge window, which is 32bit only. It also added two + * new modes: + * + * c) A segmented BAR similar to a), but each segment can be individually + * mapped to any PE. This matches how the 32bit MMIO window worked on + * IODA1&2. + * + * d) A segmented BAR with 8, 64, or 128 segments. This works similarly to a), + * but with fewer segments and configurable base PE. + * + * i.e. The n'th segment maps to the (n + base)'th PE.
+ * + * The base PE is also required to be a multiple of the window size. + * + * Unfortunately, the OPAL API doesn't currently (as of skiboot v6.6) allow us + * to exploit any of the IODA3 features. + */ + +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) +{ + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); + struct resource *res; + int i; + resource_size_t vf_bar_sz; + struct pnv_iov_data *iov; + int mul; + + iov = kzalloc(sizeof(*iov), GFP_KERNEL); + if (!iov) + goto disable_iov; + pdev->dev.archdata.iov_data = iov; + mul = phb->ioda.total_pe_num; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || res->parent) + continue; + if (!pnv_pci_is_m64_flags(res->flags)) { + dev_warn(&pdev->dev, "Don't support SR-IOV with non M64 VF BAR%d: %pR. \n", + i, res); + goto disable_iov; + } + + vf_bar_sz = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); + + /* + * Generally, one segmented M64 BAR maps one IOV BAR. However, + * if a VF BAR is too large we end up wasting a lot of space. + * If each VF needs more than 1/4 of the default m64 segment + * then each VF BAR should be mapped in single-PE mode to reduce + * the amount of space required. This does however limit the + * number of VFs we can support. + * + * The 1/4 limit is arbitrary and can be tweaked. + */ + if (vf_bar_sz > (phb->ioda.m64_segsize >> 2)) { + /* + * On PHB3, the minimum size alignment of M64 BAR in + * single mode is 32MB. If this VF BAR is smaller than + * 32MB, but still too large for a segmented window + * then we can't map it and need to disable SR-IOV for + * this device. + */ + if (vf_bar_sz < SZ_32M) { + pci_err(pdev, "VF BAR%d: %pR can't be mapped in single PE mode\n", + i, res); + goto disable_iov; + } + + iov->m64_single_mode[i] = true; + continue; + } + + /* + * This BAR can be mapped with one segmented window, so adjust + * te resource size to accommodate. 
+ */ + pci_dbg(pdev, " Fixing VF BAR%d: %pR to\n", i, res); + res->end = res->start + vf_bar_sz * mul - 1; + pci_dbg(pdev, " %pR\n", res); + + pci_info(pdev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", + i, res, mul); + + iov->need_shift = true; + } + + return; + +disable_iov: + /* Save ourselves some MMIO space by disabling the unusable BARs */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + res->flags = 0; + res->end = res->start - 1; + } + + pdev->dev.archdata.iov_data = NULL; + kfree(iov); +} + +void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev) +{ + if (WARN_ON(pci_dev_is_added(pdev))) + return; + + if (pdev->is_virtfn) { + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); + + /* + * VF PEs are single-device PEs so their pdev pointer needs to + * be set. The pdev doesn't exist when the PE is allocated (in + * (pcibios_sriov_enable()) so we fix it up here. + */ + pe->pdev = pdev; + WARN_ON(!(pe->flags & PNV_IODA_PE_VF)); + } else if (pdev->is_physfn) { + /* + * For PFs adjust their allocated IOV resources to match what + * the PHB can support using it's M64 BAR table. + */ + pnv_pci_ioda_fixup_iov_resources(pdev); + } +} + +resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, + int resno) +{ + resource_size_t align = pci_iov_resource_size(pdev, resno); + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); + struct pnv_iov_data *iov = pnv_iov_get(pdev); + + /* + * iov can be null if we have an SR-IOV device with IOV BAR that can't + * be placed in the m64 space (i.e. The BAR is 32bit or non-prefetch). + * In that case we don't allow VFs to be enabled since one of their + * BARs would not be placed in the correct PE. + */ + if (!iov) + return align; + + /* + * If we're using single mode then we can just use the native VF BAR + * alignment. We validated that it's possible to use a single PE + * window above when we did the fixup. 
+ */ + if (iov->m64_single_mode[resno - PCI_IOV_RESOURCES]) + return align; + + /* + * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the + * SR-IOV. While from hardware perspective, the range mapped by M64 + * BAR should be size aligned. + * + * This function returns the total IOV BAR size if M64 BAR is in + * Shared PE mode or just VF BAR size if not. + * If the M64 BAR is in Single PE mode, return the VF BAR size or + * M64 segment size if IOV BAR size is less. + */ + return phb->ioda.total_pe_num * align; +} + +static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_iov_data *iov; + struct pnv_phb *phb; + int window_id; + + phb = pci_bus_to_pnvhb(pdev->bus); + iov = pnv_iov_get(pdev); + + for_each_set_bit(window_id, iov->used_m64_bar_mask, MAX_M64_BARS) { + opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + 0); + + clear_bit(window_id, &phb->ioda.m64_bar_alloc); + } + + return 0; +} + + +/* + * PHB3 and beyond support segmented windows. The window's address range + * is subdivided into phb->ioda.total_pe_num segments and there's a 1-1 + * mapping between PEs and segments. + */ +static int64_t pnv_ioda_map_m64_segmented(struct pnv_phb *phb, + int window_id, + resource_size_t start, + resource_size_t size) +{ + int64_t rc; + + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + start, + 0, /* unused */ + size); + if (rc) + goto out; + + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + OPAL_ENABLE_M64_SPLIT); +out: + if (rc) + pr_err("Failed to map M64 window #%d: %lld\n", window_id, rc); + + return rc; +} + +static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb, + int pe_num, + int window_id, + resource_size_t start, + resource_size_t size) +{ + int64_t rc; + + /* + * The API for setting up m64 mmio windows seems to have been designed + * with P7-IOC in mind. 
For that chip each M64 BAR (window) had a fixed + * split of 8 equally sized segments each of which could individually + * assigned to a PE. + * + * The problem with this is that the API doesn't have any way to + * communicate the number of segments we want on a BAR. This wasn't + * a problem for p7-ioc since you didn't have a choice, but the + * single PE windows added in PHB3 don't map cleanly to this API. + * + * As a result we've got this slightly awkward process where we + * call opal_pci_map_pe_mmio_window() to put the single in single + * PE mode, and set the PE for the window before setting the address + * bounds. We need to do it this way because the single PE windows + * for PHB3 have different alignment requirements on PHB3. + */ + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe_num, + OPAL_M64_WINDOW_TYPE, + window_id, + 0); + if (rc) + goto out; + + /* + * NB: In single PE mode the window needs to be aligned to 32MB + */ + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + start, + 0, /* ignored by FW, m64 is 1-1 */ + size); + if (rc) + goto out; + + /* + * Now actually enable it. We specified the BAR should be in "non-split" + * mode so FW will validate that the BAR is in single PE mode. 
+ */ + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + OPAL_ENABLE_M64_NON_SPLIT); +out: + if (rc) + pr_err("Error mapping single PE BAR\n"); + + return rc; +} + +static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov) +{ + int win; + + do { + win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, + phb->ioda.m64_bar_idx + 1, 0); + + if (win >= phb->ioda.m64_bar_idx + 1) + return -1; + } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); + + set_bit(win, iov->used_m64_bar_mask); + + return win; +} + +static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_iov_data *iov; + struct pnv_phb *phb; + unsigned int win; + struct resource *res; + int i, j; + int64_t rc; + resource_size_t size, start; + int base_pe_num; + + phb = pci_bus_to_pnvhb(pdev->bus); + iov = pnv_iov_get(pdev); + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + /* don't need single mode? map everything in one go! 
*/ + if (!iov->m64_single_mode[i]) { + win = pnv_pci_alloc_m64_bar(phb, iov); + if (win < 0) + goto m64_failed; + + size = resource_size(res); + start = res->start; + + rc = pnv_ioda_map_m64_segmented(phb, win, start, size); + if (rc) + goto m64_failed; + + continue; + } + + /* otherwise map each VF with single PE BARs */ + size = pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i); + base_pe_num = iov->vf_pe_arr[0].pe_number; + + for (j = 0; j < num_vfs; j++) { + win = pnv_pci_alloc_m64_bar(phb, iov); + if (win < 0) + goto m64_failed; + + start = res->start + size * j; + rc = pnv_ioda_map_m64_single(phb, win, + base_pe_num + j, + start, + size); + if (rc) + goto m64_failed; + } + } + return 0; + +m64_failed: + pnv_pci_vf_release_m64(pdev, num_vfs); + return -EBUSY; +} + +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) +{ + struct pnv_phb *phb; + struct pnv_ioda_pe *pe, *pe_n; + + phb = pci_bus_to_pnvhb(pdev->bus); + + if (!pdev->is_physfn) + return; + + /* FIXME: Use pnv_ioda_release_pe()? */ + list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { + if (pe->parent_dev != pdev) + continue; + + pnv_pci_ioda2_release_pe_dma(pe); + + /* Remove from list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_del(&pe->list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + pnv_ioda_deconfigure_pe(phb, pe); + + pnv_ioda_free_pe(pe); + } +} + +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) +{ + struct resource *res, res2; + struct pnv_iov_data *iov; + resource_size_t size; + u16 num_vfs; + int i; + + if (!dev->is_physfn) + return -EINVAL; + iov = pnv_iov_get(dev); + + /* + * "offset" is in VFs. The M64 windows are sized so that when they + * are segmented, each segment is the same size as the IOV BAR. + * Each segment is in a separate PE, and the high order bits of the + * address are the PE number. Therefore, each VF's BAR is in a + * separate PE, and changing the IOV BAR start address changes the + * range of PEs the VFs are in. 
+ */ + num_vfs = iov->num_vfs; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + if (iov->m64_single_mode[i]) + continue; + + /* + * The actual IOV BAR range is determined by the start address + * and the actual size for num_vfs VFs BAR. This check is to + * make sure that after shifting, the range will not overlap + * with another device. + */ + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2.flags = res->flags; + res2.start = res->start + (size * offset); + res2.end = res2.start + (size * num_vfs) - 1; + + if (res2.end > res->end) { + dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", + i, &res2, res, num_vfs, offset); + return -EBUSY; + } + } + + /* + * Since M64 BAR shares segments among all possible 256 PEs, + * we have to shift the beginning of PF IOV BAR to make it start from + * the segment which belongs to the PE number assigned to the first VF. + * This creates a "hole" in the /proc/iomem which could be used for + * allocating other resources so we reserve this area below and + * release when IOV is released. + */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + if (iov->m64_single_mode[i]) + continue; + + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2 = *res; + res->start += size * offset; + + dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", + i, &res2, res, (offset > 0) ?
"En" : "Dis", + num_vfs, offset); + + if (offset < 0) { + devm_release_resource(&dev->dev, &iov->holes[i]); + memset(&iov->holes[i], 0, sizeof(iov->holes[i])); + } + + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + + if (offset > 0) { + iov->holes[i].start = res2.start; + iov->holes[i].end = res2.start + size * offset - 1; + iov->holes[i].flags = IORESOURCE_BUS; + iov->holes[i].name = "pnv_iov_reserved"; + devm_request_resource(&dev->dev, res->parent, + &iov->holes[i]); + } + } + return 0; +} + +/* + * Undo pnv_pci_sriov_enable() for @pdev: release the VF PEs, shift the IOV + * BARs back if they were shifted (iov->need_shift), and release the M64 + * windows. Warns and returns early if the device has no IOV data. + */ +static void pnv_pci_sriov_disable(struct pci_dev *pdev) +{ + u16 num_vfs, base_pe; + struct pnv_iov_data *iov; + + iov = pnv_iov_get(pdev); + + /* Check before touching iov: the fields below are read through it. */ + if (WARN_ON(!iov)) + return; + + num_vfs = iov->num_vfs; + base_pe = iov->vf_pe_arr[0].pe_number; + + /* Release VF PEs */ + pnv_ioda_release_vf_PE(pdev); + + /* Un-shift the IOV BARs if we need to */ + if (iov->need_shift) + pnv_pci_vf_resource_shift(pdev, -base_pe); + + /* Release M64 windows */ + pnv_pci_vf_release_m64(pdev, num_vfs); +} + +/* + * For each of the @num_vfs VFs of @pdev: fill in the matching entry of + * iov->vf_pe_arr, configure the PE, add it to the PHB's PE list, record + * the PE number in the VF's pci_dn and set up DMA for the PE. A VF whose + * PE fails to configure is skipped. Does nothing unless @pdev is a physfn. + */ +static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; + int pe_num; + u16 vf_index; + struct pnv_iov_data *iov; + struct pci_dn *pdn; + + if (!pdev->is_physfn) + return; + + phb = pci_bus_to_pnvhb(pdev->bus); + pdn = pci_get_pdn(pdev); + iov = pnv_iov_get(pdev); + + /* Reserve PE for each VF */ + for (vf_index = 0; vf_index < num_vfs; vf_index++) { + int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index); + int vf_bus = pci_iov_virtfn_bus(pdev, vf_index); + struct pci_dn *vf_pdn; + + pe = &iov->vf_pe_arr[vf_index]; + pe->phb = phb; + pe->flags = PNV_IODA_PE_VF; + pe->pbus = NULL; + pe->parent_dev = pdev; + pe->mve_number = -1; + pe->rid = (vf_bus << 8) | vf_devfn; + + pe_num = pe->pe_number; + pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n", + pci_domain_nr(pdev->bus), pdev->bus->number, + PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ?
*/ + pnv_ioda_free_pe(pe); + pe->pdev = NULL; + continue; + } + + /* Put PE to the list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_add_tail(&pe->list, &phb->ioda.pe_list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + /* associate this pe to its pdn */ + list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) { + if (vf_pdn->busno == vf_bus && + vf_pdn->devfn == vf_devfn) { + vf_pdn->pe_number = pe_num; + break; + } + } + + pnv_pci_ioda2_setup_dma_pe(phb, pe); + } +} +/* + * Enable path for @num_vfs VFs on @pdev: allocate a block of PEs, map the + * IOV BARs with M64 windows, shift the IOV BARs if iov->need_shift is set, + * then set up the VF PEs. + * + * Returns 0 on success or a negative errno. On failure the M64 windows + * (if the shift failed) and the allocated PEs are released again. + */ +static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_ioda_pe *base_pe; + struct pnv_iov_data *iov; + struct pnv_phb *phb; + int ret; + u16 i; + + phb = pci_bus_to_pnvhb(pdev->bus); + iov = pnv_iov_get(pdev); + + /* + * There are calls to IODA2 PE setup code littered throughout. We could + * probably fix that, but we'd still have problems due to the + * restriction inherent on IODA1 PHBs. + * + * NB: We class IODA3 as IODA2 since they're very similar. + */ + if (phb->type != PNV_PHB_IODA2) { + pci_err(pdev, "SR-IOV is not supported on this PHB\n"); + return -ENXIO; + } + + if (!iov) { + dev_info(&pdev->dev, "don't support this SRIOV device with non 64bit-prefetchable IOV BAR\n"); + return -ENOSPC; + } + + /* allocate a contiguous block of PEs for our VFs */ + base_pe = pnv_ioda_alloc_pe(phb, num_vfs); + if (!base_pe) { + pci_err(pdev, "Unable to allocate PEs for %d VFs\n", num_vfs); + return -EBUSY; + } + + iov->vf_pe_arr = base_pe; + iov->num_vfs = num_vfs; + + /* Assign M64 window accordingly */ + ret = pnv_pci_vf_assign_m64(pdev, num_vfs); + if (ret) { + dev_info(&pdev->dev, "Not enough M64 window resources\n"); + goto m64_failed; + } + + /* + * When using one M64 BAR to map one IOV BAR, we need to shift + * the IOV BAR according to the PE# allocated to the VFs. + * Otherwise, the PE# for the VF will conflict with others.
+ */ + if (iov->need_shift) { + ret = pnv_pci_vf_resource_shift(pdev, base_pe->pe_number); + if (ret) + goto shift_failed; + } + + /* Setup VF PEs */ + pnv_ioda_setup_vf_PE(pdev, num_vfs); + + return 0; + +shift_failed: + pnv_pci_vf_release_m64(pdev, num_vfs); + +m64_failed: + for (i = 0; i < num_vfs; i++) + pnv_ioda_free_pe(&iov->vf_pe_arr[i]); + + return ret; +} +/* + * Run the SR-IOV disable path for @pdev, then call + * remove_sriov_vf_pdns(). Always returns 0. + */ +int pnv_pcibios_sriov_disable(struct pci_dev *pdev) +{ + pnv_pci_sriov_disable(pdev); + + /* Release PCI data */ + remove_sriov_vf_pdns(pdev); + return 0; +} +/* + * Call add_sriov_vf_pdns() and then run the SR-IOV enable path; returns + * the result of pnv_pci_sriov_enable(). + * NOTE(review): nothing here undoes add_sriov_vf_pdns() when enabling + * fails; confirm that this is intended. + */ +int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + /* Allocate PCI data */ + add_sriov_vf_pdns(pdev); + + return pnv_pci_sriov_enable(pdev, num_vfs); +} diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 091fe1cf386b..9b9bca169275 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -162,8 +162,7 @@ EXPORT_SYMBOL_GPL(pnv_pci_set_power_state); int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct msi_desc *entry; struct msi_msg msg; int hwirq; @@ -211,8 +210,7 @@ int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) void pnv_teardown_msi_irqs(struct pci_dev *pdev) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct msi_desc *entry; irq_hw_number_t hwirq; @@ -824,10 +822,9 @@ EXPORT_SYMBOL(pnv_pci_get_phb_node); int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable) { - __be64 val; - struct pci_controller *hose; - struct pnv_phb *phb; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); u64 tunnel_bar; + __be64 val; int rc; if (!opal_check_token(OPAL_PCI_GET_PBCQ_TUNNEL_BAR)) @@ -835,9 +832,6 @@ int pnv_pci_set_tunnel_bar(struct
pci_dev *dev, u64 addr, int enable) if (!opal_check_token(OPAL_PCI_SET_PBCQ_TUNNEL_BAR)) return -ENXIO; - hose = pci_bus_to_host(dev->bus); - phb = hose->private_data; - mutex_lock(&tunnel_mutex); rc = opal_pci_get_pbcq_tunnel_bar(phb->opal_id, &val); if (rc != OPAL_SUCCESS) { diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 51c254f2f3cb..739a0b3b72e1 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -87,7 +87,14 @@ struct pnv_ioda_pe { bool tce_bypass_enabled; uint64_t tce_bypass_base; - /* MSIs. MVE index is identical for for 32 and 64 bit MSI + /* + * Used to track whether we've done DMA setup for this PE or not. We + * want to defer allocating TCE tables, etc until we've added a + * non-bridge device to the PE. + */ + bool dma_setup_done; + + /* MSIs. MVE index is identical for 32 and 64 bit MSI * and -1 if not supported. (It's actually identical to the * PE number) */ @@ -147,6 +154,7 @@ struct pnv_phb { unsigned long m64_size; unsigned long m64_segsize; unsigned long m64_base; +#define MAX_M64_BARS 64 unsigned long m64_bar_alloc; /* IO ports */ @@ -187,6 +195,89 @@ struct pnv_phb { u8 *diag_data; }; + +/* IODA PE management */ + +static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r) +{ + /* + * WARNING: We cannot rely on the resource flags. The Linux PCI + * allocation code sometimes decides to put a 64-bit prefetchable + * BAR in the 32-bit window, so we have to compare the addresses. + * + * For simplicity we only test resource start.
+ */ + return (r->start >= phb->ioda.m64_base && + r->start < (phb->ioda.m64_base + phb->ioda.m64_size)); +} +/* True only when both IORESOURCE_MEM_64 and IORESOURCE_PREFETCH are set. */ +static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags) +{ + unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); + + return (resource_flags & flags) == flags; +} + +int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); +int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); + +void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); +void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe); + +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count); +void pnv_ioda_free_pe(struct pnv_ioda_pe *pe); + +#ifdef CONFIG_PCI_IOV +/* + * For SR-IOV we want to put each VF's MMIO resource into a separate PE. + * This requires a bit of acrobatics with the MMIO -> PE configuration + * and this structure is used to keep track of it all. + */ +struct pnv_iov_data { + /* number of VFs enabled */ + u16 num_vfs; + + /* pointer to the array of VF PEs, num_vfs entries long */ + struct pnv_ioda_pe *vf_pe_arr; + + /* Did we map the VF BAR with single-PE IODA BARs? */ + bool m64_single_mode[PCI_SRIOV_NUM_BARS]; + + /* + * True if we're using any segmented windows. In that case we need to + * shift the start of the IOV resource to the segment corresponding to + * the allocated PE. + */ + bool need_shift; + + /* + * Bit mask used to track which m64 windows are used to map the + * SR-IOV BARs for this device. + */ + DECLARE_BITMAP(used_m64_bar_mask, MAX_M64_BARS); + + /* + * If we map the SR-IOV BARs with a segmented window then + * parts of that window will be "claimed" by other PEs. + * + * "holes" here is used to reserve the leading portion + * of the window that is used by other (non VF) PEs.
+ */ + struct resource holes[PCI_SRIOV_NUM_BARS]; +}; +/* Return the SR-IOV bookkeeping kept in pdev->dev.archdata.iov_data. */ +static inline struct pnv_iov_data *pnv_iov_get(struct pci_dev *pdev) +{ + return pdev->dev.archdata.iov_data; +} + +void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev); +resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno); + +int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs); +int pnv_pcibios_sriov_disable(struct pci_dev *pdev); +#endif /* CONFIG_PCI_IOV */ + extern struct pci_ops pnv_pci_ops; void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, @@ -260,4 +351,14 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); +static inline struct pnv_phb *pci_bus_to_pnvhb(struct pci_bus *bus) +{ + struct pci_controller *hose = bus->sysdata; + + if (hose) + return hose->private_data; + + /* NOTE(review): callers visible in this diff dereference the result without a NULL check */ + return NULL; +} + #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 3bc188da82ba..7fcb88623081 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -399,7 +399,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static unsigned long pnv_memory_block_size(void) { - return 256UL * 1024 * 1024; + /* + * We map the kernel linear region with 1GB large pages on radix. For + * memory hot unplug to work our memory block size must be at least + * this size. + */ + if (radix_enabled()) + return radix_mem_block_size; + else + return 256UL * 1024 * 1024; } #endif |