From c545f5e412250555bd4e717d062b117f20bab418 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 10 Jul 2023 09:32:32 +0800 Subject: EDAC/i10nm: Skip the absent memory controllers Some Sapphire Rapids workstations' absent memory controllers still appear as PCIe devices that fool the i10nm_edac driver and result in "shift exponent -66 is negative" call traces from skx_get_dimm_info(). Skip the absent memory controllers to avoid the call traces. Reported-by: Kai-Heng Feng Closes: https://lore.kernel.org/linux-edac/CAAd53p41Ku1m1rapeqb1xtD+kKuk+BaUW=dumuoF0ZO3GhFjFA@mail.gmail.com/T/#m5de16dce60a8c836ec235868c7c16e3fefad0cc2 Tested-by: Kai-Heng Feng Reported-by: Koba Ko Closes: https://lore.kernel.org/linux-edac/SA1PR11MB71305B71CCCC3D9305835202892AA@SA1PR11MB7130.namprd11.prod.outlook.com/T/#t Tested-by: Koba Ko Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors") Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20230710013232.59712-1-qiuxu.zhuo@intel.com --- drivers/edac/i10nm_base.c | 54 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index a897b6aff368..349ff6cfb379 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -658,13 +658,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi return mdev; } +/** + * i10nm_imc_absent() - Check whether the memory controller @imc is absent + * + * @imc : The pointer to the structure of memory controller EDAC device. + * + * RETURNS : true if the memory controller EDAC device is absent, false otherwise. 
+ */ +static bool i10nm_imc_absent(struct skx_imc *imc) +{ + u32 mcmtr; + int i; + + switch (res_cfg->type) { + case SPR: + for (i = 0; i < res_cfg->ddr_chan_num; i++) { + mcmtr = I10NM_GET_MCMTR(imc, i); + edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr); + if (mcmtr != ~0) + return false; + } + + /* + * Some workstations' absent memory controllers still + * appear as PCIe devices, misleading the EDAC driver. + * By observing that the MMIO registers of these absent + * memory controllers consistently hold the value of ~0. + * + * We identify a memory controller as absent by checking + * if its MMIO register "mcmtr" == ~0 in all its channels. + */ + return true; + default: + return false; + } +} + static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; void __iomem *mbase; unsigned long size; struct skx_dev *d; - int i, j = 0; + int i, lmc, j = 0; u32 reg, off; u64 base; @@ -690,7 +726,7 @@ static int i10nm_get_ddr_munits(void) edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", j++, base, reg); - for (i = 0; i < res_cfg->ddr_imc_num; i++) { + for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) { mdev = get_ddr_munit(d, i, &off, &size); if (i == 0 && !mdev) { @@ -700,8 +736,6 @@ static int i10nm_get_ddr_munits(void) if (!mdev) continue; - d->imc[i].mdev = mdev; - edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n", i, base + off, size, reg); @@ -712,7 +746,17 @@ static int i10nm_get_ddr_munits(void) return -ENODEV; } - d->imc[i].mbase = mbase; + d->imc[lmc].mbase = mbase; + if (i10nm_imc_absent(&d->imc[lmc])) { + pci_dev_put(mdev); + iounmap(mbase); + d->imc[lmc].mbase = NULL; + edac_dbg(2, "Skip absent mc%d\n", i); + continue; + } else { + d->imc[lmc].mdev = mdev; + lmc++; + } } } -- cgit v1.2.3 From ce53ad81ed36c24aff075f94474adecfabfcf239 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 25 Jul 2023 16:04:27 +0800 Subject: EDAC/igen6: Fix the issue of no error events Current igen6_edac checks for pending errors before the registration of the error 
handler. However, there is a possibility that an error occurs during the registration process, leading to unhandled pending errors and no future error events. This issue can be reproduced by repeatedly injecting errors during the loading of the igen6_edac driver. Fix this issue by moving the check for pending errors to after the registration of the error handler, ensuring that no pending errors are left unhandled. Fixes: 10590a9d4f23 ("EDAC/igen6: Add EDAC driver for Intel client SoCs using IBECC") Reported-by: Ee Wey Lim Tested-by: Ee Wey Lim Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20230725080427.23883-1-qiuxu.zhuo@intel.com --- drivers/edac/igen6_edac.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index 544dd19072ea..1a18693294db 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -27,7 +27,7 @@ #include "edac_mc.h" #include "edac_module.h" -#define IGEN6_REVISION "v2.5" +#define IGEN6_REVISION "v2.5.1" #define EDAC_MOD_STR "igen6_edac" #define IGEN6_NMI_NAME "igen6_ibecc" @@ -1216,9 +1216,6 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&ecclog_work, ecclog_work_cb); init_irq_work(&ecclog_irq_work, ecclog_irq_work_cb); - /* Check if any pending errors before registering the NMI handler */ - ecclog_handler(); - rc = register_err_handler(); if (rc) goto fail3; @@ -1230,6 +1227,9 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto fail4; } + /* Check if any pending errors before/during the registration of the error handler */ + ecclog_handler(); + igen6_debug_setup(); return 0; fail4: -- cgit v1.2.3