From 3e0fdec858d82c829774f271e88b5ceb17051551 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 7 Apr 2020 09:55:10 +0200 Subject: x86/mce/amd, edac: Remove report_gart_errors ... because no one should be interested in spurious MCEs anyway. Make the filtering unconditional and move it to amd_filter_mce(). Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200407163414.18058-2-bp@alien8.de --- drivers/edac/mce_amd.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'drivers/edac/mce_amd.c') diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 8874b7722b2f..e58644d9c92b 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -10,15 +10,8 @@ static struct amd_decoder_ops fam_ops; static u8 xec_mask = 0xf; -static bool report_gart_errors; static void (*decode_dram_ecc)(int node_id, struct mce *m); -void amd_report_gart_errors(bool v) -{ - report_gart_errors = v; -} -EXPORT_SYMBOL_GPL(amd_report_gart_errors); - void amd_register_ecc_decoder(void (*f)(int, struct mce *)) { decode_dram_ecc = f; @@ -1030,20 +1023,6 @@ static inline void amd_decode_err_code(u16 ec) pr_cont("\n"); } -/* - * Filter out unwanted MCE signatures here. - */ -static bool ignore_mce(struct mce *m) -{ - /* - * NB GART TLB error reporting is disabled by default. - */ - if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors) - return true; - - return false; -} - static const char *decode_error_status(struct mce *m) { if (m->status & MCI_STATUS_UC) { @@ -1067,9 +1046,6 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) unsigned int fam = x86_family(m->cpuid); int ecc; - if (ignore_mce(m)) - return NOTIFY_STOP; - pr_emerg(HW_ERR "%s\n", decode_error_status(m)); pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", -- cgit v1.2.3 From 23ba710a0864108910c7531dc4c73ef65eca5568 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:17 -0800 Subject: x86/mce: Fix all mce notifiers to update the mce->kflags bitmask If the handler took any action to log or deal with the error, set a bit in mce->kflags so that the default handler on the end of the machine check chain can see what has been done. Get rid of NOTIFY_STOP returns. Make the EDAC and dev-mcelog handlers skip over errors already processed by CEC. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-5-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 4 +++- arch/x86/kernel/cpu/mce/dev-mcelog.c | 5 +++++ drivers/acpi/acpi_extlog.c | 5 +++-- drivers/acpi/nfit/mce.c | 1 + drivers/edac/i7core_edac.c | 5 +++-- drivers/edac/mce_amd.c | 6 +++++- drivers/edac/pnd2_edac.c | 5 +++-- drivers/edac/sb_edac.c | 5 ++++- drivers/edac/skx_common.c | 4 ++++ drivers/ras/cec.c | 9 ++++++--- 10 files changed, 37 insertions(+), 12 deletions(-) (limited to 'drivers/edac/mce_amd.c') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b033b3589630..5666a48a4bc9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -581,8 +581,10 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, return NOTIFY_DONE; pfn = mce->addr >> PAGE_SHIFT; - if (!memory_failure(pfn, 0)) + if (!memory_failure(pfn, 0)) { set_mce_nospec(pfn); + mce->kflags |= MCE_HANDLED_UC; + } return NOTIFY_OK; } diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index d089567a9ce8..c033e7ea9e3c 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -39,6 +39,9 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, struct mce *mce = (struct mce *)data; unsigned int entry; + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + mutex_lock(&mce_chrdev_read_mutex); entry = mcelog->next; @@ -56,6 +59,7 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, memcpy(mcelog->entry + entry, mce, sizeof(struct mce)); mcelog->entry[entry].finished = 1; + mcelog->entry[entry].kflags = 0; /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); @@ -63,6 +67,7 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, unlock: mutex_unlock(&mce_chrdev_read_mutex); + mce->kflags |= MCE_HANDLED_MCELOG; return NOTIFY_OK; } diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 8596a106a933..9cc3c1f92db5 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -146,7 +146,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, static u32 err_seq; estatus = extlog_elog_entry_check(cpu, bank); - if (estatus == NULL) + if (estatus == NULL || (mce->kflags & MCE_HANDLED_CEC)) return NOTIFY_DONE; memcpy(elog_buf, (void *)estatus, ELOG_ENTRY_LEN); @@ -176,7 +176,8 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, } out: - return NOTIFY_STOP; + mce->kflags |= MCE_HANDLED_EXTLOG; + return NOTIFY_OK; } static bool __init extlog_get_l1addr(void) diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index f0ae48515b48..ee8d9973f60b 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -76,6 +76,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, */ acpi_nfit_ars_rescan(acpi_desc, 0); } + mce->kflags |= MCE_HANDLED_NFIT; break; } diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index b3135b208f9a..5860ca41185c 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -1815,7 +1815,7 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val, struct mem_ctl_info *mci; i7_dev = get_i7core_dev(mce->socketid); - if (!i7_dev) + if (!i7_dev || (mce->kflags & MCE_HANDLED_CEC)) return NOTIFY_DONE; mci = i7_dev->mci; @@ -1834,7 +1834,8 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val, i7core_check_error(mci, mce); /* Advise mcelog that the errors were handled */ - return NOTIFY_STOP; + mce->kflags |= MCE_HANDLED_EDAC; + return NOTIFY_OK; } static struct notifier_block i7_mce_dec = { diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e58644d9c92b..2b5401db56ad 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1046,6 +1046,9 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) unsigned int fam = x86_family(m->cpuid); int ecc; + if (m->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + pr_emerg(HW_ERR "%s\n", decode_error_status(m)); pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", @@ -1146,7 +1149,8 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) err_code: amd_decode_err_code(m->status & 0xffff); - return NOTIFY_STOP; + m->kflags |= MCE_HANDLED_EDAC; + return NOTIFY_OK; } static struct notifier_block amd_mce_dec_nb = { diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index bc47328eb485..1929a5dc8f94 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -1400,7 +1400,7 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo return NOTIFY_DONE; mci = pnd2_mci; - if (!mci) + if (!mci || (mce->kflags & MCE_HANDLED_CEC)) return NOTIFY_DONE; /* @@ -1429,7 +1429,8 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo pnd2_mce_output_error(mci, mce, &daddr); /* Advice mcelog that the error were handled */ - return NOTIFY_STOP; + mce->kflags |= MCE_HANDLED_EDAC; + return NOTIFY_OK; } static struct notifier_block pnd2_mce_dec = { diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 7d51c82be62b..f790f7d08688 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3136,6 +3136,8 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, if (edac_get_report_status() == EDAC_REPORTING_DISABLED) return NOTIFY_DONE; + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; /* * Just let mcelog handle it if the error is @@ -3183,7 +3185,8 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, sbridge_mce_output_error(mci, mce); /* Advice mcelog that the error were handled */ - return NOTIFY_STOP; + mce->kflags |= MCE_HANDLED_EDAC; + return NOTIFY_OK; } static struct notifier_block sbridge_mce_dec = { diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 99bbaf629b8d..6f08a12f6b11 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -577,6 +577,9 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, if (edac_get_report_status() == EDAC_REPORTING_DISABLED) return NOTIFY_DONE; + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + /* ignore unless this is memory related with an address */ if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) return NOTIFY_DONE; @@ -616,6 +619,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, skx_mce_output_error(mci, mce, &res); + mce->kflags |= MCE_HANDLED_EDAC; return NOTIFY_DONE; } diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index 6b42040bf956..569d9ad2c594 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -538,9 +538,12 @@ static int cec_notifier(struct notifier_block *nb, unsigned long val, /* We eat only correctable DRAM errors with usable addresses. */ if (mce_is_memory_error(m) && mce_is_correctable(m) && - mce_usable_address(m)) - if (!cec_add_elem(m->addr >> PAGE_SHIFT)) - return NOTIFY_STOP; + mce_usable_address(m)) { + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) { + m->kflags |= MCE_HANDLED_CEC; + return NOTIFY_OK; + } + } return NOTIFY_DONE; } -- cgit v1.2.3