From 8e40612f6146da1333e9bb5cfd9af7511c063d93 Mon Sep 17 00:00:00 2001 From: Jia He Date: Mon, 10 Oct 2022 02:35:54 +0000 Subject: EDAC/ghes: Add a notifier for reporting memory errors In order to make ghes_edac a proper module and disentangle it from the GHES facilities, add a notifier for reporting memory errors. Use an atomic notifier because call sites like ghes_proc_in_irq() run in interrupt context. [ bp: Massage commit message. ] Suggested-by: Borislav Petkov Signed-off-by: Jia He Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221010023559.69655-3-justin.he@arm.com --- include/acpi/ghes.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include/acpi') diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 34fb3431a8f3..5cbd38b6e4e1 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -76,18 +76,11 @@ int ghes_estatus_pool_init(int num_ghes); /* From drivers/edac/ghes_edac.c */ #ifdef CONFIG_EDAC_GHES -void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err); - int ghes_edac_register(struct ghes *ghes, struct device *dev); void ghes_edac_unregister(struct ghes *ghes); #else -static inline void ghes_edac_report_mem_error(int sev, - struct cper_sec_mem_err *mem_err) -{ -} - static inline int ghes_edac_register(struct ghes *ghes, struct device *dev) { return -ENODEV; @@ -145,4 +138,7 @@ int ghes_notify_sea(void); static inline int ghes_notify_sea(void) { return -ENOENT; } #endif +struct notifier_block; +extern void ghes_register_report_chain(struct notifier_block *nb); +extern void ghes_unregister_report_chain(struct notifier_block *nb); #endif /* GHES_H */ -- cgit v1.2.3 From 9057a3f7ac360e068ceb261938e9ae2b1a7e654c Mon Sep 17 00:00:00 2001 From: Jia He Date: Mon, 10 Oct 2022 02:35:55 +0000 Subject: EDAC/ghes: Prepare to make ghes_edac a proper module To make ghes_edac a proper module, prepare to decouple its dependencies from GHES. 
Move the ghes_edac.force_load parameter to ghes.c in order to properly control whether ghes_edac should be force-loaded: In ghes_edac_register() it is too late to set the module flag. Introduce a helper ghes_get_devices(), which returns the list of GHES devices which got probed when the platform-check passes on the system. The previous force_load check is not needed in ghes_edac_unregister() since it will be checked in the module's init function of ghes_edac later. [ bp: Massage. ] Suggested-by: Toshi Kani Suggested-by: Borislav Petkov Signed-off-by: Jia He Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221010023559.69655-4-justin.he@arm.com --- drivers/acpi/apei/ghes.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/edac/ghes_edac.c | 35 ++------------------------------- include/acpi/ghes.h | 6 ++++++ 3 files changed, 58 insertions(+), 33 deletions(-) (limited to 'include/acpi') diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 55013e024ba3..acab512741f6 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -109,6 +109,13 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes) bool ghes_disable; module_param_named(disable, ghes_disable, bool, 0); +/* + * "ghes.edac_force_enable" forcibly enables ghes_edac and skips the platform + * check. + */ +static bool ghes_edac_force_enable; +module_param_named(edac_force_enable, ghes_edac_force_enable, bool, 0); + /* * All error sources notified with HED (Hardware Error Device) share a * single notifier callback, so they need to be linked and checked one @@ -120,6 +127,13 @@ module_param_named(disable, ghes_disable, bool, 0); static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); +/* + * A list of GHES devices which are given to the corresponding EDAC driver + * ghes_edac for further use. 
+ */ +static LIST_HEAD(ghes_devs); +static DEFINE_MUTEX(ghes_devs_mutex); + /* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer @@ -1380,6 +1394,12 @@ static int ghes_probe(struct platform_device *ghes_dev) ghes_edac_register(ghes, &ghes_dev->dev); + ghes->dev = &ghes_dev->dev; + + mutex_lock(&ghes_devs_mutex); + list_add_tail(&ghes->elist, &ghes_devs); + mutex_unlock(&ghes_devs_mutex); + /* Handle any pending errors right away */ spin_lock_irqsave(&ghes_notify_lock_irq, flags); ghes_proc(ghes); @@ -1444,6 +1464,10 @@ static int ghes_remove(struct platform_device *ghes_dev) ghes_edac_unregister(ghes); + mutex_lock(&ghes_devs_mutex); + list_del(&ghes->elist); + mutex_unlock(&ghes_devs_mutex); + kfree(ghes); platform_set_drvdata(ghes_dev, NULL); @@ -1500,6 +1524,32 @@ void __init acpi_ghes_init(void) pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n"); } +/* + * Known x86 systems that prefer GHES error reporting: + */ +static struct acpi_platform_list plat_list[] = { + {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions}, + { } /* End */ +}; + +struct list_head *ghes_get_devices(void) +{ + int idx = -1; + + if (IS_ENABLED(CONFIG_X86)) { + idx = acpi_match_platform_list(plat_list); + if (idx < 0) { + if (!ghes_edac_force_enable) + return NULL; + + pr_warn_once("Force-loading ghes_edac on an unsupported platform. 
You're on your own!\n"); + } + } + + return &ghes_devs; +} +EXPORT_SYMBOL_GPL(ghes_get_devices); + void ghes_register_report_chain(struct notifier_block *nb) { atomic_notifier_chain_register(&ghes_report_chain, nb); diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 7b8d56a769f6..b85a545d1cb0 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -54,10 +54,6 @@ static DEFINE_MUTEX(ghes_reg_mutex); */ static DEFINE_SPINLOCK(ghes_lock); -/* "ghes_edac.force_load=1" skips the platform check */ -static bool __read_mostly force_load; -module_param(force_load, bool, 0); - static bool system_scanned; /* Memory Device - Type 17 of SMBIOS spec */ @@ -387,14 +383,6 @@ static struct notifier_block ghes_edac_mem_err_nb = { .priority = 0, }; -/* - * Known systems that are safe to enable this module. - */ -static struct acpi_platform_list plat_list[] = { - {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions}, - { } /* End */ -}; - int ghes_edac_register(struct ghes *ghes, struct device *dev) { bool fake = false; @@ -402,19 +390,8 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) struct ghes_pvt *pvt; struct edac_mc_layer layers[1]; unsigned long flags; - int idx = -1; int rc = 0; - if (IS_ENABLED(CONFIG_X86)) { - /* Check if safe to enable on this system */ - idx = acpi_match_platform_list(plat_list); - if (!force_load && idx < 0) - return -ENODEV; - } else { - force_load = true; - idx = 0; - } - /* finish another registration/unregistration instance first */ mutex_lock(&ghes_reg_mutex); @@ -458,15 +435,10 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); pr_info("work on such system. 
Use this driver with caution\n"); - } else if (idx < 0) { - pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n"); - pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n"); - pr_info("So, the end result of using this driver varies from vendor to vendor.\n"); - pr_info("If you find incorrect reports, please contact your hardware vendor\n"); - pr_info("to correct its BIOS.\n"); - pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms); } + pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms); + if (!fake) { struct dimm_info *src, *dst; int i = 0; @@ -535,9 +507,6 @@ void ghes_edac_unregister(struct ghes *ghes) struct mem_ctl_info *mci; unsigned long flags; - if (!force_load) - return; - mutex_lock(&ghes_reg_mutex); system_scanned = false; diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 5cbd38b6e4e1..ce693e9f07a0 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -27,6 +27,8 @@ struct ghes { struct timer_list timer; unsigned int irq; }; + struct device *dev; + struct list_head elist; }; struct ghes_estatus_node { @@ -80,6 +82,8 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev); void ghes_edac_unregister(struct ghes *ghes); +struct list_head *ghes_get_devices(void); + #else static inline int ghes_edac_register(struct ghes *ghes, struct device *dev) { @@ -89,6 +93,8 @@ static inline int ghes_edac_register(struct ghes *ghes, struct device *dev) static inline void ghes_edac_unregister(struct ghes *ghes) { } + +static inline struct list_head *ghes_get_devices(void) { return NULL; } #endif static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata) -- cgit v1.2.3 From 802e7f1dfed7cc7fb309995e0c4138f08977fdfc Mon Sep 17 00:00:00 2001 From: Jia He Date: Mon, 10 Oct 2022 02:35:56 +0000 Subject: EDAC/ghes: Make ghes_edac a proper module Commit dc4e8c07e9e2 ("ACPI: APEI: explicit init of HEST and GHES in apci_init()") introduced a bug leading 
to ghes_edac_register() being invoked before edac_init(). Because at that time the bus "edac" hadn't even been registered, this created sysfs nodes as /devices/mc0 instead of /sys/devices/system/edac/mc/mc0 on an Ampere eMag server. Fix this by turning ghes_edac into a proper module. The list of GHES devices returned is not protected from being modified concurrently but it is pretty static as it gets created only during GHES init and the latter is not a module so... [ bp: Massage. ] Fixes: dc4e8c07e9e2 ("ACPI: APEI: explicit init of HEST and GHES in apci_init()") Co-developed-by: Borislav Petkov Signed-off-by: Borislav Petkov Signed-off-by: Jia He Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221010023559.69655-5-justin.he@arm.com --- drivers/acpi/apei/ghes.c | 4 ---- drivers/edac/Kconfig | 4 ++-- drivers/edac/ghes_edac.c | 40 ++++++++++++++++++++++++++++++++++++++-- include/acpi/ghes.h | 22 ++-------------------- 4 files changed, 42 insertions(+), 28 deletions(-) (limited to 'include/acpi') diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index acab512741f6..249cd01cb920 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1392,8 +1392,6 @@ static int ghes_probe(struct platform_device *ghes_dev) platform_set_drvdata(ghes_dev, ghes); - ghes_edac_register(ghes, &ghes_dev->dev); - ghes->dev = &ghes_dev->dev; mutex_lock(&ghes_devs_mutex); @@ -1462,8 +1460,6 @@ static int ghes_remove(struct platform_device *ghes_dev) ghes_fini(ghes); - ghes_edac_unregister(ghes); - mutex_lock(&ghes_devs_mutex); list_del(&ghes->elist); mutex_unlock(&ghes_devs_mutex); diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 456602d373b7..cde0849cf861 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -53,8 +53,8 @@ config EDAC_DECODE_MCE has been initialized. 
config EDAC_GHES - bool "Output ACPI APEI/GHES BIOS detected errors via EDAC" - depends on ACPI_APEI_GHES && (EDAC=y) + tristate "Output ACPI APEI/GHES BIOS detected errors via EDAC" + depends on ACPI_APEI_GHES select UEFI_CPER help Not all machines support hardware-driven error report. Some of those diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index b85a545d1cb0..cf2b618c1ada 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -56,6 +56,8 @@ static DEFINE_SPINLOCK(ghes_lock); static bool system_scanned; +static struct list_head *ghes_devs; + /* Memory Device - Type 17 of SMBIOS spec */ struct memdev_dmi_entry { u8 type; @@ -383,7 +385,7 @@ static struct notifier_block ghes_edac_mem_err_nb = { .priority = 0, }; -int ghes_edac_register(struct ghes *ghes, struct device *dev) +static int ghes_edac_register(struct device *dev) { bool fake = false; struct mem_ctl_info *mci; @@ -502,7 +504,7 @@ unlock: return rc; } -void ghes_edac_unregister(struct ghes *ghes) +static void ghes_edac_unregister(struct ghes *ghes) { struct mem_ctl_info *mci; unsigned long flags; @@ -535,3 +537,37 @@ void ghes_edac_unregister(struct ghes *ghes) unlock: mutex_unlock(&ghes_reg_mutex); } + +static int __init ghes_edac_init(void) +{ + struct ghes *g, *g_tmp; + + ghes_devs = ghes_get_devices(); + if (!ghes_devs) + return -ENODEV; + + if (list_empty(ghes_devs)) { + pr_info("GHES probing device list is empty"); + return -ENODEV; + } + + list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) { + ghes_edac_register(g->dev); + } + + return 0; +} +module_init(ghes_edac_init); + +static void __exit ghes_edac_exit(void) +{ + struct ghes *g, *g_tmp; + + list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) { + ghes_edac_unregister(g); + } +} +module_exit(ghes_edac_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Output ACPI APEI/GHES BIOS detected errors via EDAC"); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index ce693e9f07a0..2e785d3554d8 100644 --- 
a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -71,32 +71,14 @@ int ghes_register_vendor_record_notifier(struct notifier_block *nb); * @nb: pointer to the notifier_block structure of the vendor record handler. */ void ghes_unregister_vendor_record_notifier(struct notifier_block *nb); -#endif - -int ghes_estatus_pool_init(int num_ghes); - -/* From drivers/edac/ghes_edac.c */ - -#ifdef CONFIG_EDAC_GHES -int ghes_edac_register(struct ghes *ghes, struct device *dev); - -void ghes_edac_unregister(struct ghes *ghes); struct list_head *ghes_get_devices(void); - #else -static inline int ghes_edac_register(struct ghes *ghes, struct device *dev) -{ - return -ENODEV; -} - -static inline void ghes_edac_unregister(struct ghes *ghes) -{ -} - static inline struct list_head *ghes_get_devices(void) { return NULL; } #endif +int ghes_estatus_pool_init(int num_ghes); + static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata) { return gdata->revision >> 8; -- cgit v1.2.3