From ab16ab3d0de4dc9d130ae3db366c38888f1ada5a Mon Sep 17 00:00:00 2001 From: "Jason M. Bills" Date: Mon, 19 Apr 2021 12:13:22 -0700 Subject: Update to internal 0.45 Signed-off-by: Jason M. Bills --- ...onfigure-host-error-monitors-for-meta-wht.patch | 194 +++++++++++++++++++++ ...ory-thermtrip-events-based-on-DIMM-status.patch | 169 ------------------ ...-Add-a-workaround-for-spurious-CPU-errors.patch | 133 -------------- ...ory-thermtrip-events-based-on-DIMM-status.patch | 143 +++++++++++++++ .../host-error-monitor_%.bbappend | 6 +- .../recipes-core/libpeci/libpeci/99-peci.rules | 2 + .../recipes-core/libpeci/libpeci_%.bbappend | 8 + 7 files changed, 350 insertions(+), 305 deletions(-) create mode 100644 meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Configure-host-error-monitors-for-meta-wht.patch delete mode 100644 meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch delete mode 100644 meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch create mode 100644 meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Filter-memory-thermtrip-events-based-on-DIMM-status.patch create mode 100644 meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci/99-peci.rules create mode 100644 meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci_%.bbappend (limited to 'meta-openbmc-mods/meta-wht') diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Configure-host-error-monitors-for-meta-wht.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Configure-host-error-monitors-for-meta-wht.patch new file mode 100644 index 000000000..17f16cce5 --- /dev/null +++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Configure-host-error-monitors-for-meta-wht.patch @@ -0,0 +1,194 @@ +From 1b2df626b20aa14c0de7f46915758d10394d01b4 Mon Sep 17 00:00:00 2001 +From: "Jason M. Bills" +Date: Tue, 15 Dec 2020 10:05:31 -0800 +Subject: [PATCH] Configure host error monitors for meta-wht + +The new host error monitor architecture allows the list of error +monitors to be customized through a platform-specific patch file. + +This patch configures the host error monitors for meta-wht. + +Change-Id: I7070a3409b1471d7f9c93eca3e36b477f484e5d7 +Signed-off-by: Jason M. Bills +--- + include/error_monitors.hpp | 137 ++++++++++++++++++++++++++++++++++--- + 1 file changed, 129 insertions(+), 8 deletions(-) + +diff --git a/include/error_monitors.hpp b/include/error_monitors.hpp +index 55b8790d..8d1651d3 100644 +--- a/include/error_monitors.hpp ++++ b/include/error_monitors.hpp +@@ -14,23 +14,88 @@ + // limitations under the License. + */ + #pragma once ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include + #include +-// #include + + #include + + namespace host_error_monitor::error_monitors + { + // Error signals to monitor +-// static std::unique_ptr +-// smiMonitor; ++static std::unique_ptr smiMonitor; ++static std::unique_ptr< ++ host_error_monitor::cpu_mismatch_monitor::CPUMismatchMonitor> ++ cpu1MismatchMonitor; ++static std::unique_ptr< ++ host_error_monitor::cpu_mismatch_monitor::CPUMismatchMonitor> ++ cpu2MismatchMonitor; ++static std::unique_ptr ++ err0Monitor; ++static std::unique_ptr ++ err1Monitor; ++static std::unique_ptr ++ err2Monitor; ++static std::unique_ptr ++ ierrMonitor; ++static std::unique_ptr< ++ host_error_monitor::cpu_thermtrip_monitor::CPUThermtripMonitor> ++ cpu1ThermtripMonitor; ++static std::unique_ptr< ++ host_error_monitor::cpu_thermtrip_monitor::CPUThermtripMonitor> ++ cpu2ThermtripMonitor; ++static std::unique_ptr< ++ host_error_monitor::mem_thermtrip_monitor::MemThermtripMonitor> ++ mem1ThermtripMonitor; ++static std::unique_ptr< ++ host_error_monitor::mem_thermtrip_monitor::MemThermtripMonitor> ++ mem2ThermtripMonitor; ++static std::unique_ptr ++ cpu1VRHotMonitor; ++static std::unique_ptr ++ cpu1MemABCDVRHotMonitor; ++static std::unique_ptr ++ cpu1MemEFGHVRHotMonitor; ++static std::unique_ptr ++ cpu2VRHotMonitor; ++static std::unique_ptr ++ cpu2MemABCDVRHotMonitor; ++static std::unique_ptr ++ cpu2MemEFGHVRHotMonitor; ++static std::unique_ptr< ++ host_error_monitor::pch_thermtrip_monitor::PCHThermtripMonitor> ++ pchThermtripMonitor; + + // Check if all the signal monitors started successfully + bool checkMonitors() + { + bool ret = true; + +- // ret &= smiMonitor->isValid(); ++ ret &= smiMonitor->isValid(); ++ ret &= cpu1MismatchMonitor->isValid(); ++ ret &= cpu2MismatchMonitor->isValid(); ++ ret &= err0Monitor->isValid(); ++ ret &= err1Monitor->isValid(); ++ ret &= err2Monitor->isValid(); ++ ret &= ierrMonitor->isValid(); ++ ret &= cpu1ThermtripMonitor->isValid(); ++ ret &= cpu2ThermtripMonitor->isValid(); ++ ret &= mem1ThermtripMonitor->isValid(); ++ ret &= mem2ThermtripMonitor->isValid(); ++ ret &= cpu1VRHotMonitor->isValid(); ++ ret &= cpu1MemABCDVRHotMonitor->isValid(); ++ ret &= cpu1MemEFGHVRHotMonitor->isValid(); ++ ret &= cpu2VRHotMonitor->isValid(); ++ ret &= cpu2MemABCDVRHotMonitor->isValid(); ++ ret &= cpu2MemEFGHVRHotMonitor->isValid(); ++ ret &= pchThermtripMonitor->isValid(); + + return ret; + } +@@ -39,9 +104,59 @@ bool checkMonitors() + bool startMonitors(boost::asio::io_service& io, + std::shared_ptr conn) + { +- // smiMonitor = +- // std::make_unique( +- // io, conn, "SMI"); ++ smiMonitor = std::make_unique( ++ io, conn, "SMI"); ++ cpu1MismatchMonitor = std::make_unique< ++ host_error_monitor::cpu_mismatch_monitor::CPUMismatchMonitor>( ++ io, conn, "CPU1_MISMATCH", 1); ++ cpu2MismatchMonitor = std::make_unique< ++ host_error_monitor::cpu_mismatch_monitor::CPUMismatchMonitor>( ++ io, conn, "CPU2_MISMATCH", 2); ++ err0Monitor = ++ std::make_unique( ++ io, conn, "CPU_ERR0", 0); ++ err1Monitor = ++ std::make_unique( ++ io, conn, "CPU_ERR1", 1); ++ err2Monitor = ++ std::make_unique( ++ io, conn, "CPU_ERR2"); ++ ierrMonitor = ++ std::make_unique( ++ io, conn, "CPU_CATERR"); ++ cpu1ThermtripMonitor = std::make_unique< ++ host_error_monitor::cpu_thermtrip_monitor::CPUThermtripMonitor>( ++ io, conn, "CPU1_THERMTRIP", 1, "CPU1_FIVR_FAULT"); ++ cpu2ThermtripMonitor = std::make_unique< ++ host_error_monitor::cpu_thermtrip_monitor::CPUThermtripMonitor>( ++ io, conn, "CPU2_THERMTRIP", 2, "CPU2_FIVR_FAULT"); ++ mem1ThermtripMonitor = std::make_unique< ++ host_error_monitor::mem_thermtrip_monitor::MemThermtripMonitor>( ++ io, conn, "CPU1_MEM_THERM_EVENT", 1); ++ mem2ThermtripMonitor = std::make_unique< ++ host_error_monitor::mem_thermtrip_monitor::MemThermtripMonitor>( ++ io, conn, "CPU2_MEM_THERM_EVENT", 2); ++ cpu1VRHotMonitor = ++ std::make_unique( ++ io, conn, "CPU1_VRHOT", "CPU 1"); ++ cpu1MemABCDVRHotMonitor = ++ std::make_unique( ++ io, conn, "CPU1_MEM_ABCD_VRHOT", "CPU 1 Memory ABCD"); ++ cpu1MemEFGHVRHotMonitor = ++ std::make_unique( ++ io, conn, "CPU1_MEM_EFGH_VRHOT", "CPU 1 Memory EFGH"); ++ cpu2VRHotMonitor = ++ std::make_unique( ++ io, conn, "CPU2_VRHOT", "CPU 2"); ++ cpu2MemABCDVRHotMonitor = ++ std::make_unique( ++ io, conn, "CPU2_MEM_ABCD_VRHOT", "CPU 2 Memory ABCD"); ++ cpu2MemEFGHVRHotMonitor = ++ std::make_unique( ++ io, conn, "CPU2_MEM_EFGH_VRHOT", "CPU 2 Memory EFGH"); ++ pchThermtripMonitor = std::make_unique< ++ host_error_monitor::pch_thermtrip_monitor::PCHThermtripMonitor>( ++ io, conn, "PCH_BMC_THERMTRIP"); + + return checkMonitors(); + } +@@ -49,7 +164,13 @@ bool startMonitors(boost::asio::io_service& io, + // Notify the signal monitors of host on event + void sendHostOn() + { +- // smiMonitor->hostOn(); ++ smiMonitor->hostOn(); ++ cpu1MismatchMonitor->hostOn(); ++ cpu2MismatchMonitor->hostOn(); ++ err0Monitor->hostOn(); ++ err1Monitor->hostOn(); ++ err2Monitor->hostOn(); ++ ierrMonitor->hostOn(); + } + + } // namespace host_error_monitor::error_monitors +-- +2.17.1 + diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch deleted file mode 100644 index 140724ca9..000000000 --- a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch +++ /dev/null @@ -1,169 +0,0 @@ -From c09e608da2f63eed5b73891d5c032b646d8e81eb Mon Sep 17 00:00:00 2001 -From: "Jason M. Bills" -Date: Wed, 22 Jul 2020 14:30:04 -0700 -Subject: [PATCH 1/2] Filter memory thermtrip events based on DIMM status - -There is a race-condition on shutdown that makes it difficult to -differentiate between a normal shutdown and a memory thermtrip -shutdown. This race-condition will be resolved in the CPLD for -future platforms but for now it requires a workaround. - -This workaround assumes that a memory thermtrip can only occur -if a DIMM temperature sensor has already reached a critical -threshold. When memory thermtrip asserts on shutdown, it only -logs an error if a DIMM is critical; otherwise it is treated -as a normal shutdown. - -Tested: -Memory thermtrip errors no longer log on each power-off. -Manually set a DIMM temperature above critical and verified -that the memory thermtrip event is logged. - -Change-Id: I9c38b41db30046499297ee24cc3a2790920b19d3 -Signed-off-by: Jason M. Bills ---- - src/host_error_monitor.cpp | 81 ++++++++++++++++++++++++++++++++++++-- - 1 file changed, 77 insertions(+), 4 deletions(-) - -diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp -index d52a5dc6a..77d065fa3 100644 ---- a/src/host_error_monitor.cpp -+++ b/src/host_error_monitor.cpp -@@ -19,6 +19,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -38,6 +39,9 @@ static std::shared_ptr associationCATAssert; - - static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager"; - -+static boost::container::flat_set cpu1CriticalDIMMs; -+static boost::container::flat_set cpu2CriticalDIMMs; -+ - static bool hostOff = true; - - static size_t caterrTimeoutMs = 2000; -@@ -274,6 +278,67 @@ static void initializeHostState() - "xyz.openbmc_project.State.Host", "CurrentHostState"); - } - -+static std::shared_ptr -+ startDIMMThresholdEventMonitor() -+{ -+ return std::make_shared( -+ *conn, -+ "type='signal',interface='org.freedesktop.DBus.Properties',member='" -+ "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor." -+ "Threshold.Critical'", -+ [](sdbusplus::message::message& msg) { -+ std::string interfaceName; -+ boost::container::flat_map> -+ propertiesChanged; -+ try -+ { -+ msg.read(interfaceName, propertiesChanged); -+ } -+ catch (std::exception& e) -+ { -+ std::cerr << "Unable to read threshold event\n"; -+ return; -+ } -+ // We only want to check for CriticalAlarmHigh -+ if (propertiesChanged.begin()->first != "CriticalAlarmHigh") -+ { -+ return; -+ } -+ const bool* alarm = -+ std::get_if(&(propertiesChanged.begin()->second)); -+ if (alarm == nullptr) -+ { -+ std::cerr << propertiesChanged.begin()->first -+ << " property invalid\n"; -+ return; -+ } -+ -+ // Get the sensor path and check if it's a DIMM sensor -+ std::string sensor = msg.get_path(); -+ if (sensor.find("DIMM") == std::string::npos) -+ { -+ // Not a DIMM sensor -+ return; -+ } -+ -+ // Determine which CPU the DIMM belongs to -+ boost::container::flat_set& criticalDIMMs = -+ (sensor.find("CPU1") != std::string::npos) ? cpu1CriticalDIMMs -+ : cpu2CriticalDIMMs; -+ -+ if (*alarm) -+ { -+ // DIMM crossed a critical threshold, so store it -+ criticalDIMMs.insert(sensor); -+ } -+ else -+ { -+ // DIMM is no longer critical, so remove it -+ criticalDIMMs.erase(sensor); -+ } -+ }); -+} -+ - static std::shared_ptr startHostStateMonitor() - { - return std::make_shared( -@@ -851,7 +916,9 @@ static void cpu1MemtripHandler() - - bool cpu1Memtrip = - gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; -- if (cpu1Memtrip) -+ -+ // Only log a memory thermtrip if a DIMM is critical -+ if (cpu1Memtrip && !cpu1CriticalDIMMs.empty()) - { - memThermTripLog(1); - } -@@ -911,7 +978,9 @@ static void cpu2MemtripHandler() - - bool cpu2Memtrip = - gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; -- if (cpu2Memtrip) -+ -+ // Only log a memory thermtrip if a DIMM is critical -+ if (cpu2Memtrip && !cpu2CriticalDIMMs.empty()) - { - memThermTripLog(2); - } -@@ -1521,13 +1590,13 @@ static void initializeErrorState() - } - - // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now -- if (cpu1MemtripLine.get_value() == 0) -+ if ((cpu1MemtripLine.get_value() == 0) && !cpu1CriticalDIMMs.empty()) - { - memThermTripLog(1); - } - - // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now -- if (cpu2MemtripLine.get_value() == 0) -+ if ((cpu2MemtripLine.get_value() == 0) && !cpu2CriticalDIMMs.empty()) - { - memThermTripLog(2); - } -@@ -1639,6 +1708,10 @@ int main(int argc, char* argv[]) - std::shared_ptr hostStateMonitor = - host_error_monitor::startHostStateMonitor(); - -+ // Start tracking critical DIMM status -+ std::shared_ptr dimmThresholdEventMonitor = -+ host_error_monitor::startDIMMThresholdEventMonitor(); -+ - // Request CPU1_MISMATCH GPIO events - if (!host_error_monitor::requestGPIOInput( - "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine)) --- -2.17.1 - diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch deleted file mode 100644 index 1f1efea69..000000000 --- a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch +++ /dev/null @@ -1,133 +0,0 @@ -From d7909c8924cf3619bffd52e5f352f175c1cf5033 Mon Sep 17 00:00:00 2001 -From: "Jason M. Bills" -Date: Mon, 17 Aug 2020 15:52:22 -0700 -Subject: [PATCH 2/2] Add a workaround for spurious CPU errors - -There is a possible issue where GPIO event interrupts are getting -missed causing false errors to be logged. - -This adds a check that the host is still on and the error is still -asserted before logging an error. - -Tested: -Confirmed that a spurious SMI event was ignored correctly after -this change. - -Change-Id: Id83d9d67b15dcf9035e6448086b140e5c7dab4fe -Signed-off-by: Jason M. Bills ---- - src/host_error_monitor.cpp | 77 ++++++++++++++++++++++++++++++++++++++ - 1 file changed, 77 insertions(+) - -diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp -index 77d065fa3..d026ab90d 100644 ---- a/src/host_error_monitor.cpp -+++ b/src/host_error_monitor.cpp -@@ -806,6 +806,18 @@ static void caterrAssertHandler() - } - return; - } -+ // Confirm that this is a real failure by checking that the host is on -+ if (hostOff) -+ { -+ return; -+ } -+ // And that the signal is still asserted -+ if (caterrLine.get_value() != 0) -+ { -+ std::cerr -+ << "CPU_CATERR not asserted after timeout. Error ignored.\n"; -+ return; -+ } - std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs) - << " ms\n"; - beep(beepCPUIERR); -@@ -1288,6 +1300,48 @@ static void errXAssertHandler(const int errPin, - } - return; - } -+ // Confirm that this is a real failure by checking that the host is on -+ if (hostOff) -+ { -+ return; -+ } -+ // And that the signal is still asserted -+ switch (errPin) -+ { -+ case 0: -+ { -+ if (err0Line.get_value() != 0) -+ { -+ std::cerr << "CPU_ERR0 not asserted after timeout. Error " -+ "ignored.\n"; -+ return; -+ } -+ break; -+ } -+ case 1: -+ { -+ if (err1Line.get_value() != 0) -+ { -+ std::cerr << "CPU_ERR1 not asserted after timeout. Error " -+ "ignored.\n"; -+ return; -+ } -+ break; -+ } -+ case 2: -+ { -+ if (err2Line.get_value() != 0) -+ { -+ std::cerr << "CPU_ERR2 not asserted after timeout. Error " -+ "ignored.\n"; -+ return; -+ } -+ break; -+ } -+ default: -+ std::cerr << "Invalid ERR pin asserted\n"; -+ return; -+ } - std::cerr << "ERR" << std::to_string(errPin) << " asserted for " - << std::to_string(errTimeoutMs) << " ms\n"; - if (errPinCPUs.count()) -@@ -1397,6 +1451,18 @@ static void err2AssertHandler() - } - return; - } -+ // Confirm that this is a real failure by checking that the host is on -+ if (hostOff) -+ { -+ return; -+ } -+ // And that the signal is still asserted -+ if (err2Line.get_value() != 0) -+ { -+ std::cerr -+ << "CPU_ERR2 not asserted after timeout. Error ignored.\n"; -+ return; -+ } - conn->async_method_call( - [](boost::system::error_code ec, - const std::variant& property) { -@@ -1465,6 +1531,17 @@ static void smiAssertHandler() - } - return; - } -+ // Confirm that this is a real failure by checking that the host is on -+ if (hostOff) -+ { -+ return; -+ } -+ // And that the signal is still asserted -+ if (smiLine.get_value() != 0) -+ { -+ std::cerr << "SMI not asserted after timeout. Error ignored.\n"; -+ return; -+ } - std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs) - << " ms\n"; - smiTimeoutLog(); --- -2.17.1 - diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Filter-memory-thermtrip-events-based-on-DIMM-status.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Filter-memory-thermtrip-events-based-on-DIMM-status.patch new file mode 100644 index 000000000..cf74a4925 --- /dev/null +++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Filter-memory-thermtrip-events-based-on-DIMM-status.patch @@ -0,0 +1,143 @@ +From b8b701fde79e7a8ca7bf5aa6ca6832524c011fa5 Mon Sep 17 00:00:00 2001 +From: "Jason M. Bills" +Date: Tue, 15 Dec 2020 16:09:00 -0800 +Subject: [PATCH] Filter memory thermtrip events based on DIMM status + +There is a race-condition on shutdown that makes it difficult to +differentiate between a normal shutdown and a memory thermtrip +shutdown. This race-condition will be resolved in the CPLD for +future platforms but for now it requires a workaround. + +This workaround assumes that a memory thermtrip can only occur +if a DIMM temperature sensor has already reached a critical +threshold. When memory thermtrip asserts on shutdown, it only +logs an error if a DIMM is critical; otherwise it is treated +as a normal shutdown. + +Tested: +Memory thermtrip errors no longer log on each power-off. +Manually set a DIMM temperature above critical and verified +that the memory thermtrip event is logged. + +Change-Id: I9d8cf9b1de688e27babb8004b41f662242c78b3c +Signed-off-by: Jason M. Bills +--- + .../error_monitors/mem_thermtrip_monitor.hpp | 81 +++++++++++++++++++ + 1 file changed, 81 insertions(+) + +diff --git a/include/error_monitors/mem_thermtrip_monitor.hpp b/include/error_monitors/mem_thermtrip_monitor.hpp +index d3dff1d3b..0a3f2fc22 100644 +--- a/include/error_monitors/mem_thermtrip_monitor.hpp ++++ b/include/error_monitors/mem_thermtrip_monitor.hpp +@@ -14,6 +14,7 @@ + // limitations under the License. + */ + #pragma once ++#include + #include + #include + #include +@@ -28,6 +29,72 @@ class MemThermtripMonitor : + host_error_monitor::base_gpio_monitor::AssertValue::lowAssert; + size_t cpuNum; + ++ std::shared_ptr dimmThresholdEventMonitor; ++ boost::container::flat_set criticalDIMMs; ++ ++ std::shared_ptr ++ startDIMMThresholdEventMonitor() ++ { ++ return std::make_shared( ++ *conn, ++ "type='signal',interface='org.freedesktop.DBus.Properties',member='" ++ "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor." ++ "Threshold.Critical'", ++ [this](sdbusplus::message::message& msg) { ++ std::string interfaceName; ++ boost::container::flat_map> ++ propertiesChanged; ++ try ++ { ++ msg.read(interfaceName, propertiesChanged); ++ } ++ catch (std::exception& e) ++ { ++ std::cerr << "Unable to read threshold event\n"; ++ return; ++ } ++ // We only want to check for CriticalAlarmHigh ++ if (propertiesChanged.begin()->first != "CriticalAlarmHigh") ++ { ++ return; ++ } ++ const bool* alarm = ++ std::get_if(&(propertiesChanged.begin()->second)); ++ if (alarm == nullptr) ++ { ++ std::cerr << propertiesChanged.begin()->first ++ << " property invalid\n"; ++ return; ++ } ++ ++ // Get the sensor path and check if it's a DIMM sensor ++ std::string sensor = msg.get_path(); ++ if (sensor.find("DIMM") == std::string::npos) ++ { ++ // Not a DIMM sensor ++ return; ++ } ++ ++ // Check if the DIMM belongs to this CPU ++ if (sensor.find("CPU" + std::to_string(cpuNum)) == ++ std::string::npos) ++ { ++ return; ++ } ++ ++ if (*alarm) ++ { ++ // DIMM crossed a critical threshold, so store it ++ criticalDIMMs.insert(sensor); ++ } ++ else ++ { ++ // DIMM is no longer critical, so remove it ++ criticalDIMMs.erase(sensor); ++ } ++ }); ++ } ++ + void logEvent() override + { + std::string cpuNumber = "CPU " + std::to_string(cpuNum); +@@ -39,6 +106,17 @@ class MemThermtripMonitor : + "REDFISH_MESSAGE_ARGS=%s", cpuNumber.c_str(), NULL); + } + ++ void assertHandler() override ++ { ++ // Only log a memory thermtrip if a DIMM is critical ++ if (criticalDIMMs.empty()) ++ { ++ return; ++ } ++ ++ host_error_monitor::base_gpio_monitor::BaseGPIOMonitor::assertHandler(); ++ } ++ + public: + MemThermtripMonitor(boost::asio::io_service& io, + std::shared_ptr conn, +@@ -46,6 +124,9 @@ class MemThermtripMonitor : + BaseGPIOMonitor(io, conn, signalName, assertValue), + cpuNum(cpuNum) + { ++ // Start tracking critical DIMM status ++ dimmThresholdEventMonitor = startDIMMThresholdEventMonitor(); ++ + if (valid) + { + startMonitoring(); +-- +2.17.1 + diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend index 0d1fd91d2..638d833a8 100644 --- a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend +++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend @@ -1,6 +1,6 @@ FILESEXTRAPATHS_append := "${THISDIR}/${PN}:" SRC_URI += " \ - file://0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch \ - file://0002-Add-a-workaround-for-spurious-CPU-errors.patch \ - " + file://0001-Configure-host-error-monitors-for-meta-wht.patch \ + file://0002-Filter-memory-thermtrip-events-based-on-DIMM-status.patch \ + " diff --git a/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci/99-peci.rules b/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci/99-peci.rules new file mode 100644 index 000000000..b587a3f57 --- /dev/null +++ b/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci/99-peci.rules @@ -0,0 +1,2 @@ +ACTION=="add", SUBSYSTEM=="peci_dev", ATTRS{name}=="*.peci-bus", SYMLINK+="peci-wire" TAG+="peci-wire" +ACTION=="add", SUBSYSTEM=="peci_dev", TAG=="peci-wire", SYMLINK+="peci-default" diff --git a/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci_%.bbappend b/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci_%.bbappend new file mode 100644 index 000000000..575cfea24 --- /dev/null +++ b/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci_%.bbappend @@ -0,0 +1,8 @@ +FILESEXTRAPATHS_append := ":${THISDIR}/${PN}" + +SRC_URI += "file://99-peci.rules" + +do_install_append() { + install -d ${D}/lib/udev/rules.d + install -m 0644 ${WORKDIR}/99-peci.rules ${D}/lib/udev/rules.d +} -- cgit v1.2.3