From f99301c1a626951ee7feee081a1494e795d0e243 Mon Sep 17 00:00:00 2001 From: "Jason M. Bills" Date: Mon, 31 Aug 2020 13:56:28 -0700 Subject: Update to internal 0.74 Signed-off-by: Jason M. Bills --- ...ory-thermtrip-events-based-on-DIMM-status.patch | 153 +++++++++++++++++++++ .../host-error-monitor_%.bbappend | 5 + 2 files changed, 158 insertions(+) create mode 100644 meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch create mode 100644 meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend (limited to 'meta-openbmc-mods/meta-wht') diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch new file mode 100644 index 000000000..30859d1a4 --- /dev/null +++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch @@ -0,0 +1,153 @@ +From 0253fd1d68d6a42c95c425b1a61fa2d53b2b2469 Mon Sep 17 00:00:00 2001 +From: "Jason M. Bills" +Date: Wed, 22 Jul 2020 14:30:04 -0700 +Subject: [PATCH] Filter memory thermtrip events based on DIMM status + +There is a race-condition on shutdown that makes it difficult to +differentiate between a normal shutdown and a memory thermtrip +shutdown. This race-condition will be resolved in the CPLD for +future platforms but for now it requires a workaround. + +This workaround assumes that a memory thermtrip can only occur +if a DIMM temperature sensor has already reached a critical +threshold. When memory thermtrip asserts on shutdown, it only +logs an error if a DIMM is critical; otherwise it is treated +as a normal shutdown. + +Tested: +Memory thermtrip errors no longer log on each power-off. +Manually set a DIMM temperature above critical and verified +that the memory thermtrip event is logged. + +Change-Id: I9c38b41db30046499297ee24cc3a2790920b19d3 +Signed-off-by: Jason M. Bills +--- + src/host_error_monitor.cpp | 77 +++++++++++++++++++++++++++++++++++++- + 1 file changed, 75 insertions(+), 2 deletions(-) + +diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp +index 1c6a2e70d..aa4a9b672 100644 +--- a/src/host_error_monitor.cpp ++++ b/src/host_error_monitor.cpp +@@ -17,6 +17,7 @@ + #include + + #include ++#include + #include + #include + +@@ -36,6 +37,9 @@ static std::shared_ptr associationCATAssert; + + static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager"; + ++static boost::container::flat_set cpu1CriticalDIMMs; ++static boost::container::flat_set cpu2CriticalDIMMs; ++ + static bool hostOff = true; + + static size_t caterrTimeoutMs = 2000; +@@ -258,6 +262,67 @@ static void initializeHostState() + "xyz.openbmc_project.State.Host", "CurrentHostState"); + } + ++static std::shared_ptr ++ startDIMMThresholdEventMonitor() ++{ ++ return std::make_shared( ++ *conn, ++ "type='signal',interface='org.freedesktop.DBus.Properties',member='" ++ "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor." ++ "Threshold.Critical'", ++ [](sdbusplus::message::message& msg) { ++ std::string interfaceName; ++ boost::container::flat_map> ++ propertiesChanged; ++ try ++ { ++ msg.read(interfaceName, propertiesChanged); ++ } ++ catch (std::exception& e) ++ { ++ std::cerr << "Unable to read threshold event\n"; ++ return; ++ } ++ // We only want to check for CriticalAlarmHigh ++ if (propertiesChanged.begin()->first != "CriticalAlarmHigh") ++ { ++ return; ++ } ++ const bool* alarm = ++ std::get_if(&(propertiesChanged.begin()->second)); ++ if (alarm == nullptr) ++ { ++ std::cerr << propertiesChanged.begin()->first ++ << " property invalid\n"; ++ return; ++ } ++ ++ // Get the sensor path and check if it's a DIMM sensor ++ std::string sensor = msg.get_path(); ++ if (sensor.find("DIMM") == std::string::npos) ++ { ++ // Not a DIMM sensor ++ return; ++ } ++ ++ // Determine which CPU the DIMM belongs to ++ boost::container::flat_set& criticalDIMMs = ++ (sensor.find("CPU1") != std::string::npos) ? cpu1CriticalDIMMs ++ : cpu2CriticalDIMMs; ++ ++ if (*alarm) ++ { ++ // DIMM crossed a critical threshold, so store it ++ criticalDIMMs.insert(sensor); ++ } ++ else ++ { ++ // DIMM is no longer critical, so remove it ++ criticalDIMMs.erase(sensor); ++ } ++ }); ++} ++ + static std::shared_ptr startHostStateMonitor() + { + return std::make_shared( +@@ -826,7 +891,9 @@ static void cpu1MemtripHandler() + + bool cpu1Memtrip = + gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; +- if (cpu1Memtrip) ++ ++ // Only log a memory thermtrip if a DIMM is critical ++ if (cpu1Memtrip && !cpu1CriticalDIMMs.empty()) + { + memThermTripLog(1); + } +@@ -886,7 +953,9 @@ static void cpu2MemtripHandler() + + bool cpu2Memtrip = + gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; +- if (cpu2Memtrip) ++ ++ // Only log a memory thermtrip if a DIMM is critical ++ if (cpu2Memtrip && !cpu2CriticalDIMMs.empty()) + { + memThermTripLog(2); + } +@@ -1605,6 +1674,10 @@ int main(int argc, char* argv[]) + std::shared_ptr hostStateMonitor = + host_error_monitor::startHostStateMonitor(); + ++ // Start tracking critical DIMM status ++ std::shared_ptr dimmThresholdEventMonitor = ++ host_error_monitor::startDIMMThresholdEventMonitor(); ++ + // Request CPU1_MISMATCH GPIO events + if (!host_error_monitor::requestGPIOInput( + "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine)) +-- +2.17.1 + diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend new file mode 100644 index 000000000..4b79757c0 --- /dev/null +++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend @@ -0,0 +1,5 @@ +FILESEXTRAPATHS_append := "${THISDIR}/${PN}:" + +SRC_URI += " \ + file://0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch \ + " -- cgit v1.2.3