diff options
Diffstat (limited to 'meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch')
-rw-r--r-- | meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch | 169 |
1 files changed, 0 insertions, 169 deletions
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch deleted file mode 100644 index 140724ca9..000000000 --- a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch +++ /dev/null @@ -1,169 +0,0 @@ -From c09e608da2f63eed5b73891d5c032b646d8e81eb Mon Sep 17 00:00:00 2001 -From: "Jason M. Bills" <jason.m.bills@intel.com> -Date: Wed, 22 Jul 2020 14:30:04 -0700 -Subject: [PATCH 1/2] Filter memory thermtrip events based on DIMM status - -There is a race-condition on shutdown that makes it difficult to -differentiate between a normal shutdown and a memory thermtrip -shutdown. This race-condition will be resolved in the CPLD for -future platforms but for now it requires a workaround. - -This workaround assumes that a memory thermtrip can only occur -if a DIMM temperature sensor has already reached a critical -threshold. When memory thermtrip asserts on shutdown, it only -logs an error if a DIMM is critical; otherwise it is treated -as a normal shutdown. - -Tested: -Memory thermtrip errors no longer log on each power-off. -Manually set a DIMM temperature above critical and verified -that the memory thermtrip event is logged. - -Change-Id: I9c38b41db30046499297ee24cc3a2790920b19d3 -Signed-off-by: Jason M. Bills <jason.m.bills@intel.com> ---- - src/host_error_monitor.cpp | 81 ++++++++++++++++++++++++++++++++++++-- - 1 file changed, 77 insertions(+), 4 deletions(-) - -diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp -index d52a5dc6a..77d065fa3 100644 ---- a/src/host_error_monitor.cpp -+++ b/src/host_error_monitor.cpp -@@ -19,6 +19,7 @@ - #include <boost/asio/io_service.hpp> - #include <boost/asio/posix/stream_descriptor.hpp> - #include <boost/asio/steady_timer.hpp> -+#include <boost/container/flat_set.hpp> - #include <gpiod.hpp> - #include <sdbusplus/asio/object_server.hpp> - -@@ -38,6 +39,9 @@ static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert; - - static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager"; - -+static boost::container::flat_set<std::string> cpu1CriticalDIMMs; -+static boost::container::flat_set<std::string> cpu2CriticalDIMMs; -+ - static bool hostOff = true; - - static size_t caterrTimeoutMs = 2000; -@@ -274,6 +278,67 @@ static void initializeHostState() - "xyz.openbmc_project.State.Host", "CurrentHostState"); - } - -+static std::shared_ptr<sdbusplus::bus::match::match> -+ startDIMMThresholdEventMonitor() -+{ -+ return std::make_shared<sdbusplus::bus::match::match>( -+ *conn, -+ "type='signal',interface='org.freedesktop.DBus.Properties',member='" -+ "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor." -+ "Threshold.Critical'", -+ [](sdbusplus::message::message& msg) { -+ std::string interfaceName; -+ boost::container::flat_map<std::string, std::variant<bool>> -+ propertiesChanged; -+ try -+ { -+ msg.read(interfaceName, propertiesChanged); -+ } -+ catch (std::exception& e) -+ { -+ std::cerr << "Unable to read threshold event\n"; -+ return; -+ } -+ // We only want to check for CriticalAlarmHigh -+ if (propertiesChanged.begin()->first != "CriticalAlarmHigh") -+ { -+ return; -+ } -+ const bool* alarm = -+ std::get_if<bool>(&(propertiesChanged.begin()->second)); -+ if (alarm == nullptr) -+ { -+ std::cerr << propertiesChanged.begin()->first -+ << " property invalid\n"; -+ return; -+ } -+ -+ // Get the sensor path and check if it's a DIMM sensor -+ std::string sensor = msg.get_path(); -+ if (sensor.find("DIMM") == std::string::npos) -+ { -+ // Not a DIMM sensor -+ return; -+ } -+ -+ // Determine which CPU the DIMM belongs to -+ boost::container::flat_set<std::string>& criticalDIMMs = -+ (sensor.find("CPU1") != std::string::npos) ? cpu1CriticalDIMMs -+ : cpu2CriticalDIMMs; -+ -+ if (*alarm) -+ { -+ // DIMM crossed a critical threshold, so store it -+ criticalDIMMs.insert(sensor); -+ } -+ else -+ { -+ // DIMM is no longer critical, so remove it -+ criticalDIMMs.erase(sensor); -+ } -+ }); -+} -+ - static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor() - { - return std::make_shared<sdbusplus::bus::match::match>( -@@ -851,7 +916,9 @@ static void cpu1MemtripHandler() - - bool cpu1Memtrip = - gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; -- if (cpu1Memtrip) -+ -+ // Only log a memory thermtrip if a DIMM is critical -+ if (cpu1Memtrip && !cpu1CriticalDIMMs.empty()) - { - memThermTripLog(1); - } -@@ -911,7 +978,9 @@ static void cpu2MemtripHandler() - - bool cpu2Memtrip = - gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; -- if (cpu2Memtrip) -+ -+ // Only log a memory thermtrip if a DIMM is critical -+ if (cpu2Memtrip && !cpu2CriticalDIMMs.empty()) - { - memThermTripLog(2); - } -@@ -1521,13 +1590,13 @@ static void initializeErrorState() - } - - // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now -- if (cpu1MemtripLine.get_value() == 0) -+ if ((cpu1MemtripLine.get_value() == 0) && !cpu1CriticalDIMMs.empty()) - { - memThermTripLog(1); - } - - // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now -- if (cpu2MemtripLine.get_value() == 0) -+ if ((cpu2MemtripLine.get_value() == 0) && !cpu2CriticalDIMMs.empty()) - { - memThermTripLog(2); - } -@@ -1639,6 +1708,10 @@ int main(int argc, char* argv[]) - std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor = - host_error_monitor::startHostStateMonitor(); - -+ // Start tracking critical DIMM status -+ std::shared_ptr<sdbusplus::bus::match::match> dimmThresholdEventMonitor = -+ host_error_monitor::startDIMMThresholdEventMonitor(); -+ - // Request CPU1_MISMATCH GPIO events - if (!host_error_monitor::requestGPIOInput( - "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine)) --- -2.17.1 - |