diff options
author | Jason M. Bills <jason.m.bills@linux.intel.com> | 2020-08-31 23:56:28 +0300 |
---|---|---|
committer | Jason M. Bills <jason.m.bills@linux.intel.com> | 2020-09-02 00:21:46 +0300 |
commit | f99301c1a626951ee7feee081a1494e795d0e243 (patch) | |
tree | ca75379d317be9cc1757a00e0352a048b5d3200b /meta-openbmc-mods/meta-wht/recipes-core | |
parent | 40108db4434d8c2e0a1ad2d1dd3f5ae34b17352c (diff) | |
download | openbmc-f99301c1a626951ee7feee081a1494e795d0e243.tar.xz |
Update to internal 0.74
Signed-off-by: Jason M. Bills <jason.m.bills@linux.intel.com>
Diffstat (limited to 'meta-openbmc-mods/meta-wht/recipes-core')
2 files changed, 158 insertions, 0 deletions
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch new file mode 100644 index 000000000..30859d1a4 --- /dev/null +++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch @@ -0,0 +1,153 @@ +From 0253fd1d68d6a42c95c425b1a61fa2d53b2b2469 Mon Sep 17 00:00:00 2001 +From: "Jason M. Bills" <jason.m.bills@intel.com> +Date: Wed, 22 Jul 2020 14:30:04 -0700 +Subject: [PATCH] Filter memory thermtrip events based on DIMM status + +There is a race-condition on shutdown that makes it difficult to +differentiate between a normal shutdown and a memory thermtrip +shutdown. This race-condition will be resolved in the CPLD for +future platforms but for now it requires a workaround. + +This workaround assumes that a memory thermtrip can only occur +if a DIMM temperature sensor has already reached a critical +threshold. When memory thermtrip asserts on shutdown, it only +logs an error if a DIMM is critical; otherwise it is treated +as a normal shutdown. + +Tested: +Memory thermtrip errors no longer log on each power-off. +Manually set a DIMM temperature above critical and verified +that the memory thermtrip event is logged. + +Change-Id: I9c38b41db30046499297ee24cc3a2790920b19d3 +Signed-off-by: Jason M. Bills <jason.m.bills@intel.com> +--- + src/host_error_monitor.cpp | 77 +++++++++++++++++++++++++++++++++++++- + 1 file changed, 75 insertions(+), 2 deletions(-) + +diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp +index 1c6a2e70d..aa4a9b672 100644 +--- a/src/host_error_monitor.cpp ++++ b/src/host_error_monitor.cpp +@@ -17,6 +17,7 @@ + #include <systemd/sd-journal.h> + + #include <boost/asio/posix/stream_descriptor.hpp> ++#include <boost/container/flat_set.hpp> + #include <gpiod.hpp> + #include <sdbusplus/asio/object_server.hpp> + +@@ -36,6 +37,9 @@ static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert; + + static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager"; + ++static boost::container::flat_set<std::string> cpu1CriticalDIMMs; ++static boost::container::flat_set<std::string> cpu2CriticalDIMMs; ++ + static bool hostOff = true; + + static size_t caterrTimeoutMs = 2000; +@@ -258,6 +262,67 @@ static void initializeHostState() + "xyz.openbmc_project.State.Host", "CurrentHostState"); + } + ++static std::shared_ptr<sdbusplus::bus::match::match> ++ startDIMMThresholdEventMonitor() ++{ ++ return std::make_shared<sdbusplus::bus::match::match>( ++ *conn, ++ "type='signal',interface='org.freedesktop.DBus.Properties',member='" ++ "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor." ++ "Threshold.Critical'", ++ [](sdbusplus::message::message& msg) { ++ std::string interfaceName; ++ boost::container::flat_map<std::string, std::variant<bool>> ++ propertiesChanged; ++ try ++ { ++ msg.read(interfaceName, propertiesChanged); ++ } ++ catch (std::exception& e) ++ { ++ std::cerr << "Unable to read threshold event\n"; ++ return; ++ } ++ // We only want to check for CriticalAlarmHigh ++ if (propertiesChanged.begin()->first != "CriticalAlarmHigh") ++ { ++ return; ++ } ++ const bool* alarm = ++ std::get_if<bool>(&(propertiesChanged.begin()->second)); ++ if (alarm == nullptr) ++ { ++ std::cerr << propertiesChanged.begin()->first ++ << " property invalid\n"; ++ return; ++ } ++ ++ // Get the sensor path and check if it's a DIMM sensor ++ std::string sensor = msg.get_path(); ++ if (sensor.find("DIMM") == std::string::npos) ++ { ++ // Not a DIMM sensor ++ return; ++ } ++ ++ // Determine which CPU the DIMM belongs to ++ boost::container::flat_set<std::string>& criticalDIMMs = ++ (sensor.find("CPU1") != std::string::npos) ? cpu1CriticalDIMMs ++ : cpu2CriticalDIMMs; ++ ++ if (*alarm) ++ { ++ // DIMM crossed a critical threshold, so store it ++ criticalDIMMs.insert(sensor); ++ } ++ else ++ { ++ // DIMM is no longer critical, so remove it ++ criticalDIMMs.erase(sensor); ++ } ++ }); ++} ++ + static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor() + { + return std::make_shared<sdbusplus::bus::match::match>( +@@ -826,7 +891,9 @@ static void cpu1MemtripHandler() + + bool cpu1Memtrip = + gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; +- if (cpu1Memtrip) ++ ++ // Only log a memory thermtrip if a DIMM is critical ++ if (cpu1Memtrip && !cpu1CriticalDIMMs.empty()) + { + memThermTripLog(1); + } +@@ -886,7 +953,9 @@ static void cpu2MemtripHandler() + + bool cpu2Memtrip = + gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; +- if (cpu2Memtrip) ++ ++ // Only log a memory thermtrip if a DIMM is critical ++ if (cpu2Memtrip && !cpu2CriticalDIMMs.empty()) + { + memThermTripLog(2); + } +@@ -1605,6 +1674,10 @@ int main(int argc, char* argv[]) + std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor = + host_error_monitor::startHostStateMonitor(); + ++ // Start tracking critical DIMM status ++ std::shared_ptr<sdbusplus::bus::match::match> dimmThresholdEventMonitor = ++ host_error_monitor::startDIMMThresholdEventMonitor(); ++ + // Request CPU1_MISMATCH GPIO events + if (!host_error_monitor::requestGPIOInput( + "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine)) +-- +2.17.1 + diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend new file mode 100644 index 000000000..4b79757c0 --- /dev/null +++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend @@ -0,0 +1,5 @@ +FILESEXTRAPATHS_append := "${THISDIR}/${PN}:" + +SRC_URI += " \ + file://0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch \ + " |