summaryrefslogtreecommitdiff
path: root/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch')
-rw-r--r--meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch153
1 files changed, 153 insertions, 0 deletions
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
new file mode 100644
index 000000000..30859d1a4
--- /dev/null
+++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
@@ -0,0 +1,153 @@
+From 0253fd1d68d6a42c95c425b1a61fa2d53b2b2469 Mon Sep 17 00:00:00 2001
+From: "Jason M. Bills" <jason.m.bills@intel.com>
+Date: Wed, 22 Jul 2020 14:30:04 -0700
+Subject: [PATCH] Filter memory thermtrip events based on DIMM status
+
+There is a race-condition on shutdown that makes it difficult to
+differentiate between a normal shutdown and a memory thermtrip
+shutdown. This race-condition will be resolved in the CPLD for
+future platforms but for now it requires a workaround.
+
+This workaround assumes that a memory thermtrip can only occur
+if a DIMM temperature sensor has already reached a critical
+threshold. When memory thermtrip asserts on shutdown, it only
+logs an error if a DIMM is critical; otherwise it is treated
+as a normal shutdown.
+
+Tested:
+Memory thermtrip errors no longer log on each power-off.
+Manually set a DIMM temperature above critical and verified
+that the memory thermtrip event is logged.
+
+Change-Id: I9c38b41db30046499297ee24cc3a2790920b19d3
+Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
+---
+ src/host_error_monitor.cpp | 77 +++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 75 insertions(+), 2 deletions(-)
+
+diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
+index 1c6a2e70d..aa4a9b672 100644
+--- a/src/host_error_monitor.cpp
++++ b/src/host_error_monitor.cpp
+@@ -17,6 +17,7 @@
+ #include <systemd/sd-journal.h>
+
+ #include <boost/asio/posix/stream_descriptor.hpp>
++#include <boost/container/flat_set.hpp>
+ #include <gpiod.hpp>
+ #include <sdbusplus/asio/object_server.hpp>
+
+@@ -36,6 +37,9 @@ static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
+
+ static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
+
++static boost::container::flat_set<std::string> cpu1CriticalDIMMs;
++static boost::container::flat_set<std::string> cpu2CriticalDIMMs;
++
+ static bool hostOff = true;
+
+ static size_t caterrTimeoutMs = 2000;
+@@ -258,6 +262,67 @@ static void initializeHostState()
+ "xyz.openbmc_project.State.Host", "CurrentHostState");
+ }
+
++static std::shared_ptr<sdbusplus::bus::match::match>
++ startDIMMThresholdEventMonitor()
++{
++ return std::make_shared<sdbusplus::bus::match::match>(
++ *conn,
++ "type='signal',interface='org.freedesktop.DBus.Properties',member='"
++ "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor."
++ "Threshold.Critical'",
++ [](sdbusplus::message::message& msg) {
++ std::string interfaceName;
++ boost::container::flat_map<std::string, std::variant<bool>>
++ propertiesChanged;
++ try
++ {
++ msg.read(interfaceName, propertiesChanged);
++ }
++ catch (std::exception& e)
++ {
++ std::cerr << "Unable to read threshold event\n";
++ return;
++ }
++ // We only want to check for CriticalAlarmHigh
++ if (propertiesChanged.begin()->first != "CriticalAlarmHigh")
++ {
++ return;
++ }
++ const bool* alarm =
++ std::get_if<bool>(&(propertiesChanged.begin()->second));
++ if (alarm == nullptr)
++ {
++ std::cerr << propertiesChanged.begin()->first
++ << " property invalid\n";
++ return;
++ }
++
++ // Get the sensor path and check if it's a DIMM sensor
++ std::string sensor = msg.get_path();
++ if (sensor.find("DIMM") == std::string::npos)
++ {
++ // Not a DIMM sensor
++ return;
++ }
++
++ // Determine which CPU the DIMM belongs to
++ boost::container::flat_set<std::string>& criticalDIMMs =
++ (sensor.find("CPU1") != std::string::npos) ? cpu1CriticalDIMMs
++ : cpu2CriticalDIMMs;
++
++ if (*alarm)
++ {
++ // DIMM crossed a critical threshold, so store it
++ criticalDIMMs.insert(sensor);
++ }
++ else
++ {
++ // DIMM is no longer critical, so remove it
++ criticalDIMMs.erase(sensor);
++ }
++ });
++}
++
+ static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
+ {
+ return std::make_shared<sdbusplus::bus::match::match>(
+@@ -826,7 +891,9 @@ static void cpu1MemtripHandler()
+
+ bool cpu1Memtrip =
+ gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
+- if (cpu1Memtrip)
++
++ // Only log a memory thermtrip if a DIMM is critical
++ if (cpu1Memtrip && !cpu1CriticalDIMMs.empty())
+ {
+ memThermTripLog(1);
+ }
+@@ -886,7 +953,9 @@ static void cpu2MemtripHandler()
+
+ bool cpu2Memtrip =
+ gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
+- if (cpu2Memtrip)
++
++ // Only log a memory thermtrip if a DIMM is critical
++ if (cpu2Memtrip && !cpu2CriticalDIMMs.empty())
+ {
+ memThermTripLog(2);
+ }
+@@ -1605,6 +1674,10 @@ int main(int argc, char* argv[])
+ std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
+ host_error_monitor::startHostStateMonitor();
+
++ // Start tracking critical DIMM status
++ std::shared_ptr<sdbusplus::bus::match::match> dimmThresholdEventMonitor =
++ host_error_monitor::startDIMMThresholdEventMonitor();
++
+ // Request CPU1_MISMATCH GPIO events
+ if (!host_error_monitor::requestGPIOInput(
+ "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
+--
+2.17.1
+