summaryrefslogtreecommitdiff
path: root/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch')
-rw-r--r--meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch169
1 files changed, 0 insertions, 169 deletions
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
deleted file mode 100644
index 140724ca9..000000000
--- a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
+++ /dev/null
@@ -1,169 +0,0 @@
-From c09e608da2f63eed5b73891d5c032b646d8e81eb Mon Sep 17 00:00:00 2001
-From: "Jason M. Bills" <jason.m.bills@intel.com>
-Date: Wed, 22 Jul 2020 14:30:04 -0700
-Subject: [PATCH 1/2] Filter memory thermtrip events based on DIMM status
-
-There is a race condition on shutdown that makes it difficult to
-differentiate between a normal shutdown and a memory thermtrip
-shutdown. This race condition will be resolved in the CPLD for
-future platforms, but for now it requires a workaround.
-
-This workaround assumes that a memory thermtrip can only occur
-if a DIMM temperature sensor has already reached a critical
-threshold. When memory thermtrip asserts on shutdown, it only
-logs an error if a DIMM is critical; otherwise it is treated
-as a normal shutdown.
-
-Tested:
-Memory thermtrip errors no longer log on each power-off.
-Manually set a DIMM temperature above critical and verified
-that the memory thermtrip event is logged.
-
-Change-Id: I9c38b41db30046499297ee24cc3a2790920b19d3
-Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
----
- src/host_error_monitor.cpp | 81 ++++++++++++++++++++++++++++++++++++--
- 1 file changed, 77 insertions(+), 4 deletions(-)
-
-diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
-index d52a5dc6a..77d065fa3 100644
---- a/src/host_error_monitor.cpp
-+++ b/src/host_error_monitor.cpp
-@@ -19,6 +19,7 @@
- #include <boost/asio/io_service.hpp>
- #include <boost/asio/posix/stream_descriptor.hpp>
- #include <boost/asio/steady_timer.hpp>
-+#include <boost/container/flat_set.hpp>
- #include <gpiod.hpp>
- #include <sdbusplus/asio/object_server.hpp>
-
-@@ -38,6 +39,9 @@ static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
-
- static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
-
-+static boost::container::flat_set<std::string> cpu1CriticalDIMMs;
-+static boost::container::flat_set<std::string> cpu2CriticalDIMMs;
-+
- static bool hostOff = true;
-
- static size_t caterrTimeoutMs = 2000;
-@@ -274,6 +278,67 @@ static void initializeHostState()
- "xyz.openbmc_project.State.Host", "CurrentHostState");
- }
-
-+// Returns a D-Bus match that tracks which DIMM sensors have CriticalAlarmHigh
-+// asserted, keeping cpu1CriticalDIMMs/cpu2CriticalDIMMs current so memory
-+// thermtrip events can be filtered against them.
-+static std::shared_ptr<sdbusplus::bus::match::match>
-+    startDIMMThresholdEventMonitor()
-+{
-+    return std::make_shared<sdbusplus::bus::match::match>(
-+        *conn,
-+        "type='signal',interface='org.freedesktop.DBus.Properties',member='"
-+        "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor."
-+        "Threshold.Critical'",
-+        [](sdbusplus::message::message& msg) {
-+            std::string interfaceName;
-+            boost::container::flat_map<std::string, std::variant<bool>>
-+                propertiesChanged;
-+            try
-+            {
-+                msg.read(interfaceName, propertiesChanged);
-+            }
-+            catch (const std::exception&)
-+            {
-+                std::cerr << "Unable to read threshold event\n";
-+                return;
-+            }
-+            // Look the property up by name: the changed-properties map may
-+            // be empty, so begin() must not be dereferenced unchecked
-+            const auto alarmIt = propertiesChanged.find("CriticalAlarmHigh");
-+            if (alarmIt == propertiesChanged.end())
-+            {
-+                return;
-+            }
-+            const bool* alarm = std::get_if<bool>(&alarmIt->second);
-+            if (alarm == nullptr)
-+            {
-+                std::cerr << alarmIt->first << " property invalid\n";
-+                return;
-+            }
-+            // Only sensors whose object path contains "DIMM" are tracked
-+            const std::string sensor = msg.get_path();
-+            if (sensor.find("DIMM") == std::string::npos)
-+            {
-+                return;
-+            }
-+
-+            // Paths containing "CPU1" go to CPU1; all others go to CPU2
-+            boost::container::flat_set<std::string>& criticalDIMMs =
-+                (sensor.find("CPU1") != std::string::npos) ? cpu1CriticalDIMMs
-+                                                           : cpu2CriticalDIMMs;
-+            if (*alarm)
-+            {
-+                // DIMM crossed a critical threshold, so store it
-+                criticalDIMMs.insert(sensor);
-+            }
-+            else
-+            {
-+                // DIMM is no longer critical, so remove it
-+                criticalDIMMs.erase(sensor);
-+            }
-+        });
-+}
-+
- static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
- {
- return std::make_shared<sdbusplus::bus::match::match>(
-@@ -851,7 +916,9 @@ static void cpu1MemtripHandler()
-
- bool cpu1Memtrip =
- gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
-- if (cpu1Memtrip)
-+
-+ // Only log a memory thermtrip if a DIMM is critical
-+ if (cpu1Memtrip && !cpu1CriticalDIMMs.empty())
- {
- memThermTripLog(1);
- }
-@@ -911,7 +978,9 @@ static void cpu2MemtripHandler()
-
- bool cpu2Memtrip =
- gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
-- if (cpu2Memtrip)
-+
-+ // Only log a memory thermtrip if a DIMM is critical
-+ if (cpu2Memtrip && !cpu2CriticalDIMMs.empty())
- {
- memThermTripLog(2);
- }
-@@ -1521,13 +1590,13 @@ static void initializeErrorState()
- }
-
- // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
-- if (cpu1MemtripLine.get_value() == 0)
-+ if ((cpu1MemtripLine.get_value() == 0) && !cpu1CriticalDIMMs.empty())
- {
- memThermTripLog(1);
- }
-
- // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
-- if (cpu2MemtripLine.get_value() == 0)
-+ if ((cpu2MemtripLine.get_value() == 0) && !cpu2CriticalDIMMs.empty())
- {
- memThermTripLog(2);
- }
-@@ -1639,6 +1708,10 @@ int main(int argc, char* argv[])
- std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
- host_error_monitor::startHostStateMonitor();
-
-+ // Start tracking critical DIMM status
-+ std::shared_ptr<sdbusplus::bus::match::match> dimmThresholdEventMonitor =
-+ host_error_monitor::startDIMMThresholdEventMonitor();
-+
- // Request CPU1_MISMATCH GPIO events
- if (!host_error_monitor::requestGPIOInput(
- "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
---
-2.17.1
-