From c09e608da2f63eed5b73891d5c032b646d8e81eb Mon Sep 17 00:00:00 2001 From: "Jason M. Bills" Date: Wed, 22 Jul 2020 14:30:04 -0700 Subject: [PATCH 1/2] Filter memory thermtrip events based on DIMM status There is a race-condition on shutdown that makes it difficult to differentiate between a normal shutdown and a memory thermtrip shutdown. This race-condition will be resolved in the CPLD for future platforms but for now it requires a workaround. This workaround assumes that a memory thermtrip can only occur if a DIMM temperature sensor has already reached a critical threshold. When memory thermtrip asserts on shutdown, it only logs an error if a DIMM is critical; otherwise it is treated as a normal shutdown. Tested: Memory thermtrip errors no longer log on each power-off. Manually set a DIMM temperature above critical and verified that the memory thermtrip event is logged. Change-Id: I9c38b41db30046499297ee24cc3a2790920b19d3 Signed-off-by: Jason M. Bills --- src/host_error_monitor.cpp | 81 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 4 deletions(-) diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp index d52a5dc6a..77d065fa3 100644 --- a/src/host_error_monitor.cpp +++ b/src/host_error_monitor.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,9 @@ static std::shared_ptr associationCATAssert; static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager"; +static boost::container::flat_set cpu1CriticalDIMMs; +static boost::container::flat_set cpu2CriticalDIMMs; + static bool hostOff = true; static size_t caterrTimeoutMs = 2000; @@ -274,6 +278,67 @@ static void initializeHostState() "xyz.openbmc_project.State.Host", "CurrentHostState"); } +static std::shared_ptr + startDIMMThresholdEventMonitor() +{ + return std::make_shared( + *conn, + "type='signal',interface='org.freedesktop.DBus.Properties',member='" + "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor." + "Threshold.Critical'", + [](sdbusplus::message::message& msg) { + std::string interfaceName; + boost::container::flat_map> + propertiesChanged; + try + { + msg.read(interfaceName, propertiesChanged); + } + catch (std::exception& e) + { + std::cerr << "Unable to read threshold event\n"; + return; + } + // We only want to check for CriticalAlarmHigh + if (propertiesChanged.begin()->first != "CriticalAlarmHigh") + { + return; + } + const bool* alarm = + std::get_if(&(propertiesChanged.begin()->second)); + if (alarm == nullptr) + { + std::cerr << propertiesChanged.begin()->first + << " property invalid\n"; + return; + } + + // Get the sensor path and check if it's a DIMM sensor + std::string sensor = msg.get_path(); + if (sensor.find("DIMM") == std::string::npos) + { + // Not a DIMM sensor + return; + } + + // Determine which CPU the DIMM belongs to + boost::container::flat_set& criticalDIMMs = + (sensor.find("CPU1") != std::string::npos) ? cpu1CriticalDIMMs + : cpu2CriticalDIMMs; + + if (*alarm) + { + // DIMM crossed a critical threshold, so store it + criticalDIMMs.insert(sensor); + } + else + { + // DIMM is no longer critical, so remove it + criticalDIMMs.erase(sensor); + } + }); +} + static std::shared_ptr startHostStateMonitor() { return std::make_shared( @@ -851,7 +916,9 @@ static void cpu1MemtripHandler() bool cpu1Memtrip = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; - if (cpu1Memtrip) + + // Only log a memory thermtrip if a DIMM is critical + if (cpu1Memtrip && !cpu1CriticalDIMMs.empty()) { memThermTripLog(1); } @@ -911,7 +978,9 @@ static void cpu2MemtripHandler() bool cpu2Memtrip = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; - if (cpu2Memtrip) + + // Only log a memory thermtrip if a DIMM is critical + if (cpu2Memtrip && !cpu2CriticalDIMMs.empty()) { memThermTripLog(2); } @@ -1521,13 +1590,13 @@ static void initializeErrorState() } // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now - if (cpu1MemtripLine.get_value() == 0) + if ((cpu1MemtripLine.get_value() == 0) && !cpu1CriticalDIMMs.empty()) { memThermTripLog(1); } // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now - if (cpu2MemtripLine.get_value() == 0) + if ((cpu2MemtripLine.get_value() == 0) && !cpu2CriticalDIMMs.empty()) { memThermTripLog(2); } @@ -1639,6 +1708,10 @@ int main(int argc, char* argv[]) std::shared_ptr hostStateMonitor = host_error_monitor::startHostStateMonitor(); + // Start tracking critical DIMM status + std::shared_ptr dimmThresholdEventMonitor = + host_error_monitor::startDIMMThresholdEventMonitor(); + // Request CPU1_MISMATCH GPIO events if (!host_error_monitor::requestGPIOInput( "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine)) -- 2.17.1