summaryrefslogtreecommitdiff
path: root/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
blob: 8744776015d2bb38583d7696e42c1a326d7555fe (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
From 4e8b5e2a7cef7e8a4f7a9af077da76c3b7eb6a24 Mon Sep 17 00:00:00 2001
From: "Jason M. Bills" <jason.m.bills@intel.com>
Date: Wed, 22 Jul 2020 14:30:04 -0700
Subject: [PATCH] Filter memory thermtrip events based on DIMM status

There is a race condition on shutdown that makes it difficult to
differentiate between a normal shutdown and a memory thermtrip
shutdown.  This race condition will be resolved in the CPLD for
future platforms, but for now it requires a workaround.

This workaround assumes that a memory thermtrip can only occur
if a DIMM temperature sensor has already reached a critical
threshold.  When memory thermtrip asserts on shutdown, it only
logs an error if a DIMM is critical; otherwise it is treated
as a normal shutdown.

Tested:
Memory thermtrip errors no longer log on each power-off.
Manually set a DIMM temperature above critical and verified
that the memory thermtrip event is logged.

Change-Id: I9c38b41db30046499297ee24cc3a2790920b19d3
Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
---
 src/host_error_monitor.cpp | 77 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 2 deletions(-)

diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
index 313ef29e0..ca089f70d 100644
--- a/src/host_error_monitor.cpp
+++ b/src/host_error_monitor.cpp
@@ -17,6 +17,7 @@
 #include <systemd/sd-journal.h>
 
 #include <boost/asio/posix/stream_descriptor.hpp>
+#include <boost/container/flat_set.hpp>
 #include <gpiod.hpp>
 #include <sdbusplus/asio/object_server.hpp>
 
@@ -36,6 +37,9 @@ static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
 
 static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
 
+static boost::container::flat_set<std::string> cpu1CriticalDIMMs;
+static boost::container::flat_set<std::string> cpu2CriticalDIMMs;
+
 static bool hostOff = true;
 
 static size_t caterrTimeoutMs = 2000;
@@ -258,6 +262,67 @@ static void initializeHostState()
         "xyz.openbmc_project.State.Host", "CurrentHostState");
 }
 
+// Track DIMM sensors in CriticalAlarmHigh via Threshold.Critical
+// PropertiesChanged signals; the memtrip handlers check these per-CPU sets.
+static std::shared_ptr<sdbusplus::bus::match::match>
+    startDIMMThresholdEventMonitor()
+{
+    return std::make_shared<sdbusplus::bus::match::match>(
+        *conn,
+        "type='signal',interface='org.freedesktop.DBus.Properties',member='"
+        "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor."
+        "Threshold.Critical'",
+        [](sdbusplus::message::message& msg) {
+            std::string interfaceName;
+            boost::container::flat_map<std::string, std::variant<bool>>
+                propertiesChanged;
+            try
+            {
+                msg.read(interfaceName, propertiesChanged);
+            }
+            catch (const std::exception&)
+            {
+                std::cerr << "Unable to read threshold event\n";
+                return;
+            }
+            // Look up CriticalAlarmHigh by key; the map may be empty
+            auto alarmIt = propertiesChanged.find("CriticalAlarmHigh");
+            if (alarmIt == propertiesChanged.end())
+            {
+                return;
+            }
+            const bool* alarm = std::get_if<bool>(&alarmIt->second);
+            if (alarm == nullptr)
+            {
+                std::cerr << alarmIt->first << " property invalid\n";
+                return;
+            }

+            // Get the sensor path and check if it's a DIMM sensor
+            std::string sensor = msg.get_path();
+            if (sensor.find("DIMM") == std::string::npos)
+            {
+                return;
+            }

+            // Paths containing "CPU1" use the CPU1 set; all others use CPU2
+            boost::container::flat_set<std::string>& criticalDIMMs =
+                (sensor.find("CPU1") != std::string::npos) ? cpu1CriticalDIMMs
+                                                           : cpu2CriticalDIMMs;

+            if (*alarm)
+            {
+                // DIMM crossed a critical threshold, so store it
+                criticalDIMMs.insert(sensor);
+            }
+            else
+            {
+                // DIMM is no longer critical, so remove it
+                criticalDIMMs.erase(sensor);
+            }
+        });
+}
+
 static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
 {
     return std::make_shared<sdbusplus::bus::match::match>(
@@ -842,7 +907,9 @@ static void cpu1MemtripHandler()
 
     bool cpu1Memtrip =
         gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
-    if (cpu1Memtrip)
+
+    // Only log a memory thermtrip if a DIMM is critical
+    if (cpu1Memtrip && !cpu1CriticalDIMMs.empty())
     {
         memThermTripLog(1);
     }
@@ -902,7 +969,9 @@ static void cpu2MemtripHandler()
 
     bool cpu2Memtrip =
         gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
-    if (cpu2Memtrip)
+
+    // Only log a memory thermtrip if a DIMM is critical
+    if (cpu2Memtrip && !cpu2CriticalDIMMs.empty())
     {
         memThermTripLog(2);
     }
@@ -1621,6 +1690,10 @@ int main(int argc, char* argv[])
     std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
         host_error_monitor::startHostStateMonitor();
 
+    // Start tracking critical DIMM status
+    std::shared_ptr<sdbusplus::bus::match::match> dimmThresholdEventMonitor =
+        host_error_monitor::startDIMMThresholdEventMonitor();
+
     // Request CPU1_MISMATCH GPIO events
     if (!host_error_monitor::requestGPIOInput(
             "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
-- 
2.17.1