summaryrefslogtreecommitdiff
path: root/meta-openbmc-mods/meta-common/recipes-phosphor/fans/phosphor-pid-control/0001-Eliminate-swampd-core-dump-after-D-Bus-updates-senso.patch
blob: d2a8d7c4036ed6c91210eab41591d0f02e263f37 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
From 26db33e341f7e96931905aee4358353b0c6aee39 Mon Sep 17 00:00:00 2001
From: Johnathan Mantey <johnathanx.mantey@intel.com>
Date: Mon, 28 Sep 2020 11:06:58 -0700
Subject: [PATCH] Eliminate swampd core dump after D-Bus updates sensors

The swamp daemon intializes a list of sensors and uses those to
periodically scan the state associated devices. Reading the sensors is
done with an async timer, that runs code to re-arm an async timer.

There is also a D-Bus update cycle that is independent of the async
timer reading the sensors. When the D-Bus updates the number of
sensors in the system a new list must be created. In order to create
the new list the timers using the old list must be stopped. Only after
those timers have stopped may a new list be generated, and a new set of
timers started.

The two processes are unware of each other. To safely perform the
change the pointers to the list of zones and timers must be kept alive
until all timer actions complete. Only after all references to the
pointers have been release may the new state be built, and new timers
started.

Prior to this change swampd would throw a SYSSEGV fault due to an
attempt to use a pointer that was no longer active.

Tested:
Issued a "reset -w" (Warm Reset command) from the EFI shell.
Waited for the system to reboot, and enter EFI
Checked for a core file in /var/lib/systemd/coredump
Repeated step 1 if coredump file was not present.
Completed 2900+ passes successfully when ealier code failed at less
than 800 passes.

Change-Id: I10ab824d8050be9eca63c18d7e5a62bdb41e9c64
Signed-off-by: Johnathan Mantey <johnathanx.mantey@intel.com>
---
 main.cpp        | 14 ++++++++++----
 pid/builder.cpp |  6 +++---
 pid/builder.hpp |  2 +-
 pid/pidloop.cpp | 13 +++++++------
 pid/pidloop.hpp |  3 ++-
 5 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/main.cpp b/main.cpp
index 2ab3fc4..46cb38d 100644
--- a/main.cpp
+++ b/main.cpp
@@ -72,10 +72,15 @@ static sdbusplus::asio::connection
 void restartControlLoops()
 {
     static SensorManager mgmr;
-    static std::unordered_map<int64_t, std::unique_ptr<PIDZone>> zones;
-    static std::list<boost::asio::steady_timer> timers;
+    static std::unordered_map<int64_t, std::shared_ptr<PIDZone>> zones;
+    static std::vector<std::shared_ptr<boost::asio::steady_timer>> timers;
 
+    for (const auto timer : timers)
+    {
+        timer->cancel();
+    }
     timers.clear();
+    zones.clear();
 
 #if CONFIGURE_DBUS
 
@@ -117,9 +122,10 @@ void restartControlLoops()
 
     for (const auto& i : zones)
     {
-        auto& timer = timers.emplace_back(io);
+        std::shared_ptr<boost::asio::steady_timer> timer = timers.emplace_back(
+            std::make_shared<boost::asio::steady_timer>(io));
         std::cerr << "pushing zone " << i.first << "\n";
-        pidControlLoop(i.second.get(), timer);
+        pidControlLoop(i.second, timer);
     }
 }
 
diff --git a/pid/builder.cpp b/pid/builder.cpp
index 1fbfbd4..15fc4cd 100644
--- a/pid/builder.cpp
+++ b/pid/builder.cpp
@@ -35,12 +35,12 @@ static std::string getControlPath(int64_t zone)
     return std::string(objectPath) + std::to_string(zone);
 }
 
-std::unordered_map<int64_t, std::unique_ptr<PIDZone>>
+std::unordered_map<int64_t, std::shared_ptr<PIDZone>>
     buildZones(std::map<int64_t, conf::PIDConf>& zonePids,
                std::map<int64_t, struct conf::ZoneConfig>& zoneConfigs,
                SensorManager& mgr, sdbusplus::bus::bus& modeControlBus)
 {
-    std::unordered_map<int64_t, std::unique_ptr<PIDZone>> zones;
+    std::unordered_map<int64_t, std::shared_ptr<PIDZone>> zones;
 
     for (const auto& zi : zonePids)
     {
@@ -62,7 +62,7 @@ std::unordered_map<int64_t, std::unique_ptr<PIDZone>>
 
         const conf::PIDConf& pidConfig = zi.second;
 
-        auto zone = std::make_unique<PIDZone>(
+        auto zone = std::make_shared<PIDZone>(
             zoneId, zoneConf->second.minThermalOutput,
             zoneConf->second.failsafePercent, mgr, modeControlBus,
             getControlPath(zi.first).c_str(), deferSignals);
diff --git a/pid/builder.hpp b/pid/builder.hpp
index e500503..e3ba88c 100644
--- a/pid/builder.hpp
+++ b/pid/builder.hpp
@@ -7,7 +7,7 @@
 #include <sdbusplus/bus.hpp>
 #include <unordered_map>
 
-std::unordered_map<int64_t, std::unique_ptr<PIDZone>>
+std::unordered_map<int64_t, std::shared_ptr<PIDZone>>
     buildZones(std::map<int64_t, conf::PIDConf>& zonePids,
                std::map<int64_t, struct conf::ZoneConfig>& zoneConfigs,
                SensorManager& mgr, sdbusplus::bus::bus& modeControlBus);
diff --git a/pid/pidloop.cpp b/pid/pidloop.cpp
index 56bf8bd..14225ec 100644
--- a/pid/pidloop.cpp
+++ b/pid/pidloop.cpp
@@ -27,7 +27,7 @@
 #include <thread>
 #include <vector>
 
-static void processThermals(PIDZone* zone)
+static void processThermals(std::shared_ptr<PIDZone> zone)
 {
     // Get the latest margins.
     zone->updateSensors();
@@ -40,8 +40,9 @@ static void processThermals(PIDZone* zone)
     zone->determineMaxSetPointRequest();
 }
 
-void pidControlLoop(PIDZone* zone, boost::asio::steady_timer& timer, bool first,
-                    int ms100cnt)
+void pidControlLoop(std::shared_ptr<PIDZone> zone,
+                    std::shared_ptr<boost::asio::steady_timer> timer,
+                    bool first, int ms100cnt)
 {
     if (first)
     {
@@ -54,9 +55,9 @@ void pidControlLoop(PIDZone* zone, boost::asio::steady_timer& timer, bool first,
         processThermals(zone);
     }
 
-    timer.expires_after(std::chrono::milliseconds(100));
-    timer.async_wait(
-        [zone, &timer, ms100cnt](const boost::system::error_code& ec) mutable {
+    timer->expires_after(std::chrono::milliseconds(100));
+    timer->async_wait(
+        [zone, timer, ms100cnt](const boost::system::error_code& ec) mutable {
             if (ec == boost::asio::error::operation_aborted)
             {
                 return; // timer being canceled, stop loop
diff --git a/pid/pidloop.hpp b/pid/pidloop.hpp
index 3a67954..7aef73a 100644
--- a/pid/pidloop.hpp
+++ b/pid/pidloop.hpp
@@ -14,5 +14,6 @@
  * @param[in] first - boolean to denote if initialization needs to be run.
  * @param[in] ms100cnt - loop timer counter.
  */
-void pidControlLoop(PIDZone* zone, boost::asio::steady_timer& timer,
+void pidControlLoop(std::shared_ptr<PIDZone> zone,
+                    std::shared_ptr<boost::asio::steady_timer> timer,
                     bool first = true, int ms100cnt = 0);
-- 
2.26.2