summaryrefslogtreecommitdiff
path: root/meta-openbmc-mods/meta-common/recipes-phosphor/fans/phosphor-pid-control/0002-Prevent-run-away-memory-consumption-from-swamped.patch
blob: 03ed37e6af29427222b3bcc533b8ba94f60116b3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
From 76b39bb8e638114fdc448334926fe82003fc9598 Mon Sep 17 00:00:00 2001
From: Zhikui Ren <zhikui.ren@intel.com>
Date: Wed, 21 Oct 2020 11:02:50 -0700
Subject: [PATCH] Prevent run away memory consumption from swamped

These set of changes mitigates swampd memory leak.
Root cause of memory leak has not been identified.
Changes include:
1. Do not rebuild pid control loop for voltage sensor changes
2. Use default system dbus for helper function temp dbus connection
3. Only rebuild pid control loop when reload timer is not active
4. Explicit clear sensor map in static sensor manager struct before
rebuildng pid control loop.
5. Add abort flag in pid loop.
6. Increase event handler timeout time from 2 seconds to 8 seconds.

Tested:
Build and run 1000 dc cycle, reset and stop/start psusensor service.
Swampd memory comsumption was stable after some increase.
Statm for swampd:
iteration 1:    1922 1326 1098 101 0 238 0
iteration 1000: 2056 1506 1139 101 0 372 0
Maximum statm for swampd: 2126 1546 1139 101 0 442 0

Signed-off-by: Zhikui Ren <zhikui.ren@intel.com>
---
 dbus/dbusconfiguration.cpp | 36 ++++++++++++++++++++++++++++--------
 dbus/dbuspassive.cpp       |  2 +-
 dbus/dbuswrite.cpp         |  8 ++++----
 main.cpp                   |  7 ++++++-
 pid/pidloop.cpp            | 15 +++++++++++++++
 pid/zone.hpp               |  2 ++
 sensors/manager.hpp        |  6 ++++++
 7 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/dbus/dbusconfiguration.cpp b/dbus/dbusconfiguration.cpp
index bfa2a7c..82e1da0 100644
--- a/dbus/dbusconfiguration.cpp
+++ b/dbus/dbusconfiguration.cpp
@@ -18,6 +18,7 @@
 #include "util.hpp"
 
 #include <algorithm>
+#include <boost/algorithm/string/predicate.hpp>
 #include <boost/asio/steady_timer.hpp>
 #include <chrono>
 #include <functional>
@@ -270,6 +271,11 @@ int eventHandler(sd_bus_message* m, void* context, sd_bus_error*)
                 }
             }
         }
+
+        if (path.str.find("/xyz/openbmc_project/sensors/voltage") == 0)
+        {
+            return 1;
+        }
     }
 
     boost::asio::steady_timer* timer =
@@ -277,16 +283,27 @@ int eventHandler(sd_bus_message* m, void* context, sd_bus_error*)
 
     // do a brief sleep as we tend to get a bunch of these events at
     // once
-    timer->expires_after(std::chrono::seconds(2));
+    static int timercnt = 0;
+    ++timercnt;
+    timer->expires_after(std::chrono::seconds(8));
     timer->async_wait([](const boost::system::error_code ec) {
-        if (ec == boost::asio::error::operation_aborted)
+        --timercnt;
+        if (ec)
         {
-            /* another timer started*/
+            if (ec != boost::asio::error::operation_aborted)
+            {
+                // ec == boost::asio::error::operation_aborted means
+                // another timer started, which is normal
+                std::cerr << "Reload timer async handler error - timerCnt "
+                          << timercnt << "\n";
+            }
             return;
         }
-
-        std::cout << "New configuration detected, reloading\n.";
-        tryRestartControlLoops();
+        if (timercnt <= 0)
+        {
+            std::cout << "New configuration detected, reloading\n.";
+            tryRestartControlLoops();
+        }
     });
 
     return 1;
@@ -433,7 +450,6 @@ void populatePidInfo(
 
 bool init(sdbusplus::bus::bus& bus, boost::asio::steady_timer& timer)
 {
-
     sensorConfig.clear();
     zoneConfig.clear();
     zoneDetailsConfig.clear();
@@ -486,6 +502,11 @@ bool init(sdbusplus::bus::bus& bus, boost::asio::steady_timer& timer)
     {
         for (const auto& ownerPair : objectPair.second)
         {
+            if (boost::ends_with(ownerPair.first, "ADCSensor"))
+            {
+                continue;
+            }
+
             auto& owner = owners[ownerPair.first];
             for (const std::string& interface : ownerPair.second)
             {
@@ -629,7 +650,6 @@ bool init(sdbusplus::bus::bus& bus, boost::asio::steady_timer& timer)
         auto findBase = configuration.second.find(pidConfigurationInterface);
         if (findBase != configuration.second.end())
         {
-
             const auto& base =
                 configuration.second.at(pidConfigurationInterface);
             const std::vector<std::string>& zones =
diff --git a/dbus/dbuspassive.cpp b/dbus/dbuspassive.cpp
index 02d1beb..4c6e405 100644
--- a/dbus/dbuspassive.cpp
+++ b/dbus/dbuspassive.cpp
@@ -41,7 +41,7 @@ std::unique_ptr<ReadInterface> DbusPassive::createDbusPassive(
     }
 
     /* Need to get the scale and initial value */
-    auto tempBus = sdbusplus::bus::new_system();
+    auto tempBus = sdbusplus::bus::new_default_system();
 
     /* service == busname */
     std::string path = getSensorPath(type, id);
diff --git a/dbus/dbuswrite.cpp b/dbus/dbuswrite.cpp
index 76f4ada..583a065 100644
--- a/dbus/dbuswrite.cpp
+++ b/dbus/dbuswrite.cpp
@@ -31,7 +31,7 @@ std::unique_ptr<WriteInterface>
     DbusWritePercent::createDbusWrite(const std::string& path, int64_t min,
                                       int64_t max, DbusHelperInterface& helper)
 {
-    auto tempBus = sdbusplus::bus::new_system();
+    auto tempBus = sdbusplus::bus::new_default_system();
     std::string connectionName;
 
     try
@@ -59,7 +59,7 @@ void DbusWritePercent::write(double value)
     {
         return;
     }
-    auto writeBus = sdbusplus::bus::new_default();
+    auto writeBus = sdbusplus::bus::new_default_system();
     auto mesg =
         writeBus.new_method_call(connectionName.c_str(), path.c_str(),
                                  "org.freedesktop.DBus.Properties", "Set");
@@ -85,7 +85,7 @@ std::unique_ptr<WriteInterface>
     DbusWrite::createDbusWrite(const std::string& path, int64_t min,
                                int64_t max, DbusHelperInterface& helper)
 {
-    auto tempBus = sdbusplus::bus::new_system();
+    auto tempBus = sdbusplus::bus::new_default_system();
     std::string connectionName;
 
     try
@@ -106,7 +106,7 @@ void DbusWrite::write(double value)
     {
         return;
     }
-    auto writeBus = sdbusplus::bus::new_default();
+    auto writeBus = sdbusplus::bus::new_default_system();
     auto mesg =
         writeBus.new_method_call(connectionName.c_str(), path.c_str(),
                                  "org.freedesktop.DBus.Properties", "Set");
diff --git a/main.cpp b/main.cpp
index 46cb38d..a07ca80 100644
--- a/main.cpp
+++ b/main.cpp
@@ -79,11 +79,15 @@ void restartControlLoops()
     {
         timer->cancel();
     }
+    for (const auto zone : zones)
+    {
+        zone.second->abort = true;
+    }
     timers.clear();
     zones.clear();
+    mgmr.clearSensor();
 
 #if CONFIGURE_DBUS
-
     static boost::asio::steady_timer reloadTimer(io);
     if (!dbus_configuration::init(modeControlBus, reloadTimer))
     {
@@ -190,5 +194,6 @@ int main(int argc, char* argv[])
     tryRestartControlLoops();
 
     io.run();
+
     return 0;
 }
diff --git a/pid/pidloop.cpp b/pid/pidloop.cpp
index 14225ec..62bf323 100644
--- a/pid/pidloop.cpp
+++ b/pid/pidloop.cpp
@@ -22,6 +22,7 @@
 
 #include <boost/asio/steady_timer.hpp>
 #include <chrono>
+#include <iostream>
 #include <map>
 #include <memory>
 #include <thread>
@@ -60,8 +61,22 @@ void pidControlLoop(std::shared_ptr<PIDZone> zone,
         [zone, timer, ms100cnt](const boost::system::error_code& ec) mutable {
             if (ec == boost::asio::error::operation_aborted)
             {
+                std::cerr << "pid loop timer cancelled " << zone << "\n";
                 return; // timer being canceled, stop loop
             }
+            if (ec)
+            {
+                std::cerr << "pid loop timer err " << zone << "\n";
+                return;
+            }
+
+            if (zone->abort)
+            {
+                // on our way to destruct and construct new zone and timer
+                // stop the loop
+                std::cerr << "pid loop abort zone " << zone << "\n";
+                return;
+            }
 
             /*
              * This should sleep on the conditional wait for the listen thread
diff --git a/pid/zone.hpp b/pid/zone.hpp
index 3cf4e59..899636c 100644
--- a/pid/zone.hpp
+++ b/pid/zone.hpp
@@ -97,6 +97,8 @@ class PIDZone : public ZoneInterface, public ModeObject
     /* Method for reading whether in fail-safe mode over dbus */
     bool failSafe() const override;
 
+    bool abort = false;
+
   private:
     std::ofstream _log;
 
diff --git a/sensors/manager.hpp b/sensors/manager.hpp
index 411b302..5c7e69a 100644
--- a/sensors/manager.hpp
+++ b/sensors/manager.hpp
@@ -51,6 +51,12 @@ class SensorManager
         return *_hostSensorBus;
     }
 
+    void clearSensor(void)
+    {
+        _sensorMap.clear();
+        _sensorTypeList.clear();
+    }
+
   private:
     std::map<std::string, std::unique_ptr<Sensor>> _sensorMap;
     std::map<std::string, std::vector<std::string>> _sensorTypeList;
-- 
2.17.1