summary | refs | log | tree | commit | diff
path: root/meta-openbmc-mods/meta-wht
diff options
context:
space:
mode:
author Jason M. Bills <jason.m.bills@linux.intel.com> 2020-09-21 22:59:23 +0300
committer Jason M. Bills <jason.m.bills@linux.intel.com> 2020-09-22 00:17:37 +0300
commit 5d3fc64c142786581d792d145231c835afbcdded (patch)
tree 18d6cc15665dd2a3cf3ffbdf72cc3aabedc1ba1b /meta-openbmc-mods/meta-wht
parent f99301c1a626951ee7feee081a1494e795d0e243 (diff)
download openbmc-5d3fc64c142786581d792d145231c835afbcdded.tar.xz
Update to internal 0.74-57
Signed-off-by: Jason M. Bills <jason.m.bills@linux.intel.com>
Diffstat (limited to 'meta-openbmc-mods/meta-wht')
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch 133
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0003-Override-crashdump-timeout-to-30-minutes.patch 41
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend 2
3 files changed, 176 insertions, 0 deletions
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch
new file mode 100644
index 000000000..2a573311f
--- /dev/null
+++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch
@@ -0,0 +1,133 @@
+From d0e4130b2d1e0e44efc8fd6e180487853625edd6 Mon Sep 17 00:00:00 2001
+From: "Jason M. Bills" <jason.m.bills@intel.com>
+Date: Mon, 17 Aug 2020 15:52:22 -0700
+Subject: [PATCH] Add a workaround for spurious CPU errors
+
+There is a possible issue where GPIO event interrupts are getting
+missed causing false errors to be logged.
+
+This adds a check that the host is still on and the error is still
+asserted before logging an error.
+
+Tested:
+Confirmed that a spurious SMI event was ignored correctly after
+this change.
+
+Change-Id: Id83d9d67b15dcf9035e6448086b140e5c7dab4fe
+Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
+---
+ src/host_error_monitor.cpp | 77 ++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 77 insertions(+)
+
+diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
+index ca089f70d..fd453ccdc 100644
+--- a/src/host_error_monitor.cpp
++++ b/src/host_error_monitor.cpp
+@@ -797,6 +797,18 @@ static void caterrAssertHandler()
+ }
+ return;
+ }
++ // Confirm that this is a real failure by checking that the host is on
++ if (hostOff)
++ {
++ return;
++ }
++ // And that the signal is still asserted
++ if (caterrLine.get_value() != 0)
++ {
++ std::cerr
++ << "CPU_CATERR not asserted after timeout. Error ignored.\n";
++ return;
++ }
+ std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
+ << " ms\n";
+ beep(beepCPUIERR);
+@@ -1270,6 +1282,48 @@ static void errXAssertHandler(const int errPin,
+ }
+ return;
+ }
++ // Confirm that this is a real failure by checking that the host is on
++ if (hostOff)
++ {
++ return;
++ }
++ // And that the signal is still asserted
++ switch (errPin)
++ {
++ case 0:
++ {
++ if (err0Line.get_value() != 0)
++ {
++ std::cerr << "CPU_ERR0 not asserted after timeout. Error "
++ "ignored.\n";
++ return;
++ }
++ break;
++ }
++ case 1:
++ {
++ if (err1Line.get_value() != 0)
++ {
++ std::cerr << "CPU_ERR1 not asserted after timeout. Error "
++ "ignored.\n";
++ return;
++ }
++ break;
++ }
++ case 2:
++ {
++ if (err2Line.get_value() != 0)
++ {
++ std::cerr << "CPU_ERR2 not asserted after timeout. Error "
++ "ignored.\n";
++ return;
++ }
++ break;
++ }
++ default:
++ std::cerr << "Invalid ERR pin asserted\n";
++ return;
++ }
+ std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
+ << std::to_string(errTimeoutMs) << " ms\n";
+ if (errPinCPUs.count())
+@@ -1379,6 +1433,18 @@ static void err2AssertHandler()
+ }
+ return;
+ }
++ // Confirm that this is a real failure by checking that the host is on
++ if (hostOff)
++ {
++ return;
++ }
++ // And that the signal is still asserted
++ if (err2Line.get_value() != 0)
++ {
++ std::cerr
++ << "CPU_ERR2 not asserted after timeout. Error ignored.\n";
++ return;
++ }
+ conn->async_method_call(
+ [](boost::system::error_code ec,
+ const std::variant<bool>& property) {
+@@ -1447,6 +1513,17 @@ static void smiAssertHandler()
+ }
+ return;
+ }
++ // Confirm that this is a real failure by checking that the host is on
++ if (hostOff)
++ {
++ return;
++ }
++ // And that the signal is still asserted
++ if (smiLine.get_value() != 0)
++ {
++ std::cerr << "SMI not asserted after timeout. Error ignored.\n";
++ return;
++ }
+ std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
+ << " ms\n";
+ smiTimeoutLog();
+--
+2.17.1
+
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0003-Override-crashdump-timeout-to-30-minutes.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0003-Override-crashdump-timeout-to-30-minutes.patch
new file mode 100644
index 000000000..7511cc218
--- /dev/null
+++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0003-Override-crashdump-timeout-to-30-minutes.patch
@@ -0,0 +1,41 @@
+From 6d3f28619226c0dbfca6feb320a2fa292aa11f40 Mon Sep 17 00:00:00 2001
+From: "Jason M. Bills" <jason.m.bills@intel.com>
+Date: Wed, 2 Sep 2020 18:23:00 -0700
+Subject: [PATCH] Override crashdump timeout to 30 minutes
+
+The current crashdump timeout is set to 5 minutes, so if it takes
+longer than 5 minutes to complete the crashdump, then
+host-error-monitor will not reset the system to recover from the
+error.
+
+My current crashdump on a single socket is taking about 11 minutes
+to complete. This is a workaround to change the timeout to 30
+minutes to give enough time for crashdump to complete before timing
+out.
+
+Tested:
+Confirmed that when I inject an IERR on my system, it will correctly
+reset after 11 minutes when crashdump completes.
+
+Change-Id: I36ac3f0c83ae1634e486d9f8413b38fae5efb561
+Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
+---
+ src/host_error_monitor.cpp | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
+index fd453ccdc..9dabb52bb 100644
+--- a/src/host_error_monitor.cpp
++++ b/src/host_error_monitor.cpp
+@@ -46,7 +46,7 @@ static size_t caterrTimeoutMs = 2000;
+ const static constexpr size_t caterrTimeoutMsMax = 600000; // 10 minutes maximum
+ const static constexpr size_t errTimeoutMs = 90000;
+ const static constexpr size_t smiTimeoutMs = 90000;
+-const static constexpr size_t crashdumpTimeoutS = 300;
++const static constexpr size_t crashdumpTimeoutS = 1800;
+
+ // Timers
+ // Timer for CATERR asserted
+--
+2.17.1
+
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend
index 4b79757c0..9853a4abc 100644
--- a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend
+++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend
@@ -2,4 +2,6 @@ FILESEXTRAPATHS_append := "${THISDIR}/${PN}:"
SRC_URI += " \
file://0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch \
+ file://0002-Add-a-workaround-for-spurious-CPU-errors.patch \
+ file://0003-Override-crashdump-timeout-to-30-minutes.patch \
"