summary | refs | log | tree | commit | diff
path: root/meta-openbmc-mods/meta-wht
diff options
context:
space:
mode:
author Jason M. Bills <jason.m.bills@linux.intel.com> 2021-04-19 22:13:22 +0300
committer Jason M. Bills <jason.m.bills@linux.intel.com> 2021-04-19 23:02:05 +0300
commit ab16ab3d0de4dc9d130ae3db366c38888f1ada5a (patch)
tree d7b76b8111aedb06ee17ced2c9cbdebaeaaf6311 /meta-openbmc-mods/meta-wht
parent 36caa12533da01d4319c5ffe7613711a0ec7dea7 (diff)
download openbmc-ab16ab3d0de4dc9d130ae3db366c38888f1ada5a.tar.xz
Update to internal 0.45
Signed-off-by: Jason M. Bills <jason.m.bills@linux.intel.com>
Diffstat (limited to 'meta-openbmc-mods/meta-wht')
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Configure-host-error-monitors-for-meta-wht.patch | 194
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch | 169
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch | 133
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Filter-memory-thermtrip-events-based-on-DIMM-status.patch | 143
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend | 6
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci/99-peci.rules | 2
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci_%.bbappend | 8
7 files changed, 350 insertions, 305 deletions
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Configure-host-error-monitors-for-meta-wht.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Configure-host-error-monitors-for-meta-wht.patch
new file mode 100644
index 000000000..17f16cce5
--- /dev/null
+++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Configure-host-error-monitors-for-meta-wht.patch
@@ -0,0 +1,194 @@
+From 1b2df626b20aa14c0de7f46915758d10394d01b4 Mon Sep 17 00:00:00 2001
+From: "Jason M. Bills" <jason.m.bills@intel.com>
+Date: Tue, 15 Dec 2020 10:05:31 -0800
+Subject: [PATCH] Configure host error monitors for meta-wht
+
+The new host error monitor architecture allows the list of error
+monitors to be customized through a platform-specific patch file.
+
+This patch configures the host error monitors for meta-wht.
+
+Change-Id: I7070a3409b1471d7f9c93eca3e36b477f484e5d7
+Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
+---
+ include/error_monitors.hpp | 137 ++++++++++++++++++++++++++++++++++---
+ 1 file changed, 129 insertions(+), 8 deletions(-)
+
+diff --git a/include/error_monitors.hpp b/include/error_monitors.hpp
+index 55b8790d..8d1651d3 100644
+--- a/include/error_monitors.hpp
++++ b/include/error_monitors.hpp
+@@ -14,23 +14,88 @@
+ // limitations under the License.
+ */
+ #pragma once
++#include <error_monitors/cpu_mismatch_monitor.hpp>
++#include <error_monitors/cpu_thermtrip_monitor.hpp>
++#include <error_monitors/err2_monitor.hpp>
++#include <error_monitors/err_pin_monitor.hpp>
++#include <error_monitors/ierr_monitor.hpp>
++#include <error_monitors/mem_thermtrip_monitor.hpp>
++#include <error_monitors/pch_thermtrip_monitor.hpp>
++#include <error_monitors/smi_monitor.hpp>
++#include <error_monitors/vr_hot_monitor.hpp>
+ #include <sdbusplus/asio/object_server.hpp>
+-// #include <error_monitors/smi_monitor.hpp>
+
+ #include <memory>
+
+ namespace host_error_monitor::error_monitors
+ {
+ // Error signals to monitor
+-// static std::unique_ptr<host_error_monitor::smi_monitor::SMIMonitor>
+-// smiMonitor;
++static std::unique_ptr<host_error_monitor::smi_monitor::SMIMonitor> smiMonitor;
++static std::unique_ptr<
++ host_error_monitor::cpu_mismatch_monitor::CPUMismatchMonitor>
++ cpu1MismatchMonitor;
++static std::unique_ptr<
++ host_error_monitor::cpu_mismatch_monitor::CPUMismatchMonitor>
++ cpu2MismatchMonitor;
++static std::unique_ptr<host_error_monitor::err_pin_monitor::ErrPinMonitor>
++ err0Monitor;
++static std::unique_ptr<host_error_monitor::err_pin_monitor::ErrPinMonitor>
++ err1Monitor;
++static std::unique_ptr<host_error_monitor::err2_monitor::Err2Monitor>
++ err2Monitor;
++static std::unique_ptr<host_error_monitor::ierr_monitor::IERRMonitor>
++ ierrMonitor;
++static std::unique_ptr<
++ host_error_monitor::cpu_thermtrip_monitor::CPUThermtripMonitor>
++ cpu1ThermtripMonitor;
++static std::unique_ptr<
++ host_error_monitor::cpu_thermtrip_monitor::CPUThermtripMonitor>
++ cpu2ThermtripMonitor;
++static std::unique_ptr<
++ host_error_monitor::mem_thermtrip_monitor::MemThermtripMonitor>
++ mem1ThermtripMonitor;
++static std::unique_ptr<
++ host_error_monitor::mem_thermtrip_monitor::MemThermtripMonitor>
++ mem2ThermtripMonitor;
++static std::unique_ptr<host_error_monitor::vr_hot_monitor::VRHotMonitor>
++ cpu1VRHotMonitor;
++static std::unique_ptr<host_error_monitor::vr_hot_monitor::VRHotMonitor>
++ cpu1MemABCDVRHotMonitor;
++static std::unique_ptr<host_error_monitor::vr_hot_monitor::VRHotMonitor>
++ cpu1MemEFGHVRHotMonitor;
++static std::unique_ptr<host_error_monitor::vr_hot_monitor::VRHotMonitor>
++ cpu2VRHotMonitor;
++static std::unique_ptr<host_error_monitor::vr_hot_monitor::VRHotMonitor>
++ cpu2MemABCDVRHotMonitor;
++static std::unique_ptr<host_error_monitor::vr_hot_monitor::VRHotMonitor>
++ cpu2MemEFGHVRHotMonitor;
++static std::unique_ptr<
++ host_error_monitor::pch_thermtrip_monitor::PCHThermtripMonitor>
++ pchThermtripMonitor;
+
+ // Check if all the signal monitors started successfully
+ bool checkMonitors()
+ {
+ bool ret = true;
+
+- // ret &= smiMonitor->isValid();
++ ret &= smiMonitor->isValid(); // every pointer dereferenced here is null until startMonitors() assigns it
++ ret &= cpu1MismatchMonitor->isValid();
++ ret &= cpu2MismatchMonitor->isValid();
++ ret &= err0Monitor->isValid();
++ ret &= err1Monitor->isValid();
++ ret &= err2Monitor->isValid();
++ ret &= ierrMonitor->isValid();
++ ret &= cpu1ThermtripMonitor->isValid();
++ ret &= cpu2ThermtripMonitor->isValid();
++ ret &= mem1ThermtripMonitor->isValid();
++ ret &= mem2ThermtripMonitor->isValid();
++ ret &= cpu1VRHotMonitor->isValid();
++ ret &= cpu1MemABCDVRHotMonitor->isValid();
++ ret &= cpu1MemEFGHVRHotMonitor->isValid();
++ ret &= cpu2VRHotMonitor->isValid();
++ ret &= cpu2MemABCDVRHotMonitor->isValid();
++ ret &= cpu2MemEFGHVRHotMonitor->isValid();
++ ret &= pchThermtripMonitor->isValid();
+
+ return ret;
+ }
+@@ -39,9 +104,59 @@ bool checkMonitors()
+ bool startMonitors(boost::asio::io_service& io,
+ std::shared_ptr<sdbusplus::asio::connection> conn)
+ {
+- // smiMonitor =
+- // std::make_unique<host_error_monitor::smi_monitor::SMIMonitor>(
+- // io, conn, "SMI");
++ smiMonitor = std::make_unique<host_error_monitor::smi_monitor::SMIMonitor>(
++ io, conn, "SMI");
++ cpu1MismatchMonitor = std::make_unique<
++ host_error_monitor::cpu_mismatch_monitor::CPUMismatchMonitor>(
++ io, conn, "CPU1_MISMATCH", 1);
++ cpu2MismatchMonitor = std::make_unique<
++ host_error_monitor::cpu_mismatch_monitor::CPUMismatchMonitor>(
++ io, conn, "CPU2_MISMATCH", 2);
++ err0Monitor =
++ std::make_unique<host_error_monitor::err_pin_monitor::ErrPinMonitor>(
++ io, conn, "CPU_ERR0", 0);
++ err1Monitor =
++ std::make_unique<host_error_monitor::err_pin_monitor::ErrPinMonitor>(
++ io, conn, "CPU_ERR1", 1);
++ err2Monitor =
++ std::make_unique<host_error_monitor::err2_monitor::Err2Monitor>(
++ io, conn, "CPU_ERR2");
++ ierrMonitor =
++ std::make_unique<host_error_monitor::ierr_monitor::IERRMonitor>(
++ io, conn, "CPU_CATERR");
++ cpu1ThermtripMonitor = std::make_unique<
++ host_error_monitor::cpu_thermtrip_monitor::CPUThermtripMonitor>(
++ io, conn, "CPU1_THERMTRIP", 1, "CPU1_FIVR_FAULT");
++ cpu2ThermtripMonitor = std::make_unique<
++ host_error_monitor::cpu_thermtrip_monitor::CPUThermtripMonitor>(
++ io, conn, "CPU2_THERMTRIP", 2, "CPU2_FIVR_FAULT");
++ mem1ThermtripMonitor = std::make_unique<
++ host_error_monitor::mem_thermtrip_monitor::MemThermtripMonitor>(
++ io, conn, "CPU1_MEM_THERM_EVENT", 1);
++ mem2ThermtripMonitor = std::make_unique<
++ host_error_monitor::mem_thermtrip_monitor::MemThermtripMonitor>(
++ io, conn, "CPU2_MEM_THERM_EVENT", 2);
++ cpu1VRHotMonitor =
++ std::make_unique<host_error_monitor::vr_hot_monitor::VRHotMonitor>(
++ io, conn, "CPU1_VRHOT", "CPU 1");
++ cpu1MemABCDVRHotMonitor =
++ std::make_unique<host_error_monitor::vr_hot_monitor::VRHotMonitor>(
++ io, conn, "CPU1_MEM_ABCD_VRHOT", "CPU 1 Memory ABCD");
++ cpu1MemEFGHVRHotMonitor =
++ std::make_unique<host_error_monitor::vr_hot_monitor::VRHotMonitor>(
++ io, conn, "CPU1_MEM_EFGH_VRHOT", "CPU 1 Memory EFGH");
++ cpu2VRHotMonitor =
++ std::make_unique<host_error_monitor::vr_hot_monitor::VRHotMonitor>(
++ io, conn, "CPU2_VRHOT", "CPU 2");
++ cpu2MemABCDVRHotMonitor =
++ std::make_unique<host_error_monitor::vr_hot_monitor::VRHotMonitor>(
++ io, conn, "CPU2_MEM_ABCD_VRHOT", "CPU 2 Memory ABCD");
++ cpu2MemEFGHVRHotMonitor =
++ std::make_unique<host_error_monitor::vr_hot_monitor::VRHotMonitor>(
++ io, conn, "CPU2_MEM_EFGH_VRHOT", "CPU 2 Memory EFGH");
++ pchThermtripMonitor = std::make_unique<
++ host_error_monitor::pch_thermtrip_monitor::PCHThermtripMonitor>(
++ io, conn, "PCH_BMC_THERMTRIP");
+
+ return checkMonitors();
+ }
+@@ -49,7 +164,13 @@ bool startMonitors(boost::asio::io_service& io,
+ // Notify the signal monitors of host on event
+ void sendHostOn()
+ {
+- // smiMonitor->hostOn();
++ smiMonitor->hostOn(); // NOTE(review): only these seven monitors are notified; the thermtrip and VRHot monitors are not - confirm this is intentional
++ cpu1MismatchMonitor->hostOn();
++ cpu2MismatchMonitor->hostOn();
++ err0Monitor->hostOn();
++ err1Monitor->hostOn();
++ err2Monitor->hostOn();
++ ierrMonitor->hostOn();
+ }
+
+ } // namespace host_error_monitor::error_monitors
+--
+2.17.1
+
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
deleted file mode 100644
index 140724ca9..000000000
--- a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
+++ /dev/null
@@ -1,169 +0,0 @@
-From c09e608da2f63eed5b73891d5c032b646d8e81eb Mon Sep 17 00:00:00 2001
-From: "Jason M. Bills" <jason.m.bills@intel.com>
-Date: Wed, 22 Jul 2020 14:30:04 -0700
-Subject: [PATCH 1/2] Filter memory thermtrip events based on DIMM status
-
-There is a race-condition on shutdown that makes it difficult to
-differentiate between a normal shutdown and a memory thermtrip
-shutdown. This race-condition will be resolved in the CPLD for
-future platforms but for now it requires a workaround.
-
-This workaround assumes that a memory thermtrip can only occur
-if a DIMM temperature sensor has already reached a critical
-threshold. When memory thermtrip asserts on shutdown, it only
-logs an error if a DIMM is critical; otherwise it is treated
-as a normal shutdown.
-
-Tested:
-Memory thermtrip errors no longer log on each power-off.
-Manually set a DIMM temperature above critical and verified
-that the memory thermtrip event is logged.
-
-Change-Id: I9c38b41db30046499297ee24cc3a2790920b19d3
-Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
----
- src/host_error_monitor.cpp | 81 ++++++++++++++++++++++++++++++++++++--
- 1 file changed, 77 insertions(+), 4 deletions(-)
-
-diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
-index d52a5dc6a..77d065fa3 100644
---- a/src/host_error_monitor.cpp
-+++ b/src/host_error_monitor.cpp
-@@ -19,6 +19,7 @@
- #include <boost/asio/io_service.hpp>
- #include <boost/asio/posix/stream_descriptor.hpp>
- #include <boost/asio/steady_timer.hpp>
-+#include <boost/container/flat_set.hpp>
- #include <gpiod.hpp>
- #include <sdbusplus/asio/object_server.hpp>
-
-@@ -38,6 +39,9 @@ static std::shared_ptr<sdbusplus::asio::dbus_interface> associationCATAssert;
-
- static const constexpr char* rootPath = "/xyz/openbmc_project/CallbackManager";
-
-+static boost::container::flat_set<std::string> cpu1CriticalDIMMs;
-+static boost::container::flat_set<std::string> cpu2CriticalDIMMs;
-+
- static bool hostOff = true;
-
- static size_t caterrTimeoutMs = 2000;
-@@ -274,6 +278,67 @@ static void initializeHostState()
- "xyz.openbmc_project.State.Host", "CurrentHostState");
- }
-
-+static std::shared_ptr<sdbusplus::bus::match::match>
-+ startDIMMThresholdEventMonitor()
-+{
-+ return std::make_shared<sdbusplus::bus::match::match>(
-+ *conn,
-+ "type='signal',interface='org.freedesktop.DBus.Properties',member='"
-+ "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor."
-+ "Threshold.Critical'",
-+ [](sdbusplus::message::message& msg) {
-+ std::string interfaceName;
-+ boost::container::flat_map<std::string, std::variant<bool>>
-+ propertiesChanged;
-+ try
-+ {
-+ msg.read(interfaceName, propertiesChanged);
-+ }
-+ catch (std::exception& e)
-+ {
-+ std::cerr << "Unable to read threshold event\n";
-+ return;
-+ }
-+ // We only want to check for CriticalAlarmHigh
-+ if (propertiesChanged.begin()->first != "CriticalAlarmHigh")
-+ {
-+ return;
-+ }
-+ const bool* alarm =
-+ std::get_if<bool>(&(propertiesChanged.begin()->second));
-+ if (alarm == nullptr)
-+ {
-+ std::cerr << propertiesChanged.begin()->first
-+ << " property invalid\n";
-+ return;
-+ }
-+
-+ // Get the sensor path and check if it's a DIMM sensor
-+ std::string sensor = msg.get_path();
-+ if (sensor.find("DIMM") == std::string::npos)
-+ {
-+ // Not a DIMM sensor
-+ return;
-+ }
-+
-+ // Determine which CPU the DIMM belongs to
-+ boost::container::flat_set<std::string>& criticalDIMMs =
-+ (sensor.find("CPU1") != std::string::npos) ? cpu1CriticalDIMMs
-+ : cpu2CriticalDIMMs;
-+
-+ if (*alarm)
-+ {
-+ // DIMM crossed a critical threshold, so store it
-+ criticalDIMMs.insert(sensor);
-+ }
-+ else
-+ {
-+ // DIMM is no longer critical, so remove it
-+ criticalDIMMs.erase(sensor);
-+ }
-+ });
-+}
-+
- static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
- {
- return std::make_shared<sdbusplus::bus::match::match>(
-@@ -851,7 +916,9 @@ static void cpu1MemtripHandler()
-
- bool cpu1Memtrip =
- gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
-- if (cpu1Memtrip)
-+
-+ // Only log a memory thermtrip if a DIMM is critical
-+ if (cpu1Memtrip && !cpu1CriticalDIMMs.empty())
- {
- memThermTripLog(1);
- }
-@@ -911,7 +978,9 @@ static void cpu2MemtripHandler()
-
- bool cpu2Memtrip =
- gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
-- if (cpu2Memtrip)
-+
-+ // Only log a memory thermtrip if a DIMM is critical
-+ if (cpu2Memtrip && !cpu2CriticalDIMMs.empty())
- {
- memThermTripLog(2);
- }
-@@ -1521,13 +1590,13 @@ static void initializeErrorState()
- }
-
- // Handle CPU1_MEM_THERM_EVENT (CPU1 DIMM Thermal trip) if it's asserted now
-- if (cpu1MemtripLine.get_value() == 0)
-+ if ((cpu1MemtripLine.get_value() == 0) && !cpu1CriticalDIMMs.empty())
- {
- memThermTripLog(1);
- }
-
- // Handle CPU2_MEM_THERM_EVENT (CPU2 DIMM Thermal trip) if it's asserted now
-- if (cpu2MemtripLine.get_value() == 0)
-+ if ((cpu2MemtripLine.get_value() == 0) && !cpu2CriticalDIMMs.empty())
- {
- memThermTripLog(2);
- }
-@@ -1639,6 +1708,10 @@ int main(int argc, char* argv[])
- std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor =
- host_error_monitor::startHostStateMonitor();
-
-+ // Start tracking critical DIMM status
-+ std::shared_ptr<sdbusplus::bus::match::match> dimmThresholdEventMonitor =
-+ host_error_monitor::startDIMMThresholdEventMonitor();
-+
- // Request CPU1_MISMATCH GPIO events
- if (!host_error_monitor::requestGPIOInput(
- "CPU1_MISMATCH", host_error_monitor::cpu1MismatchLine))
---
-2.17.1
-
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch
deleted file mode 100644
index 1f1efea69..000000000
--- a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch
+++ /dev/null
@@ -1,133 +0,0 @@
-From d7909c8924cf3619bffd52e5f352f175c1cf5033 Mon Sep 17 00:00:00 2001
-From: "Jason M. Bills" <jason.m.bills@intel.com>
-Date: Mon, 17 Aug 2020 15:52:22 -0700
-Subject: [PATCH 2/2] Add a workaround for spurious CPU errors
-
-There is a possible issue where GPIO event interrupts are getting
-missed causing false errors to be logged.
-
-This adds a check that the host is still on and the error is still
-asserted before logging an error.
-
-Tested:
-Confirmed that a spurious SMI event was ignored correctly after
-this change.
-
-Change-Id: Id83d9d67b15dcf9035e6448086b140e5c7dab4fe
-Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
----
- src/host_error_monitor.cpp | 77 ++++++++++++++++++++++++++++++++++++++
- 1 file changed, 77 insertions(+)
-
-diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
-index 77d065fa3..d026ab90d 100644
---- a/src/host_error_monitor.cpp
-+++ b/src/host_error_monitor.cpp
-@@ -806,6 +806,18 @@ static void caterrAssertHandler()
- }
- return;
- }
-+ // Confirm that this is a real failure by checking that the host is on
-+ if (hostOff)
-+ {
-+ return;
-+ }
-+ // And that the signal is still asserted
-+ if (caterrLine.get_value() != 0)
-+ {
-+ std::cerr
-+ << "CPU_CATERR not asserted after timeout. Error ignored.\n";
-+ return;
-+ }
- std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
- << " ms\n";
- beep(beepCPUIERR);
-@@ -1288,6 +1300,48 @@ static void errXAssertHandler(const int errPin,
- }
- return;
- }
-+ // Confirm that this is a real failure by checking that the host is on
-+ if (hostOff)
-+ {
-+ return;
-+ }
-+ // And that the signal is still asserted
-+ switch (errPin)
-+ {
-+ case 0:
-+ {
-+ if (err0Line.get_value() != 0)
-+ {
-+ std::cerr << "CPU_ERR0 not asserted after timeout. Error "
-+ "ignored.\n";
-+ return;
-+ }
-+ break;
-+ }
-+ case 1:
-+ {
-+ if (err1Line.get_value() != 0)
-+ {
-+ std::cerr << "CPU_ERR1 not asserted after timeout. Error "
-+ "ignored.\n";
-+ return;
-+ }
-+ break;
-+ }
-+ case 2:
-+ {
-+ if (err2Line.get_value() != 0)
-+ {
-+ std::cerr << "CPU_ERR2 not asserted after timeout. Error "
-+ "ignored.\n";
-+ return;
-+ }
-+ break;
-+ }
-+ default:
-+ std::cerr << "Invalid ERR pin asserted\n";
-+ return;
-+ }
- std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
- << std::to_string(errTimeoutMs) << " ms\n";
- if (errPinCPUs.count())
-@@ -1397,6 +1451,18 @@ static void err2AssertHandler()
- }
- return;
- }
-+ // Confirm that this is a real failure by checking that the host is on
-+ if (hostOff)
-+ {
-+ return;
-+ }
-+ // And that the signal is still asserted
-+ if (err2Line.get_value() != 0)
-+ {
-+ std::cerr
-+ << "CPU_ERR2 not asserted after timeout. Error ignored.\n";
-+ return;
-+ }
- conn->async_method_call(
- [](boost::system::error_code ec,
- const std::variant<bool>& property) {
-@@ -1465,6 +1531,17 @@ static void smiAssertHandler()
- }
- return;
- }
-+ // Confirm that this is a real failure by checking that the host is on
-+ if (hostOff)
-+ {
-+ return;
-+ }
-+ // And that the signal is still asserted
-+ if (smiLine.get_value() != 0)
-+ {
-+ std::cerr << "SMI not asserted after timeout. Error ignored.\n";
-+ return;
-+ }
- std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
- << " ms\n";
- smiTimeoutLog();
---
-2.17.1
-
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Filter-memory-thermtrip-events-based-on-DIMM-status.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
new file mode 100644
index 000000000..cf74a4925
--- /dev/null
+++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Filter-memory-thermtrip-events-based-on-DIMM-status.patch
@@ -0,0 +1,143 @@
+From b8b701fde79e7a8ca7bf5aa6ca6832524c011fa5 Mon Sep 17 00:00:00 2001
+From: "Jason M. Bills" <jason.m.bills@intel.com>
+Date: Tue, 15 Dec 2020 16:09:00 -0800
+Subject: [PATCH] Filter memory thermtrip events based on DIMM status
+
+There is a race-condition on shutdown that makes it difficult to
+differentiate between a normal shutdown and a memory thermtrip
+shutdown. This race-condition will be resolved in the CPLD for
+future platforms but for now it requires a workaround.
+
+This workaround assumes that a memory thermtrip can only occur
+if a DIMM temperature sensor has already reached a critical
+threshold. When memory thermtrip asserts on shutdown, it only
+logs an error if a DIMM is critical; otherwise it is treated
+as a normal shutdown.
+
+Tested:
+Memory thermtrip errors no longer log on each power-off.
+Manually set a DIMM temperature above critical and verified
+that the memory thermtrip event is logged.
+
+Change-Id: I9d8cf9b1de688e27babb8004b41f662242c78b3c
+Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
+---
+ .../error_monitors/mem_thermtrip_monitor.hpp | 81 +++++++++++++++++++
+ 1 file changed, 81 insertions(+)
+
+diff --git a/include/error_monitors/mem_thermtrip_monitor.hpp b/include/error_monitors/mem_thermtrip_monitor.hpp
+index d3dff1d3b..0a3f2fc22 100644
+--- a/include/error_monitors/mem_thermtrip_monitor.hpp
++++ b/include/error_monitors/mem_thermtrip_monitor.hpp
+@@ -14,6 +14,7 @@
+ // limitations under the License.
+ */
+ #pragma once
++#include <boost/container/flat_set.hpp>
+ #include <error_monitors/base_gpio_monitor.hpp>
+ #include <host_error_monitor.hpp>
+ #include <sdbusplus/asio/object_server.hpp>
+@@ -28,6 +29,72 @@ class MemThermtripMonitor :
+ host_error_monitor::base_gpio_monitor::AssertValue::lowAssert;
+ size_t cpuNum;
+
++ std::shared_ptr<sdbusplus::bus::match::match> dimmThresholdEventMonitor;
++ boost::container::flat_set<std::string> criticalDIMMs;
++
++    // Build a D-Bus match on Threshold.Critical PropertiesChanged signals
++    // that keeps criticalDIMMs equal to the set of this CPU's DIMM sensor
++    // paths whose CriticalAlarmHigh was last signalled true
++    std::shared_ptr<sdbusplus::bus::match::match>
++        startDIMMThresholdEventMonitor()
++    {
++        return std::make_shared<sdbusplus::bus::match::match>(
++            *conn,
++            "type='signal',interface='org.freedesktop.DBus.Properties',member='"
++            "PropertiesChanged',arg0namespace='xyz.openbmc_project.Sensor."
++            "Threshold.Critical'",
++            // this is captured raw: the match is expected to be held by the
++            // dimmThresholdEventMonitor member so it goes away with the monitor
++            [this](sdbusplus::message::message& msg) {
++                std::string interfaceName;
++                boost::container::flat_map<std::string, std::variant<bool>>
++                    propertiesChanged;
++                try
++                {
++                    msg.read(interfaceName, propertiesChanged);
++                }
++                catch (const std::exception&)
++                {
++                    std::cerr << "Unable to read threshold event\n";
++                    return;
++                }
++                // Look the alarm up by key: begin() must not be dereferenced
++                // when the signal carries no changed properties
++                const auto alarmProp =
++                    propertiesChanged.find("CriticalAlarmHigh");
++                if (alarmProp == propertiesChanged.end())
++                {
++                    return;
++                }
++                const bool* alarm = std::get_if<bool>(&alarmProp->second);
++                if (alarm == nullptr)
++                {
++                    std::cerr << alarmProp->first << " property invalid\n";
++                    return;
++                }
++
++                // Only DIMM sensors that belong to this CPU are tracked
++                const std::string sensor = msg.get_path();
++                if (sensor.find("DIMM") == std::string::npos ||
++                    sensor.find("CPU" + std::to_string(cpuNum)) ==
++                        std::string::npos)
++                {
++                    return;
++                }
++
++                if (*alarm)
++                {
++                    // DIMM crossed a critical threshold, so store it
++                    criticalDIMMs.insert(sensor);
++                }
++                else
++                {
++                    // DIMM is no longer critical, so remove it
++                    criticalDIMMs.erase(sensor);
++                }
++            });
++    }
++
+ void logEvent() override
+ {
+ std::string cpuNumber = "CPU " + std::to_string(cpuNum);
+@@ -39,6 +106,17 @@ class MemThermtripMonitor :
+ "REDFISH_MESSAGE_ARGS=%s", cpuNumber.c_str(), NULL);
+ }
+
++    void assertHandler() override
++    {
++        // Forward to the base handler only while at least one tracked DIMM
++        // has its critical-high alarm set; otherwise the assert is dropped
++        if (!criticalDIMMs.empty())
++        {
++            host_error_monitor::base_gpio_monitor::BaseGPIOMonitor::
++                assertHandler();
++        }
++    }
++
+ public:
+ MemThermtripMonitor(boost::asio::io_service& io,
+ std::shared_ptr<sdbusplus::asio::connection> conn,
+@@ -46,6 +124,9 @@ class MemThermtripMonitor :
+ BaseGPIOMonitor(io, conn, signalName, assertValue),
+ cpuNum(cpuNum)
+ {
++ // Start tracking critical DIMM status
++ dimmThresholdEventMonitor = startDIMMThresholdEventMonitor();
++
+ if (valid)
+ {
+ startMonitoring();
+--
+2.17.1
+
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend
index 0d1fd91d2..638d833a8 100644
--- a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend
+++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor_%.bbappend
@@ -1,6 +1,6 @@
FILESEXTRAPATHS_append := "${THISDIR}/${PN}:"
SRC_URI += " \
- file://0001-Filter-memory-thermtrip-events-based-on-DIMM-status.patch \
- file://0002-Add-a-workaround-for-spurious-CPU-errors.patch \
- "
+ file://0001-Configure-host-error-monitors-for-meta-wht.patch \
+ file://0002-Filter-memory-thermtrip-events-based-on-DIMM-status.patch \
+ "
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci/99-peci.rules b/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci/99-peci.rules
new file mode 100644
index 000000000..b587a3f57
--- /dev/null
+++ b/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci/99-peci.rules
@@ -0,0 +1,2 @@
+ACTION=="add", SUBSYSTEM=="peci_dev", ATTRS{name}=="*.peci-bus", SYMLINK+="peci-wire", TAG+="peci-wire"
+ACTION=="add", SUBSYSTEM=="peci_dev", TAG=="peci-wire", SYMLINK+="peci-default"
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci_%.bbappend b/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci_%.bbappend
new file mode 100644
index 000000000..575cfea24
--- /dev/null
+++ b/meta-openbmc-mods/meta-wht/recipes-core/libpeci/libpeci_%.bbappend
@@ -0,0 +1,8 @@
+FILESEXTRAPATHS_append := ":${THISDIR}/${PN}"
+
+SRC_URI += "file://99-peci.rules"
+
+do_install_append() {
+ install -d ${D}/lib/udev/rules.d
+ install -m 0644 ${WORKDIR}/99-peci.rules ${D}/lib/udev/rules.d
+}