summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason M. Bills <jason.m.bills@intel.com>2019-08-06 21:03:49 +0300
committerJason M. Bills <jason.m.bills@linux.intel.com>2019-08-29 23:33:24 +0300
commitc47789dcd9efbeb088915d77fb1f60c01d0cfdbc (patch)
tree9ee19185738e67d492f524f5254afd0e0c9295b9
parent10f4d8299e2b6d6251a33ea3da94a315bfdaf131 (diff)
downloadprovingground-c47789dcd9efbeb088915d77fb1f60c01d0cfdbc.tar.xz
Add ERR2 timeout monitoring and logging
This adds ERR2 timeout monitoring to the host error monitor. When the ERR2 signal is asserted for more than 90 seconds, the BMC will log which CPU asserted the ERR2 signal, trigger a Crashdump, and reset the system if enabled. Tested: Manually triggered an ERR2 timeout and confirmed that the event is handled and logged correctly. Change-Id: I779cd02c649603f41fba6a93c1187b5be008af4f Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
-rw-r--r--host_error_monitor/CMakeLists.txt2
-rw-r--r--host_error_monitor/src/host_error_monitor.cpp197
2 files changed, 198 insertions, 1 deletions
diff --git a/host_error_monitor/CMakeLists.txt b/host_error_monitor/CMakeLists.txt
index ce7dbc1..43f9510 100644
--- a/host_error_monitor/CMakeLists.txt
+++ b/host_error_monitor/CMakeLists.txt
@@ -7,7 +7,7 @@ add_executable (host-error-monitor src/host_error_monitor.cpp)
target_include_directories (host-error-monitor PRIVATE ${CMAKE_SOURCE_DIR})
-target_link_libraries (host-error-monitor sdbusplus -lsystemd gpiodcxx)
+target_link_libraries (host-error-monitor sdbusplus -lsystemd gpiodcxx peci)
include_directories (${CMAKE_CURRENT_SOURCE_DIR}/include)
diff --git a/host_error_monitor/src/host_error_monitor.cpp b/host_error_monitor/src/host_error_monitor.cpp
index 07ee69a..ed32947 100644
--- a/host_error_monitor/src/host_error_monitor.cpp
+++ b/host_error_monitor/src/host_error_monitor.cpp
@@ -13,9 +13,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
*/
+#include <peci/libpeci.h>
#include <systemd/sd-journal.h>
+#include <bitset>
#include <boost/asio/posix/stream_descriptor.hpp>
+#include <crashdump/peci_cpus.hpp>
#include <gpiod.hpp>
#include <iostream>
#include <sdbusplus/asio/object_server.hpp>
@@ -28,15 +31,20 @@ static std::shared_ptr<sdbusplus::asio::connection> conn;
static bool hostOff = true;
const static constexpr int caterrTimeoutMs = 1000;
+const static constexpr int err2TimeoutMs = 90000;
const static constexpr int crashdumpTimeoutS = 300;
// Timers
// Timer for CATERR asserted
static boost::asio::steady_timer caterrAssertTimer(io);
+// Timer for ERR2 asserted
+static boost::asio::steady_timer err2AssertTimer(io);
// GPIO Lines and Event Descriptors
static gpiod::line caterrLine;
static boost::asio::posix::stream_descriptor caterrEvent(io);
+static gpiod::line err2Line;
+static boost::asio::posix::stream_descriptor err2Event(io);
//----------------------------------
// PCH_BMC_THERMTRIP function related definition
//----------------------------------
@@ -44,6 +52,22 @@ static boost::asio::posix::stream_descriptor caterrEvent(io);
static gpiod::line pchThermtripLine;
static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
+static void cpuERR2Log()
+{
+ sd_journal_send("MESSAGE=HostError: ERR2 Timeout", "PRIORITY=%i", LOG_INFO,
+ "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
+ "REDFISH_MESSAGE_ARGS=%s", "ERR2 Timeout", NULL);
+}
+
+static void cpuERR2Log(const int cpuNum)
+{
+ std::string msg = "ERR2 Timeout on CPU " + std::to_string(cpuNum + 1);
+
+ sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i",
+ LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
+ "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL);
+}
+
static void initializeErrorState();
static void initializeHostState()
{
@@ -102,6 +126,7 @@ static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor()
if (hostOff)
{
caterrAssertTimer.cancel();
+ err2AssertTimer.cancel();
}
});
}
@@ -319,6 +344,164 @@ static void pchThermtripHandler()
});
}
+static std::bitset<crashdump::maxCPUs> checkERR2CPUs()
+{
+ std::bitset<crashdump::maxCPUs> err2CPUs = 0;
+ for (int cpu = 0, addr = crashdump::minClientAddr;
+ addr <= crashdump::maxClientAddr; cpu++, addr++)
+ {
+ if (peci_Ping(addr) == PECI_CC_SUCCESS)
+ {
+ uint8_t cc = 0;
+ uint32_t cpuID = 0;
+ if (peci_RdPkgConfig(addr, PECI_MBX_INDEX_CPU_ID,
+ PECI_PKG_ID_CPU_ID, sizeof(uint32_t),
+ (uint8_t*)&cpuID, &cc) != PECI_CC_SUCCESS)
+ {
+ std::cerr << "Cannot get CPUID!\n";
+ continue;
+ }
+
+ crashdump::CPUModel model{};
+ bool modelFound = false;
+ for (int i = 0; i < crashdump::cpuIDMap.size(); i++)
+ {
+ if (cpuID == crashdump::cpuIDMap[i].cpuID)
+ {
+ model = crashdump::cpuIDMap[i].model;
+ modelFound = true;
+ break;
+ }
+ }
+ if (!modelFound)
+ {
+ std::cerr << "Cannot find Model for CPUID 0x" << std::hex
+ << cpuID << "\n";
+ continue;
+ }
+
+ static constexpr int err2Sts = (1 << 2);
+ switch (model)
+ {
+ case crashdump::CPUModel::skx_h0:
+ {
+ // Check the ERRPINSTS to see if this is the CPU that caused
+ // the ERR2 (B(0) D8 F0 offset 210h)
+ uint32_t errpinsts = 0;
+ if (peci_RdPCIConfigLocal(
+ addr, 0, 8, 0, 0x210, sizeof(uint32_t),
+ (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
+ {
+ err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
+ }
+ break;
+ }
+ case crashdump::CPUModel::icx_a0:
+ case crashdump::CPUModel::icx_b0:
+ {
+ // Check the ERRPINSTS to see if this is the CPU that caused
+ // the ERR2 (B(30) D0 F3 offset 274h) (Note: Bus 30 is
+ // accessed on PECI as bus 13)
+ uint32_t errpinsts = 0;
+ if (peci_RdEndPointConfigPciLocal(
+ addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
+ (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
+ {
+ err2CPUs[cpu] = (errpinsts & err2Sts) != 0;
+ }
+ break;
+ }
+ }
+ }
+ }
+ return err2CPUs;
+}
+
+static void err2AssertHandler()
+{
+ // ERR2 status is not guaranteed through the timeout, so save which
+ // CPUs have asserted ERR2 now
+ std::bitset<crashdump::maxCPUs> err2CPUs = checkERR2CPUs();
+ err2AssertTimer.expires_after(std::chrono::milliseconds(err2TimeoutMs));
+ err2AssertTimer.async_wait([err2CPUs](const boost::system::error_code ec) {
+ if (ec)
+ {
+ // operation_aborted is expected if timer is canceled before
+ // completion.
+ if (ec != boost::asio::error::operation_aborted)
+ {
+ std::cerr << "err2 timeout async_wait failed: " << ec.message()
+ << "\n";
+ }
+ return;
+ }
+ std::cerr << "ERR2 asserted for " << std::to_string(err2TimeoutMs)
+ << " ms\n";
+ if (err2CPUs.count())
+ {
+ for (int i = 0; i < err2CPUs.size(); i++)
+ {
+ if (err2CPUs[i])
+ {
+ cpuERR2Log(i);
+ }
+ }
+ }
+ else
+ {
+ cpuERR2Log();
+ }
+ conn->async_method_call(
+ [](boost::system::error_code ec,
+ const std::variant<bool>& property) {
+ if (ec)
+ {
+ return;
+ }
+ const bool* reset = std::get_if<bool>(&property);
+ if (reset == nullptr)
+ {
+ std::cerr << "Unable to read reset on ERR2 value\n";
+ return;
+ }
+ startCrashdumpAndRecovery(*reset);
+ },
+ "xyz.openbmc_project.Settings",
+ "/xyz/openbmc_project/control/processor_error_config",
+ "org.freedesktop.DBus.Properties", "Get",
+ "xyz.openbmc_project.Control.Processor.ErrConfig", "ResetOnERR2");
+ });
+}
+
+static void err2Handler()
+{
+ if (!hostOff)
+ {
+ gpiod::line_event gpioLineEvent = err2Line.event_read();
+
+ bool err2 = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE;
+ if (err2)
+ {
+ err2AssertHandler();
+ }
+ else
+ {
+ err2AssertTimer.cancel();
+ }
+ }
+ err2Event.async_wait(boost::asio::posix::stream_descriptor::wait_read,
+ [](const boost::system::error_code ec) {
+ if (ec)
+ {
+ std::cerr
+ << "err2 handler error: " << ec.message()
+ << "\n";
+ return;
+ }
+ err2Handler();
+ });
+}
+
static void initializeErrorState()
{
// Handle CPU_CATERR if it's asserted now
@@ -326,6 +509,12 @@ static void initializeErrorState()
{
caterrAssertHandler();
}
+
+ // Handle CPU_ERR2 if it's asserted now
+ if (err2Line.get_value() == 0)
+ {
+ err2AssertHandler();
+ }
}
} // namespace host_error_monitor
@@ -356,6 +545,14 @@ int main(int argc, char* argv[])
return -1;
}
+ // Request CPU_ERR2 GPIO events
+ if (!host_error_monitor::requestGPIOEvents(
+ "CPU_ERR2", host_error_monitor::err2Handler,
+ host_error_monitor::err2Line, host_error_monitor::err2Event))
+ {
+ return -1;
+ }
+
// Request PCH_BMC_THERMTRIP GPIO events
if (!host_error_monitor::requestGPIOEvents(
"PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler,