diff options
author | Jason M. Bills <jason.m.bills@intel.com> | 2019-08-06 21:10:02 +0300 |
---|---|---|
committer | Jason M. Bills <jason.m.bills@linux.intel.com> | 2019-09-04 19:05:50 +0300 |
commit | 735bac3bd01e21418aeacbc478ed791422796efe (patch) | |
tree | ab2e54484cf0f9cc6971fba0990b54c230651bef | |
parent | 4a7b10afa702c98a83122e59ce4929afad8cf881 (diff) | |
download | provingground-735bac3bd01e21418aeacbc478ed791422796efe.tar.xz |
Add SMI timeout monitoring and logging
This adds SMI timeout monitoring to the host error monitor. When
the SMI signal is asserted for more than 90 seconds, the BMC will
log it, trigger a Crashdump, and reset the system if the ResetOnSMI
setting is enabled.
Tested:
Manually triggered an SMI timeout and confirmed that the event is
handled and logged correctly.
Change-Id: I0579c96211d8e6abcdc190c154f3671151d5e60d
Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
-rw-r--r-- | host_error_monitor/src/host_error_monitor.cpp | 96 |
1 file changed, 96 insertions, 0 deletions
diff --git a/host_error_monitor/src/host_error_monitor.cpp b/host_error_monitor/src/host_error_monitor.cpp index 9376a3e..a9ebd7e 100644 --- a/host_error_monitor/src/host_error_monitor.cpp +++ b/host_error_monitor/src/host_error_monitor.cpp @@ -32,6 +32,7 @@ static bool hostOff = true; const static constexpr int caterrTimeoutMs = 2000; const static constexpr int err2TimeoutMs = 90000; +const static constexpr int smiTimeoutMs = 90000; const static constexpr int crashdumpTimeoutS = 300; // Timers @@ -39,12 +40,16 @@ const static constexpr int crashdumpTimeoutS = 300; static boost::asio::steady_timer caterrAssertTimer(io); // Timer for ERR2 asserted static boost::asio::steady_timer err2AssertTimer(io); +// Timer for SMI asserted +static boost::asio::steady_timer smiAssertTimer(io); // GPIO Lines and Event Descriptors static gpiod::line caterrLine; static boost::asio::posix::stream_descriptor caterrEvent(io); static gpiod::line err2Line; static boost::asio::posix::stream_descriptor err2Event(io); +static gpiod::line smiLine; +static boost::asio::posix::stream_descriptor smiEvent(io); //---------------------------------- // PCH_BMC_THERMTRIP function related definition //---------------------------------- @@ -93,6 +98,13 @@ static void cpuERR2Log(const int cpuNum) "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL); } +static void smiTimeoutLog() +{ + sd_journal_send("MESSAGE=HostError: SMI Timeout", "PRIORITY=%i", LOG_INFO, + "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError", + "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL); +} + static void initializeErrorState(); static void initializeHostState() { @@ -152,6 +164,7 @@ static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor() { caterrAssertTimer.cancel(); err2AssertTimer.cancel(); + smiAssertTimer.cancel(); } }); } @@ -754,6 +767,75 @@ static void err2Handler() }); } +static void smiAssertHandler() +{ + smiAssertTimer.expires_after(std::chrono::milliseconds(smiTimeoutMs)); + 
smiAssertTimer.async_wait([](const boost::system::error_code ec) { + if (ec) + { + // operation_aborted is expected if timer is canceled before + // completion. + if (ec != boost::asio::error::operation_aborted) + { + std::cerr << "smi timeout async_wait failed: " << ec.message() + << "\n"; + } + return; + } + std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs) + << " ms\n"; + smiTimeoutLog(); + conn->async_method_call( + [](boost::system::error_code ec, + const std::variant<bool>& property) { + if (ec) + { + return; + } + const bool* reset = std::get_if<bool>(&property); + if (reset == nullptr) + { + std::cerr << "Unable to read reset on SMI value\n"; + return; + } + startCrashdumpAndRecovery(*reset); + }, + "xyz.openbmc_project.Settings", + "/xyz/openbmc_project/control/bmc_reset_disables", + "org.freedesktop.DBus.Properties", "Get", + "xyz.openbmc_project.Control.ResetDisables", "ResetOnSMI"); + }); +} + +static void smiHandler() +{ + if (!hostOff) + { + gpiod::line_event gpioLineEvent = smiLine.event_read(); + + bool smi = gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; + if (smi) + { + smiAssertHandler(); + } + else + { + smiAssertTimer.cancel(); + } + } + smiEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read, + [](const boost::system::error_code ec) { + if (ec) + { + std::cerr + << "smi handler error: " << ec.message() + << "\n"; + return; + } + smiHandler(); + }); +} + static void initializeErrorState() { // Handle CPU_CATERR if it's asserted now @@ -767,6 +849,12 @@ static void initializeErrorState() { err2AssertHandler(); } + + // Handle SMI if it's asserted now + if (smiLine.get_value() == 0) + { + smiAssertHandler(); + } } } // namespace host_error_monitor @@ -805,6 +893,14 @@ int main(int argc, char* argv[]) return -1; } + // Request SMI GPIO events + if (!host_error_monitor::requestGPIOEvents( + "SMI", host_error_monitor::smiHandler, host_error_monitor::smiLine, + host_error_monitor::smiEvent)) + { + return -1; 
+ } + // Request PCH_BMC_THERMTRIP GPIO events if (!host_error_monitor::requestGPIOEvents( "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler, |