diff options
author | Jason M. Bills <jason.m.bills@linux.intel.com> | 2019-08-19 21:22:38 +0300 |
---|---|---|
committer | Jason M. Bills <jason.m.bills@linux.intel.com> | 2019-08-22 04:55:13 +0300 |
commit | dd93b578f1b193cb08f46e7fced6efdeb38dcc96 (patch) | |
tree | 00418a1343e15611c34c5727b660fdc156cf5ed7 | |
parent | bce43ab726d060d614ee8a99f7989f0a412c4cfe (diff) | |
download | provingground-dd93b578f1b193cb08f46e7fced6efdeb38dcc96.tar.xz |
Move host-error-monitor to proving-ground
The old repo is being deprecated, so move host-error-monitor from
the old at-scale-debug repo at commit 6ed87657 to here.
Change-Id: Ife30c3996675ebf2559aa4ed8fa7ada6af1b7f34
Signed-off-by: Jason M. Bills <jason.m.bills@linux.intel.com>
-rw-r--r-- | host_error_monitor/.clang-format | 98 | ||||
-rw-r--r-- | host_error_monitor/CMakeLists.txt | 36 | ||||
-rw-r--r-- | host_error_monitor/cmake-format.json | 12 | ||||
-rw-r--r-- | host_error_monitor/service_files/xyz.openbmc_project.HostErrorMonitor.service | 10 | ||||
-rw-r--r-- | host_error_monitor/src/host_error_monitor.cpp | 356 |
5 files changed, 512 insertions, 0 deletions
diff --git a/host_error_monitor/.clang-format b/host_error_monitor/.clang-format new file mode 100644 index 0000000..ae9ad39 --- /dev/null +++ b/host_error_monitor/.clang-format @@ -0,0 +1,98 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: false +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: AfterColon +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +PointerAlignment: Left +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^[<"](gtest|gmock)' + Priority: 5 + - Regex: '^"config.h"' + Priority: -1 + - Regex: '^".*\.hpp"' + Priority: 1 + - Regex: '^<.*\.h>' + Priority: 2 + - Regex: '^<.*' + Priority: 3 + - Regex: '.*' + Priority: 4 +IndentCaseLabels: true +IndentWidth: 4 +IndentWrappedFunctionNames: true +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 4 +UseTab: Never +... diff --git a/host_error_monitor/CMakeLists.txt b/host_error_monitor/CMakeLists.txt new file mode 100644 index 0000000..ce7dbc1 --- /dev/null +++ b/host_error_monitor/CMakeLists.txt @@ -0,0 +1,36 @@ +cmake_minimum_required (VERSION 3.6) +project (host-error-monitor CXX) +set (CMAKE_CXX_STANDARD 17) +set (CMAKE_CXX_STANDARD_REQUIRED ON) + +add_executable (host-error-monitor src/host_error_monitor.cpp) + +target_include_directories (host-error-monitor PRIVATE ${CMAKE_SOURCE_DIR}) + +target_link_libraries (host-error-monitor sdbusplus -lsystemd gpiodcxx) + +include_directories (${CMAKE_CURRENT_SOURCE_DIR}/include) + +install (TARGETS host-error-monitor + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib/static) + +find_package (Boost 1.66 REQUIRED) +include_directories (${BOOST_SRC_DIR}) + +add_definitions (-DBOOST_ERROR_CODE_HEADER_ONLY) +add_definitions (-DBOOST_SYSTEM_NO_DEPRECATED) +add_definitions (-DBOOST_ALL_NO_LIB) +add_definitions (-DBOOST_NO_RTTI) +add_definitions (-DBOOST_NO_TYPEID) +add_definitions (-DBOOST_ASIO_DISABLE_THREADS) + +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-rtti") + +set ( + SERVICE_FILES + ${PROJECT_SOURCE_DIR}/service_files/xyz.openbmc_project.HostErrorMonitor.service +) +install (FILES ${SERVICE_FILES} DESTINATION /lib/systemd/system/) diff --git a/host_error_monitor/cmake-format.json b/host_error_monitor/cmake-format.json new file mode 100644 index 0000000..4a701ae --- /dev/null +++ b/host_error_monitor/cmake-format.json @@ -0,0 +1,12 @@ +{ + "enum_char": ".", + "line_ending": "unix", + "bullet_char": "*", + "max_subargs_per_line": 99, + "command_case": "lower", + "tab_size": 4, + "line_width": 80, + "separate_fn_name_with_space": true, + "dangle_parens": true, + "separate_ctrl_name_with_space": true +} diff --git a/host_error_monitor/service_files/xyz.openbmc_project.HostErrorMonitor.service b/host_error_monitor/service_files/xyz.openbmc_project.HostErrorMonitor.service new file mode 100644 index 0000000..cf789e9 --- /dev/null +++ b/host_error_monitor/service_files/xyz.openbmc_project.HostErrorMonitor.service @@ -0,0 +1,10 @@ +[Unit] +Description=Host Error Monitor + +[Service] +Restart=always +ExecStart=/usr/bin/host-error-monitor +Type=simple + +[Install] +WantedBy=multi-user.target diff --git a/host_error_monitor/src/host_error_monitor.cpp b/host_error_monitor/src/host_error_monitor.cpp new file mode 100644 index 0000000..b9d214a --- /dev/null +++ b/host_error_monitor/src/host_error_monitor.cpp @@ -0,0 +1,356 @@ +/* +// Copyright (c) 2019 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ +#include <systemd/sd-journal.h> + +#include <boost/asio/posix/stream_descriptor.hpp> +#include <gpiod.hpp> +#include <iostream> +#include <sdbusplus/asio/object_server.hpp> + +namespace host_error_monitor +{ +static boost::asio::io_service io; +static std::shared_ptr<sdbusplus::asio::connection> conn; + +static bool hostOff = true; + +const static constexpr int caterrTimeoutMs = 1000; +const static constexpr int crashdumpTimeoutS = 300; + +// Timers +// Timer for CATERR asserted +static boost::asio::steady_timer caterrAssertTimer(io); + +// GPIO Lines and Event Descriptors +static gpiod::line caterrLine; +static boost::asio::posix::stream_descriptor caterrEvent(io); +//---------------------------------- +// PCH_BMC_THERMTRIP function related definition +//---------------------------------- +// GPIO Lines and Event Descriptors +static gpiod::line pchThermtripLine; +static boost::asio::posix::stream_descriptor pchThermtripEvent(io); + +static void initializeHostState() +{ + conn->async_method_call( + [](boost::system::error_code ec, + const std::variant<std::string>& property) { + if (ec) + { + return; + } + const std::string* state = std::get_if<std::string>(&property); + if (state == nullptr) + { + std::cerr << "Unable to read host state value\n"; + return; + } + hostOff = *state == "xyz.openbmc_project.State.Host.HostState.Off"; + }, + "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0", + "org.freedesktop.DBus.Properties", "Get", + "xyz.openbmc_project.State.Host", "CurrentHostState"); +} + +static std::shared_ptr<sdbusplus::bus::match::match> startHostStateMonitor() +{ + return std::make_shared<sdbusplus::bus::match::match>( + *conn, + "type='signal',interface='org.freedesktop.DBus.Properties'," + "member='PropertiesChanged',arg0namespace='xyz.openbmc_project.State." + "Host'", + [](sdbusplus::message::message& msg) { + std::string interfaceName; + boost::container::flat_map<std::string, std::variant<std::string>> + propertiesChanged; + std::string state; + try + { + msg.read(interfaceName, propertiesChanged); + state = + std::get<std::string>(propertiesChanged.begin()->second); + } + catch (std::exception& e) + { + std::cerr << "Unable to read host state\n"; + return; + } + hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off"; + + // No host events should fire while off, so cancel any pending + // timers + if (hostOff) + { + caterrAssertTimer.cancel(); + } + }); +} + +static bool requestGPIOEvents( + const std::string& name, const std::function<void()>& handler, + gpiod::line& gpioLine, + boost::asio::posix::stream_descriptor& gpioEventDescriptor) +{ + // Find the GPIO line + gpioLine = gpiod::find_line(name); + if (!gpioLine) + { + std::cerr << "Failed to find the " << name << " line\n"; + return false; + } + + try + { + gpioLine.request( + {"host-error-monitor", gpiod::line_request::EVENT_BOTH_EDGES}); + } + catch (std::exception&) + { + std::cerr << "Failed to request events for " << name << "\n"; + return false; + } + + int gpioLineFd = gpioLine.event_get_fd(); + if (gpioLineFd < 0) + { + std::cerr << "Failed to get " << name << " fd\n"; + return false; + } + + gpioEventDescriptor.assign(gpioLineFd); + + gpioEventDescriptor.async_wait( + boost::asio::posix::stream_descriptor::wait_read, + [&name, handler](const boost::system::error_code ec) { + if (ec) + { + std::cerr << name << " fd handler error: " << ec.message() + << "\n"; + return; + } + handler(); + }); + return true; +} + +static void startPowerCycle() +{ + conn->async_method_call( + [](boost::system::error_code ec) { + if (ec) + { + std::cerr << "failed to set Chassis State\n"; + } + }, + "xyz.openbmc_project.State.Chassis", + "/xyz/openbmc_project/state/chassis0", + "org.freedesktop.DBus.Properties", "Set", + "xyz.openbmc_project.State.Chassis", "RequestedPowerTransition", + std::variant<std::string>{ + "xyz.openbmc_project.State.Chassis.Transition.PowerCycle"}); +} + +static void startCrashdumpAndRecovery(bool recoverSystem) +{ + std::cout << "Starting crashdump\n"; + static std::shared_ptr<sdbusplus::bus::match::match> crashdumpCompleteMatch; + static boost::asio::steady_timer crashdumpTimer(io); + + crashdumpCompleteMatch = std::make_shared<sdbusplus::bus::match::match>( + *conn, + "type='signal',interface='org.freedesktop.DBus.Properties'," + "member='PropertiesChanged',arg0namespace='com.intel.crashdump'", + [recoverSystem](sdbusplus::message::message& msg) { + crashdumpTimer.cancel(); + std::cout << "Crashdump completed\n"; + if (recoverSystem) + { + std::cout << "Recovering the system\n"; + startPowerCycle(); + } + crashdumpCompleteMatch.reset(); + }); + + crashdumpTimer.expires_after(std::chrono::seconds(crashdumpTimeoutS)); + crashdumpTimer.async_wait([](const boost::system::error_code ec) { + if (ec) + { + // operation_aborted is expected if timer is canceled + if (ec != boost::asio::error::operation_aborted) + { + std::cerr << "Crashdump async_wait failed: " << ec.message() + << "\n"; + } + std::cout << "Crashdump timer canceled\n"; + return; + } + std::cerr << "Crashdump failed to complete before timeout\n"; + crashdumpCompleteMatch.reset(); + }); + + conn->async_method_call( + [](boost::system::error_code ec) { + if (ec) + { + std::cerr << "failed to start Crashdump\n"; + crashdumpTimer.cancel(); + crashdumpCompleteMatch.reset(); + } + }, + "com.intel.crashdump", "/com/intel/crashdump", + "com.intel.crashdump.Stored", "GenerateStoredLog"); +} + +static void caterrHandler() +{ + if (!hostOff) + { + gpiod::line_event gpioLineEvent = caterrLine.event_read(); + + bool caterr = + gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; + if (caterr) + { + std::cout << "CPU CATERR detected, starting timer\n"; + caterrAssertTimer.expires_after( + std::chrono::milliseconds(caterrTimeoutMs)); + caterrAssertTimer.async_wait( + [](const boost::system::error_code ec) { + if (ec) + { + // operation_aborted is expected if timer is canceled + // before completion. + if (ec != boost::asio::error::operation_aborted) + { + std::cerr << "caterr timeout async_wait failed: " + << ec.message() << "\n"; + } + std::cout << "CATERR assert timer canceled\n"; + return; + } + std::cout << "CATERR asset timer completed\n"; + conn->async_method_call( + [](boost::system::error_code ec, + const std::variant<bool>& property) { + if (ec) + { + return; + } + const bool* reset = std::get_if<bool>(&property); + if (reset == nullptr) + { + std::cerr + << "Unable to read reset on CATERR value\n"; + return; + } + startCrashdumpAndRecovery(*reset); + }, + "xyz.openbmc_project.Settings", + "/xyz/openbmc_project/control/processor_error_config", + "org.freedesktop.DBus.Properties", "Get", + "xyz.openbmc_project.Control.Processor.ErrConfig", + "ResetOnCATERR"); + }); + } + else + { + caterrAssertTimer.cancel(); + } + } + caterrEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read, + [](const boost::system::error_code ec) { + if (ec) + { + std::cerr << "caterr handler error: " + << ec.message() << "\n"; + return; + } + caterrHandler(); + }); +} +static void pchThermtripHandler() +{ + if (!hostOff) + { + gpiod::line_event gpioLineEvent = pchThermtripLine.event_read(); + + bool pchThermtrip = + gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; + if (pchThermtrip) + { + std::cout << "PCH Thermtrip detected \n"; + // log to redfish, call API + sd_journal_send("MESSAGE=SSBThermtrip: SSB Thermtrip", + "PRIORITY=%i", LOG_INFO, "REDFISH_MESSAGE_ID=%s", + "OpenBMC.0.1.SSBThermtrip", NULL); + } + } + pchThermtripEvent.async_wait( + boost::asio::posix::stream_descriptor::wait_read, + [](const boost::system::error_code ec) { + if (ec) + { + std::cerr << "PCH Thermtrip handler error: " << ec.message() + << "\n"; + return; + } + pchThermtripHandler(); + }); +} + +} // namespace host_error_monitor + +int main(int argc, char* argv[]) +{ + // setup connection to dbus + host_error_monitor::conn = + std::make_shared<sdbusplus::asio::connection>(host_error_monitor::io); + + // Host Error Monitor Object + host_error_monitor::conn->request_name( + "xyz.openbmc_project.HostErrorMonitor"); + sdbusplus::asio::object_server server = + sdbusplus::asio::object_server(host_error_monitor::conn); + + // Start tracking host state + std::shared_ptr<sdbusplus::bus::match::match> hostStateMonitor = + host_error_monitor::startHostStateMonitor(); + + // Initialize the host state + host_error_monitor::initializeHostState(); + + // Request CPU_CATERR GPIO events + if (!host_error_monitor::requestGPIOEvents( + "CPU_CATERR", host_error_monitor::caterrHandler, + host_error_monitor::caterrLine, host_error_monitor::caterrEvent)) + { + return -1; + } + + // Request PCH_BMC_THERMTRIP GPIO events + if (!host_error_monitor::requestGPIOEvents( + "PCH_BMC_THERMTRIP", host_error_monitor::pchThermtripHandler, + host_error_monitor::pchThermtripLine, + host_error_monitor::pchThermtripEvent)) + { + return -1; + } + + host_error_monitor::io.run(); + + return 0; +} |