From 4f4e62aa9e1ecba85ecd00d24f6b0d7107345963 Mon Sep 17 00:00:00 2001 From: "Jason M. Bills" Date: Wed, 28 Aug 2019 13:57:20 -0700 Subject: Use the new standalone libpeci for CPU info Tested: Injected an ERR2 timeout and verified that it could be attributed to the correct CPU. Change-Id: Ic7880a0ef1c4d0b655ed5fb021ae07c88f6c9074 Signed-off-by: Jason M. Bills --- host_error_monitor/src/host_error_monitor.cpp | 79 ++++++--------------------- 1 file changed, 16 insertions(+), 63 deletions(-) diff --git a/host_error_monitor/src/host_error_monitor.cpp b/host_error_monitor/src/host_error_monitor.cpp index 51434f4..73dc367 100644 --- a/host_error_monitor/src/host_error_monitor.cpp +++ b/host_error_monitor/src/host_error_monitor.cpp @@ -13,12 +13,11 @@ // See the License for the specific language governing permissions and // limitations under the License. */ -#include +#include #include #include #include -#include #include #include #include @@ -391,44 +390,20 @@ static void incrementCPUErrorCount(int cpuNum) static bool checkIERRCPUs() { bool cpuIERRFound = false; - for (int cpu = 0, addr = crashdump::minClientAddr; - addr <= crashdump::maxClientAddr; cpu++, addr++) + for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR; + cpu++, addr++) { - if (peci_Ping(addr) != PECI_CC_SUCCESS) - { - continue; - } uint8_t cc = 0; - uint32_t cpuID = 0; - if (peci_RdPkgConfig(addr, PECI_MBX_INDEX_CPU_ID, PECI_PKG_ID_CPU_ID, - sizeof(uint32_t), (uint8_t*)&cpuID, - &cc) != PECI_CC_SUCCESS) + CPUModel model{}; + if (peci_GetCPUID(addr, &model, &cc) != PECI_CC_SUCCESS) { std::cerr << "Cannot get CPUID!\n"; continue; } - crashdump::CPUModel model{}; - bool modelFound = false; - for (int i = 0; i < crashdump::cpuIDMap.size(); i++) - { - if (cpuID == crashdump::cpuIDMap[i].cpuID) - { - model = crashdump::cpuIDMap[i].model; - modelFound = true; - break; - } - } - if (!modelFound) - { - std::cerr << "Cannot find Model for CPUID 0x" << std::hex << cpuID - << "\n"; - continue; - } - 
switch (model) { - case crashdump::CPUModel::skx_h0: + case skx: { // First check the MCA_ERR_SRC_LOG to see if this is the CPU // that caused the IERR @@ -513,8 +488,7 @@ static bool checkIERRCPUs() } break; } - case crashdump::CPUModel::icx_a0: - case crashdump::CPUModel::icx_b0: + case icx: { // First check the MCA_ERR_SRC_LOG to see if this is the CPU // that caused the IERR @@ -918,46 +892,26 @@ static void pchThermtripHandler() }); } -static std::bitset checkERRPinCPUs(const int errPin) +static std::bitset checkERRPinCPUs(const int errPin) { int errPinSts = (1 << errPin); - std::bitset errPinCPUs = 0; - for (int cpu = 0, addr = crashdump::minClientAddr; - addr <= crashdump::maxClientAddr; cpu++, addr++) + std::bitset errPinCPUs = 0; + for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR; + cpu++, addr++) { if (peci_Ping(addr) == PECI_CC_SUCCESS) { uint8_t cc = 0; - uint32_t cpuID = 0; - if (peci_RdPkgConfig(addr, PECI_MBX_INDEX_CPU_ID, - PECI_PKG_ID_CPU_ID, sizeof(uint32_t), - (uint8_t*)&cpuID, &cc) != PECI_CC_SUCCESS) + CPUModel model{}; + if (peci_GetCPUID(addr, &model, &cc) != PECI_CC_SUCCESS) { std::cerr << "Cannot get CPUID!\n"; continue; } - crashdump::CPUModel model{}; - bool modelFound = false; - for (int i = 0; i < crashdump::cpuIDMap.size(); i++) - { - if (cpuID == crashdump::cpuIDMap[i].cpuID) - { - model = crashdump::cpuIDMap[i].model; - modelFound = true; - break; - } - } - if (!modelFound) - { - std::cerr << "Cannot find Model for CPUID 0x" << std::hex - << cpuID << "\n"; - continue; - } - switch (model) { - case crashdump::CPUModel::skx_h0: + case skx: { // Check the ERRPINSTS to see if this is the CPU that caused // the ERRx (B(0) D8 F0 offset 210h) @@ -970,8 +924,7 @@ static std::bitset checkERRPinCPUs(const int errPin) } break; } - case crashdump::CPUModel::icx_a0: - case crashdump::CPUModel::icx_b0: + case icx: { // Check the ERRPINSTS to see if this is the CPU that caused // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is 
@@ -996,7 +949,7 @@ static void errXAssertHandler(const int errPin, { // ERRx status is not guaranteed through the timeout, so save which // CPUs have it asserted - std::bitset errPinCPUs = checkERRPinCPUs(errPin); + std::bitset errPinCPUs = checkERRPinCPUs(errPin); errXAssertTimer.expires_after(std::chrono::milliseconds(errTimeoutMs)); errXAssertTimer.async_wait([errPin, errPinCPUs]( const boost::system::error_code ec) { -- cgit v1.2.3 From 6b5561d6a06b02d21952d42e06659df2cc862874 Mon Sep 17 00:00:00 2001 From: "Jason M. Bills" Date: Fri, 13 Sep 2019 11:11:34 -0700 Subject: Handle host errors on host power-on Some errors such as the Boot FIVR (Thermtrip) could be asserted when the host powers on. This change initializes the error status on host power-on to handle those types of errors. Tested: Moved the jumper to persistently assert Thermtrip and pressed the power button. After the system immediately shut down, confirmed that the Thermtrip event was logged. Change-Id: Id782167e234d906aaadb6190d9b92beafa6ab84a Signed-off-by: Jason M. 
Bills --- host_error_monitor/src/host_error_monitor.cpp | 113 +++++++++++++++++++++++--- 1 file changed, 103 insertions(+), 10 deletions(-) diff --git a/host_error_monitor/src/host_error_monitor.cpp b/host_error_monitor/src/host_error_monitor.cpp index 73dc367..0dc620b 100644 --- a/host_error_monitor/src/host_error_monitor.cpp +++ b/host_error_monitor/src/host_error_monitor.cpp @@ -211,16 +211,21 @@ static std::shared_ptr startHostStateMonitor() } hostOff = state == "xyz.openbmc_project.State.Host.HostState.Off"; - // No host events should fire while off, so cancel any pending - // timers if (hostOff) { + // No host events should fire while off, so cancel any pending + // timers caterrAssertTimer.cancel(); err0AssertTimer.cancel(); err1AssertTimer.cancel(); err2AssertTimer.cancel(); smiAssertTimer.cancel(); } + else + { + // Handle any initial errors when the host turns on + initializeErrorState(); + } }); } @@ -660,6 +665,11 @@ static void caterrHandler() }); } +static void cpu1ThermtripAssertHandler() +{ + cpuThermTripLog(1); +} + static void cpu1ThermtripHandler() { if (!hostOff) @@ -670,7 +680,7 @@ static void cpu1ThermtripHandler() gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; if (cpu1Thermtrip) { - cpuThermTripLog(1); + cpu1ThermtripAssertHandler(); } } cpu1ThermtripEvent.async_wait( @@ -686,6 +696,11 @@ static void cpu1ThermtripHandler() }); } +static void cpu2ThermtripAssertHandler() +{ + cpuThermTripLog(2); +} + static void cpu2ThermtripHandler() { if (!hostOff) @@ -696,7 +711,7 @@ static void cpu2ThermtripHandler() gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; if (cpu2Thermtrip) { - cpuThermTripLog(2); + cpu2ThermtripAssertHandler(); } } cpu2ThermtripEvent.async_wait( @@ -712,6 +727,11 @@ static void cpu2ThermtripHandler() }); } +static void cpu1VRHotAssertHandler() +{ + cpuVRHotLog("CPU 1"); +} + static void cpu1VRHotHandler() { if (!hostOff) @@ -722,7 +742,7 @@ static void cpu1VRHotHandler() gpioLineEvent.event_type == 
gpiod::line_event::FALLING_EDGE; if (cpu1VRHot) { - cpuVRHotLog("CPU 1"); + cpu1VRHotAssertHandler(); } } cpu1VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read, @@ -737,6 +757,11 @@ static void cpu1VRHotHandler() }); } +static void cpu1MemABCDVRHotAssertHandler() +{ + cpuVRHotLog("CPU 1 Memory ABCD"); +} + static void cpu1MemABCDVRHotHandler() { if (!hostOff) @@ -747,7 +772,7 @@ static void cpu1MemABCDVRHotHandler() gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; if (cpu1MemABCDVRHot) { - cpuVRHotLog("CPU 1 Memory ABCD"); + cpu1MemABCDVRHotAssertHandler(); } } cpu1MemABCDVRHotEvent.async_wait( @@ -763,6 +788,11 @@ static void cpu1MemABCDVRHotHandler() }); } +static void cpu1MemEFGHVRHotAssertHandler() +{ + cpuVRHotLog("CPU 1 Memory EFGH"); +} + static void cpu1MemEFGHVRHotHandler() { if (!hostOff) @@ -773,7 +803,7 @@ static void cpu1MemEFGHVRHotHandler() gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; if (cpu1MemEFGHVRHot) { - cpuVRHotLog("CPU 1 Memory EFGH"); + cpu1MemEFGHVRHotAssertHandler(); } } cpu1MemEFGHVRHotEvent.async_wait( @@ -789,6 +819,11 @@ static void cpu1MemEFGHVRHotHandler() }); } +static void cpu2VRHotAssertHandler() +{ + cpuVRHotLog("CPU 2"); +} + static void cpu2VRHotHandler() { if (!hostOff) @@ -799,7 +834,7 @@ static void cpu2VRHotHandler() gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; if (cpu2VRHot) { - cpuVRHotLog("CPU 2"); + cpu2VRHotAssertHandler(); } } cpu2VRHotEvent.async_wait(boost::asio::posix::stream_descriptor::wait_read, @@ -814,6 +849,11 @@ static void cpu2VRHotHandler() }); } +static void cpu2MemABCDVRHotAssertHandler() +{ + cpuVRHotLog("CPU 2 Memory ABCD"); +} + static void cpu2MemABCDVRHotHandler() { if (!hostOff) @@ -824,7 +864,7 @@ static void cpu2MemABCDVRHotHandler() gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; if (cpu2MemABCDVRHot) { - cpuVRHotLog("CPU 2 Memory ABCD"); + cpu2MemABCDVRHotAssertHandler(); } } cpu2MemABCDVRHotEvent.async_wait( @@ 
-840,6 +880,11 @@ static void cpu2MemABCDVRHotHandler() }); } +static void cpu2MemEFGHVRHotAssertHandler() +{ + cpuVRHotLog("CPU 2 Memory EFGH"); +} + static void cpu2MemEFGHVRHotHandler() { if (!hostOff) @@ -850,7 +895,7 @@ static void cpu2MemEFGHVRHotHandler() gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE; if (cpu2MemEFGHVRHot) { - cpuVRHotLog("CPU 2 Memory EFGH"); + cpu2MemEFGHVRHotAssertHandler(); } } cpu2MemEFGHVRHotEvent.async_wait( @@ -1225,6 +1270,54 @@ static void initializeErrorState() smiAssertHandler(); } + // Handle CPU1_THERMTRIP if it's asserted now + if (cpu1ThermtripLine.get_value() == 0) + { + cpu1ThermtripAssertHandler(); + } + + // Handle CPU2_THERMTRIP if it's asserted now + if (cpu2ThermtripLine.get_value() == 0) + { + cpu2ThermtripAssertHandler(); + } + + // Handle CPU1_VRHOT if it's asserted now + if (cpu1VRHotLine.get_value() == 0) + { + cpu1VRHotAssertHandler(); + } + + // Handle CPU1_MEM_ABCD_VRHOT if it's asserted now + if (cpu1MemABCDVRHotLine.get_value() == 0) + { + cpu1MemABCDVRHotAssertHandler(); + } + + // Handle CPU1_MEM_EFGH_VRHOT if it's asserted now + if (cpu1MemEFGHVRHotLine.get_value() == 0) + { + cpu1MemEFGHVRHotAssertHandler(); + } + + // Handle CPU2_VRHOT if it's asserted now + if (cpu2VRHotLine.get_value() == 0) + { + cpu2VRHotAssertHandler(); + } + + // Handle CPU2_MEM_ABCD_VRHOT if it's asserted now + if (cpu2MemABCDVRHotLine.get_value() == 0) + { + cpu2MemABCDVRHotAssertHandler(); + } + + // Handle CPU2_MEM_EFGH_VRHOT if it's asserted now + if (cpu2MemEFGHVRHotLine.get_value() == 0) + { + cpu2MemEFGHVRHotAssertHandler(); + } + // Handle PCH_BMC_THERMTRIP if it's asserted now if (pchThermtripLine.get_value() == 0) { -- cgit v1.2.3 From 82facc00fddc73f43a13ae72686bdb5d1744e12f Mon Sep 17 00:00:00 2001 From: "Jason M. Bills" Date: Mon, 9 Sep 2019 14:45:38 -0700 Subject: Enable boot FIVR fault monitoring and logging A boot FIVR fault will assert the CPU FIVR Fault GPIO and then assert the thermtrip GPIO. 
This adds a check for if the CPU FIVR Fault GPIO is asserted to determine whether to log a boot FIVR fault or a CPU thermal trip. Tested: Pulled the CPU FIVR Fault GPIO high and moved the jumper to assert a thermtrip. After the system shut down, confirmed that the event was logged as a boot FIVR fault. Change-Id: Ic4292a3fa9135c7367764f2b126937e33c5ad652 Signed-off-by: Jason M. Bills --- host_error_monitor/src/host_error_monitor.cpp | 67 ++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/host_error_monitor/src/host_error_monitor.cpp b/host_error_monitor/src/host_error_monitor.cpp index 0dc620b..900ead6 100644 --- a/host_error_monitor/src/host_error_monitor.cpp +++ b/host_error_monitor/src/host_error_monitor.cpp @@ -58,8 +58,10 @@ static gpiod::line err2Line; static boost::asio::posix::stream_descriptor err2Event(io); static gpiod::line smiLine; static boost::asio::posix::stream_descriptor smiEvent(io); +static gpiod::line cpu1FIVRFaultLine; static gpiod::line cpu1ThermtripLine; static boost::asio::posix::stream_descriptor cpu1ThermtripEvent(io); +static gpiod::line cpu2FIVRFaultLine; static gpiod::line cpu2ThermtripLine; static boost::asio::posix::stream_descriptor cpu2ThermtripEvent(io); static gpiod::line cpu1VRHotLine; @@ -131,6 +133,15 @@ static void smiTimeoutLog() "REDFISH_MESSAGE_ARGS=%s", "SMI Timeout", NULL); } +static void cpuBootFIVRFaultLog(const int cpuNum) +{ + std::string msg = "Boot FIVR Fault on CPU " + std::to_string(cpuNum); + + sd_journal_send("MESSAGE=HostError: %s", msg.c_str(), "PRIORITY=%i", + LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError", + "REDFISH_MESSAGE_ARGS=%s", msg.c_str(), NULL); +} + static void cpuThermTripLog(const int cpuNum) { std::string msg = "CPU " + std::to_string(cpuNum) + " thermal trip"; @@ -276,6 +287,30 @@ static bool requestGPIOEvents( return true; } +static bool requestGPIOInput(const std::string& name, gpiod::line& gpioLine) +{ + // Find the GPIO line + gpioLine = 
gpiod::find_line(name); + if (!gpioLine) + { + std::cerr << "Failed to find the " << name << " line.\n"; + return false; + } + + // Request GPIO input + try + { + gpioLine.request({__FUNCTION__, gpiod::line_request::DIRECTION_INPUT}); + } + catch (std::exception&) + { + std::cerr << "Failed to request " << name << " input\n"; + return false; + } + + return true; +} + static void startPowerCycle() { conn->async_method_call( @@ -667,7 +702,14 @@ static void caterrHandler() static void cpu1ThermtripAssertHandler() { - cpuThermTripLog(1); + if (cpu1FIVRFaultLine.get_value() == 0) + { + cpuBootFIVRFaultLog(1); + } + else + { + cpuThermTripLog(1); + } } static void cpu1ThermtripHandler() @@ -698,7 +740,14 @@ static void cpu1ThermtripHandler() static void cpu2ThermtripAssertHandler() { - cpuThermTripLog(2); + if (cpu2FIVRFaultLine.get_value() == 0) + { + cpuBootFIVRFaultLog(2); + } + else + { + cpuThermTripLog(2); + } } static void cpu2ThermtripHandler() @@ -1385,6 +1434,13 @@ int main(int argc, char* argv[]) return -1; } + // Request CPU1_FIVR_FAULT GPIO input + if (!host_error_monitor::requestGPIOInput( + "CPU1_FIVR_FAULT", host_error_monitor::cpu1FIVRFaultLine)) + { + return -1; + } + // Request CPU1_THERMTRIP GPIO events if (!host_error_monitor::requestGPIOEvents( "CPU1_THERMTRIP", host_error_monitor::cpu1ThermtripHandler, @@ -1394,6 +1450,13 @@ int main(int argc, char* argv[]) return -1; } + // Request CPU2_FIVR_FAULT GPIO input + if (!host_error_monitor::requestGPIOInput( + "CPU2_FIVR_FAULT", host_error_monitor::cpu2FIVRFaultLine)) + { + return -1; + } + // Request CPU2_THERMTRIP GPIO events if (!host_error_monitor::requestGPIOEvents( "CPU2_THERMTRIP", host_error_monitor::cpu2ThermtripHandler, -- cgit v1.2.3