summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason M. Bills <jason.m.bills@intel.com>2019-08-06 21:07:21 +0300
committerJason M. Bills <jason.m.bills@linux.intel.com>2019-09-04 19:05:50 +0300
commit4a7b10afa702c98a83122e59ce4929afad8cf881 (patch)
tree6f00c702545ceebbfaa08fa7700831e6ea1f4479
parentb1343f99d63781ee3b9ba87575183bbbff1d5476 (diff)
downloadprovingground-4a7b10afa702c98a83122e59ce4929afad8cf881.tar.xz
Add IERR logging to the CATERR/IERR handler
This change attempts to determine which CPU asserted the CATERR signal and the type of CATERR to include in the log. Tested: Manually triggered an IERR and confirmed that the correct data was logged. Change-Id: I8e9ad26889c093392254ae1d70af3dde2c62a519 Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
-rw-r--r--host_error_monitor/src/host_error_monitor.cpp260
1 files changed, 256 insertions, 4 deletions
diff --git a/host_error_monitor/src/host_error_monitor.cpp b/host_error_monitor/src/host_error_monitor.cpp
index 4c44007..9376a3e 100644
--- a/host_error_monitor/src/host_error_monitor.cpp
+++ b/host_error_monitor/src/host_error_monitor.cpp
@@ -30,7 +30,7 @@ static std::shared_ptr<sdbusplus::asio::connection> conn;
static bool hostOff = true;
-const static constexpr int caterrTimeoutMs = 1000;
+const static constexpr int caterrTimeoutMs = 2000;
const static constexpr int err2TimeoutMs = 90000;
const static constexpr int crashdumpTimeoutS = 300;
@@ -52,6 +52,31 @@ static boost::asio::posix::stream_descriptor err2Event(io);
static gpiod::line pchThermtripLine;
static boost::asio::posix::stream_descriptor pchThermtripEvent(io);
+// Writes a single "OpenBMC.0.1.CPUError" entry to the systemd journal.
+// The same description text is used twice: after the "HostError: " prefix in
+// the human-readable MESSAGE field, and as the Redfish message argument.
+static void logCPUErrorToJournal(const char* description)
+{
+    sd_journal_send("MESSAGE=HostError: %s", description, "PRIORITY=%i",
+                    LOG_INFO, "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError",
+                    "REDFISH_MESSAGE_ARGS=%s", description, NULL);
+}
+
+// Logs an IERR without naming a CPU.
+static void cpuIERRLog()
+{
+    logCPUErrorToJournal("IERR");
+}
+
+// Logs an IERR for a specific CPU.
+// cpuNum is a zero-based index; the log text reports it one-based.
+static void cpuIERRLog(const int cpuNum)
+{
+    const std::string description =
+        "IERR on CPU " + std::to_string(cpuNum + 1);
+
+    logCPUErrorToJournal(description.c_str());
+}
+
+// Logs an IERR of a known type for a specific CPU.
+// cpuNum is a zero-based index (reported one-based); type is a short label
+// that is placed in front of "IERR on CPU <n>".
+static void cpuIERRLog(const int cpuNum, const std::string& type)
+{
+    const std::string description =
+        type + " IERR on CPU " + std::to_string(cpuNum + 1);
+
+    logCPUErrorToJournal(description.c_str());
+}
+
static void cpuERR2Log()
{
sd_journal_send("MESSAGE=HostError: ERR2 Timeout", "PRIORITY=%i", LOG_INFO,
@@ -246,9 +271,232 @@ static void startCrashdumpAndRecovery(bool recoverSystem)
"com.intel.crashdump.Stored", "GenerateStoredLog");
}
+// Extracts the MSEC field (bits 31:24) from an IA32_MC4_STATUS value so it can
+// be compared against individual error codes. The codes are enumerated values,
+// not bit flags, so they must be compared for equality rather than masked.
+static uint8_t getMC4StatusMSEC(const uint64_t mc4Status)
+{
+    return static_cast<uint8_t>((mc4Status >> 24) & 0xFF);
+}
+
+// True for MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE (0x40),
+// MCA_SVID_VCCIN_VR_VOUT_FAILURE (0x42), or
+// MCA_SVID_CPU_VR_CAPABILITY_ERROR (0x43)
+static bool isVRMismatchMSEC(const uint8_t msec)
+{
+    return msec == 0x40 || msec == 0x42 || msec == 0x43;
+}
+
+// True for MCA_FIVR_CATAS_OVERVOL_FAULT (0x51) or
+// MCA_FIVR_CATAS_OVERCUR_FAULT (0x52)
+static bool isFIVRCatastrophicMSEC(const uint8_t msec)
+{
+    return msec == 0x51 || msec == 0x52;
+}
+
+// Reads MCA_ERR_SRC_LOG from the CPU at PECI address addr and returns true
+// when the read succeeded and MSMI_INTERNAL (bit 20) or IERR_INTERNAL (bit 27)
+// is set, i.e. this is a CPU that caused the IERR.
+static bool isIERRSourceCPU(const int addr)
+{
+    uint8_t cc = 0;
+    uint32_t mcaErrSrcLog = 0;
+    if (peci_RdPkgConfig(addr, 0, 5, 4,
+                         reinterpret_cast<uint8_t*>(&mcaErrSrcLog),
+                         &cc) != PECI_CC_SUCCESS)
+    {
+        return false;
+    }
+    constexpr uint32_t msmiInternal = 1u << 20;
+    constexpr uint32_t ierrInternal = 1u << 27;
+    return (mcaErrSrcLog & (msmiInternal | ierrInternal)) != 0;
+}
+
+// Determines and logs the IERR type for a skx_h0 CPU that has already been
+// identified as an IERR source. Exactly one log entry is always written: if a
+// follow-up PECI read fails, the untyped per-CPU IERR is logged so the event
+// is never dropped.
+static void logSkxIERRType(const int cpu, const int addr)
+{
+    uint8_t cc = 0;
+
+    // Check if it's a CPU/VR mismatch by reading the IA32_MC4_STATUS MSR
+    // (0x411)
+    uint64_t mc4Status = 0;
+    if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) != PECI_CC_SUCCESS)
+    {
+        cpuIERRLog(cpu);
+        return;
+    }
+    const uint8_t msec = getMC4StatusMSEC(mc4Status);
+    if (isVRMismatchMSEC(msec))
+    {
+        cpuIERRLog(cpu, "CPU/VR Mismatch");
+        return;
+    }
+
+    // Next check if it's a Core FIVR fault by looking for a non-zero value of
+    // CORE_FIVR_ERR_LOG (B(1) D30 F2 offset 80h)
+    uint32_t coreFIVRErrLog = 0;
+    if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x80, sizeof(uint32_t),
+                              reinterpret_cast<uint8_t*>(&coreFIVRErrLog),
+                              &cc) != PECI_CC_SUCCESS)
+    {
+        cpuIERRLog(cpu);
+        return;
+    }
+    if (coreFIVRErrLog)
+    {
+        cpuIERRLog(cpu, "Core FIVR Fault");
+        return;
+    }
+
+    // Next check if it's an Uncore FIVR fault by looking for a non-zero value
+    // of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset 84h)
+    uint32_t uncoreFIVRErrLog = 0;
+    if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84, sizeof(uint32_t),
+                              reinterpret_cast<uint8_t*>(&uncoreFIVRErrLog),
+                              &cc) != PECI_CC_SUCCESS)
+    {
+        cpuIERRLog(cpu);
+        return;
+    }
+    if (uncoreFIVRErrLog)
+    {
+        cpuIERRLog(cpu, "Uncore FIVR Fault");
+        return;
+    }
+
+    // Last, CORE_FIVR_ERR_LOG and UNCORE_FIVR_ERR_LOG are both zero here; if
+    // MSEC reports a catastrophic FIVR fault, log it as an uncore FIVR fault
+    if (isFIVRCatastrophicMSEC(msec))
+    {
+        cpuIERRLog(cpu, "Uncore FIVR Fault");
+        return;
+    }
+    cpuIERRLog(cpu);
+}
+
+// Determines and logs the IERR type for an icx_a0/icx_b0 CPU that has already
+// been identified as an IERR source. Exactly one log entry is always written:
+// if a follow-up PECI read fails, the untyped per-CPU IERR is logged so the
+// event is never dropped.
+static void logIcxIERRType(const int cpu, const int addr)
+{
+    uint8_t cc = 0;
+
+    // Check if it's a CPU/VR mismatch by reading the IA32_MC4_STATUS MSR
+    // (0x411)
+    uint64_t mc4Status = 0;
+    if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) != PECI_CC_SUCCESS)
+    {
+        cpuIERRLog(cpu);
+        return;
+    }
+    // TODO: Update MSEC/MSCOD_31_24 check
+    const uint8_t msec = getMC4StatusMSEC(mc4Status);
+    if (isVRMismatchMSEC(msec))
+    {
+        cpuIERRLog(cpu, "CPU/VR Mismatch");
+        return;
+    }
+
+    // Next check if it's a Core FIVR fault by looking for a non-zero value of
+    // CORE_FIVR_ERR_LOG (B(31) D30 F2 offsets C0h and C4h)
+    // (Note: Bus 31 is accessed on PECI as bus 14)
+    uint32_t coreFIVRErrLog0 = 0;
+    uint32_t coreFIVRErrLog1 = 0;
+    if (peci_RdEndPointConfigPciLocal(
+            addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
+            reinterpret_cast<uint8_t*>(&coreFIVRErrLog0),
+            &cc) != PECI_CC_SUCCESS)
+    {
+        cpuIERRLog(cpu);
+        return;
+    }
+    if (peci_RdEndPointConfigPciLocal(
+            addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
+            reinterpret_cast<uint8_t*>(&coreFIVRErrLog1),
+            &cc) != PECI_CC_SUCCESS)
+    {
+        cpuIERRLog(cpu);
+        return;
+    }
+    if (coreFIVRErrLog0 || coreFIVRErrLog1)
+    {
+        cpuIERRLog(cpu, "Core FIVR Fault");
+        return;
+    }
+
+    // Next check if it's an Uncore FIVR fault by looking for a non-zero value
+    // of UNCORE_FIVR_ERR_LOG (B(31) D30 F2 offset 84h)
+    // (Note: Bus 31 is accessed on PECI as bus 14)
+    uint32_t uncoreFIVRErrLog = 0;
+    if (peci_RdEndPointConfigPciLocal(
+            addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
+            reinterpret_cast<uint8_t*>(&uncoreFIVRErrLog),
+            &cc) != PECI_CC_SUCCESS)
+    {
+        cpuIERRLog(cpu);
+        return;
+    }
+    if (uncoreFIVRErrLog)
+    {
+        cpuIERRLog(cpu, "Uncore FIVR Fault");
+        return;
+    }
+
+    // TODO: Update MSEC/MSCOD_31_24 check
+    // Last, all FIVR error logs are zero here; if MSEC reports a catastrophic
+    // FIVR fault, log it as an uncore FIVR fault
+    if (isFIVRCatastrophicMSEC(msec))
+    {
+        cpuIERRLog(cpu, "Uncore FIVR Fault");
+        return;
+    }
+    cpuIERRLog(cpu);
+}
+
+// Walks every PECI client address from crashdump::minClientAddr to
+// crashdump::maxClientAddr, and for each CPU that responds and whose model is
+// known, checks whether it is a source of the IERR. Every CPU identified as a
+// source gets one log entry (typed when the type can be determined).
+//
+// Returns true when at least one CPU was identified as an IERR source. The
+// caller logs a generic IERR only when this returns false, so a CPU flagged
+// here must always produce its own log entry.
+static bool checkIERRCPUs()
+{
+    bool cpuIERRFound = false;
+    for (int cpu = 0, addr = crashdump::minClientAddr;
+         addr <= crashdump::maxClientAddr; cpu++, addr++)
+    {
+        if (peci_Ping(addr) != PECI_CC_SUCCESS)
+        {
+            continue;
+        }
+        uint8_t cc = 0;
+        uint32_t cpuID = 0;
+        if (peci_RdPkgConfig(addr, PECI_MBX_INDEX_CPU_ID, PECI_PKG_ID_CPU_ID,
+                             sizeof(uint32_t),
+                             reinterpret_cast<uint8_t*>(&cpuID),
+                             &cc) != PECI_CC_SUCCESS)
+        {
+            std::cerr << "Cannot get CPUID!\n";
+            continue;
+        }
+
+        crashdump::CPUModel model{};
+        bool modelFound = false;
+        for (const auto& entry : crashdump::cpuIDMap)
+        {
+            if (cpuID == entry.cpuID)
+            {
+                model = entry.model;
+                modelFound = true;
+                break;
+            }
+        }
+        if (!modelFound)
+        {
+            // Restore decimal output afterwards; std::hex is sticky on the
+            // stream and would affect later integer output on std::cerr
+            std::cerr << "Cannot find Model for CPUID 0x" << std::hex << cpuID
+                      << std::dec << "\n";
+            continue;
+        }
+
+        switch (model)
+        {
+            case crashdump::CPUModel::skx_h0:
+            {
+                if (isIERRSourceCPU(addr))
+                {
+                    // TODO: Light the CPU fault LED?
+                    cpuIERRFound = true;
+                    logSkxIERRType(cpu, addr);
+                }
+                break;
+            }
+            case crashdump::CPUModel::icx_a0:
+            case crashdump::CPUModel::icx_b0:
+            {
+                if (isIERRSourceCPU(addr))
+                {
+                    // TODO: Light the CPU fault LED?
+                    cpuIERRFound = true;
+                    logIcxIERRType(cpu, addr);
+                }
+                break;
+            }
+            default:
+                break;
+        }
+    }
+    return cpuIERRFound;
+}
+
static void caterrAssertHandler()
{
- std::cout << "CPU CATERR detected, starting timer\n";
caterrAssertTimer.expires_after(std::chrono::milliseconds(caterrTimeoutMs));
caterrAssertTimer.async_wait([](const boost::system::error_code ec) {
if (ec)
@@ -260,10 +508,14 @@ static void caterrAssertHandler()
std::cerr << "caterr timeout async_wait failed: "
<< ec.message() << "\n";
}
- std::cout << "CATERR assert timer canceled\n";
return;
}
- std::cout << "CATERR asset timer completed\n";
+ std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
+ << " ms\n";
+ if (!checkIERRCPUs())
+ {
+ cpuIERRLog();
+ }
conn->async_method_call(
[](boost::system::error_code ec,
const std::variant<bool>& property) {