summary | refs | log | tree | commit | diff
path: root/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch')
-rw-r--r-- meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch 251
1 files changed, 251 insertions, 0 deletions
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch
new file mode 100644
index 000000000..c94fce70a
--- /dev/null
+++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch
@@ -0,0 +1,251 @@
+From 5e0de05dc29bdb065566f33670e68c903ab51e98 Mon Sep 17 00:00:00 2001
+From: "Jason M. Bills" <jason.m.bills@intel.com>
+Date: Tue, 22 Sep 2020 15:09:49 -0700
+Subject: [PATCH] Improve PECI error checking and reporting
+
+It's possible for a PECI command to return successfully with an
+error in the completion code. This change adds checks for the
+PECI completion code and will print an error on failure.
+
+Tested:
+Injected an IERR and confirmed that PECI data is correctly handled.
+
+Change-Id: I86fc0a99ab04dac4d8b38f9f7e0ee1eb6a39397d
+Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
+---
+ src/host_error_monitor.cpp | 114 +++++++++++++++++++++++++------------
+ 1 file changed, 79 insertions(+), 35 deletions(-)
+
+diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
+index 1b10cd38e..5e6d7a82c 100644
+--- a/src/host_error_monitor.cpp
++++ b/src/host_error_monitor.cpp
+@@ -234,6 +234,21 @@ static void ssbThermTripLog()
+ "OpenBMC.0.1.SsbThermalTrip", NULL);
+ }
+
++static inline bool peciError(EPECIStatus peciStatus, uint8_t cc)
++{
++    // Error unless the call succeeded with cc SUCCESS or FATAL_MCA_DETECTED.
++    bool ok = cc == PECI_DEV_CC_SUCCESS || cc == PECI_DEV_CC_FATAL_MCA_DETECTED;
++    return peciStatus != PECI_CC_SUCCESS || !ok;
++}
++
++static void printPECIError(const std::string& reg, const size_t addr,
++                           const EPECIStatus peciStatus, const size_t cc)
++{
++    std::cerr << "Failed to read " << reg << " on CPU address " << addr
++              << ". Error: " << peciStatus << ": cc: 0x" << std::hex << cc
++              << std::dec << "\n"; // std::hex is sticky; restore decimal
++}
++
+ static void initializeErrorState();
+ static void initializeHostState()
+ {
+@@ -583,9 +598,10 @@ static void incrementCPUErrorCount(int cpuNum)
+ static bool checkIERRCPUs()
+ {
+ bool cpuIERRFound = false;
+- for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
++ for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
+ cpu++, addr++)
+ {
++ EPECIStatus peciStatus = PECI_CC_SUCCESS;
+ uint8_t cc = 0;
+ CPUModel model{};
+ uint8_t stepping = 0;
+@@ -602,9 +618,11 @@ static bool checkIERRCPUs()
+ // First check the MCA_ERR_SRC_LOG to see if this is the CPU
+ // that caused the IERR
+ uint32_t mcaErrSrcLog = 0;
+- if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
+- &cc) != PECI_CC_SUCCESS)
++ peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
++ (uint8_t*)&mcaErrSrcLog, &cc);
++ if (peciError(peciStatus, cc))
+ {
++ printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
+ continue;
+ }
+ // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
+@@ -616,9 +634,10 @@ static bool checkIERRCPUs()
+ // Next check if it's a CPU/VR mismatch by reading the
+ // IA32_MC4_STATUS MSR (0x411)
+ uint64_t mc4Status = 0;
+- if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
+- PECI_CC_SUCCESS)
++ peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
++ if (peciError(peciStatus, cc))
+ {
++ printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
+ continue;
+ }
+ // Check MSEC bits 31:24 for
+@@ -637,10 +656,13 @@ static bool checkIERRCPUs()
+ // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset
+ // 80h)
+ uint32_t coreFIVRErrLog = 0;
+- if (peci_RdPCIConfigLocal(
+- addr, 1, 30, 2, 0x80, sizeof(uint32_t),
+- (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS)
++ peciStatus = peci_RdPCIConfigLocal(
++ addr, 1, 30, 2, 0x80, sizeof(uint32_t),
++ (uint8_t*)&coreFIVRErrLog, &cc);
++ if (peciError(peciStatus, cc))
+ {
++ printPECIError("CORE_FIVR_ERR_LOG", addr, peciStatus,
++ cc);
+ continue;
+ }
+ if (coreFIVRErrLog)
+@@ -653,11 +675,13 @@ static bool checkIERRCPUs()
+ // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset
+ // 84h)
+ uint32_t uncoreFIVRErrLog = 0;
+- if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84,
+- sizeof(uint32_t),
+- (uint8_t*)&uncoreFIVRErrLog,
+- &cc) != PECI_CC_SUCCESS)
++ peciStatus = peci_RdPCIConfigLocal(
++ addr, 1, 30, 2, 0x84, sizeof(uint32_t),
++ (uint8_t*)&uncoreFIVRErrLog, &cc);
++ if (peciError(peciStatus, cc))
+ {
++ printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
++ cc);
+ continue;
+ }
+ if (uncoreFIVRErrLog)
+@@ -687,9 +711,11 @@ static bool checkIERRCPUs()
+ // First check the MCA_ERR_SRC_LOG to see if this is the CPU
+ // that caused the IERR
+ uint32_t mcaErrSrcLog = 0;
+- if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog,
+- &cc) != PECI_CC_SUCCESS)
++ peciStatus = peci_RdPkgConfig(addr, 0, 5, 4,
++ (uint8_t*)&mcaErrSrcLog, &cc);
++ if (peciError(peciStatus, cc))
+ {
++ printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc);
+ continue;
+ }
+ // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27)
+@@ -701,9 +727,10 @@ static bool checkIERRCPUs()
+ // Next check if it's a CPU/VR mismatch by reading the
+ // IA32_MC4_STATUS MSR (0x411)
+ uint64_t mc4Status = 0;
+- if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) !=
+- PECI_CC_SUCCESS)
++ peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc);
++ if (peciError(peciStatus, cc))
+ {
++ printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc);
+ continue;
+ }
+ // TODO: Update MSEC/MSCOD_31_24 check
+@@ -724,16 +751,22 @@ static bool checkIERRCPUs()
+ // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14)
+ uint32_t coreFIVRErrLog0 = 0;
+ uint32_t coreFIVRErrLog1 = 0;
+- if (peci_RdEndPointConfigPciLocal(
+- addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
+- (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS)
++ peciStatus = peci_RdEndPointConfigPciLocal(
++ addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t),
++ (uint8_t*)&coreFIVRErrLog0, &cc);
++ if (peciError(peciStatus, cc))
+ {
++ printPECIError("CORE_FIVR_ERR_LOG_0", addr, peciStatus,
++ cc);
+ continue;
+ }
+- if (peci_RdEndPointConfigPciLocal(
+- addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
+- (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS)
++ peciStatus = peci_RdEndPointConfigPciLocal(
++ addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t),
++ (uint8_t*)&coreFIVRErrLog1, &cc);
++ if (peciError(peciStatus, cc))
+ {
++ printPECIError("CORE_FIVR_ERR_LOG_1", addr, peciStatus,
++ cc);
+ continue;
+ }
+ if (coreFIVRErrLog0 || coreFIVRErrLog1)
+@@ -746,11 +779,13 @@ static bool checkIERRCPUs()
+ // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2
+ // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14)
+ uint32_t uncoreFIVRErrLog = 0;
+- if (peci_RdEndPointConfigPciLocal(
+- addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
+- (uint8_t*)&uncoreFIVRErrLog,
+- &cc) != PECI_CC_SUCCESS)
++ peciStatus = peci_RdEndPointConfigPciLocal(
++ addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t),
++ (uint8_t*)&uncoreFIVRErrLog, &cc);
++ if (peciError(peciStatus, cc))
+ {
++ printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus,
++ cc);
+ continue;
+ }
+ if (uncoreFIVRErrLog)
+@@ -1213,11 +1248,12 @@ static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
+ {
+ int errPinSts = (1 << errPin);
+ std::bitset<MAX_CPUS> errPinCPUs = 0;
+- for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
++ for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR;
+ cpu++, addr++)
+ {
+ if (peci_Ping(addr) == PECI_CC_SUCCESS)
+ {
++ EPECIStatus peciStatus = PECI_CC_SUCCESS;
+ uint8_t cc = 0;
+ CPUModel model{};
+ uint8_t stepping = 0;
+@@ -1234,12 +1270,16 @@ static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
+ // Check the ERRPINSTS to see if this is the CPU that caused
+ // the ERRx (B(0) D8 F0 offset 210h)
+ uint32_t errpinsts = 0;
+- if (peci_RdPCIConfigLocal(
+- addr, 0, 8, 0, 0x210, sizeof(uint32_t),
+- (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
++ peciStatus = peci_RdPCIConfigLocal(
++ addr, 0, 8, 0, 0x210, sizeof(uint32_t),
++ (uint8_t*)&errpinsts, &cc);
++ if (peciError(peciStatus, cc))
+ {
+- errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
++ printPECIError("ERRPINSTS", addr, peciStatus, cc);
++ continue;
+ }
++
++ errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
+ break;
+ }
+ case icx:
+@@ -1248,12 +1288,16 @@ static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin)
+ // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is
+ // accessed on PECI as bus 13)
+ uint32_t errpinsts = 0;
+- if (peci_RdEndPointConfigPciLocal(
+- addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
+- (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS)
++ peciStatus = peci_RdEndPointConfigPciLocal(
++ addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t),
++ (uint8_t*)&errpinsts, &cc);
++ if (peciError(peciStatus, cc))
+ {
+- errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
++ printPECIError("ERRPINSTS", addr, peciStatus, cc);
++ continue;
+ }
++
++ errPinCPUs[cpu] = (errpinsts & errPinSts) != 0;
+ break;
+ }
+ }
+--
+2.17.1
+