diff options
Diffstat (limited to 'meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch')
-rw-r--r-- | meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch | 251 |
1 files changed, 251 insertions, 0 deletions
diff --git a/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch new file mode 100644 index 000000000..c94fce70a --- /dev/null +++ b/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0005-Improve-PECI-error-checking-and-reporting.patch @@ -0,0 +1,251 @@ +From 5e0de05dc29bdb065566f33670e68c903ab51e98 Mon Sep 17 00:00:00 2001 +From: "Jason M. Bills" <jason.m.bills@intel.com> +Date: Tue, 22 Sep 2020 15:09:49 -0700 +Subject: [PATCH] Improve PECI error checking and reporting + +It's possible for a PECI command to return successfully with an +error in the completion code. This change adds checks for the +PECI completion code and will print an error on failure. + +Tested: +Injected an IERR and confirmed that PECI data is correctly handled. + +Change-Id: I86fc0a99ab04dac4d8b38f9f7e0ee1eb6a39397d +Signed-off-by: Jason M. Bills <jason.m.bills@intel.com> +--- + src/host_error_monitor.cpp | 114 +++++++++++++++++++++++++------------ + 1 file changed, 79 insertions(+), 35 deletions(-) + +diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp +index 1b10cd38e..5e6d7a82c 100644 +--- a/src/host_error_monitor.cpp ++++ b/src/host_error_monitor.cpp +@@ -234,6 +234,21 @@ static void ssbThermTripLog() + "OpenBMC.0.1.SsbThermalTrip", NULL); + } + ++static inline bool peciError(EPECIStatus peciStatus, uint8_t cc) ++{ ++ return ( ++ peciStatus != PECI_CC_SUCCESS || ++ (cc != PECI_DEV_CC_SUCCESS && cc != PECI_DEV_CC_FATAL_MCA_DETECTED)); ++} ++ ++static void printPECIError(const std::string& reg, const size_t addr, ++ const EPECIStatus peciStatus, const size_t cc) ++{ ++ std::cerr << "Failed to read " << reg << " on CPU address " << addr ++ << ". Error: " << peciStatus << ": cc: 0x" << std::hex << cc ++ << "\n"; ++} ++ + static void initializeErrorState(); + static void initializeHostState() + { +@@ -583,9 +598,10 @@ static void incrementCPUErrorCount(int cpuNum) + static bool checkIERRCPUs() + { + bool cpuIERRFound = false; +- for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR; ++ for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR; + cpu++, addr++) + { ++ EPECIStatus peciStatus = PECI_CC_SUCCESS; + uint8_t cc = 0; + CPUModel model{}; + uint8_t stepping = 0; +@@ -602,9 +618,11 @@ static bool checkIERRCPUs() + // First check the MCA_ERR_SRC_LOG to see if this is the CPU + // that caused the IERR + uint32_t mcaErrSrcLog = 0; +- if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog, +- &cc) != PECI_CC_SUCCESS) ++ peciStatus = peci_RdPkgConfig(addr, 0, 5, 4, ++ (uint8_t*)&mcaErrSrcLog, &cc); ++ if (peciError(peciStatus, cc)) + { ++ printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc); + continue; + } + // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27) +@@ -616,9 +634,10 @@ static bool checkIERRCPUs() + // Next check if it's a CPU/VR mismatch by reading the + // IA32_MC4_STATUS MSR (0x411) + uint64_t mc4Status = 0; +- if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) != +- PECI_CC_SUCCESS) ++ peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc); ++ if (peciError(peciStatus, cc)) + { ++ printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc); + continue; + } + // Check MSEC bits 31:24 for +@@ -637,10 +656,13 @@ static bool checkIERRCPUs() + // non-zero value of CORE_FIVR_ERR_LOG (B(1) D30 F2 offset + // 80h) + uint32_t coreFIVRErrLog = 0; +- if (peci_RdPCIConfigLocal( +- addr, 1, 30, 2, 0x80, sizeof(uint32_t), +- (uint8_t*)&coreFIVRErrLog, &cc) != PECI_CC_SUCCESS) ++ peciStatus = peci_RdPCIConfigLocal( ++ addr, 1, 30, 2, 0x80, sizeof(uint32_t), ++ (uint8_t*)&coreFIVRErrLog, &cc); ++ if (peciError(peciStatus, cc)) + { ++ printPECIError("CORE_FIVR_ERR_LOG", addr, peciStatus, ++ cc); + continue; + } + if (coreFIVRErrLog) +@@ -653,11 +675,13 @@ static bool checkIERRCPUs() + // non-zero value of UNCORE_FIVR_ERR_LOG (B(1) D30 F2 offset + // 84h) + uint32_t uncoreFIVRErrLog = 0; +- if (peci_RdPCIConfigLocal(addr, 1, 30, 2, 0x84, +- sizeof(uint32_t), +- (uint8_t*)&uncoreFIVRErrLog, +- &cc) != PECI_CC_SUCCESS) ++ peciStatus = peci_RdPCIConfigLocal( ++ addr, 1, 30, 2, 0x84, sizeof(uint32_t), ++ (uint8_t*)&uncoreFIVRErrLog, &cc); ++ if (peciError(peciStatus, cc)) + { ++ printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus, ++ cc); + continue; + } + if (uncoreFIVRErrLog) +@@ -687,9 +711,11 @@ static bool checkIERRCPUs() + // First check the MCA_ERR_SRC_LOG to see if this is the CPU + // that caused the IERR + uint32_t mcaErrSrcLog = 0; +- if (peci_RdPkgConfig(addr, 0, 5, 4, (uint8_t*)&mcaErrSrcLog, +- &cc) != PECI_CC_SUCCESS) ++ peciStatus = peci_RdPkgConfig(addr, 0, 5, 4, ++ (uint8_t*)&mcaErrSrcLog, &cc); ++ if (peciError(peciStatus, cc)) + { ++ printPECIError("MCA_ERR_SRC_LOG", addr, peciStatus, cc); + continue; + } + // Check MSMI_INTERNAL (20) and IERR_INTERNAL (27) +@@ -701,9 +727,10 @@ static bool checkIERRCPUs() + // Next check if it's a CPU/VR mismatch by reading the + // IA32_MC4_STATUS MSR (0x411) + uint64_t mc4Status = 0; +- if (peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc) != +- PECI_CC_SUCCESS) ++ peciStatus = peci_RdIAMSR(addr, 0, 0x411, &mc4Status, &cc); ++ if (peciError(peciStatus, cc)) + { ++ printPECIError("IA32_MC4_STATUS", addr, peciStatus, cc); + continue; + } + // TODO: Update MSEC/MSCOD_31_24 check +@@ -724,16 +751,22 @@ static bool checkIERRCPUs() + // C0h and C4h) (Note: Bus 31 is accessed on PECI as bus 14) + uint32_t coreFIVRErrLog0 = 0; + uint32_t coreFIVRErrLog1 = 0; +- if (peci_RdEndPointConfigPciLocal( +- addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t), +- (uint8_t*)&coreFIVRErrLog0, &cc) != PECI_CC_SUCCESS) ++ peciStatus = peci_RdEndPointConfigPciLocal( ++ addr, 0, 14, 30, 2, 0xC0, sizeof(uint32_t), ++ (uint8_t*)&coreFIVRErrLog0, &cc); ++ if (peciError(peciStatus, cc)) + { ++ printPECIError("CORE_FIVR_ERR_LOG_0", addr, peciStatus, ++ cc); + continue; + } +- if (peci_RdEndPointConfigPciLocal( +- addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t), +- (uint8_t*)&coreFIVRErrLog1, &cc) != PECI_CC_SUCCESS) ++ peciStatus = peci_RdEndPointConfigPciLocal( ++ addr, 0, 14, 30, 2, 0xC4, sizeof(uint32_t), ++ (uint8_t*)&coreFIVRErrLog1, &cc); ++ if (peciError(peciStatus, cc)) + { ++ printPECIError("CORE_FIVR_ERR_LOG_1", addr, peciStatus, ++ cc); + continue; + } + if (coreFIVRErrLog0 || coreFIVRErrLog1) +@@ -746,11 +779,13 @@ static bool checkIERRCPUs() + // non-zero value of UNCORE_FIVR_ERR_LOG (B(31) D30 F2 + // offset 84h) (Note: Bus 31 is accessed on PECI as bus 14) + uint32_t uncoreFIVRErrLog = 0; +- if (peci_RdEndPointConfigPciLocal( +- addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t), +- (uint8_t*)&uncoreFIVRErrLog, +- &cc) != PECI_CC_SUCCESS) ++ peciStatus = peci_RdEndPointConfigPciLocal( ++ addr, 0, 14, 30, 2, 0x84, sizeof(uint32_t), ++ (uint8_t*)&uncoreFIVRErrLog, &cc); ++ if (peciError(peciStatus, cc)) + { ++ printPECIError("UNCORE_FIVR_ERR_LOG", addr, peciStatus, ++ cc); + continue; + } + if (uncoreFIVRErrLog) +@@ -1213,11 +1248,12 @@ static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin) + { + int errPinSts = (1 << errPin); + std::bitset<MAX_CPUS> errPinCPUs = 0; +- for (int cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR; ++ for (size_t cpu = 0, addr = MIN_CLIENT_ADDR; addr <= MAX_CLIENT_ADDR; + cpu++, addr++) + { + if (peci_Ping(addr) == PECI_CC_SUCCESS) + { ++ EPECIStatus peciStatus = PECI_CC_SUCCESS; + uint8_t cc = 0; + CPUModel model{}; + uint8_t stepping = 0; +@@ -1234,12 +1270,16 @@ static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin) + // Check the ERRPINSTS to see if this is the CPU that caused + // the ERRx (B(0) D8 F0 offset 210h) + uint32_t errpinsts = 0; +- if (peci_RdPCIConfigLocal( +- addr, 0, 8, 0, 0x210, sizeof(uint32_t), +- (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS) ++ peciStatus = peci_RdPCIConfigLocal( ++ addr, 0, 8, 0, 0x210, sizeof(uint32_t), ++ (uint8_t*)&errpinsts, &cc); ++ if (peciError(peciStatus, cc)) + { +- errPinCPUs[cpu] = (errpinsts & errPinSts) != 0; ++ printPECIError("ERRPINSTS", addr, peciStatus, cc); ++ continue; + } ++ ++ errPinCPUs[cpu] = (errpinsts & errPinSts) != 0; + break; + } + case icx: +@@ -1248,12 +1288,16 @@ static std::bitset<MAX_CPUS> checkERRPinCPUs(const int errPin) + // the ERRx (B(30) D0 F3 offset 274h) (Note: Bus 30 is + // accessed on PECI as bus 13) + uint32_t errpinsts = 0; +- if (peci_RdEndPointConfigPciLocal( +- addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t), +- (uint8_t*)&errpinsts, &cc) == PECI_CC_SUCCESS) ++ peciStatus = peci_RdEndPointConfigPciLocal( ++ addr, 0, 13, 0, 3, 0x274, sizeof(uint32_t), ++ (uint8_t*)&errpinsts, &cc); ++ if (peciError(peciStatus, cc)) + { +- errPinCPUs[cpu] = (errpinsts & errPinSts) != 0; ++ printPECIError("ERRPINSTS", addr, peciStatus, cc); ++ continue; + } ++ ++ errPinCPUs[cpu] = (errpinsts & errPinSts) != 0; + break; + } + } +-- +2.17.1 + |