summaryrefslogtreecommitdiff
path: root/meta-openbmc-mods/meta-wht/recipes-core/host-error-monitor/host-error-monitor/0002-Add-a-workaround-for-spurious-CPU-errors.patch
blob: 2a573311f18182c7b62314661685cdd8ef10e779 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
From d0e4130b2d1e0e44efc8fd6e180487853625edd6 Mon Sep 17 00:00:00 2001
From: "Jason M. Bills" <jason.m.bills@intel.com>
Date: Mon, 17 Aug 2020 15:52:22 -0700
Subject: [PATCH] Add a workaround for spurious CPU errors

There is a possible issue where GPIO event interrupts are getting
missed causing false errors to be logged.

This adds a check that the host is still on and the error is still
asserted before logging an error.

Tested:
Confirmed that a spurious SMI event was ignored correctly after
this change.

Change-Id: Id83d9d67b15dcf9035e6448086b140e5c7dab4fe
Signed-off-by: Jason M. Bills <jason.m.bills@intel.com>
---
 src/host_error_monitor.cpp | 77 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/src/host_error_monitor.cpp b/src/host_error_monitor.cpp
index ca089f70d..fd453ccdc 100644
--- a/src/host_error_monitor.cpp
+++ b/src/host_error_monitor.cpp
@@ -797,6 +797,18 @@ static void caterrAssertHandler()
             }
             return;
         }
+        // Confirm that this is a real failure by checking that the host is on
+        if (hostOff)
+        {
+            return;
+        }
+        // And that the signal is still asserted
+        if (caterrLine.get_value() != 0)
+        {
+            std::cerr
+                << "CPU_CATERR not asserted after timeout. Error ignored.\n";
+            return;
+        }
         std::cerr << "CATERR asserted for " << std::to_string(caterrTimeoutMs)
                   << " ms\n";
         beep(beepCPUIERR);
@@ -1270,6 +1282,48 @@ static void errXAssertHandler(const int errPin,
             }
             return;
         }
+        // Confirm that this is a real failure by checking that the host is on
+        if (hostOff)
+        {
+            return;
+        }
+        // And that the signal is still asserted
+        switch (errPin)
+        {
+            case 0:
+            {
+                if (err0Line.get_value() != 0)
+                {
+                    std::cerr << "CPU_ERR0 not asserted after timeout. Error "
+                                 "ignored.\n";
+                    return;
+                }
+                break;
+            }
+            case 1:
+            {
+                if (err1Line.get_value() != 0)
+                {
+                    std::cerr << "CPU_ERR1 not asserted after timeout. Error "
+                                 "ignored.\n";
+                    return;
+                }
+                break;
+            }
+            case 2:
+            {
+                if (err2Line.get_value() != 0)
+                {
+                    std::cerr << "CPU_ERR2 not asserted after timeout. Error "
+                                 "ignored.\n";
+                    return;
+                }
+                break;
+            }
+            default:
+                std::cerr << "Invalid ERR pin asserted\n";
+                return;
+        }
         std::cerr << "ERR" << std::to_string(errPin) << " asserted for "
                   << std::to_string(errTimeoutMs) << " ms\n";
         if (errPinCPUs.count())
@@ -1379,6 +1433,18 @@ static void err2AssertHandler()
             }
             return;
         }
+        // Confirm that this is a real failure by checking that the host is on
+        if (hostOff)
+        {
+            return;
+        }
+        // And that the signal is still asserted
+        if (err2Line.get_value() != 0)
+        {
+            std::cerr
+                << "CPU_ERR2 not asserted after timeout. Error ignored.\n";
+            return;
+        }
         conn->async_method_call(
             [](boost::system::error_code ec,
                const std::variant<bool>& property) {
@@ -1447,6 +1513,17 @@ static void smiAssertHandler()
             }
             return;
         }
+        // Confirm that this is a real failure by checking that the host is on
+        if (hostOff)
+        {
+            return;
+        }
+        // And that the signal is still asserted
+        if (smiLine.get_value() != 0)
+        {
+            std::cerr << "SMI not asserted after timeout. Error ignored.\n";
+            return;
+        }
         std::cerr << "SMI asserted for " << std::to_string(smiTimeoutMs)
                   << " ms\n";
         smiTimeoutLog();
-- 
2.17.1