summaryrefslogtreecommitdiff
path: root/drivers/pci/pcie/aer/aerdrv_core.c
diff options
context:
space:
mode:
authorOza Pawandeep <poza@codeaurora.org>2018-05-18 00:44:13 +0300
committerBjorn Helgaas <bhelgaas@google.com>2018-05-18 00:44:13 +0300
commit7e9084b36740b2ec263ea35efb203001f755e1d8 (patch)
tree76c7d1d5b86834e01167e80192ad443df621d105 /drivers/pci/pcie/aer/aerdrv_core.c
parent9f5a70f18c5893a30d6c339adc48de43c57dd7e2 (diff)
downloadlinux-7e9084b36740b2ec263ea35efb203001f755e1d8.tar.xz
PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices
PCIe ERR_FATAL errors mean the Link is unreliable. Components on the Link may need to be reset to return to reliable operation (PCIe r4.0, sec 6.2.2). We previously handled these errors much differently depending on whether the platform supports Downstream Port Containment (DPC) (PCIe r4.0, sec 6.2.10) or not. The AER driver has historically logged the error details, called driver-supplied pci_error_handlers callbacks, and reset the Link. This reset downstream devices, but did not remove them from the PCI subsystem, re-enumerate them, or call their driver .remove() or .probe() methods. DPC is different because the hardware automatically disables the Link when it detects ERR_FATAL, which resets downstream devices. There's no opportunity for pci_error_handlers callbacks before resetting the Link. The DPC driver removes affected devices (which calls their driver .remove() methods), brings the Link back up, and re-enumerates (which calls driver .probe() methods). Align AER ERR_FATAL handling with DPC by resetting the Link in software, skipping the driver pci_error_handlers callbacks, removing the devices from the PCI subsystem, and re-enumerating. The idea is that drivers and devices should see the same behavior for ERR_FATAL events, regardless of whether they're handled by AER or DPC. Here are the basic ERR_FATAL recovery steps, showing the previous AER behavior, the AER behavior after this patch, and the DPC behavior: AER AER DPC previous new behavior -------- --- -------- Log error yes yes yes (minimal) drv.error_detected() yes no no Reset Link yes yes yes drv.mmio_enabled() yes no no drv.slot_reset() yes no no drv.resume() yes no no Remove PCI devices no yes yes (calls drv.remove()) Re-enumerate no yes yes (calls drv.probe()) N.B. With DPC, the Link reset happens before the driver .remove() calls, while with AER, the reset happens *after* the .remove() calls. The goal is to eventually do the reset before .remove() for AER as well. Signed-off-by: Oza Pawandeep <poza@codeaurora.org> [bhelgaas: changelog, squash doc patch into this, remove unused "result_data"] Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Keith Busch <keith.busch@intel.com>
Diffstat (limited to 'drivers/pci/pcie/aer/aerdrv_core.c')
-rw-r--r--drivers/pci/pcie/aer/aerdrv_core.c89
1 files changed, 70 insertions, 19 deletions
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 0ea5acc40323..f64ebca09439 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/kfifo.h>
#include "aerdrv.h"
+#include "../../pci.h"
#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \
PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
@@ -475,35 +476,81 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
}
/**
- * do_recovery - handle nonfatal/fatal error recovery process
+ * do_fatal_recovery - handle fatal error recovery process
* @dev: pointer to a pci_dev data structure of agent detecting an error
- * @severity: error severity type
*
- * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
+ * Invoked when an error is fatal. Once being invoked, removes the devices
+ * beneath this AER agent, followed by reset link e.g. secondary bus reset
+ * followed by re-enumeration of devices.
+ */
+static void do_fatal_recovery(struct pci_dev *dev)
+{
+ struct pci_dev *udev;
+ struct pci_bus *parent;
+ struct pci_dev *pdev, *temp;
+ pci_ers_result_t result;
+
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+ udev = dev;
+ else
+ udev = dev->bus->self;
+
+ parent = udev->subordinate;
+ pci_lock_rescan_remove();
+ list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
+ bus_list) {
+ pci_dev_get(pdev);
+ pci_dev_set_disconnected(pdev, NULL);
+ if (pci_has_subordinate(pdev))
+ pci_walk_bus(pdev->subordinate,
+ pci_dev_set_disconnected, NULL);
+ pci_stop_and_remove_bus_device(pdev);
+ pci_dev_put(pdev);
+ }
+
+ result = reset_link(udev);
+
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+ /*
+ * If the error is reported by a bridge, we think this error
+ * is related to the downstream link of the bridge, so we
+ * do error recovery on all subordinates of the bridge instead
+ * of the bridge and clear the error status of the bridge.
+ */
+ pci_cleanup_aer_uncorrect_error_status(dev);
+ }
+
+ if (result == PCI_ERS_RESULT_RECOVERED) {
+ if (pcie_wait_for_link(udev, true))
+ pci_rescan_bus(udev->bus);
+ } else {
+ pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
+ pci_info(dev, "AER: Device recovery from fatal error failed\n");
+ }
+
+ pci_unlock_rescan_remove();
+}
+
+/**
+ * do_nonfatal_recovery - handle nonfatal error recovery process
+ * @dev: pointer to a pci_dev data structure of agent detecting an error
+ *
+ * Invoked when an error is nonfatal. Once being invoked, broadcast
* error detected message to all downstream drivers within a hierarchy in
* question and return the returned code.
*/
-static void do_recovery(struct pci_dev *dev, int severity)
+static void do_nonfatal_recovery(struct pci_dev *dev)
{
- pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED;
+ pci_ers_result_t status;
enum pci_channel_state state;
- if (severity == AER_FATAL)
- state = pci_channel_io_frozen;
- else
- state = pci_channel_io_normal;
+ state = pci_channel_io_normal;
status = broadcast_error_message(dev,
state,
"error_detected",
report_error_detected);
- if (severity == AER_FATAL) {
- result = reset_link(dev);
- if (result != PCI_ERS_RESULT_RECOVERED)
- goto failed;
- }
-
if (status == PCI_ERS_RESULT_CAN_RECOVER)
status = broadcast_error_message(dev,
state,
@@ -562,8 +609,10 @@ static void handle_error_source(struct pcie_device *aerdev,
if (pos)
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
- } else
- do_recovery(dev, info->severity);
+ } else if (info->severity == AER_NONFATAL)
+ do_nonfatal_recovery(dev);
+ else if (info->severity == AER_FATAL)
+ do_fatal_recovery(dev);
}
#ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -627,8 +676,10 @@ static void aer_recover_work_func(struct work_struct *work)
continue;
}
cper_print_aer(pdev, entry.severity, entry.regs);
- if (entry.severity != AER_CORRECTABLE)
- do_recovery(pdev, entry.severity);
+ if (entry.severity == AER_NONFATAL)
+ do_nonfatal_recovery(pdev);
+ else if (entry.severity == AER_FATAL)
+ do_fatal_recovery(pdev);
pci_dev_put(pdev);
}
}