From 5e69a33c5cec019dd8ca46e31749c6dc78f7cbf3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jun 2022 09:40:24 +0200 Subject: PCI/ERR: Recognize disconnected devices in report_error_detected() When a device is already unplugged by pciehp by the time the AER handler is invoked, the PCIe device will already be in the pci_channel_io_perm_failure state. In that case simply return PCI_ERS_RESULT_DISCONNECT instead of trying to do a state transition that will fail. Also untangle the state transition failure from the lack of methods to improve the debugging output in case it happens again. Link: https://lore.kernel.org/r/20220601074024.3481035-1-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/pcie/err.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 0c5a143025af..59c90d04a609 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -55,10 +55,14 @@ static int report_error_detected(struct pci_dev *dev, device_lock(&dev->dev); pdrv = dev->driver; - if (!pci_dev_set_io_state(dev, state) || - !pdrv || - !pdrv->err_handler || - !pdrv->err_handler->error_detected) { + if (pci_dev_is_disconnected(dev)) { + vote = PCI_ERS_RESULT_DISCONNECT; + } else if (!pci_dev_set_io_state(dev, state)) { + pci_info(dev, "can't recover (state transition %u -> %u invalid)\n", + dev->error_state, state); + vote = PCI_ERS_RESULT_NONE; + } else if (!pdrv || !pdrv->err_handler || + !pdrv->err_handler->error_detected) { /* * If any device in the subtree does not have an error_detected * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent -- cgit v1.2.3 From 6cd514e58f12b211d638dbf6f791fa18d854f09c Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 17 May 2022 12:37:38 +0800 Subject: PCI: Clear PCI_STATUS when setting up device We are seeing Master Abort bit is set on Intel I350 ethernet device and its root port right after boot, probably happened during BIOS phase: 00:06.0 PCI bridge [0604]: Intel Corporation Device [8086:464d] (rev 05) (prog-if 00 [Normal decode]) Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- SERR- TAbort- SERR- Signed-off-by: Bjorn Helgaas --- drivers/pci/probe.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/pci') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 17a969942d37..414f659dc873 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1890,6 +1890,9 @@ int pci_setup_device(struct pci_dev *dev) dev->broken_intx_masking = pci_intx_mask_broken(dev); + /* Clear errors left from system firmware */ + pci_write_config_word(dev, PCI_STATUS, 0xffff); + switch (dev->hdr_type) { /* header type */ case PCI_HEADER_TYPE_NORMAL: /* standard header */ if (class == PCI_CLASS_BRIDGE_PCI) -- cgit v1.2.3 From 9ffb98f144ebdc9345f3b34fac35c4ff12ae4e50 Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Tue, 25 Jan 2022 08:18:18 +0100 Subject: PCI/AER: Configure ECRC for every device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move pcie_set_ecrc_checking() to pci_aer_init() to make sure that pcie_set_ecrc_checking() is called for each PCIe device, including hot-added devices. Link: https://lore.kernel.org/r/20220125071820.2247260-2-sr@denx.de Signed-off-by: Stefan Roese Signed-off-by: Bjorn Helgaas Reviewed-by: Pali Rohár Cc: Bharat Kumar Gogada Cc: Michal Simek Cc: Yao Hongbo Cc: Naveen Naidu --- drivers/pci/pcie/aer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 7952e5efd6cf..eb665aca08f2 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -392,6 +392,8 @@ void pci_aer_init(struct pci_dev *dev) pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_ERR, sizeof(u32) * n); pci_aer_clear_status(dev); + + pcie_set_ecrc_checking(dev); } void pci_aer_exit(struct pci_dev *dev) @@ -1228,9 +1230,6 @@ static int set_device_error_reporting(struct pci_dev *dev, void *data) pci_disable_pcie_error_reporting(dev); } - if (enable) - pcie_set_ecrc_checking(dev); - return 0; } -- cgit v1.2.3 From 8795e182b02dc87e343c79e73af6b8b7f9c5e635 Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Tue, 25 Jan 2022 08:18:19 +0100 Subject: PCI/portdrv: Don't disable AER reporting in get_port_device_capability() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AER reporting is currently disabled in the DevCtl registers of all non Root Port PCIe devices on systems using pcie_ports_native || host->native_aer, disabling AER completely in such systems. This is because 2bd50dd800b5 ("PCI: PCIe: Disable PCIe port services during port initialization"), added a call to pci_disable_pcie_error_reporting() *after* the AER setup was completed for the PCIe device tree. Here a longer analysis about the current status of AER enabling / disabling upon bootup provided by Bjorn: pcie_portdrv_probe pcie_port_device_register get_port_device_capability pci_disable_pcie_error_reporting clear CERE NFERE FERE URRE # <-- disable for RP USP DSP pcie_device_init device_register # new AER service device aer_probe aer_enable_rootport # RP only set_downstream_devices_error_reporting set_device_error_reporting # self (RP) if (RP || USP || DSP) pci_enable_pcie_error_reporting set CERE NFERE FERE URRE # <-- enable for RP pci_walk_bus set_device_error_reporting if (RP || USP || DSP) pci_enable_pcie_error_reporting set CERE NFERE FERE URRE # <-- enable for USP DSP In a typical Root Port -> Endpoint hierarchy, the above: - Disables Error Reporting for the Root Port, - Enables Error Reporting for the Root Port, - Does NOT enable Error Reporting for the Endpoint because it is not a Root Port or Switch Port. In a deeper Root Port -> Upstream Switch Port -> Downstream Switch Port -> Endpoint hierarchy: - Disables Error Reporting for the Root Port, - Enables Error Reporting for the Root Port, - Enables Error Reporting for both Switch Ports, - Does NOT enable Error Reporting for the Endpoint because it is not a Root Port or Switch Port, - Disables Error Reporting for the Switch Ports when pcie_portdrv_probe() claims them. AER does not re-enable it because these are not Root Ports. Remove this call to pci_disable_pcie_error_reporting() from get_port_device_capability(), leaving the already enabled AER configuration intact. With this change, AER is enabled in the Root Port and the PCIe switch upstream and downstream ports. Only the PCIe Endpoints don't have AER enabled yet. A follow-up patch will take care of this Endpoint enabling. Fixes: 2bd50dd800b5 ("PCI: PCIe: Disable PCIe port services during port initialization") Link: https://lore.kernel.org/r/20220125071820.2247260-3-sr@denx.de Signed-off-by: Stefan Roese Signed-off-by: Bjorn Helgaas Reviewed-by: Pali Rohár Cc: Rafael J. Wysocki Cc: Bharat Kumar Gogada Cc: Michal Simek Cc: Yao Hongbo Cc: Naveen Naidu --- drivers/pci/pcie/portdrv_core.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 604feeb84ee4..1ac7fec47d6f 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -222,15 +222,8 @@ static int get_port_device_capability(struct pci_dev *dev) #ifdef CONFIG_PCIEAER if (dev->aer_cap && pci_aer_available() && - (pcie_ports_native || host->native_aer)) { + (pcie_ports_native || host->native_aer)) services |= PCIE_PORT_SERVICE_AER; - - /* - * Disable AER on this port in case it's been enabled by the - * BIOS (the AER service driver will enable it when necessary). - */ - pci_disable_pcie_error_reporting(dev); - } #endif /* Root Ports and Root Complex Event Collectors may generate PMEs */ -- cgit v1.2.3 From f26e58bf6f547031f91a1b0e39b9308d48a4ba8c Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Tue, 25 Jan 2022 08:18:20 +0100 Subject: PCI/AER: Enable error reporting when AER is native MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we have native control of AER, set the following error reporting enable bits: - Correctable Error Reporting Enable - Non-Fatal Error Reporting Enable - Fatal Error Reporting Enable - Unsupported Request Reporting Enable Note that these bits are all in the Device Control register and are not AER-specific. This affects all devices with an AER capability, including hot-added devices. Please note that this change is quite invasive, as error reporting now will be enabled for all available PCIe Endpoints, which was previously not the case. When "pci=noaer" is selected, error reporting stays disabled of course. [bhelgaas: commit log, note error reporting is not AER-specific] Link: https://lore.kernel.org/r/20220125071820.2247260-4-sr@denx.de Signed-off-by: Stefan Roese Signed-off-by: Bjorn Helgaas Reviewed-by: Pali Rohár Cc: Bharat Kumar Gogada Cc: Michal Simek Cc: Yao Hongbo Cc: Naveen Naidu --- drivers/pci/pcie/aer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/pci') diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index eb665aca08f2..8cfdd46b774e 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -393,6 +393,9 @@ void pci_aer_init(struct pci_dev *dev) pci_aer_clear_status(dev); + if (pci_aer_available()) + pci_enable_pcie_error_reporting(dev); + pcie_set_ecrc_checking(dev); } -- cgit v1.2.3 From 5e6ae050955b566484f3cc6a66e3925eae87a0ed Mon Sep 17 00:00:00 2001 From: Mohamed Khalfella Date: Mon, 9 May 2022 18:14:41 +0000 Subject: PCI/AER: Iterate over error counters instead of error strings Previously we iterated over AER stat *names*, e.g., aer_correctable_error_string[32], but the actual stat *counters* may not be that large, e.g., pdev->aer_stats->dev_cor_errs[16], which means that we printed junk in the sysfs stats files. Iterate over the stat counter arrays instead of the names to avoid this junk. Also, added a build time check to make sure all counters have entries in strings array. Fixes: 0678e3109a3c ("PCI/AER: Simplify __aer_print_error()") Link: https://lore.kernel.org/r/20220509181441.31884-1-mkhalfella@purestorage.com Reported-by: Meeta Saggi Signed-off-by: Mohamed Khalfella Signed-off-by: Bjorn Helgaas Reviewed-by: Meeta Saggi Reviewed-by: Eric Badger Cc: stable@vger.kernel.org --- drivers/pci/pcie/aer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/pci') diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 8cfdd46b774e..e2d8a74f83c3 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -543,7 +543,7 @@ static const char *aer_agent_string[] = { u64 *stats = pdev->aer_stats->stats_array; \ size_t len = 0; \ \ - for (i = 0; i < ARRAY_SIZE(strings_array); i++) { \ + for (i = 0; i < ARRAY_SIZE(pdev->aer_stats->stats_array); i++) {\ if (strings_array[i]) \ len += sysfs_emit_at(buf, len, "%s %llu\n", \ strings_array[i], \ @@ -1349,6 +1349,11 @@ static int aer_probe(struct pcie_device *dev) struct device *device = &dev->device; struct pci_dev *port = dev->port; + BUILD_BUG_ON(ARRAY_SIZE(aer_correctable_error_string) < + AER_MAX_TYPEOF_COR_ERRS); + BUILD_BUG_ON(ARRAY_SIZE(aer_uncorrectable_error_string) < + AER_MAX_TYPEOF_UNCOR_ERRS); + /* Limit to Root Ports or Root Complex Event Collectors */ if ((pci_pcie_type(port) != PCI_EXP_TYPE_RC_EC) && (pci_pcie_type(port) != PCI_EXP_TYPE_ROOT_PORT)) -- cgit v1.2.3