diff options
Diffstat (limited to 'drivers/accel')
-rw-r--r-- | drivers/accel/habanalabs/common/device.c | 27 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/firmware_if.c | 123 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/habanalabs.h | 15 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/habanalabs_drv.c | 37 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/habanalabs_ioctl.c | 55 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/hwmon.c | 4 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/memory.c | 7 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/mmu/mmu.c | 1 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/sysfs.c | 42 | ||||
-rw-r--r-- | drivers/accel/habanalabs/gaudi2/gaudi2.c | 74 | ||||
-rw-r--r-- | drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h | 13 | ||||
-rw-r--r-- | drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h | 1 | ||||
-rw-r--r-- | drivers/accel/ivpu/ivpu_hw_37xx.c | 12 | ||||
-rw-r--r-- | drivers/accel/qaic/mhi_controller.c | 15 | ||||
-rw-r--r-- | drivers/accel/qaic/qaic_data.c | 8 |
15 files changed, 277 insertions, 157 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 9711e8fc979d..a73bd4be94b1 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -853,6 +853,9 @@ static int device_early_init(struct hl_device *hdev) gaudi2_set_asic_funcs(hdev); strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name)); break; + case ASIC_GAUDI2C: + gaudi2_set_asic_funcs(hdev); + strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name)); break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", @@ -1041,18 +1044,21 @@ static bool is_pci_link_healthy(struct hl_device *hdev) return (vendor_id == PCI_VENDOR_ID_HABANALABS); } -static void hl_device_eq_heartbeat(struct hl_device *hdev) +static int hl_device_eq_heartbeat_check(struct hl_device *hdev) { - u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE; struct asic_fixed_properties *prop = &hdev->asic_prop; if (!prop->cpucp_info.eq_health_check_supported) - return; + return 0; - if (hdev->eq_heartbeat_received) + if (hdev->eq_heartbeat_received) { hdev->eq_heartbeat_received = false; - else - hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask); + } else { + dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); + return -EIO; + } + + return 0; } static void hl_device_heartbeat(struct work_struct *work) @@ -1069,10 +1075,9 @@ static void hl_device_heartbeat(struct work_struct *work) /* * For EQ health check need to check if driver received the heartbeat eq event * in order to validate the eq is working. + * Only if both the EQ is healthy and we managed to send the next heartbeat reschedule. */ - hl_device_eq_heartbeat(hdev); - - if (!hdev->asic_funcs->send_heartbeat(hdev)) + if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev))) goto reschedule; if (hl_device_operational(hdev, NULL)) @@ -2035,7 +2040,7 @@ device_reset: if (ctx) hl_ctx_put(ctx); - return hl_device_reset(hdev, flags); + return hl_device_reset(hdev, flags | HL_DRV_RESET_HARD); } static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask) @@ -2044,7 +2049,7 @@ static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 notifier_event->events_mask |= event_mask; if (notifier_event->eventfd) - eventfd_signal(notifier_event->eventfd, 1); + eventfd_signal(notifier_event->eventfd); mutex_unlock(¬ifier_event->lock); } diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 47e8384134aa..3558a6a8e192 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -646,39 +646,27 @@ int hl_fw_send_heartbeat(struct hl_device *hdev) return rc; } -static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, - u32 sts_val) +static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, u32 sts_val) { bool err_exists = false; if (!(err_val & CPU_BOOT_ERR0_ENABLED)) return false; - if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) { - dev_err(hdev->dev, - "Device boot error - DRAM initialization failed\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) + dev_err(hdev->dev, "Device boot error - DRAM initialization failed\n"); - if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) { + if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) dev_err(hdev->dev, "Device boot error - FIT image corrupted\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) { - dev_err(hdev->dev, - "Device boot error - Thermal Sensor initialization failed\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) + dev_err(hdev->dev, "Device boot error - Thermal Sensor initialization failed\n"); if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) { if (hdev->bmc_enable) { - dev_err(hdev->dev, - "Device boot error - Skipped waiting for BMC\n"); - err_exists = true; + dev_err(hdev->dev, "Device boot error - Skipped waiting for BMC\n"); } else { - dev_info(hdev->dev, - "Device boot message - Skipped waiting for BMC\n"); + dev_info(hdev->dev, "Device boot message - Skipped waiting for BMC\n"); /* This is an info so we don't want it to disable the * device */ @@ -686,48 +674,29 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, } } - if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) { - dev_err(hdev->dev, - "Device boot error - Serdes data from BMC not available\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) + dev_err(hdev->dev, "Device boot error - Serdes data from BMC not available\n"); - if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) { - dev_err(hdev->dev, - "Device boot error - NIC F/W initialization failed\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) + dev_err(hdev->dev, "Device boot error - NIC F/W initialization failed\n"); - if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) { - dev_err(hdev->dev, - "Device boot warning - security not ready\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) + dev_err(hdev->dev, "Device boot warning - security not ready\n"); - if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) { + if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) dev_err(hdev->dev, "Device boot error - security failure\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) { + if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) dev_err(hdev->dev, "Device boot error - eFuse failure\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) { + if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_PLL_FAIL) { + if (err_val & CPU_BOOT_ERR0_PLL_FAIL) dev_err(hdev->dev, "Device boot error - PLL failure\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL) { + if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL) dev_err(hdev->dev, "Device boot error - Failed to set threshold for temperature sensor\n"); - err_exists = true; - } if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) { /* Ignore this bit, don't prevent driver loading */ @@ -735,52 +704,32 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, err_val &= ~CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL; } - if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) { + if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) dev_err(hdev->dev, "Device boot error - binning failure\n"); - err_exists = true; - } if (sts_val & CPU_BOOT_DEV_STS0_ENABLED) dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val); + if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) + dev_err(hdev->dev, "Device boot warning - Skipped DRAM initialization\n"); + + if (err_val & CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) + dev_err(hdev->dev, "Device boot error - ARC memory scrub failed\n"); + + /* All warnings should go here in order not to reach the unknown error validation */ if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) { dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n"); err_exists = true; } - /* All warnings should go here in order not to reach the unknown error validation */ - if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) { - dev_warn(hdev->dev, - "Device boot warning - Skipped DRAM initialization\n"); - /* This is a warning so we don't want it to disable the - * device - */ - err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED; - } + if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) + dev_warn(hdev->dev, "Device boot warning - Failed to load preboot primary image\n"); - if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) { - dev_warn(hdev->dev, - "Device boot warning - Failed to load preboot primary image\n"); - /* This is a warning so we don't want it to disable the - * device as we have a secondary preboot image - */ - err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL; - } - - if (err_val & CPU_BOOT_ERR0_TPM_FAIL) { - dev_warn(hdev->dev, - "Device boot warning - TPM failure\n"); - /* This is a warning so we don't want it to disable the - * device - */ - err_val &= ~CPU_BOOT_ERR0_TPM_FAIL; - } + if (err_val & CPU_BOOT_ERR0_TPM_FAIL) + dev_warn(hdev->dev, "Device boot warning - TPM failure\n"); - if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) { - dev_err(hdev->dev, - "Device boot error - unknown ERR0 error 0x%08x\n", err_val); + if (err_val & CPU_BOOT_ERR_FATAL_MASK) err_exists = true; - } /* return error only if it's in the predefined mask */ if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) & @@ -3295,6 +3244,14 @@ int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_in HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC); } +int hl_fw_get_dev_info_signed(struct hl_device *hdev, + struct cpucp_dev_info_signed *dev_info_signed, u32 nonce) +{ + return hl_fw_get_sec_attest_data(hdev, CPUCP_PACKET_INFO_SIGNED_GET, dev_info_signed, + sizeof(struct cpucp_dev_info_signed), nonce, + HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC); +} + int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode, dma_addr_t buff, u32 *size) { diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 1655c101c705..2a900c9941fe 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -1262,6 +1262,7 @@ struct hl_dec { * @ASIC_GAUDI_SEC: Gaudi secured device (HL-2000). * @ASIC_GAUDI2: Gaudi2 device. * @ASIC_GAUDI2B: Gaudi2B device. + * @ASIC_GAUDI2C: Gaudi2C device. */ enum hl_asic_type { ASIC_INVALID, @@ -1270,6 +1271,7 @@ enum hl_asic_type { ASIC_GAUDI_SEC, ASIC_GAUDI2, ASIC_GAUDI2B, + ASIC_GAUDI2C, }; struct hl_cs_parser; @@ -3519,6 +3521,9 @@ struct hl_device { u8 heartbeat; }; +/* Retrieve PCI device name in case of a PCI device or dev name in simulator */ +#define HL_DEV_NAME(hdev) \ + ((hdev)->pdev ? dev_name(&(hdev)->pdev->dev) : "NA-DEVICE") /** * struct hl_cs_encaps_sig_handle - encapsulated signals handle structure @@ -3594,6 +3599,14 @@ static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major return false; } +static inline bool hl_is_fw_sw_ver_equal_or_greater(struct hl_device *hdev, u32 fw_sw_major, + u32 fw_sw_minor) +{ + return (hdev->fw_sw_major_ver > fw_sw_major || + (hdev->fw_sw_major_ver == fw_sw_major && + hdev->fw_sw_minor_ver >= fw_sw_minor)); +} + /* * Kernel module functions that can be accessed by entire module */ @@ -3954,6 +3967,8 @@ long hl_fw_get_max_power(struct hl_device *hdev); void hl_fw_set_max_power(struct hl_device *hdev); int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_info *sec_attest_info, u32 nonce); +int hl_fw_get_dev_info_signed(struct hl_device *hdev, + struct cpucp_dev_info_signed *dev_info_signed, u32 nonce); int hl_set_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long value); int hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value); int hl_set_power(struct hl_device *hdev, int sensor_index, u32 attr, long value); diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index 306a5bc9bf89..e542fd40e16c 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -141,6 +141,9 @@ static enum hl_asic_type get_asic_type(struct hl_device *hdev) case REV_ID_B: asic_type = ASIC_GAUDI2B; break; + case REV_ID_C: + asic_type = ASIC_GAUDI2C; + break; default: break; } @@ -670,6 +673,38 @@ static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev) return PCI_ERS_RESULT_RECOVERED; } +static void hl_pci_reset_prepare(struct pci_dev *pdev) +{ + struct hl_device *hdev; + + hdev = pci_get_drvdata(pdev); + if (!hdev) + return; + + hdev->disabled = true; +} + +static void hl_pci_reset_done(struct pci_dev *pdev) +{ + struct hl_device *hdev; + u32 flags; + + hdev = pci_get_drvdata(pdev); + if (!hdev) + return; + + /* + * Schedule a thread to trigger hard reset. + * The reason for this handler, is for rare cases where the driver is up + * and FLR occurs. This is valid only when working with no VM, so FW handles FLR + * and resets the device. FW will go back preboot stage, so driver needs to perform + * hard reset in order to load FW fit again. + */ + flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW; + + hl_device_reset(hdev, flags); +} + static const struct dev_pm_ops hl_pm_ops = { .suspend = hl_pmops_suspend, .resume = hl_pmops_resume, @@ -679,6 +714,8 @@ static const struct pci_error_handlers hl_pci_err_handler = { .error_detected = hl_pci_err_detected, .slot_reset = hl_pci_err_slot_reset, .resume = hl_pci_err_resume, + .reset_prepare = hl_pci_reset_prepare, + .reset_done = hl_pci_reset_done, }; static struct pci_driver hl_pci_driver = { diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 8ef36effb95b..1dd6e23172ca 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -19,6 +19,9 @@ #include <asm/msr.h> +/* make sure there is space for all the signed info */ +static_assert(sizeof(struct cpucp_info) <= SEC_DEV_INFO_BUF_SZ); + static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = { [HL_DEBUG_OP_ETR] = sizeof(struct hl_debug_params_etr), [HL_DEBUG_OP_ETF] = sizeof(struct hl_debug_params_etf), @@ -685,7 +688,7 @@ static int sec_attest_info(struct hl_fpriv *hpriv, struct hl_info_args *args) if (!sec_attest_info) return -ENOMEM; - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { rc = -ENOMEM; goto free_sec_attest_info; @@ -719,6 +722,53 @@ free_sec_attest_info: return rc; } +static int dev_info_signed(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct cpucp_dev_info_signed *dev_info_signed; + struct hl_info_signed *info; + u32 max_size = args->return_size; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + dev_info_signed = kzalloc(sizeof(*dev_info_signed), GFP_KERNEL); + if (!dev_info_signed) + return -ENOMEM; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto free_dev_info_signed; + } + + rc = hl_fw_get_dev_info_signed(hpriv->hdev, + dev_info_signed, args->sec_attest_nonce); + if (rc) + goto free_info; + + info->nonce = le32_to_cpu(dev_info_signed->nonce); + info->info_sig_len = dev_info_signed->info_sig_len; + info->pub_data_len = le16_to_cpu(dev_info_signed->pub_data_len); + info->certificate_len = le16_to_cpu(dev_info_signed->certificate_len); + info->dev_info_len = sizeof(struct cpucp_info); + memcpy(&info->info_sig, &dev_info_signed->info_sig, sizeof(info->info_sig)); + memcpy(&info->public_data, &dev_info_signed->public_data, sizeof(info->public_data)); + memcpy(&info->certificate, &dev_info_signed->certificate, sizeof(info->certificate)); + memcpy(&info->dev_info, &dev_info_signed->info, info->dev_info_len); + + rc = copy_to_user(out, info, min_t(size_t, max_size, sizeof(*info))) ? -EFAULT : 0; + +free_info: + kfree(info); +free_dev_info_signed: + kfree(dev_info_signed); + + return rc; +} + + static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args) { int rc; @@ -1089,6 +1139,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_FW_GENERIC_REQ: return send_fw_generic_request(hdev, args); + case HL_INFO_DEV_SIGNED: + return dev_info_signed(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -EINVAL; diff --git a/drivers/accel/habanalabs/common/hwmon.c b/drivers/accel/habanalabs/common/hwmon.c index 8598056216e7..1ee2ee07e9ed 100644 --- a/drivers/accel/habanalabs/common/hwmon.c +++ b/drivers/accel/habanalabs/common/hwmon.c @@ -578,10 +578,6 @@ int hl_get_temperature(struct hl_device *hdev, CPUCP_PKT_CTL_OPCODE_SHIFT); pkt.sensor_index = __cpu_to_le16(sensor_index); pkt.type = __cpu_to_le16(attr); - - dev_dbg(hdev->dev, "get temp, ctl 0x%x, sensor %d, type %d\n", - pkt.ctl, pkt.sensor_index, pkt.type); - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c index 0b8689fe0b64..3348ad12c237 100644 --- a/drivers/accel/habanalabs/common/memory.c +++ b/drivers/accel/habanalabs/common/memory.c @@ -955,8 +955,8 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr, (i + 1) == phys_pg_pack->npages); if (rc) { dev_err(hdev->dev, - "map failed for handle %u, npages: %llu, mapped: %llu", - phys_pg_pack->handle, phys_pg_pack->npages, + "map failed (%d) for handle %u, npages: %llu, mapped: %llu\n", + rc, phys_pg_pack->handle, phys_pg_pack->npages, mapped_pg_cnt); goto err; } @@ -1186,7 +1186,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack); if (rc) { - dev_err(hdev->dev, "mapping page pack failed for handle %u\n", handle); + dev_err(hdev->dev, "mapping page pack failed (%d) for handle %u\n", + rc, handle); mutex_unlock(&hdev->mmu_lock); goto map_err; } diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c index b2145716c605..b654302a68fc 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu.c +++ b/drivers/accel/habanalabs/common/mmu/mmu.c @@ -596,6 +596,7 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev) break; case ASIC_GAUDI2: case ASIC_GAUDI2B: + case ASIC_GAUDI2C: /* MMUs in Gaudi2 are always host resident */ hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); break; diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c index 01f89f029355..8a9f98832157 100644 --- a/drivers/accel/habanalabs/common/sysfs.c +++ b/drivers/accel/habanalabs/common/sysfs.c @@ -8,6 +8,7 @@ #include "habanalabs.h" #include <linux/pci.h> +#include <linux/types.h> static ssize_t clk_max_freq_mhz_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -80,12 +81,27 @@ static ssize_t vrm_ver_show(struct device *dev, struct device_attribute *attr, c { struct hl_device *hdev = dev_get_drvdata(dev); struct cpucp_info *cpucp_info; + u32 infineon_second_stage_version; + u32 infineon_second_stage_first_instance; + u32 infineon_second_stage_second_instance; + u32 infineon_second_stage_third_instance; + u32 mask = 0xff; cpucp_info = &hdev->asic_prop.cpucp_info; + infineon_second_stage_version = le32_to_cpu(cpucp_info->infineon_second_stage_version); + infineon_second_stage_first_instance = infineon_second_stage_version & mask; + infineon_second_stage_second_instance = + (infineon_second_stage_version >> 8) & mask; + infineon_second_stage_third_instance = + (infineon_second_stage_version >> 16) & mask; + if (cpucp_info->infineon_second_stage_version) - return sprintf(buf, "%#04x %#04x\n", le32_to_cpu(cpucp_info->infineon_version), - le32_to_cpu(cpucp_info->infineon_second_stage_version)); + return sprintf(buf, "%#04x %#04x:%#04x:%#04x\n", + le32_to_cpu(cpucp_info->infineon_version), + infineon_second_stage_first_instance, + infineon_second_stage_second_instance, + infineon_second_stage_third_instance); else return sprintf(buf, "%#04x\n", le32_to_cpu(cpucp_info->infineon_version)); } @@ -251,6 +267,9 @@ static ssize_t device_type_show(struct device *dev, case ASIC_GAUDI2B: str = "GAUDI2B"; break; + case ASIC_GAUDI2C: + str = "GAUDI2C"; + break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", hdev->asic_type); @@ -383,6 +402,21 @@ static ssize_t security_enabled_show(struct device *dev, return sprintf(buf, "%d\n", hdev->asic_prop.fw_security_enabled); } +static ssize_t module_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", le32_to_cpu(hdev->asic_prop.cpucp_info.card_location)); +} + +static ssize_t parent_device_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", HL_DEV_NAME(hdev)); +} + static DEVICE_ATTR_RO(armcp_kernel_ver); static DEVICE_ATTR_RO(armcp_ver); static DEVICE_ATTR_RO(cpld_ver); @@ -402,6 +436,8 @@ static DEVICE_ATTR_RO(thermal_ver); static DEVICE_ATTR_RO(uboot_ver); static DEVICE_ATTR_RO(fw_os_ver); static DEVICE_ATTR_RO(security_enabled); +static DEVICE_ATTR_RO(module_id); +static DEVICE_ATTR_RO(parent_device); static struct bin_attribute bin_attr_eeprom = { .attr = {.name = "eeprom", .mode = (0444)}, @@ -427,6 +463,8 @@ static struct attribute *hl_dev_attrs[] = { &dev_attr_uboot_ver.attr, &dev_attr_fw_os_ver.attr, &dev_attr_security_enabled.attr, + &dev_attr_module_id.attr, + &dev_attr_parent_device.attr, NULL, }; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 819660c684cf..e0e5615ef9b0 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -7858,39 +7858,44 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, return !!ecc_data->is_critical; } -static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask) +static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u32 engine_id) { - u32 lo, hi, cq_ptr_size, arc_cq_ptr_size; - u64 cq_ptr, arc_cq_ptr, cp_current_inst; + struct undefined_opcode_info *undef_opcode = &hdev->captured_err_info.undef_opcode; + u64 cq_ptr, cp_current_inst; + u32 lo, hi, cq_size, cp_sts; + bool is_arc_cq; - lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET); - hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET); - cq_ptr = ((u64) hi) << 32 | lo; - cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET); + cp_sts = RREG32(qman_base + QM_CP_STS_4_OFFSET); + is_arc_cq = FIELD_GET(PDMA0_QM_CP_STS_CUR_CQ_MASK, cp_sts); /* 0 - legacy CQ, 1 - ARC_CQ */ - lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET); - hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET); - arc_cq_ptr = ((u64) hi) << 32 | lo; - arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET); + if (is_arc_cq) { + lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_STS_OFFSET); + hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_STS_OFFSET); + cq_ptr = ((u64) hi) << 32 | lo; + cq_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_STS_OFFSET); + } else { + lo = RREG32(qman_base + QM_CQ_PTR_LO_STS_4_OFFSET); + hi = RREG32(qman_base + QM_CQ_PTR_HI_STS_4_OFFSET); + cq_ptr = ((u64) hi) << 32 | lo; + cq_size = RREG32(qman_base + QM_CQ_TSIZE_STS_4_OFFSET); + } lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET); hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET); cp_current_inst = ((u64) hi) << 32 | lo; dev_info(hdev->dev, - "LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n", - cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst); + "LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#018llx}\n", + is_arc_cq ? "ARC_" : "", cq_ptr, cq_size, cp_current_inst); - if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) { - if (arc_cq_ptr) { - hdev->captured_err_info.undef_opcode.cq_addr = arc_cq_ptr; - hdev->captured_err_info.undef_opcode.cq_size = arc_cq_ptr_size; - } else { - hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr; - hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size; - } - - hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS; + if (undef_opcode->write_enable) { + memset(undef_opcode, 0, sizeof(*undef_opcode)); + undef_opcode->timestamp = ktime_get(); + undef_opcode->cq_addr = cq_ptr; + undef_opcode->cq_size = cq_size; + undef_opcode->engine_id = engine_id; + undef_opcode->stream_id = QMAN_STREAMS; + undef_opcode->write_enable = 0; } } @@ -7929,21 +7934,12 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type error_count++; } - if (i == QMAN_STREAMS && error_count) { - /* check for undefined opcode */ - if (glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK && - hdev->captured_err_info.undef_opcode.write_enable) { - memset(&hdev->captured_err_info.undef_opcode, 0, - sizeof(hdev->captured_err_info.undef_opcode)); - - hdev->captured_err_info.undef_opcode.write_enable = false; - hdev->captured_err_info.undef_opcode.timestamp = ktime_get(); - hdev->captured_err_info.undef_opcode.engine_id = - gaudi2_queue_id_to_engine_id[qid_base]; - *event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; - } - - handle_lower_qman_data_on_err(hdev, qman_base, *event_mask); + /* Check for undefined opcode error in lower QM */ + if ((i == QMAN_STREAMS) && + (glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK)) { + handle_lower_qman_data_on_err(hdev, qman_base, + gaudi2_queue_id_to_engine_id[qid_base]); + *event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; } } @@ -10007,6 +10003,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data); reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + if (hl_is_fw_sw_ver_equal_or_greater(hdev, 1, 13)) + is_critical = true; break; case GAUDI2_EVENT_PSOC59_RPM_ERROR_OR_DRAIN: diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h index a08378d0802b..d21fcd3880b4 100644 --- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h +++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h @@ -242,14 +242,15 @@ #define QM_FENCE2_OFFSET (mmPDMA0_QM_CP_FENCE2_RDATA_0 - mmPDMA0_QM_BASE) #define QM_SEI_STATUS_OFFSET (mmPDMA0_QM_SEI_STATUS - mmPDMA0_QM_BASE) -#define QM_CQ_PTR_LO_4_OFFSET (mmPDMA0_QM_CQ_PTR_LO_4 - mmPDMA0_QM_BASE) -#define QM_CQ_PTR_HI_4_OFFSET (mmPDMA0_QM_CQ_PTR_HI_4 - mmPDMA0_QM_BASE) -#define QM_CQ_TSIZE_4_OFFSET (mmPDMA0_QM_CQ_TSIZE_4 - mmPDMA0_QM_BASE) +#define QM_CQ_TSIZE_STS_4_OFFSET (mmPDMA0_QM_CQ_TSIZE_STS_4 - mmPDMA0_QM_BASE) +#define QM_CQ_PTR_LO_STS_4_OFFSET (mmPDMA0_QM_CQ_PTR_LO_STS_4 - mmPDMA0_QM_BASE) +#define QM_CQ_PTR_HI_STS_4_OFFSET (mmPDMA0_QM_CQ_PTR_HI_STS_4 - mmPDMA0_QM_BASE) -#define QM_ARC_CQ_PTR_LO_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_LO - mmPDMA0_QM_BASE) -#define QM_ARC_CQ_PTR_HI_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE) -#define QM_ARC_CQ_TSIZE_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE) +#define QM_ARC_CQ_TSIZE_STS_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE_STS - mmPDMA0_QM_BASE) +#define QM_ARC_CQ_PTR_LO_STS_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_LO_STS - mmPDMA0_QM_BASE) +#define QM_ARC_CQ_PTR_HI_STS_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI_STS - mmPDMA0_QM_BASE) +#define QM_CP_STS_4_OFFSET (mmPDMA0_QM_CP_STS_4 - mmPDMA0_QM_BASE) #define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE) #define QM_CP_CURRENT_INST_HI_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE) diff --git a/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h b/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h index f5d497dc9bdc..4f951cada077 100644 --- a/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h +++ b/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h @@ -25,6 +25,7 @@ enum hl_revision_id { REV_ID_INVALID = 0x00, REV_ID_A = 0x01, REV_ID_B = 0x02, + REV_ID_C = 0x03 }; #endif /* INCLUDE_PCI_GENERAL_H_ */ diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c index e33dfe3089af..574cdeefb66b 100644 --- a/drivers/accel/ivpu/ivpu_hw_37xx.c +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c @@ -54,10 +54,12 @@ #define ICB_0_1_IRQ_MASK ((((u64)ICB_1_IRQ_MASK) << 32) | ICB_0_IRQ_MASK) -#define BUTTRESS_IRQ_MASK ((REG_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, FREQ_CHANGE)) | \ - (REG_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, ATS_ERR)) | \ +#define BUTTRESS_IRQ_MASK ((REG_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, ATS_ERR)) | \ (REG_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, UFI_ERR))) +#define BUTTRESS_ALL_IRQ_MASK (BUTTRESS_IRQ_MASK | \ + (REG_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, FREQ_CHANGE))) + #define BUTTRESS_IRQ_ENABLE_MASK ((u32)~BUTTRESS_IRQ_MASK) #define BUTTRESS_IRQ_DISABLE_MASK ((u32)-1) @@ -75,8 +77,12 @@ static void ivpu_hw_wa_init(struct ivpu_device *vdev) vdev->wa.clear_runtime_mem = false; vdev->wa.d3hot_after_power_off = true; - if (ivpu_device_id(vdev) == PCI_DEVICE_ID_MTL && ivpu_revision(vdev) < 4) + REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, BUTTRESS_ALL_IRQ_MASK); + if (REGB_RD32(VPU_37XX_BUTTRESS_INTERRUPT_STAT) == BUTTRESS_ALL_IRQ_MASK) { + /* Writing 1s does not clear the interrupt status register */ vdev->wa.interrupt_clear_with_0 = true; + REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, 0x0); + } IVPU_PRINT_WA(punit_disabled); IVPU_PRINT_WA(clear_runtime_mem); diff --git a/drivers/accel/qaic/mhi_controller.c b/drivers/accel/qaic/mhi_controller.c index 364eede0ac02..51cb85d0387b 100644 --- a/drivers/accel/qaic/mhi_controller.c +++ b/drivers/accel/qaic/mhi_controller.c @@ -436,8 +436,21 @@ static struct mhi_controller_config aic100_config = { static int mhi_read_reg(struct mhi_controller *mhi_cntrl, void __iomem *addr, u32 *out) { - u32 tmp = readl_relaxed(addr); + u32 tmp; + /* + * SOC_HW_VERSION quirk + * The SOC_HW_VERSION register (offset 0x224) is not reliable and + * may contain uninitialized values, including 0xFFFFFFFF. This could + * cause a false positive link down error. Instead, intercept any + * reads and provide the correct value of the register. + */ + if (addr - mhi_cntrl->regs == 0x224) { + *out = 0x60110200; + return 0; + } + + tmp = readl_relaxed(addr); if (tmp == U32_MAX) return -EIO; diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index f88d925c8001..2459fe4a3f95 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -457,7 +457,7 @@ static int create_sgt(struct qaic_device *qdev, struct sg_table **sgt_out, u64 s * later */ buf_extra = (PAGE_SIZE - size % PAGE_SIZE) % PAGE_SIZE; - max_order = min(MAX_ORDER, get_order(size)); + max_order = min(MAX_PAGE_ORDER, get_order(size)); } else { /* allocate a single page for book keeping */ nr_pages = 1; @@ -787,7 +787,6 @@ struct drm_gem_object *qaic_gem_prime_import(struct drm_device *dev, struct dma_ struct dma_buf_attachment *attach; struct drm_gem_object *obj; struct qaic_bo *bo; - size_t size; int ret; bo = qaic_alloc_init_bo(); @@ -805,13 +804,12 @@ struct drm_gem_object *qaic_gem_prime_import(struct drm_device *dev, struct dma_ goto attach_fail; } - size = PAGE_ALIGN(attach->dmabuf->size); - if (size == 0) { + if (!attach->dmabuf->size) { ret = -EINVAL; goto size_align_fail; } - drm_gem_private_object_init(dev, obj, size); + drm_gem_private_object_init(dev, obj, attach->dmabuf->size); /* * skipping dma_buf_map_attachment() as we do not know the direction * just yet. Once the direction is known in the subsequent IOCTL to |