From d95f87d29cf2097485765baac5109cecb486b5ee Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 11 Apr 2023 10:08:50 -0400 Subject: accel/habanalabs: remove variable gaudi_irq_name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc with W=1 reports drivers/accel/habanalabs/gaudi/gaudi.c:117:19: error: ‘gaudi_irq_name’ defined but not used [-Werror=unused-const-variable=] 117 | static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = { | ^~~~~~~~~~~~~~ This variable is not used so remove it. Signed-off-by: Tom Rix Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi/gaudi.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index a29aa8f7b6f3..a1697581c218 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -114,13 +114,6 @@ static u32 gaudi_stream_master[GAUDI_STREAM_MASTER_ARR_SIZE] = { GAUDI_QUEUE_ID_DMA_1_3 }; -static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = { - "gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3", - "gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3", - "gaudi cq 5_0", "gaudi cq 5_1", "gaudi cq 5_2", "gaudi cq 5_3", - "gaudi cpu eq" -}; - static const u8 gaudi_dma_assignment[GAUDI_DMA_MAX] = { [GAUDI_PCI_DMA_1] = GAUDI_ENGINE_ID_DMA_0, [GAUDI_PCI_DMA_2] = GAUDI_ENGINE_ID_DMA_1, -- cgit v1.2.3 From 1464fbd8bab958b771af3214bd5cb52cd5f612ed Mon Sep 17 00:00:00 2001 From: Tal Cohen Date: Thu, 30 Mar 2023 13:38:19 +0300 Subject: accel/habanalabs: ignore false positive razwi In Gaudi2 asic, PSOC RAZWI may cause in HBW or LBW. The address that caused the error is read from HW register and printed by the Driver. There are cases where the Driver receives an indication on PSOC RAZWI error but the address value is zero. In that case, the indication is a false positive. The Driver should not "count" a PSOC RAZWI event error when the caused the address is zeroed. Signed-off-by: Tal Cohen Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 43 ++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index b778cf764a68..6f05aa230376 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -8251,6 +8251,7 @@ static bool gaudi2_handle_psoc_razwi_happened(struct hl_device *hdev, u32 razwi_ u16 num_of_eng, eng_id[PSOC_RAZWI_MAX_ENG_PER_RTR]; char eng_name_str[PSOC_RAZWI_ENG_STR_SIZE]; bool razwi_happened = false; + u64 addr; int i; num_of_eng = gaudi2_psoc_razwi_get_engines(common_razwi_info, ARRAY_SIZE(common_razwi_info), @@ -8269,43 +8270,53 @@ static bool gaudi2_handle_psoc_razwi_happened(struct hl_device *hdev, u32 razwi_ if (RREG32(base[i] + DEC_RAZWI_HBW_AW_SET)) { addr_hi = RREG32(base[i] + DEC_RAZWI_HBW_AW_ADDR_HI); addr_lo = RREG32(base[i] + DEC_RAZWI_HBW_AW_ADDR_LO); - dev_err(hdev->dev, + addr = ((u64)addr_hi << 32) + addr_lo; + if (addr) { + dev_err(hdev->dev, "PSOC HBW AW RAZWI: %s, address (aligned to 128 byte): 0x%llX\n", - eng_name_str, ((u64)addr_hi << 32) + addr_lo); - hl_handle_razwi(hdev, ((u64)addr_hi << 32) + addr_lo, &eng_id[0], + eng_name_str, addr); + hl_handle_razwi(hdev, addr, &eng_id[0], num_of_eng, HL_RAZWI_HBW | HL_RAZWI_WRITE, event_mask); - razwi_happened = true; + razwi_happened = true; + } } if (RREG32(base[i] + DEC_RAZWI_HBW_AR_SET)) { addr_hi = RREG32(base[i] + DEC_RAZWI_HBW_AR_ADDR_HI); addr_lo = RREG32(base[i] + DEC_RAZWI_HBW_AR_ADDR_LO); - dev_err(hdev->dev, + addr = ((u64)addr_hi << 32) + addr_lo; + if (addr) { + dev_err(hdev->dev, "PSOC HBW AR RAZWI: %s, address (aligned to 128 byte): 0x%llX\n", - eng_name_str, ((u64)addr_hi << 32) + addr_lo); - hl_handle_razwi(hdev, ((u64)addr_hi << 32) + addr_lo, &eng_id[0], + eng_name_str, addr); + hl_handle_razwi(hdev, addr, &eng_id[0], num_of_eng, HL_RAZWI_HBW | HL_RAZWI_READ, event_mask); - razwi_happened = true; + razwi_happened = true; + } } if (RREG32(base[i] + DEC_RAZWI_LBW_AW_SET)) { addr_lo = RREG32(base[i] + DEC_RAZWI_LBW_AW_ADDR); - dev_err(hdev->dev, + if (addr_lo) { + dev_err(hdev->dev, "PSOC LBW AW RAZWI: %s, address (aligned to 128 byte): 0x%X\n", eng_name_str, addr_lo); - hl_handle_razwi(hdev, addr_lo, &eng_id[0], + hl_handle_razwi(hdev, addr_lo, &eng_id[0], num_of_eng, HL_RAZWI_LBW | HL_RAZWI_WRITE, event_mask); - razwi_happened = true; + razwi_happened = true; + } } if (RREG32(base[i] + DEC_RAZWI_LBW_AR_SET)) { addr_lo = RREG32(base[i] + DEC_RAZWI_LBW_AR_ADDR); - dev_err(hdev->dev, - "PSOC LBW AR RAZWI: %s, address (aligned to 128 byte): 0x%X\n", - eng_name_str, addr_lo); - hl_handle_razwi(hdev, addr_lo, &eng_id[0], + if (addr_lo) { + dev_err(hdev->dev, + "PSOC LBW AR RAZWI: %s, address (aligned to 128 byte): 0x%X\n", + eng_name_str, addr_lo); + hl_handle_razwi(hdev, addr_lo, &eng_id[0], num_of_eng, HL_RAZWI_LBW | HL_RAZWI_READ, event_mask); - razwi_happened = true; + razwi_happened = true; + } } /* In common case the loop will break, when there is only one engine id, or * several engines with the same router. The exceptional case is with psoc razwi -- cgit v1.2.3 From 9ce36082c1723d96603c667436a88866b3fde531 Mon Sep 17 00:00:00 2001 From: Rakesh Ughreja Date: Wed, 5 Apr 2023 09:12:29 +0300 Subject: accel/habanalabs: allow user to modify EDMA RL register EDMA transpose workload requires to signal for every activation. User FW sends all the dummy signals to RD_LBW_RATE_LIM_CFG, to save lbw bandwidth. We need the user to be able to access that register to configure it. Signed-off-by: Rakesh Ughreja Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2_security.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c index 694735f9e6e6..acd33130e7f9 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c @@ -479,6 +479,7 @@ static const u32 gaudi2_pb_dcr0_edma0_unsecured_regs[] = { mmDCORE0_EDMA0_CORE_CTX_TE_NUMROWS, mmDCORE0_EDMA0_CORE_CTX_IDX, mmDCORE0_EDMA0_CORE_CTX_IDX_INC, + mmDCORE0_EDMA0_CORE_RD_LBW_RATE_LIM_CFG, mmDCORE0_EDMA0_QM_CQ_CFG0_0, mmDCORE0_EDMA0_QM_CQ_CFG0_1, mmDCORE0_EDMA0_QM_CQ_CFG0_2, -- cgit v1.2.3 From 574ee40f514453d09ea2c5f3882b3b369b5ba027 Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Sun, 9 Apr 2023 14:24:05 +0300 Subject: accel/habanalabs: remove commented code that won't be used Once it was decided that these security settings are to be done by FW rather than by the driver, there's no reason to keep them in the code. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2_security.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c index acd33130e7f9..52a12c2bb58e 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c @@ -3443,15 +3443,6 @@ static int gaudi2_init_protection_bits(struct hl_device *hdev) ARRAY_SIZE(gaudi2_pb_thermal_sensor0), NULL, HL_PB_NA); } - /* HBM */ - /* Temporarily skip until SW-63348 is solved - * instance_offset = mmHBM1_MC0_BASE - mmHBM0_MC0_BASE; - * rc |= hl_init_pb_with_mask(hdev, HL_PB_SHARED, HL_PB_NA, GAUDI2_HBM_NUM, - * instance_offset, gaudi2_pb_hbm, - * ARRAY_SIZE(gaudi2_pb_hbm), NULL, HL_PB_NA, - * prop->dram_enabled_mask); - */ - /* Scheduler ARCs */ instance_offset = mmARC_FARM_ARC1_AUX_BASE - mmARC_FARM_ARC0_AUX_BASE; rc |= hl_init_pb_ranges(hdev, HL_PB_SHARED, HL_PB_NA, -- cgit v1.2.3 From f9b60242af3e2216ed03139bc2c14a02c54073ce Mon Sep 17 00:00:00 2001 From: Moti Haimovski Date: Mon, 10 Apr 2023 09:31:16 +0300 Subject: accel/habanalabs: fix bug in free scratchpad memory This commit fixes a bug in Gaudi2 when freeing the scratchpad memory in case software init fails. Signed-off-by: Moti Haimovski Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 6f05aa230376..acc304ebca82 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -3630,8 +3630,8 @@ static int gaudi2_sw_init(struct hl_device *hdev) special_blocks_free: gaudi2_special_blocks_iterator_free(hdev); free_scratchpad_mem: - hl_asic_dma_pool_free(hdev, gaudi2->scratchpad_kernel_address, - gaudi2->scratchpad_bus_address); + hl_asic_dma_free_coherent(hdev, PAGE_SIZE, gaudi2->scratchpad_kernel_address, + gaudi2->scratchpad_bus_address); free_virt_msix_db_mem: hl_cpu_accessible_dma_pool_free(hdev, prop->pmmu.page_size, gaudi2->virt_msix_db_cpu_addr); free_cpu_accessible_dma_pool: -- cgit v1.2.3 From 9ef23f05aed4e1dd89b70cd2b8a5e35134ab90bb Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Wed, 22 Mar 2023 08:38:49 +0200 Subject: accel/habanalabs: add helper to extract the FW major/minor the helper is extract_u32_until_given_char and can later be used to also get the major/minor of the sw version. Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 69 +++++++++++++++++---------- 1 file changed, 45 insertions(+), 24 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 59f61ec66445..1400a4430045 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -71,38 +71,59 @@ free_fw_ver: return NULL; } -static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) +/** + * extract_u32_until_given_char() - given a string of the format "*", extract the u32. + * @str: the given string + * @ver_num: the pointer to the extracted u32 to be returned to the caller. + * @given_char: the given char at the end of the u32 in the string + * + * Return: Upon success, return a pointer to the given_char in the string. Upon failure, return NULL + */ +static char *extract_u32_until_given_char(char *str, u32 *ver_num, char given_char) { - char major[8], minor[8], *first_dot, *second_dot; - int rc; + char num_str[8] = {}, *ch; - first_dot = strnstr(preboot_ver, ".", 10); - if (first_dot) { - strscpy(major, preboot_ver, first_dot - preboot_ver + 1); - rc = kstrtou32(major, 10, &hdev->fw_major_version); - } else { - rc = -EINVAL; - } + ch = strchrnul(str, given_char); + if (*ch == '\0' || ch == str || ch - str >= sizeof(num_str)) + return NULL; - if (rc) { - dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc); - return rc; + memcpy(num_str, str, ch - str); + if (kstrtou32(num_str, 10, ver_num)) + return NULL; + return ch; +} + +/** + * hl_get_preboot_major_minor() - extract the FW's version major, minor from the version string. + * @hdev: pointer to the hl_device + * @preboot_ver: the FW's version string + * + * preboot_ver is expected to be the format of ..*, e.g: 42.0.1-sec-3 + * The extracted version is set in the hdev fields: fw_inner_{major/minor}_ver. + * + * Return: 0 on success, negative error code for failure. + */ +static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) +{ + preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_major_version, '.'); + if (!preboot_ver) { + dev_err(hdev->dev, "Error parsing preboot major version\n"); + goto err_zero_ver; } - /* skip the first dot */ - first_dot++; + preboot_ver++; - second_dot = strnstr(first_dot, ".", 10); - if (second_dot) { - strscpy(minor, first_dot, second_dot - first_dot + 1); - rc = kstrtou32(minor, 10, &hdev->fw_minor_version); - } else { - rc = -EINVAL; + preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_minor_version, '.'); + if (!preboot_ver) { + dev_err(hdev->dev, "Error parsing preboot minor version\n"); + goto err_zero_ver; } + return 0; - if (rc) - dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc); - return rc; +err_zero_ver: + hdev->fw_major_version = 0; + hdev->fw_minor_version = 0; + return -EINVAL; } static int hl_request_fw(struct hl_device *hdev, -- cgit v1.2.3 From 3071247ca06195eed4261b99035e0c031fa5ddd0 Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Sun, 2 Apr 2023 12:05:43 +0300 Subject: accel/habanalabs: rename fw_{major/minor}_version to fw_inner_{major/minor}_ver We later want to add fields for Firmware SW version. The current extracted FW version is the inner FW versioning so the new name is better and also better differentiate from the FW's SW version. Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 8 ++++---- drivers/accel/habanalabs/common/habanalabs.h | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 1400a4430045..6150ab6ba810 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -105,7 +105,7 @@ static char *extract_u32_until_given_char(char *str, u32 *ver_num, char given_ch */ static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) { - preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_major_version, '.'); + preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_inner_major_ver, '.'); if (!preboot_ver) { dev_err(hdev->dev, "Error parsing preboot major version\n"); goto err_zero_ver; @@ -113,7 +113,7 @@ static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) preboot_ver++; - preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_minor_version, '.'); + preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_inner_minor_ver, '.'); if (!preboot_ver) { dev_err(hdev->dev, "Error parsing preboot minor version\n"); goto err_zero_ver; @@ -121,8 +121,8 @@ static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) return 0; err_zero_ver: - hdev->fw_major_version = 0; - hdev->fw_minor_version = 0; + hdev->fw_inner_major_ver = 0; + hdev->fw_inner_minor_ver = 0; return -EINVAL; } diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index eaae69a9f817..57661cb51621 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3225,8 +3225,8 @@ struct hl_reset_info { * @captured_err_info: holds information about errors. * @reset_info: holds current device reset information. * @stream_master_qid_arr: pointer to array with QIDs of master streams. - * @fw_major_version: major version of current loaded preboot. - * @fw_minor_version: minor version of current loaded preboot. + * @fw_inner_major_ver: the major of current loaded preboot inner version. + * @fw_inner_minor_ver: the minor of current loaded preboot inner version. * @dram_used_mem: current DRAM memory consumption. * @memory_scrub_val: the value to which the dram will be scrubbed to using cb scrub_device_dram * @timeout_jiffies: device CS timeout value. @@ -3412,8 +3412,8 @@ struct hl_device { struct hl_reset_info reset_info; u32 *stream_master_qid_arr; - u32 fw_major_version; - u32 fw_minor_version; + u32 fw_inner_major_ver; + u32 fw_inner_minor_ver; atomic64_t dram_used_mem; u64 memory_scrub_val; u64 timeout_jiffies; @@ -3549,7 +3549,7 @@ struct hl_ioctl_desc { static inline bool hl_is_fw_ver_below_1_9(struct hl_device *hdev) { - return (hdev->fw_major_version < 42); + return (hdev->fw_inner_major_ver < 42); } /* -- cgit v1.2.3 From dd5667ff6f9c52c6a49aa9725b80542793613036 Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Thu, 16 Mar 2023 10:20:44 +0200 Subject: accel/habanalabs: extract and save the FW's SW major/minor/sub-minor It is not always possible to know the FW's SW version from the inner FW version. Therefore we should extract the general SW version in addition to the FW version and use it in functions like 'hl_is_fw_ver_below_1_9' etc. Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 78 ++++++++++++++++++++++++--- drivers/accel/habanalabs/common/habanalabs.h | 6 +++ 2 files changed, 78 insertions(+), 6 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 6150ab6ba810..62052cfe694f 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -93,6 +93,71 @@ static char *extract_u32_until_given_char(char *str, u32 *ver_num, char given_ch return ch; } +/** + * hl_get_sw_major_minor_subminor() - extract the FW's SW version major, minor, sub-minor + * from the version string + * @hdev: pointer to the hl_device + * @fw_str: the FW's version string + * + * The extracted version is set in the hdev fields: fw_sw_{major/minor/sub_minor}_ver. + * + * fw_str is expected to have one of two possible formats, examples: + * 1) 'Preboot version hl-gaudi2-1.9.0-fw-42.0.1-sec-3' + * 2) 'Preboot version hl-gaudi2-1.9.0-rc-fw-42.0.1-sec-3' + * In those examples, the SW major,minor,subminor are correspondingly: 1,9,0. + * + * Return: 0 for success or a negative error code for failure. + */ +static int hl_get_sw_major_minor_subminor(struct hl_device *hdev, const char *fw_str) +{ + char *end, *start; + + end = strnstr(fw_str, "-rc-", VERSION_MAX_LEN); + if (end == fw_str) + return -EINVAL; + + if (!end) + end = strnstr(fw_str, "-fw-", VERSION_MAX_LEN); + + if (end == fw_str) + return -EINVAL; + + if (!end) + return -EINVAL; + + for (start = end - 1; start != fw_str; start--) { + if (*start == '-') + break; + } + + if (start == fw_str) + return -EINVAL; + + /* start/end point each to the starting and ending hyphen of the sw version e.g. -1.9.0- */ + start++; + start = extract_u32_until_given_char(start, &hdev->fw_sw_major_ver, '.'); + if (!start) + goto err_zero_ver; + + start++; + start = extract_u32_until_given_char(start, &hdev->fw_sw_minor_ver, '.'); + if (!start) + goto err_zero_ver; + + start++; + start = extract_u32_until_given_char(start, &hdev->fw_sw_sub_minor_ver, '-'); + if (!start) + goto err_zero_ver; + + return 0; + +err_zero_ver: + hdev->fw_sw_major_ver = 0; + hdev->fw_sw_minor_ver = 0; + hdev->fw_sw_sub_minor_ver = 0; + return -EINVAL; +} + /** * hl_get_preboot_major_minor() - extract the FW's version major, minor from the version string. * @hdev: pointer to the hl_device @@ -2172,6 +2237,7 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev, struct asic_fixed_properties *prop = &hdev->asic_prop; char *preboot_ver, *boot_ver; char btl_ver[32]; + int rc; switch (fwc) { case FW_COMP_BOOT_FIT: @@ -2185,20 +2251,20 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev, break; case FW_COMP_PREBOOT: strscpy(prop->preboot_ver, fw_version, VERSION_MAX_LEN); - preboot_ver = strnstr(prop->preboot_ver, "Preboot", - VERSION_MAX_LEN); + preboot_ver = strnstr(prop->preboot_ver, "Preboot", VERSION_MAX_LEN); + dev_info(hdev->dev, "preboot full version: '%s'\n", preboot_ver); + if (preboot_ver && preboot_ver != prop->preboot_ver) { strscpy(btl_ver, prop->preboot_ver, min((int) (preboot_ver - prop->preboot_ver), 31)); dev_info(hdev->dev, "%s\n", btl_ver); } + rc = hl_get_sw_major_minor_subminor(hdev, preboot_ver); + if (rc) + return rc; preboot_ver = extract_fw_ver_from_str(prop->preboot_ver); if (preboot_ver) { - int rc; - - dev_info(hdev->dev, "preboot version %s\n", preboot_ver); - rc = hl_get_preboot_major_minor(hdev, preboot_ver); kfree(preboot_ver); if (rc) diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 57661cb51621..d6f454b1cde4 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3227,6 +3227,9 @@ struct hl_reset_info { * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @fw_inner_major_ver: the major of current loaded preboot inner version. * @fw_inner_minor_ver: the minor of current loaded preboot inner version. + * @fw_sw_major_ver: the major of current loaded preboot SW version. + * @fw_sw_minor_ver: the minor of current loaded preboot SW version. + * @fw_sw_sub_minor_ver: the sub-minor of current loaded preboot SW version. * @dram_used_mem: current DRAM memory consumption. * @memory_scrub_val: the value to which the dram will be scrubbed to using cb scrub_device_dram * @timeout_jiffies: device CS timeout value. @@ -3414,6 +3417,9 @@ struct hl_device { u32 *stream_master_qid_arr; u32 fw_inner_major_ver; u32 fw_inner_minor_ver; + u32 fw_sw_major_ver; + u32 fw_sw_minor_ver; + u32 fw_sw_sub_minor_ver; atomic64_t dram_used_mem; u64 memory_scrub_val; u64 timeout_jiffies; -- cgit v1.2.3 From a12428acf823a1a367d38a4113b7365e3e5603bf Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Tue, 18 Apr 2023 11:39:44 +0300 Subject: accel/habanalabs: check fw version using sw version The fw inner version is less trustable, instead use the fw general sw release version. Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs.h | 10 ++++++++-- drivers/accel/habanalabs/gaudi2/gaudi2.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index d6f454b1cde4..02620654ccdf 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3553,9 +3553,15 @@ struct hl_ioctl_desc { hl_ioctl_t *func; }; -static inline bool hl_is_fw_ver_below_1_9(struct hl_device *hdev) +static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major, u32 fw_sw_minor) { - return (hdev->fw_inner_major_ver < 42); + if (hdev->fw_sw_major_ver < fw_sw_major) + return true; + if (hdev->fw_sw_major_ver > fw_sw_major) + return false; + if (hdev->fw_sw_minor_ver < fw_sw_minor) + return true; + return false; } /* diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index acc304ebca82..6e3aa89b19c2 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -8043,7 +8043,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev, case RAZWI_TPC: hbw_rtr_id = gaudi2_tpc_initiator_hbw_rtr_id[module_idx]; - if (hl_is_fw_ver_below_1_9(hdev) && + if (hl_is_fw_sw_ver_below(hdev, 1, 9) && !hdev->asic_prop.fw_security_enabled && ((module_idx == 0) || (module_idx == 1))) lbw_rtr_id = DCORE0_RTR0; -- cgit v1.2.3 From cc7b790d412461520de49eb321a0aeed2735e5c4 Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Wed, 8 Feb 2023 16:16:08 +0200 Subject: accel/habanalabs: do soft-reset using cpucp packet This is done depending on the FW version. The cpucp method is preferable and saves scratchpads resource. Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 14 ++++++++++++ drivers/accel/habanalabs/common/habanalabs.h | 1 + drivers/accel/habanalabs/gaudi2/gaudi2.c | 26 +++++++++++++--------- drivers/accel/habanalabs/include/common/cpucp_if.h | 4 ++++ 4 files changed, 35 insertions(+), 10 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 62052cfe694f..e48f024d8649 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -591,6 +591,20 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, size); } +int hl_fw_send_soft_reset(struct hl_device *hdev) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + pkt.ctl = cpu_to_le32(CPUCP_PACKET_SOFT_RESET << CPUCP_PKT_CTL_OPCODE_SHIFT); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + if (rc) + dev_err(hdev->dev, "failed to send soft-reset msg (err = %d)\n", rc); + + return rc; +} + int hl_fw_send_device_activity(struct hl_device *hdev, bool open) { struct cpucp_packet pkt; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 02620654ccdf..6b102947cb90 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3884,6 +3884,7 @@ int hl_fw_dram_replaced_row_get(struct hl_device *hdev, int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num); int hl_fw_cpucp_engine_core_asid_set(struct hl_device *hdev, u32 asid); int hl_fw_send_device_activity(struct hl_device *hdev, bool open); +int hl_fw_send_soft_reset(struct hl_device *hdev); int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], bool is_wc[3]); int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data); diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 6e3aa89b19c2..5c80e7af5b78 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -6148,18 +6148,24 @@ static int gaudi2_execute_soft_reset(struct hl_device *hdev, bool driver_perform u32 poll_timeout_us) { struct cpu_dyn_regs *dyn_regs = &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + int rc = 0; if (!driver_performs_reset) { - /* set SP to indicate reset request sent to FW */ - if (dyn_regs->cpu_rst_status) - WREG32(le32_to_cpu(dyn_regs->cpu_rst_status), CPU_RST_STATUS_NA); - else - WREG32(mmCPU_RST_STATUS_TO_HOST, CPU_RST_STATUS_NA); - - WREG32(le32_to_cpu(dyn_regs->gic_host_soft_rst_irq), - gaudi2_irq_map_table[GAUDI2_EVENT_CPU_SOFT_RESET].cpu_id); - - return gaudi2_get_soft_rst_done_indication(hdev, poll_timeout_us); + if (hl_is_fw_sw_ver_below(hdev, 1, 10)) { + /* set SP to indicate reset request sent to FW */ + if (dyn_regs->cpu_rst_status) + WREG32(le32_to_cpu(dyn_regs->cpu_rst_status), CPU_RST_STATUS_NA); + else + WREG32(mmCPU_RST_STATUS_TO_HOST, CPU_RST_STATUS_NA); + WREG32(le32_to_cpu(dyn_regs->gic_host_soft_rst_irq), + gaudi2_irq_map_table[GAUDI2_EVENT_CPU_SOFT_RESET].cpu_id); + + /* wait for f/w response */ + rc = gaudi2_get_soft_rst_done_indication(hdev, poll_timeout_us); + } else { + rc = hl_fw_send_soft_reset(hdev); + } + return rc; } /* Block access to engines, QMANs and SM during reset, these diff --git a/drivers/accel/habanalabs/include/common/cpucp_if.h b/drivers/accel/habanalabs/include/common/cpucp_if.h index 8bbe685458c4..f68308cc2524 100644 --- a/drivers/accel/habanalabs/include/common/cpucp_if.h +++ b/drivers/accel/habanalabs/include/common/cpucp_if.h @@ -665,6 +665,9 @@ enum pq_init_status { * * CPUCP_PACKET_REGISTER_INTERRUPTS - * Packet to register interrupts indicating LKD is ready to receive events from FW. + * + * CPUCP_PACKET_SOFT_RESET - + * Packet to perform soft-reset. */ enum cpucp_packet_id { @@ -731,6 +734,7 @@ enum cpucp_packet_id { CPUCP_PACKET_RESERVED11, /* not used */ CPUCP_PACKET_RESERVED12, /* internal */ CPUCP_PACKET_REGISTER_INTERRUPTS, /* internal */ + CPUCP_PACKET_SOFT_RESET, /* internal */ CPUCP_PACKET_ID_MAX /* must be last */ }; -- cgit v1.2.3 From 57469c12060899b7b0768ace178c7e36501e2f21 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 13 Apr 2023 11:22:06 +0300 Subject: accel/habanalabs: unsecure TPC bias registers User needs to be able to perform downcast / upcast of fp8_143 dtype. Hence bias register needs to be accessed by the user. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2_security.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c index 52a12c2bb58e..fadb870ff4c0 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c @@ -1542,6 +1542,7 @@ static const u32 gaudi2_pb_dcr0_tpc0_unsecured_regs[] = { mmDCORE0_TPC0_CFG_LUT_FUNC128_BASE2_ADDR_HI, mmDCORE0_TPC0_CFG_LUT_FUNC256_BASE2_ADDR_LO, mmDCORE0_TPC0_CFG_LUT_FUNC256_BASE2_ADDR_HI, + mmDCORE0_TPC0_CFG_FP8_143_BIAS, mmDCORE0_TPC0_CFG_ROUND_CSR, mmDCORE0_TPC0_CFG_CONV_ROUND_CSR, mmDCORE0_TPC0_CFG_SEMAPHORE, -- cgit v1.2.3 From 04729b418f8b031712e6dbf5bffa7d639f8f211e Mon Sep 17 00:00:00 2001 From: Moti Haimovski Date: Thu, 13 Apr 2023 13:54:40 +0300 Subject: accel/habanalabs: call to HW/FW err returns 0 when no events exist This commit modifies the call to retrieve HW or FW error events to return success when no events are pending, as done in the calls to other events. Signed-off-by: Moti Haimovski Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs_ioctl.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 203ee857810c..4368e6c9a23a 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -842,15 +842,15 @@ static int hw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args) struct hw_err_info *info; int rc; - if ((!user_buf_size) || (!user_buf)) + if (!user_buf) return -EINVAL; - if (user_buf_size < sizeof(struct hl_info_hw_err_event)) - return -ENOMEM; - info = &hdev->captured_err_info.hw_err; if (!info->event_info_available) - return -ENOENT; + return 0; + + if (user_buf_size < sizeof(struct hl_info_hw_err_event)) + return -ENOMEM; rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_hw_err_event)); return rc ? -EFAULT : 0; @@ -864,15 +864,15 @@ static int fw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args) struct fw_err_info *info; int rc; - if ((!user_buf_size) || (!user_buf)) + if (!user_buf) return -EINVAL; - if (user_buf_size < sizeof(struct hl_info_fw_err_event)) - return -ENOMEM; - info = &hdev->captured_err_info.fw_err; if (!info->event_info_available) - return -ENOENT; + return 0; + + if (user_buf_size < sizeof(struct hl_info_fw_err_event)) + return -ENOMEM; rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_fw_err_event)); return rc ? -EFAULT : 0; -- cgit v1.2.3 From ad8bfd3619bb096dd1929a69989120b45e3c3eee Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Mon, 17 Apr 2023 15:14:30 +0300 Subject: accel/habanalabs: minimize encapsulation signal mutex lock time Sync Stream Encapsulated Signal Handlers can be managed from different contexts, and as such they are protected via a spin_lock. However, spin_lock was unnecessarily protecting a larger code section than really needed, covering a sleepable code section as well. Since spin_lock disables preemption, it could lead to sleeping in atomic context. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/command_submission.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index af9d2e22c6e7..977cd2710686 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -2152,7 +2152,7 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id) hdev->asic_funcs->hw_queues_unlock(hdev); rc = -EINVAL; - goto out; + goto out_unlock; } /* @@ -2167,15 +2167,21 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id) /* Release the id and free allocated memory of the handle */ idr_remove(&mgr->handles, handle_id); + + /* unlock before calling ctx_put, where we might sleep */ + spin_unlock(&mgr->lock); hl_ctx_put(encaps_sig_hdl->ctx); kfree(encaps_sig_hdl); + goto out; } else { rc = -EINVAL; dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n"); } -out: + +out_unlock: spin_unlock(&mgr->lock); +out: return rc; } -- cgit v1.2.3 From 9a4e44a4ee49dff6ded6a4c76bb921a8379cb7b4 Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Sun, 4 Dec 2022 18:37:53 +0200 Subject: accel/habanalabs: refactor abort of completions and waits Aborting CS completions should be in command_submission.c but aborting waiting for user interrupts should be in device.c. This separation is also for adding more abort operations in the future. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/command_submission.c | 3 +-- drivers/accel/habanalabs/common/device.c | 17 ++++++++++++----- drivers/accel/habanalabs/common/habanalabs.h | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index 977cd2710686..4ff3256fdf8b 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -1139,11 +1139,10 @@ static void force_complete_cs(struct hl_device *hdev) spin_unlock(&hdev->cs_mirror_lock); } -void hl_abort_waitings_for_completion(struct hl_device *hdev) +void hl_abort_waiting_for_cs_completions(struct hl_device *hdev) { force_complete_cs(hdev); force_complete_multi_cs(hdev); - hl_release_pending_user_interrupts(hdev); } static void job_wq_completion(struct work_struct *work) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index fabfc501ef54..c8f4a34d2831 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1157,6 +1157,16 @@ static void take_release_locks(struct hl_device *hdev) mutex_unlock(&hdev->fpriv_ctrl_list_lock); } +static void hl_abort_waiting_for_completions(struct hl_device *hdev) +{ + hl_abort_waiting_for_cs_completions(hdev); + + /* Release all pending user interrupts, each pending user interrupt + * holds a reference to a user context. + */ + hl_release_pending_user_interrupts(hdev); +} + static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset, bool skip_wq_flush) { @@ -1176,10 +1186,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_r /* flush the MMU prefetch workqueue */ flush_workqueue(hdev->prefetch_wq); - /* Release all pending user interrupts, each pending user interrupt - * holds a reference to user context - */ - hl_release_pending_user_interrupts(hdev); + hl_abort_waiting_for_completions(hdev); } /* @@ -1921,7 +1928,7 @@ out: hl_ctx_put(ctx); - hl_abort_waitings_for_completion(hdev); + hl_abort_waiting_for_completions(hdev); return 0; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 6b102947cb90..f83ea96c6530 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3934,7 +3934,7 @@ void hl_dec_fini(struct hl_device *hdev); void hl_dec_ctx_fini(struct hl_ctx *ctx); void hl_release_pending_user_interrupts(struct hl_device *hdev); -void hl_abort_waitings_for_completion(struct hl_device *hdev); +void hl_abort_waiting_for_cs_completions(struct hl_device *hdev); int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx, struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig); -- cgit v1.2.3 From 3d21ec6424e6d38f284c37d77e7ec524c1a454f2 Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Mon, 17 Apr 2023 21:24:19 +0300 Subject: accel/habanalabs: add missing tpc interrupt info For some reason the last possible tpc interrupt cause in gaudi2_tpc_interrupts_cause is missing from the code. Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 5c80e7af5b78..058741698d71 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -63,7 +63,7 @@ #define GAUDI2_NUM_OF_CPU_SEI_ERR_CAUSE 3 #define GAUDI2_NUM_OF_QM_SEI_ERR_CAUSE 2 #define GAUDI2_NUM_OF_ROT_ERR_CAUSE 22 -#define GAUDI2_NUM_OF_TPC_INTR_CAUSE 30 +#define GAUDI2_NUM_OF_TPC_INTR_CAUSE 31 #define GAUDI2_NUM_OF_DEC_ERR_CAUSE 25 #define GAUDI2_NUM_OF_MME_ERR_CAUSE 16 #define GAUDI2_NUM_OF_MME_SBTE_ERR_CAUSE 5 @@ -891,6 +891,7 @@ static const char * const gaudi2_tpc_interrupts_cause[GAUDI2_NUM_OF_TPC_INTR_CAU "invalid_lock_access", "LD_L protection violation", "ST_L protection violation", + "D$ L0CS mismatch", }; static const char * const guadi2_mme_error_cause[GAUDI2_NUM_OF_MME_ERR_CAUSE] = { -- cgit v1.2.3 From d8b9cea584661b30305cf341bf9f675dc0a25471 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 18 Apr 2023 14:48:22 +0300 Subject: accel/habanalabs: add pci health check during heartbeat Currently upon a heartbeat failure, we don't know if the failure is due to firmware hang or due to a bad PCI link. Hence, we are reading a PCI config space register with a known value (vendor ID) so we will know which of the two possibilities caused the heartbeat failure. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 15 ++++++++++++++- drivers/accel/habanalabs/common/habanalabs.h | 2 ++ drivers/accel/habanalabs/common/habanalabs_drv.c | 2 -- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index c8f4a34d2831..98b46b2e1898 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -981,6 +981,18 @@ static void device_early_fini(struct hl_device *hdev) hdev->asic_funcs->early_fini(hdev); } +static bool is_pci_link_healthy(struct hl_device *hdev) +{ + u16 vendor_id; + + if (!hdev->pdev) + return false; + + pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id); + + return (vendor_id == PCI_VENDOR_ID_HABANALABS); +} + static void hl_device_heartbeat(struct work_struct *work) { struct hl_device *hdev = container_of(work, struct hl_device, @@ -995,7 +1007,8 @@ static void hl_device_heartbeat(struct work_struct *work) goto reschedule; if (hl_device_operational(hdev, NULL)) - dev_err(hdev->dev, "Device heartbeat failed!\n"); + dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n", + is_pci_link_healthy(hdev) ? "healthy" : "broken"); info.err_type = HL_INFO_FW_HEARTBEAT_ERR; info.event_mask = &event_mask; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index f83ea96c6530..ad412cc01aba 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -36,6 +36,8 @@ struct hl_device; struct hl_fpriv; +#define PCI_VENDOR_ID_HABANALABS 0x1da3 + /* Use upper bits of mmap offset to store habana driver specific information. * bits[63:59] - Encode mmap type * bits[45:0] - mmap offset value diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index d9df64e75f33..1ec97da3dddb 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -54,8 +54,6 @@ module_param(boot_error_status_mask, ulong, 0444); MODULE_PARM_DESC(boot_error_status_mask, "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)"); -#define PCI_VENDOR_ID_HABANALABS 0x1da3 - #define PCI_IDS_GOYA 0x0001 #define PCI_IDS_GAUDI 0x1000 #define PCI_IDS_GAUDI_SEC 0x1010 -- cgit v1.2.3 From 3b9abb4fa642ee28bcd22a0398087a2d87d3c987 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Sun, 19 Mar 2023 21:46:44 +0200 Subject: accel/habanalabs: expose debugfs files later Currently the debugfs root folder and files for a device are created at an early step, before the device initialization and before the char device and sysfs files are exposed to user. As there is no real reason not to do it together with the device creation, postpone it to be done right afterwards. The initialization of the debugfs entry structure is left in its current position because it is used before creating the files. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/debugfs.c | 37 ++++++++++------- drivers/accel/habanalabs/common/device.c | 62 +++++++++++++++------------- drivers/accel/habanalabs/common/habanalabs.h | 6 ++- 3 files changed, 61 insertions(+), 44 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c index 22dd17c077c0..29b78c7cf5de 100644 --- a/drivers/accel/habanalabs/common/debugfs.c +++ b/drivers/accel/habanalabs/common/debugfs.c @@ -1756,17 +1756,15 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent } } -void hl_debugfs_add_device(struct hl_device *hdev) +int hl_debugfs_device_init(struct hl_device *hdev) { struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; int count = ARRAY_SIZE(hl_debugfs_list); dev_entry->hdev = hdev; - dev_entry->entry_arr = kmalloc_array(count, - sizeof(struct hl_debugfs_entry), - GFP_KERNEL); + dev_entry->entry_arr = kmalloc_array(count, sizeof(struct hl_debugfs_entry), GFP_KERNEL); if (!dev_entry->entry_arr) - return; + return -ENOMEM; dev_entry->data_dma_blob_desc.size = 0; dev_entry->data_dma_blob_desc.data = NULL; @@ -1787,21 +1785,14 @@ void hl_debugfs_add_device(struct hl_device *hdev) spin_lock_init(&dev_entry->userptr_spinlock); mutex_init(&dev_entry->ctx_mem_hash_mutex); - dev_entry->root = debugfs_create_dir(dev_name(hdev->dev), - hl_debug_root); - - add_files_to_device(hdev, dev_entry, dev_entry->root); - if (!hdev->asic_prop.fw_security_enabled) - add_secured_nodes(dev_entry, dev_entry->root); + return 0; } -void hl_debugfs_remove_device(struct hl_device *hdev) +void hl_debugfs_device_fini(struct hl_device *hdev) { struct hl_dbg_device_entry *entry = &hdev->hl_debugfs; int i; - debugfs_remove_recursive(entry->root); - mutex_destroy(&entry->ctx_mem_hash_mutex); mutex_destroy(&entry->file_mutex); @@ -1814,6 +1805,24 @@ void hl_debugfs_remove_device(struct hl_device *hdev) kfree(entry->entry_arr); } +void hl_debugfs_add_device(struct hl_device *hdev) +{ + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; + + dev_entry->root = debugfs_create_dir(dev_name(hdev->dev), hl_debug_root); + + add_files_to_device(hdev, dev_entry, dev_entry->root); + if (!hdev->asic_prop.fw_security_enabled) + add_secured_nodes(dev_entry, dev_entry->root); +} + +void hl_debugfs_remove_device(struct hl_device *hdev) +{ + struct hl_dbg_device_entry *entry = &hdev->hl_debugfs; + + debugfs_remove_recursive(entry->root); +} + void hl_debugfs_add_file(struct hl_fpriv *hpriv) { struct hl_dbg_device_entry *dev_entry = &hpriv->hdev->hl_debugfs; diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 98b46b2e1898..cab5a63db8c1 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -674,7 +674,7 @@ static int device_init_cdev(struct hl_device *hdev, struct class *class, return 0; } -static int device_cdev_sysfs_add(struct hl_device *hdev) +static int cdev_sysfs_debugfs_add(struct hl_device *hdev) { int rc; @@ -699,7 +699,9 @@ static int device_cdev_sysfs_add(struct hl_device *hdev) goto delete_ctrl_cdev_device; } - hdev->cdev_sysfs_created = true; + hl_debugfs_add_device(hdev); + + hdev->cdev_sysfs_debugfs_created = true; return 0; @@ -710,11 +712,12 @@ delete_cdev_device: return rc; } -static void device_cdev_sysfs_del(struct hl_device *hdev) +static void cdev_sysfs_debugfs_remove(struct hl_device *hdev) { - if (!hdev->cdev_sysfs_created) + if (!hdev->cdev_sysfs_debugfs_created) goto put_devices; + hl_debugfs_remove_device(hdev); hl_sysfs_fini(hdev); cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); cdev_device_del(&hdev->cdev, hdev->dev); @@ -2054,7 +2057,7 @@ out_err: int hl_device_init(struct hl_device *hdev) { int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt; - bool add_cdev_sysfs_on_err = false; + bool expose_interfaces_on_err = false; rc = create_cdev(hdev); if (rc) @@ -2170,16 +2173,22 @@ int hl_device_init(struct hl_device *hdev) hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC; hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL; - hl_debugfs_add_device(hdev); - /* debugfs nodes are created in hl_ctx_init so it must be called after - * hl_debugfs_add_device. + rc = hl_debugfs_device_init(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize debugfs entry structure\n"); + kfree(hdev->kernel_ctx); + goto mmu_fini; + } + + /* The debugfs entry structure is accessed in hl_ctx_init(), so it must be called after + * hl_debugfs_device_init(). */ rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); if (rc) { dev_err(hdev->dev, "failed to initialize kernel context\n"); kfree(hdev->kernel_ctx); - goto remove_device_from_debugfs; + goto debugfs_device_fini; } rc = hl_cb_pool_init(hdev); @@ -2195,11 +2204,10 @@ int hl_device_init(struct hl_device *hdev) } /* - * From this point, override rc (=0) in case of an error to allow - * debugging (by adding char devices and create sysfs nodes as part of - * the error flow). + * From this point, override rc (=0) in case of an error to allow debugging + * (by adding char devices and creating sysfs/debugfs files as part of the error flow). */ - add_cdev_sysfs_on_err = true; + expose_interfaces_on_err = true; /* Device is now enabled as part of the initialization requires * communication with the device firmware to get information that @@ -2241,15 +2249,13 @@ int hl_device_init(struct hl_device *hdev) } /* - * Expose devices and sysfs nodes to user. - * From here there is no need to add char devices and create sysfs nodes - * in case of an error. + * Expose devices and sysfs/debugfs files to user. + * From here there is no need to expose them in case of an error. */ - add_cdev_sysfs_on_err = false; - rc = device_cdev_sysfs_add(hdev); + expose_interfaces_on_err = false; + rc = cdev_sysfs_debugfs_add(hdev); if (rc) { - dev_err(hdev->dev, - "Failed to add char devices and sysfs nodes\n"); + dev_err(hdev->dev, "Failed to add char devices and sysfs/debugfs files\n"); rc = 0; goto out_disabled; } @@ -2295,8 +2301,8 @@ release_ctx: if (hl_ctx_put(hdev->kernel_ctx) != 1) dev_err(hdev->dev, "kernel ctx is still alive on initialization failure\n"); -remove_device_from_debugfs: - hl_debugfs_remove_device(hdev); +debugfs_device_fini: + hl_debugfs_device_fini(hdev); mmu_fini: hl_mmu_fini(hdev); eq_fini: @@ -2320,8 +2326,8 @@ free_dev: put_device(hdev->dev); out_disabled: hdev->disabled = true; - if (add_cdev_sysfs_on_err) - device_cdev_sysfs_add(hdev); + if (expose_interfaces_on_err) + cdev_sysfs_debugfs_add(hdev); if (hdev->pdev) dev_err(&hdev->pdev->dev, "Failed to initialize hl%d. Device %s is NOT usable !\n", @@ -2447,8 +2453,6 @@ void hl_device_fini(struct hl_device *hdev) if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) dev_err(hdev->dev, "kernel ctx is still alive\n"); - hl_debugfs_remove_device(hdev); - hl_dec_fini(hdev); hl_vm_fini(hdev); @@ -2473,8 +2477,10 @@ void hl_device_fini(struct hl_device *hdev) device_early_fini(hdev); - /* Hide devices and sysfs nodes from user */ - device_cdev_sysfs_del(hdev); + /* Hide devices and sysfs/debugfs files from user */ + cdev_sysfs_debugfs_remove(hdev); + + hl_debugfs_device_fini(hdev); pr_info("removed device successfully\n"); } diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index ad412cc01aba..ea0914d08bdc 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3292,7 +3292,7 @@ struct hl_reset_info { * @in_debug: whether the device is in a state where the profiling/tracing infrastructure * can be used. This indication is needed because in some ASICs we need to do * specific operations to enable that infrastructure. - * @cdev_sysfs_created: were char devices and sysfs nodes created. + * @cdev_sysfs_debugfs_created: were char devices and sysfs/debugfs files created. * @stop_on_err: true if engines should stop on error. * @supports_sync_stream: is sync stream supported. * @sync_stream_queue_idx: helper index for sync stream queues initialization. @@ -3459,7 +3459,7 @@ struct hl_device { u8 init_done; u8 device_cpu_disabled; u8 in_debug; - u8 cdev_sysfs_created; + u8 cdev_sysfs_debugfs_created; u8 stop_on_err; u8 supports_sync_stream; u8 sync_stream_queue_idx; @@ -3978,6 +3978,8 @@ void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info); void hl_debugfs_init(void); void hl_debugfs_fini(void); +int hl_debugfs_device_init(struct hl_device *hdev); +void hl_debugfs_device_fini(struct hl_device *hdev); void hl_debugfs_add_device(struct hl_device *hdev); void hl_debugfs_remove_device(struct hl_device *hdev); void hl_debugfs_add_file(struct hl_fpriv *hpriv); -- cgit v1.2.3 From 67d19a2f49b07b2c4a15e6a96e1de46d95209ae4 Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Mon, 27 Mar 2023 11:56:16 +0300 Subject: accel/habanalabs: poll for device status update following WFE cmd Currently, we rely on COMMS protocol's ack to verify that WFE command has been acknowledged by the FW. However, this does not guarantee that the device status has been updated. Although unlikely, this could trigger a race since the driver expects the device to be halted at that stage, but it might not be. Therefore, we increase WFE's robustness by polling on the status register that will be updated once the device is actually halted. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 28 ++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index e48f024d8649..eb51d7f70aec 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -1368,8 +1368,10 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev) void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev) { - struct static_fw_load_mgr *static_loader = - &hdev->fw_loader.static_loader; + struct fw_load_mgr *fw_loader = &hdev->fw_loader; + u32 status, cpu_boot_status_reg, cpu_timeout; + struct static_fw_load_mgr *static_loader; + struct pre_fw_load_props *pre_fw_load; int rc; if (hdev->device_cpu_is_halted) @@ -1377,12 +1379,28 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev) /* Stop device CPU to make sure nothing bad happens */ if (hdev->asic_prop.dynamic_fw_load) { + pre_fw_load = &fw_loader->pre_fw_load; + cpu_timeout = fw_loader->cpu_timeout; + cpu_boot_status_reg = pre_fw_load->cpu_boot_status_reg; + rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader, - COMMS_GOTO_WFE, 0, false, - hdev->fw_loader.cpu_timeout); - if (rc) + COMMS_GOTO_WFE, 0, false, cpu_timeout); + if (rc) { dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); + } else { + rc = hl_poll_timeout( + hdev, + cpu_boot_status_reg, + status, + status == CPU_BOOT_STATUS_IN_WFE, + hdev->fw_poll_interval_usec, + cpu_timeout); + if (rc) + dev_err(hdev->dev, "Current status=%u. Timed-out updating to WFE\n", + status); + } } else { + static_loader = &hdev->fw_loader.static_loader; WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE); msleep(static_loader->cpu_reset_wait_msec); -- cgit v1.2.3 From 7d212963366e8113d1a8548a35eb9cd5df110dca Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Thu, 20 Apr 2023 14:17:31 +0300 Subject: accel/habanalabs: fix a static warning - 'dubious: x & !y' Use a straight forward approach to get a conditional result. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 058741698d71..240fecfab608 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -4527,7 +4527,7 @@ static int gaudi2_set_tpc_engine_mode(struct hl_device *hdev, u32 engine_id, u32 reg_base = gaudi2_tpc_cfg_blocks_bases[tpc_id]; reg_addr = reg_base + TPC_CFG_STALL_OFFSET; reg_val = FIELD_PREP(DCORE0_TPC0_CFG_TPC_STALL_V_MASK, - !!(engine_command == HL_ENGINE_STALL)); + (engine_command == HL_ENGINE_STALL) ? 1 : 0); WREG32(reg_addr, reg_val); if (engine_command == HL_ENGINE_RESUME) { @@ -4551,7 +4551,7 @@ static int gaudi2_set_mme_engine_mode(struct hl_device *hdev, u32 engine_id, u32 reg_base = gaudi2_mme_ctrl_lo_blocks_bases[mme_id]; reg_addr = reg_base + MME_CTRL_LO_QM_STALL_OFFSET; reg_val = FIELD_PREP(DCORE0_MME_CTRL_LO_QM_STALL_V_MASK, - !!(engine_command == HL_ENGINE_STALL)); + (engine_command == HL_ENGINE_STALL) ? 1 : 0); WREG32(reg_addr, reg_val); return 0; @@ -4572,7 +4572,7 @@ static int gaudi2_set_edma_engine_mode(struct hl_device *hdev, u32 engine_id, u3 reg_base = gaudi2_dma_core_blocks_bases[edma_id]; reg_addr = reg_base + EDMA_CORE_CFG_STALL_OFFSET; reg_val = FIELD_PREP(DCORE0_EDMA0_CORE_CFG_1_HALT_MASK, - !!(engine_command == HL_ENGINE_STALL)); + (engine_command == HL_ENGINE_STALL) ? 1 : 0); WREG32(reg_addr, reg_val); if (engine_command == HL_ENGINE_STALL) { -- cgit v1.2.3 From d0dcd4bbfa47f8726df5a45236c0edd0107ec5c3 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 24 Apr 2023 14:17:03 +0300 Subject: accel/habanalabs: always fetch pci addr_dec error info Due to missing indication of address decode source (LBW/HBW bus), we should always try and fetch extended information. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 240fecfab608..d21ef9997d05 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -8892,14 +8892,12 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_typ "err cause: %s", gaudi2_pcie_addr_dec_error_cause[i]); error_count++; - switch (intr_cause_data & BIT_ULL(i)) { - case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK: - hl_check_for_glbl_errors(hdev); - break; - case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK: - gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask); - break; - } + /* + * Always check for LBW and HBW additional info as the indication itself is + * sometimes missing + */ + hl_check_for_glbl_errors(hdev); + gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask); } return error_count; -- cgit v1.2.3 From cc1eeaa335f2d71e8bddf268e3e60d3a3c3a08bc Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 12 May 2023 14:46:55 +0800 Subject: accel/habanalabs: Fix some kernel-doc comments Make the description of @regs_range_array and @regs_range_array_size to @user_regs_range_array and @user_regs_range_array_size to silence the warnings: drivers/accel/habanalabs/common/security.c:506: warning: Function parameter or member 'user_regs_range_array' not described in 'hl_init_pb_ranges_single_dcore' drivers/accel/habanalabs/common/security.c:506: warning: Function parameter or member 'user_regs_range_array_size' not described in 'hl_init_pb_ranges_single_dcore' drivers/accel/habanalabs/common/security.c:506: warning: Excess function parameter 'regs_range_array' description in 'hl_init_pb_ranges_single_dcore' drivers/accel/habanalabs/common/security.c:506: warning: Excess function parameter 'regs_range_array_size' description in 'hl_init_pb_ranges_single_dcore' Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=4940 Signed-off-by: Yang Li Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/security.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/security.c b/drivers/accel/habanalabs/common/security.c index 297e6e44fd0c..dc23ff57c91a 100644 --- a/drivers/accel/habanalabs/common/security.c +++ b/drivers/accel/habanalabs/common/security.c @@ -495,8 +495,8 @@ free_glbl_sec: * @instance_offset: offset between instances * @pb_blocks: blocks array * @blocks_array_size: blocks array size - * @regs_range_array: register range array - * @regs_range_array_size: register range array size + * @user_regs_range_array: register range array + * @user_regs_range_array_size: register range array size * */ int hl_init_pb_ranges_single_dcore(struct hl_device *hdev, u32 dcore_offset, -- cgit v1.2.3 From 9ec7639b5e124f273db20555cc38bd40057157b3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 15 May 2023 13:32:18 +0300 Subject: accel/habanalabs: fix gaudi2_get_tpc_idle_status() return The gaudi2_get_tpc_idle_status() function returned the incorrect variable so it always returned true. Fixes: d85f0531b928 ("accel/habanalabs: break is_idle function into per-engine sub-routines") Signed-off-by: Dan Carpenter Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index d21ef9997d05..e900017f4ff7 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -7238,7 +7238,7 @@ static bool gaudi2_get_tpc_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 gaudi2_iterate_tpcs(hdev, &tpc_iter); - return tpc_idle_data.is_idle; + return *tpc_idle_data.is_idle; } static bool gaudi2_get_decoder_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, -- cgit v1.2.3 From 964234aba59b0a92de701f25d0d6da7a5ebb3e16 Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Tue, 16 May 2023 09:56:18 +0300 Subject: accel/habanalabs: rename security functions related arguments Make the argument names specify the registers array represent registers that should be unsecured so the user can access them. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/security.c | 57 +++++++++++++++--------------- 1 file changed, 29 insertions(+), 28 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/security.c b/drivers/accel/habanalabs/common/security.c index dc23ff57c91a..fe913965dbad 100644 --- a/drivers/accel/habanalabs/common/security.c +++ b/drivers/accel/habanalabs/common/security.c @@ -284,14 +284,14 @@ void hl_secure_block(struct hl_device *hdev, * @instance_offset: offset between instances * @pb_blocks: blocks array * @blocks_array_size: blocks array size - * @regs_array: register array - * @regs_array_size: register array size + * @user_regs_array: unsecured register array + * @user_regs_array_size: unsecured register array size * @mask: enabled instances mask: 1- enabled, 0- disabled */ int hl_init_pb_with_mask(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset, u32 num_instances, u32 instance_offset, const u32 pb_blocks[], u32 blocks_array_size, - const u32 *regs_array, u32 regs_array_size, u64 mask) + const u32 *user_regs_array, u32 user_regs_array_size, u64 mask) { int i, j; struct hl_block_glbl_sec *glbl_sec; @@ -303,8 +303,8 @@ int hl_init_pb_with_mask(struct hl_device *hdev, u32 num_dcores, return -ENOMEM; hl_secure_block(hdev, glbl_sec, blocks_array_size); - hl_unsecure_registers(hdev, regs_array, regs_array_size, 0, pb_blocks, - glbl_sec, blocks_array_size); + hl_unsecure_registers(hdev, user_regs_array, user_regs_array_size, 0, + pb_blocks, glbl_sec, blocks_array_size); /* Fill all blocks with the same configuration */ for (i = 0 ; i < num_dcores ; i++) { @@ -336,19 +336,19 @@ int hl_init_pb_with_mask(struct hl_device *hdev, u32 num_dcores, * @instance_offset: offset between instances * @pb_blocks: blocks array * @blocks_array_size: blocks array size - * @regs_array: register array - * @regs_array_size: register array size + * @user_regs_array: unsecured register array + * @user_regs_array_size: unsecured register array size * */ int hl_init_pb(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset, u32 num_instances, u32 instance_offset, const u32 pb_blocks[], u32 blocks_array_size, - const u32 *regs_array, u32 regs_array_size) + const u32 *user_regs_array, u32 user_regs_array_size) { return hl_init_pb_with_mask(hdev, num_dcores, dcore_offset, num_instances, instance_offset, pb_blocks, - blocks_array_size, regs_array, regs_array_size, - ULLONG_MAX); + blocks_array_size, user_regs_array, + user_regs_array_size, ULLONG_MAX); } /** @@ -364,15 +364,15 @@ int hl_init_pb(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset, * @instance_offset: offset between instances * @pb_blocks: blocks array * @blocks_array_size: blocks array size - * @regs_range_array: register range array - * @regs_range_array_size: register range array size + * @user_regs_range_array: unsecured register range array + * @user_regs_range_array_size: unsecured register range array size * @mask: enabled instances mask: 1- enabled, 0- disabled */ int hl_init_pb_ranges_with_mask(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset, u32 num_instances, u32 instance_offset, const u32 pb_blocks[], u32 blocks_array_size, - const struct range *regs_range_array, u32 regs_range_array_size, - u64 mask) + const struct range *user_regs_range_array, + u32 user_regs_range_array_size, u64 mask) { int i, j, rc = 0; struct hl_block_glbl_sec *glbl_sec; @@ -384,8 +384,8 @@ int hl_init_pb_ranges_with_mask(struct hl_device *hdev, u32 num_dcores, return -ENOMEM; hl_secure_block(hdev, glbl_sec, blocks_array_size); - rc = hl_unsecure_registers_range(hdev, regs_range_array, - regs_range_array_size, 0, pb_blocks, glbl_sec, + rc = hl_unsecure_registers_range(hdev, user_regs_range_array, + user_regs_range_array_size, 0, pb_blocks, glbl_sec, blocks_array_size); if (rc) goto free_glbl_sec; @@ -422,19 +422,20 @@ free_glbl_sec: * @instance_offset: offset between instances * @pb_blocks: blocks array * @blocks_array_size: blocks array size - * @regs_range_array: register range array - * @regs_range_array_size: register range array size + * @user_regs_range_array: unsecured register range array + * @user_regs_range_array_size: unsecured register range array size * */ int hl_init_pb_ranges(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset, u32 num_instances, u32 instance_offset, const u32 pb_blocks[], u32 blocks_array_size, - const struct range *regs_range_array, u32 regs_range_array_size) + const struct range *user_regs_range_array, + u32 user_regs_range_array_size) { return hl_init_pb_ranges_with_mask(hdev, num_dcores, dcore_offset, num_instances, instance_offset, pb_blocks, - blocks_array_size, regs_range_array, - regs_range_array_size, ULLONG_MAX); + blocks_array_size, user_regs_range_array, + user_regs_range_array_size, ULLONG_MAX); } /** @@ -447,14 +448,14 @@ int hl_init_pb_ranges(struct hl_device *hdev, u32 num_dcores, * @instance_offset: offset between instances * @pb_blocks: blocks array * @blocks_array_size: blocks array size - * @regs_array: register array - * @regs_array_size: register array size + * @user_regs_array: unsecured register array + * @user_regs_array_size: unsecured register array size * */ int hl_init_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset, u32 num_instances, u32 instance_offset, const u32 pb_blocks[], u32 blocks_array_size, - const u32 *regs_array, u32 regs_array_size) + const u32 *user_regs_array, u32 user_regs_array_size) { int i, rc = 0; struct hl_block_glbl_sec *glbl_sec; @@ -466,8 +467,8 @@ int hl_init_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset, return -ENOMEM; hl_secure_block(hdev, glbl_sec, blocks_array_size); - rc = hl_unsecure_registers(hdev, regs_array, regs_array_size, 0, - pb_blocks, glbl_sec, blocks_array_size); + rc = hl_unsecure_registers(hdev, user_regs_array, user_regs_array_size, + 0, pb_blocks, glbl_sec, blocks_array_size); if (rc) goto free_glbl_sec; @@ -495,8 +496,8 @@ free_glbl_sec: * @instance_offset: offset between instances * @pb_blocks: blocks array * @blocks_array_size: blocks array size - * @user_regs_range_array: register range array - * @user_regs_range_array_size: register range array size + * @user_regs_range_array: unsecured register range array + * @user_regs_range_array_size: unsecured register range array size * */ int hl_init_pb_ranges_single_dcore(struct hl_device *hdev, u32 dcore_offset, -- cgit v1.2.3 From e715008b7c318fb1901a37bdd0941fb46c2b90b5 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 1 May 2023 13:12:22 +0300 Subject: accel/habanalabs: set unused bit as reserved Get latest f/w gaudi2 interface file which marks unused bist_need_iatu_config bit in cold_rst_data structure as reserved bit. Signed-off-by: Oded Gabbay Reviewed-by: Ofir Bitton --- drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h index 8522f24deac0..18ca147b1c86 100644 --- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h +++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h @@ -62,7 +62,7 @@ struct gaudi2_cold_rst_data { u32 fake_security_enable : 1; u32 fake_sig_validation_en : 1; u32 bist_skip_enable : 1; - u32 bist_need_iatu_config : 1; + u32 reserved1 : 1; u32 fake_bis_compliant : 1; u32 wd_rst_cause_arm : 1; u32 wd_rst_cause_arcpid : 1; -- cgit v1.2.3 From 314a7ffd7c196b27eedd50cb7553029e17789b55 Mon Sep 17 00:00:00 2001 From: Moti Haimovski Date: Sun, 16 Apr 2023 18:36:17 +0300 Subject: accel/habanalabs: fix mem leak in capture user mappings This commit fixes a memory leak caused when clearing the user_mappings info when a new context is opened immediately after user_mapping is captured and a hard reset is performed. Signed-off-by: Moti Haimovski Reviewed-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs_drv.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index 1ec97da3dddb..70fb2df9a93b 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -13,6 +13,7 @@ #include #include +#include #define CREATE_TRACE_POINTS #include @@ -218,6 +219,7 @@ int hl_device_open(struct inode *inode, struct file *filp) hl_debugfs_add_file(hpriv); + vfree(hdev->captured_err_info.page_fault_info.user_mappings); memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info)); atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); hdev->captured_err_info.undef_opcode.write_enable = true; -- cgit v1.2.3 From dcfce96ee834de6f9223670f3d2e1440559fb9b6 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 1 May 2023 13:19:57 +0300 Subject: accel/habanalabs: align to latest firmware specs Update the firmware common interface files with the latest version. Signed-off-by: Oded Gabbay Reviewed-by: Ofir Bitton --- drivers/accel/habanalabs/include/common/cpucp_if.h | 18 +++++----- .../accel/habanalabs/include/common/hl_boot_if.h | 41 ++++------------------ 2 files changed, 16 insertions(+), 43 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/include/common/cpucp_if.h b/drivers/accel/habanalabs/include/common/cpucp_if.h index f68308cc2524..33807b839c37 100644 --- a/drivers/accel/habanalabs/include/common/cpucp_if.h +++ b/drivers/accel/habanalabs/include/common/cpucp_if.h @@ -359,7 +359,7 @@ struct hl_eq_entry { union { __le64 data_placeholder; struct hl_eq_ecc_data ecc_data; - struct hl_eq_hbm_ecc_data hbm_ecc_data; /* Gaudi1 HBM */ + struct hl_eq_hbm_ecc_data hbm_ecc_data; /* Obsolete */ struct hl_eq_sm_sei_data sm_sei_data; struct cpucp_pkt_sync_err pkt_sync_err; struct hl_eq_fw_alive fw_alive; @@ -653,7 +653,7 @@ enum pq_init_status { * which address is passed via the CpuCp packet. In addition, the host's driver * passes the max size it allows the CpuCP to write to the structure, to prevent * data corruption in case of mismatched driver/FW versions. - * Relevant only to Gaudi. + * Obsolete. * * CPUCP_PACKET_GENERIC_PASSTHROUGH - * Generic opcode for all firmware info that is only passed to host @@ -868,19 +868,19 @@ struct cpucp_array_data_packet { enum cpucp_led_index { CPUCP_LED0_INDEX = 0, CPUCP_LED1_INDEX, - CPUCP_LED2_INDEX + CPUCP_LED2_INDEX, + CPUCP_LED_MAX_INDEX = CPUCP_LED2_INDEX }; /* * enum cpucp_packet_rc - Error return code * @cpucp_packet_success -> in case of success. - * @cpucp_packet_invalid -> this is to support Goya and Gaudi platform. + * @cpucp_packet_invalid -> this is to support first generation platforms. * @cpucp_packet_fault -> in case of processing error like failing to * get device binding or semaphore etc. - * @cpucp_packet_invalid_pkt -> when cpucp packet is un-supported. This is - * supported Greco onwards. + * @cpucp_packet_invalid_pkt -> when cpucp packet is un-supported. * @cpucp_packet_invalid_params -> when checking parameter like length of buffer - * or attribute value etc. Supported Greco onwards. + * or attribute value etc. * @cpucp_packet_rc_max -> It indicates size of enum so should be at last. */ enum cpucp_packet_rc { @@ -1365,7 +1365,7 @@ struct cpucp_dev_info_signed { #define DCORE_MON_REGS_SZ 512 /* * struct dcore_monitor_regs_data - DCORE monitor regs data. - * the structure follows sync manager block layout. relevant only to Gaudi. + * the structure follows sync manager block layout. Obsolete. * @mon_pay_addrl: array of payload address low bits. * @mon_pay_addrh: array of payload address high bits. * @mon_pay_data: array of payload data. @@ -1380,7 +1380,7 @@ struct dcore_monitor_regs_data { __le32 mon_status[DCORE_MON_REGS_SZ]; }; -/* contains SM data for each SYNC_MNGR (relevant only to Gaudi) */ +/* contains SM data for each SYNC_MNGR (Obsolete) */ struct cpucp_monitor_dump { struct dcore_monitor_regs_data sync_mngr_w_s; struct dcore_monitor_regs_data sync_mngr_e_s; diff --git a/drivers/accel/habanalabs/include/common/hl_boot_if.h b/drivers/accel/habanalabs/include/common/hl_boot_if.h index c58d76a2705c..cff79f7f9f75 100644 --- a/drivers/accel/habanalabs/include/common/hl_boot_if.h +++ b/drivers/accel/habanalabs/include/common/hl_boot_if.h @@ -35,6 +35,7 @@ enum cpu_boot_err { CPU_BOOT_ERR_TPM_FAIL = 20, CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL = 21, CPU_BOOT_ERR_EEPROM_FAIL = 22, + CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL = 23, CPU_BOOT_ERR_ENABLED = 31, CPU_BOOT_ERR_SCND_EN = 63, CPU_BOOT_ERR_LAST = 64 /* we have 2 registers of 32 bits */ @@ -51,6 +52,7 @@ enum cpu_boot_err { (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) | \ (1 << CPU_BOOT_ERR_BINNING_FAIL) | \ (1 << CPU_BOOT_ERR_DRAM_SKIPPED) | \ + (1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) | \ (1 << CPU_BOOT_ERR_EEPROM_FAIL)) /* @@ -132,6 +134,9 @@ enum cpu_boot_err { * CPU_BOOT_ERR_EEPROM_FAIL Failed reading EEPROM data. Defaults * are used. * + * CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL Failed scrubbing the Engines/ARCFarm + * memories. Boot disabled until reset. + * * CPU_BOOT_ERR0_ENABLED Error registers enabled. * This is a main indication that the * running FW populates the error @@ -157,6 +162,7 @@ enum cpu_boot_err { #define CPU_BOOT_ERR0_TPM_FAIL (1 << CPU_BOOT_ERR_TPM_FAIL) #define CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL (1 << CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL) #define CPU_BOOT_ERR0_EEPROM_FAIL (1 << CPU_BOOT_ERR_EEPROM_FAIL) +#define CPU_BOOT_ERR0_ENG_ARC_MEM_SCRUB_FAIL (1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) #define CPU_BOOT_ERR0_ENABLED (1 << CPU_BOOT_ERR_ENABLED) #define CPU_BOOT_ERR1_ENABLED (1 << CPU_BOOT_ERR_ENABLED) @@ -744,36 +750,6 @@ struct comms_status { }; }; -/** - * HL_MODULES_MAX_NUM is determined by the size of modules_mask in struct - * hl_component_versions - */ -enum hl_modules { - HL_MODULES_BOOT_INFO = 0, - HL_MODULES_EEPROM, - HL_MODULES_FDT, - HL_MODULES_I2C, - HL_MODULES_LZ4, - HL_MODULES_MBEDTLS, - HL_MODULES_MAX_NUM = 16 -}; - -/** - * HL_COMPONENTS_MAX_NUM is determined by the size of components_mask in - * struct cpucp_versions - */ -enum hl_components { - HL_COMPONENTS_PID = 0, - HL_COMPONENTS_MGMT, - HL_COMPONENTS_PREBOOT, - HL_COMPONENTS_PPBOOT, - HL_COMPONENTS_ARMCP, - HL_COMPONENTS_CPLD, - HL_COMPONENTS_UBOOT, - HL_COMPONENTS_FUSE, - HL_COMPONENTS_MAX_NUM = 16 -}; - #define NAME_MAX_LEN 32 /* bytes */ struct hl_module_data { __u8 name[NAME_MAX_LEN]; @@ -787,8 +763,6 @@ struct hl_module_data { * @component: version of the component itself. * @fw_os: Firmware OS Version. * @comp_name: Name of the component. - * @modules_mask: i'th bit (from LSB) is a flag - on if module i in enum - * hl_modules is used. * @modules_counter: number of set bits in modules_mask. * @reserved: reserved for future use. * @modules: versions of the component's modules. Elborated explanation in @@ -800,9 +774,8 @@ struct hl_component_versions { __u8 component[VERSION_MAX_LEN]; __u8 fw_os[VERSION_MAX_LEN]; __u8 comp_name[NAME_MAX_LEN]; - __le16 modules_mask; __u8 modules_counter; - __u8 reserved[1]; + __u8 reserved[3]; struct hl_module_data modules[]; }; -- cgit v1.2.3 From adda800c049a4580aaec4d710a9972cdad5b14e3 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 3 May 2023 14:47:54 +0300 Subject: accel/habanalabs: print max timeout value on CS stuck If a workload got stuck, we print an error to the kernel log about it. Add to that print the configured max timeout value, as that value is not fixed between ASICs and in addition it can be configured using a kernel module parameter. Signed-off-by: Oded Gabbay Reviewed-by: Ofir Bitton --- .../accel/habanalabs/common/command_submission.c | 26 +++++++++++++--------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index 4ff3256fdf8b..cc205bad476e 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -804,12 +804,14 @@ out: static void cs_timedout(struct work_struct *work) { + struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work); + bool skip_reset_on_timeout, device_reset = false; struct hl_device *hdev; u64 event_mask = 0x0; + uint timeout_sec; int rc; - struct hl_cs *cs = container_of(work, struct hl_cs, - work_tdr.work); - bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false; + + skip_reset_on_timeout = cs->skip_reset_on_timeout; rc = cs_get_unless_zero(cs); if (!rc) @@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work) event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT; } + timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000; + switch (cs->type) { case CS_TYPE_SIGNAL: dev_err(hdev->dev, - "Signal command submission %llu has not finished in time!\n", - cs->sequence); + "Signal command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; case CS_TYPE_WAIT: dev_err(hdev->dev, - "Wait command submission %llu has not finished in time!\n", - cs->sequence); + "Wait command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; case CS_TYPE_COLLECTIVE_WAIT: dev_err(hdev->dev, - "Collective Wait command submission %llu has not finished in time!\n", - cs->sequence); + "Collective Wait command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; default: dev_err(hdev->dev, - "Command submission %llu has not finished in time!\n", - cs->sequence); + "Command submission %llu has not finished in %u seconds!\n", + cs->sequence, timeout_sec); break; } -- cgit v1.2.3 From b2d61fecb443c10f116fa00a1b9b5677a5822e52 Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Tue, 2 May 2023 17:28:56 +0300 Subject: accel/habanalabs: upon DMA errors, use FW-extracted error cause Initially, the driver used to read the error cause data directly from the ASIC. However, the FW now clears it before the driver could read it. Therefore we should use the error cause data that is extracted by the FW. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 37 +++++++------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index e900017f4ff7..b8644d87f817 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -8807,13 +8807,13 @@ static int gaudi2_handle_kdma_core_event(struct hl_device *hdev, u16 event_type, return error_count; } -static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type, int sts_addr) +static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type, u64 intr_cause) { - u32 error_count = 0, sts_val = RREG32(sts_addr); + u32 error_count = 0; int i; for (i = 0 ; i < GAUDI2_NUM_OF_DMA_CORE_INTR_CAUSE ; i++) - if (sts_val & BIT(i)) { + if (intr_cause & BIT(i)) { gaudi2_print_event(hdev, event_type, true, "err cause: %s", gaudi2_dma_core_interrupts_cause[i]); error_count++; @@ -8824,27 +8824,6 @@ static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type, return error_count; } -static int gaudi2_handle_pdma_core_event(struct hl_device *hdev, u16 event_type, int pdma_idx) -{ - u32 sts_addr; - - sts_addr = mmPDMA0_CORE_ERR_CAUSE + pdma_idx * PDMA_OFFSET; - return gaudi2_handle_dma_core_event(hdev, event_type, sts_addr); -} - -static int gaudi2_handle_edma_core_event(struct hl_device *hdev, u16 event_type, int edma_idx) -{ - static const int edma_event_index_map[] = {2, 3, 0, 1, 6, 7, 4, 5}; - u32 sts_addr, index; - - index = edma_event_index_map[edma_idx]; - - sts_addr = mmDCORE0_EDMA0_CORE_ERR_CAUSE + - DCORE_OFFSET * (index / NUM_OF_EDMA_PER_DCORE) + - DCORE_EDMA_OFFSET * (index % NUM_OF_EDMA_PER_DCORE); - return gaudi2_handle_dma_core_event(hdev, event_type, sts_addr); -} - static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev, u64 *event_mask) { u32 mstr_if_base_addr = mmPCIE_MSTR_RR_MSTR_IF_RR_SHRD_HBW_BASE, razwi_happened_addr; @@ -9725,19 +9704,19 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP: case GAUDI2_EVENT_KDMA0_CORE: error_count = gaudi2_handle_kdma_core_event(hdev, event_type, - le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); + le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI2_EVENT_HDMA2_CORE ... GAUDI2_EVENT_HDMA5_CORE: - index = event_type - GAUDI2_EVENT_HDMA2_CORE; - error_count = gaudi2_handle_edma_core_event(hdev, event_type, index); + error_count = gaudi2_handle_dma_core_event(hdev, event_type, + le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_PDMA0_CORE ... GAUDI2_EVENT_PDMA1_CORE: - index = event_type - GAUDI2_EVENT_PDMA0_CORE; - error_count = gaudi2_handle_pdma_core_event(hdev, event_type, index); + error_count = gaudi2_handle_dma_core_event(hdev, event_type, + le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; -- cgit v1.2.3 From 583f12a80dfb7997d59a42e8642019695f5aa15a Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 9 May 2023 14:02:49 +0300 Subject: accel/habanalabs: remove support for mmu disable As mmu disable mode is only used for bring-up stages, let's remove this option and all code related to it. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/command_buffer.c | 6 -- .../accel/habanalabs/common/command_submission.c | 22 ++--- drivers/accel/habanalabs/common/debugfs.c | 23 +---- drivers/accel/habanalabs/common/habanalabs.h | 18 +--- drivers/accel/habanalabs/common/habanalabs_drv.c | 2 - drivers/accel/habanalabs/common/habanalabs_ioctl.c | 9 +- drivers/accel/habanalabs/common/memory.c | 104 +-------------------- drivers/accel/habanalabs/common/mmu/mmu.c | 56 ++--------- drivers/accel/habanalabs/gaudi/gaudi.c | 6 +- drivers/accel/habanalabs/goya/goya.c | 3 - drivers/accel/habanalabs/goya/goya_coresight.c | 9 +- 11 files changed, 26 insertions(+), 232 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/command_buffer.c b/drivers/accel/habanalabs/common/command_buffer.c index 6e09f48750a0..08f7aee42624 100644 --- a/drivers/accel/habanalabs/common/command_buffer.c +++ b/drivers/accel/habanalabs/common/command_buffer.c @@ -27,12 +27,6 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb) return -EINVAL; } - if (!hdev->mmu_enable) { - dev_err_ratelimited(hdev->dev, - "Cannot map CB because MMU is disabled\n"); - return -EINVAL; - } - if (cb->is_mmu_mapped) return 0; diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index cc205bad476e..c23829dab97a 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -280,14 +280,8 @@ bool cs_needs_timeout(struct hl_cs *cs) static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job) { - /* - * Patched CB is created for external queues jobs, and for H/W queues - * jobs if the user CB was allocated by driver and MMU is disabled. - */ - return (job->queue_type == QUEUE_TYPE_EXT || - (job->queue_type == QUEUE_TYPE_HW && - job->is_kernel_allocated_cb && - !hdev->mmu_enable)); + /* Patched CB is created for external queues jobs */ + return (job->queue_type == QUEUE_TYPE_EXT); } /* @@ -363,14 +357,13 @@ static void hl_complete_job(struct hl_device *hdev, struct hl_cs_job *job) } } - /* For H/W queue jobs, if a user CB was allocated by driver and MMU is - * enabled, the user CB isn't released in cs_parser() and thus should be + /* For H/W queue jobs, if a user CB was allocated by driver, + * the user CB isn't released in cs_parser() and thus should be * released here. This is also true for INT queues jobs which were * allocated by driver. */ - if ((job->is_kernel_allocated_cb && - ((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) || - job->queue_type == QUEUE_TYPE_INT))) { + if (job->is_kernel_allocated_cb && + (job->queue_type == QUEUE_TYPE_HW || job->queue_type == QUEUE_TYPE_INT)) { atomic_dec(&job->user_cb->cs_cnt); hl_cb_put(job->user_cb); } @@ -1951,8 +1944,7 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev, else cb_size = hdev->asic_funcs->get_signal_cb_size(hdev); - cb = hl_cb_kernel_create(hdev, cb_size, - q_type == QUEUE_TYPE_HW && hdev->mmu_enable); + cb = hl_cb_kernel_create(hdev, cb_size, q_type == QUEUE_TYPE_HW); if (!cb) { atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); atomic64_inc(&cntr->out_of_mem_drop_cnt); diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c index 29b78c7cf5de..9e84a47a21dc 100644 --- a/drivers/accel/habanalabs/common/debugfs.c +++ b/drivers/accel/habanalabs/common/debugfs.c @@ -255,9 +255,6 @@ static int vm_show(struct seq_file *s, void *data) u64 j; int i; - if (!dev_entry->hdev->mmu_enable) - return 0; - mutex_lock(&dev_entry->ctx_mem_hash_mutex); list_for_each_entry(ctx, &dev_entry->ctx_mem_hash_list, debugfs_list) { @@ -436,9 +433,6 @@ static int mmu_show(struct seq_file *s, void *data) u64 virt_addr = dev_entry->mmu_addr, phys_addr; int i; - if (!hdev->mmu_enable) - return 0; - if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID) ctx = hdev->kernel_ctx; else @@ -496,9 +490,6 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf, char *c; ssize_t rc; - if (!hdev->mmu_enable) - return count; - if (count > sizeof(kbuf) - 1) goto err; if (copy_from_user(kbuf, buf, count)) @@ -535,9 +526,6 @@ static int mmu_ack_error(struct seq_file *s, void *data) struct hl_device *hdev = dev_entry->hdev; int rc; - if (!hdev->mmu_enable) - return 0; - if (!dev_entry->mmu_cap_mask) { dev_err(hdev->dev, "mmu_cap_mask is not set\n"); goto err; @@ -563,9 +551,6 @@ static ssize_t mmu_ack_error_value_write(struct file *file, char kbuf[MMU_KBUF_SIZE]; ssize_t rc; - if (!hdev->mmu_enable) - return count; - if (count > sizeof(kbuf) - 1) goto err; @@ -661,9 +646,6 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr) { struct asic_fixed_properties *prop = &hdev->asic_prop; - if (!hdev->mmu_enable) - goto out; - if (prop->dram_supports_virtual_memory && (addr >= prop->dmmu.start_addr && addr < prop->dmmu.end_addr)) return true; @@ -675,7 +657,7 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr) if (addr >= prop->pmmu_huge.start_addr && addr < prop->pmmu_huge.end_addr) return true; -out: + return false; } @@ -685,9 +667,6 @@ static bool hl_is_device_internal_memory_va(struct hl_device *hdev, u64 addr, struct asic_fixed_properties *prop = &hdev->asic_prop; u64 dram_start_addr, dram_end_addr; - if (!hdev->mmu_enable) - return false; - if (prop->dram_supports_virtual_memory) { dram_start_addr = prop->dmmu.start_addr; dram_end_addr = prop->dmmu.end_addr; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index ea0914d08bdc..e2341a75a4b7 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -115,18 +115,6 @@ enum hl_mmu_page_table_location { MMU_NUM_PGT_LOCATIONS /* num of PGT locations */ }; -/** - * enum hl_mmu_enablement - what mmu modules to enable - * @MMU_EN_NONE: mmu disabled. - * @MMU_EN_ALL: enable all. - * @MMU_EN_PMMU_ONLY: Enable only the PMMU leaving the DMMU disabled. - */ -enum hl_mmu_enablement { - MMU_EN_NONE = 0, - MMU_EN_ALL = 1, - MMU_EN_PMMU_ONLY = 3, /* N/A for Goya/Gaudi */ -}; - /* * HL_RSVD_SOBS 'sync stream' reserved sync objects per QMAN stream * HL_RSVD_MONS 'sync stream' reserved monitors per QMAN stream @@ -3319,7 +3307,7 @@ struct hl_reset_info { * @nic_ports_mask: Controls which NIC ports are enabled. Used only for testing. * @fw_components: Controls which f/w components to load to the device. There are multiple f/w * stages and sometimes we want to stop at a certain stage. Used only for testing. - * @mmu_enable: Whether to enable or disable the device MMU(s). Used only for testing. + * @mmu_disable: Disable the device MMU(s). Used only for testing. * @cpu_queues_enable: Whether to enable queues communication vs. the f/w. Used only for testing. * @pldm: Whether we are running in Palladium environment. Used only for testing. * @hard_reset_on_fw_events: Whether to do device hard-reset when a fatal event is received from @@ -3482,7 +3470,7 @@ struct hl_device { /* Parameters for bring-up to be upstreamed */ u64 nic_ports_mask; u64 fw_components; - u8 mmu_enable; + u8 mmu_disable; u8 cpu_queues_enable; u8 pldm; u8 hard_reset_on_fw_events; @@ -3827,8 +3815,6 @@ struct pgt_info *hl_mmu_hr_get_alloc_next_hop(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop); int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops, struct hl_hr_mmu_funcs *hr_func); -void hl_mmu_swap_out(struct hl_ctx *ctx); -void hl_mmu_swap_in(struct hl_ctx *ctx); int hl_mmu_if_set_funcs(struct hl_device *hdev); void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); void hl_mmu_v2_hr_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index 70fb2df9a93b..446f444a1c7e 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -307,7 +307,6 @@ static void set_driver_behavior_per_device(struct hl_device *hdev) { hdev->nic_ports_mask = 0; hdev->fw_components = FW_TYPE_ALL_TYPES; - hdev->mmu_enable = MMU_EN_ALL; hdev->cpu_queues_enable = 1; hdev->pldm = 0; hdev->hard_reset_on_fw_events = 1; @@ -382,7 +381,6 @@ static int fixup_device_params(struct hl_device *hdev) /* If CPU queues not enabled, no way to do heartbeat */ if (!hdev->cpu_queues_enable) hdev->heartbeat = 0; - fixup_device_params_per_asic(hdev, tmp_timeout); return 0; diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 4368e6c9a23a..9a8be9395fb2 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -62,7 +62,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) hw_ip.device_id = hdev->asic_funcs->get_pci_id(hdev); hw_ip.sram_base_address = prop->sram_user_base_address; hw_ip.dram_base_address = - hdev->mmu_enable && prop->dram_supports_virtual_memory ? + prop->dram_supports_virtual_memory ? prop->dmmu.start_addr : prop->dram_user_base_address; hw_ip.tpc_enabled_mask = prop->tpc_enabled_mask & 0xFF; hw_ip.tpc_enabled_mask_ext = prop->tpc_enabled_mask; @@ -71,11 +71,8 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) dram_available_size = prop->dram_size - dram_kmd_size; - if (hdev->mmu_enable == MMU_EN_ALL) - hw_ip.dram_size = DIV_ROUND_DOWN_ULL(dram_available_size, - prop->dram_page_size) * prop->dram_page_size; - else - hw_ip.dram_size = dram_available_size; + hw_ip.dram_size = DIV_ROUND_DOWN_ULL(dram_available_size, prop->dram_page_size) * + prop->dram_page_size; if (hw_ip.dram_size > PAGE_SIZE) hw_ip.dram_enabled = 1; diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c index a7b6a273ce21..4fc72a07d2f5 100644 --- a/drivers/accel/habanalabs/common/memory.c +++ b/drivers/accel/habanalabs/common/memory.c @@ -1034,30 +1034,6 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr, } } -static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args, - u64 *paddr) -{ - struct hl_device *hdev = ctx->hdev; - struct hl_vm *vm = &hdev->vm; - struct hl_vm_phys_pg_pack *phys_pg_pack; - u32 handle; - - handle = lower_32_bits(args->map_device.handle); - spin_lock(&vm->idr_lock); - phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle); - if (!phys_pg_pack) { - spin_unlock(&vm->idr_lock); - dev_err(hdev->dev, "no match for handle %u\n", handle); - return -EINVAL; - } - - *paddr = phys_pg_pack->pages[0]; - - spin_unlock(&vm->idr_lock); - - return 0; -} - /** * map_device_va() - map the given memory. * @ctx: pointer to the context structure. @@ -2094,76 +2070,6 @@ err_free_dmabuf_wrapper: return rc; } -static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args) -{ - struct hl_device *hdev = hpriv->hdev; - u64 block_handle, device_addr = 0; - struct hl_ctx *ctx = hpriv->ctx; - u32 handle = 0, block_size; - int rc; - - switch (args->in.op) { - case HL_MEM_OP_ALLOC: - if (args->in.alloc.mem_size == 0) { - dev_err(hdev->dev, "alloc size must be larger than 0\n"); - rc = -EINVAL; - goto out; - } - - /* Force contiguous as there are no real MMU - * translations to overcome physical memory gaps - */ - args->in.flags |= HL_MEM_CONTIGUOUS; - rc = alloc_device_memory(ctx, &args->in, &handle); - - memset(args, 0, sizeof(*args)); - args->out.handle = (__u64) handle; - break; - - case HL_MEM_OP_FREE: - rc = free_device_memory(ctx, &args->in); - break; - - case HL_MEM_OP_MAP: - if (args->in.flags & HL_MEM_USERPTR) { - dev_err(hdev->dev, "Failed to map host memory when MMU is disabled\n"); - rc = -EPERM; - } else { - rc = get_paddr_from_handle(ctx, &args->in, &device_addr); - memset(args, 0, sizeof(*args)); - args->out.device_virt_addr = device_addr; - } - - break; - - case HL_MEM_OP_UNMAP: - rc = 0; - break; - - case HL_MEM_OP_MAP_BLOCK: - rc = map_block(hdev, args->in.map_block.block_addr, &block_handle, &block_size); - args->out.block_handle = block_handle; - args->out.block_size = block_size; - break; - - case HL_MEM_OP_EXPORT_DMABUF_FD: - dev_err(hdev->dev, "Failed to export dma-buf object when MMU is disabled\n"); - rc = -EPERM; - break; - - case HL_MEM_OP_TS_ALLOC: - rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle); - break; - default: - dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n"); - rc = -EINVAL; - break; - } - -out: - return rc; -} - static void ts_buff_release(struct hl_mmap_mem_buf *buf) { struct hl_ts_buff *ts_buff = buf->private; @@ -2282,9 +2188,6 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data) return -EBUSY; } - if (!hdev->mmu_enable) - return mem_ioctl_no_mmu(hpriv, args); - switch (args->in.op) { case HL_MEM_OP_ALLOC: if (args->in.alloc.mem_size == 0) { @@ -2779,13 +2682,10 @@ int hl_vm_ctx_init(struct hl_ctx *ctx) atomic64_set(&ctx->dram_phys_mem, 0); /* - * - If MMU is enabled, init the ranges as usual. - * - If MMU is disabled, in case of host mapping, the returned address - * is the given one. * In case of DRAM mapping, the returned address is the physical * address of the memory related to the given handle. */ - if (!ctx->hdev->mmu_enable) + if (ctx->hdev->mmu_disable) return 0; dram_range_start = prop->dmmu.start_addr; @@ -2835,7 +2735,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx) struct hl_mem_in args; int i; - if (!hdev->mmu_enable) + if (hdev->mmu_disable) return; hl_debugfs_remove_ctx_mem_hash(hdev, ctx); diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c index f379e5b461a6..b2145716c605 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu.c +++ b/drivers/accel/habanalabs/common/mmu/mmu.c @@ -44,7 +44,7 @@ int hl_mmu_init(struct hl_device *hdev) { int rc = -EOPNOTSUPP; - if (!hdev->mmu_enable) + if (hdev->mmu_disable) return 0; mutex_init(&hdev->mmu_lock); @@ -82,7 +82,7 @@ fini_dr_mmu: */ void hl_mmu_fini(struct hl_device *hdev) { - if (!hdev->mmu_enable) + if (hdev->mmu_disable) return; if (hdev->mmu_func[MMU_DR_PGT].fini != NULL) @@ -107,7 +107,7 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx) struct hl_device *hdev = ctx->hdev; int rc = -EOPNOTSUPP; - if (!hdev->mmu_enable) + if (hdev->mmu_disable) return 0; if (hdev->mmu_func[MMU_DR_PGT].ctx_init != NULL) { @@ -145,7 +145,7 @@ void hl_mmu_ctx_fini(struct hl_ctx *ctx) { struct hl_device *hdev = ctx->hdev; - if (!hdev->mmu_enable) + if (hdev->mmu_disable) return; if (hdev->mmu_func[MMU_DR_PGT].ctx_fini != NULL) @@ -233,7 +233,7 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, bool flu u64 real_virt_addr; bool is_dram_addr; - if (!hdev->mmu_enable) + if (hdev->mmu_disable) return 0; is_dram_addr = hl_is_dram_va(hdev, virt_addr); @@ -301,7 +301,7 @@ int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_s bool is_dram_addr; - if (!hdev->mmu_enable) + if (hdev->mmu_disable) return 0; is_dram_addr = hl_is_dram_va(hdev, virt_addr); @@ -472,46 +472,6 @@ int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size) return rc; } -/* - * hl_mmu_swap_out - marks all mapping of the given ctx as swapped out - * - * @ctx: pointer to the context structure - * - */ -void hl_mmu_swap_out(struct hl_ctx *ctx) -{ - struct hl_device *hdev = ctx->hdev; - - if (!hdev->mmu_enable) - return; - - if (hdev->mmu_func[MMU_DR_PGT].swap_out != NULL) - hdev->mmu_func[MMU_DR_PGT].swap_out(ctx); - - if (hdev->mmu_func[MMU_HR_PGT].swap_out != NULL) - hdev->mmu_func[MMU_HR_PGT].swap_out(ctx); -} - -/* - * hl_mmu_swap_in - marks all mapping of the given ctx as swapped in - * - * @ctx: pointer to the context structure - * - */ -void hl_mmu_swap_in(struct hl_ctx *ctx) -{ - struct hl_device *hdev = ctx->hdev; - - if (!hdev->mmu_enable) - return; - - if (hdev->mmu_func[MMU_DR_PGT].swap_in != NULL) - hdev->mmu_func[MMU_DR_PGT].swap_in(ctx); - - if (hdev->mmu_func[MMU_HR_PGT].swap_in != NULL) - hdev->mmu_func[MMU_HR_PGT].swap_in(ctx); -} - static void hl_mmu_pa_page_with_offset(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops, u64 *phys_addr) @@ -594,7 +554,7 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, int pgt_residency, rc; bool is_dram_addr; - if (!hdev->mmu_enable) + if (hdev->mmu_disable) return -EOPNOTSUPP; prop = &hdev->asic_prop; @@ -625,7 +585,7 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, int hl_mmu_if_set_funcs(struct hl_device *hdev) { - if (!hdev->mmu_enable) + if (hdev->mmu_disable) return 0; switch (hdev->asic_type) { diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index a1697581c218..056e2ef44afb 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -1469,8 +1469,7 @@ static int gaudi_collective_wait_create_job(struct hl_device *hdev, } /* Allocate internal mapped CB for non patched CBs */ - cb = hl_cb_kernel_create(hdev, cb_size, - hdev->mmu_enable && !patched_cb); + cb = hl_cb_kernel_create(hdev, cb_size, !patched_cb); if (!cb) { atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); atomic64_inc(&cntr->out_of_mem_drop_cnt); @@ -3644,9 +3643,6 @@ static int gaudi_mmu_init(struct hl_device *hdev) u64 hop0_addr; int rc, i; - if (!hdev->mmu_enable) - return 0; - if (gaudi->hw_cap_initialized & HW_CAP_MMU) return 0; diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c index fb0ac9df841a..7c685e6075f6 100644 --- a/drivers/accel/habanalabs/goya/goya.c +++ b/drivers/accel/habanalabs/goya/goya.c @@ -2671,9 +2671,6 @@ int goya_mmu_init(struct hl_device *hdev) u64 hop0_addr; int rc, i; - if (!hdev->mmu_enable) - return 0; - if (goya->hw_cap_initialized & HW_CAP_MMU) return 0; diff --git a/drivers/accel/habanalabs/goya/goya_coresight.c b/drivers/accel/habanalabs/goya/goya_coresight.c index e7ac3046cfaa..a6d6cc38bcd8 100644 --- a/drivers/accel/habanalabs/goya/goya_coresight.c +++ b/drivers/accel/habanalabs/goya/goya_coresight.c @@ -371,13 +371,8 @@ static int goya_etr_validate_address(struct hl_device *hdev, u64 addr, return false; } - if (hdev->mmu_enable) { - range_start = prop->dmmu.start_addr; - range_end = prop->dmmu.end_addr; - } else { - range_start = prop->dram_user_base_address; - range_end = prop->dram_end_address; - } + range_start = prop->dmmu.start_addr; + range_end = prop->dmmu.end_addr; return hl_mem_area_inside_range(addr, size, range_start, range_end); } -- cgit v1.2.3 From dcc8fa88d46ef97ea9812735d0d54591bc16b242 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 8 May 2023 17:24:38 +0300 Subject: accel/habanalabs: use binning info when handling razwi When receiving sei interrupt from tpc or decoder, we need to check the binning mask because if the engine is binned, the razwi info won't be in the router of the binned engine, instead will be in the router of the substitute engine. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index b8644d87f817..a6aa17d86820 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -8040,7 +8040,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev, u8 module_sub_idx, u64 *event_mask) { bool via_sft = false; - u32 hbw_rtr_id, lbw_rtr_id, dcore_id, dcore_rtr_id, eng_id; + u32 hbw_rtr_id, lbw_rtr_id, dcore_id, dcore_rtr_id, eng_id, binned_idx; u64 hbw_rtr_mstr_if_base_addr, lbw_rtr_mstr_if_base_addr; u32 hbw_shrd_aw = 0, hbw_shrd_ar = 0; u32 lbw_shrd_aw = 0, lbw_shrd_ar = 0; @@ -8048,6 +8048,13 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev, switch (module) { case RAZWI_TPC: + sprintf(initiator_name, "TPC_%u", module_idx); + if (hdev->tpc_binning) { + binned_idx = __ffs(hdev->tpc_binning); + if (binned_idx == module_idx) + module_idx = TPC_ID_DCORE0_TPC6; + } + hbw_rtr_id = gaudi2_tpc_initiator_hbw_rtr_id[module_idx]; if (hl_is_fw_sw_ver_below(hdev, 1, 9) && @@ -8056,7 +8063,6 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev, lbw_rtr_id = DCORE0_RTR0; else lbw_rtr_id = gaudi2_tpc_initiator_lbw_rtr_id[module_idx]; - sprintf(initiator_name, "TPC_%u", module_idx); break; case RAZWI_MME: sprintf(initiator_name, "MME_%u", module_idx); @@ -8115,9 +8121,14 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev, sprintf(initiator_name, "NIC_%u", module_idx); break; case RAZWI_DEC: + sprintf(initiator_name, "DEC_%u", module_idx); + if (hdev->decoder_binning) { + binned_idx = __ffs(hdev->decoder_binning); + if (binned_idx == module_idx) + module_idx = DEC_ID_PCIE_VDEC1; + } hbw_rtr_id = gaudi2_dec_initiator_hbw_rtr_id[module_idx]; lbw_rtr_id = gaudi2_dec_initiator_lbw_rtr_id[module_idx]; - sprintf(initiator_name, "DEC_%u", module_idx); break; case RAZWI_ROT: hbw_rtr_id = gaudi2_rot_initiator_hbw_rtr_id[module_idx]; -- cgit v1.2.3 From 54381ee8099839c8163cee45b65d824ca76186a2 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 10 May 2023 13:34:28 +0300 Subject: accel/habanalabs: use lower QM in QM errors handling The QMAN GLBL_ERR_STS_4 register has indications for errors also in the lower CQ and the ARC CQ, and not just for errors in the lower CP. Modify the relevant define/struct and the related print to use "lower QM" instead of "lower CP". Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index a6aa17d86820..6e2561ead546 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -57,7 +57,7 @@ #define GAUDI2_NA_EVENT_CAUSE 0xFF #define GAUDI2_NUM_OF_QM_ERR_CAUSE 18 -#define GAUDI2_NUM_OF_QM_LCP_ERR_CAUSE 25 +#define GAUDI2_NUM_OF_LOWER_QM_ERR_CAUSE 25 #define GAUDI2_NUM_OF_QM_ARB_ERR_CAUSE 3 #define GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE 14 #define GAUDI2_NUM_OF_CPU_SEI_ERR_CAUSE 3 @@ -801,7 +801,7 @@ static const char * const gaudi2_qman_error_cause[GAUDI2_NUM_OF_QM_ERR_CAUSE] = "PQC L2H error" }; -static const char * const gaudi2_qman_lower_cp_error_cause[GAUDI2_NUM_OF_QM_LCP_ERR_CAUSE] = { +static const char * const gaudi2_lower_qman_error_cause[GAUDI2_NUM_OF_LOWER_QM_ERR_CAUSE] = { "RSVD0", "CQ AXI HBW error", "CP AXI HBW error", @@ -7895,8 +7895,8 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type continue; if (i == QMAN_STREAMS) { - snprintf(reg_desc, ARRAY_SIZE(reg_desc), "LowerCP"); - num_error_causes = GAUDI2_NUM_OF_QM_LCP_ERR_CAUSE; + snprintf(reg_desc, ARRAY_SIZE(reg_desc), "LowerQM"); + num_error_causes = GAUDI2_NUM_OF_LOWER_QM_ERR_CAUSE; } else { snprintf(reg_desc, ARRAY_SIZE(reg_desc), "stream%u", i); num_error_causes = GAUDI2_NUM_OF_QM_ERR_CAUSE; @@ -7907,7 +7907,7 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type gaudi2_print_event(hdev, event_type, true, "%s. err cause: %s", reg_desc, i == QMAN_STREAMS ? - gaudi2_qman_lower_cp_error_cause[j] : + gaudi2_lower_qman_error_cause[j] : gaudi2_qman_error_cause[j]); error_count++; } -- cgit v1.2.3 From 6092cedfff1180b0d0aea402a198397fff609c71 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 10 May 2023 13:28:05 +0300 Subject: accel/habanalabs: print qman data on error only for lower qman By default, the upper QMANs are not used, and instead engines ARCs access the lower QMANs directly. Errors for upper QMANs are therefore not expected, and the debug print of the PQ entries is not needed. Modify the QMAN debug data print on errors to include only information for the lower QMAN. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 146 +++------------------ drivers/accel/habanalabs/gaudi2/gaudi2P.h | 2 +- .../include/gaudi2/asic_reg/gaudi2_regs.h | 11 ++ 3 files changed, 31 insertions(+), 128 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 6e2561ead546..4981b8eb0ff5 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -7744,137 +7744,28 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, return !!ecc_data->is_critical; } -/* - * gaudi2_queue_idx_dec - decrement queue index (pi/ci) and handle wrap - * - * @idx: the current pi/ci value - * @q_len: the queue length (power of 2) - * - * @return the cyclically decremented index - */ -static inline u32 gaudi2_queue_idx_dec(u32 idx, u32 q_len) -{ - u32 mask = q_len - 1; - - /* - * modular decrement is equivalent to adding (queue_size -1) - * later we take LSBs to make sure the value is in the - * range [0, queue_len - 1] - */ - return (idx + q_len - 1) & mask; -} - -/** - * gaudi2_print_sw_config_stream_data - print SW config stream data - * - * @hdev: pointer to the habanalabs device structure - * @stream: the QMAN's stream - * @qman_base: base address of QMAN registers block - */ -static void gaudi2_print_sw_config_stream_data(struct hl_device *hdev, - u32 stream, u64 qman_base) +static void print_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base) { - u64 cq_ptr_lo, cq_ptr_hi, cq_tsize, cq_ptr; - u32 cq_ptr_lo_off, size; + u32 lo, hi, cq_ptr_size, arc_cq_ptr_size; + u64 cq_ptr, arc_cq_ptr, cp_current_inst; - cq_ptr_lo_off = mmDCORE0_TPC0_QM_CQ_PTR_LO_1 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0; - - cq_ptr_lo = qman_base + (mmDCORE0_TPC0_QM_CQ_PTR_LO_0 - mmDCORE0_TPC0_QM_BASE) + - stream * cq_ptr_lo_off; - - cq_ptr_hi = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_PTR_HI_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0); - - cq_tsize = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_TSIZE_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0); - - cq_ptr = (((u64) RREG32(cq_ptr_hi)) << 32) | RREG32(cq_ptr_lo); - size = RREG32(cq_tsize); - dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %x\n", - stream, cq_ptr, size); -} - -/** - * gaudi2_print_last_pqes_on_err - print last PQEs on error - * - * @hdev: pointer to the habanalabs device structure - * @qid_base: first QID of the QMAN (out of 4 streams) - * @stream: the QMAN's stream - * @qman_base: base address of QMAN registers block - * @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE) - */ -static void gaudi2_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base, u32 stream, - u64 qman_base, bool pr_sw_conf) -{ - u32 ci, qm_ci_stream_off; - struct hl_hw_queue *q; - u64 pq_ci; - int i; + lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET); + hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET); + cq_ptr = ((u64) hi) << 32 | lo; + cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET); - q = &hdev->kernel_queues[qid_base + stream]; - - qm_ci_stream_off = mmDCORE0_TPC0_QM_PQ_CI_1 - mmDCORE0_TPC0_QM_PQ_CI_0; - pq_ci = qman_base + (mmDCORE0_TPC0_QM_PQ_CI_0 - mmDCORE0_TPC0_QM_BASE) + - stream * qm_ci_stream_off; - - hdev->asic_funcs->hw_queues_lock(hdev); - - if (pr_sw_conf) - gaudi2_print_sw_config_stream_data(hdev, stream, qman_base); - - ci = RREG32(pq_ci); - - /* we should start printing form ci -1 */ - ci = gaudi2_queue_idx_dec(ci, HL_QUEUE_LENGTH); - - for (i = 0; i < PQ_FETCHER_CACHE_SIZE; i++) { - struct hl_bd *bd; - u64 addr; - u32 len; - - bd = q->kernel_address; - bd += ci; - - len = le32_to_cpu(bd->len); - /* len 0 means uninitialized entry- break */ - if (!len) - break; - - addr = le64_to_cpu(bd->ptr); - - dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n", - stream, ci, addr, len); - - /* get previous ci, wrap if needed */ - ci = gaudi2_queue_idx_dec(ci, HL_QUEUE_LENGTH); - } - - hdev->asic_funcs->hw_queues_unlock(hdev); -} - -/** - * print_qman_data_on_err - extract QMAN data on error - * - * @hdev: pointer to the habanalabs device structure - * @qid_base: first QID of the QMAN (out of 4 streams) - * @stream: the QMAN's stream - * @qman_base: base address of QMAN registers block - * - * This function attempt to extract as much data as possible on QMAN error. - * On upper CP print the SW config stream data and last 8 PQEs. - * On lower CP print SW config data and last PQEs of ALL 4 upper CPs - */ -static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base, u32 stream, u64 qman_base) -{ - u32 i; - - if (stream != QMAN_STREAMS) { - gaudi2_print_last_pqes_on_err(hdev, qid_base, stream, qman_base, true); - return; - } + lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET); + hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET); + arc_cq_ptr = ((u64) hi) << 32 | lo; + arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET); - gaudi2_print_sw_config_stream_data(hdev, stream, qman_base); + lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET); + hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET); + cp_current_inst = ((u64) hi) << 32 | lo; - for (i = 0 ; i < QMAN_STREAMS ; i++) - gaudi2_print_last_pqes_on_err(hdev, qid_base, i, qman_base, false); + dev_info(hdev->dev, + "LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n", + cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst); } static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type, @@ -7912,7 +7803,8 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type error_count++; } - print_qman_data_on_err(hdev, qid_base, i, qman_base); + if (i == QMAN_STREAMS) + print_lower_qman_data_on_err(hdev, qman_base); } arb_err_val = RREG32(arb_err_addr); diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h index 1cebe707772e..5f3ce086928e 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h +++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h @@ -98,7 +98,7 @@ #define GAUDI2_DEFAULT_CARD_NAME "HL225" #define QMAN_STREAMS 4 -#define PQ_FETCHER_CACHE_SIZE 8 + #define NUM_OF_MME_SBTE_PORTS 5 #define NUM_OF_MME_WB_PORTS 2 diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h index 6c58af614236..a08378d0802b 100644 --- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h +++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h @@ -242,6 +242,17 @@ #define QM_FENCE2_OFFSET (mmPDMA0_QM_CP_FENCE2_RDATA_0 - mmPDMA0_QM_BASE) #define QM_SEI_STATUS_OFFSET (mmPDMA0_QM_SEI_STATUS - mmPDMA0_QM_BASE) +#define QM_CQ_PTR_LO_4_OFFSET (mmPDMA0_QM_CQ_PTR_LO_4 - mmPDMA0_QM_BASE) +#define QM_CQ_PTR_HI_4_OFFSET (mmPDMA0_QM_CQ_PTR_HI_4 - mmPDMA0_QM_BASE) +#define QM_CQ_TSIZE_4_OFFSET (mmPDMA0_QM_CQ_TSIZE_4 - mmPDMA0_QM_BASE) + +#define QM_ARC_CQ_PTR_LO_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_LO - mmPDMA0_QM_BASE) +#define QM_ARC_CQ_PTR_HI_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE) +#define QM_ARC_CQ_TSIZE_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE) + +#define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE) +#define QM_CP_CURRENT_INST_HI_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE) + #define SFT_OFFSET (mmSFT1_HBW_RTR_IF0_RTR_H3_BASE - mmSFT0_HBW_RTR_IF0_RTR_H3_BASE) #define SFT_IF_RTR_OFFSET (mmSFT0_HBW_RTR_IF1_RTR_H3_BASE - mmSFT0_HBW_RTR_IF0_RTR_H3_BASE) -- cgit v1.2.3 From 7e63f317c0c3eb72e745ce3509299800ef54d6de Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Wed, 10 May 2023 10:15:22 +0300 Subject: accel/habanalabs: update state when loading boot fit Any FW component we load must be followed by a corresponding state update. However, it seems that so far we skipped doing so for the bootfit case, so fix that. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index eb51d7f70aec..acbc1a6b5cb1 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -2486,16 +2486,6 @@ static int hl_fw_dynamic_load_image(struct hl_device *hdev, if (rc) goto release_fw; - /* update state according to boot stage */ - if (cur_fwc == FW_COMP_BOOT_FIT) { - struct cpu_dyn_regs *dyn_regs; - - dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs; - hl_fw_boot_fit_update_state(hdev, - le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), - le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); - } - /* copy boot fit to space allocated by FW */ rc = hl_fw_dynamic_copy_image(hdev, fw, fw_loader); if (rc) @@ -2798,6 +2788,14 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, goto protocol_err; } + rc = hl_fw_dynamic_wait_for_boot_fit_active(hdev, fw_loader); + if (rc) + goto protocol_err; + + hl_fw_boot_fit_update_state(hdev, + le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), + le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); + /* * when testing FW load (without Linux) on PLDM we don't want to * wait until boot fit is active as it may take several hours. @@ -2807,10 +2805,6 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, if (hdev->pldm && !(hdev->fw_components & FW_TYPE_LINUX)) return 0; - rc = hl_fw_dynamic_wait_for_boot_fit_active(hdev, fw_loader); - if (rc) - goto protocol_err; - /* Enable DRAM scrambling before Linux boot and after successful * UBoot */ @@ -2844,7 +2838,8 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, if (rc) goto protocol_err; - hl_fw_linux_update_state(hdev, le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), + hl_fw_linux_update_state(hdev, + le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); hl_fw_dynamic_update_linux_interrupt_if(hdev); -- cgit v1.2.3 From 5d658d0c5137f709e9888f2f3687afbde38b5ac0 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 8 May 2023 11:06:17 +0300 Subject: accel/habanalabs: mask part of hmmu page fault captured address When receiving page fault from hmmu, the captured address is scrambled both by HW and by driver. The driver part is unscrambled but the HW part isn't getting unscrambled. To avoid declaring wrong address, the HW scrambled part will be masked. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 4981b8eb0ff5..1cb2b72e1cd2 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -162,6 +162,9 @@ #define PSOC_RAZWI_ENG_STR_SIZE 128 #define PSOC_RAZWI_MAX_ENG_PER_RTR 5 +/* HW scrambles only bits 0-25 */ +#define HW_UNSCRAMBLED_BITS_MASK GENMASK_ULL(63, 26) + struct gaudi2_razwi_info { u32 axuser_xy; u32 rtr_ctrl; @@ -8835,11 +8838,16 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool addr <<= 32; addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE_VA)); - if (!is_pmmu) + if (is_pmmu) { + dev_err_ratelimited(hdev->dev, "PMMU page fault on va 0x%llx\n", addr); + } else { + addr = gaudi2_mmu_descramble_addr(hdev, addr); + addr &= HW_UNSCRAMBLED_BITS_MASK; + dev_err_ratelimited(hdev->dev, "HMMU page fault on va range 0x%llx - 0x%llx\n", + addr, addr + ~HW_UNSCRAMBLED_BITS_MASK); + } - dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n", - is_pmmu ? "PMMU" : "HMMU", addr); hl_handle_page_fault(hdev, addr, 0, is_pmmu, event_mask); WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0); -- cgit v1.2.3 From 569210233a31fa23932b269779144252ce7bfee9 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 22 May 2023 14:09:21 +0300 Subject: accel/habanalabs: remove sim code There were a few places where simulator only code got into the upstream. Remove those places that can confuse other developers. Fixes: 2a0a839b6a28 ("habanalabs: extend fatal messages to contain PCI info") Cc: Moti Haimovski Cc: Dan Carpenter Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 10 +++------- drivers/accel/habanalabs/common/habanalabs.h | 23 ++++------------------- 2 files changed, 7 insertions(+), 26 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index cab5a63db8c1..ea02f2cfdf81 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -2328,13 +2328,9 @@ out_disabled: hdev->disabled = true; if (expose_interfaces_on_err) cdev_sysfs_debugfs_add(hdev); - if (hdev->pdev) - dev_err(&hdev->pdev->dev, - "Failed to initialize hl%d. Device %s is NOT usable !\n", - hdev->cdev_idx, dev_name(&(hdev)->pdev->dev)); - else - pr_err("Failed to initialize hl%d. Device %s is NOT usable !\n", - hdev->cdev_idx, dev_name(&(hdev)->pdev->dev)); + dev_err(&hdev->pdev->dev, + "Failed to initialize hl%d. Device %s is NOT usable !\n", + hdev->cdev_idx, dev_name(&hdev->pdev->dev)); return rc; } diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index e2341a75a4b7..c5aa33eaa826 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -2558,12 +2558,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); ktime_t __timeout; \ u32 __elbi_read; \ int __rc = 0; \ - if (hdev->pdev) \ - __timeout = ktime_add_us(ktime_get(), timeout_us); \ - else \ - __timeout = ktime_add_us(ktime_get(),\ - min((u64)(timeout_us * 10), \ - (u64) HL_SIM_MAX_TIMEOUT_US)); \ + __timeout = ktime_add_us(ktime_get(), timeout_us); \ might_sleep_if(sleep_us); \ for (;;) { \ if (elbi) { \ @@ -2615,13 +2610,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); u8 __arr_idx; \ int __rc = 0; \ \ - if (hdev->pdev) \ - __timeout = ktime_add_us(ktime_get(), timeout_us); \ - else \ - __timeout = ktime_add_us(ktime_get(),\ - min(((u64)timeout_us * 10), \ - (u64) HL_SIM_MAX_TIMEOUT_US)); \ - \ + __timeout = ktime_add_us(ktime_get(), timeout_us); \ might_sleep_if(sleep_us); \ if (arr_size >= 64) \ __rc = -EINVAL; \ @@ -2679,12 +2668,8 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); mem_written_by_device) \ ({ \ ktime_t __timeout; \ - if (hdev->pdev) \ - __timeout = ktime_add_us(ktime_get(), timeout_us); \ - else \ - __timeout = ktime_add_us(ktime_get(),\ - min((u64)(timeout_us * 100), \ - (u64) HL_SIM_MAX_TIMEOUT_US)); \ + \ + __timeout = ktime_add_us(ktime_get(), timeout_us); \ might_sleep_if(sleep_us); \ for (;;) { \ /* Verify we read updates done by other cores or by device */ \ -- cgit v1.2.3 From 8a20b381644528794a78f791fd1a7ae87f3fc6ff Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 17 May 2023 20:47:44 +0300 Subject: accel/habanalabs: fix bug of not fetching addr_dec info addr_dec info should always be fetched, regardless of cause value. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 1cb2b72e1cd2..0d41adf4792c 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -8769,6 +8769,9 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_typ u32 error_count = 0; int i; + gaudi2_print_event(hdev, event_type, true, + "intr_cause_data: %#llx", intr_cause_data); + for (i = 0 ; i < GAUDI2_NUM_OF_PCIE_ADDR_DEC_ERR_CAUSE ; i++) { if (!(intr_cause_data & BIT_ULL(i))) continue; @@ -8781,10 +8784,11 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_typ * Always check for LBW and HBW additional info as the indication itself is * sometimes missing */ - hl_check_for_glbl_errors(hdev); - gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask); } + hl_check_for_glbl_errors(hdev); + gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask); + return error_count; } -- cgit v1.2.3 From ff5c702522bb01d8d28da98b508168b391600080 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 18 May 2023 10:59:38 +0300 Subject: accel/habanalabs: move ioctl error print to debug level We don't want to allow users to spam the kernel log and sending ioctls with bad opcodes is a sure way to do it. Signed-off-by: Oded Gabbay Reviewed-by: Ofir Bitton --- drivers/accel/habanalabs/common/habanalabs_ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 9a8be9395fb2..6a45a92344e9 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -1195,7 +1195,7 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg, out_err: if (retcode) - dev_dbg(dev, "error in ioctl: pid=%d, cmd=0x%02x, nr=0x%02x\n", + dev_dbg_ratelimited(dev, "error in ioctl: pid=%d, cmd=0x%02x, nr=0x%02x\n", task_pid_nr(current), cmd, nr); if (kdata != stack_kdata) @@ -1219,7 +1219,7 @@ long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) if ((nr >= HL_COMMAND_START) && (nr < HL_COMMAND_END)) { ioctl = &hl_ioctls[nr]; } else { - dev_err(hdev->dev, "invalid ioctl: pid=%d, nr=0x%02x\n", + dev_dbg_ratelimited(hdev->dev, "invalid ioctl: pid=%d, nr=0x%02x\n", task_pid_nr(current), nr); return -ENOTTY; } @@ -1242,7 +1242,7 @@ long hl_ioctl_control(struct file *filep, unsigned int cmd, unsigned long arg) if (nr == _IOC_NR(HL_IOCTL_INFO)) { ioctl = &hl_ioctls_control[nr]; } else { - dev_err(hdev->dev_ctrl, "invalid ioctl: pid=%d, nr=0x%02x\n", + dev_dbg_ratelimited(hdev->dev_ctrl, "invalid ioctl: pid=%d, nr=0x%02x\n", task_pid_nr(current), nr); return -ENOTTY; } -- cgit v1.2.3 From 19aa21b9807ad277972f7ebe8a8dd9fbf9063bc8 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 22 May 2023 08:52:46 +0300 Subject: accel/habanalabs: unsecure TSB_CFG_MTRR regs In order to utilize Engine Barrier padding, user must have access to this register set. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2_security.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c index fadb870ff4c0..2742b1f801eb 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c @@ -1534,6 +1534,10 @@ static const u32 gaudi2_pb_dcr0_tpc0_unsecured_regs[] = { mmDCORE0_TPC0_CFG_QM_KERNEL_CONFIG, mmDCORE0_TPC0_CFG_QM_KERNEL_ID, mmDCORE0_TPC0_CFG_QM_POWER_LOOP, + mmDCORE0_TPC0_CFG_TSB_CFG_MTRR_2_0, + mmDCORE0_TPC0_CFG_TSB_CFG_MTRR_2_1, + mmDCORE0_TPC0_CFG_TSB_CFG_MTRR_2_2, + mmDCORE0_TPC0_CFG_TSB_CFG_MTRR_2_3, mmDCORE0_TPC0_CFG_LUT_FUNC32_BASE2_ADDR_LO, mmDCORE0_TPC0_CFG_LUT_FUNC32_BASE2_ADDR_HI, mmDCORE0_TPC0_CFG_LUT_FUNC64_BASE2_ADDR_LO, -- cgit v1.2.3 From fac91dd54f3b6f5dbd20c1cb26e59eceb128ca42 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 21 May 2023 10:24:13 +0300 Subject: accel/habanalabs: add event queue extra validation In order to increase reliability of the event queue interface, we apply to Gaudi2 the same mechanism we have in Gaudi1. The extra validation is basically checking that the received event index matches the expected index. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/irq.c | 2 +- drivers/accel/habanalabs/gaudi2/gaudi2.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c index c67895b1cdeb..b1010d206c2e 100644 --- a/drivers/accel/habanalabs/common/irq.c +++ b/drivers/accel/habanalabs/common/irq.c @@ -430,7 +430,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg) cur_eqe_index = FIELD_GET(EQ_CTL_INDEX_MASK, cur_eqe); if ((hdev->event_queue.check_eqe_index) && (((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK) != cur_eqe_index)) { - dev_dbg(hdev->dev, + dev_err(hdev->dev, "EQE %#x in queue is ready but index does not match %d!=%d", cur_eqe, ((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK), diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 0d41adf4792c..20c4583f12b0 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -3619,6 +3619,12 @@ static int gaudi2_sw_init(struct hl_device *hdev) prop->supports_compute_reset = true; + /* Event queue sanity check added in FW version 1.11 */ + if (hl_is_fw_sw_ver_below(hdev, 1, 11)) + hdev->event_queue.check_eqe_index = false; + else + hdev->event_queue.check_eqe_index = true; + hdev->asic_funcs->set_pci_memory_regions(hdev); rc = gaudi2_special_blocks_iterator_config(hdev); -- cgit v1.2.3 From e6f49e96bc57d34fc0f617f37bfdf62a9b58d2c2 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 22 May 2023 17:15:36 +0300 Subject: accel/habanalabs: refactor error info reset Moved error info reset code to single function for future use from other places in the driver. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 8 ++++++++ drivers/accel/habanalabs/common/habanalabs.h | 1 + drivers/accel/habanalabs/common/habanalabs_drv.c | 5 +---- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers/accel') diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index ea02f2cfdf81..b97339d1f7c6 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -2689,3 +2689,11 @@ void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info) if (info->event_mask) *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR; } + +void hl_enable_err_info_capture(struct hl_error_info *captured_err_info) +{ + vfree(captured_err_info->page_fault_info.user_mappings); + memset(captured_err_info, 0, sizeof(struct hl_error_info)); + atomic_set(&captured_err_info->cs_timeout.write_enable, 1); + captured_err_info->undef_opcode.write_enable = true; +} diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index c5aa33eaa826..d92ba2e30e31 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3944,6 +3944,7 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_ u64 *event_mask); void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask); void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info); +void hl_enable_err_info_capture(struct hl_error_info *captured_err_info); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index 446f444a1c7e..7263e84c1a4d 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -219,10 +219,7 @@ int hl_device_open(struct inode *inode, struct file *filp) hl_debugfs_add_file(hpriv); - vfree(hdev->captured_err_info.page_fault_info.user_mappings); - memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info)); - atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); - hdev->captured_err_info.undef_opcode.write_enable = true; + hl_enable_err_info_capture(&hdev->captured_err_info); hdev->open_counter++; hdev->last_successful_open_jif = jiffies; -- cgit v1.2.3