summaryrefslogtreecommitdiff
path: root/drivers/accel/habanalabs
diff options
context:
space:
mode:
authorOfir Bitton <obitton@habana.ai>2023-05-23 10:42:19 +0300
committerOded Gabbay <ogabbay@kernel.org>2023-10-09 12:37:19 +0300
commita8ab1a81ccc2c68a4fa3d0631ce17529e208c8c2 (patch)
tree2cebd947db7981f369184b08c5929aa48cc41ca3 /drivers/accel/habanalabs
parent10926f60051332a754084d45862afc4e83e597e1 (diff)
downloadlinux-a8ab1a81ccc2c68a4fa3d0631ce17529e208c8c2.tar.xz
accel/habanalabs: add info ioctl for engine error reports
User gets notification for every engine error report, but he still lacks the exact engine information. Hence, we allow user to query for the exact engine reported an error. Signed-off-by: Ofir Bitton <obitton@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/accel/habanalabs')
-rw-r--r--drivers/accel/habanalabs/common/device.c14
-rw-r--r--drivers/accel/habanalabs/common/habanalabs.h17
-rw-r--r--drivers/accel/habanalabs/common/habanalabs_ioctl.c25
-rw-r--r--drivers/accel/habanalabs/gaudi2/gaudi2.c168
4 files changed, 224 insertions, 0 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 28be0fc325ea..80cce6b74d05 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -2701,6 +2701,20 @@ void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info)
*info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
}
+void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count)
+{
+ struct engine_err_info *info = &hdev->captured_err_info.engine_err;
+
+ /* Capture only the first engine error */
+ if (atomic_cmpxchg(&info->event_detected, 0, 1))
+ return;
+
+ info->event.timestamp = ktime_to_ns(ktime_get());
+ info->event.engine_id = engine_id;
+ info->event.error_count = error_count;
+ info->event_info_available = true;
+}
+
void hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
{
vfree(captured_err_info->page_fault_info.user_mappings);
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index e69b9b195f48..2bd3dedfa4b5 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -3063,6 +3063,20 @@ struct fw_err_info {
};
/**
+ * struct engine_err_info - engine error information.
+ * @event: holds information on the event.
+ * @event_detected: if set as 1, then an engine event was discovered for the
+ * first time after the driver has finished booting-up.
+ * @event_info_available: indicates that an engine event info is now available.
+ */
+struct engine_err_info {
+ struct hl_info_engine_err_event event;
+ atomic_t event_detected;
+ bool event_info_available;
+};
+
+
+/**
* struct hl_error_info - holds information collected during an error.
* @cs_timeout: CS timeout error information.
* @razwi_info: RAZWI information.
@@ -3070,6 +3084,7 @@ struct fw_err_info {
* @page_fault_info: page fault information.
* @hw_err: (fatal) hardware error information.
* @fw_err: firmware error information.
+ * @engine_err: engine error information.
*/
struct hl_error_info {
struct cs_timeout_info cs_timeout;
@@ -3078,6 +3093,7 @@ struct hl_error_info {
struct page_fault_info page_fault_info;
struct hw_err_info hw_err;
struct fw_err_info fw_err;
+ struct engine_err_info engine_err;
};
/**
@@ -3952,6 +3968,7 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_
u64 *event_mask);
void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask);
void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info);
+void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count);
void hl_enable_err_info_capture(struct hl_error_info *captured_err_info);
#ifdef CONFIG_DEBUG_FS
diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
index 549b2518fae0..097d65e493c8 100644
--- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
@@ -875,6 +875,28 @@ static int fw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
return rc ? -EFAULT : 0;
}
+static int engine_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer;
+ struct hl_device *hdev = hpriv->hdev;
+ u32 user_buf_size = args->return_size;
+ struct engine_err_info *info;
+ int rc;
+
+ if (!user_buf)
+ return -EINVAL;
+
+ info = &hdev->captured_err_info.engine_err;
+ if (!info->event_info_available)
+ return 0;
+
+ if (user_buf_size < sizeof(struct hl_info_engine_err_event))
+ return -ENOMEM;
+
+ rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_engine_err_event));
+ return rc ? -EFAULT : 0;
+}
+
static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args)
{
void __user *buff = (void __user *) (uintptr_t) info_args->return_pointer;
@@ -1001,6 +1023,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_FW_ERR_EVENT:
return fw_err_info(hpriv, args);
+ case HL_INFO_USER_ENGINE_ERR_EVENT:
+ return engine_err_info(hpriv, args);
+
case HL_INFO_DRAM_USAGE:
return dram_usage_info(hpriv, args);
default:
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 22a9aee9a7c9..c317a95c3b34 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -9589,6 +9589,171 @@ static int hl_arc_event_handle(struct hl_device *hdev, u16 event_type,
}
}
+static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)
+{
+ enum gaudi2_block_types type = GAUDI2_BLOCK_TYPE_MAX;
+ u16 index;
+
+ switch (event_type) {
+ case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP:
+ index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP;
+ type = GAUDI2_BLOCK_TYPE_TPC;
+ break;
+ case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_TPC24_QM:
+ index = event_type - GAUDI2_EVENT_TPC0_QM;
+ type = GAUDI2_BLOCK_TYPE_TPC;
+ break;
+ case GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_MME0_SPI_BASE ... GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID:
+ case GAUDI2_EVENT_MME0_QM:
+ index = 0;
+ type = GAUDI2_BLOCK_TYPE_MME;
+ break;
+ case GAUDI2_EVENT_MME1_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_MME1_SPI_BASE ... GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID:
+ case GAUDI2_EVENT_MME1_QM:
+ index = 1;
+ type = GAUDI2_BLOCK_TYPE_MME;
+ break;
+ case GAUDI2_EVENT_MME2_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME2_CTRL_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_MME2_SPI_BASE ... GAUDI2_EVENT_MME2_WAP_SOURCE_RESULT_INVALID:
+ case GAUDI2_EVENT_MME2_QM:
+ index = 2;
+ type = GAUDI2_BLOCK_TYPE_MME;
+ break;
+ case GAUDI2_EVENT_MME3_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME3_CTRL_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_MME3_SPI_BASE ... GAUDI2_EVENT_MME3_WAP_SOURCE_RESULT_INVALID:
+ case GAUDI2_EVENT_MME3_QM:
+ index = 3;
+ type = GAUDI2_BLOCK_TYPE_MME;
+ break;
+ case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP:
+ case GAUDI2_EVENT_KDMA_BM_SPMU:
+ case GAUDI2_EVENT_KDMA0_CORE:
+ return GAUDI2_ENGINE_ID_KDMA;
+ case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
+ case GAUDI2_EVENT_PDMA0_CORE:
+ case GAUDI2_EVENT_PDMA0_BM_SPMU:
+ case GAUDI2_EVENT_PDMA0_QM:
+ return GAUDI2_ENGINE_ID_PDMA_0;
+ case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
+ case GAUDI2_EVENT_PDMA1_CORE:
+ case GAUDI2_EVENT_PDMA1_BM_SPMU:
+ case GAUDI2_EVENT_PDMA1_QM:
+ return GAUDI2_ENGINE_ID_PDMA_1;
+ case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE:
+ index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE;
+ type = GAUDI2_BLOCK_TYPE_DEC;
+ break;
+ case GAUDI2_EVENT_DEC0_SPI ... GAUDI2_EVENT_DEC9_BMON_SPMU:
+ index = (event_type - GAUDI2_EVENT_DEC0_SPI) >> 1;
+ type = GAUDI2_BLOCK_TYPE_DEC;
+ break;
+ case GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE ... GAUDI2_EVENT_NIC11_AXI_ERROR_RESPONSE:
+ index = event_type - GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE;
+ return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2);
+ case GAUDI2_EVENT_NIC0_QM0 ... GAUDI2_EVENT_NIC11_QM1:
+ index = event_type - GAUDI2_EVENT_NIC0_QM0;
+ return GAUDI2_ENGINE_ID_NIC0_0 + index;
+ case GAUDI2_EVENT_NIC0_BMON_SPMU ... GAUDI2_EVENT_NIC11_SW_ERROR:
+ index = event_type - GAUDI2_EVENT_NIC0_BMON_SPMU;
+ return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2);
+ case GAUDI2_EVENT_TPC0_BMON_SPMU ... GAUDI2_EVENT_TPC24_KERNEL_ERR:
+ index = (event_type - GAUDI2_EVENT_TPC0_BMON_SPMU) >> 1;
+ type = GAUDI2_BLOCK_TYPE_TPC;
+ break;
+ case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_ROTATOR0_BMON_SPMU:
+ case GAUDI2_EVENT_ROTATOR0_ROT0_QM:
+ return GAUDI2_ENGINE_ID_ROT_0;
+ case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE:
+ case GAUDI2_EVENT_ROTATOR1_BMON_SPMU:
+ case GAUDI2_EVENT_ROTATOR1_ROT1_QM:
+ return GAUDI2_ENGINE_ID_ROT_1;
+ case GAUDI2_EVENT_HDMA0_BM_SPMU:
+ case GAUDI2_EVENT_HDMA0_QM:
+ case GAUDI2_EVENT_HDMA0_CORE:
+ return GAUDI2_DCORE0_ENGINE_ID_EDMA_0;
+ case GAUDI2_EVENT_HDMA1_BM_SPMU:
+ case GAUDI2_EVENT_HDMA1_QM:
+ case GAUDI2_EVENT_HDMA1_CORE:
+ return GAUDI2_DCORE0_ENGINE_ID_EDMA_1;
+ case GAUDI2_EVENT_HDMA2_BM_SPMU:
+ case GAUDI2_EVENT_HDMA2_QM:
+ case GAUDI2_EVENT_HDMA2_CORE:
+ return GAUDI2_DCORE1_ENGINE_ID_EDMA_0;
+ case GAUDI2_EVENT_HDMA3_BM_SPMU:
+ case GAUDI2_EVENT_HDMA3_QM:
+ case GAUDI2_EVENT_HDMA3_CORE:
+ return GAUDI2_DCORE1_ENGINE_ID_EDMA_1;
+ case GAUDI2_EVENT_HDMA4_BM_SPMU:
+ case GAUDI2_EVENT_HDMA4_QM:
+ case GAUDI2_EVENT_HDMA4_CORE:
+ return GAUDI2_DCORE2_ENGINE_ID_EDMA_0;
+ case GAUDI2_EVENT_HDMA5_BM_SPMU:
+ case GAUDI2_EVENT_HDMA5_QM:
+ case GAUDI2_EVENT_HDMA5_CORE:
+ return GAUDI2_DCORE2_ENGINE_ID_EDMA_1;
+ case GAUDI2_EVENT_HDMA6_BM_SPMU:
+ case GAUDI2_EVENT_HDMA6_QM:
+ case GAUDI2_EVENT_HDMA6_CORE:
+ return GAUDI2_DCORE3_ENGINE_ID_EDMA_0;
+ case GAUDI2_EVENT_HDMA7_BM_SPMU:
+ case GAUDI2_EVENT_HDMA7_QM:
+ case GAUDI2_EVENT_HDMA7_CORE:
+ return GAUDI2_DCORE3_ENGINE_ID_EDMA_1;
+ default:
+ break;
+ }
+
+ switch (type) {
+ case GAUDI2_BLOCK_TYPE_TPC:
+ switch (index) {
+ case TPC_ID_DCORE0_TPC0 ... TPC_ID_DCORE0_TPC5:
+ return GAUDI2_DCORE0_ENGINE_ID_TPC_0 + index;
+ case TPC_ID_DCORE1_TPC0 ... TPC_ID_DCORE1_TPC5:
+ return GAUDI2_DCORE1_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE1_TPC0;
+ case TPC_ID_DCORE2_TPC0 ... TPC_ID_DCORE2_TPC5:
+ return GAUDI2_DCORE2_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE2_TPC0;
+ case TPC_ID_DCORE3_TPC0 ... TPC_ID_DCORE3_TPC5:
+ return GAUDI2_DCORE3_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE3_TPC0;
+ default:
+ break;
+ }
+ break;
+ case GAUDI2_BLOCK_TYPE_MME:
+ switch (index) {
+ case MME_ID_DCORE0: return GAUDI2_DCORE0_ENGINE_ID_MME;
+ case MME_ID_DCORE1: return GAUDI2_DCORE1_ENGINE_ID_MME;
+ case MME_ID_DCORE2: return GAUDI2_DCORE2_ENGINE_ID_MME;
+ case MME_ID_DCORE3: return GAUDI2_DCORE3_ENGINE_ID_MME;
+ default:
+ break;
+ }
+ break;
+ case GAUDI2_BLOCK_TYPE_DEC:
+ switch (index) {
+ case DEC_ID_DCORE0_DEC0: return GAUDI2_DCORE0_ENGINE_ID_DEC_0;
+ case DEC_ID_DCORE0_DEC1: return GAUDI2_DCORE0_ENGINE_ID_DEC_1;
+ case DEC_ID_DCORE1_DEC0: return GAUDI2_DCORE1_ENGINE_ID_DEC_0;
+ case DEC_ID_DCORE1_DEC1: return GAUDI2_DCORE1_ENGINE_ID_DEC_1;
+ case DEC_ID_DCORE2_DEC0: return GAUDI2_DCORE2_ENGINE_ID_DEC_0;
+ case DEC_ID_DCORE2_DEC1: return GAUDI2_DCORE2_ENGINE_ID_DEC_1;
+ case DEC_ID_DCORE3_DEC0: return GAUDI2_DCORE3_ENGINE_ID_DEC_0;
+ case DEC_ID_DCORE3_DEC1: return GAUDI2_DCORE3_ENGINE_ID_DEC_1;
+ case DEC_ID_PCIE_VDEC0: return GAUDI2_PCIE_ENGINE_ID_DEC_0;
+ case DEC_ID_PCIE_VDEC1: return GAUDI2_PCIE_ENGINE_ID_DEC_1;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return U16_MAX;
+}
+
static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
{
struct gaudi2_device *gaudi2 = hdev->asic_specific;
@@ -10011,6 +10176,9 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
}
}
+ if (event_mask & HL_NOTIFIER_EVENT_USER_ENGINE_ERR)
+ hl_capture_engine_err(hdev, event_id_to_engine_id(hdev, event_type), error_count);
+
/* Make sure to dump an error in case no error cause was printed so far.
* Note that although we have counted the errors, we use this number as
* a boolean.