summaryrefslogtreecommitdiff
path: root/include/uapi/drm/habanalabs_accel.h
diff options
context:
space:
mode:
authorMoti Haimovski <mhaimovski@habana.ai>2023-01-10 18:35:31 +0300
committerOded Gabbay <ogabbay@kernel.org>2023-03-15 14:29:12 +0300
commit313e9f63b74419ca14c2c09f581a79c7037ee0e2 (patch)
tree4dbca4fcd9fd4357a7f1e98cf51a9b14af7676eb /include/uapi/drm/habanalabs_accel.h
parent09524eb8824e102fb57210967bc9d61d7469121c (diff)
downloadlinux-313e9f63b74419ca14c2c09f581a79c7037ee0e2.tar.xz
accel/habanalabs: add critical-event bit in notifier
Enhance the existing user notifications by adding a HW and FW critical event bits to be used when a HW or FW event occur that requires both SW abort and hard-resetting the chip. Signed-off-by: Moti Haimovski <mhaimovski@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org> Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
Diffstat (limited to 'include/uapi/drm/habanalabs_accel.h')
-rw-r--r--include/uapi/drm/habanalabs_accel.h43
1 files changed, 43 insertions, 0 deletions
diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h
index 331567ec9e79..3a62652a6452 100644
--- a/include/uapi/drm/habanalabs_accel.h
+++ b/include/uapi/drm/habanalabs_accel.h
@@ -723,6 +723,10 @@ enum hl_server_type {
* HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error
* HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened
* HL_NOTIFIER_EVENT_PAGE_FAULT - Indicates page fault happened
+ * HL_NOTIFIER_EVENT_CRITICAL_HW_ERR - Indicates a HW error that requires SW abort and
+ * HW reset
+ * HL_NOTIFIER_EVENT_CRITICAL_FW_ERR - Indicates a FW error that requires SW abort and
+ * HW reset
*/
#define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0)
#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1)
@@ -733,6 +737,8 @@ enum hl_server_type {
#define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6)
#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7)
#define HL_NOTIFIER_EVENT_PAGE_FAULT (1ULL << 8)
+#define HL_NOTIFIER_EVENT_CRITICL_HW_ERR (1ULL << 9)
+#define HL_NOTIFIER_EVENT_CRITICL_FW_ERR (1ULL << 10)
/* Opcode for management ioctl
*
@@ -790,6 +796,8 @@ enum hl_server_type {
* HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault.
* HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event.
* HL_INFO_FW_GENERIC_REQ - Send generic request to FW.
+ * HL_INFO_HW_ERR_EVENT - Retrieve information on the reported HW error.
+ * HL_INFO_FW_ERR_EVENT - Retrieve information on the reported FW error.
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@@ -824,6 +832,8 @@ enum hl_server_type {
#define HL_INFO_PAGE_FAULT_EVENT 33
#define HL_INFO_USER_MAPPINGS 34
#define HL_INFO_FW_GENERIC_REQ 35
+#define HL_INFO_HW_ERR_EVENT 36
+#define HL_INFO_FW_ERR_EVENT 37
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@@ -1162,6 +1172,39 @@ struct hl_info_undefined_opcode_event {
};
/**
+ * struct hl_info_hw_err_event - info about HW error
+ * @timestamp: timestamp of error occurrence
+ * @event_id: The async event ID (specific to each device type).
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_hw_err_event {
+ __s64 timestamp;
+ __u16 event_id;
+ __u16 pad[3];
+};
+
+/* FW error definition for event_type in struct hl_info_fw_err_event */
+enum hl_info_fw_err_type {
+ HL_INFO_FW_HEARTBEAT_ERR,
+ HL_INFO_FW_REPORTED_ERR,
+};
+
+/**
+ * struct hl_info_fw_err_event - info about FW error
+ * @timestamp: time-stamp of error occurrence
+ * @err_type: The type of event as defined in hl_info_fw_err_type.
+ * @event_id: The async event ID (specific to each device type, applicable only when event type is
+ * HL_INFO_FW_REPORTED_ERR).
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_fw_err_event {
+ __s64 timestamp;
+ __u16 err_type;
+ __u16 event_id;
+ __u32 pad;
+};
+
+/**
* struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information.
* @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size
* (e.g. 0x2100000 means that 1MB and 32MB pages are supported).