From 422ef171038d4855ffe938137039a8f3b3e84293 Mon Sep 17 00:00:00 2001 From: Tal Cohen Date: Thu, 28 Apr 2022 13:45:18 +0300 Subject: habanalabs: add support for notification via eventfd The driver will be able to send notification events towards a user process, using user's registered event file descriptor. The driver uses the notification mechanism to inform the user about an occurred event. A user thread can wait until a notification is received from the driver. The driver stores the occurred event until the user reads it, using HL_INFO_GET_EVENTS - new ioctl opcode in the INFO ioctl. Gaudi specific implementation includes sending a notification on a TPC assertion event that is received from f/w. Signed-off-by: Tal Cohen Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/common/device.c | 52 ++++++++++++++++++ drivers/misc/habanalabs/common/habanalabs.h | 40 +++++++++----- drivers/misc/habanalabs/common/habanalabs_drv.c | 9 ++++ drivers/misc/habanalabs/common/habanalabs_ioctl.c | 65 +++++++++++++++++++++++ drivers/misc/habanalabs/gaudi/gaudi.c | 14 ++++- 5 files changed, 167 insertions(+), 13 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 15df89b31e1b..315510aaca35 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -285,6 +285,14 @@ static void hpriv_release(struct kref *ref) hdev->compute_ctx_in_release = 0; + /* release the eventfd */ + if (hpriv->notifier_event.eventfd) { + eventfd_ctx_put(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + } + + mutex_destroy(&hpriv->notifier_event.lock); + kfree(hpriv); } @@ -355,6 +363,13 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp) list_del(&hpriv->dev_node); mutex_unlock(&hdev->fpriv_ctrl_list_lock); out: + /* release the eventfd */ + if (hpriv->notifier_event.eventfd) { + eventfd_ctx_put(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + } + + mutex_destroy(&hpriv->notifier_event.lock); put_pid(hpriv->taskpid); kfree(hpriv); @@ -1506,6 +1521,43 @@ out_err: return rc; } +static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event) +{ + mutex_lock(¬ifier_event->lock); + notifier_event->events_mask |= event; + if (notifier_event->eventfd) + eventfd_signal(notifier_event->eventfd, 1); + + mutex_unlock(¬ifier_event->lock); +} + +/* + * hl_notifier_event_send_all - notify all user processes via eventfd + * + * @hdev: pointer to habanalabs device structure + * @event: the occurred event + * Returns 0 for success or an error on failure. + */ +void hl_notifier_event_send_all(struct hl_device *hdev, u64 event) +{ + struct hl_fpriv *hpriv; + + mutex_lock(&hdev->fpriv_list_lock); + + list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) + hl_notifier_event_send(&hpriv->notifier_event, event); + + mutex_unlock(&hdev->fpriv_list_lock); + + /* control device */ + mutex_lock(&hdev->fpriv_ctrl_list_lock); + + list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node) + hl_notifier_event_send(&hpriv->notifier_event, event); + + mutex_unlock(&hdev->fpriv_ctrl_list_lock); +} + /* * hl_device_init - main initialization function for habanalabs device * diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 918e8a04acab..8977ec67dba7 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1932,6 +1933,18 @@ struct hl_debug_params { bool enable; }; +/** + * struct hl_notifier_event - holds the notifier data structure + * @eventfd: the event file descriptor to raise the notifications + * @lock: mutex lock to protect the notifier data flows + * @events_mask: indicates the bitmap events + */ +struct hl_notifier_event { + struct eventfd_ctx *eventfd; + struct mutex lock; + u64 events_mask; +}; + /* * FILE PRIVATE STRUCTURE */ @@ -1943,24 +1956,25 @@ struct hl_debug_params { * @taskpid: current process ID. * @ctx: current executing context. TODO: remove for multiple ctx per process * @ctx_mgr: context manager to handle multiple context for this FD. - * @cb_mgr: command buffer manager to handle multiple buffers for this FD. * @mem_mgr: manager descriptor for memory exportable via mmap + * @notifier_event: notifier eventfd towards user process * @debugfs_list: list of relevant ASIC debugfs. * @dev_node: node in the device list of file private data * @refcount: number of related contexts. * @restore_phase_mutex: lock for context switch and restore phase. */ struct hl_fpriv { - struct hl_device *hdev; - struct file *filp; - struct pid *taskpid; - struct hl_ctx *ctx; - struct hl_ctx_mgr ctx_mgr; - struct hl_mem_mgr mem_mgr; - struct list_head debugfs_list; - struct list_head dev_node; - struct kref refcount; - struct mutex restore_phase_mutex; + struct hl_device *hdev; + struct file *filp; + struct pid *taskpid; + struct hl_ctx *ctx; + struct hl_ctx_mgr ctx_mgr; + struct hl_mem_mgr mem_mgr; + struct hl_notifier_event notifier_event; + struct list_head debugfs_list; + struct list_head dev_node; + struct kref refcount; + struct mutex restore_phase_mutex; }; @@ -2676,8 +2690,8 @@ struct hl_reset_info { * @state_dump_specs: constants and dictionaries needed to dump system state. * @multi_cs_completion: array of multi-CS completion. * @clk_throttling: holds information about current/previous clock throttling events - * @reset_info: holds current device reset information. * @last_error: holds information about last session in which CS timeout or razwi error occurred. + * @reset_info: holds current device reset information. * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @fw_major_version: major version of current loaded preboot * @dram_used_mem: current DRAM memory consumption. @@ -3071,6 +3085,8 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization); int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sensors_arr); +void hl_notifier_event_send_all(struct hl_device *hdev, u64 event); + int hl_sysfs_init(struct hl_device *hdev); void hl_sysfs_fini(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 1210de39d661..c97173e9507d 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -134,6 +134,10 @@ int hl_device_open(struct inode *inode, struct file *filp) hpriv->hdev = hdev; filp->private_data = hpriv; hpriv->filp = filp; + hpriv->notifier_event.events_mask = 0; + hpriv->notifier_event.eventfd = 0; + + mutex_init(&hpriv->notifier_event.lock); mutex_init(&hpriv->restore_phase_mutex); kref_init(&hpriv->refcount); nonseekable_open(inode, filp); @@ -208,6 +212,7 @@ out_err: hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); filp->private_data = NULL; mutex_destroy(&hpriv->restore_phase_mutex); + mutex_destroy(&hpriv->notifier_event.lock); put_pid(hpriv->taskpid); kfree(hpriv); @@ -241,6 +246,10 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) hpriv->hdev = hdev; filp->private_data = hpriv; hpriv->filp = filp; + hpriv->notifier_event.events_mask = 0; + hpriv->notifier_event.eventfd = 0; + + mutex_init(&hpriv->notifier_event.lock); nonseekable_open(inode, filp); hpriv->taskpid = get_task_pid(current, PIDTYPE_PID); diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index bfb5cfe68110..d1ef56a8d3ac 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -116,6 +116,25 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate, return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0; } +static int events_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + int rc; + u32 max_size = args->return_size; + u64 events_mask; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((max_size < sizeof(u64)) || (!out)) + return -EINVAL; + + mutex_lock(&hpriv->notifier_event.lock); + events_mask = hpriv->notifier_event.events_mask; + hpriv->notifier_event.events_mask = 0; + mutex_unlock(&hpriv->notifier_event.lock); + + rc = copy_to_user(out, &events_mask, sizeof(u64)); + return rc; +} + static int dram_usage_info(struct hl_fpriv *hpriv, struct hl_info_args *args) { struct hl_device *hdev = hpriv->hdev; @@ -614,6 +633,43 @@ static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_ return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; } +static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + int rc; + + /* check if there is already a registered on that process */ + mutex_lock(&hpriv->notifier_event.lock); + if (hpriv->notifier_event.eventfd) { + mutex_unlock(&hpriv->notifier_event.lock); + return -EINVAL; + } + + hpriv->notifier_event.eventfd = eventfd_ctx_fdget(args->eventfd); + if (IS_ERR(hpriv->notifier_event.eventfd)) { + rc = PTR_ERR(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + mutex_unlock(&hpriv->notifier_event.lock); + return rc; + } + + mutex_unlock(&hpriv->notifier_event.lock); + return 0; +} + +static int eventfd_unregister(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + mutex_lock(&hpriv->notifier_event.lock); + if (!hpriv->notifier_event.eventfd) { + mutex_unlock(&hpriv->notifier_event.lock); + return -EINVAL; + } + + eventfd_ctx_put(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + mutex_unlock(&hpriv->notifier_event.lock); + return 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -667,6 +723,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES: return dev_mem_alloc_page_sizes_info(hpriv, args); + case HL_INFO_GET_EVENTS: + return events_info(hpriv, args); + default: break; } @@ -717,6 +776,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_DRAM_PENDING_ROWS: return dram_pending_rows_info(hpriv, args); + case HL_INFO_REGISTER_EVENTFD: + return eventfd_register(hpriv, args); + + case HL_INFO_UNREGISTER_EVENTFD: + return eventfd_unregister(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -EINVAL; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 08cd60300b4f..1c388537de33 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7879,7 +7879,6 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_MMU_PAGE_FAULT: case GAUDI_EVENT_MMU_WR_PERM: case GAUDI_EVENT_RAZWI_OR_ADC: - case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM: case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM: fallthrough; @@ -7899,6 +7898,19 @@ static void gaudi_handle_eqe(struct hl_device *hdev, hl_fw_unmask_irq(hdev, event_type); break; + case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: + gaudi_print_irq_info(hdev, event_type, true); + gaudi_handle_qman_err(hdev, event_type); + hl_fw_unmask_irq(hdev, event_type); + + /* In TPC QM event, notify on TPC assertion. While there isn't + * a specific event for assertion yet, the FW generates QM event. + * The SW upper layer will inspect an internal mapped area to indicate + * if the event is a tpc assertion or tpc QM. + */ + hl_notifier_event_send_all(hdev, HL_NOTIFIER_EVENT_TPC_ASSERT); + break; + case GAUDI_EVENT_RAZWI_OR_ADC_SW: gaudi_print_irq_info(hdev, event_type, true); goto reset_device; -- cgit v1.2.3