summaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs/device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/habanalabs/device.c')
-rw-r--r--drivers/misc/habanalabs/device.c93
1 files changed, 69 insertions, 24 deletions
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 77d51be66c7e..91a9e47a3482 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -5,11 +5,14 @@
* All Rights Reserved.
*/
+#define pr_fmt(fmt) "habanalabs: " fmt
+
#include "habanalabs.h"
#include <linux/pci.h>
#include <linux/sched/signal.h>
#include <linux/hwmon.h>
+#include <uapi/misc/habanalabs.h>
#define HL_PLDM_PENDING_RESET_PER_SEC (HL_PENDING_RESET_PER_SEC * 10)
@@ -21,6 +24,20 @@ bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
return false;
}
+enum hl_device_status hl_device_status(struct hl_device *hdev)
+{
+ enum hl_device_status status;
+
+ if (hdev->disabled)
+ status = HL_DEVICE_STATUS_MALFUNCTION;
+ else if (atomic_read(&hdev->in_reset))
+ status = HL_DEVICE_STATUS_IN_RESET;
+ else
+ status = HL_DEVICE_STATUS_OPERATIONAL;
+
+ return status;
+};
+
static void hpriv_release(struct kref *ref)
{
struct hl_fpriv *hpriv;
@@ -498,11 +515,8 @@ disable_device:
return rc;
}
-static void hl_device_hard_reset_pending(struct work_struct *work)
+static void device_kill_open_processes(struct hl_device *hdev)
{
- struct hl_device_reset_work *device_reset_work =
- container_of(work, struct hl_device_reset_work, reset_work);
- struct hl_device *hdev = device_reset_work->hdev;
u16 pending_total, pending_cnt;
struct task_struct *task = NULL;
@@ -537,6 +551,12 @@ static void hl_device_hard_reset_pending(struct work_struct *work)
}
}
+ /* We killed the open users, but because the driver cleans up after the
+ * user contexts are closed (e.g. mmu mappings), we need to wait again
+ * to make sure the cleaning phase is finished before continuing with
+ * the reset
+ */
+
pending_cnt = pending_total;
while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {
@@ -552,6 +572,16 @@ static void hl_device_hard_reset_pending(struct work_struct *work)
mutex_unlock(&hdev->fd_open_cnt_lock);
+}
+
+static void device_hard_reset_pending(struct work_struct *work)
+{
+ struct hl_device_reset_work *device_reset_work =
+ container_of(work, struct hl_device_reset_work, reset_work);
+ struct hl_device *hdev = device_reset_work->hdev;
+
+ device_kill_open_processes(hdev);
+
hl_device_reset(hdev, true, true);
kfree(device_reset_work);
@@ -613,6 +643,8 @@ again:
if ((hard_reset) && (!from_hard_reset_thread)) {
struct hl_device_reset_work *device_reset_work;
+ hdev->hard_reset_pending = true;
+
if (!hdev->pdev) {
dev_err(hdev->dev,
"Reset action is NOT supported in simulator\n");
@@ -620,8 +652,6 @@ again:
goto out_err;
}
- hdev->hard_reset_pending = true;
-
device_reset_work = kzalloc(sizeof(*device_reset_work),
GFP_ATOMIC);
if (!device_reset_work) {
@@ -635,7 +665,7 @@ again:
* from a dedicated work
*/
INIT_WORK(&device_reset_work->reset_work,
- hl_device_hard_reset_pending);
+ device_hard_reset_pending);
device_reset_work->hdev = hdev;
schedule_work(&device_reset_work->reset_work);
@@ -663,17 +693,9 @@ again:
/* Go over all the queues, release all CS and their jobs */
hl_cs_rollback_all(hdev);
- if (hard_reset) {
- /* Release kernel context */
- if (hl_ctx_put(hdev->kernel_ctx) != 1) {
- dev_err(hdev->dev,
- "kernel ctx is alive during hard reset\n");
- rc = -EBUSY;
- goto out_err;
- }
-
+ /* Release kernel context */
+ if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
hdev->kernel_ctx = NULL;
- }
/* Reset the H/W. It will be in idle state after this returns */
hdev->asic_funcs->hw_fini(hdev, hard_reset);
@@ -688,16 +710,24 @@ again:
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
hl_cq_reset(hdev, &hdev->completion_queue[i]);
- /* Make sure the setup phase for the user context will run again */
+ /* Make sure the context switch phase will run again */
if (hdev->user_ctx) {
- atomic_set(&hdev->user_ctx->thread_restore_token, 1);
- hdev->user_ctx->thread_restore_wait_token = 0;
+ atomic_set(&hdev->user_ctx->thread_ctx_switch_token, 1);
+ hdev->user_ctx->thread_ctx_switch_wait_token = 0;
}
/* Finished tear-down, starting to re-initialize */
if (hard_reset) {
hdev->device_cpu_disabled = false;
+ hdev->hard_reset_pending = false;
+
+ if (hdev->kernel_ctx) {
+ dev_crit(hdev->dev,
+ "kernel ctx was alive during hard reset, something is terribly wrong\n");
+ rc = -EBUSY;
+ goto out_err;
+ }
/* Allocate the kernel context */
hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
@@ -752,8 +782,6 @@ again:
}
hl_set_max_power(hdev, hdev->max_power);
-
- hdev->hard_reset_pending = false;
} else {
rc = hdev->asic_funcs->soft_reset_late_init(hdev);
if (rc) {
@@ -1030,11 +1058,22 @@ void hl_device_fini(struct hl_device *hdev)
WARN(1, "Failed to remove device because reset function did not finish\n");
return;
}
- };
+ }
/* Mark device as disabled */
hdev->disabled = true;
+ /*
+ * Flush anyone that is inside the critical section of enqueue
+ * jobs to the H/W
+ */
+ hdev->asic_funcs->hw_queues_lock(hdev);
+ hdev->asic_funcs->hw_queues_unlock(hdev);
+
+ hdev->hard_reset_pending = true;
+
+ device_kill_open_processes(hdev);
+
hl_hwmon_fini(hdev);
device_late_fini(hdev);
@@ -1108,7 +1147,13 @@ int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr,
* either by the direct access of the device or by another core
*/
u32 *paddr = (u32 *) (uintptr_t) addr;
- ktime_t timeout = ktime_add_us(ktime_get(), timeout_us);
+ ktime_t timeout;
+
+ /* timeout should be longer when working with simulator */
+ if (!hdev->pdev)
+ timeout_us *= 10;
+
+ timeout = ktime_add_us(ktime_get(), timeout_us);
might_sleep();