summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd/kfd_events.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_events.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c316
1 files changed, 281 insertions, 35 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index afe72dd11325..deecccebe5b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
- * Copyright 2014 Advanced Micro Devices, Inc.
+ * Copyright 2014-2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -55,7 +56,6 @@ struct kfd_signal_page {
bool need_to_free_pages;
};
-
static uint64_t *page_slots(struct kfd_signal_page *page)
{
return page->kernel_address;
@@ -92,7 +92,8 @@ fail_alloc_signal_store:
}
static int allocate_event_notification_slot(struct kfd_process *p,
- struct kfd_event *ev)
+ struct kfd_event *ev,
+ const int *restore_id)
{
int id;
@@ -104,14 +105,19 @@ static int allocate_event_notification_slot(struct kfd_process *p,
p->signal_mapped_size = 256*8;
}
- /*
- * Compatibility with old user mode: Only use signal slots
- * user mode has mapped, may be less than
- * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase
- * of the event limit without breaking user mode.
- */
- id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8,
- GFP_KERNEL);
+ if (restore_id) {
+ id = idr_alloc(&p->event_idr, ev, *restore_id, *restore_id + 1,
+ GFP_KERNEL);
+ } else {
+ /*
+ * Compatibility with old user mode: Only use signal slots
+ * user mode has mapped, may be less than
+ * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase
+ * of the event limit without breaking user mode.
+ */
+ id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8,
+ GFP_KERNEL);
+ }
if (id < 0)
return id;
@@ -178,9 +184,8 @@ static struct kfd_event *lookup_signaled_event_by_partial_id(
return ev;
}
-static int create_signal_event(struct file *devkfd,
- struct kfd_process *p,
- struct kfd_event *ev)
+static int create_signal_event(struct file *devkfd, struct kfd_process *p,
+ struct kfd_event *ev, const int *restore_id)
{
int ret;
@@ -193,7 +198,7 @@ static int create_signal_event(struct file *devkfd,
return -ENOSPC;
}
- ret = allocate_event_notification_slot(p, ev);
+ ret = allocate_event_notification_slot(p, ev, restore_id);
if (ret) {
pr_warn("Signal event wasn't created because out of kernel memory\n");
return ret;
@@ -209,16 +214,22 @@ static int create_signal_event(struct file *devkfd,
return 0;
}
-static int create_other_event(struct kfd_process *p, struct kfd_event *ev)
+static int create_other_event(struct kfd_process *p, struct kfd_event *ev, const int *restore_id)
{
- /* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an
- * intentional integer overflow to -1 without a compiler
- * warning. idr_alloc treats a negative value as "maximum
- * signed integer".
- */
- int id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID,
- (uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1,
- GFP_KERNEL);
+ int id;
+
+ if (restore_id)
+ id = idr_alloc(&p->event_idr, ev, *restore_id, *restore_id + 1,
+ GFP_KERNEL);
+ else
+ /* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an
+ * intentional integer overflow to -1 without a compiler
+ * warning. idr_alloc treats a negative value as "maximum
+ * signed integer".
+ */
+ id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID,
+ (uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1,
+ GFP_KERNEL);
if (id < 0)
return id;
@@ -295,8 +306,8 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev)
return ev->type == KFD_EVENT_TYPE_SIGNAL;
}
-int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
- uint64_t size)
+static int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
+ uint64_t size, uint64_t user_handle)
{
struct kfd_signal_page *page;
@@ -315,10 +326,57 @@ int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
p->signal_page = page;
p->signal_mapped_size = size;
-
+ p->signal_handle = user_handle;
return 0;
}
+int kfd_kmap_event_page(struct kfd_process *p, uint64_t event_page_offset)
+{
+ struct kfd_dev *kfd;
+ struct kfd_process_device *pdd;
+ void *mem, *kern_addr;
+ uint64_t size;
+ int err = 0;
+
+ if (p->signal_page) {
+ pr_err("Event page is already set\n");
+ return -EINVAL;
+ }
+
+ pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(event_page_offset));
+ if (!pdd) {
+ pr_err("Getting device by id failed in %s\n", __func__);
+ return -EINVAL;
+ }
+ kfd = pdd->dev;
+
+ pdd = kfd_bind_process_to_device(kfd, p);
+ if (IS_ERR(pdd))
+ return PTR_ERR(pdd);
+
+ mem = kfd_process_device_translate_handle(pdd,
+ GET_IDR_HANDLE(event_page_offset));
+ if (!mem) {
+ pr_err("Can't find BO, offset is 0x%llx\n", event_page_offset);
+ return -EINVAL;
+ }
+
+ err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(kfd->adev,
+ mem, &kern_addr, &size);
+ if (err) {
+ pr_err("Failed to map event page to kernel\n");
+ return err;
+ }
+
+ err = kfd_event_page_set(p, kern_addr, size, event_page_offset);
+ if (err) {
+ pr_err("Failed to set event page\n");
+ amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(kfd->adev, mem);
+ return err;
+ }
+ return err;
+}
+
int kfd_event_create(struct file *devkfd, struct kfd_process *p,
uint32_t event_type, bool auto_reset, uint32_t node_id,
uint32_t *event_id, uint32_t *event_trigger_data,
@@ -343,14 +401,14 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
switch (event_type) {
case KFD_EVENT_TYPE_SIGNAL:
case KFD_EVENT_TYPE_DEBUG:
- ret = create_signal_event(devkfd, p, ev);
+ ret = create_signal_event(devkfd, p, ev, NULL);
if (!ret) {
*event_page_offset = KFD_MMAP_TYPE_EVENTS;
*event_slot_index = ev->event_id;
}
break;
default:
- ret = create_other_event(p, ev);
+ ret = create_other_event(p, ev, NULL);
break;
}
@@ -366,6 +424,166 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
return ret;
}
+int kfd_criu_restore_event(struct file *devkfd,
+ struct kfd_process *p,
+ uint8_t __user *user_priv_ptr,
+ uint64_t *priv_data_offset,
+ uint64_t max_priv_data_size)
+{
+ struct kfd_criu_event_priv_data *ev_priv;
+ struct kfd_event *ev = NULL;
+ int ret = 0;
+
+ ev_priv = kmalloc(sizeof(*ev_priv), GFP_KERNEL);
+ if (!ev_priv)
+ return -ENOMEM;
+
+ ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ if (!ev) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ if (*priv_data_offset + sizeof(*ev_priv) > max_priv_data_size) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ ret = copy_from_user(ev_priv, user_priv_ptr + *priv_data_offset, sizeof(*ev_priv));
+ if (ret) {
+ ret = -EFAULT;
+ goto exit;
+ }
+ *priv_data_offset += sizeof(*ev_priv);
+
+ if (ev_priv->user_handle) {
+ ret = kfd_kmap_event_page(p, ev_priv->user_handle);
+ if (ret)
+ goto exit;
+ }
+
+ ev->type = ev_priv->type;
+ ev->auto_reset = ev_priv->auto_reset;
+ ev->signaled = ev_priv->signaled;
+
+ init_waitqueue_head(&ev->wq);
+
+ mutex_lock(&p->event_mutex);
+ switch (ev->type) {
+ case KFD_EVENT_TYPE_SIGNAL:
+ case KFD_EVENT_TYPE_DEBUG:
+ ret = create_signal_event(devkfd, p, ev, &ev_priv->event_id);
+ break;
+ case KFD_EVENT_TYPE_MEMORY:
+ memcpy(&ev->memory_exception_data,
+ &ev_priv->memory_exception_data,
+ sizeof(struct kfd_hsa_memory_exception_data));
+
+ ret = create_other_event(p, ev, &ev_priv->event_id);
+ break;
+ case KFD_EVENT_TYPE_HW_EXCEPTION:
+ memcpy(&ev->hw_exception_data,
+ &ev_priv->hw_exception_data,
+ sizeof(struct kfd_hsa_hw_exception_data));
+
+ ret = create_other_event(p, ev, &ev_priv->event_id);
+ break;
+ }
+
+exit:
+ if (ret)
+ kfree(ev);
+
+ kfree(ev_priv);
+
+ mutex_unlock(&p->event_mutex);
+
+ return ret;
+}
+
+int kfd_criu_checkpoint_events(struct kfd_process *p,
+ uint8_t __user *user_priv_data,
+ uint64_t *priv_data_offset)
+{
+ struct kfd_criu_event_priv_data *ev_privs;
+ int i = 0;
+ int ret = 0;
+ struct kfd_event *ev;
+ uint32_t ev_id;
+
+ uint32_t num_events = kfd_get_num_events(p);
+
+ if (!num_events)
+ return 0;
+
+ ev_privs = kvzalloc(num_events * sizeof(*ev_privs), GFP_KERNEL);
+ if (!ev_privs)
+ return -ENOMEM;
+
+
+ idr_for_each_entry(&p->event_idr, ev, ev_id) {
+ struct kfd_criu_event_priv_data *ev_priv;
+
+ /*
+ * Currently, all events have same size of private_data, but the current ioctl's
+ * and CRIU plugin supports private_data of variable sizes
+ */
+ ev_priv = &ev_privs[i];
+
+ ev_priv->object_type = KFD_CRIU_OBJECT_TYPE_EVENT;
+
+ /* We store the user_handle with the first event */
+ if (i == 0 && p->signal_page)
+ ev_priv->user_handle = p->signal_handle;
+
+ ev_priv->event_id = ev->event_id;
+ ev_priv->auto_reset = ev->auto_reset;
+ ev_priv->type = ev->type;
+ ev_priv->signaled = ev->signaled;
+
+ if (ev_priv->type == KFD_EVENT_TYPE_MEMORY)
+ memcpy(&ev_priv->memory_exception_data,
+ &ev->memory_exception_data,
+ sizeof(struct kfd_hsa_memory_exception_data));
+ else if (ev_priv->type == KFD_EVENT_TYPE_HW_EXCEPTION)
+ memcpy(&ev_priv->hw_exception_data,
+ &ev->hw_exception_data,
+ sizeof(struct kfd_hsa_hw_exception_data));
+
+ pr_debug("Checkpointed event[%d] id = 0x%08x auto_reset = %x type = %x signaled = %x\n",
+ i,
+ ev_priv->event_id,
+ ev_priv->auto_reset,
+ ev_priv->type,
+ ev_priv->signaled);
+ i++;
+ }
+
+ ret = copy_to_user(user_priv_data + *priv_data_offset,
+ ev_privs, num_events * sizeof(*ev_privs));
+ if (ret) {
+ pr_err("Failed to copy events priv to user\n");
+ ret = -EFAULT;
+ }
+
+ *priv_data_offset += num_events * sizeof(*ev_privs);
+
+ kvfree(ev_privs);
+ return ret;
+}
+
+int kfd_get_num_events(struct kfd_process *p)
+{
+ struct kfd_event *ev;
+ uint32_t id;
+ u32 num_events = 0;
+
+ idr_for_each_entry(&p->event_idr, ev, id)
+ num_events++;
+
+ return num_events;
+}
+
/* Assumes that p is current. */
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id)
{
@@ -878,6 +1096,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid,
{
struct kfd_hsa_memory_exception_data memory_exception_data;
struct vm_area_struct *vma;
+ int user_gpu_id;
/*
* Because we are called from arbitrary context (workqueue) as opposed
@@ -899,12 +1118,17 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid,
return; /* Process is exiting */
}
+ user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+ if (unlikely(user_gpu_id == -EINVAL)) {
+ WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+ return;
+ }
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
mmap_read_lock(mm);
vma = find_vma(mm, address);
- memory_exception_data.gpu_id = dev->id;
+ memory_exception_data.gpu_id = user_gpu_id;
memory_exception_data.va = address;
/* Set failure reason */
memory_exception_data.failure.NotPresent = 1;
@@ -980,11 +1204,19 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
uint32_t id;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
struct kfd_hsa_memory_exception_data memory_exception_data;
+ int user_gpu_id;
if (!p)
return; /* Presumably process exited. */
+
+ user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+ if (unlikely(user_gpu_id == -EINVAL)) {
+ WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+ return;
+ }
+
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
- memory_exception_data.gpu_id = dev->id;
+ memory_exception_data.gpu_id = user_gpu_id;
memory_exception_data.failure.imprecise = true;
/* Set failure reason */
if (info) {
@@ -1024,27 +1256,34 @@ void kfd_signal_reset_event(struct kfd_dev *dev)
/* Whole gpu reset caused by GPU hang and memory is lost */
memset(&hw_exception_data, 0, sizeof(hw_exception_data));
- hw_exception_data.gpu_id = dev->id;
hw_exception_data.memory_lost = 1;
hw_exception_data.reset_cause = reset_cause;
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
- memory_exception_data.gpu_id = dev->id;
memory_exception_data.failure.imprecise = true;
idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+
+ if (unlikely(user_gpu_id == -EINVAL)) {
+ WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+ continue;
+ }
+
mutex_lock(&p->event_mutex);
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
idr_for_each_entry_continue(&p->event_idr, ev, id) {
if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
ev->hw_exception_data = hw_exception_data;
+ ev->hw_exception_data.gpu_id = user_gpu_id;
set_event(ev);
}
if (ev->type == KFD_EVENT_TYPE_MEMORY &&
reset_cause == KFD_HW_EXCEPTION_ECC) {
ev->memory_exception_data = memory_exception_data;
+ ev->memory_exception_data.gpu_id = user_gpu_id;
set_event(ev);
}
}
@@ -1060,18 +1299,25 @@ void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid)
struct kfd_hsa_hw_exception_data hw_exception_data;
struct kfd_event *ev;
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+ int user_gpu_id;
if (!p)
return; /* Presumably process exited. */
+ user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+ if (unlikely(user_gpu_id == -EINVAL)) {
+ WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+ return;
+ }
+
memset(&hw_exception_data, 0, sizeof(hw_exception_data));
- hw_exception_data.gpu_id = dev->id;
+ hw_exception_data.gpu_id = user_gpu_id;
hw_exception_data.memory_lost = 1;
hw_exception_data.reset_cause = KFD_HW_EXCEPTION_ECC;
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.ErrorType = KFD_MEM_ERR_POISON_CONSUMED;
- memory_exception_data.gpu_id = dev->id;
+ memory_exception_data.gpu_id = user_gpu_id;
memory_exception_data.failure.imprecise = true;
mutex_lock(&p->event_mutex);