summaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs/common/debugfs.c
diff options
context:
space:
mode:
authorYuri Nudelman <ynudelman@habana.ai>2021-06-06 10:28:51 +0300
committerOded Gabbay <ogabbay@kernel.org>2021-08-29 09:47:46 +0300
commit938b793fdede518e10c67dc1dcfba1804faf98a6 (patch)
tree640575a63c1b0c204cdbacab5f2d0bb1c4cce3d7 /drivers/misc/habanalabs/common/debugfs.c
parente79e745b208b3c709cc5e689e587d04a4c0fe1bb (diff)
downloadlinux-938b793fdede518e10c67dc1dcfba1804faf98a6.tar.xz
habanalabs: expose state dump
To improve the user's ability to debug the case where a workload that is part of executing training/inference of a topology is getting stuck, we need to add a 'core dump' each time a CS times-out. The 'core dump' shall contain all relevant Sync Manager information and corresponding fence values. The most recent dumps shall be accessible via debugfs, under 'state_dump' node. Reading from the node will provide the oldest dump available. Writing an integer value X will discard X dumps, starting with the oldest one, i.e. subsequent read will now return newer dumps. Signed-off-by: Yuri Nudelman <ynudelman@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/misc/habanalabs/common/debugfs.c')
-rw-r--r--drivers/misc/habanalabs/common/debugfs.c93
1 files changed, 93 insertions, 0 deletions
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 77f7c2aa571d..51744e42b808 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -1043,6 +1043,60 @@ static ssize_t hl_security_violations_read(struct file *f, char __user *buf,
return 0;
}
+static ssize_t hl_state_dump_read(struct file *f, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+ ssize_t rc;
+
+ down_read(&entry->state_dump_sem);
+ if (!entry->state_dump[entry->state_dump_head])
+ rc = 0;
+ else
+ rc = simple_read_from_buffer(
+ buf, count, ppos,
+ entry->state_dump[entry->state_dump_head],
+ strlen(entry->state_dump[entry->state_dump_head]));
+ up_read(&entry->state_dump_sem);
+
+ return rc;
+}
+
+static ssize_t hl_state_dump_write(struct file *f, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+ struct hl_device *hdev = entry->hdev;
+ ssize_t rc;
+ u32 size;
+ int i;
+
+ rc = kstrtouint_from_user(buf, count, 10, &size);
+ if (rc)
+ return rc;
+
+ if (size <= 0 || size >= ARRAY_SIZE(entry->state_dump)) {
+ dev_err(hdev->dev, "Invalid number of dumps to skip\n");
+ return -EINVAL;
+ }
+
+ if (entry->state_dump[entry->state_dump_head]) {
+ down_write(&entry->state_dump_sem);
+ for (i = 0; i < size; ++i) {
+ vfree(entry->state_dump[entry->state_dump_head]);
+ entry->state_dump[entry->state_dump_head] = NULL;
+ if (entry->state_dump_head > 0)
+ entry->state_dump_head--;
+ else
+ entry->state_dump_head =
+ ARRAY_SIZE(entry->state_dump) - 1;
+ }
+ up_write(&entry->state_dump_sem);
+ }
+
+ return count;
+}
+
static const struct file_operations hl_data32b_fops = {
.owner = THIS_MODULE,
.read = hl_data_read32,
@@ -1110,6 +1164,12 @@ static const struct file_operations hl_security_violations_fops = {
.read = hl_security_violations_read
};
+static const struct file_operations hl_state_dump_fops = {
+ .owner = THIS_MODULE,
+ .read = hl_state_dump_read,
+ .write = hl_state_dump_write
+};
+
static const struct hl_info_list hl_debugfs_list[] = {
{"command_buffers", command_buffers_show, NULL},
{"command_submission", command_submission_show, NULL},
@@ -1172,6 +1232,7 @@ void hl_debugfs_add_device(struct hl_device *hdev)
INIT_LIST_HEAD(&dev_entry->userptr_list);
INIT_LIST_HEAD(&dev_entry->ctx_mem_hash_list);
mutex_init(&dev_entry->file_mutex);
+ init_rwsem(&dev_entry->state_dump_sem);
spin_lock_init(&dev_entry->cb_spinlock);
spin_lock_init(&dev_entry->cs_spinlock);
spin_lock_init(&dev_entry->cs_job_spinlock);
@@ -1283,6 +1344,12 @@ void hl_debugfs_add_device(struct hl_device *hdev)
dev_entry->root,
&hdev->skip_reset_on_timeout);
+ debugfs_create_file("state_dump",
+ 0600,
+ dev_entry->root,
+ dev_entry,
+ &hl_state_dump_fops);
+
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
debugfs_create_file(hl_debugfs_list[i].name,
0444,
@@ -1297,6 +1364,7 @@ void hl_debugfs_add_device(struct hl_device *hdev)
void hl_debugfs_remove_device(struct hl_device *hdev)
{
struct hl_dbg_device_entry *entry = &hdev->hl_debugfs;
+ int i;
debugfs_remove_recursive(entry->root);
@@ -1304,6 +1372,9 @@ void hl_debugfs_remove_device(struct hl_device *hdev)
vfree(entry->blob_desc.data);
+ for (i = 0; i < ARRAY_SIZE(entry->state_dump); ++i)
+ vfree(entry->state_dump[i]);
+
kfree(entry->entry_arr);
}
@@ -1416,6 +1487,28 @@ void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx)
spin_unlock(&dev_entry->ctx_mem_hash_spinlock);
}
+/**
+ * hl_debugfs_set_state_dump - register state dump making it accessible via
+ * debugfs
+ * @hdev: pointer to the device structure
+ * @data: the actual dump data
+ * @length: the length of the data
+ */
+void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data,
+ unsigned long length)
+{
+ struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
+
+ down_write(&dev_entry->state_dump_sem);
+
+ dev_entry->state_dump_head = (dev_entry->state_dump_head + 1) %
+ ARRAY_SIZE(dev_entry->state_dump);
+ vfree(dev_entry->state_dump[dev_entry->state_dump_head]);
+ dev_entry->state_dump[dev_entry->state_dump_head] = data;
+
+ up_write(&dev_entry->state_dump_sem);
+}
+
void __init hl_debugfs_init(void)
{
hl_debug_root = debugfs_create_dir("habanalabs", NULL);