summaryrefslogtreecommitdiff
path: root/drivers/gpu
diff options
context:
space:
mode:
authorJohn Harrison <John.C.Harrison@Intel.com>2023-04-18 21:17:44 +0300
committerJohn Harrison <John.C.Harrison@Intel.com>2023-05-16 22:26:48 +0300
commitf6eeea8d7097a82d1460537146dee670d5014f13 (patch)
tree2ea0cda2c734962c386ac3fdd1a7ad720fdca982 /drivers/gpu
parent6197cff30df44e4db85fed545fecb7df00ff8cd0 (diff)
downloadlinux-f6eeea8d7097a82d1460537146dee670d5014f13.tar.xz
drm/i915/guc: Dump error capture to dmesg on CTB error
In the past, there have been sporadic CTB failures which proved hard to reproduce manually. The most effective solution was to dump the GuC log at the point of failure and let the CI system do the repro. It is preferable not to dump the GuC log via dmesg for all issues as it is not always necessary and is not helpful for end users. But rather than trying to re-invent the code to do this each time it is wanted, commit the code but for DEBUG_GUC builds only. v2: Use IS_ENABLED for testing config options. Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20230418181744.3251240-3-John.C.Harrison@Intel.com
Diffstat (limited to 'drivers/gpu')
-rw-r--r--drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c53
-rw-r--r--drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h6
2 files changed, 59 insertions, 0 deletions
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index 99a0a89091e7..a22e33f37cae 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -13,6 +13,30 @@
#include "intel_guc_ct.h"
#include "intel_guc_print.h"
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+enum { /* Reason codes for a dead CTB; CT_DEAD() uses each as a bit index into dead_ct_reason */
+ CT_DEAD_ALIVE = 0, /* value dead_ct_reason is reset to when the CT is enabled */
+ CT_DEAD_SETUP, /* flagged after the "Failed to enable CTB" error */
+ CT_DEAD_WRITE, /* flagged at the WRITE call site, after a "Corrupted descriptor" error */
+ CT_DEAD_DEADLOCK, /* flagged just before ctbs.send is marked broken */
+ CT_DEAD_H2G_HAS_ROOM, /* flagged after GUC_CTB_STATUS_OVERFLOW is set on the descriptor */
+ CT_DEAD_READ, /* flagged at the READ call site, after a "Corrupted descriptor" error */
+ CT_DEAD_PROCESS_FAILED, /* flagged after the "Failed to process CT message" error */
+};
+
+static void ct_dead_ct_worker_func(struct work_struct *w);
+
+#define CT_DEAD(ct, reason) \
+ do { \
+ if (!(ct)->dead_ct_reported) { /* skip once the worker has already reported */ \
+ (ct)->dead_ct_reason |= 1 << CT_DEAD_##reason; /* accumulate reasons as a bitmask */ \
+ queue_work(system_unbound_wq, &(ct)->dead_ct_worker); /* the dump itself is done by the queued work item, not at the call site */ \
+ } \
+ } while (0)
+#else
+#define CT_DEAD(ct, reason) do { } while (0)
+#endif
+
static inline struct intel_guc *ct_to_guc(struct intel_guc_ct *ct)
{
return container_of(ct, struct intel_guc, ct);
@@ -93,6 +117,9 @@ void intel_guc_ct_init_early(struct intel_guc_ct *ct)
spin_lock_init(&ct->requests.lock);
INIT_LIST_HEAD(&ct->requests.pending);
INIT_LIST_HEAD(&ct->requests.incoming);
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+ INIT_WORK(&ct->dead_ct_worker, ct_dead_ct_worker_func);
+#endif
INIT_WORK(&ct->requests.worker, ct_incoming_request_worker_func);
tasklet_setup(&ct->receive_tasklet, ct_receive_tasklet_func);
init_waitqueue_head(&ct->wq);
@@ -319,11 +346,16 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
ct->enabled = true;
ct->stall_time = KTIME_MAX;
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+ ct->dead_ct_reported = false;
+ ct->dead_ct_reason = CT_DEAD_ALIVE;
+#endif
return 0;
err_out:
CT_PROBE_ERROR(ct, "Failed to enable CTB (%pe)\n", ERR_PTR(err));
+ CT_DEAD(ct, SETUP);
return err;
}
@@ -434,6 +466,7 @@ static int ct_write(struct intel_guc_ct *ct,
corrupted:
CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
desc->head, desc->tail, desc->status);
+ CT_DEAD(ct, WRITE);
ctb->broken = true;
return -EPIPE;
}
@@ -504,6 +537,7 @@ static inline bool ct_deadlocked(struct intel_guc_ct *ct)
CT_ERROR(ct, "Head: %u\n (Dwords)", ct->ctbs.recv.desc->head);
CT_ERROR(ct, "Tail: %u\n (Dwords)", ct->ctbs.recv.desc->tail);
+ CT_DEAD(ct, DEADLOCK);
ct->ctbs.send.broken = true;
}
@@ -552,6 +586,7 @@ static inline bool h2g_has_room(struct intel_guc_ct *ct, u32 len_dw)
head, ctb->size);
desc->status |= GUC_CTB_STATUS_OVERFLOW;
ctb->broken = true;
+ CT_DEAD(ct, H2G_HAS_ROOM);
return false;
}
@@ -914,6 +949,7 @@ corrupted:
CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
desc->head, desc->tail, desc->status);
ctb->broken = true;
+ CT_DEAD(ct, READ);
return -EPIPE;
}
@@ -1063,6 +1099,7 @@ static bool ct_process_incoming_requests(struct intel_guc_ct *ct)
if (unlikely(err)) {
CT_ERROR(ct, "Failed to process CT message (%pe) %*ph\n",
ERR_PTR(err), 4 * request->size, request->msg);
+ CT_DEAD(ct, PROCESS_FAILED);
ct_free_msg(request);
}
@@ -1239,3 +1276,19 @@ void intel_guc_ct_print_info(struct intel_guc_ct *ct,
drm_printf(p, "Tail: %u\n",
ct->ctbs.recv.desc->tail);
}
+
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+static void ct_dead_ct_worker_func(struct work_struct *w) /* Work item queued by CT_DEAD(): reports a dead CTB at most once per enable */
+{
+ struct intel_guc_ct *ct = container_of(w, struct intel_guc_ct, dead_ct_worker); /* recover the CT that embeds this work item */
+ struct intel_guc *guc = ct_to_guc(ct);
+
+ if (ct->dead_ct_reported) /* already reported; nothing more to do */
+ return;
+
+ ct->dead_ct_reported = true; /* also stops CT_DEAD() from queueing further work */
+
+ guc_info(guc, "CTB is dead - reason=0x%X\n", ct->dead_ct_reason); /* reason is a mask of (1 << CT_DEAD_*) bits */
+ intel_klog_error_capture(guc_to_gt(guc), (intel_engine_mask_t)~0U); /* NOTE(review): ~0U presumably selects every engine - confirm against intel_klog_error_capture() */
+}
+#endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
index f709a19c7e21..818415b64f4d 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
@@ -85,6 +85,12 @@ struct intel_guc_ct {
/** @stall_time: time of first time a CTB submission is stalled */
ktime_t stall_time;
+
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+ int dead_ct_reason;
+ bool dead_ct_reported;
+ struct work_struct dead_ct_worker;
+#endif
};
void intel_guc_ct_init_early(struct intel_guc_ct *ct);