summary | refs | log | tree | commit | diff
path: root/drivers/gpu/drm/xe/xe_guc_submit.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/xe/xe_guc_submit.c')
-rw-r--r-- drivers/gpu/drm/xe/xe_guc_submit.c | 67
1 files changed, 36 insertions, 31 deletions
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index e5a644b47e10..bbcd47737a59 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -23,6 +23,7 @@
#include "xe_force_wake.h"
#include "xe_gpu_scheduler.h"
#include "xe_gt.h"
+#include "xe_gt_printk.h"
#include "xe_guc.h"
#include "xe_guc_ct.h"
#include "xe_guc_exec_queue_types.h"
@@ -929,13 +930,15 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
int i = 0;
if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) {
- xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_KERNEL));
- xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)));
-
drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx",
xe_sched_job_seqno(job), q->guc->id, q->flags);
+ xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
+ "Kernel-submitted job timed out\n");
+ xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
+ "VM job timed out on non-killed execqueue\n");
+
simple_error_capture(q);
- xe_devcoredump(q);
+ xe_devcoredump(job);
} else {
drm_dbg(&xe->drm, "Timedout signaled job: seqno=%u, guc_id=%d, flags=0x%lx",
xe_sched_job_seqno(job), q->guc->id, q->flags);
@@ -1029,8 +1032,6 @@ static void __guc_exec_queue_fini_async(struct work_struct *w)
if (xe_exec_queue_is_lr(q))
cancel_work_sync(&ge->lr_tdr);
- if (q->flags & EXEC_QUEUE_FLAG_PERSISTENT)
- xe_device_remove_persistent_exec_queues(gt_to_xe(q->gt), q);
release_guc_id(guc, q);
xe_sched_entity_fini(&ge->entity);
xe_sched_fini(&ge->sched);
@@ -1219,7 +1220,7 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
init_waitqueue_head(&ge->suspend_wait);
timeout = (q->vm && xe_vm_in_lr_mode(q->vm)) ? MAX_SCHEDULE_TIMEOUT :
- q->hwe->eclass->sched_props.job_timeout_ms;
+ q->sched_props.job_timeout_ms;
err = xe_sched_init(&ge->sched, &drm_sched_ops, &xe_sched_ops,
get_submit_wq(guc),
q->lrc[0].ring.size / MAX_JOB_SIZE_BYTES, 64,
@@ -1351,21 +1352,6 @@ static int guc_exec_queue_set_preempt_timeout(struct xe_exec_queue *q,
return 0;
}
-static int guc_exec_queue_set_job_timeout(struct xe_exec_queue *q, u32 job_timeout_ms)
-{
- struct xe_gpu_scheduler *sched = &q->guc->sched;
- struct xe_guc *guc = exec_queue_to_guc(q);
- struct xe_device *xe = guc_to_xe(guc);
-
- xe_assert(xe, !exec_queue_registered(q));
- xe_assert(xe, !exec_queue_banned(q));
- xe_assert(xe, !exec_queue_killed(q));
-
- sched->base.timeout = job_timeout_ms;
-
- return 0;
-}
-
static int guc_exec_queue_suspend(struct xe_exec_queue *q)
{
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND;
@@ -1416,7 +1402,6 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
.set_priority = guc_exec_queue_set_priority,
.set_timeslice = guc_exec_queue_set_timeslice,
.set_preempt_timeout = guc_exec_queue_set_preempt_timeout,
- .set_job_timeout = guc_exec_queue_set_job_timeout,
.suspend = guc_exec_queue_suspend,
.suspend_wait = guc_exec_queue_suspend_wait,
.resume = guc_exec_queue_resume,
@@ -1797,7 +1782,7 @@ guc_exec_queue_wq_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snaps
/**
* xe_guc_exec_queue_snapshot_capture - Take a quick snapshot of the GuC Engine.
- * @q: Xe exec queue.
+ * @job: faulty Xe scheduled job.
*
* This can be printed out in a later stage like during dev_coredump
* analysis.
@@ -1806,12 +1791,12 @@ guc_exec_queue_wq_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snaps
* caller, using `xe_guc_exec_queue_snapshot_free`.
*/
struct xe_guc_submit_exec_queue_snapshot *
-xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q)
+xe_guc_exec_queue_snapshot_capture(struct xe_sched_job *job)
{
+ struct xe_exec_queue *q = job->q;
struct xe_guc *guc = exec_queue_to_guc(q);
struct xe_device *xe = guc_to_xe(guc);
struct xe_gpu_scheduler *sched = &q->guc->sched;
- struct xe_sched_job *job;
struct xe_guc_submit_exec_queue_snapshot *snapshot;
int i;
@@ -1869,14 +1854,16 @@ xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q)
if (!snapshot->pending_list) {
drm_err(&xe->drm, "Skipping GuC Engine pending_list snapshot.\n");
} else {
+ struct xe_sched_job *job_iter;
+
i = 0;
- list_for_each_entry(job, &sched->base.pending_list, drm.list) {
+ list_for_each_entry(job_iter, &sched->base.pending_list, drm.list) {
snapshot->pending_list[i].seqno =
- xe_sched_job_seqno(job);
+ xe_sched_job_seqno(job_iter);
snapshot->pending_list[i].fence =
- dma_fence_is_signaled(job->fence) ? 1 : 0;
+ dma_fence_is_signaled(job_iter->fence) ? 1 : 0;
snapshot->pending_list[i].finished =
- dma_fence_is_signaled(&job->drm.s_fence->finished)
+ dma_fence_is_signaled(&job_iter->drm.s_fence->finished)
? 1 : 0;
i++;
}
@@ -1962,10 +1949,28 @@ void xe_guc_exec_queue_snapshot_free(struct xe_guc_submit_exec_queue_snapshot *s
static void guc_exec_queue_print(struct xe_exec_queue *q, struct drm_printer *p)
{
struct xe_guc_submit_exec_queue_snapshot *snapshot;
+ struct xe_gpu_scheduler *sched = &q->guc->sched;
+ struct xe_sched_job *job;
+ bool found = false;
- snapshot = xe_guc_exec_queue_snapshot_capture(q);
+ spin_lock(&sched->base.job_list_lock);
+ list_for_each_entry(job, &sched->base.pending_list, drm.list) {
+ if (job->q == q) {
+ xe_sched_job_get(job);
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&sched->base.job_list_lock);
+
+ if (!found)
+ return;
+
+ snapshot = xe_guc_exec_queue_snapshot_capture(job);
xe_guc_exec_queue_snapshot_print(snapshot, p);
xe_guc_exec_queue_snapshot_free(snapshot);
+
+ xe_sched_job_put(job);
}
/**