diff options
Diffstat (limited to 'drivers/gpu/drm/i915/selftests/intel_hangcheck.c')
-rw-r--r-- | drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 417 |
1 files changed, 304 insertions, 113 deletions
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c index 40efbed611de..7b6f3bea9ef8 100644 --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c @@ -103,52 +103,87 @@ static u64 hws_address(const struct i915_vma *hws, return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); } -static int emit_recurse_batch(struct hang *h, - struct i915_request *rq) +static int move_to_active(struct i915_vma *vma, + struct i915_request *rq, + unsigned int flags) +{ + int err; + + err = i915_vma_move_to_active(vma, rq, flags); + if (err) + return err; + + if (!i915_gem_object_has_active_reference(vma->obj)) { + i915_gem_object_get(vma->obj); + i915_gem_object_set_active_reference(vma->obj); + } + + return 0; +} + +static struct i915_request * +hang_create_request(struct hang *h, struct intel_engine_cs *engine) { struct drm_i915_private *i915 = h->i915; struct i915_address_space *vm = - rq->gem_context->ppgtt ? - &rq->gem_context->ppgtt->vm : - &i915->ggtt.vm; + h->ctx->ppgtt ? &h->ctx->ppgtt->vm : &i915->ggtt.vm; + struct i915_request *rq = NULL; struct i915_vma *hws, *vma; unsigned int flags; u32 *batch; int err; + if (i915_gem_object_is_active(h->obj)) { + struct drm_i915_gem_object *obj; + void *vaddr; + + obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + vaddr = i915_gem_object_pin_map(obj, + i915_coherent_map_type(h->i915)); + if (IS_ERR(vaddr)) { + i915_gem_object_put(obj); + return ERR_CAST(vaddr); + } + + i915_gem_object_unpin_map(h->obj); + i915_gem_object_put(h->obj); + + h->obj = obj; + h->batch = vaddr; + } + vma = i915_vma_instance(h->obj, vm, NULL); if (IS_ERR(vma)) - return PTR_ERR(vma); + return ERR_CAST(vma); hws = i915_vma_instance(h->hws, vm, NULL); if (IS_ERR(hws)) - return PTR_ERR(hws); + return ERR_CAST(hws); err = i915_vma_pin(vma, 0, 0, PIN_USER); if (err) - return err; + return ERR_PTR(err); err = i915_vma_pin(hws, 0, 0, PIN_USER); if (err) goto unpin_vma; - err = i915_vma_move_to_active(vma, rq, 0); - if (err) + rq = i915_request_alloc(engine, h->ctx); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); goto unpin_hws; - - if (!i915_gem_object_has_active_reference(vma->obj)) { - i915_gem_object_get(vma->obj); - i915_gem_object_set_active_reference(vma->obj); } - err = i915_vma_move_to_active(hws, rq, 0); + err = move_to_active(vma, rq, 0); if (err) - goto unpin_hws; + goto cancel_rq; - if (!i915_gem_object_has_active_reference(hws->obj)) { - i915_gem_object_get(hws->obj); - i915_gem_object_set_active_reference(hws->obj); - } + err = move_to_active(hws, rq, 0); + if (err) + goto cancel_rq; batch = h->batch; if (INTEL_GEN(i915) >= 8) { @@ -213,52 +248,16 @@ static int emit_recurse_batch(struct hang *h, err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); +cancel_rq: + if (err) { + i915_request_skip(rq, err); + i915_request_add(rq); + } unpin_hws: i915_vma_unpin(hws); unpin_vma: i915_vma_unpin(vma); - return err; -} - -static struct i915_request * -hang_create_request(struct hang *h, struct intel_engine_cs *engine) -{ - struct i915_request *rq; - int err; - - if (i915_gem_object_is_active(h->obj)) { - struct drm_i915_gem_object *obj; - void *vaddr; - - obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE); - if (IS_ERR(obj)) - return ERR_CAST(obj); - - vaddr = i915_gem_object_pin_map(obj, - i915_coherent_map_type(h->i915)); - if (IS_ERR(vaddr)) { - i915_gem_object_put(obj); - return ERR_CAST(vaddr); - } - - i915_gem_object_unpin_map(h->obj); - i915_gem_object_put(h->obj); - - h->obj = obj; - h->batch = vaddr; - } - - rq = i915_request_alloc(engine, h->ctx); - if (IS_ERR(rq)) - return rq; - - err = emit_recurse_batch(h, rq); - if (err) { - i915_request_add(rq); - return ERR_PTR(err); - } - - return rq; + return err ? ERR_PTR(err) : rq; } static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) @@ -364,9 +363,7 @@ static int igt_global_reset(void *arg) /* Check that we can issue a global GPU reset */ igt_global_reset_lock(i915); - set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags); - mutex_lock(&i915->drm.struct_mutex); reset_count = i915_reset_count(&i915->gpu_error); i915_reset(i915, ALL_ENGINES, NULL); @@ -375,9 +372,7 @@ static int igt_global_reset(void *arg) pr_err("No GPU reset recorded!\n"); err = -EINVAL; } - mutex_unlock(&i915->drm.struct_mutex); - GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags)); igt_global_reset_unlock(i915); if (i915_terminally_wedged(&i915->gpu_error)) @@ -386,6 +381,29 @@ static int igt_global_reset(void *arg) return err; } +static int igt_wedged_reset(void *arg) +{ + struct drm_i915_private *i915 = arg; + intel_wakeref_t wakeref; + + /* Check that we can recover a wedged device with a GPU reset */ + + igt_global_reset_lock(i915); + wakeref = intel_runtime_pm_get(i915); + + i915_gem_set_wedged(i915); + + mutex_lock(&i915->drm.struct_mutex); + GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error)); + i915_reset(i915, ALL_ENGINES, NULL); + mutex_unlock(&i915->drm.struct_mutex); + + intel_runtime_pm_put(i915, wakeref); + igt_global_reset_unlock(i915); + + return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0; +} + static bool wait_for_idle(struct intel_engine_cs *engine) { return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; @@ -431,8 +449,6 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); do { - u32 seqno = intel_engine_get_seqno(engine); - if (active) { struct i915_request *rq; @@ -451,7 +467,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) if (!wait_until_running(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("%s: Failed to start request %x, at %x\n", + pr_err("%s: Failed to start request %llx, at %x\n", __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(engine, &p, "%s\n", engine->name); @@ -461,8 +477,6 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) break; } - GEM_BUG_ON(!rq->global_seqno); - seqno = rq->global_seqno - 1; i915_request_put(rq); } @@ -478,16 +492,15 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) break; } - reset_engine_count += active; if (i915_reset_engine_count(&i915->gpu_error, engine) != - reset_engine_count) { - pr_err("%s engine reset %srecorded!\n", - engine->name, active ? "not " : ""); + ++reset_engine_count) { + pr_err("%s engine reset not recorded!\n", + engine->name); err = -EINVAL; break; } - if (!wait_for_idle(engine)) { + if (!i915_reset_flush(i915)) { struct drm_printer p = drm_info_printer(i915->drm.dev); @@ -552,7 +565,7 @@ static int active_request_put(struct i915_request *rq) return 0; if (i915_request_wait(rq, 0, 5 * HZ) < 0) { - GEM_TRACE("%s timed out waiting for completion of fence %llx:%d, seqno %d.\n", + GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld, seqno %d.\n", rq->engine->name, rq->fence.context, rq->fence.seqno, @@ -710,7 +723,6 @@ static int __igt_reset_engines(struct drm_i915_private *i915, set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); do { - u32 seqno = intel_engine_get_seqno(engine); struct i915_request *rq = NULL; if (flags & TEST_ACTIVE) { @@ -729,7 +741,7 @@ static int __igt_reset_engines(struct drm_i915_private *i915, if (!wait_until_running(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("%s: Failed to start request %x, at %x\n", + pr_err("%s: Failed to start request %llx, at %x\n", __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(engine, &p, "%s\n", engine->name); @@ -738,9 +750,6 @@ static int __igt_reset_engines(struct drm_i915_private *i915, err = -EIO; break; } - - GEM_BUG_ON(!rq->global_seqno); - seqno = rq->global_seqno - 1; } err = i915_reset_engine(engine, NULL); @@ -777,10 +786,9 @@ static int __igt_reset_engines(struct drm_i915_private *i915, reported = i915_reset_engine_count(&i915->gpu_error, engine); reported -= threads[engine->id].resets; - if (reported != (flags & TEST_ACTIVE ? count : 0)) { - pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n", - engine->name, test_name, count, reported, - (flags & TEST_ACTIVE ? count : 0)); + if (reported != count) { + pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", + engine->name, test_name, count, reported); if (!err) err = -EINVAL; } @@ -879,20 +887,13 @@ static int igt_reset_engines(void *arg) return 0; } -static u32 fake_hangcheck(struct i915_request *rq, u32 mask) +static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask) { - struct i915_gpu_error *error = &rq->i915->gpu_error; - u32 reset_count = i915_reset_count(error); - - error->stalled_mask = mask; - - /* set_bit() must be after we have setup the backchannel (mask) */ - smp_mb__before_atomic(); - set_bit(I915_RESET_HANDOFF, &error->flags); + u32 count = i915_reset_count(&i915->gpu_error); - wake_up_all(&error->wait_queue); + i915_reset(i915, mask, NULL); - return reset_count; + return count; } static int igt_reset_wait(void *arg) @@ -928,7 +929,7 @@ static int igt_reset_wait(void *arg) if (!wait_until_running(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("%s: Failed to start request %x, at %x\n", + pr_err("%s: Failed to start request %llx, at %x\n", __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); @@ -938,7 +939,7 @@ static int igt_reset_wait(void *arg) goto out_rq; } - reset_count = fake_hangcheck(rq, ALL_ENGINES); + reset_count = fake_hangcheck(i915, ALL_ENGINES); timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10); if (timeout < 0) { @@ -948,7 +949,6 @@ static int igt_reset_wait(void *arg) goto out_rq; } - GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags)); if (i915_reset_count(&i915->gpu_error) == reset_count) { pr_err("No GPU reset recorded!\n"); err = -EINVAL; @@ -1107,7 +1107,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915, if (!wait_until_running(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("%s: Failed to start request %x, at %x\n", + pr_err("%s: Failed to start request %llx, at %x\n", __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); @@ -1127,7 +1127,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915, wait_for_completion(&arg.completion); - if (wait_for(waitqueue_active(&rq->execute), 10)) { + if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { struct drm_printer p = drm_info_printer(i915->drm.dev); pr_err("igt/evict_vma kthread did not wait\n"); @@ -1138,7 +1138,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915, } out_reset: - fake_hangcheck(rq, intel_engine_flag(rq->engine)); + fake_hangcheck(rq->i915, intel_engine_flag(rq->engine)); if (tsk) { struct igt_wedge_me w; @@ -1302,7 +1302,7 @@ static int igt_reset_queue(void *arg) if (!wait_until_running(&h, prev)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("%s(%s): Failed to start request %x, at %x\n", + pr_err("%s(%s): Failed to start request %llx, at %x\n", __func__, engine->name, prev->fence.seqno, hws_seqno(&h, prev)); intel_engine_dump(engine, &p, @@ -1317,12 +1317,7 @@ static int igt_reset_queue(void *arg) goto fini; } - reset_count = fake_hangcheck(prev, ENGINE_MASK(id)); - - i915_reset(i915, ENGINE_MASK(id), NULL); - - GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, - &i915->gpu_error.flags)); + reset_count = fake_hangcheck(i915, ENGINE_MASK(id)); if (prev->fence.error != -EIO) { pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", @@ -1413,7 +1408,7 @@ static int igt_handle_error(void *arg) if (!wait_until_running(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("%s: Failed to start request %x, at %x\n", + pr_err("%s: Failed to start request %llx, at %x\n", __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); @@ -1449,10 +1444,203 @@ err_unlock: return err; } +static void __preempt_begin(void) +{ + preempt_disable(); +} + +static void __preempt_end(void) +{ + preempt_enable(); +} + +static void __softirq_begin(void) +{ + local_bh_disable(); +} + +static void __softirq_end(void) +{ + local_bh_enable(); +} + +static void __hardirq_begin(void) +{ + local_irq_disable(); +} + +static void __hardirq_end(void) +{ + local_irq_enable(); +} + +struct atomic_section { + const char *name; + void (*critical_section_begin)(void); + void (*critical_section_end)(void); +}; + +static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, + const struct atomic_section *p, + const char *mode) +{ + struct tasklet_struct * const t = &engine->execlists.tasklet; + int err; + + GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", + engine->name, mode, p->name); + + tasklet_disable_nosync(t); + p->critical_section_begin(); + + err = i915_reset_engine(engine, NULL); + + p->critical_section_end(); + tasklet_enable(t); + + if (err) + pr_err("i915_reset_engine(%s:%s) failed under %s\n", + engine->name, mode, p->name); + + return err; +} + +static int igt_atomic_reset_engine(struct intel_engine_cs *engine, + const struct atomic_section *p) +{ + struct drm_i915_private *i915 = engine->i915; + struct i915_request *rq; + struct hang h; + int err; + + err = __igt_atomic_reset_engine(engine, p, "idle"); + if (err) + return err; + + err = hang_init(&h, i915); + if (err) + return err; + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (wait_until_running(&h, rq)) { + err = __igt_atomic_reset_engine(engine, p, "active"); + } else { + pr_err("%s(%s): Failed to start request %llx, at %x\n", + __func__, engine->name, + rq->fence.seqno, hws_seqno(&h, rq)); + i915_gem_set_wedged(i915); + err = -EIO; + } + + if (err == 0) { + struct igt_wedge_me w; + + igt_wedge_on_timeout(&w, i915, HZ / 20 /* 50ms timeout*/) + i915_request_wait(rq, + I915_WAIT_LOCKED, + MAX_SCHEDULE_TIMEOUT); + if (i915_terminally_wedged(&i915->gpu_error)) + err = -EIO; + } + + i915_request_put(rq); +out: + hang_fini(&h); + return err; +} + +static void force_reset(struct drm_i915_private *i915) +{ + i915_gem_set_wedged(i915); + i915_reset(i915, 0, NULL); +} + +static int igt_atomic_reset(void *arg) +{ + static const struct atomic_section phases[] = { + { "preempt", __preempt_begin, __preempt_end }, + { "softirq", __softirq_begin, __softirq_end }, + { "hardirq", __hardirq_begin, __hardirq_end }, + { } + }; + struct drm_i915_private *i915 = arg; + intel_wakeref_t wakeref; + int err = 0; + + /* Check that the resets are usable from atomic context */ + + if (USES_GUC_SUBMISSION(i915)) + return 0; /* guc is dead; long live the guc */ + + igt_global_reset_lock(i915); + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(i915); + + /* Flush any requests before we get started and check basics */ + force_reset(i915); + if (i915_terminally_wedged(&i915->gpu_error)) + goto unlock; + + if (intel_has_gpu_reset(i915)) { + const typeof(*phases) *p; + + for (p = phases; p->name; p++) { + GEM_TRACE("intel_gpu_reset under %s\n", p->name); + + p->critical_section_begin(); + err = intel_gpu_reset(i915, ALL_ENGINES); + p->critical_section_end(); + + if (err) { + pr_err("intel_gpu_reset failed under %s\n", + p->name); + goto out; + } + } + + force_reset(i915); + } + + if (intel_has_reset_engine(i915)) { + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) { + const typeof(*phases) *p; + + for (p = phases; p->name; p++) { + err = igt_atomic_reset_engine(engine, p); + if (err) + goto out; + } + } + } + +out: + /* As we poke around the guts, do a full reset before continuing. */ + force_reset(i915); + +unlock: + intel_runtime_pm_put(i915, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + igt_global_reset_unlock(i915); + + return err; +} + int intel_hangcheck_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { SUBTEST(igt_global_reset), /* attempt to recover GPU first */ + SUBTEST(igt_wedged_reset), SUBTEST(igt_hang_sanitycheck), SUBTEST(igt_reset_idle_engine), SUBTEST(igt_reset_active_engine), @@ -1463,7 +1651,9 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915) SUBTEST(igt_reset_evict_ppgtt), SUBTEST(igt_reset_evict_fence), SUBTEST(igt_handle_error), + SUBTEST(igt_atomic_reset), }; + intel_wakeref_t wakeref; bool saved_hangcheck; int err; @@ -1473,8 +1663,9 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915) if (i915_terminally_wedged(&i915->gpu_error)) return -EIO; /* we're long past hope of a successful reset */ - intel_runtime_pm_get(i915); + wakeref = intel_runtime_pm_get(i915); saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck); + drain_delayed_work(&i915->gpu_error.hangcheck_work); /* flush param */ err = i915_subtests(tests, i915); @@ -1483,7 +1674,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915) mutex_unlock(&i915->drm.struct_mutex); i915_modparams.enable_hangcheck = saved_hangcheck; - intel_runtime_pm_put(i915); + intel_runtime_pm_put(i915, wakeref); return err; } |