// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include <linux/sort.h>

#include "gt/intel_gt.h"
#include "gt/intel_engine_user.h"

#include "i915_selftest.h"

#include "gem/i915_gem_context.h"
#include "selftests/igt_flush_test.h"
#include "selftests/i915_random.h"
#include "selftests/mock_drm.h"
#include "huge_gem_object.h"
#include "mock_context.h"

static int wrap_ktime_compare(const void *A, const void *B)
{
	const ktime_t *a = A, *b = B;

	return ktime_compare(*a, *b);
}

static int __perf_fill_blt(struct drm_i915_gem_object *obj)
{
	struct drm_i915_private *i915 = to_i915(obj->base.dev);
	int inst = 0;

	do {
		struct intel_engine_cs *engine;
		ktime_t t[5];
		int pass;
		int err = 0;

		engine = intel_engine_lookup_user(i915,
						  I915_ENGINE_CLASS_COPY,
						  inst++);
		if (!engine)
			return 0;

		intel_engine_pm_get(engine);
		for (pass = 0; pass < ARRAY_SIZE(t); pass++) {
			struct intel_context *ce = engine->kernel_context;
			ktime_t t0, t1;

			t0 = ktime_get();

			err = i915_gem_object_fill_blt(obj, ce, 0);
			if (err)
				break;

			err = i915_gem_object_wait(obj,
						   I915_WAIT_ALL,
						   MAX_SCHEDULE_TIMEOUT);
			if (err)
				break;

			t1 = ktime_get();
			t[pass] = ktime_sub(t1, t0);
		}
		intel_engine_pm_put(engine);
		if (err)
			return err;

		/*
		 * Report the bandwidth of the middle three of the five
		 * sorted passes, discarding the fastest and slowest:
		 * 4 * size bytes over t[1] + 2*t[2] + t[3] ns, scaled to
		 * seconds and shifted down by 20 bits to give MiB/s.
		 */
		sort(t, ARRAY_SIZE(t), sizeof(*t), wrap_ktime_compare, NULL);
		pr_info("%s: blt %zd KiB fill: %lld MiB/s\n",
			engine->name,
			obj->base.size >> 10,
			div64_u64(mul_u32_u32(4 * obj->base.size,
					      1000 * 1000 * 1000),
				  t[1] + 2 * t[2] + t[3]) >> 20);
	} while (1);
}

static int perf_fill_blt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static const unsigned long sizes[] = {
		SZ_4K,
		SZ_64K,
		SZ_2M,
		SZ_64M
	};
	int i;

	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		struct drm_i915_gem_object *obj;
		int err;

		obj = i915_gem_object_create_internal(i915, sizes[i]);
		if (IS_ERR(obj))
			return PTR_ERR(obj);

		err = __perf_fill_blt(obj);
		i915_gem_object_put(obj);
		if (err)
			return err;
	}

	return 0;
}

static int __perf_copy_blt(struct drm_i915_gem_object *src,
			   struct drm_i915_gem_object *dst)
{
	struct drm_i915_private *i915 = to_i915(src->base.dev);
	int inst = 0;

	do {
		struct intel_engine_cs *engine;
		ktime_t t[5];
		int pass;
		int err = 0;

		engine = intel_engine_lookup_user(i915,
						  I915_ENGINE_CLASS_COPY,
						  inst++);
		if (!engine)
			return 0;

		intel_engine_pm_get(engine);
		for (pass = 0; pass < ARRAY_SIZE(t); pass++) {
			struct intel_context *ce = engine->kernel_context;
			ktime_t t0, t1;

			t0 = ktime_get();

			err = i915_gem_object_copy_blt(src, dst, ce);
			if (err)
				break;

			err = i915_gem_object_wait(dst,
						   I915_WAIT_ALL,
						   MAX_SCHEDULE_TIMEOUT);
			if (err)
				break;

			t1 = ktime_get();
			t[pass] = ktime_sub(t1, t0);
		}
		intel_engine_pm_put(engine);
		if (err)
			return err;

		/* Same trimmed-mean bandwidth estimate as for the fill. */
		sort(t, ARRAY_SIZE(t), sizeof(*t), wrap_ktime_compare, NULL);
		pr_info("%s: blt %zd KiB copy: %lld MiB/s\n",
			engine->name,
			src->base.size >> 10,
			div64_u64(mul_u32_u32(4 * src->base.size,
					      1000 * 1000 * 1000),
				  t[1] + 2 * t[2] + t[3]) >> 20);
	} while (1);
}

static int perf_copy_blt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static const unsigned long sizes[] = {
		SZ_4K,
		SZ_64K,
		SZ_2M,
		SZ_64M
	};
	int i;

	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		struct drm_i915_gem_object *src, *dst;
		int err;

		src = i915_gem_object_create_internal(i915, sizes[i]);
		if (IS_ERR(src))
			return PTR_ERR(src);

		dst = i915_gem_object_create_internal(i915, sizes[i]);
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			goto err_src;
		}

		err = __perf_copy_blt(src, dst);

		i915_gem_object_put(dst);
err_src:
		i915_gem_object_put(src);
		if (err)
			return err;
	}

	return 0;
}
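/*
 * Below are the live stress tests: each spawns one kthread per online CPU
 * (plus one, to oversubscribe the machine), with every thread hammering
 * the fill/copy blits using randomly sized huge_gem_objects and verifying
 * the result against the CPU view of the pages. With SINGLE_CTX all
 * threads share a single context; otherwise each thread creates its own
 * context at a random priority.
 */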
struct igt_thread_arg {
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	struct file *file;
	struct rnd_state prng;
	unsigned int n_cpus;
};

static int igt_fill_blt_thread(void *arg)
{
	struct igt_thread_arg *thread = arg;
	struct intel_engine_cs *engine = thread->engine;
	struct rnd_state *prng = &thread->prng;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	struct intel_context *ce;
	unsigned int prio;
	IGT_TIMEOUT(end);
	u64 total, max;
	int err;

	ctx = thread->ctx;
	if (!ctx) {
		ctx = live_context_for_engine(engine, thread->file);
		if (IS_ERR(ctx))
			return PTR_ERR(ctx);

		prio = i915_prandom_u32_max_state(I915_PRIORITY_MAX, prng);
		ctx->sched.priority = I915_USER_PRIORITY(prio);
	}

	ce = i915_gem_context_get_engine(ctx, 0);
	GEM_BUG_ON(IS_ERR(ce));

	/*
	 * If we have a tiny shared address space, like for the GGTT
	 * then we can't be too greedy.
	 */
	max = ce->vm->total;
	if (i915_is_ggtt(ce->vm) || thread->ctx)
		max = div_u64(max, thread->n_cpus);
	max >>= 4;

	total = PAGE_SIZE;
	do {
		/* Aim to keep the runtime under reasonable bounds! */
		const u32 max_phys_size = SZ_64K;
		u32 val = prandom_u32_state(prng);
		u32 phys_sz;
		u32 sz;
		u32 *vaddr;
		u32 i;

		total = min(total, max);
		sz = i915_prandom_u32_max_state(total, prng) + 1;
		phys_sz = sz % max_phys_size + 1;

		sz = round_up(sz, PAGE_SIZE);
		phys_sz = round_up(phys_sz, PAGE_SIZE);
		phys_sz = min(phys_sz, sz);

		pr_debug("%s with phys_sz=%x, sz=%x, val=%x\n", __func__,
			 phys_sz, sz, val);

		obj = huge_gem_object(engine->i915, phys_sz, sz);
		if (IS_ERR(obj)) {
			err = PTR_ERR(obj);
			goto err_flush;
		}

		vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
		if (IS_ERR(vaddr)) {
			err = PTR_ERR(vaddr);
			goto err_put;
		}

		/*
		 * Make sure the potentially async clflush does its job, if
		 * required.
		 */
		memset32(vaddr, val ^ 0xdeadbeaf,
			 huge_gem_object_phys_size(obj) / sizeof(u32));

		if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
			obj->cache_dirty = true;

		err = i915_gem_object_fill_blt(obj, ce, val);
		if (err)
			goto err_unpin;

		err = i915_gem_object_wait(obj, 0, MAX_SCHEDULE_TIMEOUT);
		if (err)
			goto err_unpin;

		/* Sample every 17th dword to keep the CPU readback cheap. */
		for (i = 0; i < huge_gem_object_phys_size(obj) / sizeof(u32); i += 17) {
			if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
				drm_clflush_virt_range(&vaddr[i], sizeof(vaddr[i]));

			if (vaddr[i] != val) {
				pr_err("vaddr[%u]=%x, expected=%x\n", i,
				       vaddr[i], val);
				err = -EINVAL;
				goto err_unpin;
			}
		}

		i915_gem_object_unpin_map(obj);
		i915_gem_object_put(obj);

		total <<= 1;
	} while (!time_after(jiffies, end));

	goto err_flush;

err_unpin:
	i915_gem_object_unpin_map(obj);
err_put:
	i915_gem_object_put(obj);
err_flush:
	if (err == -ENOMEM)
		err = 0;

	intel_context_put(ce);
	return err;
}
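/*
 * The copy variant mirrors igt_fill_blt_thread: fill the source with a
 * known pattern on the CPU, scribble over the destination, blit src to
 * dst on the GPU, then spot-check the destination against the pattern.
 */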
static int igt_copy_blt_thread(void *arg)
{
	struct igt_thread_arg *thread = arg;
	struct intel_engine_cs *engine = thread->engine;
	struct rnd_state *prng = &thread->prng;
	struct drm_i915_gem_object *src, *dst;
	struct i915_gem_context *ctx;
	struct intel_context *ce;
	unsigned int prio;
	IGT_TIMEOUT(end);
	u64 total, max;
	int err;

	ctx = thread->ctx;
	if (!ctx) {
		ctx = live_context_for_engine(engine, thread->file);
		if (IS_ERR(ctx))
			return PTR_ERR(ctx);

		prio = i915_prandom_u32_max_state(I915_PRIORITY_MAX, prng);
		ctx->sched.priority = I915_USER_PRIORITY(prio);
	}

	ce = i915_gem_context_get_engine(ctx, 0);
	GEM_BUG_ON(IS_ERR(ce));

	/*
	 * If we have a tiny shared address space, like for the GGTT
	 * then we can't be too greedy.
	 */
	max = ce->vm->total;
	if (i915_is_ggtt(ce->vm) || thread->ctx)
		max = div_u64(max, thread->n_cpus);
	max >>= 4;

	total = PAGE_SIZE;
	do {
		/* Aim to keep the runtime under reasonable bounds! */
		const u32 max_phys_size = SZ_64K;
		u32 val = prandom_u32_state(prng);
		u32 phys_sz;
		u32 sz;
		u32 *vaddr;
		u32 i;

		total = min(total, max);
		sz = i915_prandom_u32_max_state(total, prng) + 1;
		phys_sz = sz % max_phys_size + 1;

		sz = round_up(sz, PAGE_SIZE);
		phys_sz = round_up(phys_sz, PAGE_SIZE);
		phys_sz = min(phys_sz, sz);

		pr_debug("%s with phys_sz=%x, sz=%x, val=%x\n", __func__,
			 phys_sz, sz, val);

		src = huge_gem_object(engine->i915, phys_sz, sz);
		if (IS_ERR(src)) {
			err = PTR_ERR(src);
			goto err_flush;
		}

		vaddr = i915_gem_object_pin_map(src, I915_MAP_WB);
		if (IS_ERR(vaddr)) {
			err = PTR_ERR(vaddr);
			goto err_put_src;
		}

		memset32(vaddr, val,
			 huge_gem_object_phys_size(src) / sizeof(u32));

		i915_gem_object_unpin_map(src);

		if (!(src->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
			src->cache_dirty = true;

		dst = huge_gem_object(engine->i915, phys_sz, sz);
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			goto err_put_src;
		}

		vaddr = i915_gem_object_pin_map(dst, I915_MAP_WB);
		if (IS_ERR(vaddr)) {
			err = PTR_ERR(vaddr);
			goto err_put_dst;
		}

		memset32(vaddr, val ^ 0xdeadbeaf,
			 huge_gem_object_phys_size(dst) / sizeof(u32));

		if (!(dst->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
			dst->cache_dirty = true;

		err = i915_gem_object_copy_blt(src, dst, ce);
		if (err)
			goto err_unpin;

		err = i915_gem_object_wait(dst, 0, MAX_SCHEDULE_TIMEOUT);
		if (err)
			goto err_unpin;

		/* Sample every 17th dword to keep the CPU readback cheap. */
		for (i = 0; i < huge_gem_object_phys_size(dst) / sizeof(u32); i += 17) {
			if (!(dst->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
				drm_clflush_virt_range(&vaddr[i], sizeof(vaddr[i]));

			if (vaddr[i] != val) {
				pr_err("vaddr[%u]=%x, expected=%x\n", i,
				       vaddr[i], val);
				err = -EINVAL;
				goto err_unpin;
			}
		}

		i915_gem_object_unpin_map(dst);

		i915_gem_object_put(src);
		i915_gem_object_put(dst);

		total <<= 1;
	} while (!time_after(jiffies, end));

	goto err_flush;

err_unpin:
	i915_gem_object_unpin_map(dst);
err_put_dst:
	i915_gem_object_put(dst);
err_put_src:
	i915_gem_object_put(src);
err_flush:
	if (err == -ENOMEM)
		err = 0;

	intel_context_put(ce);
	return err;
}

static int igt_threaded_blt(struct intel_engine_cs *engine,
			    int (*blt_fn)(void *arg),
			    unsigned int flags)
#define SINGLE_CTX BIT(0)
{
	struct igt_thread_arg *thread;
	struct task_struct **tsk;
	unsigned int n_cpus, i;
	I915_RND_STATE(prng);
	int err = 0;

	/* Oversubscribe with one more thread than online CPUs. */
	n_cpus = num_online_cpus() + 1;

	tsk = kcalloc(n_cpus, sizeof(struct task_struct *), GFP_KERNEL);
	if (!tsk)
		return 0; /* Treat allocation failure as a benign skip. */

	thread = kcalloc(n_cpus, sizeof(struct igt_thread_arg), GFP_KERNEL);
	if (!thread)
		goto out_tsk;

	thread[0].file = mock_file(engine->i915);
	if (IS_ERR(thread[0].file)) {
		err = PTR_ERR(thread[0].file);
		goto out_thread;
	}

	if (flags & SINGLE_CTX) {
		thread[0].ctx = live_context_for_engine(engine, thread[0].file);
		if (IS_ERR(thread[0].ctx)) {
			err = PTR_ERR(thread[0].ctx);
			goto out_file;
		}
	}

	for (i = 0; i < n_cpus; ++i) {
		thread[i].engine = engine;
		thread[i].file = thread[0].file;
		thread[i].ctx = thread[0].ctx;
		thread[i].n_cpus = n_cpus;
		thread[i].prng =
			I915_RND_STATE_INITIALIZER(prandom_u32_state(&prng));

		tsk[i] = kthread_run(blt_fn, &thread[i], "igt/blt-%d", i);
		if (IS_ERR(tsk[i])) {
			err = PTR_ERR(tsk[i]);
			break;
		}

		get_task_struct(tsk[i]);
	}

	yield(); /* start all threads before we kthread_stop() */

	for (i = 0; i < n_cpus; ++i) {
		int status;

		if (IS_ERR_OR_NULL(tsk[i]))
			continue;

		status = kthread_stop(tsk[i]);
		if (status && !err)
			err = status;

		put_task_struct(tsk[i]);
	}

out_file:
	fput(thread[0].file);
out_thread:
	kfree(thread);
out_tsk:
	kfree(tsk);
	return err;
}
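/* Run the given threaded stress test on every user-visible copy engine. */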
static int test_copy_engines(struct drm_i915_private *i915,
			     int (*fn)(void *arg),
			     unsigned int flags)
{
	struct intel_engine_cs *engine;
	int ret;

	for_each_uabi_class_engine(engine, I915_ENGINE_CLASS_COPY, i915) {
		ret = igt_threaded_blt(engine, fn, flags);
		if (ret)
			return ret;
	}

	return 0;
}

static int igt_fill_blt(void *arg)
{
	return test_copy_engines(arg, igt_fill_blt_thread, 0);
}

static int igt_fill_blt_ctx0(void *arg)
{
	return test_copy_engines(arg, igt_fill_blt_thread, SINGLE_CTX);
}

static int igt_copy_blt(void *arg)
{
	return test_copy_engines(arg, igt_copy_blt_thread, 0);
}

static int igt_copy_blt_ctx0(void *arg)
{
	return test_copy_engines(arg, igt_copy_blt_thread, SINGLE_CTX);
}

int i915_gem_object_blt_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_fill_blt),
		SUBTEST(igt_fill_blt_ctx0),
		SUBTEST(igt_copy_blt),
		SUBTEST(igt_copy_blt_ctx0),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return i915_live_subtests(tests, i915);
}

int i915_gem_object_blt_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_fill_blt),
		SUBTEST(perf_copy_blt),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return i915_live_subtests(tests, i915);
}
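/*
 * Note: the two entry points above are picked up by the i915 selftest
 * harness on a CONFIG_DRM_I915_SELFTEST build; the suites are typically
 * run at module load via the i915.live_selftests=-1 and
 * i915.perf_selftests=-1 module parameters (or through IGT's selftest
 * wrappers).
 */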