From 0ea92ace8b95f67224ee26c4955efc7104d8e8e1 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 14 Oct 2021 10:19:41 -0700 Subject: drm/i915/guc: Move GuC guc_id allocation under submission state sub-struct Move guc_id allocation under the submission state sub-struct, as a future patch will reuse its spin lock as a global submission state lock. Moving this into the sub-struct makes ownership of the fields / lock clear. v2: (Docs) - Add comment for submission_state sub-structure v3: (John Harrison) - Fixup a few comments v4: (John Harrison) - Fix typo Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20211014172005.27155-2-matthew.brost@intel.com --- drivers/gpu/drm/i915/gt/intel_context_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/i915/gt/intel_context_types.h') diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 12252c411159..e7e3984aab78 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -197,18 +197,18 @@ struct intel_context { struct { /** * @id: handle which is used to uniquely identify this context - * with the GuC, protected by guc->contexts_lock + * with the GuC, protected by guc->submission_state.lock */ u16 id; /** * @ref: the number of references to the guc_id, when * transitioning in and out of zero protected by - * guc->contexts_lock + * guc->submission_state.lock */ atomic_t ref; /** * @link: in guc->guc_id_list when the guc_id has no refs but is - * still valid, protected by guc->contexts_lock + * still valid, protected by guc->submission_state.lock */ struct list_head link; } guc_id; -- cgit v1.2.3 From 1a52faed3131147c10bb7f908d0f7a29b94f59ae Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 14 Oct 2021 10:19:42 -0700 Subject: drm/i915/guc: Take GT PM ref when deregistering context Take a GT PM reference to prevent intel_gt_wait_for_idle from short-circuiting while a context deregister H2G is in flight. To do this we must issue the deregister H2G from a worker, as a context can be destroyed from an atomic context where taking a GT PM reference is not allowed. Previously we took a runtime PM reference from this atomic context, which worked, but will stop working once runtime PM autosuspend is enabled. So this patch is twofold: stop intel_gt_wait_for_idle from short-circuiting and fix runtime PM autosuspend.
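The shape of the fix is the usual deferred-teardown pattern: the release path, which may run in atomic context, only moves the context onto a list under a spinlock and queues a worker; the worker, running in process context, takes the sleeping GT PM reference and issues the deregister H2G (see destroyed_worker_func() and with_intel_gt_pm() in the diff below). A minimal, self-contained sketch of that pattern - struct teardown_state, object_destroy() and the field names are illustrative stand-ins, not the driver's actual structures - could look like:

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct teardown_state {
	spinlock_t lock;                      /* protects @destroyed */
	struct list_head destroyed;           /* entries awaiting deregistration */
	struct work_struct destroyed_worker;  /* assumed INIT_WORK()ed with destroyed_worker_func() */
};

/* Callable from atomic context: only a spinlock, list ops and queue_work(). */
static void object_destroy(struct teardown_state *st, struct list_head *link)
{
	unsigned long flags;

	spin_lock_irqsave(&st->lock, flags);
	list_add_tail(link, &st->destroyed);
	spin_unlock_irqrestore(&st->lock, flags);

	queue_work(system_unbound_wq, &st->destroyed_worker);
}

/* Runs in process context, so a sleeping GT PM reference may be taken here. */
static void destroyed_worker_func(struct work_struct *w)
{
	struct teardown_state *st =
		container_of(w, struct teardown_state, destroyed_worker);
	struct list_head *pos, *n;
	unsigned long flags;

	/* the real driver wraps this loop in with_intel_gt_pm(gt, tmp) */
	spin_lock_irqsave(&st->lock, flags);
	list_for_each_safe(pos, n, &st->destroyed) {
		list_del_init(pos);
		/* hand the entry to the deregistration path (H2G) here */
	}
	spin_unlock_irqrestore(&st->lock, flags);
}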
v2: (John Harrison) - Split structure changes out in different patch (Tvrtko) - Don't drop lock in deregister_destroyed_contexts v3: (John Harrison) - Flush destroyed contexts before destroying context reg pool Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20211014172005.27155-3-matthew.brost@intel.com --- drivers/gpu/drm/i915/gt/intel_context.c | 2 + drivers/gpu/drm/i915/gt/intel_context_types.h | 7 ++ drivers/gpu/drm/i915/gt/intel_engine_pm.h | 5 + drivers/gpu/drm/i915/gt/intel_gt_pm.h | 4 + drivers/gpu/drm/i915/gt/uc/intel_guc.h | 11 ++ drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 146 ++++++++++++++-------- 6 files changed, 121 insertions(+), 54 deletions(-) (limited to 'drivers/gpu/drm/i915/gt/intel_context_types.h') diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index e1bba4cfc0ff..0f25d07e55e5 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -399,6 +399,8 @@ intel_context_init(struct intel_context *ce, struct intel_engine_cs *engine) ce->guc_id.id = GUC_INVALID_LRC_ID; INIT_LIST_HEAD(&ce->guc_id.link); + INIT_LIST_HEAD(&ce->destroyed_link); + /* * Initialize fence to be complete as this is expected to be complete * unless there is a pending schedule disable outstanding. diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index e7e3984aab78..4613d027cbc3 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -213,6 +213,13 @@ struct intel_context { struct list_head link; } guc_id; + /** + * @destroyed_link: link in guc->submission_state.destroyed_contexts, in + * list when context is pending to be destroyed (deregistered with the + * GuC), protected by guc->submission_state.lock + */ + struct list_head destroyed_link; + #ifdef CONFIG_DRM_I915_SELFTEST /** * @drop_schedule_enable: Force drop of schedule enable G2H for selftest diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.h b/drivers/gpu/drm/i915/gt/intel_engine_pm.h index 8520c595f5e1..6fdeae668e6e 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.h @@ -16,6 +16,11 @@ intel_engine_pm_is_awake(const struct intel_engine_cs *engine) return intel_wakeref_is_active(&engine->wakeref); } +static inline void __intel_engine_pm_get(struct intel_engine_cs *engine) +{ + __intel_wakeref_get(&engine->wakeref); +} + static inline void intel_engine_pm_get(struct intel_engine_cs *engine) { intel_wakeref_get(&engine->wakeref); diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h index d0588d8aaa44..05de6c1af25b 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h @@ -41,6 +41,10 @@ static inline void intel_gt_pm_put_async(struct intel_gt *gt) { intel_wakeref_put_async(&gt->wakeref); } +#define with_intel_gt_pm(gt, tmp) \ + for (tmp = 1, intel_gt_pm_get(gt); tmp; \ + intel_gt_pm_put(gt), tmp = 0) + static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt) { return intel_wakeref_wait_for_idle(&gt->wakeref); } diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h index 82e248c2290c..74f071a0b6d5 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h @@ -90,6 +90,17 @@ struct intel_guc { * refs */ struct list_head guc_id_list; + /** +
* @destroyed_contexts: list of contexts waiting to be destroyed + * (deregistered with the GuC) + */ + struct list_head destroyed_contexts; + /** + * @destroyed_worker: worker to deregister contexts, need as we + * need to take a GT PM reference and can't from destroy + * function as it might be in an atomic context (no sleeping) + */ + struct work_struct destroyed_worker; } submission_state; /** diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 48bdcdb823b3..5c53428032b5 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -90,8 +90,8 @@ * used for all of GuC submission but that could change in the future. * * guc->submission_state.lock - * Protects guc_id allocation for the given GuC, i.e. only one context can be - * doing guc_id allocation operations at a time for each GuC in the system. + * Global lock for GuC submission state. Protects guc_ids and destroyed contexts + * list. * * ce->guc_state.lock * Protects everything under ce->guc_state. Ensures that a context is in the @@ -719,6 +719,7 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc) if (deregister) guc_signal_context_fence(ce); if (destroyed) { + intel_gt_pm_put_async(guc_to_gt(guc)); release_guc_id(guc, ce); __guc_context_destroy(ce); } @@ -797,6 +798,8 @@ static void guc_flush_submissions(struct intel_guc *guc) spin_unlock_irqrestore(&sched_engine->lock, flags); } +static void guc_flush_destroyed_contexts(struct intel_guc *guc); + void intel_guc_submission_reset_prepare(struct intel_guc *guc) { int i; @@ -815,6 +818,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc) spin_unlock_irq(&guc_to_gt(guc)->irq_lock); guc_flush_submissions(guc); + guc_flush_destroyed_contexts(guc); /* * Handle any outstanding G2Hs before reset. Call IRQ handler directly @@ -1126,6 +1130,8 @@ void intel_guc_submission_reset_finish(struct intel_guc *guc) intel_gt_unpark_heartbeats(guc_to_gt(guc)); } +static void destroyed_worker_func(struct work_struct *w); + /* * Set up the memory resources to be shared with the GuC (via the GGTT) * at firmware loading time. 
@@ -1151,6 +1157,9 @@ int intel_guc_submission_init(struct intel_guc *guc) spin_lock_init(&guc->submission_state.lock); INIT_LIST_HEAD(&guc->submission_state.guc_id_list); ida_init(&guc->submission_state.guc_ids); + INIT_LIST_HEAD(&guc->submission_state.destroyed_contexts); + INIT_WORK(&guc->submission_state.destroyed_worker, + destroyed_worker_func); return 0; } @@ -1160,6 +1169,7 @@ void intel_guc_submission_fini(struct intel_guc *guc) if (!guc->lrc_desc_pool) return; + guc_flush_destroyed_contexts(guc); guc_lrc_desc_pool_destroy(guc); i915_sched_engine_put(guc->sched_engine); } @@ -1859,11 +1869,30 @@ unpin: static inline void guc_lrc_desc_unpin(struct intel_context *ce) { struct intel_guc *guc = ce_to_guc(ce); + struct intel_gt *gt = guc_to_gt(guc); + unsigned long flags; + bool disabled; + GEM_BUG_ON(!intel_gt_pm_is_awake(gt)); GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id)); GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id)); GEM_BUG_ON(context_enabled(ce)); + /* Seal race with Reset */ + spin_lock_irqsave(&ce->guc_state.lock, flags); + disabled = submission_disabled(guc); + if (likely(!disabled)) { + __intel_gt_pm_get(gt); + set_context_destroyed(ce); + clr_context_registered(ce); + } + spin_unlock_irqrestore(&ce->guc_state.lock, flags); + if (unlikely(disabled)) { + release_guc_id(guc, ce); + __guc_context_destroy(ce); + return; + } + deregister_context(ce, ce->guc_id.id); } @@ -1891,78 +1920,86 @@ static void __guc_context_destroy(struct intel_context *ce) } } +static void guc_flush_destroyed_contexts(struct intel_guc *guc) +{ + struct intel_context *ce, *cn; + unsigned long flags; + + GEM_BUG_ON(!submission_disabled(guc) && + guc_submission_initialized(guc)); + + spin_lock_irqsave(&guc->submission_state.lock, flags); + list_for_each_entry_safe(ce, cn, + &guc->submission_state.destroyed_contexts, + destroyed_link) { + list_del_init(&ce->destroyed_link); + __release_guc_id(guc, ce); + __guc_context_destroy(ce); + } + spin_unlock_irqrestore(&guc->submission_state.lock, flags); +} + +static void deregister_destroyed_contexts(struct intel_guc *guc) +{ + struct intel_context *ce, *cn; + unsigned long flags; + + spin_lock_irqsave(&guc->submission_state.lock, flags); + list_for_each_entry_safe(ce, cn, + &guc->submission_state.destroyed_contexts, + destroyed_link) { + list_del_init(&ce->destroyed_link); + guc_lrc_desc_unpin(ce); + } + spin_unlock_irqrestore(&guc->submission_state.lock, flags); +} + +static void destroyed_worker_func(struct work_struct *w) +{ + struct intel_guc *guc = container_of(w, struct intel_guc, + submission_state.destroyed_worker); + struct intel_gt *gt = guc_to_gt(guc); + int tmp; + + with_intel_gt_pm(gt, tmp) + deregister_destroyed_contexts(guc); +} + static void guc_context_destroy(struct kref *kref) { struct intel_context *ce = container_of(kref, typeof(*ce), ref); - struct intel_runtime_pm *runtime_pm = ce->engine->uncore->rpm; struct intel_guc *guc = ce_to_guc(ce); - intel_wakeref_t wakeref; unsigned long flags; - bool disabled; + bool destroy; /* * If the guc_id is invalid this context has been stolen and we can free * it immediately. Also can be freed immediately if the context is not * registered with the GuC or the GuC is in the middle of a reset. 
*/ - if (context_guc_id_invalid(ce)) { - __guc_context_destroy(ce); - return; - } else if (submission_disabled(guc) || - !lrc_desc_registered(guc, ce->guc_id.id)) { - release_guc_id(guc, ce); - __guc_context_destroy(ce); - return; - } - - /* - * We have to acquire the context spinlock and check guc_id again, if it - * is valid it hasn't been stolen and needs to be deregistered. We - * delete this context from the list of unpinned guc_id available to - * steal to seal a race with guc_lrc_desc_pin(). When the G2H CTB - * returns indicating this context has been deregistered the guc_id is - * returned to the pool of available guc_id. - */ spin_lock_irqsave(&guc->submission_state.lock, flags); - if (context_guc_id_invalid(ce)) { - spin_unlock_irqrestore(&guc->submission_state.lock, flags); - __guc_context_destroy(ce); - return; + destroy = submission_disabled(guc) || context_guc_id_invalid(ce) || + !lrc_desc_registered(guc, ce->guc_id.id); + if (likely(!destroy)) { + if (!list_empty(&ce->guc_id.link)) + list_del_init(&ce->guc_id.link); + list_add_tail(&ce->destroyed_link, + &guc->submission_state.destroyed_contexts); + } else { + __release_guc_id(guc, ce); } - - if (!list_empty(&ce->guc_id.link)) - list_del_init(&ce->guc_id.link); spin_unlock_irqrestore(&guc->submission_state.lock, flags); - - /* Seal race with Reset */ - spin_lock_irqsave(&ce->guc_state.lock, flags); - disabled = submission_disabled(guc); - if (likely(!disabled)) { - set_context_destroyed(ce); - clr_context_registered(ce); - } - spin_unlock_irqrestore(&ce->guc_state.lock, flags); - if (unlikely(disabled)) { - release_guc_id(guc, ce); + if (unlikely(destroy)) { __guc_context_destroy(ce); return; } /* - * We defer GuC context deregistration until the context is destroyed - * in order to save on CTBs. With this optimization ideally we only need - * 1 CTB to register the context during the first pin and 1 CTB to - * deregister the context when the context is destroyed. Without this - * optimization, a CTB would be needed every pin & unpin. - * - * XXX: Need to acqiure the runtime wakeref as this can be triggered - * from context_free_worker when runtime wakeref is not held. - * guc_lrc_desc_unpin requires the runtime as a GuC register is written - * in H2G CTB to deregister the context. A future patch may defer this - * H2G CTB if the runtime wakeref is zero. + * We use a worker to issue the H2G to deregister the context as we can + * take the GT PM for the first time which isn't allowed from an atomic + * context. */ - with_intel_runtime_pm(runtime_pm, wakeref) - guc_lrc_desc_unpin(ce); + queue_work(system_unbound_wq, &guc->submission_state.destroyed_worker); } static int guc_context_alloc(struct intel_context *ce) @@ -2798,6 +2835,7 @@ int intel_guc_deregister_done_process_msg(struct intel_guc *guc, intel_context_put(ce); } else if (context_destroyed(ce)) { /* Context has been destroyed */ + intel_gt_pm_put_async(guc_to_gt(guc)); release_guc_id(guc, ce); __guc_context_destroy(ce); } -- cgit v1.2.3 From 3897df4c0187d0f38fff6944c3beab4b6aa92a1b Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 14 Oct 2021 10:19:47 -0700 Subject: drm/i915/guc: Introduce context parent-child relationship Introduce context parent-child relationship. Once this relationship is created all pinning / unpinning operations are directed to the parent context. The parent context is responsible for pinning all of its children and itself. 
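In other words, a child context carries no pinning state of its own; any pin / unpin request on a child is redirected to its parent, which then walks all of its children. A reduced model of the relationship this patch adds (struct ctx and the function names below are illustrative stand-ins, not the full intel_context; the real code uses GEM_BUG_ON for the sanity checks) could look like:

#include <linux/bug.h>
#include <linux/list.h>
#include <linux/types.h>

struct ctx {
	union {
		struct list_head child_list;  /* parent: head of the children list */
		struct list_head child_link;  /* child: link in the parent's list */
	};
	struct ctx *parent;                   /* set only on children */
	u8 number_children;                   /* non-zero only on a parent */
};

static inline bool ctx_is_child(const struct ctx *c)  { return !!c->parent; }
static inline bool ctx_is_parent(const struct ctx *c) { return !!c->number_children; }

/* All pin / unpin style operations are funnelled through the parent. */
static inline struct ctx *ctx_to_parent(struct ctx *c)
{
	return ctx_is_child(c) ? c->parent : c;
}

/* The parent's child_list is assumed to have been INIT_LIST_HEAD()ed. */
static void ctx_bind_parent_child(struct ctx *parent, struct ctx *child)
{
	BUG_ON(ctx_is_child(parent) || ctx_is_parent(child));

	parent->number_children++;
	list_add_tail(&child->child_link, &parent->child_list);
	child->parent = parent;
}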
This is a precursor to the full GuC multi-lrc implementation but aligns with how the GuC multi-lrc interface is defined - a single H2G is used to register / deregister all of the contexts simultaneously. Subsequent patches in the series will implement the pinning / unpinning operations for parent / child contexts. v2: (Daniel Vetter) - Add kernel doc, add wrapper to access parent to ensure safety v3: (John Harrison) - Fix comment explaining GEM_BUG_ON in to_parent() - Make variable names generic (non-GuC specific) v4: (John Harrison) - s/its'/its/g Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20211014172005.27155-8-matthew.brost@intel.com --- drivers/gpu/drm/i915/gt/intel_context.c | 29 +++++++++++++++++++ drivers/gpu/drm/i915/gt/intel_context.h | 41 +++++++++++++++++++++++++++ drivers/gpu/drm/i915/gt/intel_context_types.h | 21 ++++++++++++++ 3 files changed, 91 insertions(+) (limited to 'drivers/gpu/drm/i915/gt/intel_context_types.h') diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 7a7cd2448277..4d95053855c9 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -403,6 +403,8 @@ intel_context_init(struct intel_context *ce, struct intel_engine_cs *engine) INIT_LIST_HEAD(&ce->destroyed_link); + INIT_LIST_HEAD(&ce->parallel.child_list); + /* * Initialize fence to be complete as this is expected to be complete * unless there is a pending schedule disable outstanding. @@ -417,10 +419,17 @@ intel_context_init(struct intel_context *ce, struct intel_engine_cs *engine) void intel_context_fini(struct intel_context *ce) { + struct intel_context *child, *next; + if (ce->timeline) intel_timeline_put(ce->timeline); i915_vm_put(ce->vm); + /* Need to put the creation ref for the children */ + if (intel_context_is_parent(ce)) + for_each_child_safe(ce, child, next) + intel_context_put(child); + mutex_destroy(&ce->pin_mutex); i915_active_fini(&ce->active); i915_sw_fence_fini(&ce->guc_state.blocked); @@ -537,6 +546,26 @@ struct i915_request *intel_context_find_active_request(struct intel_context *ce) return active; } +void intel_context_bind_parent_child(struct intel_context *parent, + struct intel_context *child) +{ + /* + * Callers responsibility to validate that this function is used + * correctly but we use GEM_BUG_ON here ensure that they do.
+ */ + GEM_BUG_ON(!intel_engine_uses_guc(parent->engine)); + GEM_BUG_ON(intel_context_is_pinned(parent)); + GEM_BUG_ON(intel_context_is_child(parent)); + GEM_BUG_ON(intel_context_is_pinned(child)); + GEM_BUG_ON(intel_context_is_child(child)); + GEM_BUG_ON(intel_context_is_parent(child)); + + parent->parallel.number_children++; + list_add_tail(&child->parallel.child_link, + &parent->parallel.child_list); + child->parallel.parent = parent; +} + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftest_context.c" #endif diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index c41098950746..b63c10a144af 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -44,6 +44,47 @@ void intel_context_free(struct intel_context *ce); int intel_context_reconfigure_sseu(struct intel_context *ce, const struct intel_sseu sseu); +static inline bool intel_context_is_child(struct intel_context *ce) +{ + return !!ce->parallel.parent; +} + +static inline bool intel_context_is_parent(struct intel_context *ce) +{ + return !!ce->parallel.number_children; +} + +static inline bool intel_context_is_pinned(struct intel_context *ce); + +static inline struct intel_context * +intel_context_to_parent(struct intel_context *ce) +{ + if (intel_context_is_child(ce)) { + /* + * The parent holds ref count to the child so it is always safe + * for the parent to access the child, but the child has a + * pointer to the parent without a ref. To ensure this is safe + * the child should only access the parent pointer while the + * parent is pinned. + */ + GEM_BUG_ON(!intel_context_is_pinned(ce->parallel.parent)); + + return ce->parallel.parent; + } else { + return ce; + } +} + +void intel_context_bind_parent_child(struct intel_context *parent, + struct intel_context *child); + +#define for_each_child(parent, ce)\ + list_for_each_entry(ce, &(parent)->parallel.child_list,\ + parallel.child_link) +#define for_each_child_safe(parent, ce, cn)\ + list_for_each_entry_safe(ce, cn, &(parent)->parallel.child_list,\ + parallel.child_link) + /** * intel_context_lock_pinned - Stablises the 'pinned' status of the HW context * @ce - the context diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 4613d027cbc3..76dfca57cb45 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -220,6 +220,27 @@ struct intel_context { */ struct list_head destroyed_link; + /** @parallel: sub-structure for parallel submission members */ + struct { + union { + /** + * @child_list: parent's list of children + * contexts, no protection as immutable after context + * creation + */ + struct list_head child_list; + /** + * @child_link: child's link into parent's list of + * children + */ + struct list_head child_link; + }; + /** @parent: pointer to parent if child */ + struct intel_context *parent; + /** @number_children: number of children if parent */ + u8 number_children; + } parallel; + #ifdef CONFIG_DRM_I915_SELFTEST /** * @drop_schedule_enable: Force drop of schedule enable G2H for selftest -- cgit v1.2.3 From c2aa552ff09daf78944f44e98d366009b27f1b63 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 14 Oct 2021 10:19:48 -0700 Subject: drm/i915/guc: Add multi-lrc context registration Add multi-lrc context registration H2G. 
In addition a workqueue and process descriptor are setup during multi-lrc context registration as these data structures are needed for multi-lrc submission. v2: (John Harrison) - Move GuC specific fields into sub-struct - Clean up WQ defines - Add comment explaining math to derive WQ / PD address v3: (John Harrison) - Add PARENT_SCRATCH_SIZE define - Update comment explaining multi-lrc register v4: (John Harrison) - Move PARENT_SCRATCH_SIZE to common file Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20211014172005.27155-9-matthew.brost@intel.com --- drivers/gpu/drm/i915/gt/intel_context.h | 2 + drivers/gpu/drm/i915/gt/intel_context_types.h | 12 +++ drivers/gpu/drm/i915/gt/intel_lrc.c | 5 + drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h | 1 + drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h | 2 - drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 115 +++++++++++++++++++++- 6 files changed, 134 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/i915/gt/intel_context_types.h') diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index b63c10a144af..9f0995150a7a 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -44,6 +44,8 @@ void intel_context_free(struct intel_context *ce); int intel_context_reconfigure_sseu(struct intel_context *ce, const struct intel_sseu sseu); +#define PARENT_SCRATCH_SIZE PAGE_SIZE + static inline bool intel_context_is_child(struct intel_context *ce) { return !!ce->parallel.parent; diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 76dfca57cb45..48decb5ee954 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -239,6 +239,18 @@ struct intel_context { struct intel_context *parent; /** @number_children: number of children if parent */ u8 number_children; + /** @guc: GuC specific members for parallel submission */ + struct { + /** @wqi_head: head pointer in work queue */ + u16 wqi_head; + /** @wqi_tail: tail pointer in work queue */ + u16 wqi_tail; + /** + * @parent_page: page in context state (ce->state) used + * by parent for work queue, process descriptor + */ + u8 parent_page; + } guc; } parallel; #ifdef CONFIG_DRM_I915_SELFTEST diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 3ef9eaf8c50e..56156cf18c41 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -942,6 +942,11 @@ __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine) context_size += PAGE_SIZE; } + if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) { + ce->parallel.guc.parent_page = context_size / PAGE_SIZE; + context_size += PARENT_SCRATCH_SIZE; + } + obj = i915_gem_object_create_lmem(engine->i915, context_size, I915_BO_ALLOC_PM_VOLATILE); if (IS_ERR(obj)) diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h index 8ff582222aff..ba10bd374cee 100644 --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h @@ -142,6 +142,7 @@ enum intel_guc_action { INTEL_GUC_ACTION_REGISTER_COMMAND_TRANSPORT_BUFFER = 0x4505, INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506, INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600, + INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 
0x4601, INTEL_GUC_ACTION_RESET_CLIENT = 0x5507, INTEL_GUC_ACTION_LIMIT }; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h index fa4be13c8854..0eeb2a9feeed 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h @@ -52,8 +52,6 @@ #define GUC_DOORBELL_INVALID 256 -#define GUC_WQ_SIZE (PAGE_SIZE * 2) - /* Work queue item header definitions */ #define WQ_STATUS_ACTIVE 1 #define WQ_STATUS_SUSPENDED 2 diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index fd6594bc1b96..d9f5be00e586 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -344,6 +344,46 @@ static inline struct i915_priolist *to_priolist(struct rb_node *rb) return rb_entry(rb, struct i915_priolist, node); } +/* + * When using multi-lrc submission a scratch memory area is reserved in the + * parent's context state for the process descriptor and work queue. Currently + * the scratch area is sized to a page. + * + * The layout of this scratch area is below: + * 0 guc_process_desc + * ... unused + * PARENT_SCRATCH_SIZE / 2 work queue start + * ... work queue + * PARENT_SCRATCH_SIZE - 1 work queue end + */ +#define WQ_SIZE (PARENT_SCRATCH_SIZE / 2) +#define WQ_OFFSET (PARENT_SCRATCH_SIZE - WQ_SIZE) +static u32 __get_process_desc_offset(struct intel_context *ce) +{ + GEM_BUG_ON(!ce->parallel.guc.parent_page); + + return ce->parallel.guc.parent_page * PAGE_SIZE; +} + +static u32 __get_wq_offset(struct intel_context *ce) +{ + return __get_process_desc_offset(ce) + WQ_OFFSET; +} + +static struct guc_process_desc * +__get_process_desc(struct intel_context *ce) +{ + /* + * Need to subtract LRC_STATE_OFFSET here as the + * parallel.guc.parent_page is the offset into ce->state while + * ce->lrc_reg_reg is ce->state + LRC_STATE_OFFSET. 
+ */ + return (struct guc_process_desc *) + (ce->lrc_reg_state + + ((__get_process_desc_offset(ce) - + LRC_STATE_OFFSET) / sizeof(u32))); +} + static struct guc_lrc_desc *__get_lrc_desc(struct intel_guc *guc, u32 index) { struct guc_lrc_desc *base = guc->lrc_desc_pool_vaddr; @@ -1365,6 +1405,30 @@ static void unpin_guc_id(struct intel_guc *guc, struct intel_context *ce) spin_unlock_irqrestore(&guc->submission_state.lock, flags); } +static int __guc_action_register_multi_lrc(struct intel_guc *guc, + struct intel_context *ce, + u32 guc_id, + u32 offset, + bool loop) +{ + struct intel_context *child; + u32 action[4 + MAX_ENGINE_INSTANCE]; + int len = 0; + + GEM_BUG_ON(ce->parallel.number_children > MAX_ENGINE_INSTANCE); + + action[len++] = INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC; + action[len++] = guc_id; + action[len++] = ce->parallel.number_children + 1; + action[len++] = offset; + for_each_child(ce, child) { + offset += sizeof(struct guc_lrc_desc); + action[len++] = offset; + } + + return guc_submission_send_busy_loop(guc, action, len, 0, loop); +} + static int __guc_action_register_context(struct intel_guc *guc, u32 guc_id, u32 offset, @@ -1387,9 +1451,15 @@ static int register_context(struct intel_context *ce, bool loop) ce->guc_id.id * sizeof(struct guc_lrc_desc); int ret; + GEM_BUG_ON(intel_context_is_child(ce)); trace_intel_context_register(ce); - ret = __guc_action_register_context(guc, ce->guc_id.id, offset, loop); + if (intel_context_is_parent(ce)) + ret = __guc_action_register_multi_lrc(guc, ce, ce->guc_id.id, + offset, loop); + else + ret = __guc_action_register_context(guc, ce->guc_id.id, offset, + loop); if (likely(!ret)) { unsigned long flags; @@ -1418,6 +1488,7 @@ static int deregister_context(struct intel_context *ce, u32 guc_id) { struct intel_guc *guc = ce_to_guc(ce); + GEM_BUG_ON(intel_context_is_child(ce)); trace_intel_context_deregister(ce); return __guc_action_deregister_context(guc, guc_id); @@ -1445,6 +1516,7 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) struct guc_lrc_desc *desc; bool context_registered; intel_wakeref_t wakeref; + struct intel_context *child; int ret = 0; GEM_BUG_ON(!engine->mask); @@ -1470,6 +1542,41 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; guc_context_policy_init(engine, desc); + /* + * If context is a parent, we need to register a process descriptor + * describing a work queue and register all child contexts. + */ + if (intel_context_is_parent(ce)) { + struct guc_process_desc *pdesc; + + ce->parallel.guc.wqi_tail = 0; + ce->parallel.guc.wqi_head = 0; + + desc->process_desc = i915_ggtt_offset(ce->state) + + __get_process_desc_offset(ce); + desc->wq_addr = i915_ggtt_offset(ce->state) + + __get_wq_offset(ce); + desc->wq_size = WQ_SIZE; + + pdesc = __get_process_desc(ce); + memset(pdesc, 0, sizeof(*(pdesc))); + pdesc->stage_id = ce->guc_id.id; + pdesc->wq_base_addr = desc->wq_addr; + pdesc->wq_size_bytes = desc->wq_size; + pdesc->wq_status = WQ_STATUS_ACTIVE; + + for_each_child(ce, child) { + desc = __get_lrc_desc(guc, child->guc_id.id); + + desc->engine_class = + engine_class_to_guc_class(engine->class); + desc->hw_context_desc = child->lrc.lrca; + desc->priority = ce->guc_state.prio; + desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; + guc_context_policy_init(engine, desc); + } + } + /* * The context_lookup xarray is used to determine if the hardware * context is currently registered. 
There are two cases in which it @@ -2804,6 +2911,12 @@ g2h_context_lookup(struct intel_guc *guc, u32 desc_idx) return NULL; } + if (unlikely(intel_context_is_child(ce))) { + drm_err(&guc_to_gt(guc)->i915->drm, + "Context is child, desc_idx %u", desc_idx); + return NULL; + } + return ce; } -- cgit v1.2.3 From bc955204919ea8152b7443e7d48a48cc18dea448 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 14 Oct 2021 10:19:53 -0700 Subject: drm/i915/guc: Insert submit fences between requests in parent-child relationship The GuC must receive requests in the order submitted for contexts in a parent-child relationship to function correctly. To ensure this, insert a submit fence between the current request and last request submitted for requests / contexts in a parent child relationship. This is conceptually similar to a single timeline. Signed-off-by: Matthew Brost Cc: John Harrison Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20211014172005.27155-14-matthew.brost@intel.com --- drivers/gpu/drm/i915/gt/intel_context.h | 5 + drivers/gpu/drm/i915/gt/intel_context_types.h | 6 ++ drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 +- drivers/gpu/drm/i915/i915_request.c | 120 +++++++++++++++++----- 4 files changed, 108 insertions(+), 28 deletions(-) (limited to 'drivers/gpu/drm/i915/gt/intel_context_types.h') diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index 9f0995150a7a..edf12caaade3 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -77,6 +77,11 @@ intel_context_to_parent(struct intel_context *ce) } } +static inline bool intel_context_is_parallel(struct intel_context *ce) +{ + return intel_context_is_child(ce) || intel_context_is_parent(ce); +} + void intel_context_bind_parent_child(struct intel_context *parent, struct intel_context *child); diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 48decb5ee954..8309d1141d0a 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -237,6 +237,12 @@ struct intel_context { }; /** @parent: pointer to parent if child */ struct intel_context *parent; + /** + * @last_rq: last request submitted on a parallel context, used + * to insert submit fences between requests in the parallel + * context + */ + struct i915_request *last_rq; /** @number_children: number of children if parent */ u8 number_children; /** @guc: GuC specific members for parallel submission */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 71ae5eb69849..ebb64fb50396 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -684,8 +684,7 @@ static inline int rq_prio(const struct i915_request *rq) static bool is_multi_lrc_rq(struct i915_request *rq) { - return intel_context_is_child(rq->context) || - intel_context_is_parent(rq->context); + return intel_context_is_parallel(rq->context); } static bool can_merge_rq(struct i915_request *rq, @@ -2873,6 +2872,8 @@ static void guc_parent_context_unpin(struct intel_context *ce) GEM_BUG_ON(!intel_context_is_parent(ce)); GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); + if (ce->parallel.last_rq) + i915_request_put(ce->parallel.last_rq); unpin_guc_id(guc, ce); lrc_unpin(ce); } diff --git a/drivers/gpu/drm/i915/i915_request.c 
b/drivers/gpu/drm/i915/i915_request.c index ed64fa9defdf..d29e46a001b4 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -1549,36 +1549,62 @@ i915_request_await_object(struct i915_request *to, return ret; } +static inline bool is_parallel_rq(struct i915_request *rq) +{ + return intel_context_is_parallel(rq->context); +} + +static inline struct intel_context *request_to_parent(struct i915_request *rq) +{ + return intel_context_to_parent(rq->context); +} + static struct i915_request * -__i915_request_add_to_timeline(struct i915_request *rq) +__i915_request_ensure_parallel_ordering(struct i915_request *rq, + struct intel_timeline *timeline) { - struct intel_timeline *timeline = i915_request_timeline(rq); struct i915_request *prev; - /* - * Dependency tracking and request ordering along the timeline - * is special cased so that we can eliminate redundant ordering - * operations while building the request (we know that the timeline - * itself is ordered, and here we guarantee it). - * - * As we know we will need to emit tracking along the timeline, - * we embed the hooks into our request struct -- at the cost of - * having to have specialised no-allocation interfaces (which will - * be beneficial elsewhere). - * - * A second benefit to open-coding i915_request_await_request is - * that we can apply a slight variant of the rules specialised - * for timelines that jump between engines (such as virtual engines). - * If we consider the case of virtual engine, we must emit a dma-fence - * to prevent scheduling of the second request until the first is - * complete (to maximise our greedy late load balancing) and this - * precludes optimising to use semaphores serialisation of a single - * timeline across engines. - */ + GEM_BUG_ON(!is_parallel_rq(rq)); + + prev = request_to_parent(rq)->parallel.last_rq; + if (prev) { + if (!__i915_request_is_complete(prev)) { + i915_sw_fence_await_sw_fence(&rq->submit, + &prev->submit, + &rq->submitq); + + if (rq->engine->sched_engine->schedule) + __i915_sched_node_add_dependency(&rq->sched, + &prev->sched, + &rq->dep, + 0); + } + i915_request_put(prev); + } + + request_to_parent(rq)->parallel.last_rq = i915_request_get(rq); + + return to_request(__i915_active_fence_set(&timeline->last_request, + &rq->fence)); +} + +static struct i915_request * +__i915_request_ensure_ordering(struct i915_request *rq, + struct intel_timeline *timeline) +{ + struct i915_request *prev; + + GEM_BUG_ON(is_parallel_rq(rq)); + prev = to_request(__i915_active_fence_set(&timeline->last_request, &rq->fence)); + if (prev && !__i915_request_is_complete(prev)) { bool uses_guc = intel_engine_uses_guc(rq->engine); + bool pow2 = is_power_of_2(READ_ONCE(prev->engine)->mask | + rq->engine->mask); + bool same_context = prev->context == rq->context; /* * The requests are supposed to be kept in order. However, @@ -1586,13 +1612,11 @@ __i915_request_add_to_timeline(struct i915_request *rq) * is used as a barrier for external modification to this * context. 
*/ - GEM_BUG_ON(prev->context == rq->context && + GEM_BUG_ON(same_context && i915_seqno_passed(prev->fence.seqno, rq->fence.seqno)); - if ((!uses_guc && - is_power_of_2(READ_ONCE(prev->engine)->mask | rq->engine->mask)) || - (uses_guc && prev->context == rq->context)) + if ((same_context && uses_guc) || (!uses_guc && pow2)) i915_sw_fence_await_sw_fence(&rq->submit, &prev->submit, &rq->submitq); @@ -1607,6 +1631,50 @@ __i915_request_add_to_timeline(struct i915_request *rq) 0); } + return prev; +} + +static struct i915_request * +__i915_request_add_to_timeline(struct i915_request *rq) +{ + struct intel_timeline *timeline = i915_request_timeline(rq); + struct i915_request *prev; + + /* + * Dependency tracking and request ordering along the timeline + * is special cased so that we can eliminate redundant ordering + * operations while building the request (we know that the timeline + * itself is ordered, and here we guarantee it). + * + * As we know we will need to emit tracking along the timeline, + * we embed the hooks into our request struct -- at the cost of + * having to have specialised no-allocation interfaces (which will + * be beneficial elsewhere). + * + * A second benefit to open-coding i915_request_await_request is + * that we can apply a slight variant of the rules specialised + * for timelines that jump between engines (such as virtual engines). + * If we consider the case of virtual engine, we must emit a dma-fence + * to prevent scheduling of the second request until the first is + * complete (to maximise our greedy late load balancing) and this + * precludes optimising to use semaphores serialisation of a single + * timeline across engines. + * + * We do not order parallel submission requests on the timeline as each + * parallel submission context has its own timeline and the ordering + * rules for parallel requests are that they must be submitted in the + * order received from the execbuf IOCTL. So rather than using the + * timeline we store a pointer to last request submitted in the + * relationship in the gem context and insert a submission fence + * between that request and request passed into this function or + * alternatively we use completion fence if gem context has a single + * timeline and this is the first submission of an execbuf IOCTL. + */ + if (likely(!is_parallel_rq(rq))) + prev = __i915_request_ensure_ordering(rq, timeline); + else + prev = __i915_request_ensure_parallel_ordering(rq, timeline); + /* * Make sure that no request gazumped us - if it was allocated after * our i915_request_alloc() and called __i915_request_add() before -- cgit v1.2.3 From e5e32171a2cf1e434d4f88e12467f3e47d0ec618 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 14 Oct 2021 10:19:56 -0700 Subject: drm/i915/guc: Connect UAPI to GuC multi-lrc interface Introduce 'set parallel submit' extension to connect UAPI to GuC multi-lrc interface. Kernel doc in new uAPI should explain it all. 
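For orientation, a hypothetical userspace snippet is sketched below; it is not part of this patch (which still disables the extension and returns -ENODEV), set_parallel_engines() and the fd / context-id handling are assumptions, the updated uapi header is assumed to be reachable as <drm/i915_drm.h>, and error handling is omitted. It configures engine slot 0 as a width-2 parallel engine with one sibling per context, mirroring Example 1 of the new kernel doc:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int set_parallel_engines(int i915_fd, uint32_t ctx_id)
{
	/* Slot 0 runs two batches per execbuf, one fixed placement each. */
	I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(parallel, 2);
	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1);
	struct drm_i915_gem_context_param param = {
		.ctx_id = ctx_id,
		.param = I915_CONTEXT_PARAM_ENGINES,
		.size = sizeof(engines),
		.value = (uintptr_t)&engines,
	};

	memset(&parallel, 0, sizeof(parallel));
	parallel.base.name = I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT;
	parallel.engine_index = 0;
	parallel.width = 2;            /* number of BBs per submission */
	parallel.num_siblings = 1;     /* placements per BB */
	parallel.engines[0].engine_class = I915_ENGINE_CLASS_VIDEO;
	parallel.engines[0].engine_instance = 0;   /* CS[0] */
	parallel.engines[1].engine_class = I915_ENGINE_CLASS_VIDEO;
	parallel.engines[1].engine_instance = 1;   /* CS[1] */

	memset(&engines, 0, sizeof(engines));
	engines.extensions = (uintptr_t)&parallel;
	/* Slot 0 is left INVALID; the parallel extension populates it. */
	engines.engines[0].engine_class = I915_ENGINE_CLASS_INVALID;
	engines.engines[0].engine_instance = I915_ENGINE_CLASS_INVALID_NONE;

	return ioctl(i915_fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &param);
}

After such a configuration, an execbuf on slot 0 must supply exactly two batch buffers (the last two objects, or the first two with I915_EXEC_BATCH_FIRST), as described in the kernel doc added by this patch.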
IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071&rev=1 media UMD: https://github.com/intel/media-driver/pull/1252 v2: (Daniel Vetter) - Add IGT link and placeholder for media UMD link v3: (Kernel test robot) - Fix warning in unpin engines call (John Harrison) - Reword a bunch of the kernel doc v4: (John Harrison) - Add comment why perma-pin is done after setting gem context - Update some comments / docs for proto contexts v5: (John Harrison) - Rework perma-pin comment - Add BUG_IN if context is pinned when setting gem context Cc: Tvrtko Ursulin Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20211014172005.27155-17-matthew.brost@intel.com --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 230 ++++++++++++++++++++- drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 16 +- drivers/gpu/drm/i915/gt/intel_context_types.h | 9 +- drivers/gpu/drm/i915/gt/intel_engine.h | 12 +- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 6 +- .../gpu/drm/i915/gt/intel_execlists_submission.c | 6 +- drivers/gpu/drm/i915/gt/selftest_execlists.c | 12 +- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 114 ++++++++-- include/uapi/drm/i915_drm.h | 131 ++++++++++++ 9 files changed, 505 insertions(+), 31 deletions(-) (limited to 'drivers/gpu/drm/i915/gt/intel_context_types.h') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index d225d3dd0b40..9a00f11fef46 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -556,9 +556,150 @@ set_proto_ctx_engines_bond(struct i915_user_extension __user *base, void *data) return 0; } +static int +set_proto_ctx_engines_parallel_submit(struct i915_user_extension __user *base, + void *data) +{ + struct i915_context_engines_parallel_submit __user *ext = + container_of_user(base, typeof(*ext), base); + const struct set_proto_ctx_engines *set = data; + struct drm_i915_private *i915 = set->i915; + u64 flags; + int err = 0, n, i, j; + u16 slot, width, num_siblings; + struct intel_engine_cs **siblings = NULL; + intel_engine_mask_t prev_mask; + + /* Disabling for now */ + return -ENODEV; + + /* FIXME: This is NIY for execlists */ + if (!(intel_uc_uses_guc_submission(&i915->gt.uc))) + return -ENODEV; + + if (get_user(slot, &ext->engine_index)) + return -EFAULT; + + if (get_user(width, &ext->width)) + return -EFAULT; + + if (get_user(num_siblings, &ext->num_siblings)) + return -EFAULT; + + if (slot >= set->num_engines) { + drm_dbg(&i915->drm, "Invalid placement value, %d >= %d\n", + slot, set->num_engines); + return -EINVAL; + } + + if (set->engines[slot].type != I915_GEM_ENGINE_TYPE_INVALID) { + drm_dbg(&i915->drm, + "Invalid placement[%d], already occupied\n", slot); + return -EINVAL; + } + + if (get_user(flags, &ext->flags)) + return -EFAULT; + + if (flags) { + drm_dbg(&i915->drm, "Unknown flags 0x%02llx", flags); + return -EINVAL; + } + + for (n = 0; n < ARRAY_SIZE(ext->mbz64); n++) { + err = check_user_mbz(&ext->mbz64[n]); + if (err) + return err; + } + + if (width < 2) { + drm_dbg(&i915->drm, "Width (%d) < 2\n", width); + return -EINVAL; + } + + if (num_siblings < 1) { + drm_dbg(&i915->drm, "Number siblings (%d) < 1\n", + num_siblings); + return -EINVAL; + } + + siblings = kmalloc_array(num_siblings * width, + sizeof(*siblings), + GFP_KERNEL); + if (!siblings) + return -ENOMEM; + + /* Create contexts / engines */ + for (i = 0; i < width; ++i) { + intel_engine_mask_t current_mask 
= 0; + struct i915_engine_class_instance prev_engine; + + for (j = 0; j < num_siblings; ++j) { + struct i915_engine_class_instance ci; + + n = i * num_siblings + j; + if (copy_from_user(&ci, &ext->engines[n], sizeof(ci))) { + err = -EFAULT; + goto out_err; + } + + siblings[n] = + intel_engine_lookup_user(i915, ci.engine_class, + ci.engine_instance); + if (!siblings[n]) { + drm_dbg(&i915->drm, + "Invalid sibling[%d]: { class:%d, inst:%d }\n", + n, ci.engine_class, ci.engine_instance); + err = -EINVAL; + goto out_err; + } + + if (n) { + if (prev_engine.engine_class != + ci.engine_class) { + drm_dbg(&i915->drm, + "Mismatched class %d, %d\n", + prev_engine.engine_class, + ci.engine_class); + err = -EINVAL; + goto out_err; + } + } + + prev_engine = ci; + current_mask |= siblings[n]->logical_mask; + } + + if (i > 0) { + if (current_mask != prev_mask << 1) { + drm_dbg(&i915->drm, + "Non contiguous logical mask 0x%x, 0x%x\n", + prev_mask, current_mask); + err = -EINVAL; + goto out_err; + } + } + prev_mask = current_mask; + } + + set->engines[slot].type = I915_GEM_ENGINE_TYPE_PARALLEL; + set->engines[slot].num_siblings = num_siblings; + set->engines[slot].width = width; + set->engines[slot].siblings = siblings; + + return 0; + +out_err: + kfree(siblings); + + return err; +} + static const i915_user_extension_fn set_proto_ctx_engines_extensions[] = { [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_proto_ctx_engines_balance, [I915_CONTEXT_ENGINES_EXT_BOND] = set_proto_ctx_engines_bond, + [I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT] = + set_proto_ctx_engines_parallel_submit, }; static int set_proto_ctx_engines(struct drm_i915_file_private *fpriv, @@ -794,6 +935,7 @@ static int intel_context_set_gem(struct intel_context *ce, GEM_BUG_ON(rcu_access_pointer(ce->gem_context)); RCU_INIT_POINTER(ce->gem_context, ctx); + GEM_BUG_ON(intel_context_is_pinned(ce)); ce->ring_size = SZ_16K; i915_vm_put(ce->vm); @@ -818,6 +960,25 @@ static int intel_context_set_gem(struct intel_context *ce, return ret; } +static void __unpin_engines(struct i915_gem_engines *e, unsigned int count) +{ + while (count--) { + struct intel_context *ce = e->engines[count], *child; + + if (!ce || !test_bit(CONTEXT_PERMA_PIN, &ce->flags)) + continue; + + for_each_child(ce, child) + intel_context_unpin(child); + intel_context_unpin(ce); + } +} + +static void unpin_engines(struct i915_gem_engines *e) +{ + __unpin_engines(e, e->num_engines); +} + static void __free_engines(struct i915_gem_engines *e, unsigned int count) { while (count--) { @@ -933,6 +1094,40 @@ free_engines: return err; } +static int perma_pin_contexts(struct intel_context *ce) +{ + struct intel_context *child; + int i = 0, j = 0, ret; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + ret = intel_context_pin(ce); + if (unlikely(ret)) + return ret; + + for_each_child(ce, child) { + ret = intel_context_pin(child); + if (unlikely(ret)) + goto unwind; + ++i; + } + + set_bit(CONTEXT_PERMA_PIN, &ce->flags); + + return 0; + +unwind: + intel_context_unpin(ce); + for_each_child(ce, child) { + if (j++ < i) + intel_context_unpin(child); + else + break; + } + + return ret; +} + static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, unsigned int num_engines, struct i915_gem_proto_engine *pe) @@ -946,7 +1141,7 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, e->num_engines = num_engines; for (n = 0; n < num_engines; n++) { - struct intel_context *ce; + struct intel_context *ce, *child; int ret; switch (pe[n].type) { @@ -956,7 +1151,13 @@ static 
struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, case I915_GEM_ENGINE_TYPE_BALANCED: ce = intel_engine_create_virtual(pe[n].siblings, - pe[n].num_siblings); + pe[n].num_siblings, 0); + break; + + case I915_GEM_ENGINE_TYPE_PARALLEL: + ce = intel_engine_create_parallel(pe[n].siblings, + pe[n].num_siblings, + pe[n].width); break; case I915_GEM_ENGINE_TYPE_INVALID: @@ -977,6 +1178,30 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, err = ERR_PTR(ret); goto free_engines; } + for_each_child(ce, child) { + ret = intel_context_set_gem(child, ctx, pe->sseu); + if (ret) { + err = ERR_PTR(ret); + goto free_engines; + } + } + + /* + * XXX: Must be done after calling intel_context_set_gem as that + * function changes the ring size. The ring is allocated when + * the context is pinned. If the ring size is changed after + * allocation we have a mismatch of the ring size and will cause + * the context to hang. Presumably with a bit of reordering we + * could move the perma-pin step to the backend function + * intel_engine_create_parallel. + */ + if (pe[n].type == I915_GEM_ENGINE_TYPE_PARALLEL) { + ret = perma_pin_contexts(ce); + if (ret) { + err = ERR_PTR(ret); + goto free_engines; + } + } } return e; @@ -1219,6 +1444,7 @@ static void context_close(struct i915_gem_context *ctx) /* Flush any concurrent set_engines() */ mutex_lock(&ctx->engines_mutex); + unpin_engines(__context_engines_static(ctx)); engines_idle_release(ctx, rcu_replace_pointer(ctx->engines, NULL, 1)); i915_gem_context_set_closed(ctx); mutex_unlock(&ctx->engines_mutex); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h index a627b09c4680..282cdb8a5c5a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h @@ -78,13 +78,16 @@ enum i915_gem_engine_type { /** @I915_GEM_ENGINE_TYPE_BALANCED: A load-balanced engine set */ I915_GEM_ENGINE_TYPE_BALANCED, + + /** @I915_GEM_ENGINE_TYPE_PARALLEL: A parallel engine set */ + I915_GEM_ENGINE_TYPE_PARALLEL, }; /** * struct i915_gem_proto_engine - prototype engine * * This struct describes an engine that a context may contain. Engines - * have three types: + * have four types: * * - I915_GEM_ENGINE_TYPE_INVALID: Invalid engines can be created but they * show up as a NULL in i915_gem_engines::engines[i] and any attempt to @@ -97,6 +100,10 @@ enum i915_gem_engine_type { * * - I915_GEM_ENGINE_TYPE_BALANCED: A load-balanced engine set, described * i915_gem_proto_engine::num_siblings and i915_gem_proto_engine::siblings. + * + * - I915_GEM_ENGINE_TYPE_PARALLEL: A parallel submission engine set, described + * i915_gem_proto_engine::width, i915_gem_proto_engine::num_siblings, and + * i915_gem_proto_engine::siblings. 
*/ struct i915_gem_proto_engine { /** @type: Type of this engine */ @@ -105,10 +112,13 @@ struct i915_gem_proto_engine { /** @engine: Engine, for physical */ struct intel_engine_cs *engine; - /** @num_siblings: Number of balanced siblings */ + /** @num_siblings: Number of balanced or parallel siblings */ unsigned int num_siblings; - /** @siblings: Balanced siblings */ + /** @width: Width of each sibling */ + unsigned int width; + + /** @siblings: Balanced siblings or num_siblings * width for parallel */ struct intel_engine_cs **siblings; /** @sseu: Client-set SSEU parameters */ diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 8309d1141d0a..1d880303a7e4 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -55,9 +55,13 @@ struct intel_context_ops { void (*reset)(struct intel_context *ce); void (*destroy)(struct kref *kref); - /* virtual engine/context interface */ + /* virtual/parallel engine/context interface */ struct intel_context *(*create_virtual)(struct intel_engine_cs **engine, - unsigned int count); + unsigned int count, + unsigned long flags); + struct intel_context *(*create_parallel)(struct intel_engine_cs **engines, + unsigned int num_siblings, + unsigned int width); struct intel_engine_cs *(*get_sibling)(struct intel_engine_cs *engine, unsigned int sibling); }; @@ -113,6 +117,7 @@ struct intel_context { #define CONTEXT_NOPREEMPT 8 #define CONTEXT_LRCA_DIRTY 9 #define CONTEXT_GUC_INIT 10 +#define CONTEXT_PERMA_PIN 11 struct { u64 timeout_us; diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index d5ac49c0691e..08559ace0ada 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -282,9 +282,19 @@ intel_engine_has_preempt_reset(const struct intel_engine_cs *engine) return intel_engine_has_preemption(engine); } +#define FORCE_VIRTUAL BIT(0) struct intel_context * intel_engine_create_virtual(struct intel_engine_cs **siblings, - unsigned int count); + unsigned int count, unsigned long flags); + +static inline struct intel_context * +intel_engine_create_parallel(struct intel_engine_cs **engines, + unsigned int num_engines, + unsigned int width) +{ + GEM_BUG_ON(!engines[0]->cops->create_parallel); + return engines[0]->cops->create_parallel(engines, num_engines, width); +} static inline bool intel_virtual_engine_has_heartbeat(const struct intel_engine_cs *engine) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 2eb798ad068b..ff6753ccb129 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -1953,16 +1953,16 @@ ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now) struct intel_context * intel_engine_create_virtual(struct intel_engine_cs **siblings, - unsigned int count) + unsigned int count, unsigned long flags) { if (count == 0) return ERR_PTR(-EINVAL); - if (count == 1) + if (count == 1 && !(flags & FORCE_VIRTUAL)) return intel_context_create(siblings[0]); GEM_BUG_ON(!siblings[0]->cops->create_virtual); - return siblings[0]->cops->create_virtual(siblings, count); + return siblings[0]->cops->create_virtual(siblings, count, flags); } struct i915_request * diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 43a74b216efb..bedb80057046 100644 --- 
a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -201,7 +201,8 @@ static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) } static struct intel_context * -execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count); +execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count, + unsigned long flags); static struct i915_request * __active_request(const struct intel_timeline * const tl, @@ -3784,7 +3785,8 @@ unlock: } static struct intel_context * -execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count) +execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count, + unsigned long flags) { struct virtual_engine *ve; unsigned int n; diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c index 25a8c4f62b0d..b367ecfa42de 100644 --- a/drivers/gpu/drm/i915/gt/selftest_execlists.c +++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c @@ -3733,7 +3733,7 @@ static int nop_virtual_engine(struct intel_gt *gt, GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ve)); for (n = 0; n < nctx; n++) { - ve[n] = intel_engine_create_virtual(siblings, nsibling); + ve[n] = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ve[n])) { err = PTR_ERR(ve[n]); nctx = n; @@ -3929,7 +3929,7 @@ static int mask_virtual_engine(struct intel_gt *gt, * restrict it to our desired engine within the virtual engine. */ - ve = intel_engine_create_virtual(siblings, nsibling); + ve = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ve)) { err = PTR_ERR(ve); goto out_close; @@ -4060,7 +4060,7 @@ static int slicein_virtual_engine(struct intel_gt *gt, i915_request_add(rq); } - ce = intel_engine_create_virtual(siblings, nsibling); + ce = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ce)) { err = PTR_ERR(ce); goto out; @@ -4112,7 +4112,7 @@ static int sliceout_virtual_engine(struct intel_gt *gt, /* XXX We do not handle oversubscription and fairness with normal rq */ for (n = 0; n < nsibling; n++) { - ce = intel_engine_create_virtual(siblings, nsibling); + ce = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ce)) { err = PTR_ERR(ce); goto out; @@ -4214,7 +4214,7 @@ static int preserved_virtual_engine(struct intel_gt *gt, if (err) goto out_scratch; - ve = intel_engine_create_virtual(siblings, nsibling); + ve = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ve)) { err = PTR_ERR(ve); goto out_scratch; @@ -4354,7 +4354,7 @@ static int reset_virtual_engine(struct intel_gt *gt, if (igt_spinner_init(&spin, gt)) return -ENOMEM; - ve = intel_engine_create_virtual(siblings, nsibling); + ve = intel_engine_create_virtual(siblings, nsibling, 0); if (IS_ERR(ve)) { err = PTR_ERR(ve); goto out_spin; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 938dc34e8d3a..7c12364a017a 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -124,7 +124,13 @@ struct guc_virtual_engine { }; static struct intel_context * -guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count); +guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, + unsigned long flags); + +static struct intel_context * +guc_create_parallel(struct intel_engine_cs **engines, + unsigned int num_siblings, + unsigned int width); #define GUC_REQUEST_SIZE 64 
/* bytes */ @@ -2609,6 +2615,7 @@ static const struct intel_context_ops guc_context_ops = { .destroy = guc_context_destroy, .create_virtual = guc_create_virtual, + .create_parallel = guc_create_parallel, }; static void submit_work_cb(struct irq_work *wrk) @@ -2858,8 +2865,6 @@ static const struct intel_context_ops virtual_guc_context_ops = { .get_sibling = guc_virtual_get_sibling, }; -/* Future patches will use this function */ -__maybe_unused static int guc_parent_context_pin(struct intel_context *ce, void *vaddr) { struct intel_engine_cs *engine = guc_virtual_get_sibling(ce->engine, 0); @@ -2876,8 +2881,6 @@ static int guc_parent_context_pin(struct intel_context *ce, void *vaddr) return __guc_context_pin(ce, engine, vaddr); } -/* Future patches will use this function */ -__maybe_unused static int guc_child_context_pin(struct intel_context *ce, void *vaddr) { struct intel_engine_cs *engine = guc_virtual_get_sibling(ce->engine, 0); @@ -2889,8 +2892,6 @@ static int guc_child_context_pin(struct intel_context *ce, void *vaddr) return __guc_context_pin(ce, engine, vaddr); } -/* Future patches will use this function */ -__maybe_unused static void guc_parent_context_unpin(struct intel_context *ce) { struct intel_guc *guc = ce_to_guc(ce); @@ -2906,8 +2907,6 @@ static void guc_parent_context_unpin(struct intel_context *ce) lrc_unpin(ce); } -/* Future patches will use this function */ -__maybe_unused static void guc_child_context_unpin(struct intel_context *ce) { GEM_BUG_ON(context_enabled(ce)); @@ -2918,8 +2917,6 @@ static void guc_child_context_unpin(struct intel_context *ce) lrc_unpin(ce); } -/* Future patches will use this function */ -__maybe_unused static void guc_child_context_post_unpin(struct intel_context *ce) { GEM_BUG_ON(!intel_context_is_child(ce)); @@ -2930,6 +2927,98 @@ static void guc_child_context_post_unpin(struct intel_context *ce) intel_context_unpin(ce->parallel.parent); } +static void guc_child_context_destroy(struct kref *kref) +{ + struct intel_context *ce = container_of(kref, typeof(*ce), ref); + + __guc_context_destroy(ce); +} + +static const struct intel_context_ops virtual_parent_context_ops = { + .alloc = guc_virtual_context_alloc, + + .pre_pin = guc_context_pre_pin, + .pin = guc_parent_context_pin, + .unpin = guc_parent_context_unpin, + .post_unpin = guc_context_post_unpin, + + .ban = guc_context_ban, + + .cancel_request = guc_context_cancel_request, + + .enter = guc_virtual_context_enter, + .exit = guc_virtual_context_exit, + + .sched_disable = guc_context_sched_disable, + + .destroy = guc_context_destroy, + + .get_sibling = guc_virtual_get_sibling, +}; + +static const struct intel_context_ops virtual_child_context_ops = { + .alloc = guc_virtual_context_alloc, + + .pre_pin = guc_context_pre_pin, + .pin = guc_child_context_pin, + .unpin = guc_child_context_unpin, + .post_unpin = guc_child_context_post_unpin, + + .cancel_request = guc_context_cancel_request, + + .enter = guc_virtual_context_enter, + .exit = guc_virtual_context_exit, + + .destroy = guc_child_context_destroy, + + .get_sibling = guc_virtual_get_sibling, +}; + +static struct intel_context * +guc_create_parallel(struct intel_engine_cs **engines, + unsigned int num_siblings, + unsigned int width) +{ + struct intel_engine_cs **siblings = NULL; + struct intel_context *parent = NULL, *ce, *err; + int i, j; + + siblings = kmalloc_array(num_siblings, + sizeof(*siblings), + GFP_KERNEL); + if (!siblings) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < width; ++i) { + for (j = 0; j < num_siblings; ++j) + siblings[j] = 
engines[i * num_siblings + j]; + + ce = intel_engine_create_virtual(siblings, num_siblings, + FORCE_VIRTUAL); + if (!ce) { + err = ERR_PTR(-ENOMEM); + goto unwind; + } + + if (i == 0) { + parent = ce; + parent->ops = &virtual_parent_context_ops; + } else { + ce->ops = &virtual_child_context_ops; + intel_context_bind_parent_child(parent, ce); + } + } + + kfree(siblings); + return parent; + +unwind: + if (parent) + intel_context_put(parent); + kfree(siblings); + return err; +} + static bool guc_irq_enable_breadcrumbs(struct intel_breadcrumbs *b) { @@ -3756,7 +3845,8 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, } static struct intel_context * -guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count) +guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, + unsigned long flags) { struct guc_virtual_engine *ve; struct intel_guc *guc; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 0179f92e0916..c2a63e1584cb 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1824,6 +1824,7 @@ struct drm_i915_gem_context_param { * Extensions: * i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE) * i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND) + * i915_context_engines_parallel_submit (I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT) */ #define I915_CONTEXT_PARAM_ENGINES 0xa @@ -2098,6 +2099,135 @@ struct i915_context_engines_bond { struct i915_engine_class_instance engines[N__]; \ } __attribute__((packed)) name__ +/** + * struct i915_context_engines_parallel_submit - Configure engine for + * parallel submission. + * + * Setup a slot in the context engine map to allow multiple BBs to be submitted + * in a single execbuf IOCTL. Those BBs will then be scheduled to run on the GPU + * in parallel. Multiple hardware contexts are created internally in the i915 to + * run these BBs. Once a slot is configured for N BBs only N BBs can be + * submitted in each execbuf IOCTL and this is implicit behavior e.g. The user + * doesn't tell the execbuf IOCTL there are N BBs, the execbuf IOCTL knows how + * many BBs there are based on the slot's configuration. The N BBs are the last + * N buffer objects or first N if I915_EXEC_BATCH_FIRST is set. + * + * The default placement behavior is to create implicit bonds between each + * context if each context maps to more than 1 physical engine (e.g. context is + * a virtual engine). Also we only allow contexts of same engine class and these + * contexts must be in logically contiguous order. Examples of the placement + * behavior are described below. Lastly, the default is to not allow BBs to be + * preempted mid-batch. Rather insert coordinated preemption points on all + * hardware contexts between each set of BBs. Flags could be added in the future + * to change both of these default behaviors. + * + * Returns -EINVAL if hardware context placement configuration is invalid or if + * the placement configuration isn't supported on the platform / submission + * interface. + * Returns -ENODEV if extension isn't supported on the platform / submission + * interface. + * + * .. 
code-block:: none + * + * Examples syntax: + * CS[X] = generic engine of same class, logical instance X + * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE + * + * Example 1 pseudo code: + * set_engines(INVALID) + * set_parallel(engine_index=0, width=2, num_siblings=1, + * engines=CS[0],CS[1]) + * + * Results in the following valid placement: + * CS[0], CS[1] + * + * Example 2 pseudo code: + * set_engines(INVALID) + * set_parallel(engine_index=0, width=2, num_siblings=2, + * engines=CS[0],CS[2],CS[1],CS[3]) + * + * Results in the following valid placements: + * CS[0], CS[1] + * CS[2], CS[3] + * + * This can be thought of as two virtual engines, each containing two + * engines thereby making a 2D array. However, there are bonds tying the + * entries together and placing restrictions on how they can be scheduled. + * Specifically, the scheduler can choose only vertical columns from the 2D + * array. That is, CS[0] is bonded to CS[1] and CS[2] to CS[3]. So if the + * scheduler wants to submit to CS[0], it must also choose CS[1] and vice + * versa. Same for CS[2] requires also using CS[3]. + * VE[0] = CS[0], CS[2] + * VE[1] = CS[1], CS[3] + * + * Example 3 pseudo code: + * set_engines(INVALID) + * set_parallel(engine_index=0, width=2, num_siblings=2, + * engines=CS[0],CS[1],CS[1],CS[3]) + * + * Results in the following valid and invalid placements: + * CS[0], CS[1] + * CS[1], CS[3] - Not logically contiguous, return -EINVAL + */ +struct i915_context_engines_parallel_submit { + /** + * @base: base user extension. + */ + struct i915_user_extension base; + + /** + * @engine_index: slot for parallel engine + */ + __u16 engine_index; + + /** + * @width: number of contexts per parallel engine or in other words the + * number of batches in each submission + */ + __u16 width; + + /** + * @num_siblings: number of siblings per context or in other words the + * number of possible placements for each submission + */ + __u16 num_siblings; + + /** + * @mbz16: reserved for future use; must be zero + */ + __u16 mbz16; + + /** + * @flags: all undefined flags must be zero, currently not defined flags + */ + __u64 flags; + + /** + * @mbz64: reserved for future use; must be zero + */ + __u64 mbz64[3]; + + /** + * @engines: 2-d array of engine instances to configure parallel engine + * + * length = width (i) * num_siblings (j) + * index = j + i * num_siblings + */ + struct i915_engine_class_instance engines[0]; + +} __packed; + +#define I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(name__, N__) struct { \ + struct i915_user_extension base; \ + __u16 engine_index; \ + __u16 width; \ + __u16 num_siblings; \ + __u16 mbz16; \ + __u64 flags; \ + __u64 mbz64[3]; \ + struct i915_engine_class_instance engines[N__]; \ +} __attribute__((packed)) name__ + /** * DOC: Context Engine Map uAPI * @@ -2157,6 +2287,7 @@ struct i915_context_param_engines { __u64 extensions; /* linked chain of extension blocks, 0 terminates */ #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */ #define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */ +#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */ struct i915_engine_class_instance engines[0]; } __attribute__((packed)); -- cgit v1.2.3 From 5851387a422c2949cb19b52efd9616ff8b18bddd Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 14 Oct 2021 10:19:59 -0700 Subject: drm/i915/guc: Implement no mid batch preemption for multi-lrc For some users of multi-lrc, e.g. 
split frame, it isn't safe to preempt mid BB. To safely enable preemption at the BB boundary, a handshake between parent and child is needed, syncing the set of BBs at the beginning and end of each batch. This is implemented via custom emit_bb_start & emit_fini_breadcrumb functions and enabled by default if a context is configured via the set parallel extension. Lastly, this patch updates the process descriptor to the correct size as the memory used in the handshake is directly after the process descriptor. v2: (John Harrison) - Fix wording of a few comments - Add structure for parent page layout v3: (John Harrison) - Add a structure for the sync semaphore - Use offsetof to calc address - Update commit message v4: (John Harrison) - Fix typos in comment explaining memory map of scratch page Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20211014172005.27155-20-matthew.brost@intel.com --- drivers/gpu/drm/i915/gt/intel_context.c | 2 +- drivers/gpu/drm/i915/gt/intel_context_types.h | 2 + drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h | 2 +- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 333 +++++++++++++++++++++- 4 files changed, 326 insertions(+), 13 deletions(-) (limited to 'drivers/gpu/drm/i915/gt/intel_context_types.h') diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 709c433a2dcc..488acd39ff67 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -569,7 +569,7 @@ void intel_context_bind_parent_child(struct intel_context *parent, GEM_BUG_ON(intel_context_is_child(child)); GEM_BUG_ON(intel_context_is_parent(child)); - parent->parallel.number_children++; + parent->parallel.child_index = parent->parallel.number_children++; list_add_tail(&child->parallel.child_link, &parent->parallel.child_list); child->parallel.parent = parent; diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 1d880303a7e4..95a5b94b4ece 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -250,6 +250,8 @@ struct intel_context { struct i915_request *last_rq; /** @number_children: number of children if parent */ u8 number_children; + /** @child_index: index into child_list if child */ + u8 child_index; /** @guc: GuC specific members for parallel submission */ struct { /** @wqi_head: head pointer in work queue */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h index 18da67cfcd92..722933e26347 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h @@ -186,7 +186,7 @@ struct guc_process_desc { u32 wq_status; u32 engine_presence; u32 priority; - u32 reserved[30]; + u32 reserved[36]; } __packed; #define CONTEXT_REGISTRATION_FLAG_KMD BIT(0) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 57eb5f8bc8bb..50f0f4eba03b 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -11,6 +11,7 @@ #include "gt/intel_context.h" #include "gt/intel_engine_pm.h" #include "gt/intel_engine_heartbeat.h" +#include "gt/intel_gpu_commands.h" #include "gt/intel_gt.h" #include "gt/intel_gt_irq.h" #include "gt/intel_gt_pm.h" @@ -368,11 +369,16 @@ static inline struct i915_priolist *to_priolist(struct rb_node *rb) /* * When
using multi-lrc submission a scratch memory area is reserved in the - * parent's context state for the process descriptor and work queue. Currently - * the scratch area is sized to a page. + * parent's context state for the process descriptor, work queue, and handshake + * between the parent + children contexts to insert safe preemption points + * between each of the BBs. Currently the scratch area is sized to a page. * * The layout of this scratch area is below: * 0 guc_process_desc + * + sizeof(struct guc_process_desc) child go + * + CACHELINE_BYTES child join[0] + * ... + * + CACHELINE_BYTES child join[n - 1] * ... unused * PARENT_SCRATCH_SIZE / 2 work queue start * ... work queue @@ -380,7 +386,25 @@ static inline struct i915_priolist *to_priolist(struct rb_node *rb) */ #define WQ_SIZE (PARENT_SCRATCH_SIZE / 2) #define WQ_OFFSET (PARENT_SCRATCH_SIZE - WQ_SIZE) -static u32 __get_process_desc_offset(struct intel_context *ce) + +struct sync_semaphore { + u32 semaphore; + u8 unused[CACHELINE_BYTES - sizeof(u32)]; +}; + +struct parent_scratch { + struct guc_process_desc pdesc; + + struct sync_semaphore go; + struct sync_semaphore join[MAX_ENGINE_INSTANCE + 1]; + + u8 unused[WQ_OFFSET - sizeof(struct guc_process_desc) - + sizeof(struct sync_semaphore) * (MAX_ENGINE_INSTANCE + 2)]; + + u32 wq[WQ_SIZE / sizeof(u32)]; +}; + +static u32 __get_parent_scratch_offset(struct intel_context *ce) { GEM_BUG_ON(!ce->parallel.guc.parent_page); @@ -389,23 +413,36 @@ static u32 __get_process_desc_offset(struct intel_context *ce) static u32 __get_wq_offset(struct intel_context *ce) { - return __get_process_desc_offset(ce) + WQ_OFFSET; + BUILD_BUG_ON(offsetof(struct parent_scratch, wq) != WQ_OFFSET); + + return __get_parent_scratch_offset(ce) + WQ_OFFSET; } -static struct guc_process_desc * -__get_process_desc(struct intel_context *ce) +static struct parent_scratch * +__get_parent_scratch(struct intel_context *ce) { + BUILD_BUG_ON(sizeof(struct parent_scratch) != PARENT_SCRATCH_SIZE); + BUILD_BUG_ON(sizeof(struct sync_semaphore) != CACHELINE_BYTES); + /* * Need to subtract LRC_STATE_OFFSET here as the * parallel.guc.parent_page is the offset into ce->state while * ce->lrc_reg_reg is ce->state + LRC_STATE_OFFSET. 
*/ - return (struct guc_process_desc *) + return (struct parent_scratch *) (ce->lrc_reg_state + - ((__get_process_desc_offset(ce) - + ((__get_parent_scratch_offset(ce) - LRC_STATE_OFFSET) / sizeof(u32))); } +static struct guc_process_desc * +__get_process_desc(struct intel_context *ce) +{ + struct parent_scratch *ps = __get_parent_scratch(ce); + + return &ps->pdesc; +} + static u32 *get_wq_pointer(struct guc_process_desc *desc, struct intel_context *ce, u32 wqi_size) @@ -425,8 +462,7 @@ static u32 *get_wq_pointer(struct guc_process_desc *desc, } #undef AVAILABLE_SPACE - return ((u32 *)__get_process_desc(ce)) + - ((WQ_OFFSET + ce->parallel.guc.wqi_tail) / sizeof(u32)); + return &__get_parent_scratch(ce)->wq[ce->parallel.guc.wqi_tail / sizeof(u32)]; } static struct guc_lrc_desc *__get_lrc_desc(struct intel_guc *guc, u32 index) @@ -1827,6 +1863,27 @@ static int deregister_context(struct intel_context *ce, u32 guc_id) return __guc_action_deregister_context(guc, guc_id); } +static inline void clear_children_join_go_memory(struct intel_context *ce) +{ + struct parent_scratch *ps = __get_parent_scratch(ce); + int i; + + ps->go.semaphore = 0; + for (i = 0; i < ce->parallel.number_children + 1; ++i) + ps->join[i].semaphore = 0; +} + +static inline u32 get_children_go_value(struct intel_context *ce) +{ + return __get_parent_scratch(ce)->go.semaphore; +} + +static inline u32 get_children_join_value(struct intel_context *ce, + u8 child_index) +{ + return __get_parent_scratch(ce)->join[child_index].semaphore; +} + static void guc_context_policy_init(struct intel_engine_cs *engine, struct guc_lrc_desc *desc) { @@ -1886,7 +1943,7 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) ce->parallel.guc.wqi_head = 0; desc->process_desc = i915_ggtt_offset(ce->state) + - __get_process_desc_offset(ce); + __get_parent_scratch_offset(ce); desc->wq_addr = i915_ggtt_offset(ce->state) + __get_wq_offset(ce); desc->wq_size = WQ_SIZE; @@ -1908,6 +1965,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; guc_context_policy_init(engine, desc); } + + clear_children_join_go_memory(ce); } /* @@ -2974,6 +3033,31 @@ static const struct intel_context_ops virtual_child_context_ops = { .get_sibling = guc_virtual_get_sibling, }; +/* + * The below override of the breadcrumbs is enabled when the user configures a + * context for parallel submission (multi-lrc, parent-child). + * + * The overridden breadcrumbs implements an algorithm which allows the GuC to + * safely preempt all the hw contexts configured for parallel submission + * between each BB. The contract between the i915 and GuC is if the parent + * context can be preempted, all the children can be preempted, and the GuC will + * always try to preempt the parent before the children. A handshake between the + * parent / children breadcrumbs ensures the i915 holds up its end of the deal + * creating a window to preempt between each set of BBs. 
+ */ +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags); +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags); +static u32 * +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs); +static u32 * +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs); + static struct intel_context * guc_create_parallel(struct intel_engine_cs **engines, unsigned int num_siblings, @@ -3009,6 +3093,20 @@ guc_create_parallel(struct intel_engine_cs **engines, } } + parent->engine->emit_bb_start = + emit_bb_start_parent_no_preempt_mid_batch; + parent->engine->emit_fini_breadcrumb = + emit_fini_breadcrumb_parent_no_preempt_mid_batch; + parent->engine->emit_fini_breadcrumb_dw = + 12 + 4 * parent->parallel.number_children; + for_each_child(parent, ce) { + ce->engine->emit_bb_start = + emit_bb_start_child_no_preempt_mid_batch; + ce->engine->emit_fini_breadcrumb = + emit_fini_breadcrumb_child_no_preempt_mid_batch; + ce->engine->emit_fini_breadcrumb_dw = 16; + } + kfree(siblings); return parent; @@ -3837,6 +3935,17 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, drm_printf(p, "\t\tWQI Status: %u\n\n", READ_ONCE(desc->wq_status)); + if (ce->engine->emit_bb_start == + emit_bb_start_parent_no_preempt_mid_batch) { + u8 i; + + drm_printf(p, "\t\tChildren Go: %u\n\n", + get_children_go_value(ce)); + for (i = 0; i < ce->parallel.number_children; ++i) + drm_printf(p, "\t\tChildren Join: %u\n", + get_children_join_value(ce, i)); + } + for_each_child(ce, child) guc_log_context(p, child); } @@ -3844,6 +3953,208 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, xa_unlock_irqrestore(&guc->context_lookup, flags); } +static inline u32 get_children_go_addr(struct intel_context *ce) +{ + GEM_BUG_ON(!intel_context_is_parent(ce)); + + return i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce) + + offsetof(struct parent_scratch, go.semaphore); +} + +static inline u32 get_children_join_addr(struct intel_context *ce, + u8 child_index) +{ + GEM_BUG_ON(!intel_context_is_parent(ce)); + + return i915_ggtt_offset(ce->state) + + __get_parent_scratch_offset(ce) + + offsetof(struct parent_scratch, join[child_index].semaphore); +} + +#define PARENT_GO_BB 1 +#define PARENT_GO_FINI_BREADCRUMB 0 +#define CHILD_GO_BB 1 +#define CHILD_GO_FINI_BREADCRUMB 0 +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + struct intel_context *ce = rq->context; + u32 *cs; + u8 i; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + cs = intel_ring_begin(rq, 10 + 4 * ce->parallel.number_children); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Wait on children */ + for (i = 0; i < ce->parallel.number_children; ++i) { + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = PARENT_GO_BB; + *cs++ = get_children_join_addr(ce, i); + *cs++ = 0; + } + + /* Turn off preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + *cs++ = MI_NOOP; + + /* Tell children go */ + cs = gen8_emit_ggtt_write(cs, + CHILD_GO_BB, + get_children_go_addr(ce), + 0); + + /* Jump to batch */ + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + struct intel_context *ce = rq->context; + struct intel_context *parent = intel_context_to_parent(ce); + u32 *cs; + + GEM_BUG_ON(!intel_context_is_child(ce)); + + cs = intel_ring_begin(rq, 12); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Signal parent */ + cs = gen8_emit_ggtt_write(cs, + PARENT_GO_BB, + get_children_join_addr(parent, + ce->parallel.child_index), + 0); + + /* Wait on parent for go */ + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = CHILD_GO_BB; + *cs++ = get_children_go_addr(parent); + *cs++ = 0; + + /* Turn off preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + /* Jump to batch */ + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + + intel_ring_advance(rq, cs); + + return 0; +} + +static u32 * +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, + u32 *cs) +{ + struct intel_context *ce = rq->context; + u8 i; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + /* Wait on children */ + for (i = 0; i < ce->parallel.number_children; ++i) { + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = PARENT_GO_FINI_BREADCRUMB; + *cs++ = get_children_join_addr(ce, i); + *cs++ = 0; + } + + /* Turn on preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + /* Tell children go */ + cs = gen8_emit_ggtt_write(cs, + CHILD_GO_FINI_BREADCRUMB, + get_children_go_addr(ce), + 0); + + /* Emit fini breadcrumb */ + cs = gen8_emit_ggtt_write(cs, + rq->fence.seqno, + i915_request_active_timeline(rq)->hwsp_offset, + 0); + + /* User interrupt */ + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + + return cs; +} + +static u32 * +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs) +{ + struct intel_context *ce = rq->context; + struct intel_context *parent = intel_context_to_parent(ce); + + GEM_BUG_ON(!intel_context_is_child(ce)); + + /* Turn on preemption */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + /* Signal parent */ + cs = gen8_emit_ggtt_write(cs, + PARENT_GO_FINI_BREADCRUMB, + get_children_join_addr(parent, + ce->parallel.child_index), + 0); + + /* Wait parent on for go */ + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD); + *cs++ = CHILD_GO_FINI_BREADCRUMB; + *cs++ = get_children_go_addr(parent); + *cs++ = 0; + + /* Emit fini breadcrumb */ + cs = gen8_emit_ggtt_write(cs, + rq->fence.seqno, + i915_request_active_timeline(rq)->hwsp_offset, + 0); + + /* User interrupt */ + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + + return cs; +} + static struct intel_context * guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, unsigned long flags) -- cgit v1.2.3 From 544460c33821b44c2f0c643121303c3dc3f66ef1 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 14 Oct 2021 10:20:00 -0700 Subject: drm/i915: Multi-BB execbuf Allow multiple batch buffers to be submitted in a single execbuf IOCTL after a context has been configured with the 
'set_parallel' extension. The number of batches is implicit based on the context's configuration. This is implemented with a series of loops. First a loop is used to find all the batches, a loop to pin all the HW contexts, a loop to create all the requests, a loop to submit (emit BB start, etc...) all the requests, a loop to tie the requests to the VMAs they touch, and finally a loop to commit the requests to the backend. A composite fence is also created for the generated requests to return to the user and to stick in dma resv slots. No behavior from the existing IOCTL should be changed aside from when throttling because the ring for a context is full. In this situation, i915 will now wait while holding the object locks. This change was made because it is much simpler to wait while holding the locks and we believe there isn't a huge benefit to dropping these locks. If this proves false we can restructure the code to drop the locks during the wait. IGT: https://patchwork.freedesktop.org/patch/447008/?series=93071&rev=1 media UMD: https://github.com/intel/media-driver/pull/1252 v2: (Matthew Brost) - Return proper error value if i915_request_create fails v3: (John Harrison) - Add comment explaining create / add order loops + locking - Update commit message explaining the difference in IOCTL behavior - Line wrap some comments - eb_add_request returns void - Return -EINVAL rather than triggering BUG_ON if cmd parser used (Checkpatch) - Check eb->batch_len[*current_batch] v4: (CI) - Set batch len if passed via execbuf args - Call __i915_request_skip after __i915_request_commit (Kernel test robot) - Initialize rq to NULL in eb_pin_timeline v5: (John Harrison) - Fix typo in comments near bb order loops Signed-off-by: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20211014172005.27155-21-matthew.brost@intel.com --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 783 +++++++++++++++------- drivers/gpu/drm/i915/gt/intel_context.h | 8 +- drivers/gpu/drm/i915/gt/intel_context_types.h | 10 + drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 + drivers/gpu/drm/i915/i915_request.h | 9 + drivers/gpu/drm/i915/i915_vma.c | 21 +- drivers/gpu/drm/i915/i915_vma.h | 13 +- 7 files changed, 595 insertions(+), 251 deletions(-) (limited to 'drivers/gpu/drm/i915/gt/intel_context_types.h') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 8b3a25bd93e6..1cb241298d9b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -246,17 +246,25 @@ struct i915_execbuffer { struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */ struct eb_vma *vma; - struct intel_engine_cs *engine; /** engine to queue the request to */ + struct intel_gt *gt; /* gt for the execbuf */ struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */ - struct i915_request *request; /** our request to build */ - struct eb_vma *batch; /** identity of the batch obj/vma */ + /** our requests to build */ + struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; + /** identity of the batch obj/vma */ + struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; struct i915_vma *trampoline; /** trampoline used for chaining */ + /** used for excl fence in dma_resv objects when > 1 BB submitted */ + struct dma_fence *composite_fence; + /** actual size of execobj[] as we may extend it for the cmdparser
*/ unsigned int buffer_count; + /* number of batches in execbuf IOCTL */ + unsigned int num_batches; + /** list of vma not yet bound during reservation phase */ struct list_head unbound; @@ -283,7 +291,8 @@ struct i915_execbuffer { u64 invalid_flags; /** Set of execobj.flags that are invalid */ - u64 batch_len; /** Length of batch within object */ + /** Length of batch within object */ + u64 batch_len[MAX_ENGINE_INSTANCE + 1]; u32 batch_start_offset; /** Location within object of batch */ u32 batch_flags; /** Flags composed for emit_bb_start() */ struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */ @@ -301,14 +310,13 @@ struct i915_execbuffer { }; static int eb_parse(struct i915_execbuffer *eb); -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, - bool throttle); +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle); static void eb_unpin_engine(struct i915_execbuffer *eb); static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) { - return intel_engine_requires_cmd_parser(eb->engine) || - (intel_engine_using_cmd_parser(eb->engine) && + return intel_engine_requires_cmd_parser(eb->context->engine) || + (intel_engine_using_cmd_parser(eb->context->engine) && eb->args->batch_len); } @@ -535,11 +543,21 @@ eb_validate_vma(struct i915_execbuffer *eb, return 0; } -static void +static inline bool +is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx) +{ + return eb->args->flags & I915_EXEC_BATCH_FIRST ? + buffer_idx < eb->num_batches : + buffer_idx >= eb->args->buffer_count - eb->num_batches; +} + +static int eb_add_vma(struct i915_execbuffer *eb, - unsigned int i, unsigned batch_idx, + unsigned int *current_batch, + unsigned int i, struct i915_vma *vma) { + struct drm_i915_private *i915 = eb->i915; struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; struct eb_vma *ev = &eb->vma[i]; @@ -566,15 +584,43 @@ eb_add_vma(struct i915_execbuffer *eb, * Note that actual hangs have only been observed on gen7, but for * paranoia do it everywhere. */ - if (i == batch_idx) { + if (is_batch_buffer(eb, i)) { if (entry->relocation_count && !(ev->flags & EXEC_OBJECT_PINNED)) ev->flags |= __EXEC_OBJECT_NEEDS_BIAS; if (eb->reloc_cache.has_fence) ev->flags |= EXEC_OBJECT_NEEDS_FENCE; - eb->batch = ev; + eb->batches[*current_batch] = ev; + + if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) { + drm_dbg(&i915->drm, + "Attempting to use self-modifying batch buffer\n"); + return -EINVAL; + } + + if (range_overflows_t(u64, + eb->batch_start_offset, + eb->args->batch_len, + ev->vma->size)) { + drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); + return -EINVAL; + } + + if (eb->args->batch_len == 0) + eb->batch_len[*current_batch] = ev->vma->size - + eb->batch_start_offset; + else + eb->batch_len[*current_batch] = eb->args->batch_len; + if (unlikely(eb->batch_len[*current_batch] == 0)) { /* impossible! 
*/ + drm_dbg(&i915->drm, "Invalid batch length\n"); + return -EINVAL; + } + + ++*current_batch; } + + return 0; } static inline int use_cpu_reloc(const struct reloc_cache *cache, @@ -718,14 +764,6 @@ static int eb_reserve(struct i915_execbuffer *eb) } while (1); } -static unsigned int eb_batch_index(const struct i915_execbuffer *eb) -{ - if (eb->args->flags & I915_EXEC_BATCH_FIRST) - return 0; - else - return eb->buffer_count - 1; -} - static int eb_select_context(struct i915_execbuffer *eb) { struct i915_gem_context *ctx; @@ -846,9 +884,7 @@ static struct i915_vma *eb_lookup_vma(struct i915_execbuffer *eb, u32 handle) static int eb_lookup_vmas(struct i915_execbuffer *eb) { - struct drm_i915_private *i915 = eb->i915; - unsigned int batch = eb_batch_index(eb); - unsigned int i; + unsigned int i, current_batch = 0; int err = 0; INIT_LIST_HEAD(&eb->relocs); @@ -868,7 +904,9 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) goto err; } - eb_add_vma(eb, i, batch, vma); + err = eb_add_vma(eb, ¤t_batch, i, vma); + if (err) + return err; if (i915_gem_object_is_userptr(vma->obj)) { err = i915_gem_object_userptr_submit_init(vma->obj); @@ -891,26 +929,6 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) } } - if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) { - drm_dbg(&i915->drm, - "Attempting to use self-modifying batch buffer\n"); - return -EINVAL; - } - - if (range_overflows_t(u64, - eb->batch_start_offset, eb->batch_len, - eb->batch->vma->size)) { - drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); - return -EINVAL; - } - - if (eb->batch_len == 0) - eb->batch_len = eb->batch->vma->size - eb->batch_start_offset; - if (unlikely(eb->batch_len == 0)) { /* impossible! */ - drm_dbg(&i915->drm, "Invalid batch length\n"); - return -EINVAL; - } - return 0; err: @@ -1643,8 +1661,7 @@ static int eb_reinit_userptr(struct i915_execbuffer *eb) return 0; } -static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb, - struct i915_request *rq) +static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb) { bool have_copy = false; struct eb_vma *ev; @@ -1660,21 +1677,6 @@ repeat: eb_release_vmas(eb, false); i915_gem_ww_ctx_fini(&eb->ww); - if (rq) { - /* nonblocking is always false */ - if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, - MAX_SCHEDULE_TIMEOUT) < 0) { - i915_request_put(rq); - rq = NULL; - - err = -EINTR; - goto err_relock; - } - - i915_request_put(rq); - rq = NULL; - } - /* * We take 3 passes through the slowpatch. 
* @@ -1701,28 +1703,21 @@ repeat: if (!err) err = eb_reinit_userptr(eb); -err_relock: i915_gem_ww_ctx_init(&eb->ww, true); if (err) goto out; /* reacquire the objects */ repeat_validate: - rq = eb_pin_engine(eb, false); - if (IS_ERR(rq)) { - err = PTR_ERR(rq); - rq = NULL; + err = eb_pin_engine(eb, false); + if (err) goto err; - } - - /* We didn't throttle, should be NULL */ - GEM_WARN_ON(rq); err = eb_validate_vmas(eb); if (err) goto err; - GEM_BUG_ON(!eb->batch); + GEM_BUG_ON(!eb->batches[0]); list_for_each_entry(ev, &eb->relocs, reloc_link) { if (!have_copy) { @@ -1786,46 +1781,23 @@ out: } } - if (rq) - i915_request_put(rq); - return err; } static int eb_relocate_parse(struct i915_execbuffer *eb) { int err; - struct i915_request *rq = NULL; bool throttle = true; retry: - rq = eb_pin_engine(eb, throttle); - if (IS_ERR(rq)) { - err = PTR_ERR(rq); - rq = NULL; + err = eb_pin_engine(eb, throttle); + if (err) { if (err != -EDEADLK) return err; goto err; } - if (rq) { - bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; - - /* Need to drop all locks now for throttling, take slowpath */ - err = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 0); - if (err == -ETIME) { - if (nonblock) { - err = -EWOULDBLOCK; - i915_request_put(rq); - goto err; - } - goto slow; - } - i915_request_put(rq); - rq = NULL; - } - /* only throttle once, even if we didn't need to throttle */ throttle = false; @@ -1865,7 +1837,7 @@ err: return err; slow: - err = eb_relocate_parse_slow(eb, rq); + err = eb_relocate_parse_slow(eb); if (err) /* * If the user expects the execobject.offset and @@ -1879,11 +1851,40 @@ slow: return err; } +/* + * Using two helper loops for the order of which requests / batches are created + * and added the to backend. Requests are created in order from the parent to + * the last child. Requests are added in the reverse order, from the last child + * to parent. This is done for locking reasons as the timeline lock is acquired + * during request creation and released when the request is added to the + * backend. To make lockdep happy (see intel_context_timeline_lock) this must be + * the ordering. 
+ */ +#define for_each_batch_create_order(_eb, _i) \ + for ((_i) = 0; (_i) < (_eb)->num_batches; ++(_i)) +#define for_each_batch_add_order(_eb, _i) \ + BUILD_BUG_ON(!typecheck(int, _i)); \ + for ((_i) = (_eb)->num_batches - 1; (_i) >= 0; --(_i)) + +static struct i915_request * +eb_find_first_request_added(struct i915_execbuffer *eb) +{ + int i; + + for_each_batch_add_order(eb, i) + if (eb->requests[i]) + return eb->requests[i]; + + GEM_BUG_ON("Request not found"); + + return NULL; +} + static int eb_move_to_gpu(struct i915_execbuffer *eb) { const unsigned int count = eb->buffer_count; unsigned int i = count; - int err = 0; + int err = 0, j; while (i--) { struct eb_vma *ev = &eb->vma[i]; @@ -1896,11 +1897,17 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) if (flags & EXEC_OBJECT_CAPTURE) { struct i915_capture_list *capture; - capture = kmalloc(sizeof(*capture), GFP_KERNEL); - if (capture) { - capture->next = eb->request->capture_list; - capture->vma = vma; - eb->request->capture_list = capture; + for_each_batch_create_order(eb, j) { + if (!eb->requests[j]) + break; + + capture = kmalloc(sizeof(*capture), GFP_KERNEL); + if (capture) { + capture->next = + eb->requests[j]->capture_list; + capture->vma = vma; + eb->requests[j]->capture_list = capture; + } } } @@ -1921,14 +1928,26 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) flags &= ~EXEC_OBJECT_ASYNC; } + /* We only need to await on the first request */ if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) { err = i915_request_await_object - (eb->request, obj, flags & EXEC_OBJECT_WRITE); + (eb_find_first_request_added(eb), obj, + flags & EXEC_OBJECT_WRITE); } - if (err == 0) - err = i915_vma_move_to_active(vma, eb->request, - flags | __EXEC_OBJECT_NO_RESERVE); + for_each_batch_add_order(eb, j) { + if (err) + break; + if (!eb->requests[j]) + continue; + + err = _i915_vma_move_to_active(vma, eb->requests[j], + j ? NULL : + eb->composite_fence ? + eb->composite_fence : + &eb->requests[j]->fence, + flags | __EXEC_OBJECT_NO_RESERVE); + } } #ifdef CONFIG_MMU_NOTIFIER @@ -1959,11 +1978,16 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) goto err_skip; /* Unconditionally flush any chipset caches (for streaming writes). 
*/ - intel_gt_chipset_flush(eb->engine->gt); + intel_gt_chipset_flush(eb->gt); return 0; err_skip: - i915_request_set_error_once(eb->request, err); + for_each_batch_create_order(eb, j) { + if (!eb->requests[j]) + break; + + i915_request_set_error_once(eb->requests[j], err); + } return err; } @@ -2058,14 +2082,17 @@ static int eb_parse(struct i915_execbuffer *eb) int err; if (!eb_use_cmdparser(eb)) { - batch = eb_dispatch_secure(eb, eb->batch->vma); + batch = eb_dispatch_secure(eb, eb->batches[0]->vma); if (IS_ERR(batch)) return PTR_ERR(batch); goto secure_batch; } - len = eb->batch_len; + if (intel_context_is_parallel(eb->context)) + return -EINVAL; + + len = eb->batch_len[0]; if (!CMDPARSER_USES_GGTT(eb->i915)) { /* * ppGTT backed shadow buffers must be mapped RO, to prevent @@ -2079,11 +2106,11 @@ static int eb_parse(struct i915_execbuffer *eb) } else { len += I915_CMD_PARSER_TRAMPOLINE_SIZE; } - if (unlikely(len < eb->batch_len)) /* last paranoid check of overflow */ + if (unlikely(len < eb->batch_len[0])) /* last paranoid check of overflow */ return -EINVAL; if (!pool) { - pool = intel_gt_get_buffer_pool(eb->engine->gt, len, + pool = intel_gt_get_buffer_pool(eb->gt, len, I915_MAP_WB); if (IS_ERR(pool)) return PTR_ERR(pool); @@ -2108,7 +2135,7 @@ static int eb_parse(struct i915_execbuffer *eb) trampoline = shadow; shadow = shadow_batch_pin(eb, pool->obj, - &eb->engine->gt->ggtt->vm, + &eb->gt->ggtt->vm, PIN_GLOBAL); if (IS_ERR(shadow)) { err = PTR_ERR(shadow); @@ -2130,26 +2157,29 @@ static int eb_parse(struct i915_execbuffer *eb) if (err) goto err_trampoline; - err = intel_engine_cmd_parser(eb->engine, - eb->batch->vma, + err = intel_engine_cmd_parser(eb->context->engine, + eb->batches[0]->vma, eb->batch_start_offset, - eb->batch_len, + eb->batch_len[0], shadow, trampoline); if (err) goto err_unpin_batch; - eb->batch = &eb->vma[eb->buffer_count++]; - eb->batch->vma = i915_vma_get(shadow); - eb->batch->flags = __EXEC_OBJECT_HAS_PIN; + eb->batches[0] = &eb->vma[eb->buffer_count++]; + eb->batches[0]->vma = i915_vma_get(shadow); + eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN; eb->trampoline = trampoline; eb->batch_start_offset = 0; secure_batch: if (batch) { - eb->batch = &eb->vma[eb->buffer_count++]; - eb->batch->flags = __EXEC_OBJECT_HAS_PIN; - eb->batch->vma = i915_vma_get(batch); + if (intel_context_is_parallel(eb->context)) + return -EINVAL; + + eb->batches[0] = &eb->vma[eb->buffer_count++]; + eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN; + eb->batches[0]->vma = i915_vma_get(batch); } return 0; @@ -2165,19 +2195,18 @@ err: return err; } -static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) +static int eb_request_submit(struct i915_execbuffer *eb, + struct i915_request *rq, + struct i915_vma *batch, + u64 batch_len) { int err; - if (intel_context_nopreempt(eb->context)) - __set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags); - - err = eb_move_to_gpu(eb); - if (err) - return err; + if (intel_context_nopreempt(rq->context)) + __set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags); if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) { - err = i915_reset_gen7_sol_offsets(eb->request); + err = i915_reset_gen7_sol_offsets(rq); if (err) return err; } @@ -2188,26 +2217,26 @@ static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) * allows us to determine if the batch is still waiting on the GPU * or actually running by checking the breadcrumb. 
*/ - if (eb->engine->emit_init_breadcrumb) { - err = eb->engine->emit_init_breadcrumb(eb->request); + if (rq->context->engine->emit_init_breadcrumb) { + err = rq->context->engine->emit_init_breadcrumb(rq); if (err) return err; } - err = eb->engine->emit_bb_start(eb->request, - batch->node.start + - eb->batch_start_offset, - eb->batch_len, - eb->batch_flags); + err = rq->context->engine->emit_bb_start(rq, + batch->node.start + + eb->batch_start_offset, + batch_len, + eb->batch_flags); if (err) return err; if (eb->trampoline) { + GEM_BUG_ON(intel_context_is_parallel(rq->context)); GEM_BUG_ON(eb->batch_start_offset); - err = eb->engine->emit_bb_start(eb->request, - eb->trampoline->node.start + - eb->batch_len, - 0, 0); + err = rq->context->engine->emit_bb_start(rq, + eb->trampoline->node.start + + batch_len, 0, 0); if (err) return err; } @@ -2215,6 +2244,27 @@ static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) return 0; } +static int eb_submit(struct i915_execbuffer *eb) +{ + unsigned int i; + int err; + + err = eb_move_to_gpu(eb); + + for_each_batch_create_order(eb, i) { + if (!eb->requests[i]) + break; + + trace_i915_request_queue(eb->requests[i], eb->batch_flags); + if (!err) + err = eb_request_submit(eb, eb->requests[i], + eb->batches[i]->vma, + eb->batch_len[i]); + } + + return err; +} + static int num_vcs_engines(const struct drm_i915_private *i915) { return hweight_long(VDBOX_MASK(&i915->gt)); @@ -2280,26 +2330,11 @@ static struct i915_request *eb_throttle(struct i915_execbuffer *eb, struct intel return i915_request_get(rq); } -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throttle) +static int eb_pin_timeline(struct i915_execbuffer *eb, struct intel_context *ce, + bool throttle) { - struct intel_context *ce = eb->context; struct intel_timeline *tl; struct i915_request *rq = NULL; - int err; - - GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED); - - if (unlikely(intel_context_is_banned(ce))) - return ERR_PTR(-EIO); - - /* - * Pinning the contexts may generate requests in order to acquire - * GGTT space, so do this first before we reserve a seqno for - * ourselves. - */ - err = intel_context_pin_ww(ce, &eb->ww); - if (err) - return ERR_PTR(err); /* * Take a local wakeref for preparing to dispatch the execbuf as @@ -2310,33 +2345,108 @@ static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throt * taken on the engine, and the parent device. */ tl = intel_context_timeline_lock(ce); - if (IS_ERR(tl)) { - intel_context_unpin(ce); - return ERR_CAST(tl); - } + if (IS_ERR(tl)) + return PTR_ERR(tl); intel_context_enter(ce); if (throttle) rq = eb_throttle(eb, ce); intel_context_timeline_unlock(tl); + if (rq) { + bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; + long timeout = nonblock ? 
0 : MAX_SCHEDULE_TIMEOUT; + + if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, + timeout) < 0) { + i915_request_put(rq); + + tl = intel_context_timeline_lock(ce); + intel_context_exit(ce); + intel_context_timeline_unlock(tl); + + if (nonblock) + return -EWOULDBLOCK; + else + return -EINTR; + } + i915_request_put(rq); + } + + return 0; +} + +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle) +{ + struct intel_context *ce = eb->context, *child; + int err; + int i = 0, j = 0; + + GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED); + + if (unlikely(intel_context_is_banned(ce))) + return -EIO; + + /* + * Pinning the contexts may generate requests in order to acquire + * GGTT space, so do this first before we reserve a seqno for + * ourselves. + */ + err = intel_context_pin_ww(ce, &eb->ww); + if (err) + return err; + for_each_child(ce, child) { + err = intel_context_pin_ww(child, &eb->ww); + GEM_BUG_ON(err); /* perma-pinned should incr a counter */ + } + + for_each_child(ce, child) { + err = eb_pin_timeline(eb, child, throttle); + if (err) + goto unwind; + ++i; + } + err = eb_pin_timeline(eb, ce, throttle); + if (err) + goto unwind; + eb->args->flags |= __EXEC_ENGINE_PINNED; - return rq; + return 0; + +unwind: + for_each_child(ce, child) { + if (j++ < i) { + mutex_lock(&child->timeline->mutex); + intel_context_exit(child); + mutex_unlock(&child->timeline->mutex); + } + } + for_each_child(ce, child) + intel_context_unpin(child); + intel_context_unpin(ce); + return err; } static void eb_unpin_engine(struct i915_execbuffer *eb) { - struct intel_context *ce = eb->context; - struct intel_timeline *tl = ce->timeline; + struct intel_context *ce = eb->context, *child; if (!(eb->args->flags & __EXEC_ENGINE_PINNED)) return; eb->args->flags &= ~__EXEC_ENGINE_PINNED; - mutex_lock(&tl->mutex); + for_each_child(ce, child) { + mutex_lock(&child->timeline->mutex); + intel_context_exit(child); + mutex_unlock(&child->timeline->mutex); + + intel_context_unpin(child); + } + + mutex_lock(&ce->timeline->mutex); intel_context_exit(ce); - mutex_unlock(&tl->mutex); + mutex_unlock(&ce->timeline->mutex); intel_context_unpin(ce); } @@ -2387,7 +2497,7 @@ eb_select_legacy_ring(struct i915_execbuffer *eb) static int eb_select_engine(struct i915_execbuffer *eb) { - struct intel_context *ce; + struct intel_context *ce, *child; unsigned int idx; int err; @@ -2400,6 +2510,20 @@ eb_select_engine(struct i915_execbuffer *eb) if (IS_ERR(ce)) return PTR_ERR(ce); + if (intel_context_is_parallel(ce)) { + if (eb->buffer_count < ce->parallel.number_children + 1) { + intel_context_put(ce); + return -EINVAL; + } + if (eb->batch_start_offset || eb->args->batch_len) { + intel_context_put(ce); + return -EINVAL; + } + } + eb->num_batches = ce->parallel.number_children + 1; + + for_each_child(ce, child) + intel_context_get(child); intel_gt_pm_get(ce->engine->gt); if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { @@ -2407,6 +2531,13 @@ eb_select_engine(struct i915_execbuffer *eb) if (err) goto err; } + for_each_child(ce, child) { + if (!test_bit(CONTEXT_ALLOC_BIT, &child->flags)) { + err = intel_context_alloc_state(child); + if (err) + goto err; + } + } /* * ABI: Before userspace accesses the GPU (e.g. 
execbuffer), report @@ -2417,7 +2548,7 @@ eb_select_engine(struct i915_execbuffer *eb) goto err; eb->context = ce; - eb->engine = ce->engine; + eb->gt = ce->engine->gt; /* * Make sure engine pool stays alive even if we call intel_context_put @@ -2428,6 +2559,8 @@ eb_select_engine(struct i915_execbuffer *eb) err: intel_gt_pm_put(ce->engine->gt); + for_each_child(ce, child) + intel_context_put(child); intel_context_put(ce); return err; } @@ -2435,7 +2568,11 @@ err: static void eb_put_engine(struct i915_execbuffer *eb) { - intel_gt_pm_put(eb->engine->gt); + struct intel_context *child; + + intel_gt_pm_put(eb->gt); + for_each_child(eb->context, child) + intel_context_put(child); intel_context_put(eb->context); } @@ -2658,7 +2795,8 @@ static void put_fence_array(struct eb_fence *fences, int num_fences) } static int -await_fence_array(struct i915_execbuffer *eb) +await_fence_array(struct i915_execbuffer *eb, + struct i915_request *rq) { unsigned int n; int err; @@ -2672,8 +2810,7 @@ await_fence_array(struct i915_execbuffer *eb) if (!eb->fences[n].dma_fence) continue; - err = i915_request_await_dma_fence(eb->request, - eb->fences[n].dma_fence); + err = i915_request_await_dma_fence(rq, eb->fences[n].dma_fence); if (err < 0) return err; } @@ -2681,9 +2818,9 @@ await_fence_array(struct i915_execbuffer *eb) return 0; } -static void signal_fence_array(const struct i915_execbuffer *eb) +static void signal_fence_array(const struct i915_execbuffer *eb, + struct dma_fence * const fence) { - struct dma_fence * const fence = &eb->request->fence; unsigned int n; for (n = 0; n < eb->num_fences; n++) { @@ -2731,9 +2868,9 @@ static void retire_requests(struct intel_timeline *tl, struct i915_request *end) break; } -static int eb_request_add(struct i915_execbuffer *eb, int err) +static int eb_request_add(struct i915_execbuffer *eb, struct i915_request *rq, + int err, bool last_parallel) { - struct i915_request *rq = eb->request; struct intel_timeline * const tl = i915_request_timeline(rq); struct i915_sched_attr attr = {}; struct i915_request *prev; @@ -2755,6 +2892,17 @@ static int eb_request_add(struct i915_execbuffer *eb, int err) err = -ENOENT; /* override any transient errors */ } + if (intel_context_is_parallel(eb->context)) { + if (err) { + __i915_request_skip(rq); + set_bit(I915_FENCE_FLAG_SKIP_PARALLEL, + &rq->fence.flags); + } + if (last_parallel) + set_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, + &rq->fence.flags); + } + __i915_request_queue(rq, &attr); /* Try to clean up the client's timeline after submitting the request */ @@ -2766,6 +2914,25 @@ static int eb_request_add(struct i915_execbuffer *eb, int err) return err; } +static int eb_requests_add(struct i915_execbuffer *eb, int err) +{ + int i; + + /* + * We iterate in reverse order of creation to release timeline mutexes in + * same order. 
+ */ + for_each_batch_add_order(eb, i) { + struct i915_request *rq = eb->requests[i]; + + if (!rq) + continue; + err |= eb_request_add(eb, rq, err, i == 0); + } + + return err; +} + static const i915_user_extension_fn execbuf_extensions[] = { [DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES] = parse_timeline_fences, }; @@ -2792,6 +2959,182 @@ parse_execbuf2_extensions(struct drm_i915_gem_execbuffer2 *args, eb); } +static void eb_requests_get(struct i915_execbuffer *eb) +{ + unsigned int i; + + for_each_batch_create_order(eb, i) { + if (!eb->requests[i]) + break; + + i915_request_get(eb->requests[i]); + } +} + +static void eb_requests_put(struct i915_execbuffer *eb) +{ + unsigned int i; + + for_each_batch_create_order(eb, i) { + if (!eb->requests[i]) + break; + + i915_request_put(eb->requests[i]); + } +} + +static struct sync_file * +eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd) +{ + struct sync_file *out_fence = NULL; + struct dma_fence_array *fence_array; + struct dma_fence **fences; + unsigned int i; + + GEM_BUG_ON(!intel_context_is_parent(eb->context)); + + fences = kmalloc_array(eb->num_batches, sizeof(*fences), GFP_KERNEL); + if (!fences) + return ERR_PTR(-ENOMEM); + + for_each_batch_create_order(eb, i) + fences[i] = &eb->requests[i]->fence; + + fence_array = dma_fence_array_create(eb->num_batches, + fences, + eb->context->parallel.fence_context, + eb->context->parallel.seqno, + false); + if (!fence_array) { + kfree(fences); + return ERR_PTR(-ENOMEM); + } + + /* Move ownership to the dma_fence_array created above */ + for_each_batch_create_order(eb, i) + dma_fence_get(fences[i]); + + if (out_fence_fd != -1) { + out_fence = sync_file_create(&fence_array->base); + /* sync_file now owns fence_arry, drop creation ref */ + dma_fence_put(&fence_array->base); + if (!out_fence) + return ERR_PTR(-ENOMEM); + } + + eb->composite_fence = &fence_array->base; + + return out_fence; +} + +static struct sync_file * +eb_fences_add(struct i915_execbuffer *eb, struct i915_request *rq, + struct dma_fence *in_fence, int out_fence_fd) +{ + struct sync_file *out_fence = NULL; + int err; + + if (unlikely(eb->gem_context->syncobj)) { + struct dma_fence *fence; + + fence = drm_syncobj_fence_get(eb->gem_context->syncobj); + err = i915_request_await_dma_fence(rq, fence); + dma_fence_put(fence); + if (err) + return ERR_PTR(err); + } + + if (in_fence) { + if (eb->args->flags & I915_EXEC_FENCE_SUBMIT) + err = i915_request_await_execution(rq, in_fence); + else + err = i915_request_await_dma_fence(rq, in_fence); + if (err < 0) + return ERR_PTR(err); + } + + if (eb->fences) { + err = await_fence_array(eb, rq); + if (err) + return ERR_PTR(err); + } + + if (intel_context_is_parallel(eb->context)) { + out_fence = eb_composite_fence_create(eb, out_fence_fd); + if (IS_ERR(out_fence)) + return ERR_PTR(-ENOMEM); + } else if (out_fence_fd != -1) { + out_fence = sync_file_create(&rq->fence); + if (!out_fence) + return ERR_PTR(-ENOMEM); + } + + return out_fence; +} + +static struct intel_context * +eb_find_context(struct i915_execbuffer *eb, unsigned int context_number) +{ + struct intel_context *child; + + if (likely(context_number == 0)) + return eb->context; + + for_each_child(eb->context, child) + if (!--context_number) + return child; + + GEM_BUG_ON("Context not found"); + + return NULL; +} + +static struct sync_file * +eb_requests_create(struct i915_execbuffer *eb, struct dma_fence *in_fence, + int out_fence_fd) +{ + struct sync_file *out_fence = NULL; + unsigned int i; + + 
for_each_batch_create_order(eb, i) { + /* Allocate a request for this batch buffer nice and early. */ + eb->requests[i] = i915_request_create(eb_find_context(eb, i)); + if (IS_ERR(eb->requests[i])) { + out_fence = ERR_PTR(PTR_ERR(eb->requests[i])); + eb->requests[i] = NULL; + return out_fence; + } + + /* + * Only the first request added (committed to backend) has to + * take the in fences into account as all subsequent requests + * will have fences inserted inbetween them. + */ + if (i + 1 == eb->num_batches) { + out_fence = eb_fences_add(eb, eb->requests[i], + in_fence, out_fence_fd); + if (IS_ERR(out_fence)) + return out_fence; + } + + /* + * Whilst this request exists, batch_obj will be on the + * active_list, and so will hold the active reference. Only when + * this request is retired will the batch_obj be moved onto + * the inactive_list and lose its active reference. Hence we do + * not need to explicitly hold another reference here. + */ + eb->requests[i]->batch = eb->batches[i]->vma; + if (eb->batch_pool) { + GEM_BUG_ON(intel_context_is_parallel(eb->context)); + intel_gt_buffer_pool_mark_active(eb->batch_pool, + eb->requests[i]); + } + } + + return out_fence; +} + static int i915_gem_do_execbuffer(struct drm_device *dev, struct drm_file *file, @@ -2802,7 +3145,6 @@ i915_gem_do_execbuffer(struct drm_device *dev, struct i915_execbuffer eb; struct dma_fence *in_fence = NULL; struct sync_file *out_fence = NULL; - struct i915_vma *batch; int out_fence_fd = -1; int err; @@ -2826,12 +3168,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, eb.buffer_count = args->buffer_count; eb.batch_start_offset = args->batch_start_offset; - eb.batch_len = args->batch_len; eb.trampoline = NULL; eb.fences = NULL; eb.num_fences = 0; + memset(eb.requests, 0, sizeof(struct i915_request *) * + ARRAY_SIZE(eb.requests)); + eb.composite_fence = NULL; + eb.batch_flags = 0; if (args->flags & I915_EXEC_SECURE) { if (GRAPHICS_VER(i915) >= 11) @@ -2915,70 +3260,25 @@ i915_gem_do_execbuffer(struct drm_device *dev, ww_acquire_done(&eb.ww.ctx); - batch = eb.batch->vma; - - /* Allocate a request for this batch buffer nice and early. */ - eb.request = i915_request_create(eb.context); - if (IS_ERR(eb.request)) { - err = PTR_ERR(eb.request); - goto err_vma; - } - - if (unlikely(eb.gem_context->syncobj)) { - struct dma_fence *fence; - - fence = drm_syncobj_fence_get(eb.gem_context->syncobj); - err = i915_request_await_dma_fence(eb.request, fence); - dma_fence_put(fence); - if (err) - goto err_ext; - } - - if (in_fence) { - if (args->flags & I915_EXEC_FENCE_SUBMIT) - err = i915_request_await_execution(eb.request, - in_fence); - else - err = i915_request_await_dma_fence(eb.request, - in_fence); - if (err < 0) - goto err_request; - } - - if (eb.fences) { - err = await_fence_array(&eb); - if (err) + out_fence = eb_requests_create(&eb, in_fence, out_fence_fd); + if (IS_ERR(out_fence)) { + err = PTR_ERR(out_fence); + if (eb.requests[0]) goto err_request; + else + goto err_vma; } - if (out_fence_fd != -1) { - out_fence = sync_file_create(&eb.request->fence); - if (!out_fence) { - err = -ENOMEM; - goto err_request; - } - } - - /* - * Whilst this request exists, batch_obj will be on the - * active_list, and so will hold the active reference. Only when this - * request is retired will the the batch_obj be moved onto the - * inactive_list and lose its active reference. Hence we do not need - * to explicitly hold another reference here. 
- */ - eb.request->batch = batch; - if (eb.batch_pool) - intel_gt_buffer_pool_mark_active(eb.batch_pool, eb.request); - - trace_i915_request_queue(eb.request, eb.batch_flags); - err = eb_submit(&eb, batch); + err = eb_submit(&eb); err_request: - i915_request_get(eb.request); - err = eb_request_add(&eb, err); + eb_requests_get(&eb); + err = eb_requests_add(&eb, err); if (eb.fences) - signal_fence_array(&eb); + signal_fence_array(&eb, eb.composite_fence ? + eb.composite_fence : + &eb.requests[0]->fence); if (out_fence) { if (err == 0) { @@ -2993,10 +3293,15 @@ err_request: if (unlikely(eb.gem_context->syncobj)) { drm_syncobj_replace_fence(eb.gem_context->syncobj, - &eb.request->fence); + eb.composite_fence ? + eb.composite_fence : + &eb.requests[0]->fence); } - i915_request_put(eb.request); + if (!out_fence && eb.composite_fence) + dma_fence_put(eb.composite_fence); + + eb_requests_put(&eb); err_vma: eb_release_vmas(&eb, true); diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index edf12caaade3..246c37d72cd7 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -241,7 +241,13 @@ intel_context_timeline_lock(struct intel_context *ce) struct intel_timeline *tl = ce->timeline; int err; - err = mutex_lock_interruptible(&tl->mutex); + if (intel_context_is_parent(ce)) + err = mutex_lock_interruptible_nested(&tl->mutex, 0); + else if (intel_context_is_child(ce)) + err = mutex_lock_interruptible_nested(&tl->mutex, + ce->parallel.child_index + 1); + else + err = mutex_lock_interruptible(&tl->mutex); if (err) return ERR_PTR(err); diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 95a5b94b4ece..9e0177dc5484 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -248,6 +248,16 @@ struct intel_context { * context */ struct i915_request *last_rq; + /** + * @fence_context: fence context composite fence when doing + * parallel submission + */ + u64 fence_context; + /** + * @seqno: seqno for composite fence when doing parallel + * submission + */ + u32 seqno; /** @number_children: number of children if parent */ u8 number_children; /** @child_index: index into child_list if child */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 50f0f4eba03b..361fab2cae99 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -3093,6 +3093,8 @@ guc_create_parallel(struct intel_engine_cs **engines, } } + parent->parallel.fence_context = dma_fence_context_alloc(1); + parent->engine->emit_bb_start = emit_bb_start_parent_no_preempt_mid_batch; parent->engine->emit_fini_breadcrumb = diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 8950785e55d6..24db8459376b 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -147,6 +147,15 @@ enum { * tail. */ I915_FENCE_FLAG_SUBMIT_PARALLEL, + + /* + * I915_FENCE_FLAG_SKIP_PARALLEL - request with a context in a + * parent-child relationship (parallel submission, multi-lrc) that + * hit an error while generating requests in the execbuf IOCTL. + * Indicates this request should be skipped as another request in + * submission / relationship encoutered an error. 
+ */ + I915_FENCE_FLAG_SKIP_PARALLEL, }; /** diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 4b7fc4647e46..90546fa58fc1 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -1234,9 +1234,10 @@ int __i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq) return i915_active_add_request(&vma->active, rq); } -int i915_vma_move_to_active(struct i915_vma *vma, - struct i915_request *rq, - unsigned int flags) +int _i915_vma_move_to_active(struct i915_vma *vma, + struct i915_request *rq, + struct dma_fence *fence, + unsigned int flags) { struct drm_i915_gem_object *obj = vma->obj; int err; @@ -1257,9 +1258,11 @@ int i915_vma_move_to_active(struct i915_vma *vma, intel_frontbuffer_put(front); } - dma_resv_add_excl_fence(vma->resv, &rq->fence); - obj->write_domain = I915_GEM_DOMAIN_RENDER; - obj->read_domains = 0; + if (fence) { + dma_resv_add_excl_fence(vma->resv, fence); + obj->write_domain = I915_GEM_DOMAIN_RENDER; + obj->read_domains = 0; + } } else { if (!(flags & __EXEC_OBJECT_NO_RESERVE)) { err = dma_resv_reserve_shared(vma->resv, 1); @@ -1267,8 +1270,10 @@ int i915_vma_move_to_active(struct i915_vma *vma, return err; } - dma_resv_add_shared_fence(vma->resv, &rq->fence); - obj->write_domain = 0; + if (fence) { + dma_resv_add_shared_fence(vma->resv, fence); + obj->write_domain = 0; + } } if (flags & EXEC_OBJECT_NEEDS_FENCE && vma->fence) diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h index ed69f66c7ab0..648dbe744c96 100644 --- a/drivers/gpu/drm/i915/i915_vma.h +++ b/drivers/gpu/drm/i915/i915_vma.h @@ -57,9 +57,16 @@ static inline bool i915_vma_is_active(const struct i915_vma *vma) int __must_check __i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq); -int __must_check i915_vma_move_to_active(struct i915_vma *vma, - struct i915_request *rq, - unsigned int flags); +int __must_check _i915_vma_move_to_active(struct i915_vma *vma, + struct i915_request *rq, + struct dma_fence *fence, + unsigned int flags); +static inline int __must_check +i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq, + unsigned int flags) +{ + return _i915_vma_move_to_active(vma, rq, &rq->fence, flags); +} #define __i915_vma_flags(v) ((unsigned long *)&(v)->flags.counter) -- cgit v1.2.3
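
As a rough illustration of how userspace is expected to consume the uAPI added in this series, the sketch below configures context engine slot 0 for 2-wide parallel submission (Example 1 from the i915_context_engines_parallel_submit kernel-doc) by chaining the new extension into an I915_CONTEXT_PARAM_ENGINES set-param. This is a minimal sketch, not part of the patches above: the helper name and its arguments are illustrative, and it assumes libdrm's drmIoctl() wrapper, an already-created GEM context ctx_id on fd, a platform exposing at least two video engines (class I915_ENGINE_CLASS_VIDEO, instances 0 and 1), and uapi headers new enough to carry the parallel-submit definitions. Error handling is elided.

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>          /* drmIoctl() from libdrm (assumed available) */
#include <drm/i915_drm.h>

static int set_parallel_2wide(int fd, uint32_t ctx_id)
{
	/* Parallel slot: width=2 BBs per execbuf, num_siblings=1 placement each */
	I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(parallel, 2);
	/* Engine map with a single slot that the extension below will configure */
	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1);

	memset(&parallel, 0, sizeof(parallel));
	parallel.base.name = I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT;
	parallel.engine_index = 0;
	parallel.width = 2;           /* two batch buffers per execbuf IOCTL */
	parallel.num_siblings = 1;    /* one possible placement per batch */
	parallel.engines[0].engine_class = I915_ENGINE_CLASS_VIDEO;
	parallel.engines[0].engine_instance = 0;
	parallel.engines[1].engine_class = I915_ENGINE_CLASS_VIDEO;
	parallel.engines[1].engine_instance = 1;

	memset(&engines, 0, sizeof(engines));
	/* Slot 0 starts out invalid; the chained extension turns it into the parallel engine */
	engines.engines[0].engine_class = I915_ENGINE_CLASS_INVALID;
	engines.engines[0].engine_instance = I915_ENGINE_CLASS_INVALID_NONE;
	engines.extensions = (uintptr_t)&parallel;

	struct drm_i915_gem_context_param param = {
		.ctx_id = ctx_id,
		.param = I915_CONTEXT_PARAM_ENGINES,
		.size = sizeof(engines),
		.value = (uintptr_t)&engines,
	};

	return drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &param);
}

With the slot configured this way, an execbuf targeting engine slot 0 must then supply exactly two batch buffers (the last two execobjects, or the first two when I915_EXEC_BATCH_FIRST is set), matching the Multi-BB execbuf behavior introduced above.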