diff options
Diffstat (limited to 'drivers/gpu/drm/i915/i915_request.c')
-rw-r--r-- | drivers/gpu/drm/i915/i915_request.c | 288 |
1 files changed, 145 insertions, 143 deletions
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index a53777dd371c..4575f368455d 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -169,16 +169,17 @@ remove_from_client(struct i915_request *request) { struct drm_i915_file_private *file_priv; - file_priv = READ_ONCE(request->file_priv); - if (!file_priv) + if (!READ_ONCE(request->file_priv)) return; - spin_lock(&file_priv->mm.lock); - if (request->file_priv) { + rcu_read_lock(); + file_priv = xchg(&request->file_priv, NULL); + if (file_priv) { + spin_lock(&file_priv->mm.lock); list_del(&request->client_link); - request->file_priv = NULL; + spin_unlock(&file_priv->mm.lock); } - spin_unlock(&file_priv->mm.lock); + rcu_read_unlock(); } static void free_capture_list(struct i915_request *request) @@ -194,11 +195,29 @@ static void free_capture_list(struct i915_request *request) } } -static bool i915_request_retire(struct i915_request *rq) +static void remove_from_engine(struct i915_request *rq) { - struct i915_active_request *active, *next; + struct intel_engine_cs *engine, *locked; - lockdep_assert_held(&rq->timeline->mutex); + /* + * Virtual engines complicate acquiring the engine timeline lock, + * as their rq->engine pointer is not stable until under that + * engine lock. The simple ploy we use is to take the lock then + * check that the rq still belongs to the newly locked engine. + */ + locked = READ_ONCE(rq->engine); + spin_lock_irq(&locked->active.lock); + while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) { + spin_unlock(&locked->active.lock); + spin_lock(&engine->active.lock); + locked = engine; + } + list_del(&rq->sched.link); + spin_unlock_irq(&locked->active.lock); +} + +bool i915_request_retire(struct i915_request *rq) +{ if (!i915_request_completed(rq)) return false; @@ -219,51 +238,19 @@ static bool i915_request_retire(struct i915_request *rq) * Note this requires that we are always called in request * completion order. */ - GEM_BUG_ON(!list_is_first(&rq->link, &rq->timeline->requests)); + GEM_BUG_ON(!list_is_first(&rq->link, + &i915_request_timeline(rq)->requests)); rq->ring->head = rq->postfix; /* - * Walk through the active list, calling retire on each. This allows - * objects to track their GPU activity and mark themselves as idle - * when their *last* active request is completed (updating state - * tracking lists for eviction, active references for GEM, etc). - * - * As the ->retire() may free the node, we decouple it first and - * pass along the auxiliary information (to avoid dereferencing - * the node after the callback). - */ - list_for_each_entry_safe(active, next, &rq->active_list, link) { - /* - * In microbenchmarks or focusing upon time inside the kernel, - * we may spend an inordinate amount of time simply handling - * the retirement of requests and processing their callbacks. - * Of which, this loop itself is particularly hot due to the - * cache misses when jumping around the list of - * i915_active_request. So we try to keep this loop as - * streamlined as possible and also prefetch the next - * i915_active_request to try and hide the likely cache miss. - */ - prefetchw(next); - - INIT_LIST_HEAD(&active->link); - RCU_INIT_POINTER(active->request, NULL); - - active->retire(active, rq); - } - - local_irq_disable(); - - /* * We only loosely track inflight requests across preemption, * and so we may find ourselves attempting to retire a _completed_ * request that we have removed from the HW and put back on a run * queue. */ - spin_lock(&rq->engine->active.lock); - list_del(&rq->sched.link); - spin_unlock(&rq->engine->active.lock); + remove_from_engine(rq); - spin_lock(&rq->lock); + spin_lock_irq(&rq->lock); i915_request_mark_complete(rq); if (!i915_request_signaled(rq)) dma_fence_signal_locked(&rq->fence); @@ -278,9 +265,7 @@ static bool i915_request_retire(struct i915_request *rq) __notify_execute_cb(rq); } GEM_BUG_ON(!list_empty(&rq->execute_cb)); - spin_unlock(&rq->lock); - - local_irq_enable(); + spin_unlock_irq(&rq->lock); remove_from_client(rq); list_del(&rq->link); @@ -297,7 +282,7 @@ static bool i915_request_retire(struct i915_request *rq) void i915_request_retire_upto(struct i915_request *rq) { - struct intel_timeline * const tl = rq->timeline; + struct intel_timeline * const tl = i915_request_timeline(rq); struct i915_request *tmp; GEM_TRACE("%s fence %llx:%lld, current %d\n", @@ -305,7 +290,6 @@ void i915_request_retire_upto(struct i915_request *rq) rq->fence.context, rq->fence.seqno, hwsp_seqno(rq)); - lockdep_assert_held(&tl->mutex); GEM_BUG_ON(!i915_request_completed(rq)); do { @@ -358,9 +342,10 @@ __i915_request_await_execution(struct i915_request *rq, return 0; } -void __i915_request_submit(struct i915_request *request) +bool __i915_request_submit(struct i915_request *request) { struct intel_engine_cs *engine = request->engine; + bool result = false; GEM_TRACE("%s fence %llx:%lld, current %d\n", engine->name, @@ -370,6 +355,25 @@ void __i915_request_submit(struct i915_request *request) GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->active.lock); + /* + * With the advent of preempt-to-busy, we frequently encounter + * requests that we have unsubmitted from HW, but left running + * until the next ack and so have completed in the meantime. On + * resubmission of that completed request, we can skip + * updating the payload, and execlists can even skip submitting + * the request. + * + * We must remove the request from the caller's priority queue, + * and the caller must only call us when the request is in their + * priority queue, under the active.lock. This ensures that the + * request has *not* yet been retired and we can safely move + * the request into the engine->active.list where it will be + * dropped upon retiring. (Otherwise if resubmit a *retired* + * request, this would be a horrible use-after-free.) + */ + if (i915_request_completed(request)) + goto xfer; + if (i915_gem_context_is_banned(request->gem_context)) i915_request_skip(request, -EIO); @@ -393,13 +397,18 @@ void __i915_request_submit(struct i915_request *request) i915_sw_fence_signaled(&request->semaphore)) engine->saturated |= request->sched.semaphores; - /* We may be recursing from the signal callback of another i915 fence */ - spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); + engine->emit_fini_breadcrumb(request, + request->ring->vaddr + request->postfix); - list_move_tail(&request->sched.link, &engine->active.requests); + trace_i915_request_execute(request); + engine->serial++; + result = true; - GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)); - set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags); +xfer: /* We may be recursing from the signal callback of another i915 fence */ + spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); + + if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) + list_move_tail(&request->sched.link, &engine->active.requests); if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) && @@ -410,12 +419,7 @@ void __i915_request_submit(struct i915_request *request) spin_unlock(&request->lock); - engine->emit_fini_breadcrumb(request, - request->ring->vaddr + request->postfix); - - engine->serial++; - - trace_i915_request_execute(request); + return result; } void i915_request_submit(struct i915_request *request) @@ -641,9 +645,12 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) rq->gem_context = ce->gem_context; rq->engine = ce->engine; rq->ring = ce->ring; - rq->timeline = tl; + rq->execution_mask = ce->engine->mask; + + rcu_assign_pointer(rq->timeline, tl); rq->hwsp_seqno = tl->hwsp_seqno; rq->hwsp_cacheline = tl->hwsp_cacheline; + rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */ spin_lock_init(&rq->lock); @@ -661,9 +668,7 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) rq->batch = NULL; rq->capture_list = NULL; rq->flags = 0; - rq->execution_mask = ALL_ENGINES; - INIT_LIST_HEAD(&rq->active_list); INIT_LIST_HEAD(&rq->execute_cb); /* @@ -702,7 +707,6 @@ err_unwind: ce->ring->emit = rq->head; /* Make sure we didn't add ourselves to external state before freeing */ - GEM_BUG_ON(!list_empty(&rq->active_list)); GEM_BUG_ON(!list_empty(&rq->sched.signalers_list)); GEM_BUG_ON(!list_empty(&rq->sched.waiters_list)); @@ -747,16 +751,43 @@ err_unlock: static int i915_request_await_start(struct i915_request *rq, struct i915_request *signal) { - if (list_is_first(&signal->link, &signal->timeline->requests)) - return 0; + struct intel_timeline *tl; + struct dma_fence *fence; + int err; + + GEM_BUG_ON(i915_request_timeline(rq) == + rcu_access_pointer(signal->timeline)); - signal = list_prev_entry(signal, link); - if (intel_timeline_sync_is_later(rq->timeline, &signal->fence)) + rcu_read_lock(); + tl = rcu_dereference(signal->timeline); + if (i915_request_started(signal) || !kref_get_unless_zero(&tl->kref)) + tl = NULL; + rcu_read_unlock(); + if (!tl) /* already started or maybe even completed */ return 0; - return i915_sw_fence_await_dma_fence(&rq->submit, - &signal->fence, 0, - I915_FENCE_GFP); + fence = ERR_PTR(-EBUSY); + if (mutex_trylock(&tl->mutex)) { + fence = NULL; + if (!i915_request_started(signal) && + !list_is_first(&signal->link, &tl->requests)) { + signal = list_prev_entry(signal, link); + fence = dma_fence_get(&signal->fence); + } + mutex_unlock(&tl->mutex); + } + intel_timeline_put(tl); + if (IS_ERR_OR_NULL(fence)) + return PTR_ERR_OR_ZERO(fence); + + err = 0; + if (intel_timeline_sync_is_later(i915_request_timeline(rq), fence)) + err = i915_sw_fence_await_dma_fence(&rq->submit, + fence, 0, + I915_FENCE_GFP); + dma_fence_put(fence); + + return err; } static intel_engine_mask_t @@ -782,34 +813,33 @@ emit_semaphore_wait(struct i915_request *to, struct i915_request *from, gfp_t gfp) { + const int has_token = INTEL_GEN(to->i915) >= 12; u32 hwsp_offset; + int len; u32 *cs; - int err; - GEM_BUG_ON(!from->timeline->has_initial_breadcrumb); GEM_BUG_ON(INTEL_GEN(to->i915) < 8); /* Just emit the first semaphore we see as request space is limited. */ if (already_busywaiting(to) & from->engine->mask) - return i915_sw_fence_await_dma_fence(&to->submit, - &from->fence, 0, - I915_FENCE_GFP); + goto await_fence; - err = i915_request_await_start(to, from); - if (err < 0) - return err; + if (i915_request_await_start(to, from) < 0) + goto await_fence; /* Only submit our spinner after the signaler is running! */ - err = __i915_request_await_execution(to, from, NULL, gfp); - if (err) - return err; + if (__i915_request_await_execution(to, from, NULL, gfp)) + goto await_fence; /* We need to pin the signaler's HWSP until we are finished reading. */ - err = intel_timeline_read_hwsp(from, to, &hwsp_offset); - if (err) - return err; + if (intel_timeline_read_hwsp(from, to, &hwsp_offset)) + goto await_fence; - cs = intel_ring_begin(to, 4); + len = 4; + if (has_token) + len += 2; + + cs = intel_ring_begin(to, len); if (IS_ERR(cs)) return PTR_ERR(cs); @@ -821,18 +851,28 @@ emit_semaphore_wait(struct i915_request *to, * (post-wrap) values than they were expecting (and so wait * forever). */ - *cs++ = MI_SEMAPHORE_WAIT | - MI_SEMAPHORE_GLOBAL_GTT | - MI_SEMAPHORE_POLL | - MI_SEMAPHORE_SAD_GTE_SDD; + *cs++ = (MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_GTE_SDD) + + has_token; *cs++ = from->fence.seqno; *cs++ = hwsp_offset; *cs++ = 0; + if (has_token) { + *cs++ = 0; + *cs++ = MI_NOOP; + } intel_ring_advance(to, cs); to->sched.semaphores |= from->engine->mask; to->sched.flags |= I915_SCHED_HAS_SEMAPHORE_CHAIN; return 0; + +await_fence: + return i915_sw_fence_await_dma_fence(&to->submit, + &from->fence, 0, + I915_FENCE_GFP); } static int @@ -916,21 +956,23 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence) /* Squash repeated waits to the same timelines */ if (fence->context && - intel_timeline_sync_is_later(rq->timeline, fence)) + intel_timeline_sync_is_later(i915_request_timeline(rq), + fence)) continue; if (dma_fence_is_i915(fence)) ret = i915_request_await_request(rq, to_request(fence)); else ret = i915_sw_fence_await_dma_fence(&rq->submit, fence, - I915_FENCE_TIMEOUT, + fence->context ? I915_FENCE_TIMEOUT : 0, I915_FENCE_GFP); if (ret < 0) return ret; /* Record the latest fence used against each timeline */ if (fence->context) - intel_timeline_sync_set(rq->timeline, fence); + intel_timeline_sync_set(i915_request_timeline(rq), + fence); } while (--nchild); return 0; @@ -1072,7 +1114,7 @@ void i915_request_skip(struct i915_request *rq, int error) static struct i915_request * __i915_request_add_to_timeline(struct i915_request *rq) { - struct intel_timeline *timeline = rq->timeline; + struct intel_timeline *timeline = i915_request_timeline(rq); struct i915_request *prev; /* @@ -1095,8 +1137,8 @@ __i915_request_add_to_timeline(struct i915_request *rq) * precludes optimising to use semaphores serialisation of a single * timeline across engines. */ - prev = rcu_dereference_protected(timeline->last_request.request, - lockdep_is_held(&timeline->mutex)); + prev = to_request(__i915_active_fence_set(&timeline->last_request, + &rq->fence)); if (prev && !i915_request_completed(prev)) { if (is_power_of_2(prev->engine->mask | rq->engine->mask)) i915_sw_fence_await_sw_fence(&rq->submit, @@ -1121,7 +1163,6 @@ __i915_request_add_to_timeline(struct i915_request *rq) * us, the timeline will hold its seqno which is later than ours. */ GEM_BUG_ON(timeline->seqno != rq->fence.seqno); - __i915_active_request_set(&timeline->last_request, rq); return prev; } @@ -1185,7 +1226,7 @@ void __i915_request_queue(struct i915_request *rq, void i915_request_add(struct i915_request *rq) { struct i915_sched_attr attr = rq->gem_context->sched; - struct intel_timeline * const tl = rq->timeline; + struct intel_timeline * const tl = i915_request_timeline(rq); struct i915_request *prev; lockdep_assert_held(&tl->mutex); @@ -1240,7 +1281,9 @@ void i915_request_add(struct i915_request *rq) * work on behalf of others -- but instead we should benefit from * improved resource management. (Well, that's the theory at least.) */ - if (prev && i915_request_completed(prev) && prev->timeline == tl) + if (prev && + i915_request_completed(prev) && + rcu_access_pointer(prev->timeline) == tl) i915_request_retire_upto(prev); mutex_unlock(&tl->mutex); @@ -1449,6 +1492,7 @@ long i915_request_wait(struct i915_request *rq, break; } + intel_engine_flush_submission(rq->engine); timeout = io_schedule_timeout(timeout); } __set_current_state(TASK_RUNNING); @@ -1461,48 +1505,6 @@ out: return timeout; } -bool i915_retire_requests(struct drm_i915_private *i915) -{ - struct intel_gt_timelines *timelines = &i915->gt.timelines; - struct intel_timeline *tl, *tn; - unsigned long flags; - LIST_HEAD(free); - - spin_lock_irqsave(&timelines->lock, flags); - list_for_each_entry_safe(tl, tn, &timelines->active_list, link) { - if (!mutex_trylock(&tl->mutex)) - continue; - - intel_timeline_get(tl); - GEM_BUG_ON(!tl->active_count); - tl->active_count++; /* pin the list element */ - spin_unlock_irqrestore(&timelines->lock, flags); - - retire_requests(tl); - - spin_lock_irqsave(&timelines->lock, flags); - - /* Resume iteration after dropping lock */ - list_safe_reset_next(tl, tn, link); - if (!--tl->active_count) - list_del(&tl->link); - - mutex_unlock(&tl->mutex); - - /* Defer the final release to after the spinlock */ - if (refcount_dec_and_test(&tl->kref.refcount)) { - GEM_BUG_ON(tl->active_count); - list_add(&tl->link, &free); - } - } - spin_unlock_irqrestore(&timelines->lock, flags); - - list_for_each_entry_safe(tl, tn, &free, link) - __intel_timeline_free(&tl->kref); - - return !list_empty(&timelines->active_list); -} - #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftests/mock_request.c" #include "selftests/i915_request.c" |