From 96c7c2f4d5bd94b15fe63448c087f01607b56f4a Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 18 Apr 2023 12:04:53 +0200 Subject: drm/scheduler: set entity to NULL in drm_sched_entity_pop_job() It already happend a few times that patches slipped through which implemented access to an entity through a job that was already removed from the entities queue. Since jobs and entities might have different lifecycles, this can potentially cause UAF bugs. In order to make it obvious that a jobs entity pointer shouldn't be accessed after drm_sched_entity_pop_job() was called successfully, set the jobs entity pointer to NULL once the job is removed from the entity queue. Moreover, debugging a potential NULL pointer dereference is way easier than potentially corrupted memory through a UAF. Signed-off-by: Danilo Krummrich Link: https://lore.kernel.org/r/20230418100453.4433-1-dakr@redhat.com Reviewed-by: Luben Tuikov Signed-off-by: Luben Tuikov --- drivers/gpu/drm/scheduler/sched_entity.c | 6 ++++++ drivers/gpu/drm/scheduler/sched_main.c | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'drivers/gpu/drm/scheduler') diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index e0a8890a62e2..3e2a31d8190e 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -448,6 +448,12 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) drm_sched_rq_update_fifo(entity, next->submit_ts); } + /* Jobs and entities might have different lifecycles. Since we're + * removing the job from the entities queue, set the jobs entity pointer + * to NULL to prevent any future access of the entity through this job. + */ + sched_job->entity = NULL; + return sched_job; } diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index f6801eb21820..c3582c4fc299 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -42,6 +42,10 @@ * the hardware. * * The jobs in a entity are always scheduled in the order that they were pushed. + * + * Note that once a job was taken from the entities queue and pushed to the + * hardware, i.e. the pending queue, the entity must not be referenced anymore + * through the jobs entity pointer. */ #include -- cgit v1.2.3 From 539f9ee4b52a8bec95ff064e22dd2fb1e258e818 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 17 Apr 2023 13:36:02 +0200 Subject: drm/scheduler: properly forward fence errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a hw fence is signaled with an error properly forward that to the finished fence. Signed-off-by: Christian König Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20230420115752.31470-1-christian.koenig@amd.com --- drivers/gpu/drm/scheduler/sched_entity.c | 4 +--- drivers/gpu/drm/scheduler/sched_fence.c | 4 +++- drivers/gpu/drm/scheduler/sched_main.c | 18 ++++++++---------- include/drm/gpu_scheduler.h | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) (limited to 'drivers/gpu/drm/scheduler') diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 3e2a31d8190e..1795cd7e42ed 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -144,7 +144,7 @@ static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk) { struct drm_sched_job *job = container_of(wrk, typeof(*job), work); - drm_sched_fence_finished(job->s_fence); + drm_sched_fence_finished(job->s_fence, -ESRCH); WARN_ON(job->s_fence->parent); job->sched->ops->free_job(job); } @@ -195,8 +195,6 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity) while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) { struct drm_sched_fence *s_fence = job->s_fence; - dma_fence_set_error(&s_fence->finished, -ESRCH); - dma_fence_get(&s_fence->finished); if (!prev || dma_fence_add_callback(prev, &job->finish_cb, drm_sched_entity_kill_jobs_cb)) diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index fe9c6468e440..ef120475e7c6 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -53,8 +53,10 @@ void drm_sched_fence_scheduled(struct drm_sched_fence *fence) dma_fence_signal(&fence->scheduled); } -void drm_sched_fence_finished(struct drm_sched_fence *fence) +void drm_sched_fence_finished(struct drm_sched_fence *fence, int result) { + if (result) + dma_fence_set_error(&fence->finished, result); dma_fence_signal(&fence->finished); } diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index c3582c4fc299..8d248ce233c8 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -262,7 +262,7 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq) * * Finish the job's fence and wake up the worker thread. */ -static void drm_sched_job_done(struct drm_sched_job *s_job) +static void drm_sched_job_done(struct drm_sched_job *s_job, int result) { struct drm_sched_fence *s_fence = s_job->s_fence; struct drm_gpu_scheduler *sched = s_fence->sched; @@ -273,7 +273,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job) trace_drm_sched_process_job(s_fence); dma_fence_get(&s_fence->finished); - drm_sched_fence_finished(s_fence); + drm_sched_fence_finished(s_fence, result); dma_fence_put(&s_fence->finished); wake_up_interruptible(&sched->wake_up_worker); } @@ -287,7 +287,7 @@ static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb) { struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb); - drm_sched_job_done(s_job); + drm_sched_job_done(s_job, f->error); } /** @@ -537,12 +537,12 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) r = dma_fence_add_callback(fence, &s_job->cb, drm_sched_job_done_cb); if (r == -ENOENT) - drm_sched_job_done(s_job); + drm_sched_job_done(s_job, fence->error); else if (r) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r); } else - drm_sched_job_done(s_job); + drm_sched_job_done(s_job, 0); } if (full_recovery) { @@ -1059,15 +1059,13 @@ static int drm_sched_main(void *param) r = dma_fence_add_callback(fence, &sched_job->cb, drm_sched_job_done_cb); if (r == -ENOENT) - drm_sched_job_done(sched_job); + drm_sched_job_done(sched_job, fence->error); else if (r) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r); } else { - if (IS_ERR(fence)) - dma_fence_set_error(&s_fence->finished, PTR_ERR(fence)); - - drm_sched_job_done(sched_job); + drm_sched_job_done(sched_job, IS_ERR(fence) ? + PTR_ERR(fence) : 0); } wake_up(&sched->job_scheduled); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index c0ec6719282a..3b4800e0b24b 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -598,7 +598,7 @@ void drm_sched_fence_init(struct drm_sched_fence *fence, void drm_sched_fence_free(struct drm_sched_fence *fence); void drm_sched_fence_scheduled(struct drm_sched_fence *fence); -void drm_sched_fence_finished(struct drm_sched_fence *fence); +void drm_sched_fence_finished(struct drm_sched_fence *fence, int result); unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched); void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, -- cgit v1.2.3 From 70102d77ff22dd88a0111b1c3bac5099ac5d0425 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 17 Apr 2023 17:32:11 +0200 Subject: drm/scheduler: add drm_sched_entity_error and use rcu for last_scheduled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to using RCU handling for the last scheduled job and add a function to return the error code of it. Signed-off-by: Christian König Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20230420115752.31470-2-christian.koenig@amd.com --- drivers/gpu/drm/scheduler/sched_entity.c | 39 +++++++++++++++++++++++++------- include/drm/gpu_scheduler.h | 3 ++- 2 files changed, 33 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/scheduler') diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 1795cd7e42ed..cfb433e92005 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -72,7 +72,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity, entity->num_sched_list = num_sched_list; entity->priority = priority; entity->sched_list = num_sched_list > 1 ? sched_list : NULL; - entity->last_scheduled = NULL; + RCU_INIT_POINTER(entity->last_scheduled, NULL); RB_CLEAR_NODE(&entity->rb_tree_node); if(num_sched_list) @@ -140,6 +140,27 @@ bool drm_sched_entity_is_ready(struct drm_sched_entity *entity) return true; } +/** + * drm_sched_entity_error - return error of last scheduled job + * @entity: scheduler entity to check + * + * Opportunistically return the error of the last scheduled job. Result can + * change any time when new jobs are pushed to the hw. + */ +int drm_sched_entity_error(struct drm_sched_entity *entity) +{ + struct dma_fence *fence; + int r; + + rcu_read_lock(); + fence = rcu_dereference(entity->last_scheduled); + r = fence ? fence->error : 0; + rcu_read_unlock(); + + return r; +} +EXPORT_SYMBOL(drm_sched_entity_error); + static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk) { struct drm_sched_job *job = container_of(wrk, typeof(*job), work); @@ -191,7 +212,9 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity) /* Make sure this entity is not used by the scheduler at the moment */ wait_for_completion(&entity->entity_idle); - prev = dma_fence_get(entity->last_scheduled); + /* The entity is guaranteed to not be used by the scheduler */ + prev = rcu_dereference_check(entity->last_scheduled, true); + dma_fence_get(prev); while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) { struct drm_sched_fence *s_fence = job->s_fence; @@ -278,8 +301,8 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity) entity->dependency = NULL; } - dma_fence_put(entity->last_scheduled); - entity->last_scheduled = NULL; + dma_fence_put(rcu_dereference_check(entity->last_scheduled, true)); + RCU_INIT_POINTER(entity->last_scheduled, NULL); } EXPORT_SYMBOL(drm_sched_entity_fini); @@ -421,9 +444,9 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) if (entity->guilty && atomic_read(entity->guilty)) dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED); - dma_fence_put(entity->last_scheduled); - - entity->last_scheduled = dma_fence_get(&sched_job->s_fence->finished); + dma_fence_put(rcu_dereference_check(entity->last_scheduled, true)); + rcu_assign_pointer(entity->last_scheduled, + dma_fence_get(&sched_job->s_fence->finished)); /* * If the queue is empty we allow drm_sched_entity_select_rq() to @@ -477,7 +500,7 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity) */ smp_rmb(); - fence = entity->last_scheduled; + fence = rcu_dereference_check(entity->last_scheduled, true); /* stay on the same engine if the previous job hasn't finished */ if (fence && !dma_fence_is_signaled(fence)) diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 3b4800e0b24b..fd15f8ae0c3f 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -201,7 +201,7 @@ struct drm_sched_entity { * by the scheduler thread, can be accessed locklessly from * drm_sched_job_arm() iff the queue is empty. */ - struct dma_fence *last_scheduled; + struct dma_fence __rcu *last_scheduled; /** * @last_user: last group leader pushing a job into the entity. @@ -588,6 +588,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job); void drm_sched_entity_set_priority(struct drm_sched_entity *entity, enum drm_sched_priority priority); bool drm_sched_entity_is_ready(struct drm_sched_entity *entity); +int drm_sched_entity_error(struct drm_sched_entity *entity); void drm_sched_fence_set_parent(struct drm_sched_fence *s_fence, struct dma_fence *fence); -- cgit v1.2.3 From 03877d621db082610c9b7602c6e8cd6ebcb75a8f Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 27 Apr 2023 14:05:43 +0200 Subject: drm/scheduler: mark jobs without fence as canceled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When no hw fence is provided for a job that means that the job didn't executed. Signed-off-by: Christian König Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20230427122726.1290170-1-christian.koenig@amd.com --- drivers/gpu/drm/scheduler/sched_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/scheduler') diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 8d248ce233c8..3e2d453296dd 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -542,7 +542,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r); } else - drm_sched_job_done(s_job, 0); + drm_sched_job_done(s_job, -ECANCELED); } if (full_recovery) { -- cgit v1.2.3