diff options
author | Monk Liu <Monk.Liu@amd.com> | 2021-09-01 08:46:46 +0800 |
---|---|---|
committer | Andrey Grodzovsky <andrey.grodzovsky@amd.com> | 2021-09-15 10:21:30 -0400 |
commit | bcf26654a38f8e55ecac4635dac2e72c161d0063 (patch) | |
tree | 20eb90439123abf55821c744dd05e7974206a743 | |
parent | 282abb5a1f381d0ec10b20893961563be174a1c3 (diff) |
drm/sched: fix the bug of time out calculation(v4)
issue:
in cleanup_job the cancel_delayed_work will cancel a TO timer
even if its corresponding job is still running.
fix:
do not cancel the timer in cleanup_job; instead, cancel it
only when the head job is signaled, and if there is a "next" job
we call start_timeout again.
v2:
further clean up the logic, and cancel the TDR timer if the signaled job
is the last one in its scheduler.
v3:
change the issue description
remove the cancel_delayed_work at the beginning of cleanup_job
restore the implementation of drm_sched_job_begin.
v4:
remove the kthread_should_park() check in the cleanup_job routine;
we should clean up the signaled job asap
TODO:
1) introduce pause/resume scheduler in job_timeout to serialize the handling
of scheduler and job_timeout.
2) drop the bad job's del and insert in scheduler due to the above serialization
(no race issue anymore with the serialization)
Tested-by: jingwen <jingwen.chen@amd.com>
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1630457207-13107-1-git-send-email-Monk.Liu@amd.com
-rw-r--r-- | drivers/gpu/drm/scheduler/sched_main.c | 26 |
1 files changed, 9 insertions, 17 deletions
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 6987d412a946..042c16b5d54a 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -827,15 +827,6 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched) { struct drm_sched_job *job, *next; - /* - * Don't destroy jobs while the timeout worker is running OR thread - * is being parked and hence assumed to not touch pending_list - */ - if ((sched->timeout != MAX_SCHEDULE_TIMEOUT && - !cancel_delayed_work(&sched->work_tdr)) || - kthread_should_park()) - return NULL; - spin_lock(&sched->job_list_lock); job = list_first_entry_or_null(&sched->pending_list, @@ -844,17 +835,21 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched) if (job && dma_fence_is_signaled(&job->s_fence->finished)) { /* remove job from pending_list */ list_del_init(&job->list); + + /* cancel this job's TO timer */ + cancel_delayed_work(&sched->work_tdr); /* make the scheduled timestamp more accurate */ next = list_first_entry_or_null(&sched->pending_list, typeof(*next), list); - if (next) + + if (next) { next->s_fence->scheduled.timestamp = job->s_fence->finished.timestamp; - + /* start TO timer for next job */ + drm_sched_start_timeout(sched); + } } else { job = NULL; - /* queue timeout for next job */ - drm_sched_start_timeout(sched); } spin_unlock(&sched->job_list_lock); @@ -942,11 +937,8 @@ static int drm_sched_main(void *param) (entity = drm_sched_select_entity(sched))) || kthread_should_stop()); - if (cleanup_job) { + if (cleanup_job) sched->ops->free_job(cleanup_job); - /* queue timeout for next job */ - drm_sched_start_timeout(sched); - } if (!entity) continue; |