summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu
diff options
context:
space:
mode:
authorjqdeng <Emily.Deng@amd.com>2020-08-07 12:31:19 +0300
committerAlex Deucher <alexander.deucher@amd.com>2020-08-19 01:22:02 +0300
commit9a1cddd6374fb1eb166f40f10d5411e3699066d4 (patch)
treeec1d6acdcfd9550058cb21e04f72f04411fb1908 /drivers/gpu/drm/amd/amdgpu
parent5ce99853a6fd94c729d4b50188f43300d6568898 (diff)
downloadlinux-9a1cddd6374fb1eb166f40f10d5411e3699066d4.tar.xz
drm/amdgpu: Fix repeatly flr issue
Only for no job running test case need to do recover in flr notification. For having job in mirror list, then let guest driver to hit job timeout, and then do recover. Signed-off-by: jqdeng <Emily.Deng@amd.com> Acked-by: Nirmoy Das <nirmoy.das@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c28
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c3
4 files changed, 32 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 08f80ca3b296..54666eea1863 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1133,6 +1133,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
#define amdgpu_inc_vram_lost(adev) atomic_inc(&((adev)->vram_lost_counter));
/* Common functions */
+bool amdgpu_device_has_job_running(struct amdgpu_device *adev);
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
struct amdgpu_job* job);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 415e1a32b98c..6573e1112462 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3923,6 +3923,34 @@ error:
}
/**
+ * amdgpu_device_has_job_running - check if there is any job in mirror list
+ *
+ * @adev: amdgpu device pointer
+ *
+ * check if there is any job in mirror list
+ */
+bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
+{
+ int i;
+ struct drm_sched_job *job;
+
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+ struct amdgpu_ring *ring = adev->rings[i];
+
+ if (!ring || !ring->sched.thread)
+ continue;
+
+ spin_lock(&ring->sched.job_list_lock);
+ job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
+ struct drm_sched_job, node);
+ spin_unlock(&ring->sched.job_list_lock);
+ if (job)
+ return true;
+ }
+ return false;
+}
+
+/**
* amdgpu_device_should_recover_gpu - check if we should try GPU recovery
*
* @adev: amdgpu device pointer
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 5fd67e1cc2a0..475ff5df8c87 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -268,7 +268,7 @@ flr_done:
/* Trigger recovery for world switch failure if no TDR */
if (amdgpu_device_should_recover_gpu(adev)
- && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)
+ && (amdgpu_device_has_job_running(adev) || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
amdgpu_device_gpu_recover(adev, NULL);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index ce2bf1fb79ed..9cf695c05db3 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -289,7 +289,8 @@ flr_done:
/* Trigger recovery for world switch failure if no TDR */
if (amdgpu_device_should_recover_gpu(adev)
- && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
+ && (amdgpu_device_has_job_running(adev) ||
+ adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))