summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Clements <john.clements@amd.com>2021-03-25 12:10:10 +0300
committerAlex Deucher <alexander.deucher@amd.com>2021-04-09 23:54:51 +0300
commit134d16d50f0948f00e7172b509e869b6eaecf437 (patch)
tree036563baedfcf183200a94b37eb2f3bbbfac4363
parent340c571bebbefe03da1c1139b62a55f4ec6fcdce (diff)
downloadlinux-134d16d50f0948f00e7172b509e869b6eaecf437.tar.xz
drm/amdgpu: RAS harvest on driver load
In event of RAS UE + warm reset, error counters shall be harvested and cleared on driver load Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: John Clements <john.clements@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c29
1 files changed, 29 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1d905bcbc1ac..b0fe5885e4c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2090,6 +2090,32 @@ release_con:
return r;
}
+static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
+{
+ if (adev->gmc.xgmi.connected_to_cpu)
+ return 1;
+ return 0;
+}
+
+static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block)
+{
+ struct ras_query_if info = {
+ .head = *ras_block,
+ };
+
+ if (!amdgpu_persistent_edc_harvesting_supported(adev))
+ return 0;
+
+ if (amdgpu_ras_query_error_status(adev, &info) != 0)
+ DRM_WARN("RAS init harvest failure");
+
+ if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
+ DRM_WARN("RAS init harvest reset failure");
+
+ return 0;
+}
+
/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block,
@@ -2119,6 +2145,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
return r;
}
+ /* check for errors on warm reset edc persisant supported ASIC */
+ amdgpu_persistent_edc_harvesting(adev, ras_block);
+
/* in resume phase, no need to create ras fs node */
if (adev->in_suspend || amdgpu_in_reset(adev))
return 0;