summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
diff options
context:
space:
mode:
authorTao Zhou <tao.zhou1@amd.com>2022-10-17 13:31:20 +0300
committerAlex Deucher <alexander.deucher@amd.com>2022-10-27 22:12:08 +0300
commitae45a18b80d9d0d29f0ecfc52fb4e7831671b299 (patch)
tree5e4b92907103f8773af3f400ef26cb073608f5b1 /drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
parent24b822928b5139b85ee9a818a65e343b7e3bb4fe (diff)
downloadlinux-ae45a18b80d9d0d29f0ecfc52fb4e7831671b299.tar.xz
drm/amdgpu: add RAS poison handling for MCA
For MCA poison, if unmap queue fails, only gpu reset should be triggered without page retirement handling, MCA notifier will do it. v2: handle MCA poison consumption in umc_poison_handler directly. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c31
1 files changed, 20 insertions, 11 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 3c83129f4090..758942150c09 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -169,19 +169,28 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
void *ras_error_status,
bool reset)
{
- int ret;
- struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
- struct ras_common_if head = {
- .block = AMDGPU_RAS_BLOCK__UMC,
- };
- struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+ int ret = AMDGPU_RAS_SUCCESS;
- ret =
- amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+ if (!adev->gmc.xgmi.connected_to_cpu) {
+ struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+ struct ras_common_if head = {
+ .block = AMDGPU_RAS_BLOCK__UMC,
+ };
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
- if (ret == AMDGPU_RAS_SUCCESS && obj) {
- obj->err_data.ue_count += err_data->ue_count;
- obj->err_data.ce_count += err_data->ce_count;
+ ret =
+ amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+
+ if (ret == AMDGPU_RAS_SUCCESS && obj) {
+ obj->err_data.ue_count += err_data->ue_count;
+ obj->err_data.ce_count += err_data->ce_count;
+ }
+ } else if (reset) {
+ /* MCA poison handler is only responsible for GPU reset,
+ * let MCA notifier do page retirement.
+ */
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+ amdgpu_ras_reset_gpu(adev);
}
return ret;