From 18eae367cb74d05b5e37ce77ef4025b735df012e Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Mon, 30 Oct 2023 20:44:37 +0800 Subject: drm/amdgpu: check recovery status of xgmi hive in ras_reset_error_count Handle xgmi hive case. Suggested-by: Hawking Zhang Signed-off-by: Tao Zhou Reviewed-by: Stanley.Yang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3af50754800d..b7fe5951b166 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1222,6 +1222,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + struct amdgpu_hive_info *hive; + int hive_ras_recovery = 0; if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ -1233,8 +1235,15 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, !amdgpu_ras_get_mca_debug_mode(adev)) return -EOPNOTSUPP; + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + hive_ras_recovery = atomic_read(&hive->ras_recovery); + amdgpu_put_xgmi_hive(hive); + } + /* skip ras error reset in gpu reset */ - if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) && + if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) || + hive_ras_recovery) && mca_funcs && mca_funcs->mca_set_debug_mode) return -EOPNOTSUPP; -- cgit v1.2.3