summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorHawking Zhang <Hawking.Zhang@amd.com>2024-01-02 14:32:56 +0300
committerAlex Deucher <alexander.deucher@amd.com>2024-01-16 02:35:35 +0300
commitcce4febb274cc16655dd4692fbcebc6c7d7953b5 (patch)
treed4532e9cd25dc91b56e783007d83861a898758de /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parentf5e4cc8461c408dcb679bb1f7c3bd8a586406709 (diff)
downloadlinux-cce4febb274cc16655dd4692fbcebc6c7d7953b5.tar.xz
drm/amdgpu: Add ras helper to query boot errors v2
Add ras helper function to query boot time gpu errors. v2: use aqua_vanjaram smn addressing pattern Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Yang Wang <kevinyang.wang@amd.com> Reviewed-by: Le Ma <le.ma@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c95
1 files changed, 95 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 31823a30dea2..a86f91838301 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3767,3 +3767,98 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
return 0;
}
+
+#define mmMP0_SMN_C2PMSG_92 0x1609C
+#define mmMP0_SMN_C2PMSG_126 0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+ u32 instance, u32 boot_error)
+{
+ u32 socket_id, aid_id, hbm_id;
+ u32 reg_data;
+ u64 reg_addr;
+
+ socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+ aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+ hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+ /* The pattern for smn addressing in other SOC could be different from
+ * the one for aqua_vanjaram. We should revisit the code if the pattern
+ * is changed. In such case, replace the aqua_vanjaram implementation
+ * with more common helper */
+ reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+ aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+ reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+ dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n",
+ socket_id, aid_id, reg_data);
+
+ if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+ dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n",
+ socket_id, aid_id, hbm_id);
+
+ if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+ dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n",
+ socket_id, aid_id);
+
+ if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+ dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n",
+ socket_id, aid_id);
+
+ if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+ dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n",
+ socket_id, aid_id);
+
+ if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+ dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n",
+ socket_id, aid_id);
+
+ if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+ dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n",
+ socket_id, aid_id);
+
+ if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+ dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n",
+ socket_id, aid_id, hbm_id);
+
+ if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+ dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n",
+ socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+ u32 instance, u32 *boot_error)
+{
+ u32 reg_addr;
+ u32 reg_data;
+ int retry_loop;
+
+ /* The pattern for smn addressing in other SOC could be different from
+ * the one for aqua_vanjaram. We should revisit the code if the pattern
+ * is changed. In such case, replace the aqua_vanjaram implementation
+ * with more common helper */
+ reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+ aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+ for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+ reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+ if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+ *boot_error = reg_data;
+ return 0;
+ }
+ msleep(1);
+ }
+
+ *boot_error = reg_data;
+ return -ETIME;
+}
+
+void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
+{
+ u32 boot_error = 0;
+ u32 i;
+
+ for (i = 0; i < num_instances; i++) {
+ if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error))
+ amdgpu_ras_boot_time_error_reporting(adev, i, boot_error);
+ }
+}