summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
diff options
context:
space:
mode:
authorJonathan Kim <jonathan.kim@amd.com>2021-08-04 02:01:55 +0300
committerAlex Deucher <alexander.deucher@amd.com>2021-08-19 01:22:24 +0300
commit44357a1bd5f5a1012024a127a4653c2dfa4af18a (patch)
tree2cce7ef66e40d8c3652cdb4470c0fa6b5656ec54 /drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
parent3a6e4106a8fdb87fd3ede0eb242b70b0c4c82bd5 (diff)
downloadlinux-44357a1bd5f5a1012024a127a4653c2dfa4af18a.tar.xz
drm/amdgpu: get extended xgmi topology data
The TA has a limit to the amount of data that can be retrieved from GET_TOPOLOGY. For setups that exceed this limit, the xGMI topology needs to be re-initialized and data needs to be re-fetched from the extended link records by setting a flag in the shared command buffer. The number of hops and the number of links must be accumulated by the driver. Other data points are all fetched from the first request. Because the TA has already exceeded its link record limit, it cannot hold bidirectional information. Otherwise the driver would have to do more than two fetches so the driver has to reflect the topology information in the opposite direction. v2: squashed with internal reviewed fix Signed-off-by: Jonathan Kim <jonathan.kim@amd.com> Reviewed-by: Hawking Zhang <hawking.zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c58
1 files changed, 56 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 2e47bc446700..dda4f0c5c4e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -498,6 +498,32 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
return -EINVAL;
}
+/*
+ * Devices that support extended data require the entire hive to initialize with
+ * the shared memory buffer flag set.
+ *
+ * Hive locks and conditions apply - see amdgpu_xgmi_add_device
+ */
+static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
+ bool set_extended_data)
+{
+ struct amdgpu_device *tmp_adev;
+ int ret;
+
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+ ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
+ if (ret) {
+ dev_err(tmp_adev->dev,
+ "XGMI: Failed to initialize xgmi session for data partition %i\n",
+ set_extended_data);
+ return ret;
+ }
+
+ }
+
+ return 0;
+}
+
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
struct psp_xgmi_topology_info *top_info;
@@ -512,7 +538,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.pending_reset &&
amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
- ret = psp_xgmi_initialize(&adev->psp);
+ ret = psp_xgmi_initialize(&adev->psp, false, true);
if (ret) {
dev_err(adev->dev,
"XGMI: Failed to initialize xgmi session\n");
@@ -575,7 +601,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
/* get latest topology info for each device from psp */
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
- &tmp_adev->psp.xgmi_context.top_info);
+ &tmp_adev->psp.xgmi_context.top_info, false);
if (ret) {
dev_err(tmp_adev->dev,
"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
@@ -585,6 +611,34 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
goto exit_unlock;
}
}
+
+ /* get topology again for hives that support extended data */
+ if (adev->psp.xgmi_context.supports_extended_data) {
+
+ /* initialize the hive to get extended data. */
+ ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
+ if (ret)
+ goto exit_unlock;
+
+ /* get the extended data. */
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+ ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
+ &tmp_adev->psp.xgmi_context.top_info, true);
+ if (ret) {
+ dev_err(tmp_adev->dev,
+ "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
+ tmp_adev->gmc.xgmi.node_id,
+ tmp_adev->gmc.xgmi.hive_id, ret);
+ goto exit_unlock;
+ }
+ }
+
+ /* initialize the hive to get non-extended data for the next round. */
+ ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
+ if (ret)
+ goto exit_unlock;
+
+ }
}
if (!ret && !adev->gmc.xgmi.pending_reset)