summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
diff options
context:
space:
mode:
authorYiPeng Chai <YiPeng.Chai@amd.com>2022-09-07 11:07:42 +0300
committerAlex Deucher <alexander.deucher@amd.com>2022-09-19 22:17:20 +0300
commitf5c7e7797060255dbc8160734ccc5ad6183c5e04 (patch)
tree3c39f8392eef3d319e35f721121864bc761b5eaa /drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
parente4cf73fdfa420eb73507cf95c165d5d538d70d76 (diff)
downloadlinux-f5c7e7797060255dbc8160734ccc5ad6183c5e04.tar.xz
drm/amdgpu: Adjust removal control flow for smu v13_0_2
Adjust removal control flow for smu v13_0_2: During amdgpu uninstallation, when removing the first device, the kernel needs to first send a mode1reset message to all gpu devices. Otherwise, smu initialization will fail the next time amdgpu is installed. V2: 1. Update commit comments. 2. Remove the global variable amdgpu_device_remove_cnt and add a variable to the structure amdgpu_hive_info. 3. Use hive to detect the first removed device instead of a global variable. V3: 1. Update commit comments. 2. Split a patch into multiple patches. 3. The current patch does: a. Add a work mode of AMDGPU_RESET_FOR_DEVICE_REMOVE into the existing gpu recover path, which make all devices in hive list only have HW reset but no resume (except the base IP). b. Call AMDGPU_RESET_FOR_DEVICE_REMOVE and AMDGPU_NEED_FULL_RESET mode of amdgpu_device_gpu_recover in amdgpu_pci_remove when removing the first device in hive list. c. When removing the first device, the IP blocks keyword function call sequence is as follows: .suspend->mode1reset->.resume(basic ip)->.hw_fini->.early_fini->.sw_fini. ^ | |-<----------<---------<----| The first three sequences are because of a call to amdgpu_device_gpu_recover. The three sequences will be executed in a loop until all devices in the hive list are iterated. The sequences starting from .hw_fini only apply to the first device. Since .suspend has been called before, except the resumed phase1 basic ip blocks, all other ip blocks .hw_fini of current device will do nothing. d. When removing other devices, the calling sequences is the same as legacy: .hw_fini -> .early_fini -> .sw_fini. Since .suspend has been called when removing the first device, except the resumed phase1 basic ip blocks, all of other ip blocks .hw_fini of current device will do nothing. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c30
1 files changed, 30 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 728a0933ea6f..2e16210bebaf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2186,6 +2186,36 @@ amdgpu_pci_remove(struct pci_dev *pdev)
pm_runtime_forbid(dev->dev);
}
+ if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)) {
+ bool need_to_reset_gpu = false;
+
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ struct amdgpu_hive_info *hive;
+
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (hive->device_remove_count == 0)
+ need_to_reset_gpu = true;
+ hive->device_remove_count++;
+ amdgpu_put_xgmi_hive(hive);
+ } else {
+ need_to_reset_gpu = true;
+ }
+
+ /* Workaround for ASICs need to reset SMU.
+ * Called only when the first device is removed.
+ */
+ if (need_to_reset_gpu) {
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
+ amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+ }
+ }
+
amdgpu_driver_unload_kms(dev);
drm_dev_unplug(dev);