summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
authorTao Zhou <tao.zhou1@amd.com>2024-03-12 06:30:09 +0300
committerAlex Deucher <alexander.deucher@amd.com>2024-03-20 20:38:15 +0300
commit2fc46e0b2fe8802c98a68d5c903e0109e1d4a49c (patch)
tree62cd28b285ec0864ee1cc8cadf61336f0555a696 /drivers/gpu/drm/amd/amdkfd
parente3d4de8d8b24933a9588019bf53891bca520b226 (diff)
downloadlinux-2fc46e0b2fe8802c98a68d5c903e0109e1d4a49c.tar.xz
drm/amdgpu: make reset method configurable for RAS poison
Each RAS block has different requirement for gpu reset in poison consumption handling. Add support for mmhub RAS poison consumption handling. v2: remove the mmhub poison support for kfd int v10. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c13
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c9
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c20
3 files changed, 28 insertions, 14 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index 650da18b0d87..740f89aafbc0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -134,6 +134,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
{
enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL;
+ uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
@@ -153,6 +154,8 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
+ if (ret)
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
@@ -160,6 +163,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
default:
break;
@@ -170,17 +174,16 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset
* resetting queue fails, fallback to gpu reset solution
*/
- if (!ret) {
+ if (!ret)
dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
- } else {
+ else
dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
- }
+
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
}
static bool event_interrupt_isr_v10(struct kfd_node *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 7e2859736a55..d3d6b5c180b3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -193,6 +193,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
{
enum amdgpu_ras_block block = 0;
int ret = -EINVAL;
+ uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
@@ -212,10 +213,13 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
if (dev->dqm->ops.reset_queues)
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
+ if (ret)
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC21_INTSRC_SDMA_ECC:
default:
block = AMDGPU_RAS_BLOCK__GFX;
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
}
@@ -223,10 +227,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset
resetting queue fails, fallback to gpu reset solution */
- if (!ret)
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
- else
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
}
static bool event_interrupt_isr_v11(struct kfd_node *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 11641f4645e6..2a37ab7a7150 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -145,6 +145,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
{
enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL;
+ uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
@@ -164,6 +165,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
+ if (ret)
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ break;
+ case SOC15_IH_CLIENTID_VMC:
+ case SOC15_IH_CLIENTID_VMC1:
+ ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+ block = AMDGPU_RAS_BLOCK__MMHUB;
+ if (ret)
+ reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
@@ -171,6 +181,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
default:
break;
@@ -181,17 +192,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset
* resetting queue fails, fallback to gpu reset solution
*/
- if (!ret) {
+ if (!ret)
dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
- } else {
+ else
dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
- }
+
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
}
static bool context_id_expected(struct kfd_dev *dev)