summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/pm/powerplay
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/pm/powerplay')
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c48
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/hwmgr/hardwaremanager.c4
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c2
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c27
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c10
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c4
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_thermal.c4
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_hwmgr.c4
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_thermal.c4
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h2
-rw-r--r--drivers/gpu/drm/amd/pm/powerplay/inc/power_state.h1
11 files changed, 89 insertions, 21 deletions
diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
index 11b7b4cffaae..ff360c699171 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
@@ -26,6 +26,7 @@
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/firmware.h>
+#include <linux/reboot.h>
#include "amd_shared.h"
#include "amd_powerplay.h"
#include "power_state.h"
@@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
return 0;
}
+static void pp_swctf_delayed_work_handler(struct work_struct *work)
+{
+ struct pp_hwmgr *hwmgr =
+ container_of(work, struct pp_hwmgr, swctf_delayed_work.work);
+ struct amdgpu_device *adev = hwmgr->adev;
+ struct amdgpu_dpm_thermal *range =
+ &adev->pm.dpm.thermal;
+ uint32_t gpu_temperature, size;
+ int ret;
+
+ /*
+ * If the hotspot/edge temperature is confirmed as below SW CTF setting point
+ * after the delay enforced, nothing will be done.
+ * Otherwise, a graceful shutdown will be performed to prevent further damage.
+ */
+ if (range->sw_ctf_threshold &&
+ hwmgr->hwmgr_func->read_sensor) {
+ ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+ AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
+ &gpu_temperature,
+ &size);
+ /*
+ * For some legacy ASICs, hotspot temperature retrieving might be not
+ * supported. Check the edge temperature instead then.
+ */
+ if (ret == -EOPNOTSUPP)
+ ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+ AMDGPU_PP_SENSOR_EDGE_TEMP,
+ &gpu_temperature,
+ &size);
+ if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold)
+ return;
+ }
+
+ dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
+ dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
+ orderly_poweroff(true);
+}
+
static int pp_sw_init(void *handle)
{
struct amdgpu_device *adev = handle;
@@ -101,6 +141,10 @@ static int pp_sw_init(void *handle)
pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully");
+ if (!ret)
+ INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
+ pp_swctf_delayed_work_handler);
+
return ret;
}
@@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle)
struct amdgpu_device *adev = handle;
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
+ cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
+
hwmgr_hw_fini(hwmgr);
return 0;
@@ -221,6 +267,8 @@ static int pp_suspend(void *handle)
struct amdgpu_device *adev = handle;
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
+ cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
+
return hwmgr_suspend(hwmgr);
}
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/hardwaremanager.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/hardwaremanager.c
index 981dc8c7112d..90452b66e107 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/hardwaremanager.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/hardwaremanager.c
@@ -241,7 +241,8 @@ int phm_start_thermal_controller(struct pp_hwmgr *hwmgr)
TEMP_RANGE_MAX,
TEMP_RANGE_MIN,
TEMP_RANGE_MAX,
- TEMP_RANGE_MAX};
+ TEMP_RANGE_MAX,
+ 0};
struct amdgpu_device *adev = hwmgr->adev;
if (!hwmgr->not_vf)
@@ -265,6 +266,7 @@ int phm_start_thermal_controller(struct pp_hwmgr *hwmgr)
adev->pm.dpm.thermal.min_mem_temp = range.mem_min;
adev->pm.dpm.thermal.max_mem_crit_temp = range.mem_crit_max;
adev->pm.dpm.thermal.max_mem_emergency_temp = range.mem_emergency_max;
+ adev->pm.dpm.thermal.sw_ctf_threshold = range.sw_ctf_threshold;
return ret;
}
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
index e10cc5e7928e..6841a4bce186 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
@@ -5432,6 +5432,8 @@ static int smu7_get_thermal_temperature_range(struct pp_hwmgr *hwmgr,
thermal_data->max = data->thermal_temp_setting.temperature_shutdown *
PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
+ thermal_data->sw_ctf_threshold = thermal_data->max;
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
index bfe80ac0ad8c..d0b1ab6c4523 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
@@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry)
{
+ struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
uint32_t client_id = entry->client_id;
uint32_t src_id = entry->src_id;
if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) {
if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) {
- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
- /*
- * SW CTF just occurred.
- * Try to do a graceful shutdown to prevent further damage.
- */
- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
- orderly_poweroff(true);
- } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW)
+ schedule_delayed_work(&hwmgr->swctf_delayed_work,
+ msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
+ } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) {
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
- else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
+ } else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
/*
* HW CTF just occurred. Shutdown to prevent further damage.
@@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device *adev,
orderly_poweroff(true);
}
} else if (client_id == SOC15_IH_CLIENTID_THM) {
- if (src_id == 0) {
- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
- /*
- * SW CTF just occurred.
- * Try to do a graceful shutdown to prevent further damage.
- */
- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
- orderly_poweroff(true);
- } else
+ if (src_id == 0)
+ schedule_delayed_work(&hwmgr->swctf_delayed_work,
+ msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
+ else
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
} else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) {
dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c
index 99cd2e63afdd..c51dd4c74fe9 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c
@@ -5241,6 +5241,9 @@ static int vega10_get_thermal_temperature_range(struct pp_hwmgr *hwmgr,
{
struct vega10_hwmgr *data = hwmgr->backend;
PPTable_t *pp_table = &(data->smc_state_table.pp_table);
+ struct phm_ppt_v2_information *pp_table_info =
+ (struct phm_ppt_v2_information *)(hwmgr->pptable);
+ struct phm_tdp_table *tdp_table = pp_table_info->tdp_table;
memcpy(thermal_data, &SMU7ThermalWithDelayPolicy[0], sizeof(struct PP_TemperatureRange));
@@ -5257,6 +5260,13 @@ static int vega10_get_thermal_temperature_range(struct pp_hwmgr *hwmgr,
thermal_data->mem_emergency_max = (pp_table->ThbmLimit + CTF_OFFSET_HBM)*
PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
+ if (tdp_table->usSoftwareShutdownTemp > pp_table->ThotspotLimit &&
+ tdp_table->usSoftwareShutdownTemp < VEGA10_THERMAL_MAXIMUM_ALERT_TEMP)
+ thermal_data->sw_ctf_threshold = tdp_table->usSoftwareShutdownTemp;
+ else
+ thermal_data->sw_ctf_threshold = VEGA10_THERMAL_MAXIMUM_ALERT_TEMP;
+ thermal_data->sw_ctf_threshold *= PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c
index e9db137cd1c6..1937be1cf5b4 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c
@@ -2763,6 +2763,8 @@ static int vega12_notify_cac_buffer_info(struct pp_hwmgr *hwmgr,
static int vega12_get_thermal_temperature_range(struct pp_hwmgr *hwmgr,
struct PP_TemperatureRange *thermal_data)
{
+ struct phm_ppt_v3_information *pptable_information =
+ (struct phm_ppt_v3_information *)hwmgr->pptable;
struct vega12_hwmgr *data =
(struct vega12_hwmgr *)(hwmgr->backend);
PPTable_t *pp_table = &(data->smc_state_table.pp_table);
@@ -2781,6 +2783,8 @@ static int vega12_get_thermal_temperature_range(struct pp_hwmgr *hwmgr,
PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
thermal_data->mem_emergency_max = (pp_table->ThbmLimit + CTF_OFFSET_HBM)*
PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
+ thermal_data->sw_ctf_threshold = pptable_information->us_software_shutdown_temp *
+ PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
return 0;
}
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_thermal.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_thermal.c
index ed3dff0b52d2..ae342c58cd3e 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_thermal.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_thermal.c
@@ -192,7 +192,9 @@ static int vega12_thermal_set_temperature_range(struct pp_hwmgr *hwmgr,
val = REG_SET_FIELD(val, THM_THERMAL_INT_CTRL, THERM_IH_HW_ENA, 1);
val = REG_SET_FIELD(val, THM_THERMAL_INT_CTRL, DIG_THERM_INTH, high);
val = REG_SET_FIELD(val, THM_THERMAL_INT_CTRL, DIG_THERM_INTL, low);
- val = val & (~THM_THERMAL_INT_CTRL__THERM_TRIGGER_MASK_MASK);
+ val &= ~THM_THERMAL_INT_CTRL__THERM_TRIGGER_MASK_MASK;
+ val &= ~THM_THERMAL_INT_CTRL__THERM_INTH_MASK_MASK;
+ val &= ~THM_THERMAL_INT_CTRL__THERM_INTL_MASK_MASK;
WREG32_SOC15(THM, 0, mmTHM_THERMAL_INT_CTRL, val);
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_hwmgr.c
index 0d4d4811527c..4e19ccbdb807 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_hwmgr.c
@@ -4206,6 +4206,8 @@ static int vega20_notify_cac_buffer_info(struct pp_hwmgr *hwmgr,
static int vega20_get_thermal_temperature_range(struct pp_hwmgr *hwmgr,
struct PP_TemperatureRange *thermal_data)
{
+ struct phm_ppt_v3_information *pptable_information =
+ (struct phm_ppt_v3_information *)hwmgr->pptable;
struct vega20_hwmgr *data =
(struct vega20_hwmgr *)(hwmgr->backend);
PPTable_t *pp_table = &(data->smc_state_table.pp_table);
@@ -4224,6 +4226,8 @@ static int vega20_get_thermal_temperature_range(struct pp_hwmgr *hwmgr,
PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
thermal_data->mem_emergency_max = (pp_table->ThbmLimit + CTF_OFFSET_HBM)*
PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
+ thermal_data->sw_ctf_threshold = pptable_information->us_software_shutdown_temp *
+ PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
return 0;
}
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_thermal.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_thermal.c
index f4f4efdbda79..e9737ca8418a 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_thermal.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_thermal.c
@@ -263,7 +263,9 @@ static int vega20_thermal_set_temperature_range(struct pp_hwmgr *hwmgr,
val = CGS_REG_SET_FIELD(val, THM_THERMAL_INT_CTRL, THERM_IH_HW_ENA, 1);
val = CGS_REG_SET_FIELD(val, THM_THERMAL_INT_CTRL, DIG_THERM_INTH, high);
val = CGS_REG_SET_FIELD(val, THM_THERMAL_INT_CTRL, DIG_THERM_INTL, low);
- val = val & (~THM_THERMAL_INT_CTRL__THERM_TRIGGER_MASK_MASK);
+ val &= ~THM_THERMAL_INT_CTRL__THERM_TRIGGER_MASK_MASK;
+ val &= ~THM_THERMAL_INT_CTRL__THERM_INTH_MASK_MASK;
+ val &= ~THM_THERMAL_INT_CTRL__THERM_INTL_MASK_MASK;
WREG32_SOC15(THM, 0, mmTHM_THERMAL_INT_CTRL, val);
diff --git a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
index f1580a26a850..612d66aeaab9 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
+++ b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
@@ -811,6 +811,8 @@ struct pp_hwmgr {
bool gfxoff_state_changed_by_workload;
uint32_t pstate_sclk_peak;
uint32_t pstate_mclk_peak;
+
+ struct delayed_work swctf_delayed_work;
};
int hwmgr_early_init(struct pp_hwmgr *hwmgr);
diff --git a/drivers/gpu/drm/amd/pm/powerplay/inc/power_state.h b/drivers/gpu/drm/amd/pm/powerplay/inc/power_state.h
index a5f2227a3971..0ffc2347829d 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/inc/power_state.h
+++ b/drivers/gpu/drm/amd/pm/powerplay/inc/power_state.h
@@ -131,6 +131,7 @@ struct PP_TemperatureRange {
int mem_min;
int mem_crit_max;
int mem_emergency_max;
+ int sw_ctf_threshold;
};
struct PP_StateValidationBlock {