summary refs log tree commit diff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 506
1 files changed, 406 insertions, 100 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8ebab6f22e5a..1adc81a55734 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -122,6 +122,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
+#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
+
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -1045,6 +1047,7 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
struct ras_manager *ras_mgr,
struct ras_err_data *err_data,
+ struct ras_query_context *qctx,
const char *blk_name,
bool is_ue,
bool is_de)
@@ -1052,27 +1055,28 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
struct amdgpu_smuio_mcm_config_info *mcm_info;
struct ras_err_node *err_node;
struct ras_err_info *err_info;
+ u64 event_id = qctx->event_id;
if (is_ue) {
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
if (err_info->ue_count) {
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld new uncorrectable hardware errors detected in %s block\n",
- mcm_info->socket_id,
- mcm_info->die_id,
- err_info->ue_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld new uncorrectable hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->ue_count,
+ blk_name);
}
}
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld uncorrectable hardware errors detected in total in %s block\n",
- mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld uncorrectable hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
}
} else {
@@ -1081,44 +1085,44 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
if (err_info->de_count) {
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld new deferred hardware errors detected in %s block\n",
- mcm_info->socket_id,
- mcm_info->die_id,
- err_info->de_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld new deferred hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->de_count,
+ blk_name);
}
}
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld deferred hardware errors detected in total in %s block\n",
- mcm_info->socket_id, mcm_info->die_id,
- err_info->de_count, blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld deferred hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id,
+ err_info->de_count, blk_name);
}
} else {
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
if (err_info->ce_count) {
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld new correctable hardware errors detected in %s block\n",
- mcm_info->socket_id,
- mcm_info->die_id,
- err_info->ce_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld new correctable hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->ce_count,
+ blk_name);
}
}
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld correctable hardware errors detected in total in %s block\n",
- mcm_info->socket_id, mcm_info->die_id,
- err_info->ce_count, blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld correctable hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id,
+ err_info->ce_count, blk_name);
}
}
}
@@ -1131,77 +1135,79 @@ static inline bool err_data_has_source_info(struct ras_err_data *data)
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
struct ras_query_if *query_if,
- struct ras_err_data *err_data)
+ struct ras_err_data *err_data,
+ struct ras_query_context *qctx)
{
struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
const char *blk_name = get_ras_block_str(&query_if->head);
+ u64 event_id = qctx->event_id;
if (err_data->ce_count) {
if (err_data_has_source_info(err_data)) {
- amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
blk_name, false, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
- dev_info(adev->dev, "socket: %d, die: %d "
- "%ld correctable hardware errors "
- "detected in %s block\n",
- adev->smuio.funcs->get_socket_id(adev),
- adev->smuio.funcs->get_die_id(adev),
- ras_mgr->err_data.ce_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
+ "%ld correctable hardware errors "
+ "detected in %s block\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ ras_mgr->err_data.ce_count,
+ blk_name);
} else {
- dev_info(adev->dev, "%ld correctable hardware errors "
- "detected in %s block\n",
- ras_mgr->err_data.ce_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors "
+ "detected in %s block\n",
+ ras_mgr->err_data.ce_count,
+ blk_name);
}
}
if (err_data->ue_count) {
if (err_data_has_source_info(err_data)) {
- amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
blk_name, true, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
- dev_info(adev->dev, "socket: %d, die: %d "
- "%ld uncorrectable hardware errors "
- "detected in %s block\n",
- adev->smuio.funcs->get_socket_id(adev),
- adev->smuio.funcs->get_die_id(adev),
- ras_mgr->err_data.ue_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
+ "%ld uncorrectable hardware errors "
+ "detected in %s block\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ ras_mgr->err_data.ue_count,
+ blk_name);
} else {
- dev_info(adev->dev, "%ld uncorrectable hardware errors "
- "detected in %s block\n",
- ras_mgr->err_data.ue_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors "
+ "detected in %s block\n",
+ ras_mgr->err_data.ue_count,
+ blk_name);
}
}
if (err_data->de_count) {
if (err_data_has_source_info(err_data)) {
- amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
blk_name, false, true);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
- dev_info(adev->dev, "socket: %d, die: %d "
- "%ld deferred hardware errors "
- "detected in %s block\n",
- adev->smuio.funcs->get_socket_id(adev),
- adev->smuio.funcs->get_die_id(adev),
- ras_mgr->err_data.de_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
+ "%ld deferred hardware errors "
+ "detected in %s block\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ ras_mgr->err_data.de_count,
+ blk_name);
} else {
- dev_info(adev->dev, "%ld deferred hardware errors "
- "detected in %s block\n",
- ras_mgr->err_data.de_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors "
+ "detected in %s block\n",
+ ras_mgr->err_data.de_count,
+ blk_name);
}
}
}
@@ -1244,6 +1250,10 @@ int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
{
struct ras_manager *obj;
+ /* in resume phase, no need to create aca fs node */
+ if (adev->in_suspend || amdgpu_in_reset(adev))
+ return 0;
+
obj = get_ras_manager(adev, blk);
if (!obj)
return -EINVAL;
@@ -1265,7 +1275,8 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
}
static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
- enum aca_error_type type, struct ras_err_data *err_data)
+ enum aca_error_type type, struct ras_err_data *err_data,
+ struct ras_query_context *qctx)
{
struct ras_manager *obj;
@@ -1273,7 +1284,7 @@ static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu
if (!obj)
return -EINVAL;
- return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data);
+ return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx);
}
ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
@@ -1287,13 +1298,14 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
- return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
- "ce", info.ce_count);
+	/* one "<type>: <count>" line each; "de" must report the deferred count */
+	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
+			  "ce", info.ce_count, "de", info.de_count);
}
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
struct ras_query_if *info,
struct ras_err_data *err_data,
+ struct ras_query_context *qctx,
unsigned int error_query_mode)
{
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
@@ -1329,17 +1341,21 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
}
} else {
if (amdgpu_aca_is_enabled(adev)) {
- ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data);
+ ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx);
+ if (ret)
+ return ret;
+
+ ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx);
if (ret)
return ret;
- ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data);
+ ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx);
if (ret)
return ret;
} else {
/* FIXME: add code to check return value later */
- amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
- amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+ amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
+ amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx);
}
}
@@ -1351,6 +1367,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data;
+ struct ras_query_context qctx;
unsigned int error_query_mode;
int ret;
@@ -1364,8 +1381,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
return -EINVAL;
+ memset(&qctx, 0, sizeof(qctx));
+ qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
+ RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
ret = amdgpu_ras_query_error_status_helper(adev, info,
&err_data,
+ &qctx,
error_query_mode);
if (ret)
goto out_fini_err_data;
@@ -1376,7 +1397,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
info->ce_count = obj->err_data.ce_count;
info->de_count = obj->err_data.de_count;
- amdgpu_ras_error_generate_report(adev, info, &err_data);
+ amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
out_fini_err_data:
amdgpu_ras_error_data_fini(&err_data);
@@ -2041,7 +2062,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
}
}
- amdgpu_umc_poison_handler(adev, obj->head.block, false);
+ amdgpu_umc_poison_handler(adev, obj->head.block, 0);
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
@@ -2061,6 +2082,17 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
{
dev_info(obj->adev->dev,
"Poison is created\n");
+
+ if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
+ struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
+
+ amdgpu_ras_put_poison_req(obj->adev,
+ AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
+
+ atomic_inc(&con->page_retirement_req_cnt);
+
+ wake_up(&con->page_retirement_wq);
+ }
}
static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
@@ -2371,7 +2403,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
};
status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
- data->bps[i].retired_page);
+ data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
if (status == -EBUSY)
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
else if (status == -ENOENT)
@@ -2384,6 +2416,19 @@ out:
return ret;
}
+/*
+ * Apply the fatal error status @status through amdgpu_ras_set_fed(): to
+ * every device on @hive's device list when a hive is given, otherwise to
+ * @adev alone.
+ */
+static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
+				   struct amdgpu_hive_info *hive, bool status)
+{
+	struct amdgpu_device *node;
+
+	/* no hive: only the calling device is affected */
+	if (!hive) {
+		amdgpu_ras_set_fed(adev, status);
+		return;
+	}
+
+	list_for_each_entry(node, &hive->device_list, gmc.xgmi.head)
+		amdgpu_ras_set_fed(node, status);
+}
+
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
struct amdgpu_ras *ras =
@@ -2393,8 +2438,21 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
struct list_head device_list, *device_list_handle = NULL;
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
- if (hive)
+ if (hive) {
atomic_set(&hive->ras_recovery, 1);
+
+ /* If any device which is part of the hive received RAS fatal
+ * error interrupt, set fatal error status on all. This
+ * condition will need a recovery, and flag will be cleared
+ * as part of recovery.
+ */
+ list_for_each_entry(remote_adev, &hive->device_list,
+ gmc.xgmi.head)
+ if (amdgpu_ras_get_fed_status(remote_adev)) {
+ amdgpu_ras_set_fed_all(adev, hive, true);
+ break;
+ }
+ }
if (!ras->disable_ras_err_cnt_harvest) {
/* Build list of devices to query RAS related errors */
@@ -2439,18 +2497,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- /* For any RAS error that needs a full reset to
- * recover, set the fatal error status
- */
- if (hive) {
- list_for_each_entry(remote_adev,
- &hive->device_list,
- gmc.xgmi.head)
- amdgpu_ras_set_fed(remote_adev,
- true);
- } else {
- amdgpu_ras_set_fed(adev, true);
- }
psp_fatal_error_recovery_quirk(&adev->psp);
}
}
@@ -2516,9 +2562,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
goto out;
}
- amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
- bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
- AMDGPU_GPU_PAGE_SIZE);
+ amdgpu_ras_reserve_page(adev, bps[i].retired_page);
memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
data->count++;
@@ -2674,10 +2718,167 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
}
}
+/*
+ * Build a poison message from the arguments and push it onto the RAS
+ * context's poison fifo.
+ *
+ * Returns 0 when the message was queued, or -ENOSPC (after logging an
+ * error) when kfifo_put() reports that nothing was stored.
+ */
+int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
+		enum amdgpu_ras_block block, uint16_t pasid,
+		pasid_notify pasid_fn, void *data, uint32_t reset)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_poison_msg msg;
+
+	/* start from an all-zero message, then fill in the request */
+	memset(&msg, 0, sizeof(msg));
+	msg.block = block;
+	msg.pasid = pasid;
+	msg.reset = reset;
+	msg.pasid_fn = pasid_fn;
+	msg.data = data;
+
+	if (kfifo_put(&con->poison_fifo, msg))
+		return 0;
+
+	dev_err(adev->dev, "Poison message fifo is full!\n");
+	return -ENOSPC;
+}
+
+/*
+ * Pop one message from the RAS context's poison fifo into @poison_msg.
+ *
+ * Returns the result of kfifo_get(); the caller in this file treats zero
+ * as "no message was fetched".
+ */
+static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
+		struct ras_poison_msg *poison_msg)
+{
+	struct amdgpu_ras *ras_ctx = amdgpu_ras_get_context(adev);
+
+	return kfifo_get(&ras_ctx->poison_fifo, poison_msg);
+}
+
+/*
+ * Prepare an ECC log for use: clear the "deferred error updated" flag,
+ * initialise the lock and the deferred-error page tree, and fill the
+ * siphash key with a fixed byte pattern.
+ */
+static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
+{
+	ecc_log->de_updated = false;
+
+	mutex_init(&ecc_log->lock);
+	INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
+
+	/* Set any value as siphash key */
+	memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));
+}
+
+/*
+ * Tear down an ECC log: under the log lock, free every record stored in
+ * the deferred-error page tree (and the pfn array each record holds) and
+ * delete its slot; then clear the update flag and destroy the lock.
+ */
+static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
+{
+	struct radix_tree_iter cursor;
+	struct ras_ecc_err *entry;
+	void __rcu **slot;
+
+	mutex_lock(&ecc_log->lock);
+	radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &cursor, 0) {
+		entry = radix_tree_deref_slot(slot);
+		/* release the pfn array before the record that points to it */
+		kfree(entry->err_pages.pfn);
+		kfree(entry);
+		radix_tree_iter_delete(&ecc_log->de_page_tree, &cursor, slot);
+	}
+	mutex_unlock(&ecc_log->lock);
+
+	ecc_log->de_updated = false;
+	mutex_destroy(&ecc_log->lock);
+}
+
+/*
+ * Delayed-work handler for page retirement.
+ *
+ * Does nothing while the device is in reset or RAS recovery is in
+ * progress.  Otherwise it hands a freshly initialised error-data object
+ * to amdgpu_umc_handle_bad_pages() and, while the UMC ECC log's tree still
+ * carries UMC_ECC_NEW_DETECTED_TAG entries, re-arms itself to run again
+ * after AMDGPU_RAS_RETIRE_PAGE_INTERVAL milliseconds.
+ */
+static void amdgpu_ras_do_page_retirement(struct work_struct *work)
+{
+	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+					      page_retirement_dwork.work);
+	struct amdgpu_device *adev = con->adev;
+	struct ras_ecc_log_info *log = &con->umc_ecc_log;
+	struct ras_err_data scratch;
+
+	if (amdgpu_in_reset(adev))
+		return;
+	if (atomic_read(&con->in_recovery))
+		return;
+
+	amdgpu_ras_error_data_init(&scratch);
+	amdgpu_umc_handle_bad_pages(adev, &scratch);
+	amdgpu_ras_error_data_fini(&scratch);
+
+	/* the tag check and the re-arm both happen under the log lock */
+	mutex_lock(&log->lock);
+	if (radix_tree_tagged(&log->de_page_tree, UMC_ECC_NEW_DETECTED_TAG))
+		schedule_delayed_work(&con->page_retirement_dwork,
+			msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
+	mutex_unlock(&log->lock);
+}
+
+/*
+ * Query the RAS error status of @ras_block until the UMC ECC log reports a
+ * deferred-error update or the retry budget is used up.
+ *
+ * The de_updated flag is cleared before the first query.  After every
+ * successful query that leaves the flag clear, the function sleeps 1 ms
+ * and consumes one unit of @timeout_ms.  With @timeout_ms == 0 exactly one
+ * query is issued.
+ *
+ * Returns 0 on success, the error from amdgpu_ras_query_error_status() if
+ * a query fails, or -ETIMEDOUT when a non-zero budget ran down to zero.
+ */
+static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+		enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	struct ras_ecc_log_info *ecc_log = &ras->umc_ecc_log;
+	uint32_t remaining = timeout_ms;
+	struct ras_query_if query;
+	int err;
+
+	memset(&query, 0, sizeof(query));
+	query.head.block = ras_block;
+
+	ecc_log->de_updated = false;
+	for (;;) {
+		err = amdgpu_ras_query_error_status(adev, &query);
+		if (err) {
+			dev_err(adev->dev, "Failed to query ras error! ret:%d\n", err);
+			return err;
+		}
+
+		/* nothing logged yet: wait a tick and spend one retry */
+		if (remaining && !ecc_log->de_updated) {
+			msleep(1);
+			remaining--;
+		}
+
+		if (!remaining || ecc_log->de_updated)
+			break;
+	}
+
+	if (timeout_ms && !remaining) {
+		dev_warn(adev->dev, "Can't find deferred error\n");
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+/*
+ * Handle a poison creation event: query the UMC ECC status with the given
+ * @timeout and, only if that query succeeds, schedule the page retirement
+ * work to run immediately.
+ */
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+				uint32_t timeout)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+	if (amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout))
+		return;
+
+	schedule_delayed_work(&con->page_retirement_dwork, 0);
+}
+
+/*
+ * Handle a poison consumption message.
+ *
+ * Sets the SRAM ECC flag via kgd2kfd_set_sram_ecc_flag(), calls the
+ * message's pasid callback when one is present, and, if the message
+ * carries non-zero reset flags, waits for pending page retirement work,
+ * ORs those flags into the context's gpu_reset_flags and requests a GPU
+ * reset.
+ *
+ * Always returns 0.
+ */
+static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
+			struct ras_poison_msg *poison_msg)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	/* snapshot the message fields before the callback runs */
+	const uint32_t reset_flags = poison_msg->reset;
+	const uint16_t pasid = poison_msg->pasid;
+
+	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
+	if (poison_msg->pasid_fn)
+		poison_msg->pasid_fn(adev, pasid, poison_msg->data);
+
+	if (!reset_flags)
+		return 0;
+
+	flush_delayed_work(&con->page_retirement_dwork);
+
+	con->gpu_reset_flags |= reset_flags;
+	amdgpu_ras_reset_gpu(adev);
+
+	return 0;
+}
+
static int amdgpu_ras_page_retirement_thread(void *param)
{
struct amdgpu_device *adev = (struct amdgpu_device *)param;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_poison_msg poison_msg;
+ enum amdgpu_ras_block ras_block;
+ bool poison_creation_is_handled = false;
while (!kthread_should_stop()) {
@@ -2688,13 +2889,34 @@ static int amdgpu_ras_page_retirement_thread(void *param)
if (kthread_should_stop())
break;
- dev_info(adev->dev, "Start processing page retirement. request:%d\n",
- atomic_read(&con->page_retirement_req_cnt));
-
atomic_dec(&con->page_retirement_req_cnt);
- amdgpu_umc_bad_page_polling_timeout(adev,
- false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
+ if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
+ continue;
+
+ ras_block = poison_msg.block;
+
+ dev_info(adev->dev, "Start processing ras block %s(%d)\n",
+ ras_block_str(ras_block), ras_block);
+
+ if (ras_block == AMDGPU_RAS_BLOCK__UMC) {
+ amdgpu_ras_poison_creation_handler(adev,
+ MAX_UMC_POISON_POLLING_TIME_ASYNC);
+ poison_creation_is_handled = true;
+ } else {
+ /* poison_creation_is_handled:
+ * false: no poison creation interrupt, but it has poison
+ * consumption interrupt.
+ * true: It has poison creation interrupt at the beginning,
+ * but it has no poison creation interrupt later.
+ */
+ amdgpu_ras_poison_creation_handler(adev,
+ poison_creation_is_handled ?
+ 0 : MAX_UMC_POISON_POLLING_TIME_ASYNC);
+
+ amdgpu_ras_poison_consumption_handler(adev, &poison_msg);
+ poison_creation_is_handled = false;
+ }
}
return 0;
@@ -2763,6 +2985,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
}
}
+ mutex_init(&con->page_rsv_lock);
+ INIT_KFIFO(con->poison_fifo);
mutex_init(&con->page_retirement_lock);
init_waitqueue_head(&con->page_retirement_wq);
atomic_set(&con->page_retirement_req_cnt, 0);
@@ -2773,6 +2997,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
}
+ INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
+ amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
#ifdef CONFIG_X86_MCE_AMD
if ((adev->asic_type == CHIP_ALDEBARAN) &&
(adev->gmc.xgmi.connected_to_cpu))
@@ -2813,8 +3039,14 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
atomic_set(&con->page_retirement_req_cnt, 0);
+ mutex_destroy(&con->page_rsv_lock);
+
cancel_work_sync(&con->recovery_work);
+ cancel_delayed_work_sync(&con->page_retirement_dwork);
+
+ amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
+
mutex_lock(&con->recovery_lock);
con->eh_data = NULL;
kfree(data->bps);
@@ -3036,6 +3268,35 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
AMDGPU_RAS_ERROR__PARITY;
}
+/* Reset every sequence number held by the event manager to zero. */
+static void ras_event_mgr_init(struct ras_event_manager *mgr)
+{
+	int idx = 0;
+
+	while (idx < ARRAY_SIZE(mgr->seqnos)) {
+		atomic64_set(&mgr->seqnos[idx], 0);
+		idx++;
+	}
+}
+
+/*
+ * Bind the device's RAS context to an event manager: the hive's manager
+ * when the device belongs to an XGMI hive, otherwise the context's own
+ * __event_mgr.  Does nothing when the device has no RAS context.
+ *
+ * Outside of a GPU reset the manager's sequence numbers are cleared as
+ * well; within a hive only the device with node_id 0 does so.
+ */
+static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	struct amdgpu_hive_info *hive;
+
+	if (!ras)
+		return;
+
+	hive = amdgpu_get_xgmi_hive(adev);
+	if (hive)
+		ras->event_mgr = &hive->event_mgr;
+	else
+		ras->event_mgr = &ras->__event_mgr;
+
+	/* init event manager with node 0 on xgmi system */
+	if (!amdgpu_in_reset(adev) &&
+	    (!hive || adev->gmc.xgmi.node_id == 0))
+		ras_event_mgr_init(ras->event_mgr);
+
+	/* drop the hive reference obtained by the lookup */
+	if (hive)
+		amdgpu_put_xgmi_hive(hive);
+}
+
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -3356,6 +3617,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
+ amdgpu_ras_event_mgr_init(adev);
+
if (amdgpu_aca_is_enabled(adev)) {
if (amdgpu_in_reset(adev))
r = amdgpu_aca_reset(adev);
@@ -3472,14 +3735,39 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
atomic_set(&ras->fed, !!status);
}
+/*
+ * An event id is valid when bit 63 is clear; ids handed out for invalid
+ * event types have that bit set.  @adev is not used.
+ */
+bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id)
+{
+	return (id & BIT_ULL(63)) == 0;
+}
+
+/*
+ * Return the current event id for @type.
+ *
+ * For RAS_EVENT_TYPE_ISR this is the present value of the matching
+ * sequence number in the bound event manager.  Every other type, and an
+ * ISR request made while no RAS context or event manager is available,
+ * yields the invalid id (bit 63 set).
+ */
+u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	u64 id;
+
+	switch (type) {
+	case RAS_EVENT_TYPE_ISR:
+		/*
+		 * event_mgr is only assigned by amdgpu_ras_event_mgr_init(),
+		 * which is skipped without a RAS context and on SR-IOV VFs;
+		 * do not dereference it blindly.
+		 */
+		if (!ras || !ras->event_mgr) {
+			id = BIT_ULL(63) | 0ULL;
+			break;
+		}
+		id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]);
+		break;
+	case RAS_EVENT_TYPE_INVALID:
+	default:
+		id = BIT_ULL(63) | 0ULL;
+		break;
+	}
+
+	return id;
+}
+
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ u64 event_id = (u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]);
- dev_info(adev->dev, "uncorrectable hardware error"
- "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
+ RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
+ "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
+ amdgpu_ras_set_fed(adev, true);
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
amdgpu_ras_reset_gpu(adev);
}
@@ -3998,6 +4286,8 @@ void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_a
{
struct ras_err_addr *mca_err_addr;
+ /* This function will be retired. */
+ return;
mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
if (!mca_err_addr)
return;
@@ -4195,3 +4485,19 @@ void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
amdgpu_ras_boot_time_error_reporting(adev, i, boot_error);
}
}
+
+/*
+ * Reserve the GPU page with frame number @pfn in the VRAM manager.
+ *
+ * Under page_rsv_lock the page's status is queried first; a range of
+ * AMDGPU_GPU_PAGE_SIZE bytes is reserved only when that query returns
+ * -ENOENT.
+ *
+ * Returns the reservation result when a reservation was attempted,
+ * otherwise the status returned by the query.
+ */
+int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct amdgpu_vram_mgr *vram = &adev->mman.vram_mgr;
+	uint64_t addr = pfn << AMDGPU_GPU_PAGE_SHIFT;
+	int status;
+
+	mutex_lock(&con->page_rsv_lock);
+
+	status = amdgpu_vram_mgr_query_page_status(vram, addr);
+	if (status == -ENOENT)
+		status = amdgpu_vram_mgr_reserve_range(vram, addr,
+						       AMDGPU_GPU_PAGE_SIZE);
+
+	mutex_unlock(&con->page_rsv_lock);
+
+	return status;
+}