summary | refs | log | tree | commit | diff
path: root/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
diff options
context:
space:
mode:
author: Philip Yang <Philip.Yang@amd.com> 2022-01-14 03:22:54 +0300
committer: Alex Deucher <alexander.deucher@amd.com> 2022-06-30 22:30:54 +0300
commit: e0f1e65b836c42741288a367eab565167a408b59 (patch)
tree: 66c6dd0ba9fff96cf1fcf305e2dcc5dfbc2f1215 /drivers/gpu/drm/amd/amdkfd/kfd_svm.c
parent: 163a5a58437062ce4dbef2aab6de4d784043bcf5 (diff)
download: linux-e0f1e65b836c42741288a367eab565167a408b59.tar.xz
drm/amdkfd: Add GPU recoverable fault SMI event
Use ktime_get_boottime_ns() as the timestamp so that events can be correlated with other APIs. Output a timestamp when recovery of a GPU recoverable fault starts and when it ends, together with the fault address, whether it was a read or a write fault, and whether the fault was recovered by migration or only by updating the GPU page table.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_svm.c')
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_svm.c 19
1 files changed, 14 insertions, 5 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 7b332246eda3..a3c7dd411b77 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -32,6 +32,7 @@
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
+#include "kfd_smi_events.h"
#ifdef dev_fmt
#undef dev_fmt
@@ -43,7 +44,7 @@
/* Long enough to ensure no retry fault comes after svm range is restored and
* page table is updated.
*/
-#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING 2000
+#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (2UL * NSEC_PER_MSEC)
struct criu_svm_metadata {
struct list_head list;
@@ -1617,7 +1618,7 @@ unreserve_out:
svm_range_unreserve_bos(&ctx);
if (!r)
- prange->validate_timestamp = ktime_to_us(ktime_get());
+ prange->validate_timestamp = ktime_get_boottime();
return r;
}
@@ -2694,11 +2695,12 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
struct svm_range_list *svms;
struct svm_range *prange;
struct kfd_process *p;
- uint64_t timestamp;
+ ktime_t timestamp = ktime_get_boottime();
int32_t best_loc;
int32_t gpuidx = MAX_GPU_INSTANCE;
bool write_locked = false;
struct vm_area_struct *vma;
+ bool migration = false;
int r = 0;
if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
@@ -2775,9 +2777,9 @@ retry_write_locked:
goto out_unlock_range;
}
- timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
/* skip duplicate vm fault on different pages of same range */
- if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
+ if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
+ AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
svms, prange->start, prange->last);
r = 0;
@@ -2813,7 +2815,11 @@ retry_write_locked:
svms, prange->start, prange->last, best_loc,
prange->actual_loc);
+ kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr,
+ write_fault, timestamp);
+
if (prange->actual_loc != best_loc) {
+ migration = true;
if (best_loc) {
r = svm_migrate_to_vram(prange, best_loc, mm);
if (r) {
@@ -2842,6 +2848,9 @@ retry_write_locked:
pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
r, svms, prange->start, prange->last);
+ kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr,
+ migration);
+
out_unlock_range:
mutex_unlock(&prange->migrate_mutex);
out_unlock_svms: