author    Linus Torvalds <torvalds@linux-foundation.org>    2024-03-15 23:03:13 +0300
committer Linus Torvalds <torvalds@linux-foundation.org>    2024-03-15 23:03:13 +0300
commit    4f712ee0cbbd5c777d270427092bb301fc31044f (patch)
tree      21feb90dbb43d3d771249558b090404b8eedc7c2 /virt/kvm
parent    8a2fbffcbfcb60378626e5d4144a6ff43f3b6776 (diff)
parent    4781179012d9380005649b0fe07f77dcaa2610e3 (diff)
download  linux-4f712ee0cbbd5c777d270427092bb301fc31044f.tar.xz
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
 "S390:

   - Changes to FPU handling came in via the main s390 pull request

   - Only deliver to the guest the SCLP events that userspace has requested

   - More virtual vs physical address fixes (only a cleanup since virtual
     and physical address spaces are currently the same)

   - Fix selftests undefined behavior

  x86:

   - Fix a restriction that the guest can't program a PMU event whose
     encoding matches an architectural event that isn't included in the
     guest CPUID. The enumeration of an architectural event only says that
     if a CPU supports an architectural event, then the event can be
     programmed *using the architectural encoding*. The enumeration does
     NOT say anything about the encoding when the CPU doesn't report
     support for the event *in general*. It might support it, and it might
     support it using the same encoding that made it into the
     architectural PMU spec

   - Fix a variety of bugs in KVM's emulation of RDPMC (more details on
     individual commits) and add a selftest to verify KVM correctly
     emulates RDPMC, counter availability, and a variety of other
     PMC-related behaviors that depend on guest CPUID and therefore are
     easier to validate with selftests than with custom guests (aka
     kvm-unit-tests)

   - Zero out PMU state on AMD if the virtual PMU is disabled; it does not
     cause any bug, but it wastes time in various cases where KVM would
     check if a PMC event needs to be synthesized

   - Optimize triggering of emulated events, with a nice ~10% performance
     improvement in VM-Exit microbenchmarks when a vPMU is exposed to the
     guest

   - Tighten the check for "PMI in guest" to reduce false positives if an
     NMI arrives in the host while KVM is handling an IRQ VM-Exit

   - Fix a bug where KVM would report stale/bogus exit qualification
     information when exiting to userspace with an internal error exit
     code

   - Add a VMX flag in /proc/cpuinfo to report 5-level EPT support

   - Rework TDP MMU root unload, free, and alloc to run with mmu_lock held
     for read, e.g. to avoid serializing vCPUs when userspace deletes a
     memslot

   - Tear down TDP MMU page tables at 4KiB granularity (used to be 1GiB).
     KVM doesn't support yielding in the middle of processing a zap, and
     1GiB granularity resulted in multi-millisecond lags that are quite
     impolite for CONFIG_PREEMPT kernels

   - Allocate write-tracking metadata on-demand to avoid the memory
     overhead when a kernel is built with i915 virtualization support but
     the workloads use neither shadow paging nor i915 virtualization

   - Explicitly initialize a variety of on-stack variables in the emulator
     that triggered KMSAN false positives

   - Fix the debugregs ABI for 32-bit KVM

   - Rework the "force immediate exit" code so that vendor code ultimately
     decides how and when to force the exit, which allowed some
     optimization for both Intel and AMD

   - Fix a long-standing bug where kvm_has_noapic_vcpu could be left
     elevated if vCPU creation ultimately failed, causing extra
     unnecessary work

   - Cleanup the logic for checking if the currently loaded vCPU is
     in-kernel

   - Harden against underflowing the active mmu_notifier invalidation
     count, so that "bad" invalidations (usually due to bugs elsewhere in
     the kernel) are detected earlier and are less likely to hang the
     kernel

  x86 Xen emulation:

   - Overlay pages can now be cached based on host virtual address,
     instead of guest physical addresses. This removes the need to
     reconfigure and invalidate the cache if the guest changes the gpa but
     the underlying host virtual address remains the same

   - When possible, use a single host TSC value when computing the
     deadline for Xen timers in order to improve the accuracy of the timer
     emulation

   - Inject pending upcall events when the vCPU software-enables its APIC
     to fix a bug where an upcall can be lost (and to follow Xen's
     behavior)

   - Fall back to the slow path instead of warning if "fast" IRQ delivery
     of Xen events fails, e.g. if the guest has aliased xAPIC IDs

  RISC-V:

   - Support exception and interrupt handling in selftests

   - New self test for RISC-V architectural timer (Sstc extension)

   - New extension support (Ztso, Zacas)

   - Support userspace emulation of random number seed CSRs

  ARM:

   - Infrastructure for building KVM's trap configuration based on the
     architectural features (or lack thereof) advertised in the VM's ID
     registers

   - Support for mapping vfio-pci BARs as Normal-NC (vaguely similar to
     x86's WC) at stage-2, improving the performance of interacting with
     assigned devices that can tolerate it

   - Conversion of KVM's representation of LPIs to an xarray, utilized to
     address some of the serialization on the LPI injection path

   - Support for _architectural_ VHE-only systems, advertised through the
     absence of FEAT_E2H0 in the CPU's ID register

   - Miscellaneous cleanups, fixes, and spelling corrections to KVM and
     selftests

  LoongArch:

   - Set reserved bits as zero in CPUCFG

   - Start SW timer only when vcpu is blocking

   - Do not restart SW timer when it is expired

   - Remove unnecessary CSR register saving during enter guest

   - Misc cleanups and fixes as usual

  Generic:

   - Clean up Kconfig by removing CONFIG_HAVE_KVM, which was basically
     always true on all architectures except MIPS (where Kconfig
     determines availability depending on CPU capabilities). It is
     replaced by an architecture-dependent symbol for MIPS, and by
     IS_ENABLED(CONFIG_KVM) everywhere else

   - Factor common "select" statements into common code instead of
     requiring each architecture to specify them

   - Remove thoroughly obsolete APIs from the uapi headers

   - Move architecture-dependent stuff to uapi/asm/kvm.h

   - Always flush the async page fault workqueue when a work item is being
     removed, especially during vCPU destruction, to ensure that there are
     no workers running in KVM code when all references to KVM-the-module
     are gone, i.e. to prevent a very unlikely use-after-free if kvm.ko is
     unloaded

   - Grab a reference to the VM's mm_struct in the async #PF worker itself
     instead of gifting the worker a reference, so that there's no need to
     remember to *conditionally* clean up after the worker

  Selftests:

   - Reduce boilerplate, especially when utilizing the selftest TAP
     infrastructure

   - Add basic smoke tests for SEV and SEV-ES, along with a pile of
     library support for handling private/encrypted/protected memory

   - Fix benign bugs where tests neglect to close() guest_memfd files"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (246 commits)
  selftests: kvm: remove meaningless assignments in Makefiles
  KVM: riscv: selftests: Add Zacas extension to get-reg-list test
  RISC-V: KVM: Allow Zacas extension for Guest/VM
  KVM: riscv: selftests: Add Ztso extension to get-reg-list test
  RISC-V: KVM: Allow Ztso extension for Guest/VM
  RISC-V: KVM: Forward SEED CSR access to user space
  KVM: riscv: selftests: Add sstc timer test
  KVM: riscv: selftests: Change vcpu_has_ext to a common function
  KVM: riscv: selftests: Add guest helper to get vcpu id
  KVM: riscv: selftests: Add exception handling support
  LoongArch: KVM: Remove unnecessary CSR register saving during enter guest
  LoongArch: KVM: Do not restart SW timer when it is expired
  LoongArch: KVM: Start SW timer only when vcpu is blocking
  LoongArch: KVM: Set reserved bits as zero in CPUCFG
  KVM: selftests: Explicitly close guest_memfd files in some gmem tests
  KVM: x86/xen: fix recursive deadlock in timer injection
  KVM: pfncache: simplify locking and make more self-contained
  KVM: x86/xen: remove WARN_ON_ONCE() with false positives in evtchn delivery
  KVM: x86/xen: inject vCPU upcall vector when local APIC is enabled
  KVM: x86/xen: improve accuracy of Xen timers
  ...
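Editor's note: a minimal usage sketch of the HVA-based pfn cache described above, using the interfaces reworked in virt/kvm/pfncache.c below. The wrapper function, its error handling, and the one-byte access are illustrative only; the real consumers live in arch code (e.g. the x86 Xen emulation) and are not part of this diff.

/*
 * Illustrative sketch only: drive an HVA-based gfn_to_pfn_cache with the
 * interfaces touched by this pull. The helper name and error handling are
 * hypothetical, not taken from the tree.
 */
static int example_read_via_hva_cache(struct kvm *kvm,
				      struct gfn_to_pfn_cache *gpc,
				      unsigned long uhva, u8 *out)
{
	unsigned long flags;
	int ret;

	kvm_gpc_init(gpc, kvm);			/* vcpu/usage arguments are gone */

	ret = kvm_gpc_activate_hva(gpc, uhva, sizeof(*out));
	if (ret)
		return ret;

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, sizeof(*out))) {
		read_unlock_irqrestore(&gpc->lock, flags);

		/* Refresh outside of gpc->lock; mapping the pfn may sleep. */
		ret = kvm_gpc_refresh(gpc, sizeof(*out));
		if (ret) {
			kvm_gpc_deactivate(gpc);
			return ret;
		}
		read_lock_irqsave(&gpc->lock, flags);
	}

	*out = *(u8 *)gpc->khva;		/* access the cached mapping */
	read_unlock_irqrestore(&gpc->lock, flags);
	return 0;
}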
Diffstat (limited to 'virt/kvm')
-rw-r--r--  virt/kvm/Kconfig        7
-rw-r--r--  virt/kvm/async_pf.c    73
-rw-r--r--  virt/kvm/kvm_main.c    37
-rw-r--r--  virt/kvm/pfncache.c   251
4 files changed, 223 insertions, 145 deletions
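Editor's note on the locking style in the pfncache.c changes below: explicit mutex_lock()/mutex_unlock() pairs on gpc->refresh_lock are replaced with the scope-based guard() helper from <linux/cleanup.h>. A minimal sketch of that pattern follows; the lock and function here are illustrative, not taken from this commit.

#include <linux/cleanup.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);	/* illustrative lock, not from this commit */

static int example_update(int *shared, int val)
{
	/*
	 * guard(mutex) acquires the mutex immediately and drops it
	 * automatically when the enclosing scope is left, on every return
	 * path, so the early return below cannot leak the lock.
	 */
	guard(mutex)(&example_lock);

	if (val < 0)
		return -EINVAL;

	*shared = val;
	return 0;
}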
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 184dab4ee871..29b73eedfe74 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -1,9 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
# KVM common configuration items and defaults
-config HAVE_KVM
- bool
-
config KVM_COMMON
bool
select EVENTFD
@@ -55,6 +52,9 @@ config KVM_ASYNC_PF_SYNC
config HAVE_KVM_MSI
bool
+config HAVE_KVM_READONLY_MEM
+ bool
+
config HAVE_KVM_CPU_RELAX_INTERCEPT
bool
@@ -73,6 +73,7 @@ config KVM_COMPAT
config HAVE_KVM_IRQ_BYPASS
bool
+ select IRQ_BYPASS_MANAGER
config HAVE_KVM_VCPU_ASYNC_IOCTL
bool
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index e033c79d528e..99a63bad0306 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -46,8 +46,8 @@ static void async_pf_execute(struct work_struct *work)
{
struct kvm_async_pf *apf =
container_of(work, struct kvm_async_pf, work);
- struct mm_struct *mm = apf->mm;
struct kvm_vcpu *vcpu = apf->vcpu;
+ struct mm_struct *mm = vcpu->kvm->mm;
unsigned long addr = apf->addr;
gpa_t cr2_or_gpa = apf->cr2_or_gpa;
int locked = 1;
@@ -56,15 +56,24 @@ static void async_pf_execute(struct work_struct *work)
might_sleep();
/*
- * This work is run asynchronously to the task which owns
- * mm and might be done in another context, so we must
- * access remotely.
+ * Attempt to pin the VM's host address space, and simply skip gup() if
+ * acquiring a pin fails, i.e. if the process is exiting. Note, KVM
+ * holds a reference to its associated mm_struct until the very end of
+ * kvm_destroy_vm(), i.e. the struct itself won't be freed before this
+ * work item is fully processed.
*/
- mmap_read_lock(mm);
- get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked);
- if (locked)
- mmap_read_unlock(mm);
+ if (mmget_not_zero(mm)) {
+ mmap_read_lock(mm);
+ get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked);
+ if (locked)
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
+ /*
+ * Notify and kick the vCPU even if faulting in the page failed, e.g.
+ * so that the vCPU can retry the fault synchronously.
+ */
if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC))
kvm_arch_async_page_present(vcpu, apf);
@@ -74,20 +83,39 @@ static void async_pf_execute(struct work_struct *work)
apf->vcpu = NULL;
spin_unlock(&vcpu->async_pf.lock);
- if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
- kvm_arch_async_page_present_queued(vcpu);
-
/*
- * apf may be freed by kvm_check_async_pf_completion() after
- * this point
+ * The apf struct may be freed by kvm_check_async_pf_completion() as
+ * soon as the lock is dropped. Nullify it to prevent improper usage.
*/
+ apf = NULL;
+
+ if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
+ kvm_arch_async_page_present_queued(vcpu);
trace_kvm_async_pf_completed(addr, cr2_or_gpa);
__kvm_vcpu_wake_up(vcpu);
+}
- mmput(mm);
- kvm_put_kvm(vcpu->kvm);
+static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work)
+{
+ /*
+ * The async #PF is "done", but KVM must wait for the work item itself,
+ * i.e. async_pf_execute(), to run to completion. If KVM is a module,
+ * KVM must ensure *no* code owned by KVM (the module) can be run
+ * after the last call to module_put(). Note, flushing the work item
+ * is always required when the item is taken off the completion queue.
+ * E.g. even if the vCPU handles the item in the "normal" path, the VM
+ * could be terminated before async_pf_execute() completes.
+ *
+ * Wake-all events skip the queue and go straight to "done", i.e. don't
+ * need to be flushed (but sanity check that the work wasn't queued).
+ */
+ if (work->wakeup_all)
+ WARN_ON_ONCE(work->work.func);
+ else
+ flush_work(&work->work);
+ kmem_cache_free(async_pf_cache, work);
}
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
@@ -112,11 +140,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
#ifdef CONFIG_KVM_ASYNC_PF_SYNC
flush_work(&work->work);
#else
- if (cancel_work_sync(&work->work)) {
- mmput(work->mm);
- kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
+ if (cancel_work_sync(&work->work))
kmem_cache_free(async_pf_cache, work);
- }
#endif
spin_lock(&vcpu->async_pf.lock);
}
@@ -126,7 +151,10 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
list_first_entry(&vcpu->async_pf.done,
typeof(*work), link);
list_del(&work->link);
- kmem_cache_free(async_pf_cache, work);
+
+ spin_unlock(&vcpu->async_pf.lock);
+ kvm_flush_and_free_async_pf_work(work);
+ spin_lock(&vcpu->async_pf.lock);
}
spin_unlock(&vcpu->async_pf.lock);
@@ -151,7 +179,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
list_del(&work->queue);
vcpu->async_pf.queued--;
- kmem_cache_free(async_pf_cache, work);
+ kvm_flush_and_free_async_pf_work(work);
}
}
@@ -184,9 +212,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
work->cr2_or_gpa = cr2_or_gpa;
work->addr = hva;
work->arch = *arch;
- work->mm = current->mm;
- mmget(work->mm);
- kvm_get_kvm(work->vcpu->kvm);
INIT_WORK(&work->work, async_pf_execute);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0f50960b0e3a..fb49c2a60200 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -421,7 +421,7 @@ int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity,
if (WARN_ON_ONCE(!capacity))
return -EIO;
- mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
+ mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
if (!mc->objects)
return -ENOMEM;
@@ -890,7 +890,9 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
/* Pairs with the increment in range_start(). */
spin_lock(&kvm->mn_invalidate_lock);
- wake = (--kvm->mn_active_invalidate_count == 0);
+ if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
+ --kvm->mn_active_invalidate_count;
+ wake = !kvm->mn_active_invalidate_count;
spin_unlock(&kvm->mn_invalidate_lock);
/*
@@ -1150,10 +1152,7 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
&stat_fops_per_vm);
}
- ret = kvm_arch_create_vm_debugfs(kvm);
- if (ret)
- goto out_err;
-
+ kvm_arch_create_vm_debugfs(kvm);
return 0;
out_err:
kvm_destroy_vm_debugfs(kvm);
@@ -1183,9 +1182,8 @@ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
* Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
* a per-arch destroy interface is not needed.
*/
-int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
+void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
- return 0;
}
static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
@@ -1614,7 +1612,7 @@ static int check_memory_region_flags(struct kvm *kvm,
if (mem->flags & KVM_MEM_GUEST_MEMFD)
valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
-#ifdef __KVM_HAVE_READONLY_MEM
+#ifdef CONFIG_HAVE_KVM_READONLY_MEM
/*
* GUEST_MEMFD is incompatible with read-only memslots, as writes to
* read-only memslots have emulated MMIO, not page fault, semantics,
@@ -4048,6 +4046,18 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
return false;
}
+/*
+ * By default, simply query the target vCPU's current mode when checking if a
+ * vCPU was preempted in kernel mode. All architectures except x86 (or more
+ * specifically, except VMX) allow querying whether or not a vCPU is in kernel
+ * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
+ * directly for cross-vCPU checks is functionally correct and accurate.
+ */
+bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_vcpu_in_kernel(vcpu);
+}
+
bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
return false;
@@ -4084,9 +4094,16 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
continue;
if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
continue;
+
+ /*
+ * Treat the target vCPU as being in-kernel if it has a
+ * pending interrupt, as the vCPU trying to yield may
+ * be spinning waiting on IPI delivery, i.e. the target
+ * vCPU is in-kernel for the purposes of directed yield.
+ */
if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
!kvm_arch_dy_has_pending_interrupt(vcpu) &&
- !kvm_arch_vcpu_in_kernel(vcpu))
+ !kvm_arch_vcpu_preempted_in_kernel(vcpu))
continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 2d6aba677830..4e07112a24c2 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -25,55 +25,36 @@
void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
unsigned long end, bool may_block)
{
- DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
struct gfn_to_pfn_cache *gpc;
- bool evict_vcpus = false;
spin_lock(&kvm->gpc_lock);
list_for_each_entry(gpc, &kvm->gpc_list, list) {
- write_lock_irq(&gpc->lock);
+ read_lock_irq(&gpc->lock);
/* Only a single page so no need to care about length */
if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
gpc->uhva >= start && gpc->uhva < end) {
- gpc->valid = false;
+ read_unlock_irq(&gpc->lock);
/*
- * If a guest vCPU could be using the physical address,
- * it needs to be forced out of guest mode.
+ * There is a small window here where the cache could
+ * be modified, and invalidation would no longer be
+ * necessary. Hence check again whether invalidation
+ * is still necessary once the write lock has been
+ * acquired.
*/
- if (gpc->usage & KVM_GUEST_USES_PFN) {
- if (!evict_vcpus) {
- evict_vcpus = true;
- bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
- }
- __set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
- }
- }
- write_unlock_irq(&gpc->lock);
- }
- spin_unlock(&kvm->gpc_lock);
-
- if (evict_vcpus) {
- /*
- * KVM needs to ensure the vCPU is fully out of guest context
- * before allowing the invalidation to continue.
- */
- unsigned int req = KVM_REQ_OUTSIDE_GUEST_MODE;
- bool called;
- /*
- * If the OOM reaper is active, then all vCPUs should have
- * been stopped already, so perform the request without
- * KVM_REQUEST_WAIT and be sad if any needed to be IPI'd.
- */
- if (!may_block)
- req &= ~KVM_REQUEST_WAIT;
-
- called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
+ write_lock_irq(&gpc->lock);
+ if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+ gpc->uhva >= start && gpc->uhva < end)
+ gpc->valid = false;
+ write_unlock_irq(&gpc->lock);
+ continue;
+ }
- WARN_ON_ONCE(called && !may_block);
+ read_unlock_irq(&gpc->lock);
}
+ spin_unlock(&kvm->gpc_lock);
}
bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
@@ -83,10 +64,17 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
if (!gpc->active)
return false;
- if ((gpc->gpa & ~PAGE_MASK) + len > PAGE_SIZE)
+ /*
+ * If the page was cached from a memslot, make sure the memslots have
+ * not been re-configured.
+ */
+ if (!kvm_is_error_gpa(gpc->gpa) && gpc->generation != slots->generation)
+ return false;
+
+ if (kvm_is_error_hva(gpc->uhva))
return false;
- if (gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva))
+ if (offset_in_page(gpc->uhva) + len > PAGE_SIZE)
return false;
if (!gpc->valid)
@@ -94,19 +82,33 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
return true;
}
-EXPORT_SYMBOL_GPL(kvm_gpc_check);
-static void gpc_unmap_khva(kvm_pfn_t pfn, void *khva)
+static void *gpc_map(kvm_pfn_t pfn)
{
- /* Unmap the old pfn/page if it was mapped before. */
- if (!is_error_noslot_pfn(pfn) && khva) {
- if (pfn_valid(pfn))
- kunmap(pfn_to_page(pfn));
+ if (pfn_valid(pfn))
+ return kmap(pfn_to_page(pfn));
+
#ifdef CONFIG_HAS_IOMEM
- else
- memunmap(khva);
+ return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+#else
+ return NULL;
#endif
+}
+
+static void gpc_unmap(kvm_pfn_t pfn, void *khva)
+{
+ /* Unmap the old pfn/page if it was mapped before. */
+ if (is_error_noslot_pfn(pfn) || !khva)
+ return;
+
+ if (pfn_valid(pfn)) {
+ kunmap(pfn_to_page(pfn));
+ return;
}
+
+#ifdef CONFIG_HAS_IOMEM
+ memunmap(khva);
+#endif
}
static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
@@ -140,7 +142,7 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s
static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
{
/* Note, the new page offset may be different than the old! */
- void *old_khva = gpc->khva - offset_in_page(gpc->khva);
+ void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
void *new_khva = NULL;
unsigned long mmu_seq;
@@ -175,7 +177,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
* the existing mapping and didn't create a new one.
*/
if (new_khva != old_khva)
- gpc_unmap_khva(new_pfn, new_khva);
+ gpc_unmap(new_pfn, new_khva);
kvm_release_pfn_clean(new_pfn);
@@ -192,20 +194,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
* pfn. Note, kmap() and memremap() can both sleep, so this
* too must be done outside of gpc->lock!
*/
- if (gpc->usage & KVM_HOST_USES_PFN) {
- if (new_pfn == gpc->pfn) {
- new_khva = old_khva;
- } else if (pfn_valid(new_pfn)) {
- new_khva = kmap(pfn_to_page(new_pfn));
-#ifdef CONFIG_HAS_IOMEM
- } else {
- new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB);
-#endif
- }
- if (!new_khva) {
- kvm_release_pfn_clean(new_pfn);
- goto out_error;
- }
+ if (new_pfn == gpc->pfn)
+ new_khva = old_khva;
+ else
+ new_khva = gpc_map(new_pfn);
+
+ if (!new_khva) {
+ kvm_release_pfn_clean(new_pfn);
+ goto out_error;
}
write_lock_irq(&gpc->lock);
@@ -219,7 +215,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
gpc->valid = true;
gpc->pfn = new_pfn;
- gpc->khva = new_khva + (gpc->gpa & ~PAGE_MASK);
+ gpc->khva = new_khva + offset_in_page(gpc->uhva);
/*
* Put the reference to the _new_ pfn. The pfn is now tracked by the
@@ -236,30 +232,31 @@ out_error:
return -EFAULT;
}
-static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
+static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
unsigned long len)
{
- struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
- unsigned long page_offset = gpa & ~PAGE_MASK;
+ unsigned long page_offset;
bool unmap_old = false;
unsigned long old_uhva;
kvm_pfn_t old_pfn;
+ bool hva_change = false;
void *old_khva;
int ret;
+ /* Either gpa or uhva must be valid, but not both */
+ if (WARN_ON_ONCE(kvm_is_error_gpa(gpa) == kvm_is_error_hva(uhva)))
+ return -EINVAL;
+
/*
- * If must fit within a single page. The 'len' argument is
- * only to enforce that.
+ * The cached access must fit within a single page. The 'len' argument
+ * exists only to enforce that.
*/
+ page_offset = kvm_is_error_gpa(gpa) ? offset_in_page(uhva) :
+ offset_in_page(gpa);
if (page_offset + len > PAGE_SIZE)
return -EINVAL;
- /*
- * If another task is refreshing the cache, wait for it to complete.
- * There is no guarantee that concurrent refreshes will see the same
- * gpa, memslots generation, etc..., so they must be fully serialized.
- */
- mutex_lock(&gpc->refresh_lock);
+ lockdep_assert_held(&gpc->refresh_lock);
write_lock_irq(&gpc->lock);
@@ -269,30 +266,52 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
}
old_pfn = gpc->pfn;
- old_khva = gpc->khva - offset_in_page(gpc->khva);
- old_uhva = gpc->uhva;
-
- /* If the userspace HVA is invalid, refresh that first */
- if (gpc->gpa != gpa || gpc->generation != slots->generation ||
- kvm_is_error_hva(gpc->uhva)) {
- gfn_t gfn = gpa_to_gfn(gpa);
-
- gpc->gpa = gpa;
- gpc->generation = slots->generation;
- gpc->memslot = __gfn_to_memslot(slots, gfn);
- gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);
-
- if (kvm_is_error_hva(gpc->uhva)) {
- ret = -EFAULT;
- goto out;
+ old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
+ old_uhva = PAGE_ALIGN_DOWN(gpc->uhva);
+
+ if (kvm_is_error_gpa(gpa)) {
+ gpc->gpa = INVALID_GPA;
+ gpc->memslot = NULL;
+ gpc->uhva = PAGE_ALIGN_DOWN(uhva);
+
+ if (gpc->uhva != old_uhva)
+ hva_change = true;
+ } else {
+ struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
+
+ if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+ kvm_is_error_hva(gpc->uhva)) {
+ gfn_t gfn = gpa_to_gfn(gpa);
+
+ gpc->gpa = gpa;
+ gpc->generation = slots->generation;
+ gpc->memslot = __gfn_to_memslot(slots, gfn);
+ gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);
+
+ if (kvm_is_error_hva(gpc->uhva)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * Even if the GPA and/or the memslot generation changed, the
+ * HVA may still be the same.
+ */
+ if (gpc->uhva != old_uhva)
+ hva_change = true;
+ } else {
+ gpc->uhva = old_uhva;
}
}
+ /* Note: the offset must be correct before calling hva_to_pfn_retry() */
+ gpc->uhva += page_offset;
+
/*
* If the userspace HVA changed or the PFN was already invalid,
* drop the lock and do the HVA to PFN lookup again.
*/
- if (!gpc->valid || old_uhva != gpc->uhva) {
+ if (!gpc->valid || hva_change) {
ret = hva_to_pfn_retry(gpc);
} else {
/*
@@ -323,41 +342,47 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
out_unlock:
write_unlock_irq(&gpc->lock);
- mutex_unlock(&gpc->refresh_lock);
-
if (unmap_old)
- gpc_unmap_khva(old_pfn, old_khva);
+ gpc_unmap(old_pfn, old_khva);
return ret;
}
int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len)
{
- return __kvm_gpc_refresh(gpc, gpc->gpa, len);
+ unsigned long uhva;
+
+ guard(mutex)(&gpc->refresh_lock);
+
+ /*
+ * If the GPA is valid then ignore the HVA, as a cache can be GPA-based
+ * or HVA-based, not both. For GPA-based caches, the HVA will be
+ * recomputed during refresh if necessary.
+ */
+ uhva = kvm_is_error_gpa(gpc->gpa) ? gpc->uhva : KVM_HVA_ERR_BAD;
+
+ return __kvm_gpc_refresh(gpc, gpc->gpa, uhva, len);
}
-EXPORT_SYMBOL_GPL(kvm_gpc_refresh);
-void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
- struct kvm_vcpu *vcpu, enum pfn_cache_usage usage)
+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
{
- WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
- WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu);
-
rwlock_init(&gpc->lock);
mutex_init(&gpc->refresh_lock);
gpc->kvm = kvm;
- gpc->vcpu = vcpu;
- gpc->usage = usage;
gpc->pfn = KVM_PFN_ERR_FAULT;
+ gpc->gpa = INVALID_GPA;
gpc->uhva = KVM_HVA_ERR_BAD;
+ gpc->active = gpc->valid = false;
}
-EXPORT_SYMBOL_GPL(kvm_gpc_init);
-int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
+static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
+ unsigned long len)
{
struct kvm *kvm = gpc->kvm;
+ guard(mutex)(&gpc->refresh_lock);
+
if (!gpc->active) {
if (KVM_BUG_ON(gpc->valid, kvm))
return -EIO;
@@ -375,9 +400,18 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
gpc->active = true;
write_unlock_irq(&gpc->lock);
}
- return __kvm_gpc_refresh(gpc, gpa, len);
+ return __kvm_gpc_refresh(gpc, gpa, uhva, len);
+}
+
+int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
+{
+ return __kvm_gpc_activate(gpc, gpa, KVM_HVA_ERR_BAD, len);
+}
+
+int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva, unsigned long len)
+{
+ return __kvm_gpc_activate(gpc, INVALID_GPA, uhva, len);
}
-EXPORT_SYMBOL_GPL(kvm_gpc_activate);
void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
{
@@ -385,6 +419,8 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
kvm_pfn_t old_pfn;
void *old_khva;
+ guard(mutex)(&gpc->refresh_lock);
+
if (gpc->active) {
/*
* Deactivate the cache before removing it from the list, KVM
@@ -412,7 +448,6 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
list_del(&gpc->list);
spin_unlock(&kvm->gpc_lock);
- gpc_unmap_khva(old_pfn, old_khva);
+ gpc_unmap(old_pfn, old_khva);
}
}
-EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);