From 5c697c367a66307a5d943c3449421aff2aa3ca4a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Apr 2022 00:46:22 +0000 Subject: KVM: Initialize debugfs_dentry when a VM is created to avoid NULL deref Initialize debugfs_entry to its semi-magical -ENOENT value when the VM is created. KVM's teardown when VM creation fails is kludgy and calls kvm_uevent_notify_change() and kvm_destroy_vm_debugfs() even if KVM never attempted kvm_create_vm_debugfs(). Because debugfs_entry is zero initialized, the IS_ERR() checks pass and KVM derefs a NULL pointer. BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1068b1067 P4D 1068b1067 PUD 1068b0067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 871 Comm: repro Not tainted 5.18.0-rc1+ #825 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:__dentry_path+0x7b/0x130 Call Trace: dentry_path_raw+0x42/0x70 kvm_uevent_notify_change.part.0+0x10c/0x200 [kvm] kvm_put_kvm+0x63/0x2b0 [kvm] kvm_dev_ioctl+0x43a/0x920 [kvm] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x31/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae Modules linked in: kvm_intel kvm irqbypass Fixes: a44a4cc1c969 ("KVM: Don't create VM debugfs files outside of the VM directory") Cc: stable@vger.kernel.org Cc: Marc Zyngier Cc: Oliver Upton Reported-by: syzbot+df6fbbd2ee39f21289ef@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Reviewed-by: Oliver Upton Message-Id: <20220415004622.2207751-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dfb7dabdbc63..d292c4397579 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -955,12 +955,6 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; - /* - * Force subsequent debugfs file creations to fail if the VM directory - * is not created. - */ - kvm->debugfs_dentry = ERR_PTR(-ENOENT); - if (!debugfs_initialized()) return 0; @@ -1081,6 +1075,12 @@ static struct kvm *kvm_create_vm(unsigned long type) BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); + /* + * Force subsequent debugfs file creations to fail if the VM directory + * is not created (by kvm_create_vm_debugfs()). + */ + kvm->debugfs_dentry = ERR_PTR(-ENOENT); + if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; if (init_srcu_struct(&kvm->irq_srcu)) -- cgit v1.2.3 From a413a625b43e5f085d4e1a8c4053494d593fb3c1 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 10 Apr 2022 11:38:40 -0400 Subject: KVM: SPDX style and spelling fixes SPDX comments use use /* */ style comments in headers anad // style comments in .c files. Also fix two spelling mistakes. Signed-off-by: Tom Rix Message-Id: <20220410153840.55506-1-trix@redhat.com> Signed-off-by: Paolo Bonzini --- virt/kvm/dirty_ring.c | 2 +- virt/kvm/kvm_main.c | 4 ++-- virt/kvm/kvm_mm.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'virt') diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 222ecc81d7df..f4c2a6eb1666 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +// SPDX-License-Identifier: GPL-2.0-only /* * KVM dirty ring implementation * diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d292c4397579..2a23f24d13cf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -662,7 +662,7 @@ void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, kvm->mmu_notifier_range_end = end; } else { /* - * Fully tracking multiple concurrent ranges has dimishing + * Fully tracking multiple concurrent ranges has diminishing * returns. Keep things simple and just find the minimal range * which includes the current and new ranges. As there won't be * enough information to subtract a range after its invalidate @@ -1799,7 +1799,7 @@ static int kvm_set_memslot(struct kvm *kvm, /* * No need to refresh new->arch, changes after dropping slots_arch_lock - * will directly hit the final, active memsot. Architectures are + * will directly hit the final, active memslot. Architectures are * responsible for knowing that new->arch may be stale. */ kvm_commit_memory_region(kvm, old, new, change); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 34ca40823260..41da467d99c9 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __KVM_MM_H__ #define __KVM_MM_H__ 1 -- cgit v1.2.3 From 683412ccf61294d727ead4a73d97397396e69a6b Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 21 Apr 2022 03:14:07 +0000 Subject: KVM: SEV: add cache flush to solve SEV cache incoherency issues Flush the CPU caches when memory is reclaimed from an SEV guest (where reclaim also includes it being unmapped from KVM's memslots). Due to lack of coherency for SEV encrypted memory, failure to flush results in silent data corruption if userspace is malicious/broken and doesn't ensure SEV guest memory is properly pinned and unpinned. Cache coherency is not enforced across the VM boundary in SEV (AMD APM vol.2 Section 15.34.7). Confidential cachelines, generated by confidential VM guests have to be explicitly flushed on the host side. If a memory page containing dirty confidential cachelines was released by VM and reallocated to another user, the cachelines may corrupt the new user at a later time. KVM takes a shortcut by assuming all confidential memory remain pinned until the end of VM lifetime. Therefore, KVM does not flush cache at mmu_notifier invalidation events. Because of this incorrect assumption and the lack of cache flushing, malicous userspace can crash the host kernel: creating a malicious VM and continuously allocates/releases unpinned confidential memory pages when the VM is running. Add cache flush operations to mmu_notifier operations to ensure that any physical memory leaving the guest VM get flushed. In particular, hook mmu_notifier_invalidate_range_start and mmu_notifier_release events and flush cache accordingly. The hook after releasing the mmu lock to avoid contention with other vCPUs. Cc: stable@vger.kernel.org Suggested-by: Sean Christpherson Reported-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Message-Id: <20220421031407.2516575-4-mizhang@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/sev.c | 8 ++++++++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/svm/svm.h | 2 ++ arch/x86/kvm/x86.c | 5 +++++ include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 27 ++++++++++++++++++++++++--- 8 files changed, 44 insertions(+), 3 deletions(-) (limited to 'virt') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 3c368b639c04..1a6d7e3f6c32 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -118,6 +118,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) +KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) KVM_X86_OP(get_msr_feature) KVM_X86_OP(can_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e0c0f0e1f754..4ff36610af6a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1484,6 +1484,7 @@ struct kvm_x86_ops { int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); + void (*guest_memory_reclaimed)(struct kvm *kvm); int (*get_msr_feature)(struct kvm_msr_entry *entry); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 9a0375987029..0ad70c12c7c3 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2262,6 +2262,14 @@ do_wbinvd: wbinvd_on_all_cpus(); } +void sev_guest_memory_reclaimed(struct kvm *kvm) +{ + if (!sev_guest(kvm)) + return; + + wbinvd_on_all_cpus(); +} + void sev_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bd4c64b362d2..7e45d03cd018 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4620,6 +4620,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_ioctl = sev_mem_enc_ioctl, .mem_enc_register_region = sev_mem_enc_register_region, .mem_enc_unregister_region = sev_mem_enc_unregister_region, + .guest_memory_reclaimed = sev_guest_memory_reclaimed, .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, .vm_move_enc_context_from = sev_vm_move_enc_context_from, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f77a7d2d39dd..f76deff71002 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -609,6 +609,8 @@ int sev_mem_enc_unregister_region(struct kvm *kvm, struct kvm_enc_region *range); int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd); int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd); +void sev_guest_memory_reclaimed(struct kvm *kvm); + void pre_sev_run(struct vcpu_svm *svm, int cpu); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c89dc09a764f..a6ab19afc638 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9889,6 +9889,11 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); } +void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) +{ + static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm); +} + static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2dab4b696682..34eed5f85ed6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2219,6 +2219,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, unsigned long start, unsigned long end); +void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); + #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); #else diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2a23f24d13cf..f30bb8c16f26 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -164,6 +164,10 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, { } +__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) +{ +} + bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) { /* @@ -357,6 +361,12 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); #endif +static void kvm_flush_shadow_all(struct kvm *kvm) +{ + kvm_arch_flush_shadow_all(kvm); + kvm_arch_guest_memory_reclaimed(kvm); +} + #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, gfp_t gfp_flags) @@ -485,12 +495,15 @@ typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, unsigned long end); +typedef void (*on_unlock_fn_t)(struct kvm *kvm); + struct kvm_hva_range { unsigned long start; unsigned long end; pte_t pte; hva_handler_t handler; on_lock_fn_t on_lock; + on_unlock_fn_t on_unlock; bool flush_on_ret; bool may_block; }; @@ -578,8 +591,11 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && ret) kvm_flush_remote_tlbs(kvm); - if (locked) + if (locked) { KVM_MMU_UNLOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_unlock)) + range->on_unlock(kvm); + } srcu_read_unlock(&kvm->srcu, idx); @@ -600,6 +616,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .pte = pte, .handler = handler, .on_lock = (void *)kvm_null_fn, + .on_unlock = (void *)kvm_null_fn, .flush_on_ret = true, .may_block = false, }; @@ -619,6 +636,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn .pte = __pte(0), .handler = handler, .on_lock = (void *)kvm_null_fn, + .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = false, }; @@ -687,6 +705,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, .pte = __pte(0), .handler = kvm_unmap_gfn_range, .on_lock = kvm_inc_notifier_count, + .on_unlock = kvm_arch_guest_memory_reclaimed, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; @@ -741,6 +760,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .pte = __pte(0), .handler = (void *)kvm_null_fn, .on_lock = kvm_dec_notifier_count, + .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; @@ -813,7 +833,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, int idx; idx = srcu_read_lock(&kvm->srcu); - kvm_arch_flush_shadow_all(kvm); + kvm_flush_shadow_all(kvm); srcu_read_unlock(&kvm->srcu, idx); } @@ -1225,7 +1245,7 @@ static void kvm_destroy_vm(struct kvm *kvm) WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); kvm->mn_active_invalidate_count = 0; #else - kvm_arch_flush_shadow_all(kvm); + kvm_flush_shadow_all(kvm); #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); @@ -1652,6 +1672,7 @@ static void kvm_invalidate_memslot(struct kvm *kvm, * - kvm_is_visible_gfn (mmu_check_root) */ kvm_arch_flush_shadow_memslot(kvm, old); + kvm_arch_guest_memory_reclaimed(kvm); /* Was released by kvm_swap_active_memslots, reacquire. */ mutex_lock(&kvm->slots_arch_lock); -- cgit v1.2.3