summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virt/kvm/api.rst213
-rw-r--r--arch/arm64/include/asm/kvm_host.h2
-rw-r--r--arch/arm64/kvm/Kconfig4
-rw-r--r--arch/loongarch/include/asm/kvm_host.h25
-rw-r--r--arch/loongarch/include/asm/kvm_vcpu.h21
-rw-r--r--arch/loongarch/include/uapi/asm/kvm.h1
-rw-r--r--arch/loongarch/kernel/fpu.S2
-rw-r--r--arch/loongarch/kvm/Kconfig3
-rw-r--r--arch/loongarch/kvm/exit.c50
-rw-r--r--arch/loongarch/kvm/main.c1
-rw-r--r--arch/loongarch/kvm/mmu.c124
-rw-r--r--arch/loongarch/kvm/switch.S31
-rw-r--r--arch/loongarch/kvm/timer.c129
-rw-r--r--arch/loongarch/kvm/trace.h6
-rw-r--r--arch/loongarch/kvm/vcpu.c307
-rw-r--r--arch/mips/include/asm/kvm_host.h2
-rw-r--r--arch/mips/kvm/Kconfig3
-rw-r--r--arch/powerpc/include/asm/kvm_host.h2
-rw-r--r--arch/powerpc/kvm/Kconfig11
-rw-r--r--arch/powerpc/kvm/book3s_hv.c2
-rw-r--r--arch/powerpc/kvm/powerpc.c9
-rw-r--r--arch/riscv/include/asm/kvm_host.h2
-rw-r--r--arch/riscv/kvm/Kconfig4
-rw-r--r--arch/s390/include/asm/facility.h6
-rw-r--r--arch/s390/include/asm/kvm_host.h2
-rw-r--r--arch/s390/kernel/Makefile2
-rw-r--r--arch/s390/kernel/facility.c21
-rw-r--r--arch/s390/kvm/Kconfig2
-rw-r--r--arch/s390/kvm/vsie.c19
-rw-r--r--arch/x86/include/asm/kvm_host.h17
-rw-r--r--arch/x86/include/uapi/asm/kvm.h3
-rw-r--r--arch/x86/kvm/Kconfig16
-rw-r--r--arch/x86/kvm/debugfs.c2
-rw-r--r--arch/x86/kvm/mmu/mmu.c271
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h2
-rw-r--r--arch/x86/kvm/vmx/vmx.c11
-rw-r--r--arch/x86/kvm/x86.c26
-rw-r--r--drivers/s390/char/uvdevice.c3
-rw-r--r--fs/anon_inodes.c51
-rw-r--r--fs/userfaultfd.c5
-rw-r--r--include/linux/anon_inodes.h4
-rw-r--r--include/linux/kvm_host.h181
-rw-r--r--include/linux/kvm_types.h1
-rw-r--r--include/linux/pagemap.h17
-rw-r--r--include/trace/events/kvm.h8
-rw-r--r--include/uapi/linux/kvm.h139
-rw-r--r--io_uring/io_uring.c3
-rw-r--r--mm/compaction.c43
-rw-r--r--mm/migrate.c2
-rw-r--r--tools/testing/selftests/kvm/Makefile4
-rw-r--r--tools/testing/selftests/kvm/aarch64/page_fault_test.c2
-rw-r--r--tools/testing/selftests/kvm/dirty_log_test.c2
-rw-r--r--tools/testing/selftests/kvm/guest_memfd_test.c198
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util_base.h215
-rw-r--r--tools/testing/selftests/kvm/include/test_util.h7
-rw-r--r--tools/testing/selftests/kvm/include/ucall_common.h18
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/processor.h15
-rw-r--r--tools/testing/selftests/kvm/kvm_page_table_test.c2
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c226
-rw-r--r--tools/testing/selftests/kvm/lib/memstress.c3
-rw-r--r--tools/testing/selftests/kvm/s390x/cmma_test.c11
-rw-r--r--tools/testing/selftests/kvm/set_memory_region_test.c161
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_features.c10
-rw-r--r--tools/testing/selftests/kvm/x86_64/mmio_warning_test.c121
-rw-r--r--tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c6
-rw-r--r--tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c482
-rw-r--r--tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c120
-rw-r--r--tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c4
-rw-r--r--tools/testing/selftests/kvm/x86_64/ucna_injection_test.c2
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c2
-rw-r--r--tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c8
-rw-r--r--virt/kvm/Kconfig25
-rw-r--r--virt/kvm/Makefile.kvm1
-rw-r--r--virt/kvm/dirty_ring.c2
-rw-r--r--virt/kvm/eventfd.c28
-rw-r--r--virt/kvm/guest_memfd.c532
-rw-r--r--virt/kvm/kvm_main.c516
-rw-r--r--virt/kvm/kvm_mm.h26
78 files changed, 3759 insertions, 801 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 7025b3751027..9326af2a4869 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -147,10 +147,29 @@ described as 'basic' will be available.
The new VM has no virtual cpus and no memory.
You probably want to use 0 as machine type.
+X86:
+^^^^
+
+Supported X86 VM types can be queried via KVM_CAP_VM_TYPES.
+
+S390:
+^^^^^
+
In order to create user controlled virtual machines on S390, check
KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as
privileged user (CAP_SYS_ADMIN).
+MIPS:
+^^^^^
+
+To use hardware assisted virtualization on MIPS (VZ ASE) rather than
+the default trap & emulate implementation (which changes the virtual
+memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the
+flag KVM_VM_MIPS_VZ.
+
+ARM64:
+^^^^^^
+
On arm64, the physical address size for a VM (IPA Size limit) is limited
to 40bits by default. The limit can be configured if the host supports the
extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use
@@ -608,18 +627,6 @@ interrupt number dequeues the interrupt.
This is an asynchronous vcpu ioctl and can be invoked from any thread.
-4.17 KVM_DEBUG_GUEST
---------------------
-
-:Capability: basic
-:Architectures: none
-:Type: vcpu ioctl
-:Parameters: none)
-:Returns: -1 on error
-
-Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead.
-
-
4.18 KVM_GET_MSRS
-----------------
@@ -6192,6 +6199,130 @@ to know what fields can be changed for the system register described by
``op0, op1, crn, crm, op2``. KVM rejects ID register values that describe a
superset of the features supported by the system.
+4.140 KVM_SET_USER_MEMORY_REGION2
+---------------------------------
+
+:Capability: KVM_CAP_USER_MEMORY2
+:Architectures: all
+:Type: vm ioctl
+:Parameters: struct kvm_userspace_memory_region2 (in)
+:Returns: 0 on success, -1 on error
+
+KVM_SET_USER_MEMORY_REGION2 is an extension to KVM_SET_USER_MEMORY_REGION that
+allows mapping guest_memfd memory into a guest. All fields shared with
+KVM_SET_USER_MEMORY_REGION identically. Userspace can set KVM_MEM_GUEST_MEMFD
+in flags to have KVM bind the memory region to a given guest_memfd range of
+[guest_memfd_offset, guest_memfd_offset + memory_size]. The target guest_memfd
+must point at a file created via KVM_CREATE_GUEST_MEMFD on the current VM, and
+the target range must not be bound to any other memory region. All standard
+bounds checks apply (use common sense).
+
+::
+
+ struct kvm_userspace_memory_region2 {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+ __u64 userspace_addr; /* start of the userspace allocated memory */
+ __u64 guest_memfd_offset;
+ __u32 guest_memfd;
+ __u32 pad1;
+ __u64 pad2[14];
+ };
+
+A KVM_MEM_GUEST_MEMFD region _must_ have a valid guest_memfd (private memory) and
+userspace_addr (shared memory). However, "valid" for userspace_addr simply
+means that the address itself must be a legal userspace address. The backing
+mapping for userspace_addr is not required to be valid/populated at the time of
+KVM_SET_USER_MEMORY_REGION2, e.g. shared memory can be lazily mapped/allocated
+on-demand.
+
+When mapping a gfn into the guest, KVM selects shared vs. private, i.e consumes
+userspace_addr vs. guest_memfd, based on the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE
+state. At VM creation time, all memory is shared, i.e. the PRIVATE attribute
+is '0' for all gfns. Userspace can control whether memory is shared/private by
+toggling KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_SET_MEMORY_ATTRIBUTES as needed.
+
+4.141 KVM_SET_MEMORY_ATTRIBUTES
+-------------------------------
+
+:Capability: KVM_CAP_MEMORY_ATTRIBUTES
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_memory_attributes (in)
+:Returns: 0 on success, <0 on error
+
+KVM_SET_MEMORY_ATTRIBUTES allows userspace to set memory attributes for a range
+of guest physical memory.
+
+::
+
+ struct kvm_memory_attributes {
+ __u64 address;
+ __u64 size;
+ __u64 attributes;
+ __u64 flags;
+ };
+
+ #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
+
+The address and size must be page aligned. The supported attributes can be
+retrieved via ioctl(KVM_CHECK_EXTENSION) on KVM_CAP_MEMORY_ATTRIBUTES. If
+executed on a VM, KVM_CAP_MEMORY_ATTRIBUTES precisely returns the attributes
+supported by that VM. If executed at system scope, KVM_CAP_MEMORY_ATTRIBUTES
+returns all attributes supported by KVM. The only attribute defined at this
+time is KVM_MEMORY_ATTRIBUTE_PRIVATE, which marks the associated gfn as being
+guest private memory.
+
+Note, there is no "get" API. Userspace is responsible for explicitly tracking
+the state of a gfn/page as needed.
+
+The "flags" field is reserved for future extensions and must be '0'.
+
+4.142 KVM_CREATE_GUEST_MEMFD
+----------------------------
+
+:Capability: KVM_CAP_GUEST_MEMFD
+:Architectures: none
+:Type: vm ioctl
+:Parameters: struct kvm_create_guest_memfd(in)
+:Returns: 0 on success, <0 on error
+
+KVM_CREATE_GUEST_MEMFD creates an anonymous file and returns a file descriptor
+that refers to it. guest_memfd files are roughly analogous to files created
+via memfd_create(), e.g. guest_memfd files live in RAM, have volatile storage,
+and are automatically released when the last reference is dropped. Unlike
+"regular" memfd_create() files, guest_memfd files are bound to their owning
+virtual machine (see below), cannot be mapped, read, or written by userspace,
+and cannot be resized (guest_memfd files do however support PUNCH_HOLE).
+
+::
+
+ struct kvm_create_guest_memfd {
+ __u64 size;
+ __u64 flags;
+ __u64 reserved[6];
+ };
+
+Conceptually, the inode backing a guest_memfd file represents physical memory,
+i.e. is coupled to the virtual machine as a thing, not to a "struct kvm". The
+file itself, which is bound to a "struct kvm", is that instance's view of the
+underlying memory, e.g. effectively provides the translation of guest addresses
+to host memory. This allows for use cases where multiple KVM structures are
+used to manage a single virtual machine, e.g. when performing intrahost
+migration of a virtual machine.
+
+KVM currently only supports mapping guest_memfd via KVM_SET_USER_MEMORY_REGION2,
+and more specifically via the guest_memfd and guest_memfd_offset fields in
+"struct kvm_userspace_memory_region2", where guest_memfd_offset is the offset
+into the guest_memfd instance. For a given guest_memfd file, there can be at
+most one mapping per page, i.e. binding multiple memory regions to a single
+guest_memfd range is not allowed (any number of memory regions can be bound to
+a single guest_memfd file, but the bound ranges must not overlap).
+
+See KVM_SET_USER_MEMORY_REGION2 for additional details.
+
5. The kvm_run structure
========================
@@ -6826,6 +6957,30 @@ spec refer, https://github.com/riscv/riscv-sbi-doc.
::
+ /* KVM_EXIT_MEMORY_FAULT */
+ struct {
+ #define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3)
+ __u64 flags;
+ __u64 gpa;
+ __u64 size;
+ } memory_fault;
+
+KVM_EXIT_MEMORY_FAULT indicates the vCPU has encountered a memory fault that
+could not be resolved by KVM. The 'gpa' and 'size' (in bytes) describe the
+guest physical address range [gpa, gpa + size) of the fault. The 'flags' field
+describes properties of the faulting access that are likely pertinent:
+
+ - KVM_MEMORY_EXIT_FLAG_PRIVATE - When set, indicates the memory fault occurred
+ on a private memory access. When clear, indicates the fault occurred on a
+ shared access.
+
+Note! KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that it
+accompanies a return code of '-1', not '0'! errno will always be set to EFAULT
+or EHWPOISON when KVM exits with KVM_EXIT_MEMORY_FAULT, userspace should assume
+kvm_run.exit_reason is stale/undefined for all other error numbers.
+
+::
+
/* KVM_EXIT_NOTIFY */
struct {
#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0)
@@ -7858,6 +8013,27 @@ This capability is aimed to mitigate the threat that malicious VMs can
cause CPU stuck (due to event windows don't open up) and make the CPU
unavailable to host or other VMs.
+7.34 KVM_CAP_MEMORY_FAULT_INFO
+------------------------------
+
+:Architectures: x86
+:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
+
+The presence of this capability indicates that KVM_RUN will fill
+kvm_run.memory_fault if KVM cannot resolve a guest page fault VM-Exit, e.g. if
+there is a valid memslot but no backing VMA for the corresponding host virtual
+address.
+
+The information in kvm_run.memory_fault is valid if and only if KVM_RUN returns
+an error with errno=EFAULT or errno=EHWPOISON *and* kvm_run.exit_reason is set
+to KVM_EXIT_MEMORY_FAULT.
+
+Note: Userspaces which attempt to resolve memory faults so that they can retry
+KVM_RUN are encouraged to guard against repeatedly receiving the same
+error/annotated fault.
+
+See KVM_EXIT_MEMORY_FAULT for more information.
+
8. Other capabilities.
======================
@@ -8596,6 +8772,19 @@ block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a
64-bit bitmap (each bit describing a block size). The default value is
0, to disable the eager page splitting.
+8.41 KVM_CAP_VM_TYPES
+---------------------
+
+:Capability: KVM_CAP_MEMORY_ATTRIBUTES
+:Architectures: x86
+:Type: system ioctl
+
+This capability returns a bitmap of support VM types. The 1-setting of bit @n
+means the VM type with value @n is supported. Possible values of @n are::
+
+ #define KVM_X86_DEFAULT_VM 0
+ #define KVM_X86_SW_PROTECTED_VM 1
+
9. Known KVM API problems
=========================
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 824f29f04916..567bbc417e4b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -954,8 +954,6 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events);
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 83c1e09be42e..b07c60c9737d 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -22,15 +22,13 @@ menuconfig KVM
bool "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
select KVM_GENERIC_HARDWARE_ENABLING
- select MMU_NOTIFIER
+ select KVM_GENERIC_MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
select KVM_MMIO
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_XFER_TO_GUEST_WORK
select KVM_VFIO
- select HAVE_KVM_EVENTFD
- select HAVE_KVM_IRQFD
select HAVE_KVM_DIRTY_RING_ACQ_REL
select NEED_KVM_DIRTY_RING_WITH_BITMAP
select HAVE_KVM_MSI
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index 11328700d4fa..2d62f7b0d377 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -45,7 +45,10 @@ struct kvm_vcpu_stat {
u64 signal_exits;
};
+#define KVM_MEM_HUGEPAGE_CAPABLE (1UL << 0)
+#define KVM_MEM_HUGEPAGE_INCAPABLE (1UL << 1)
struct kvm_arch_memory_slot {
+ unsigned long flags;
};
struct kvm_context {
@@ -92,8 +95,10 @@ enum emulation_result {
};
#define KVM_LARCH_FPU (0x1 << 0)
-#define KVM_LARCH_SWCSR_LATEST (0x1 << 1)
-#define KVM_LARCH_HWCSR_USABLE (0x1 << 2)
+#define KVM_LARCH_LSX (0x1 << 1)
+#define KVM_LARCH_LASX (0x1 << 2)
+#define KVM_LARCH_SWCSR_LATEST (0x1 << 3)
+#define KVM_LARCH_HWCSR_USABLE (0x1 << 4)
struct kvm_vcpu_arch {
/*
@@ -175,6 +180,21 @@ static inline void writel_sw_gcsr(struct loongarch_csrs *csr, int reg, unsigned
csr->csrs[reg] = val;
}
+static inline bool kvm_guest_has_fpu(struct kvm_vcpu_arch *arch)
+{
+ return arch->cpucfg[2] & CPUCFG2_FP;
+}
+
+static inline bool kvm_guest_has_lsx(struct kvm_vcpu_arch *arch)
+{
+ return arch->cpucfg[2] & CPUCFG2_LSX;
+}
+
+static inline bool kvm_guest_has_lasx(struct kvm_vcpu_arch *arch)
+{
+ return arch->cpucfg[2] & CPUCFG2_LASX;
+}
+
/* Debug: dump vcpu state */
int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
@@ -183,7 +203,6 @@ void kvm_flush_tlb_all(void);
void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa);
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write);
-#define KVM_ARCH_WANT_MMU_NOTIFIER
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h
index 553cfa2b2b1c..e71ceb88f29e 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -55,7 +55,26 @@ void kvm_save_fpu(struct loongarch_fpu *fpu);
void kvm_restore_fpu(struct loongarch_fpu *fpu);
void kvm_restore_fcsr(struct loongarch_fpu *fpu);
-void kvm_acquire_timer(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_CPU_HAS_LSX
+int kvm_own_lsx(struct kvm_vcpu *vcpu);
+void kvm_save_lsx(struct loongarch_fpu *fpu);
+void kvm_restore_lsx(struct loongarch_fpu *fpu);
+#else
+static inline int kvm_own_lsx(struct kvm_vcpu *vcpu) { }
+static inline void kvm_save_lsx(struct loongarch_fpu *fpu) { }
+static inline void kvm_restore_lsx(struct loongarch_fpu *fpu) { }
+#endif
+
+#ifdef CONFIG_CPU_HAS_LASX
+int kvm_own_lasx(struct kvm_vcpu *vcpu);
+void kvm_save_lasx(struct loongarch_fpu *fpu);
+void kvm_restore_lasx(struct loongarch_fpu *fpu);
+#else
+static inline int kvm_own_lasx(struct kvm_vcpu *vcpu) { }
+static inline void kvm_save_lasx(struct loongarch_fpu *fpu) { }
+static inline void kvm_restore_lasx(struct loongarch_fpu *fpu) { }
+#endif
+
void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long hz);
void kvm_reset_timer(struct kvm_vcpu *vcpu);
void kvm_save_timer(struct kvm_vcpu *vcpu);
diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h
index c6ad2ee6106c..923d0bd38294 100644
--- a/arch/loongarch/include/uapi/asm/kvm.h
+++ b/arch/loongarch/include/uapi/asm/kvm.h
@@ -79,6 +79,7 @@ struct kvm_fpu {
#define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT))
#define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG)
#define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG)
+#define KVM_LOONGARCH_VCPU_CPUCFG 0
struct kvm_debug_exit_arch {
};
diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
index d53ab10f4644..4382e36ae3d4 100644
--- a/arch/loongarch/kernel/fpu.S
+++ b/arch/loongarch/kernel/fpu.S
@@ -349,6 +349,7 @@ SYM_FUNC_START(_restore_lsx_upper)
lsx_restore_all_upper a0 t0 t1
jr ra
SYM_FUNC_END(_restore_lsx_upper)
+EXPORT_SYMBOL(_restore_lsx_upper)
SYM_FUNC_START(_init_lsx_upper)
lsx_init_all_upper t1
@@ -384,6 +385,7 @@ SYM_FUNC_START(_restore_lasx_upper)
lasx_restore_all_upper a0 t0 t1
jr ra
SYM_FUNC_END(_restore_lasx_upper)
+EXPORT_SYMBOL(_restore_lasx_upper)
SYM_FUNC_START(_init_lasx_upper)
lasx_init_all_upper t1
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index fda425babfb2..daba4cd5e87d 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -22,13 +22,12 @@ config KVM
depends on AS_HAS_LVZ_EXTENSION
depends on HAVE_KVM
select HAVE_KVM_DIRTY_RING_ACQ_REL
- select HAVE_KVM_EVENTFD
select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_GENERIC_HARDWARE_ENABLING
+ select KVM_GENERIC_MMU_NOTIFIER
select KVM_MMIO
select KVM_XFER_TO_GUEST_WORK
- select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
help
Support hosting virtualized guest machines using
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index ce8de3fa472c..ed1d89d53e2e 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -200,17 +200,8 @@ int kvm_emu_idle(struct kvm_vcpu *vcpu)
++vcpu->stat.idle_exits;
trace_kvm_exit_idle(vcpu, KVM_TRACE_EXIT_IDLE);
- if (!kvm_arch_vcpu_runnable(vcpu)) {
- /*
- * Switch to the software timer before halt-polling/blocking as
- * the guest's timer may be a break event for the vCPU, and the
- * hypervisor timer runs only when the CPU is in guest mode.
- * Switch before halt-polling so that KVM recognizes an expired
- * timer before blocking.
- */
- kvm_save_timer(vcpu);
- kvm_vcpu_block(vcpu);
- }
+ if (!kvm_arch_vcpu_runnable(vcpu))
+ kvm_vcpu_halt(vcpu);
return EMULATE_DONE;
}
@@ -643,6 +634,11 @@ static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
+ if (!kvm_guest_has_fpu(&vcpu->arch)) {
+ kvm_queue_exception(vcpu, EXCCODE_INE, 0);
+ return RESUME_GUEST;
+ }
+
/*
* If guest FPU not present, the FPU operation should have been
* treated as a reserved instruction!
@@ -660,6 +656,36 @@ static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu)
}
/*
+ * kvm_handle_lsx_disabled() - Guest used LSX while disabled in root.
+ * @vcpu: Virtual CPU context.
+ *
+ * Handle when the guest attempts to use LSX when it is disabled in the root
+ * context.
+ */
+static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu)
+{
+ if (kvm_own_lsx(vcpu))
+ kvm_queue_exception(vcpu, EXCCODE_INE, 0);
+
+ return RESUME_GUEST;
+}
+
+/*
+ * kvm_handle_lasx_disabled() - Guest used LASX while disabled in root.
+ * @vcpu: Virtual CPU context.
+ *
+ * Handle when the guest attempts to use LASX when it is disabled in the root
+ * context.
+ */
+static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu)
+{
+ if (kvm_own_lasx(vcpu))
+ kvm_queue_exception(vcpu, EXCCODE_INE, 0);
+
+ return RESUME_GUEST;
+}
+
+/*
* LoongArch KVM callback handling for unimplemented guest exiting
*/
static int kvm_fault_ni(struct kvm_vcpu *vcpu)
@@ -687,6 +713,8 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = {
[EXCCODE_TLBS] = kvm_handle_write_fault,
[EXCCODE_TLBM] = kvm_handle_write_fault,
[EXCCODE_FPDIS] = kvm_handle_fpu_disabled,
+ [EXCCODE_LSXDIS] = kvm_handle_lsx_disabled,
+ [EXCCODE_LASXDIS] = kvm_handle_lasx_disabled,
[EXCCODE_GSPR] = kvm_handle_gspr,
};
diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c
index 1c1d5199500e..86a2f2d0cb27 100644
--- a/arch/loongarch/kvm/main.c
+++ b/arch/loongarch/kvm/main.c
@@ -287,7 +287,6 @@ int kvm_arch_hardware_enable(void)
if (env & CSR_GCFG_MATC_ROOT)
gcfg |= CSR_GCFG_MATC_ROOT;
- gcfg |= CSR_GCFG_TIT;
write_csr_gcfg(gcfg);
kvm_flush_tlb_all();
diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c
index 80480df5f550..915f17527893 100644
--- a/arch/loongarch/kvm/mmu.c
+++ b/arch/loongarch/kvm/mmu.c
@@ -13,6 +13,16 @@
#include <asm/tlb.h>
#include <asm/kvm_mmu.h>
+static inline bool kvm_hugepage_capable(struct kvm_memory_slot *slot)
+{
+ return slot->arch.flags & KVM_MEM_HUGEPAGE_CAPABLE;
+}
+
+static inline bool kvm_hugepage_incapable(struct kvm_memory_slot *slot)
+{
+ return slot->arch.flags & KVM_MEM_HUGEPAGE_INCAPABLE;
+}
+
static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx)
{
ctx->level = kvm->arch.root_level;
@@ -365,6 +375,69 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx);
}
+int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new, enum kvm_mr_change change)
+{
+ gpa_t gpa_start;
+ hva_t hva_start;
+ size_t size, gpa_offset, hva_offset;
+
+ if ((change != KVM_MR_MOVE) && (change != KVM_MR_CREATE))
+ return 0;
+ /*
+ * Prevent userspace from creating a memory region outside of the
+ * VM GPA address space
+ */
+ if ((new->base_gfn + new->npages) > (kvm->arch.gpa_size >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ new->arch.flags = 0;
+ size = new->npages * PAGE_SIZE;
+ gpa_start = new->base_gfn << PAGE_SHIFT;
+ hva_start = new->userspace_addr;
+ if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(gpa_start, PMD_SIZE)
+ && IS_ALIGNED(hva_start, PMD_SIZE))
+ new->arch.flags |= KVM_MEM_HUGEPAGE_CAPABLE;
+ else {
+ /*
+ * Pages belonging to memslots that don't have the same
+ * alignment within a PMD for userspace and GPA cannot be
+ * mapped with PMD entries, because we'll end up mapping
+ * the wrong pages.
+ *
+ * Consider a layout like the following:
+ *
+ * memslot->userspace_addr:
+ * +-----+--------------------+--------------------+---+
+ * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
+ * +-----+--------------------+--------------------+---+
+ *
+ * memslot->base_gfn << PAGE_SIZE:
+ * +---+--------------------+--------------------+-----+
+ * |abc|def Stage-2 block | Stage-2 block |tvxyz|
+ * +---+--------------------+--------------------+-----+
+ *
+ * If we create those stage-2 blocks, we'll end up with this
+ * incorrect mapping:
+ * d -> f
+ * e -> g
+ * f -> h
+ */
+ gpa_offset = gpa_start & (PMD_SIZE - 1);
+ hva_offset = hva_start & (PMD_SIZE - 1);
+ if (gpa_offset != hva_offset) {
+ new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;
+ } else {
+ if (gpa_offset == 0)
+ gpa_offset = PMD_SIZE;
+ if ((size + gpa_offset) < (PMD_SIZE * 2))
+ new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;
+ }
+ }
+
+ return 0;
+}
+
void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
@@ -562,47 +635,23 @@ out:
}
static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
- unsigned long hva, unsigned long map_size, bool write)
+ unsigned long hva, bool write)
{
- size_t size;
- gpa_t gpa_start;
- hva_t uaddr_start, uaddr_end;
+ hva_t start, end;
/* Disable dirty logging on HugePages */
if (kvm_slot_dirty_track_enabled(memslot) && write)
return false;
- size = memslot->npages * PAGE_SIZE;
- gpa_start = memslot->base_gfn << PAGE_SHIFT;
- uaddr_start = memslot->userspace_addr;
- uaddr_end = uaddr_start + size;
+ if (kvm_hugepage_capable(memslot))
+ return true;
- /*
- * Pages belonging to memslots that don't have the same alignment
- * within a PMD for userspace and GPA cannot be mapped with stage-2
- * PMD entries, because we'll end up mapping the wrong pages.
- *
- * Consider a layout like the following:
- *
- * memslot->userspace_addr:
- * +-----+--------------------+--------------------+---+
- * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
- * +-----+--------------------+--------------------+---+
- *
- * memslot->base_gfn << PAGE_SIZE:
- * +---+--------------------+--------------------+-----+
- * |abc|def Stage-2 block | Stage-2 block |tvxyz|
- * +---+--------------------+--------------------+-----+
- *
- * If we create those stage-2 blocks, we'll end up with this incorrect
- * mapping:
- * d -> f
- * e -> g
- * f -> h
- */
- if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
+ if (kvm_hugepage_incapable(memslot))
return false;
+ start = memslot->userspace_addr;
+ end = start + memslot->npages * PAGE_SIZE;
+
/*
* Next, let's make sure we're not trying to map anything not covered
* by the memslot. This means we have to prohibit block size mappings
@@ -615,8 +664,7 @@ static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
* userspace_addr or the base_gfn, as both are equally aligned (per
* the check above) and equally sized.
*/
- return (hva & ~(map_size - 1)) >= uaddr_start &&
- (hva & ~(map_size - 1)) + map_size <= uaddr_end;
+ return (hva >= ALIGN(start, PMD_SIZE)) && (hva < ALIGN_DOWN(end, PMD_SIZE));
}
/*
@@ -842,7 +890,7 @@ retry:
/* Disable dirty logging on HugePages */
level = 0;
- if (!fault_supports_huge_mapping(memslot, hva, PMD_SIZE, write)) {
+ if (!fault_supports_huge_mapping(memslot, hva, write)) {
level = 0;
} else {
level = host_pfn_mapping_level(kvm, gfn, memslot);
@@ -901,12 +949,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}
-int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old,
- struct kvm_memory_slot *new, enum kvm_mr_change change)
-{
- return 0;
-}
-
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
{
diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S
index 0ed9040307b7..ba976509bfe8 100644
--- a/arch/loongarch/kvm/switch.S
+++ b/arch/loongarch/kvm/switch.S
@@ -245,6 +245,37 @@ SYM_FUNC_START(kvm_restore_fpu)
jr ra
SYM_FUNC_END(kvm_restore_fpu)
+#ifdef CONFIG_CPU_HAS_LSX
+SYM_FUNC_START(kvm_save_lsx)
+ fpu_save_csr a0 t1
+ fpu_save_cc a0 t1 t2
+ lsx_save_data a0 t1
+ jr ra
+SYM_FUNC_END(kvm_save_lsx)
+
+SYM_FUNC_START(kvm_restore_lsx)
+ lsx_restore_data a0 t1
+ fpu_restore_cc a0 t1 t2
+ fpu_restore_csr a0 t1 t2
+ jr ra
+SYM_FUNC_END(kvm_restore_lsx)
+#endif
+
+#ifdef CONFIG_CPU_HAS_LASX
+SYM_FUNC_START(kvm_save_lasx)
+ fpu_save_csr a0 t1
+ fpu_save_cc a0 t1 t2
+ lasx_save_data a0 t1
+ jr ra
+SYM_FUNC_END(kvm_save_lasx)
+
+SYM_FUNC_START(kvm_restore_lasx)
+ lasx_restore_data a0 t1
+ fpu_restore_cc a0 t1 t2
+ fpu_restore_csr a0 t1 t2
+ jr ra
+SYM_FUNC_END(kvm_restore_lasx)
+#endif
.section ".rodata"
SYM_DATA(kvm_exception_size, .quad kvm_exc_entry_end - kvm_exc_entry)
SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest)
diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c
index 284bf553fefe..111328f60872 100644
--- a/arch/loongarch/kvm/timer.c
+++ b/arch/loongarch/kvm/timer.c
@@ -65,40 +65,23 @@ void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long timer_hz)
}
/*
- * Restore hard timer state and enable guest to access timer registers
- * without trap, should be called with irq disabled
- */
-void kvm_acquire_timer(struct kvm_vcpu *vcpu)
-{
- unsigned long cfg;
-
- cfg = read_csr_gcfg();
- if (!(cfg & CSR_GCFG_TIT))
- return;
-
- /* Enable guest access to hard timer */
- write_csr_gcfg(cfg & ~CSR_GCFG_TIT);
-
- /*
- * Freeze the soft-timer and sync the guest stable timer with it. We do
- * this with interrupts disabled to avoid latency.
- */
- hrtimer_cancel(&vcpu->arch.swtimer);
-}
-
-/*
* Restore soft timer state from saved context.
*/
void kvm_restore_timer(struct kvm_vcpu *vcpu)
{
- unsigned long cfg, delta, period;
+ unsigned long cfg, estat;
+ unsigned long ticks, delta, period;
ktime_t expire, now;
struct loongarch_csrs *csr = vcpu->arch.csr;
/*
* Set guest stable timer cfg csr
+ * Disable timer before restore estat CSR register, avoid to
+ * get invalid timer interrupt for old timer cfg
*/
cfg = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG);
+
+ write_gcsr_timercfg(0);
kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ESTAT);
kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TCFG);
if (!(cfg & CSR_TCFG_EN)) {
@@ -108,23 +91,55 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu)
}
/*
+ * Freeze the soft-timer and sync the guest stable timer with it.
+ */
+ hrtimer_cancel(&vcpu->arch.swtimer);
+
+ /*
+ * From LoongArch Reference Manual Volume 1 Chapter 7.6.2
+ * If oneshot timer is fired, CSR TVAL will be -1, there are two
+ * conditions:
+ * 1) timer is fired during exiting to host
+ * 2) timer is fired and vm is doing timer irq, and then exiting to
+ * host. Host should not inject timer irq to avoid spurious
+ * timer interrupt again
+ */
+ ticks = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TVAL);
+ estat = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+ if (!(cfg & CSR_TCFG_PERIOD) && (ticks > cfg)) {
+ /*
+ * Writing 0 to LOONGARCH_CSR_TVAL will inject timer irq
+ * and set CSR TVAL with -1
+ */
+ write_gcsr_timertick(0);
+
+ /*
+ * Writing CSR_TINTCLR_TI to LOONGARCH_CSR_TINTCLR will clear
+ * timer interrupt, and CSR TVAL keeps unchanged with -1, it
+ * avoids spurious timer interrupt
+ */
+ if (!(estat & CPU_TIMER))
+ gcsr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR);
+ return;
+ }
+
+ /*
* Set remainder tick value if not expired
*/
+ delta = 0;
now = ktime_get();
expire = vcpu->arch.expire;
if (ktime_before(now, expire))
delta = ktime_to_tick(vcpu, ktime_sub(expire, now));
- else {
- if (cfg & CSR_TCFG_PERIOD) {
- period = cfg & CSR_TCFG_VAL;
- delta = ktime_to_tick(vcpu, ktime_sub(now, expire));
- delta = period - (delta % period);
- } else
- delta = 0;
+ else if (cfg & CSR_TCFG_PERIOD) {
+ period = cfg & CSR_TCFG_VAL;
+ delta = ktime_to_tick(vcpu, ktime_sub(now, expire));
+ delta = period - (delta % period);
+
/*
* Inject timer here though sw timer should inject timer
* interrupt async already, since sw timer may be cancelled
- * during injecting intr async in function kvm_acquire_timer
+ * during injecting intr async
*/
kvm_queue_irq(vcpu, INT_TI);
}
@@ -139,27 +154,41 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu)
*/
static void _kvm_save_timer(struct kvm_vcpu *vcpu)
{
- unsigned long ticks, delta;
+ unsigned long ticks, delta, cfg;
ktime_t expire;
struct loongarch_csrs *csr = vcpu->arch.csr;
+ cfg = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG);
ticks = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TVAL);
- delta = tick_to_ns(vcpu, ticks);
- expire = ktime_add_ns(ktime_get(), delta);
- vcpu->arch.expire = expire;
- if (ticks) {
+
+ /*
+ * From LoongArch Reference Manual Volume 1 Chapter 7.6.2
+ * If period timer is fired, CSR TVAL will be reloaded from CSR TCFG
+ * If oneshot timer is fired, CSR TVAL will be -1
+ * Here judge one-shot timer fired by checking whether TVAL is larger
+ * than TCFG
+ */
+ if (ticks < cfg) {
+ delta = tick_to_ns(vcpu, ticks);
+ expire = ktime_add_ns(ktime_get(), delta);
+ vcpu->arch.expire = expire;
+
/*
- * Update hrtimer to use new timeout
* HRTIMER_MODE_PINNED is suggested since vcpu may run in
* the same physical cpu in next time
*/
- hrtimer_cancel(&vcpu->arch.swtimer);
hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED);
- } else
+ } else if (vcpu->stat.generic.blocking) {
/*
- * Inject timer interrupt so that hall polling can dectect and exit
+ * Inject timer interrupt so that halt polling can dectect and exit.
+ * VCPU is scheduled out already and sleeps in rcuwait queue and
+ * will not poll pending events again. kvm_queue_irq() is not enough,
+ * hrtimer swtimer should be used here.
*/
- kvm_queue_irq(vcpu, INT_TI);
+ expire = ktime_add_ns(ktime_get(), 10);
+ vcpu->arch.expire = expire;
+ hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED);
+ }
}
/*
@@ -168,21 +197,15 @@ static void _kvm_save_timer(struct kvm_vcpu *vcpu)
*/
void kvm_save_timer(struct kvm_vcpu *vcpu)
{
- unsigned long cfg;
struct loongarch_csrs *csr = vcpu->arch.csr;
preempt_disable();
- cfg = read_csr_gcfg();
- if (!(cfg & CSR_GCFG_TIT)) {
- /* Disable guest use of hard timer */
- write_csr_gcfg(cfg | CSR_GCFG_TIT);
-
- /* Save hard timer state */
- kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TCFG);
- kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TVAL);
- if (kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG) & CSR_TCFG_EN)
- _kvm_save_timer(vcpu);
- }
+
+ /* Save hard timer state */
+ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TCFG);
+ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TVAL);
+ if (kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG) & CSR_TCFG_EN)
+ _kvm_save_timer(vcpu);
/* Save timer-related state to vCPU context */
kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ESTAT);
diff --git a/arch/loongarch/kvm/trace.h b/arch/loongarch/kvm/trace.h
index a1e35d655418..c2484ad4cffa 100644
--- a/arch/loongarch/kvm/trace.h
+++ b/arch/loongarch/kvm/trace.h
@@ -102,6 +102,8 @@ TRACE_EVENT(kvm_exit_gspr,
#define KVM_TRACE_AUX_DISCARD 4
#define KVM_TRACE_AUX_FPU 1
+#define KVM_TRACE_AUX_LSX 2
+#define KVM_TRACE_AUX_LASX 3
#define kvm_trace_symbol_aux_op \
{ KVM_TRACE_AUX_SAVE, "save" }, \
@@ -111,7 +113,9 @@ TRACE_EVENT(kvm_exit_gspr,
{ KVM_TRACE_AUX_DISCARD, "discard" }
#define kvm_trace_symbol_aux_state \
- { KVM_TRACE_AUX_FPU, "FPU" }
+ { KVM_TRACE_AUX_FPU, "FPU" }, \
+ { KVM_TRACE_AUX_LSX, "LSX" }, \
+ { KVM_TRACE_AUX_LASX, "LASX" }
TRACE_EVENT(kvm_aux,
TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op,
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 73d0c2b9c1a5..27701991886d 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -95,7 +95,6 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu)
* check vmid before vcpu enter guest
*/
local_irq_disable();
- kvm_acquire_timer(vcpu);
kvm_deliver_intr(vcpu);
kvm_deliver_exception(vcpu);
/* Make sure the vcpu mode has been written */
@@ -187,8 +186,15 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
- return kvm_pending_timer(vcpu) ||
+ int ret;
+
+ /* Protect from TOD sync and vcpu_load/put() */
+ preempt_disable();
+ ret = kvm_pending_timer(vcpu) ||
kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT) & (1 << INT_TI);
+ preempt_enable();
+
+ return ret;
}
int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu)
@@ -244,23 +250,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
return -EINVAL;
}
-/**
- * kvm_migrate_count() - Migrate timer.
- * @vcpu: Virtual CPU.
- *
- * Migrate hrtimer to the current CPU by cancelling and restarting it
- * if the hrtimer is active.
- *
- * Must be called when the vCPU is migrated to a different CPU, so that
- * the timer can interrupt the guest at the new CPU, and the timer irq can
- * be delivered to the vCPU.
- */
-static void kvm_migrate_count(struct kvm_vcpu *vcpu)
-{
- if (hrtimer_cancel(&vcpu->arch.swtimer))
- hrtimer_restart(&vcpu->arch.swtimer);
-}
-
static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 *val)
{
unsigned long gintc;
@@ -309,6 +298,76 @@ static int _kvm_setcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 val)
return ret;
}
+static int _kvm_get_cpucfg(int id, u64 *v)
+{
+ int ret = 0;
+
+ if (id < 0 && id >= KVM_MAX_CPUCFG_REGS)
+ return -EINVAL;
+
+ switch (id) {
+ case 2:
+ /* Return CPUCFG2 features which have been supported by KVM */
+ *v = CPUCFG2_FP | CPUCFG2_FPSP | CPUCFG2_FPDP |
+ CPUCFG2_FPVERS | CPUCFG2_LLFTP | CPUCFG2_LLFTPREV |
+ CPUCFG2_LAM;
+ /*
+ * If LSX is supported by CPU, it is also supported by KVM,
+ * as we implement it.
+ */
+ if (cpu_has_lsx)
+ *v |= CPUCFG2_LSX;
+ /*
+ * if LASX is supported by CPU, it is also supported by KVM,
+ * as we implement it.
+ */
+ if (cpu_has_lasx)
+ *v |= CPUCFG2_LASX;
+
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
+static int kvm_check_cpucfg(int id, u64 val)
+{
+ u64 mask;
+ int ret = 0;
+
+ if (id < 0 && id >= KVM_MAX_CPUCFG_REGS)
+ return -EINVAL;
+
+ if (_kvm_get_cpucfg(id, &mask))
+ return ret;
+
+ switch (id) {
+ case 2:
+ /* CPUCFG2 features checking */
+ if (val & ~mask)
+ /* The unsupported features should not be set */
+ ret = -EINVAL;
+ else if (!(val & CPUCFG2_LLFTP))
+ /* The LLFTP must be set, as guest must has a constant timer */
+ ret = -EINVAL;
+ else if ((val & CPUCFG2_FP) && (!(val & CPUCFG2_FPSP) || !(val & CPUCFG2_FPDP)))
+ /* Single and double float point must both be set when enable FP */
+ ret = -EINVAL;
+ else if ((val & CPUCFG2_LSX) && !(val & CPUCFG2_FP))
+ /* FP should be set when enable LSX */
+ ret = -EINVAL;
+ else if ((val & CPUCFG2_LASX) && !(val & CPUCFG2_LSX))
+ /* LSX, FP should be set when enable LASX, and FP has been checked before. */
+ ret = -EINVAL;
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
static int kvm_get_one_reg(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg, u64 *v)
{
@@ -378,10 +437,10 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu,
break;
case KVM_REG_LOONGARCH_CPUCFG:
id = KVM_GET_IOC_CPUCFG_IDX(reg->id);
- if (id >= 0 && id < KVM_MAX_CPUCFG_REGS)
- vcpu->arch.cpucfg[id] = (u32)v;
- else
- ret = -EINVAL;
+ ret = kvm_check_cpucfg(id, v);
+ if (ret)
+ break;
+ vcpu->arch.cpucfg[id] = (u32)v;
break;
case KVM_REG_LOONGARCH_KVM:
switch (reg->id) {
@@ -471,10 +530,94 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return -EINVAL;
}
+static int kvm_loongarch_cpucfg_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->attr) {
+ case 2:
+ return 0;
+ default:
+ return -ENXIO;
+ }
+
+ return -ENXIO;
+}
+
+static int kvm_loongarch_vcpu_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int ret = -ENXIO;
+
+ switch (attr->group) {
+ case KVM_LOONGARCH_VCPU_CPUCFG:
+ ret = kvm_loongarch_cpucfg_has_attr(vcpu, attr);
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static int kvm_loongarch_get_cpucfg_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int ret = 0;
+ uint64_t val;
+ uint64_t __user *uaddr = (uint64_t __user *)attr->addr;
+
+ ret = _kvm_get_cpucfg(attr->attr, &val);
+ if (ret)
+ return ret;
+
+ put_user(val, uaddr);
+
+ return ret;
+}
+
+static int kvm_loongarch_vcpu_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int ret = -ENXIO;
+
+ switch (attr->group) {
+ case KVM_LOONGARCH_VCPU_CPUCFG:
+ ret = kvm_loongarch_get_cpucfg_attr(vcpu, attr);
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static int kvm_loongarch_cpucfg_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ return -ENXIO;
+}
+
+static int kvm_loongarch_vcpu_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int ret = -ENXIO;
+
+ switch (attr->group) {
+ case KVM_LOONGARCH_VCPU_CPUCFG:
+ ret = kvm_loongarch_cpucfg_set_attr(vcpu, attr);
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
long r;
+ struct kvm_device_attr attr;
void __user *argp = (void __user *)arg;
struct kvm_vcpu *vcpu = filp->private_data;
@@ -514,6 +657,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
break;
}
+ case KVM_HAS_DEVICE_ATTR: {
+ r = -EFAULT;
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ break;
+ r = kvm_loongarch_vcpu_has_attr(vcpu, &attr);
+ break;
+ }
+ case KVM_GET_DEVICE_ATTR: {
+ r = -EFAULT;
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ break;
+ r = kvm_loongarch_vcpu_get_attr(vcpu, &attr);
+ break;
+ }
+ case KVM_SET_DEVICE_ATTR: {
+ r = -EFAULT;
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ break;
+ r = kvm_loongarch_vcpu_set_attr(vcpu, &attr);
+ break;
+ }
default:
r = -ENOIOCTLCMD;
break;
@@ -561,12 +725,96 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
preempt_enable();
}
+#ifdef CONFIG_CPU_HAS_LSX
+/* Enable LSX and restore context */
+int kvm_own_lsx(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_guest_has_fpu(&vcpu->arch) || !kvm_guest_has_lsx(&vcpu->arch))
+ return -EINVAL;
+
+ preempt_disable();
+
+ /* Enable LSX for guest */
+ set_csr_euen(CSR_EUEN_LSXEN | CSR_EUEN_FPEN);
+ switch (vcpu->arch.aux_inuse & KVM_LARCH_FPU) {
+ case KVM_LARCH_FPU:
+ /*
+ * Guest FPU state already loaded,
+ * only restore upper LSX state
+ */
+ _restore_lsx_upper(&vcpu->arch.fpu);
+ break;
+ default:
+ /* Neither FP or LSX already active,
+ * restore full LSX state
+ */
+ kvm_restore_lsx(&vcpu->arch.fpu);
+ break;
+ }
+
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_LSX);
+ vcpu->arch.aux_inuse |= KVM_LARCH_LSX | KVM_LARCH_FPU;
+ preempt_enable();
+
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_CPU_HAS_LASX
+/* Enable LASX and restore context */
+int kvm_own_lasx(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_guest_has_fpu(&vcpu->arch) || !kvm_guest_has_lsx(&vcpu->arch) || !kvm_guest_has_lasx(&vcpu->arch))
+ return -EINVAL;
+
+ preempt_disable();
+
+ set_csr_euen(CSR_EUEN_FPEN | CSR_EUEN_LSXEN | CSR_EUEN_LASXEN);
+ switch (vcpu->arch.aux_inuse & (KVM_LARCH_FPU | KVM_LARCH_LSX)) {
+ case KVM_LARCH_LSX:
+ case KVM_LARCH_LSX | KVM_LARCH_FPU:
+ /* Guest LSX state already loaded, only restore upper LASX state */
+ _restore_lasx_upper(&vcpu->arch.fpu);
+ break;
+ case KVM_LARCH_FPU:
+ /* Guest FP state already loaded, only restore upper LSX & LASX state */
+ _restore_lsx_upper(&vcpu->arch.fpu);
+ _restore_lasx_upper(&vcpu->arch.fpu);
+ break;
+ default:
+ /* Neither FP or LSX already active, restore full LASX state */
+ kvm_restore_lasx(&vcpu->arch.fpu);
+ break;
+ }
+
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_LASX);
+ vcpu->arch.aux_inuse |= KVM_LARCH_LASX | KVM_LARCH_LSX | KVM_LARCH_FPU;
+ preempt_enable();
+
+ return 0;
+}
+#endif
+
/* Save context and disable FPU */
void kvm_lose_fpu(struct kvm_vcpu *vcpu)
{
preempt_disable();
- if (vcpu->arch.aux_inuse & KVM_LARCH_FPU) {
+ if (vcpu->arch.aux_inuse & KVM_LARCH_LASX) {
+ kvm_save_lasx(&vcpu->arch.fpu);
+ vcpu->arch.aux_inuse &= ~(KVM_LARCH_LSX | KVM_LARCH_FPU | KVM_LARCH_LASX);
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_LASX);
+
+ /* Disable LASX & LSX & FPU */
+ clear_csr_euen(CSR_EUEN_FPEN | CSR_EUEN_LSXEN | CSR_EUEN_LASXEN);
+ } else if (vcpu->arch.aux_inuse & KVM_LARCH_LSX) {
+ kvm_save_lsx(&vcpu->arch.fpu);
+ vcpu->arch.aux_inuse &= ~(KVM_LARCH_LSX | KVM_LARCH_FPU);
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_LSX);
+
+ /* Disable LSX & FPU */
+ clear_csr_euen(CSR_EUEN_FPEN | CSR_EUEN_LSXEN);
+ } else if (vcpu->arch.aux_inuse & KVM_LARCH_FPU) {
kvm_save_fpu(&vcpu->arch.fpu);
vcpu->arch.aux_inuse &= ~KVM_LARCH_FPU;
trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU);
@@ -789,17 +1037,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
unsigned long flags;
local_irq_save(flags);
- if (vcpu->arch.last_sched_cpu != cpu) {
- kvm_debug("[%d->%d]KVM vCPU[%d] switch\n",
- vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
- /*
- * Migrate the timer interrupt to the current CPU so that it
- * always interrupts the guest and synchronously triggers a
- * guest timer interrupt.
- */
- kvm_migrate_count(vcpu);
- }
-
/* Restore guest state to registers */
_kvm_vcpu_load(vcpu, cpu);
local_irq_restore(flags);
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 54a85f1d4f2c..179f320cc231 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -810,8 +810,6 @@ int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
pgd_t *kvm_pgd_alloc(void);
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
/* Emulation */
enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index a8cdba75f98d..428141b0b48f 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -22,10 +22,9 @@ config KVM
select EXPORT_UASM
select PREEMPT_NOTIFIERS
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
- select HAVE_KVM_EVENTFD
select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_MMIO
- select MMU_NOTIFIER
+ select KVM_GENERIC_MMU_NOTIFIER
select INTERVAL_TREE
select KVM_GENERIC_HARDWARE_ENABLING
help
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8799b37be295..8abac532146e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -63,8 +63,6 @@
#include <linux/mmu_notifier.h>
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
#define HPTEG_CACHE_NUM (1 << 15)
#define HPTEG_HASH_BITS_PTE 13
#define HPTEG_HASH_BITS_PTE_LONG 12
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 902611954200..b47196085a42 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ if VIRTUALIZATION
config KVM
bool
select PREEMPT_NOTIFIERS
- select HAVE_KVM_EVENTFD
select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_VFIO
select IRQ_BYPASS_MANAGER
@@ -42,7 +41,7 @@ config KVM_BOOK3S_64_HANDLER
config KVM_BOOK3S_PR_POSSIBLE
bool
select KVM_MMIO
- select MMU_NOTIFIER
+ select KVM_GENERIC_MMU_NOTIFIER
config KVM_BOOK3S_HV_POSSIBLE
bool
@@ -85,7 +84,7 @@ config KVM_BOOK3S_64_HV
tristate "KVM for POWER7 and later using hypervisor mode in host"
depends on KVM_BOOK3S_64 && PPC_POWERNV
select KVM_BOOK3S_HV_POSSIBLE
- select MMU_NOTIFIER
+ select KVM_GENERIC_MMU_NOTIFIER
select CMA
help
Support running unmodified book3s_64 guest kernels in
@@ -194,7 +193,7 @@ config KVM_E500V2
depends on !CONTEXT_TRACKING_USER
select KVM
select KVM_MMIO
- select MMU_NOTIFIER
+ select KVM_GENERIC_MMU_NOTIFIER
help
Support running unmodified E500 guest kernels in virtual machines on
E500v2 host processors.
@@ -211,7 +210,7 @@ config KVM_E500MC
select KVM
select KVM_MMIO
select KVM_BOOKE_HV
- select MMU_NOTIFIER
+ select KVM_GENERIC_MMU_NOTIFIER
help
Support running unmodified E500MC/E5500/E6500 guest kernels in
virtual machines on E500MC/E5500/E6500 host processors.
@@ -225,7 +224,6 @@ config KVM_MPIC
bool "KVM in-kernel MPIC emulation"
depends on KVM && PPC_E500
select HAVE_KVM_IRQCHIP
- select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_MSI
help
@@ -238,7 +236,6 @@ config KVM_XICS
bool "KVM in-kernel XICS emulation"
depends on KVM_BOOK3S_64 && !KVM_MPIC
select HAVE_KVM_IRQCHIP
- select HAVE_KVM_IRQFD
default y
help
Include support for the XICS (eXternal Interrupt Controller
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1ed6ec140701..b5b04dd5d0ee 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -6210,7 +6210,7 @@ static int kvmhv_svm_off(struct kvm *kvm)
}
srcu_idx = srcu_read_lock(&kvm->srcu);
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
struct kvm_memory_slot *memslot;
struct kvm_memslots *slots = __kvm_memslots(kvm, i);
int bkt;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index f6af752698d0..c3f82e238b70 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -578,7 +578,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
#endif
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
case KVM_CAP_IRQFD_RESAMPLE:
r = !xive_enabled();
break;
@@ -632,13 +632,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
#endif
case KVM_CAP_SYNC_MMU:
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- r = hv_enabled;
-#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+ BUILD_BUG_ON(!IS_ENABLED(CONFIG_KVM_GENERIC_MMU_NOTIFIER));
r = 1;
-#else
- r = 0;
-#endif
break;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
case KVM_CAP_PPC_HTAB_FD:
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 525cba63e0c5..484d04a92fa6 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -274,8 +274,6 @@ struct kvm_vcpu_arch {
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12
void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index 148e52b516cf..1fd76aee3b71 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -20,9 +20,7 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)"
depends on RISCV_SBI && MMU
- select HAVE_KVM_EVENTFD
select HAVE_KVM_IRQCHIP
- select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_MSI
select HAVE_KVM_VCPU_ASYNC_IOCTL
@@ -30,7 +28,7 @@ config KVM
select KVM_GENERIC_HARDWARE_ENABLING
select KVM_MMIO
select KVM_XFER_TO_GUEST_WORK
- select MMU_NOTIFIER
+ select KVM_GENERIC_MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select SCHED_INFO
help
diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h
index 94b6919026df..796007125dff 100644
--- a/arch/s390/include/asm/facility.h
+++ b/arch/s390/include/asm/facility.h
@@ -111,4 +111,10 @@ static inline void stfle(u64 *stfle_fac_list, int size)
preempt_enable();
}
+/**
+ * stfle_size - Actual size of the facility list as specified by stfle
+ * (number of double words)
+ */
+unsigned int stfle_size(void);
+
#endif /* __ASM_FACILITY_H */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 67a298b6cf6e..52664105a473 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -818,7 +818,7 @@ struct s390_io_adapter {
struct kvm_s390_cpu_model {
/* facility mask supported by kvm & hosting machine */
- __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64];
+ __u64 fac_mask[S390_ARCH_FAC_MASK_SIZE_U64];
struct kvm_s390_vm_cpu_subfunc subfuncs;
/* facility list requested by guest (in dma page) */
__u64 *fac_list;
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 353def93973b..7a562b4199c8 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -41,7 +41,7 @@ obj-y += sysinfo.o lgr.o os_info.o ctlreg.o
obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
obj-y += entry.o reipl.o kdebugfs.o alternative.o
obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o
+obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o
extra-y += vmlinux.lds
diff --git a/arch/s390/kernel/facility.c b/arch/s390/kernel/facility.c
new file mode 100644
index 000000000000..f02127219a27
--- /dev/null
+++ b/arch/s390/kernel/facility.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2023
+ */
+
+#include <asm/facility.h>
+
+unsigned int stfle_size(void)
+{
+ static unsigned int size;
+ unsigned int r;
+ u64 dummy;
+
+ r = READ_ONCE(size);
+ if (!r) {
+ r = __stfle_asm(&dummy, 1) + 1;
+ WRITE_ONCE(size, r);
+ }
+ return r;
+}
+EXPORT_SYMBOL(stfle_size);
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 45fdf2a9b2e3..bb6d90351119 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -23,11 +23,9 @@ config KVM
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_VCPU_ASYNC_IOCTL
- select HAVE_KVM_EVENTFD
select KVM_ASYNC_PF
select KVM_ASYNC_PF_SYNC
select HAVE_KVM_IRQCHIP
- select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_INVALID_WAKEUPS
select HAVE_KVM_NO_POLL
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 8207a892bbe2..fef42e2a80a2 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -19,6 +19,7 @@
#include <asm/nmi.h>
#include <asm/dis.h>
#include <asm/fpu/api.h>
+#include <asm/facility.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -984,12 +985,26 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page)
static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
- __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U;
+ __u32 fac = READ_ONCE(vsie_page->scb_o->fac);
+ /*
+ * Alternate-STFLE-Interpretive-Execution facilities are not supported
+ * -> format-0 flcb
+ */
if (fac && test_kvm_facility(vcpu->kvm, 7)) {
retry_vsie_icpt(vsie_page);
+ /*
+ * The facility list origin (FLO) is in bits 1 - 28 of the FLD
+ * so we need to mask here before reading.
+ */
+ fac = fac & 0x7ffffff8U;
+ /*
+ * format-0 -> size of nested guest's facility list == guest's size
+ * guest's size == host's size, since STFLE is interpretatively executed
+ * using a format-0 for the guest, too.
+ */
if (read_guest_real(vcpu, fac, &vsie_page->fac,
- sizeof(vsie_page->fac)))
+ stfle_size() * sizeof(u64)))
return set_validity_icpt(scb_s, 0x1090U);
scb_s->fac = (__u32)(__u64) &vsie_page->fac;
}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d7036982332e..a565a2e70f30 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1255,6 +1255,7 @@ enum kvm_apicv_inhibit {
};
struct kvm_arch {
+ unsigned long vm_type;
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
@@ -1848,6 +1849,9 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
+void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -2086,6 +2090,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
+#ifdef CONFIG_KVM_PRIVATE_MEM
+#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM)
+#else
+#define kvm_arch_has_private_mem(kvm) false
+#endif
+
static inline u16 kvm_read_ldt(void)
{
u16 ldt;
@@ -2133,16 +2143,15 @@ enum {
#define HF_SMM_MASK (1 << 1)
#define HF_SMM_INSIDE_NMI_MASK (1 << 2)
-# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
-# define KVM_ADDRESS_SPACE_NUM 2
+# define KVM_MAX_NR_ADDRESS_SPACES 2
+/* SMM is currently unsupported for guests with private memory. */
+# define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2)
# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
#else
# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0)
#endif
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_cpu_has_extint(struct kvm_vcpu *v);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 1a6a1f987949..a448d0964fc0 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -562,4 +562,7 @@ struct kvm_pmu_event_filter {
/* x86-specific KVM_EXIT_HYPERCALL flags. */
#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0)
+#define KVM_X86_DEFAULT_VM 0
+#define KVM_X86_SW_PROTECTED_VM 1
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 950c12868d30..b07247b0b958 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -24,16 +24,14 @@ config KVM
depends on HIGH_RES_TIMERS
depends on X86_LOCAL_APIC
select PREEMPT_NOTIFIERS
- select MMU_NOTIFIER
+ select KVM_GENERIC_MMU_NOTIFIER
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
- select HAVE_KVM_IRQFD
select HAVE_KVM_DIRTY_RING_TSO
select HAVE_KVM_DIRTY_RING_ACQ_REL
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
- select HAVE_KVM_EVENTFD
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
@@ -77,6 +75,18 @@ config KVM_WERROR
If in doubt, say "N".
+config KVM_SW_PROTECTED_VM
+ bool "Enable support for KVM software-protected VMs"
+ depends on EXPERT
+ depends on X86_64
+ select KVM_GENERIC_PRIVATE_MEM
+ help
+ Enable support for KVM software-protected VMs. Currently "protected"
+ means the VM can be backed with memory provided by
+ KVM_CREATE_GUEST_MEMFD.
+
+ If unsure, say "N".
+
config KVM_INTEL
tristate "KVM for Intel (and compatible) processors support"
depends on KVM && IA32_FEAT_CTL
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index eea6ea7f14af..95ea1a1f7403 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -111,7 +111,7 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
mutex_lock(&kvm->slots_lock);
write_lock(&kvm->mmu_lock);
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
int bkt;
slots = __kvm_memslots(kvm, i);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c57e181bba21..59b026b6ad2a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -795,16 +795,26 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
return &slot->arch.lpage_info[level - 2][idx];
}
+/*
+ * The most significant bit in disallow_lpage tracks whether or not memory
+ * attributes are mixed, i.e. not identical for all gfns at the current level.
+ * The lower order bits are used to refcount other cases where a hugepage is
+ * disallowed, e.g. if KVM has shadow a page table at the gfn.
+ */
+#define KVM_LPAGE_MIXED_FLAG BIT(31)
+
static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
gfn_t gfn, int count)
{
struct kvm_lpage_info *linfo;
- int i;
+ int old, i;
for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
+
+ old = linfo->disallow_lpage;
linfo->disallow_lpage += count;
- WARN_ON_ONCE(linfo->disallow_lpage < 0);
+ WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
}
}
@@ -3056,7 +3066,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
*
* There are several ways to safely use this helper:
*
- * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
+ * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
* consuming it. In this case, mmu_lock doesn't need to be held during the
* lookup, but it does need to be held while checking the MMU notifier.
*
@@ -3137,9 +3147,9 @@ out:
return level;
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
- const struct kvm_memory_slot *slot, gfn_t gfn,
- int max_level)
+static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t gfn, int max_level, bool is_private)
{
struct kvm_lpage_info *linfo;
int host_level;
@@ -3151,6 +3161,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
break;
}
+ if (is_private)
+ return max_level;
+
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
@@ -3158,6 +3171,16 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
return min(host_level, max_level);
}
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ int max_level)
+{
+ bool is_private = kvm_slot_can_be_private(slot) &&
+ kvm_mem_is_private(kvm, gfn);
+
+ return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
+}
+
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
@@ -3178,8 +3201,9 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
- fault->gfn, fault->max_level);
+ fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
+ fault->gfn, fault->max_level,
+ fault->is_private);
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
return;
@@ -3739,7 +3763,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
kvm_page_track_write_tracking_enabled(kvm))
goto out_success;
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(slot, bkt, slots) {
/*
@@ -4259,6 +4283,55 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
}
+static inline u8 kvm_max_level_for_order(int order)
+{
+ BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
+
+ KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
+ return PG_LEVEL_1G;
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+ PAGE_SIZE, fault->write, fault->exec,
+ fault->is_private);
+}
+
+static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int max_order, r;
+
+ if (!kvm_slot_can_be_private(fault->slot)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
+ &max_order);
+ if (r) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return r;
+ }
+
+ fault->max_level = min(kvm_max_level_for_order(max_order),
+ fault->max_level);
+ fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+
+ return RET_PF_CONTINUE;
+}
+
static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
@@ -4291,6 +4364,14 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
return RET_PF_EMULATE;
}
+ if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ if (fault->is_private)
+ return kvm_faultin_pfn_private(vcpu, fault);
+
async = false;
fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
fault->write, &fault->map_writable,
@@ -4366,7 +4447,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
return true;
return fault->slot &&
- mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
+ mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
}
static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -6228,7 +6309,7 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
if (!kvm_memslots_have_rmaps(kvm))
return flush;
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
@@ -6260,7 +6341,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
write_lock(&kvm->mmu_lock);
- kvm_mmu_invalidate_begin(kvm, 0, -1ul);
+ kvm_mmu_invalidate_begin(kvm);
+
+ kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
@@ -6270,7 +6353,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (flush)
kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
- kvm_mmu_invalidate_end(kvm, 0, -1ul);
+ kvm_mmu_invalidate_end(kvm);
write_unlock(&kvm->mmu_lock);
}
@@ -6723,7 +6806,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
* modifier prior to checking for a wrap of the MMIO generation so
* that a wrap in any address space is detected.
*/
- gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+ gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1);
/*
* The very rare case: if the MMIO generation number has wrapped,
@@ -7176,3 +7259,163 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
if (kvm->arch.nx_huge_page_recovery_thread)
kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
}
+
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ /*
+ * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
+ * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
+ * can simply ignore such slots. But if userspace is making memory
+ * PRIVATE, then KVM must prevent the guest from accessing the memory
+ * as shared. And if userspace is making memory SHARED and this point
+ * is reached, then at least one page within the range was previously
+ * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
+ * Zapping SPTEs in this case ensures KVM will reassess whether or not
+ * a hugepage can be used for affected ranges.
+ */
+ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+ return false;
+
+ return kvm_unmap_gfn_range(kvm, range);
+}
+
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
+static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, int level, unsigned long attrs)
+{
+ const unsigned long start = gfn;
+ const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
+
+ if (level == PG_LEVEL_2M)
+ return kvm_range_has_memory_attributes(kvm, start, end, attrs);
+
+ for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
+ if (hugepage_test_mixed(slot, gfn, level - 1) ||
+ attrs != kvm_get_memory_attributes(kvm, gfn))
+ return false;
+ }
+ return true;
+}
+
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ unsigned long attrs = range->arg.attributes;
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * Calculate which ranges can be mapped with hugepages even if the slot
+ * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over
+ * a range that has PRIVATE GFNs, and conversely converting a range to
+ * SHARED may now allow hugepages.
+ */
+ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+ return false;
+
+ /*
+ * The sequence matters here: upper levels consume the result of lower
+ * level's scanning.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+ gfn_t gfn = gfn_round_for_level(range->start, level);
+
+ /* Process the head page if it straddles the range. */
+ if (gfn != range->start || gfn + nr_pages > range->end) {
+ /*
+ * Skip mixed tracking if the aligned gfn isn't covered
+ * by the memslot, KVM can't use a hugepage due to the
+ * misaligned address regardless of memory attributes.
+ */
+ if (gfn >= slot->base_gfn) {
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ gfn += nr_pages;
+ }
+
+ /*
+ * Pages entirely covered by the range are guaranteed to have
+ * only the attributes which were just set.
+ */
+ for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
+ hugepage_clear_mixed(slot, gfn, level);
+
+ /*
+ * Process the last tail page if it straddles the range and is
+ * contained by the memslot. Like the head page, KVM can't
+ * create a hugepage if the slot size is misaligned.
+ */
+ if (gfn < range->end &&
+ (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ }
+ return false;
+}
+
+void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ int level;
+
+ if (!kvm_arch_has_private_mem(kvm))
+ return;
+
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ /*
+ * Don't bother tracking mixed attributes for pages that can't
+ * be huge due to alignment, i.e. process only pages that are
+ * entirely contained by the memslot.
+ */
+ gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level);
+ gfn_t start = gfn_round_for_level(slot->base_gfn, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+ gfn_t gfn;
+
+ if (start < slot->base_gfn)
+ start += nr_pages;
+
+ /*
+ * Unlike setting attributes, every potential hugepage needs to
+ * be manually checked as the attributes may already be mixed.
+ */
+ for (gfn = start; gfn < end; gfn += nr_pages) {
+ unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
+
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ }
+}
+#endif
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index decc1f153669..b66a7d47e0e4 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -201,6 +201,7 @@ struct kvm_page_fault {
/* Derived from mmu and global state. */
const bool is_tdp;
+ const bool is_private;
const bool nx_huge_page_workaround_enabled;
/*
@@ -296,6 +297,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
.max_level = KVM_MAX_HUGEPAGE_LEVEL,
.req_level = PG_LEVEL_4K,
.goal_level = PG_LEVEL_4K,
+ .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
};
int r;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index be20a60047b1..40e3780d73ae 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6757,10 +6757,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
return;
/*
- * Grab the memslot so that the hva lookup for the mmu_notifier retry
- * is guaranteed to use the same memslot as the pfn lookup, i.e. rely
- * on the pfn lookup's validation of the memslot to ensure a valid hva
- * is used for the retry check.
+ * Explicitly grab the memslot using KVM's internal slot ID to ensure
+ * KVM doesn't unintentionally grab a userspace memslot. It _should_
+ * be impossible for userspace to create a memslot for the APIC when
+ * APICv is enabled, but paranoia won't hurt in this case.
*/
slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
@@ -6785,8 +6785,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
return;
read_lock(&vcpu->kvm->mmu_lock);
- if (mmu_invalidate_retry_hva(kvm, mmu_seq,
- gfn_to_hva_memslot(slot, gfn))) {
+ if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
read_unlock(&vcpu->kvm->mmu_lock);
goto out;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1a3aaa7dafae..ba8a8ea7bd6f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4548,6 +4548,13 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
return 0;
}
+static bool kvm_is_vm_type_supported(unsigned long type)
+{
+ return type == KVM_X86_DEFAULT_VM ||
+ (type == KVM_X86_SW_PROTECTED_VM &&
+ IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled);
+}
+
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r = 0;
@@ -4625,6 +4632,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
case KVM_CAP_IRQFD_RESAMPLE:
+ case KVM_CAP_MEMORY_FAULT_INFO:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4738,6 +4746,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_X86_NOTIFY_VMEXIT:
r = kvm_caps.has_notify_vmexit;
break;
+ case KVM_CAP_VM_TYPES:
+ r = BIT(KVM_X86_DEFAULT_VM);
+ if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM))
+ r |= BIT(KVM_X86_SW_PROTECTED_VM);
+ break;
default:
break;
}
@@ -11081,6 +11094,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
@@ -12434,9 +12448,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
int ret;
unsigned long flags;
- if (type)
+ if (!kvm_is_vm_type_supported(type))
return -EINVAL;
+ kvm->arch.vm_type = type;
+
ret = kvm_page_track_init(kvm);
if (ret)
goto out;
@@ -12575,8 +12591,8 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
hva = slot->userspace_addr;
}
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- struct kvm_userspace_memory_region m;
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ struct kvm_userspace_memory_region2 m;
m.slot = id | (i << 16);
m.flags = 0;
@@ -12726,6 +12742,10 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
}
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+ kvm_mmu_init_memslot_memory_attributes(kvm, slot);
+#endif
+
if (kvm_page_track_create_memslot(kvm, slot, npages))
goto out_free;
diff --git a/drivers/s390/char/uvdevice.c b/drivers/s390/char/uvdevice.c
index 144cd2e03590..42c9f77f8da0 100644
--- a/drivers/s390/char/uvdevice.c
+++ b/drivers/s390/char/uvdevice.c
@@ -109,6 +109,7 @@ static int uvio_copy_attest_result_to_user(struct uv_cb_attest *uvcb_attest,
struct uvio_attest *uvio_attest)
{
struct uvio_attest __user *user_uvio_attest = (void __user *)uv_ioctl->argument_addr;
+ u32 __user *user_buf_add_len = (u32 __user *)&user_uvio_attest->add_data_len;
void __user *user_buf_add = (void __user *)uvio_attest->add_data_addr;
void __user *user_buf_meas = (void __user *)uvio_attest->meas_addr;
void __user *user_buf_uid = &user_uvio_attest->config_uid;
@@ -117,6 +118,8 @@ static int uvio_copy_attest_result_to_user(struct uv_cb_attest *uvcb_attest,
return -EFAULT;
if (add_data && copy_to_user(user_buf_add, add_data, uvio_attest->add_data_len))
return -EFAULT;
+ if (put_user(uvio_attest->add_data_len, user_buf_add_len))
+ return -EFAULT;
if (copy_to_user(user_buf_uid, uvcb_attest->config_uid, sizeof(uvcb_attest->config_uid)))
return -EFAULT;
return 0;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index d26222b7eefe..0496cb5b6eab 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,7 +79,7 @@ static struct file *__anon_inode_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode,
- bool secure)
+ bool make_inode)
{
struct inode *inode;
struct file *file;
@@ -87,7 +87,7 @@ static struct file *__anon_inode_getfile(const char *name,
if (fops->owner && !try_module_get(fops->owner))
return ERR_PTR(-ENOENT);
- if (secure) {
+ if (make_inode) {
inode = anon_inode_make_secure_inode(name, context_inode);
if (IS_ERR(inode)) {
file = ERR_CAST(inode);
@@ -149,13 +149,10 @@ struct file *anon_inode_getfile(const char *name,
EXPORT_SYMBOL_GPL(anon_inode_getfile);
/**
- * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new
+ * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new
* !S_PRIVATE anon inode rather than reuse the
* singleton anon inode and calls the
- * inode_init_security_anon() LSM hook. This
- * allows for both the inode to have its own
- * security context and for the LSM to enforce
- * policy on the inode's creation.
+ * inode_init_security_anon() LSM hook.
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
@@ -164,11 +161,21 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile);
* @context_inode:
* [in] the logical relationship with the new inode (optional)
*
+ * Create a new anonymous inode and file pair. This can be done for two
+ * reasons:
+ *
+ * - for the inode to have its own security context, so that LSMs can enforce
+ * policy on the inode's creation;
+ *
+ * - if the caller needs a unique inode, for example in order to customize
+ * the size returned by fstat()
+ *
* The LSM may use @context_inode in inode_init_security_anon(), but a
- * reference to it is not held. Returns the newly created file* or an error
- * pointer. See the anon_inode_getfile() documentation for more information.
+ * reference to it is not held.
+ *
+ * Returns the newly created file* or an error pointer.
*/
-struct file *anon_inode_getfile_secure(const char *name,
+struct file *anon_inode_create_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode)
@@ -176,12 +183,13 @@ struct file *anon_inode_getfile_secure(const char *name,
return __anon_inode_getfile(name, fops, priv, flags,
context_inode, true);
}
+EXPORT_SYMBOL_GPL(anon_inode_create_getfile);
static int __anon_inode_getfd(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode,
- bool secure)
+ bool make_inode)
{
int error, fd;
struct file *file;
@@ -192,7 +200,7 @@ static int __anon_inode_getfd(const char *name,
fd = error;
file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
- secure);
+ make_inode);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_put_unused_fd;
@@ -231,10 +239,9 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
EXPORT_SYMBOL_GPL(anon_inode_getfd);
/**
- * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new
+ * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new
* !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls
- * the inode_init_security_anon() LSM hook. This allows the inode to have its
- * own security context and for a LSM to reject creation of the inode.
+ * the inode_init_security_anon() LSM hook.
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
@@ -243,16 +250,26 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd);
* @context_inode:
* [in] the logical relationship with the new inode (optional)
*
+ * Create a new anonymous inode and file pair. This can be done for two
+ * reasons:
+ *
+ * - for the inode to have its own security context, so that LSMs can enforce
+ * policy on the inode's creation;
+ *
+ * - if the caller needs a unique inode, for example in order to customize
+ * the size returned by fstat()
+ *
* The LSM may use @context_inode in inode_init_security_anon(), but a
* reference to it is not held.
+ *
+ * Returns a newly created file descriptor or an error code.
*/
-int anon_inode_getfd_secure(const char *name, const struct file_operations *fops,
+int anon_inode_create_getfd(const char *name, const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode)
{
return __anon_inode_getfd(name, fops, priv, flags, context_inode, true);
}
-EXPORT_SYMBOL_GPL(anon_inode_getfd_secure);
static int __init anon_inode_init(void)
{
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e8af40b05549..38b10390a8aa 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1033,7 +1033,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new,
{
int fd;
- fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
+ fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new,
O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
if (fd < 0)
return fd;
@@ -2189,7 +2189,8 @@ static int new_userfaultfd(int flags)
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
- fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
+ /* Create a new inode so that the LSM can block the creation. */
+ fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
if (fd < 0) {
mmdrop(ctx->mm);
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index 5deaddbd7927..93a5f16d03f3 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -15,13 +15,13 @@ struct inode;
struct file *anon_inode_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags);
-struct file *anon_inode_getfile_secure(const char *name,
+struct file *anon_inode_create_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode);
int anon_inode_getfd(const char *name, const struct file_operations *fops,
void *priv, int flags);
-int anon_inode_getfd_secure(const char *name,
+int anon_inode_create_getfd(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4944136efaa2..7e7fd25b09b3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -80,8 +80,8 @@
/* Two fragments for cross MMIO pages. */
#define KVM_MAX_MMIO_FRAGMENTS 2
-#ifndef KVM_ADDRESS_SPACE_NUM
-#define KVM_ADDRESS_SPACE_NUM 1
+#ifndef KVM_MAX_NR_ADDRESS_SPACES
+#define KVM_MAX_NR_ADDRESS_SPACES 1
#endif
/*
@@ -253,9 +253,10 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#endif
-#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
union kvm_mmu_notifier_arg {
pte_t pte;
+ unsigned long attributes;
};
struct kvm_gfn_range {
@@ -588,8 +589,20 @@ struct kvm_memory_slot {
u32 flags;
short id;
u16 as_id;
+
+#ifdef CONFIG_KVM_PRIVATE_MEM
+ struct {
+ struct file __rcu *file;
+ pgoff_t pgoff;
+ } gmem;
+#endif
};
+static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
+{
+ return slot && (slot->flags & KVM_MEM_GUEST_MEMFD);
+}
+
static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
{
return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -677,13 +690,29 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm);
#define KVM_MEM_SLOTS_NUM SHRT_MAX
#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS)
-#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+#if KVM_MAX_NR_ADDRESS_SPACES == 1
+static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm)
+{
+ return KVM_MAX_NR_ADDRESS_SPACES;
+}
+
static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
{
return 0;
}
#endif
+/*
+ * Arch code must define kvm_arch_has_private_mem if support for private memory
+ * is enabled.
+ */
+#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
+static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
+{
+ return false;
+}
+#endif
+
struct kvm_memslots {
u64 generation;
atomic_long_t last_used_slot;
@@ -721,9 +750,9 @@ struct kvm {
struct mm_struct *mm; /* userspace tied to this vm */
unsigned long nr_memslot_pages;
/* The two memslot sets - active and inactive (per address space) */
- struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2];
+ struct kvm_memslots __memslots[KVM_MAX_NR_ADDRESS_SPACES][2];
/* The current active memslot set for each address space */
- struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
+ struct kvm_memslots __rcu *memslots[KVM_MAX_NR_ADDRESS_SPACES];
struct xarray vcpu_array;
/*
* Protected by slots_lock, but can be read outside if an
@@ -753,7 +782,7 @@ struct kvm {
struct list_head vm_list;
struct mutex lock;
struct kvm_io_bus __rcu *buses[KVM_NR_BUSES];
-#ifdef CONFIG_HAVE_KVM_EVENTFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
struct {
spinlock_t lock;
struct list_head items;
@@ -761,8 +790,8 @@ struct kvm {
struct list_head resampler_list;
struct mutex resampler_lock;
} irqfds;
- struct list_head ioeventfds;
#endif
+ struct list_head ioeventfds;
struct kvm_vm_stat stat;
struct kvm_arch arch;
refcount_t users_count;
@@ -778,17 +807,16 @@ struct kvm {
* Update side is protected by irq_lock.
*/
struct kvm_irq_routing_table __rcu *irq_routing;
-#endif
-#ifdef CONFIG_HAVE_KVM_IRQFD
+
struct hlist_head irq_ack_notifier_list;
#endif
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
struct mmu_notifier mmu_notifier;
unsigned long mmu_invalidate_seq;
long mmu_invalidate_in_progress;
- unsigned long mmu_invalidate_range_start;
- unsigned long mmu_invalidate_range_end;
+ gfn_t mmu_invalidate_range_start;
+ gfn_t mmu_invalidate_range_end;
#endif
struct list_head devices;
u64 manual_dirty_log_protect;
@@ -807,6 +835,10 @@ struct kvm {
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
struct notifier_block pm_notifier;
#endif
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+ /* Protected by slots_locks (for writes) and RCU (for reads) */
+ struct xarray mem_attr_array;
+#endif
char stats_id[KVM_STATS_NAME_SIZE];
};
@@ -965,7 +997,7 @@ static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm)
}
#endif
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
int kvm_irqfd_init(void);
void kvm_irqfd_exit(void);
#else
@@ -989,7 +1021,7 @@ void kvm_put_kvm_no_destroy(struct kvm *kvm);
static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
{
- as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM);
+ as_id = array_index_nospec(as_id, KVM_MAX_NR_ADDRESS_SPACES);
return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
lockdep_is_held(&kvm->slots_lock) ||
!refcount_read(&kvm->users_count));
@@ -1146,9 +1178,9 @@ enum kvm_mr_change {
};
int kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem);
+ const struct kvm_userspace_memory_region2 *mem);
int __kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem);
+ const struct kvm_userspace_memory_region2 *mem);
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -1392,10 +1424,10 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
#endif
-void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
- unsigned long end);
-void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
- unsigned long end);
+void kvm_mmu_invalidate_begin(struct kvm *kvm);
+void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
+void kvm_mmu_invalidate_end(struct kvm *kvm);
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
@@ -1947,7 +1979,7 @@ extern const struct _kvm_stats_desc kvm_vm_stats_desc[];
extern const struct kvm_stats_header kvm_vcpu_stats_header;
extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq)
{
if (unlikely(kvm->mmu_invalidate_in_progress))
@@ -1970,9 +2002,9 @@ static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq)
return 0;
}
-static inline int mmu_invalidate_retry_hva(struct kvm *kvm,
+static inline int mmu_invalidate_retry_gfn(struct kvm *kvm,
unsigned long mmu_seq,
- unsigned long hva)
+ gfn_t gfn)
{
lockdep_assert_held(&kvm->mmu_lock);
/*
@@ -1981,10 +2013,20 @@ static inline int mmu_invalidate_retry_hva(struct kvm *kvm,
* that might be being invalidated. Note that it may include some false
* positives, due to shortcuts when handing concurrent invalidations.
*/
- if (unlikely(kvm->mmu_invalidate_in_progress) &&
- hva >= kvm->mmu_invalidate_range_start &&
- hva < kvm->mmu_invalidate_range_end)
- return 1;
+ if (unlikely(kvm->mmu_invalidate_in_progress)) {
+ /*
+ * Dropping mmu_lock after bumping mmu_invalidate_in_progress
+ * but before updating the range is a KVM bug.
+ */
+ if (WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA ||
+ kvm->mmu_invalidate_range_end == INVALID_GPA))
+ return 1;
+
+ if (gfn >= kvm->mmu_invalidate_range_start &&
+ gfn < kvm->mmu_invalidate_range_end)
+ return 1;
+ }
+
if (kvm->mmu_invalidate_seq != mmu_seq)
return 1;
return 0;
@@ -2013,12 +2055,10 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
-#ifdef CONFIG_HAVE_KVM_EVENTFD
-
void kvm_eventfd_init(struct kvm *kvm);
int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
void kvm_irqfd_release(struct kvm *kvm);
bool kvm_notify_irqfd_resampler(struct kvm *kvm,
@@ -2039,31 +2079,7 @@ static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm,
{
return false;
}
-#endif
-
-#else
-
-static inline void kvm_eventfd_init(struct kvm *kvm) {}
-
-static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
-{
- return -EINVAL;
-}
-
-static inline void kvm_irqfd_release(struct kvm *kvm) {}
-
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
-static inline void kvm_irq_routing_update(struct kvm *kvm)
-{
-}
-#endif
-
-static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
-{
- return -ENOSYS;
-}
-
-#endif /* CONFIG_HAVE_KVM_EVENTFD */
+#endif /* CONFIG_HAVE_KVM_IRQCHIP */
void kvm_arch_irq_routing_update(struct kvm *kvm);
@@ -2318,4 +2334,57 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
/* Max number of entries allowed for each kvm dirty ring */
#define KVM_DIRTY_RING_MAX_ENTRIES 65536
+static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ gpa_t gpa, gpa_t size,
+ bool is_write, bool is_exec,
+ bool is_private)
+{
+ vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
+ vcpu->run->memory_fault.gpa = gpa;
+ vcpu->run->memory_fault.size = size;
+
+ /* RWX flags are not (yet) defined or communicated to userspace. */
+ vcpu->run->memory_fault.flags = 0;
+ if (is_private)
+ vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
+}
+
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+ return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
+}
+
+bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+ unsigned long attrs);
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range);
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range);
+
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+ return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
+ kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+#else
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+ return false;
+}
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+
+#ifdef CONFIG_KVM_PRIVATE_MEM
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
+#else
+static inline int kvm_gmem_get_pfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t *pfn, int *max_order)
+{
+ KVM_BUG_ON(1, kvm);
+ return -EIO;
+}
+#endif /* CONFIG_KVM_PRIVATE_MEM */
+
#endif
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 6f4737d5046a..9d1f7835d8c1 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -6,6 +6,7 @@
struct kvm;
struct kvm_async_pf;
struct kvm_device_ops;
+struct kvm_gfn_range;
struct kvm_interrupt;
struct kvm_irq_routing_table;
struct kvm_memory_slot;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 06142ff7f9ce..2df35e65557d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -206,6 +206,7 @@ enum mapping_flags {
AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */
AS_STABLE_WRITES, /* must wait for writeback before modifying
folio contents */
+ AS_UNMOVABLE, /* The mapping cannot be moved, ever */
};
/**
@@ -306,6 +307,22 @@ static inline void mapping_clear_stable_writes(struct address_space *mapping)
clear_bit(AS_STABLE_WRITES, &mapping->flags);
}
+static inline void mapping_set_unmovable(struct address_space *mapping)
+{
+ /*
+ * It's expected unmovable mappings are also unevictable. Compaction
+ * migrate scanner (isolate_migratepages_block()) relies on this to
+ * reduce page locking.
+ */
+ set_bit(AS_UNEVICTABLE, &mapping->flags);
+ set_bit(AS_UNMOVABLE, &mapping->flags);
+}
+
+static inline bool mapping_unmovable(struct address_space *mapping)
+{
+ return test_bit(AS_UNMOVABLE, &mapping->flags);
+}
+
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
return mapping->gfp_mask;
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 3bd31ea23fee..011fba6b5552 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -62,7 +62,7 @@ TRACE_EVENT(kvm_vcpu_wakeup,
__entry->valid ? "valid" : "invalid")
);
-#if defined(CONFIG_HAVE_KVM_IRQFD)
+#if defined(CONFIG_HAVE_KVM_IRQCHIP)
TRACE_EVENT(kvm_set_irq,
TP_PROTO(unsigned int gsi, int level, int irq_source_id),
TP_ARGS(gsi, level, irq_source_id),
@@ -82,7 +82,7 @@ TRACE_EVENT(kvm_set_irq,
TP_printk("gsi %u level %d source %d",
__entry->gsi, __entry->level, __entry->irq_source_id)
);
-#endif /* defined(CONFIG_HAVE_KVM_IRQFD) */
+#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
#if defined(__KVM_HAVE_IOAPIC)
#define kvm_deliver_mode \
@@ -170,7 +170,7 @@ TRACE_EVENT(kvm_msi_set_irq,
#endif /* defined(__KVM_HAVE_IOAPIC) */
-#if defined(CONFIG_HAVE_KVM_IRQFD)
+#if defined(CONFIG_HAVE_KVM_IRQCHIP)
#ifdef kvm_irqchips
#define kvm_ack_irq_string "irqchip %s pin %u"
@@ -197,7 +197,7 @@ TRACE_EVENT(kvm_ack_irq,
TP_printk(kvm_ack_irq_string, kvm_ack_irq_parm)
);
-#endif /* defined(CONFIG_HAVE_KVM_IRQFD) */
+#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 211b86de35ac..b1f92a0edc35 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -16,76 +16,6 @@
#define KVM_API_VERSION 12
-/* *** Deprecated interfaces *** */
-
-#define KVM_TRC_SHIFT 16
-
-#define KVM_TRC_ENTRYEXIT (1 << KVM_TRC_SHIFT)
-#define KVM_TRC_HANDLER (1 << (KVM_TRC_SHIFT + 1))
-
-#define KVM_TRC_VMENTRY (KVM_TRC_ENTRYEXIT + 0x01)
-#define KVM_TRC_VMEXIT (KVM_TRC_ENTRYEXIT + 0x02)
-#define KVM_TRC_PAGE_FAULT (KVM_TRC_HANDLER + 0x01)
-
-#define KVM_TRC_HEAD_SIZE 12
-#define KVM_TRC_CYCLE_SIZE 8
-#define KVM_TRC_EXTRA_MAX 7
-
-#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
-#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
-#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04)
-#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05)
-#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06)
-#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07)
-#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08)
-#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09)
-#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A)
-#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B)
-#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C)
-#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D)
-#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E)
-#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F)
-#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10)
-#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11)
-#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12)
-#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13)
-#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14)
-#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15)
-#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16)
-#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17)
-#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18)
-#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19)
-
-struct kvm_user_trace_setup {
- __u32 buf_size;
- __u32 buf_nr;
-};
-
-#define __KVM_DEPRECATED_MAIN_W_0x06 \
- _IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
-#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07)
-#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08)
-
-#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq)
-
-struct kvm_breakpoint {
- __u32 enabled;
- __u32 padding;
- __u64 address;
-};
-
-struct kvm_debug_guest {
- __u32 enabled;
- __u32 pad;
- struct kvm_breakpoint breakpoints[4];
- __u32 singlestep;
-};
-
-#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest)
-
-/* *** End of deprecated interfaces *** */
-
-
/* for KVM_SET_USER_MEMORY_REGION */
struct kvm_userspace_memory_region {
__u32 slot;
@@ -95,6 +25,19 @@ struct kvm_userspace_memory_region {
__u64 userspace_addr; /* start of the userspace allocated memory */
};
+/* for KVM_SET_USER_MEMORY_REGION2 */
+struct kvm_userspace_memory_region2 {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 userspace_addr;
+ __u64 guest_memfd_offset;
+ __u32 guest_memfd;
+ __u32 pad1;
+ __u64 pad2[14];
+};
+
/*
* The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for
* userspace, other bits are reserved for kvm internal use which are defined
@@ -102,6 +45,7 @@ struct kvm_userspace_memory_region {
*/
#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
#define KVM_MEM_READONLY (1UL << 1)
+#define KVM_MEM_GUEST_MEMFD (1UL << 2)
/* for KVM_IRQ_LINE */
struct kvm_irq_level {
@@ -265,6 +209,7 @@ struct kvm_xen_exit {
#define KVM_EXIT_RISCV_CSR 36
#define KVM_EXIT_NOTIFY 37
#define KVM_EXIT_LOONGARCH_IOCSR 38
+#define KVM_EXIT_MEMORY_FAULT 39
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -518,6 +463,13 @@ struct kvm_run {
#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0)
__u32 flags;
} notify;
+ /* KVM_EXIT_MEMORY_FAULT */
+ struct {
+#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3)
+ __u64 flags;
+ __u64 gpa;
+ __u64 size;
+ } memory_fault;
/* Fix the size of the union. */
char padding[256];
};
@@ -945,9 +897,6 @@ struct kvm_ppc_resize_hpt {
*/
#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */
#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2)
-#define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06
-#define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07
-#define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08
#define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2)
#define KVM_GET_MSR_FEATURE_INDEX_LIST _IOWR(KVMIO, 0x0a, struct kvm_msr_list)
@@ -1201,6 +1150,11 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
#define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
#define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230
+#define KVM_CAP_USER_MEMORY2 231
+#define KVM_CAP_MEMORY_FAULT_INFO 232
+#define KVM_CAP_MEMORY_ATTRIBUTES 233
+#define KVM_CAP_GUEST_MEMFD 234
+#define KVM_CAP_VM_TYPES 235
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1483,6 +1437,8 @@ struct kvm_vfio_spapr_tce {
struct kvm_userspace_memory_region)
#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
+#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \
+ struct kvm_userspace_memory_region2)
/* enable ucontrol for s390 */
struct kvm_s390_ucas_mapping {
@@ -1507,20 +1463,8 @@ struct kvm_s390_ucas_mapping {
_IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone)
#define KVM_UNREGISTER_COALESCED_MMIO \
_IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone)
-#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
- struct kvm_assigned_pci_dev)
#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing)
-/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
-#define KVM_ASSIGN_IRQ __KVM_DEPRECATED_VM_R_0x70
-#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq)
#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71)
-#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
- struct kvm_assigned_pci_dev)
-#define KVM_ASSIGN_SET_MSIX_NR _IOW(KVMIO, 0x73, \
- struct kvm_assigned_msix_nr)
-#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO, 0x74, \
- struct kvm_assigned_msix_entry)
-#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
#define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd)
#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config)
#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78)
@@ -1537,9 +1481,6 @@ struct kvm_s390_ucas_mapping {
* KVM_CAP_VM_TSC_CONTROL to set defaults for a VM */
#define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
#define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
-/* Available with KVM_CAP_PCI_2_3 */
-#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \
- struct kvm_assigned_pci_dev)
/* Available with KVM_CAP_SIGNAL_MSI */
#define KVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi)
/* Available with KVM_CAP_PPC_GET_SMMU_INFO */
@@ -1592,8 +1533,6 @@ struct kvm_s390_ucas_mapping {
#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs)
#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation)
#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt)
-/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */
-#define KVM_DEBUG_GUEST __KVM_DEPRECATED_VCPU_W_0x87
#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs)
#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs)
#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid)
@@ -2267,4 +2206,24 @@ struct kvm_s390_zpci_op {
/* flags for kvm_s390_zpci_op->u.reg_aen.flags */
#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0)
+/* Available with KVM_CAP_MEMORY_ATTRIBUTES */
+#define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes)
+
+struct kvm_memory_attributes {
+ __u64 address;
+ __u64 size;
+ __u64 attributes;
+ __u64 flags;
+};
+
+#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
+
+#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
+
+struct kvm_create_guest_memfd {
+ __u64 size;
+ __u64 flags;
+ __u64 reserved[6];
+};
+
#endif /* __LINUX_KVM_H */
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9626a363f121..0efd775cb3ad 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3881,7 +3881,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return ERR_PTR(ret);
#endif
- file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+ /* Create a new inode so that the LSM can block the creation. */
+ file = anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC, NULL);
#if defined(CONFIG_UNIX)
if (IS_ERR(file)) {
diff --git a/mm/compaction.c b/mm/compaction.c
index 01ba298739dd..e2d0c604ae42 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -882,6 +882,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
+ bool is_dirty, is_unevictable;
if (skip_on_failure && low_pfn >= next_skip_pfn) {
/*
@@ -1079,8 +1080,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!folio_test_lru(folio))
goto isolate_fail_put;
+ is_unevictable = folio_test_unevictable(folio);
+
/* Compaction might skip unevictable pages but CMA takes them */
- if (!(mode & ISOLATE_UNEVICTABLE) && folio_test_unevictable(folio))
+ if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable)
goto isolate_fail_put;
/*
@@ -1092,26 +1095,42 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio))
goto isolate_fail_put;
- if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_dirty(folio)) {
- bool migrate_dirty;
+ is_dirty = folio_test_dirty(folio);
+
+ if (((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) ||
+ (mapping && is_unevictable)) {
+ bool migrate_dirty = true;
+ bool is_unmovable;
/*
* Only folios without mappings or that have
- * a ->migrate_folio callback are possible to
- * migrate without blocking. However, we may
- * be racing with truncation, which can free
- * the mapping. Truncation holds the folio lock
- * until after the folio is removed from the page
- * cache so holding it ourselves is sufficient.
+ * a ->migrate_folio callback are possible to migrate
+ * without blocking.
+ *
+ * Folios from unmovable mappings are not migratable.
+ *
+ * However, we can be racing with truncation, which can
+ * free the mapping that we need to check. Truncation
+ * holds the folio lock until after the folio is removed
+ * from the page so holding it ourselves is sufficient.
+ *
+ * To avoid locking the folio just to check unmovable,
+ * assume every unmovable folio is also unevictable,
+ * which is a cheaper test. If our assumption goes
+ * wrong, it's not a correctness bug, just potentially
+ * wasted cycles.
*/
if (!folio_trylock(folio))
goto isolate_fail_put;
mapping = folio_mapping(folio);
- migrate_dirty = !mapping ||
- mapping->a_ops->migrate_folio;
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) {
+ migrate_dirty = !mapping ||
+ mapping->a_ops->migrate_folio;
+ }
+ is_unmovable = mapping && mapping_unmovable(mapping);
folio_unlock(folio);
- if (!migrate_dirty)
+ if (!migrate_dirty || is_unmovable)
goto isolate_fail_put;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 35a88334bb3c..ff0fbf95a384 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -955,6 +955,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
if (!mapping)
rc = migrate_folio(mapping, dst, src, mode);
+ else if (mapping_unmovable(mapping))
+ rc = -EOPNOTSUPP;
else if (mapping->a_ops->migrate_folio)
/*
* Most folios have a mapping and most filesystems
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 017efabcf502..492e937fab00 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -77,11 +77,12 @@ TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_tlb_flush
TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test
TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
-TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test
TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test
TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
+TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test
+TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test
TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test
@@ -124,6 +125,7 @@ TEST_GEN_PROGS_x86_64 += access_tracking_perf_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
+TEST_GEN_PROGS_x86_64 += guest_memfd_test
TEST_GEN_PROGS_x86_64 += guest_print_test
TEST_GEN_PROGS_x86_64 += hardware_disable_test
TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
index eb4217b7c768..08a5ca5bed56 100644
--- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c
+++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
@@ -705,7 +705,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
print_test_banner(mode, p);
- vm = ____vm_create(mode);
+ vm = ____vm_create(VM_SHAPE(mode));
setup_memslots(vm, p);
kvm_vm_elf_load(vm, program_invocation_name);
setup_ucall(vm);
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index 936f3a8d1b83..6cbecf499767 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -699,7 +699,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu,
pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
- vm = __vm_create(mode, 1, extra_mem_pages);
+ vm = __vm_create(VM_SHAPE(mode), 1, extra_mem_pages);
log_mode_create_vm_done(vm);
*vcpu = vm_vcpu_add(vm, 0, guest_code);
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
new file mode 100644
index 000000000000..c78a98c1a915
--- /dev/null
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright Intel Corporation, 2023
+ *
+ * Author: Chao Peng <chao.p.peng@linux.intel.com>
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+
+#include <linux/bitmap.h>
+#include <linux/falloc.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "test_util.h"
+#include "kvm_util_base.h"
+
+static void test_file_read_write(int fd)
+{
+ char buf[64];
+
+ TEST_ASSERT(read(fd, buf, sizeof(buf)) < 0,
+ "read on a guest_mem fd should fail");
+ TEST_ASSERT(write(fd, buf, sizeof(buf)) < 0,
+ "write on a guest_mem fd should fail");
+ TEST_ASSERT(pread(fd, buf, sizeof(buf), 0) < 0,
+ "pread on a guest_mem fd should fail");
+ TEST_ASSERT(pwrite(fd, buf, sizeof(buf), 0) < 0,
+ "pwrite on a guest_mem fd should fail");
+}
+
+static void test_mmap(int fd, size_t page_size)
+{
+ char *mem;
+
+ mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ TEST_ASSERT_EQ(mem, MAP_FAILED);
+}
+
+static void test_file_size(int fd, size_t page_size, size_t total_size)
+{
+ struct stat sb;
+ int ret;
+
+ ret = fstat(fd, &sb);
+ TEST_ASSERT(!ret, "fstat should succeed");
+ TEST_ASSERT_EQ(sb.st_size, total_size);
+ TEST_ASSERT_EQ(sb.st_blksize, page_size);
+}
+
+static void test_fallocate(int fd, size_t page_size, size_t total_size)
+{
+ int ret;
+
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, total_size);
+ TEST_ASSERT(!ret, "fallocate with aligned offset and size should succeed");
+
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ page_size - 1, page_size);
+ TEST_ASSERT(ret, "fallocate with unaligned offset should fail");
+
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, total_size, page_size);
+ TEST_ASSERT(ret, "fallocate beginning at total_size should fail");
+
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, total_size + page_size, page_size);
+ TEST_ASSERT(ret, "fallocate beginning after total_size should fail");
+
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ total_size, page_size);
+ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) at total_size should succeed");
+
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ total_size + page_size, page_size);
+ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) after total_size should succeed");
+
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ page_size, page_size - 1);
+ TEST_ASSERT(ret, "fallocate with unaligned size should fail");
+
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ page_size, page_size);
+ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) with aligned offset and size should succeed");
+
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, page_size, page_size);
+ TEST_ASSERT(!ret, "fallocate to restore punched hole should succeed");
+}
+
+static void test_invalid_punch_hole(int fd, size_t page_size, size_t total_size)
+{
+ struct {
+ off_t offset;
+ off_t len;
+ } testcases[] = {
+ {0, 1},
+ {0, page_size - 1},
+ {0, page_size + 1},
+
+ {1, 1},
+ {1, page_size - 1},
+ {1, page_size},
+ {1, page_size + 1},
+
+ {page_size, 1},
+ {page_size, page_size - 1},
+ {page_size, page_size + 1},
+ };
+ int ret, i;
+
+ for (i = 0; i < ARRAY_SIZE(testcases); i++) {
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ testcases[i].offset, testcases[i].len);
+ TEST_ASSERT(ret == -1 && errno == EINVAL,
+ "PUNCH_HOLE with !PAGE_SIZE offset (%lx) and/or length (%lx) should fail",
+ testcases[i].offset, testcases[i].len);
+ }
+}
+
+static void test_create_guest_memfd_invalid(struct kvm_vm *vm)
+{
+ size_t page_size = getpagesize();
+ uint64_t flag;
+ size_t size;
+ int fd;
+
+ for (size = 1; size < page_size; size++) {
+ fd = __vm_create_guest_memfd(vm, size, 0);
+ TEST_ASSERT(fd == -1 && errno == EINVAL,
+ "guest_memfd() with non-page-aligned page size '0x%lx' should fail with EINVAL",
+ size);
+ }
+
+ for (flag = 0; flag; flag <<= 1) {
+ fd = __vm_create_guest_memfd(vm, page_size, flag);
+ TEST_ASSERT(fd == -1 && errno == EINVAL,
+ "guest_memfd() with flag '0x%lx' should fail with EINVAL",
+ flag);
+ }
+}
+
+static void test_create_guest_memfd_multiple(struct kvm_vm *vm)
+{
+ int fd1, fd2, ret;
+ struct stat st1, st2;
+
+ fd1 = __vm_create_guest_memfd(vm, 4096, 0);
+ TEST_ASSERT(fd1 != -1, "memfd creation should succeed");
+
+ ret = fstat(fd1, &st1);
+ TEST_ASSERT(ret != -1, "memfd fstat should succeed");
+ TEST_ASSERT(st1.st_size == 4096, "memfd st_size should match requested size");
+
+ fd2 = __vm_create_guest_memfd(vm, 8192, 0);
+ TEST_ASSERT(fd2 != -1, "memfd creation should succeed");
+
+ ret = fstat(fd2, &st2);
+ TEST_ASSERT(ret != -1, "memfd fstat should succeed");
+ TEST_ASSERT(st2.st_size == 8192, "second memfd st_size should match requested size");
+
+ ret = fstat(fd1, &st1);
+ TEST_ASSERT(ret != -1, "memfd fstat should succeed");
+ TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size");
+ TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers");
+}
+
+int main(int argc, char *argv[])
+{
+ size_t page_size;
+ size_t total_size;
+ int fd;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
+
+ page_size = getpagesize();
+ total_size = page_size * 4;
+
+ vm = vm_create_barebones();
+
+ test_create_guest_memfd_invalid(vm);
+ test_create_guest_memfd_multiple(vm);
+
+ fd = vm_create_guest_memfd(vm, total_size, 0);
+
+ test_file_read_write(fd);
+ test_mmap(fd, page_size);
+ test_file_size(fd, page_size, total_size);
+ test_fallocate(fd, page_size, total_size);
+ test_invalid_punch_hole(fd, page_size, total_size);
+
+ close(fd);
+}
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index e112ee30867f..4c8a05cbc20c 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -44,7 +44,7 @@ typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
struct userspace_mem_region {
- struct kvm_userspace_memory_region region;
+ struct kvm_userspace_memory_region2 region;
struct sparsebit *unused_phy_pages;
int fd;
off_t offset;
@@ -189,6 +189,23 @@ enum vm_guest_mode {
NUM_VM_MODES,
};
+struct vm_shape {
+ enum vm_guest_mode mode;
+ unsigned int type;
+};
+
+#define VM_TYPE_DEFAULT 0
+
+#define VM_SHAPE(__mode) \
+({ \
+ struct vm_shape shape = { \
+ .mode = (__mode), \
+ .type = VM_TYPE_DEFAULT \
+ }; \
+ \
+ shape; \
+})
+
#if defined(__aarch64__)
extern enum vm_guest_mode vm_mode_default;
@@ -221,6 +238,8 @@ extern enum vm_guest_mode vm_mode_default;
#endif
+#define VM_SHAPE_DEFAULT VM_SHAPE(VM_MODE_DEFAULT)
+
#define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT)
#define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE)
@@ -249,6 +268,13 @@ static inline bool kvm_has_cap(long cap)
#define __KVM_SYSCALL_ERROR(_name, _ret) \
"%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno)
+/*
+ * Use the "inner", double-underscore macro when reporting errors from within
+ * other macros so that the name of ioctl() and not its literal numeric value
+ * is printed on error. The "outer" macro is strongly preferred when reporting
+ * errors "directly", i.e. without an additional layer of macros, as it reduces
+ * the probability of passing in the wrong string.
+ */
#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret)
#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret)
@@ -261,17 +287,13 @@ static inline bool kvm_has_cap(long cap)
#define __kvm_ioctl(kvm_fd, cmd, arg) \
kvm_do_ioctl(kvm_fd, cmd, arg)
-
-#define _kvm_ioctl(kvm_fd, cmd, name, arg) \
+#define kvm_ioctl(kvm_fd, cmd, arg) \
({ \
int ret = __kvm_ioctl(kvm_fd, cmd, arg); \
\
- TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \
+ TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \
})
-#define kvm_ioctl(kvm_fd, cmd, arg) \
- _kvm_ioctl(kvm_fd, cmd, #cmd, arg)
-
static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { }
#define __vm_ioctl(vm, cmd, arg) \
@@ -280,17 +302,42 @@ static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { }
kvm_do_ioctl((vm)->fd, cmd, arg); \
})
-#define _vm_ioctl(vm, cmd, name, arg) \
+/*
+ * Assert that a VM or vCPU ioctl() succeeded, with extra magic to detect if
+ * the ioctl() failed because KVM killed/bugged the VM. To detect a dead VM,
+ * probe KVM_CAP_USER_MEMORY, which (a) has been supported by KVM since before
+ * selftests existed and (b) should never outright fail, i.e. is supposed to
+ * return 0 or 1. If KVM kills a VM, KVM returns -EIO for all ioctl()s for the
+ * VM and its vCPUs, including KVM_CHECK_EXTENSION.
+ */
+#define __TEST_ASSERT_VM_VCPU_IOCTL(cond, name, ret, vm) \
+do { \
+ int __errno = errno; \
+ \
+ static_assert_is_vm(vm); \
+ \
+ if (cond) \
+ break; \
+ \
+ if (errno == EIO && \
+ __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY) < 0) { \
+ TEST_ASSERT(errno == EIO, "KVM killed the VM, should return -EIO"); \
+ TEST_FAIL("KVM killed/bugged the VM, check the kernel log for clues"); \
+ } \
+ errno = __errno; \
+ TEST_ASSERT(cond, __KVM_IOCTL_ERROR(name, ret)); \
+} while (0)
+
+#define TEST_ASSERT_VM_VCPU_IOCTL(cond, cmd, ret, vm) \
+ __TEST_ASSERT_VM_VCPU_IOCTL(cond, #cmd, ret, vm)
+
+#define vm_ioctl(vm, cmd, arg) \
({ \
int ret = __vm_ioctl(vm, cmd, arg); \
\
- TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \
+ __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \
})
-#define vm_ioctl(vm, cmd, arg) \
- _vm_ioctl(vm, cmd, #cmd, arg)
-
-
static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { }
#define __vcpu_ioctl(vcpu, cmd, arg) \
@@ -299,16 +346,13 @@ static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { }
kvm_do_ioctl((vcpu)->fd, cmd, arg); \
})
-#define _vcpu_ioctl(vcpu, cmd, name, arg) \
+#define vcpu_ioctl(vcpu, cmd, arg) \
({ \
int ret = __vcpu_ioctl(vcpu, cmd, arg); \
\
- TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \
+ __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, (vcpu)->vm); \
})
-#define vcpu_ioctl(vcpu, cmd, arg) \
- _vcpu_ioctl(vcpu, cmd, #cmd, arg)
-
/*
* Looks up and returns the value corresponding to the capability
* (KVM_CAP_*) given by cap.
@@ -317,7 +361,7 @@ static inline int vm_check_cap(struct kvm_vm *vm, long cap)
{
int ret = __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)cap);
- TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret));
+ TEST_ASSERT_VM_VCPU_IOCTL(ret >= 0, KVM_CHECK_EXTENSION, ret, vm);
return ret;
}
@@ -334,6 +378,54 @@ static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0)
vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap);
}
+static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa,
+ uint64_t size, uint64_t attributes)
+{
+ struct kvm_memory_attributes attr = {
+ .attributes = attributes,
+ .address = gpa,
+ .size = size,
+ .flags = 0,
+ };
+
+ /*
+ * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows
+ * need significant enhancements to support multiple attributes.
+ */
+ TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE,
+ "Update me to support multiple attributes!");
+
+ vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr);
+}
+
+
+static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa,
+ uint64_t size)
+{
+ vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
+}
+
+static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa,
+ uint64_t size)
+{
+ vm_set_memory_attributes(vm, gpa, size, 0);
+}
+
+void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size,
+ bool punch_hole);
+
+static inline void vm_guest_mem_punch_hole(struct kvm_vm *vm, uint64_t gpa,
+ uint64_t size)
+{
+ vm_guest_mem_fallocate(vm, gpa, size, true);
+}
+
+static inline void vm_guest_mem_allocate(struct kvm_vm *vm, uint64_t gpa,
+ uint64_t size)
+{
+ vm_guest_mem_fallocate(vm, gpa, size, false);
+}
+
void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
const char *vm_guest_mode_string(uint32_t i);
@@ -376,7 +468,7 @@ static inline int vm_get_stats_fd(struct kvm_vm *vm)
{
int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL);
- TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_GET_STATS_FD, fd));
+ TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vm);
return fd;
}
@@ -432,14 +524,44 @@ static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name)
void vm_create_irqchip(struct kvm_vm *vm);
+static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size,
+ uint64_t flags)
+{
+ struct kvm_create_guest_memfd guest_memfd = {
+ .size = size,
+ .flags = flags,
+ };
+
+ return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, &guest_memfd);
+}
+
+static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size,
+ uint64_t flags)
+{
+ int fd = __vm_create_guest_memfd(vm, size, flags);
+
+ TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd));
+ return fd;
+}
+
void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
uint64_t gpa, uint64_t size, void *hva);
int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
uint64_t gpa, uint64_t size, void *hva);
+void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
+ uint64_t gpa, uint64_t size, void *hva,
+ uint32_t guest_memfd, uint64_t guest_memfd_offset);
+int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
+ uint64_t gpa, uint64_t size, void *hva,
+ uint32_t guest_memfd, uint64_t guest_memfd_offset);
+
void vm_userspace_mem_region_add(struct kvm_vm *vm,
enum vm_mem_backing_src_type src_type,
uint64_t guest_paddr, uint32_t slot, uint64_t npages,
uint32_t flags);
+void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
+ uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+ uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset);
void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
@@ -588,7 +710,7 @@ static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu)
{
int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL);
- TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_GET_STATS_FD, fd));
+ TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_CHECK_EXTENSION, fd, vcpu->vm);
return fd;
}
@@ -714,21 +836,33 @@ vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm);
* __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to
* calculate the amount of memory needed for per-vCPU data, e.g. stacks.
*/
-struct kvm_vm *____vm_create(enum vm_guest_mode mode);
-struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
+struct kvm_vm *____vm_create(struct vm_shape shape);
+struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
uint64_t nr_extra_pages);
static inline struct kvm_vm *vm_create_barebones(void)
{
- return ____vm_create(VM_MODE_DEFAULT);
+ return ____vm_create(VM_SHAPE_DEFAULT);
}
+#ifdef __x86_64__
+static inline struct kvm_vm *vm_create_barebones_protected_vm(void)
+{
+ const struct vm_shape shape = {
+ .mode = VM_MODE_DEFAULT,
+ .type = KVM_X86_SW_PROTECTED_VM,
+ };
+
+ return ____vm_create(shape);
+}
+#endif
+
static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus)
{
- return __vm_create(VM_MODE_DEFAULT, nr_runnable_vcpus, 0);
+ return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0);
}
-struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
+struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus,
uint64_t extra_mem_pages,
void *guest_code, struct kvm_vcpu *vcpus[]);
@@ -736,17 +870,27 @@ static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus,
void *guest_code,
struct kvm_vcpu *vcpus[])
{
- return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, 0,
+ return __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, 0,
guest_code, vcpus);
}
+
+struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape,
+ struct kvm_vcpu **vcpu,
+ uint64_t extra_mem_pages,
+ void *guest_code);
+
/*
* Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages
* additional pages of guest memory. Returns the VM and vCPU (via out param).
*/
-struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
- uint64_t extra_mem_pages,
- void *guest_code);
+static inline struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
+ uint64_t extra_mem_pages,
+ void *guest_code)
+{
+ return __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, vcpu,
+ extra_mem_pages, guest_code);
+}
static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
void *guest_code)
@@ -754,6 +898,13 @@ static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
return __vm_create_with_one_vcpu(vcpu, 0, guest_code);
}
+static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape,
+ struct kvm_vcpu **vcpu,
+ void *guest_code)
+{
+ return __vm_create_shape_with_one_vcpu(shape, vcpu, 0, guest_code);
+}
+
struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm);
void kvm_pin_this_task_to_pcpu(uint32_t pcpu);
@@ -777,10 +928,6 @@ vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
return n;
}
-struct kvm_userspace_memory_region *
-kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
- uint64_t end);
-
#define sync_global_to_guest(vm, g) ({ \
typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \
memcpy(_p, &(g), sizeof(g)); \
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index 8e5f413a593d..71a41fa924b7 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -142,6 +142,11 @@ static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t)
return vm_mem_backing_src_alias(t)->flag & MAP_SHARED;
}
+static inline bool backing_src_can_be_huge(enum vm_mem_backing_src_type t)
+{
+ return t != VM_MEM_SRC_ANONYMOUS && t != VM_MEM_SRC_SHMEM;
+}
+
/* Aligns x up to the next multiple of size. Size must be a power of 2. */
static inline uint64_t align_up(uint64_t x, uint64_t size)
{
@@ -186,7 +191,7 @@ static inline uint32_t atoi_non_negative(const char *name, const char *num_str)
}
int guest_vsnprintf(char *buf, int n, const char *fmt, va_list args);
-int guest_snprintf(char *buf, int n, const char *fmt, ...);
+__printf(3, 4) int guest_snprintf(char *buf, int n, const char *fmt, ...);
char *strdup_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2), nonnull(1)));
diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h
index ce33d306c2cb..d9d6581b8d4f 100644
--- a/tools/testing/selftests/kvm/include/ucall_common.h
+++ b/tools/testing/selftests/kvm/include/ucall_common.h
@@ -34,9 +34,10 @@ void ucall_arch_do_ucall(vm_vaddr_t uc);
void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu);
void ucall(uint64_t cmd, int nargs, ...);
-void ucall_fmt(uint64_t cmd, const char *fmt, ...);
-void ucall_assert(uint64_t cmd, const char *exp, const char *file,
- unsigned int line, const char *fmt, ...);
+__printf(2, 3) void ucall_fmt(uint64_t cmd, const char *fmt, ...);
+__printf(5, 6) void ucall_assert(uint64_t cmd, const char *exp,
+ const char *file, unsigned int line,
+ const char *fmt, ...);
uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc);
void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa);
int ucall_nr_pages_required(uint64_t page_size);
@@ -52,6 +53,17 @@ int ucall_nr_pages_required(uint64_t page_size);
#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \
ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
#define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage)
+#define GUEST_SYNC1(arg0) ucall(UCALL_SYNC, 1, arg0)
+#define GUEST_SYNC2(arg0, arg1) ucall(UCALL_SYNC, 2, arg0, arg1)
+#define GUEST_SYNC3(arg0, arg1, arg2) \
+ ucall(UCALL_SYNC, 3, arg0, arg1, arg2)
+#define GUEST_SYNC4(arg0, arg1, arg2, arg3) \
+ ucall(UCALL_SYNC, 4, arg0, arg1, arg2, arg3)
+#define GUEST_SYNC5(arg0, arg1, arg2, arg3, arg4) \
+ ucall(UCALL_SYNC, 5, arg0, arg1, arg2, arg3, arg4)
+#define GUEST_SYNC6(arg0, arg1, arg2, arg3, arg4, arg5) \
+ ucall(UCALL_SYNC, 6, arg0, arg1, arg2, arg3, arg4, arg5)
+
#define GUEST_PRINTF(_fmt, _args...) ucall_fmt(UCALL_PRINTF, _fmt, ##_args)
#define GUEST_DONE() ucall(UCALL_DONE, 0)
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 25bc61dac5fb..a84863503fcb 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -15,6 +15,7 @@
#include <asm/msr-index.h>
#include <asm/prctl.h>
+#include <linux/kvm_para.h>
#include <linux/stringify.h>
#include "../kvm_util.h"
@@ -1194,6 +1195,20 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1);
void xen_hypercall(uint64_t nr, uint64_t a0, void *a1);
+static inline uint64_t __kvm_hypercall_map_gpa_range(uint64_t gpa,
+ uint64_t size, uint64_t flags)
+{
+ return kvm_hypercall(KVM_HC_MAP_GPA_RANGE, gpa, size >> PAGE_SHIFT, flags, 0);
+}
+
+static inline void kvm_hypercall_map_gpa_range(uint64_t gpa, uint64_t size,
+ uint64_t flags)
+{
+ uint64_t ret = __kvm_hypercall_map_gpa_range(gpa, size, flags);
+
+ GUEST_ASSERT(!ret);
+}
+
void __vm_xsave_require_permission(uint64_t xfeature, const char *name);
#define vm_xsave_require_permission(xfeature) \
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
index 69f26d80c821..e37dc9c21888 100644
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -254,7 +254,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
/* Create a VM with enough guest pages */
guest_num_pages = test_mem_size / guest_page_size;
- vm = __vm_create_with_vcpus(mode, nr_vcpus, guest_num_pages,
+ vm = __vm_create_with_vcpus(VM_SHAPE(mode), nr_vcpus, guest_num_pages,
guest_code, test_args.vcpus);
/* Align down GPA of the testing memslot */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 7a8af1821f5d..17a978b8a2c4 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -209,7 +209,7 @@ __weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm)
(1ULL << (vm->va_bits - 1)) >> vm->page_shift);
}
-struct kvm_vm *____vm_create(enum vm_guest_mode mode)
+struct kvm_vm *____vm_create(struct vm_shape shape)
{
struct kvm_vm *vm;
@@ -221,13 +221,13 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode)
vm->regions.hva_tree = RB_ROOT;
hash_init(vm->regions.slot_hash);
- vm->mode = mode;
- vm->type = 0;
+ vm->mode = shape.mode;
+ vm->type = shape.type;
- vm->pa_bits = vm_guest_mode_params[mode].pa_bits;
- vm->va_bits = vm_guest_mode_params[mode].va_bits;
- vm->page_size = vm_guest_mode_params[mode].page_size;
- vm->page_shift = vm_guest_mode_params[mode].page_shift;
+ vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits;
+ vm->va_bits = vm_guest_mode_params[vm->mode].va_bits;
+ vm->page_size = vm_guest_mode_params[vm->mode].page_size;
+ vm->page_shift = vm_guest_mode_params[vm->mode].page_shift;
/* Setup mode specific traits. */
switch (vm->mode) {
@@ -265,7 +265,7 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode)
/*
* Ignore KVM support for 5-level paging (vm->va_bits == 57),
* it doesn't take effect unless a CR4.LA57 is set, which it
- * isn't for this VM_MODE.
+ * isn't for this mode (48-bit virtual address space).
*/
TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57,
"Linear address width (%d bits) not supported",
@@ -285,10 +285,11 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode)
vm->pgtable_levels = 5;
break;
default:
- TEST_FAIL("Unknown guest mode, mode: 0x%x", mode);
+ TEST_FAIL("Unknown guest mode: 0x%x", vm->mode);
}
#ifdef __aarch64__
+ TEST_ASSERT(!vm->type, "ARM doesn't support test-provided types");
if (vm->pa_bits != 40)
vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
#endif
@@ -347,19 +348,19 @@ static uint64_t vm_nr_pages_required(enum vm_guest_mode mode,
return vm_adjust_num_guest_pages(mode, nr_pages);
}
-struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
+struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
uint64_t nr_extra_pages)
{
- uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus,
+ uint64_t nr_pages = vm_nr_pages_required(shape.mode, nr_runnable_vcpus,
nr_extra_pages);
struct userspace_mem_region *slot0;
struct kvm_vm *vm;
int i;
- pr_debug("%s: mode='%s' pages='%ld'\n", __func__,
- vm_guest_mode_string(mode), nr_pages);
+ pr_debug("%s: mode='%s' type='%d', pages='%ld'\n", __func__,
+ vm_guest_mode_string(shape.mode), shape.type, nr_pages);
- vm = ____vm_create(mode);
+ vm = ____vm_create(shape);
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
for (i = 0; i < NR_MEM_REGIONS; i++)
@@ -400,7 +401,7 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
* extra_mem_pages is only used to calculate the maximum page table size,
* no real memory allocation for non-slot0 memory in this function.
*/
-struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
+struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus,
uint64_t extra_mem_pages,
void *guest_code, struct kvm_vcpu *vcpus[])
{
@@ -409,7 +410,7 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus
TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array");
- vm = __vm_create(mode, nr_vcpus, extra_mem_pages);
+ vm = __vm_create(shape, nr_vcpus, extra_mem_pages);
for (i = 0; i < nr_vcpus; ++i)
vcpus[i] = vm_vcpu_add(vm, i, guest_code);
@@ -417,15 +418,15 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus
return vm;
}
-struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
- uint64_t extra_mem_pages,
- void *guest_code)
+struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape,
+ struct kvm_vcpu **vcpu,
+ uint64_t extra_mem_pages,
+ void *guest_code)
{
struct kvm_vcpu *vcpus[1];
struct kvm_vm *vm;
- vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, extra_mem_pages,
- guest_code, vcpus);
+ vm = __vm_create_with_vcpus(shape, 1, extra_mem_pages, guest_code, vcpus);
*vcpu = vcpus[0];
return vm;
@@ -453,8 +454,9 @@ void kvm_vm_restart(struct kvm_vm *vmp)
vm_create_irqchip(vmp);
hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) {
- int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
- TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+ int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION2, &region->region);
+
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
" rc: %i errno: %i\n"
" slot: %u flags: 0x%x\n"
" guest_phys_addr: 0x%llx size: 0x%llx",
@@ -590,35 +592,6 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
return NULL;
}
-/*
- * KVM Userspace Memory Region Find
- *
- * Input Args:
- * vm - Virtual Machine
- * start - Starting VM physical address
- * end - Ending VM physical address, inclusive.
- *
- * Output Args: None
- *
- * Return:
- * Pointer to overlapping region, NULL if no such region.
- *
- * Public interface to userspace_mem_region_find. Allows tests to look up
- * the memslot datastructure for a given range of guest physical memory.
- */
-struct kvm_userspace_memory_region *
-kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
- uint64_t end)
-{
- struct userspace_mem_region *region;
-
- region = userspace_mem_region_find(vm, start, end);
- if (!region)
- return NULL;
-
- return &region->region;
-}
-
__weak void vcpu_arch_free(struct kvm_vcpu *vcpu)
{
@@ -686,7 +659,7 @@ static void __vm_mem_region_delete(struct kvm_vm *vm,
}
region->region.memory_size = 0;
- vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
+ vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
sparsebit_free(&region->unused_phy_pages);
ret = munmap(region->mmap_start, region->mmap_size);
@@ -697,6 +670,8 @@ static void __vm_mem_region_delete(struct kvm_vm *vm,
TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
close(region->fd);
}
+ if (region->region.guest_memfd >= 0)
+ close(region->region.guest_memfd);
free(region);
}
@@ -898,36 +873,44 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
errno, strerror(errno));
}
-/*
- * VM Userspace Memory Region Add
- *
- * Input Args:
- * vm - Virtual Machine
- * src_type - Storage source for this region.
- * NULL to use anonymous memory.
- * guest_paddr - Starting guest physical address
- * slot - KVM region slot
- * npages - Number of physical pages
- * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES)
- *
- * Output Args: None
- *
- * Return: None
- *
- * Allocates a memory area of the number of pages specified by npages
- * and maps it to the VM specified by vm, at a starting physical address
- * given by guest_paddr. The region is created with a KVM region slot
- * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM. The
- * region is created with the flags given by flags.
- */
-void vm_userspace_mem_region_add(struct kvm_vm *vm,
- enum vm_mem_backing_src_type src_type,
- uint64_t guest_paddr, uint32_t slot, uint64_t npages,
- uint32_t flags)
+int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
+ uint64_t gpa, uint64_t size, void *hva,
+ uint32_t guest_memfd, uint64_t guest_memfd_offset)
+{
+ struct kvm_userspace_memory_region2 region = {
+ .slot = slot,
+ .flags = flags,
+ .guest_phys_addr = gpa,
+ .memory_size = size,
+ .userspace_addr = (uintptr_t)hva,
+ .guest_memfd = guest_memfd,
+ .guest_memfd_offset = guest_memfd_offset,
+ };
+
+ return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, &region);
+}
+
+void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
+ uint64_t gpa, uint64_t size, void *hva,
+ uint32_t guest_memfd, uint64_t guest_memfd_offset)
+{
+ int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva,
+ guest_memfd, guest_memfd_offset);
+
+ TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)",
+ errno, strerror(errno));
+}
+
+
+/* FIXME: This thing needs to be ripped apart and rewritten. */
+void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
+ uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+ uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset)
{
int ret;
struct userspace_mem_region *region;
size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
+ size_t mem_size = npages * vm->page_size;
size_t alignment;
TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
@@ -980,7 +963,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
/* Allocate and initialize new mem region structure. */
region = calloc(1, sizeof(*region));
TEST_ASSERT(region != NULL, "Insufficient Memory");
- region->mmap_size = npages * vm->page_size;
+ region->mmap_size = mem_size;
#ifdef __s390x__
/* On s390x, the host address must be aligned to 1M (due to PGSTEs) */
@@ -1027,14 +1010,38 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
/* As needed perform madvise */
if ((src_type == VM_MEM_SRC_ANONYMOUS ||
src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
- ret = madvise(region->host_mem, npages * vm->page_size,
+ ret = madvise(region->host_mem, mem_size,
src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s",
- region->host_mem, npages * vm->page_size,
+ region->host_mem, mem_size,
vm_mem_backing_src_alias(src_type)->name);
}
region->backing_src_type = src_type;
+
+ if (flags & KVM_MEM_GUEST_MEMFD) {
+ if (guest_memfd < 0) {
+ uint32_t guest_memfd_flags = 0;
+ TEST_ASSERT(!guest_memfd_offset,
+ "Offset must be zero when creating new guest_memfd");
+ guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
+ } else {
+ /*
+ * Install a unique fd for each memslot so that the fd
+ * can be closed when the region is deleted without
+ * needing to track if the fd is owned by the framework
+ * or by the caller.
+ */
+ guest_memfd = dup(guest_memfd);
+ TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd));
+ }
+
+ region->region.guest_memfd = guest_memfd;
+ region->region.guest_memfd_offset = guest_memfd_offset;
+ } else {
+ region->region.guest_memfd = -1;
+ }
+
region->unused_phy_pages = sparsebit_alloc();
sparsebit_set_num(region->unused_phy_pages,
guest_paddr >> vm->page_shift, npages);
@@ -1043,13 +1050,14 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->region.guest_phys_addr = guest_paddr;
region->region.memory_size = npages * vm->page_size;
region->region.userspace_addr = (uintptr_t) region->host_mem;
- ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
- TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+ ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
" rc: %i errno: %i\n"
" slot: %u flags: 0x%x\n"
- " guest_phys_addr: 0x%lx size: 0x%lx",
+ " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d\n",
ret, errno, slot, flags,
- guest_paddr, (uint64_t) region->region.memory_size);
+ guest_paddr, (uint64_t) region->region.memory_size,
+ region->region.guest_memfd);
/* Add to quick lookup data structures */
vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
@@ -1070,6 +1078,14 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
}
}
+void vm_userspace_mem_region_add(struct kvm_vm *vm,
+ enum vm_mem_backing_src_type src_type,
+ uint64_t guest_paddr, uint32_t slot,
+ uint64_t npages, uint32_t flags)
+{
+ vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0);
+}
+
/*
* Memslot to region
*
@@ -1126,9 +1142,9 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
region->region.flags = flags;
- ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
+ ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
- TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
" rc: %i errno: %i slot: %u flags: 0x%x",
ret, errno, slot, flags);
}
@@ -1156,9 +1172,9 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
region->region.guest_phys_addr = new_gpa;
- ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
+ ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
- TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n"
+ TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed\n"
"ret: %i errno: %i slot: %u new_gpa: 0x%lx",
ret, errno, slot, new_gpa);
}
@@ -1181,6 +1197,34 @@ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
__vm_mem_region_delete(vm, memslot2region(vm, slot), true);
}
+void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size,
+ bool punch_hole)
+{
+ const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0);
+ struct userspace_mem_region *region;
+ uint64_t end = base + size;
+ uint64_t gpa, len;
+ off_t fd_offset;
+ int ret;
+
+ for (gpa = base; gpa < end; gpa += len) {
+ uint64_t offset;
+
+ region = userspace_mem_region_find(vm, gpa, gpa);
+ TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
+ "Private memory region not found for GPA 0x%lx", gpa);
+
+ offset = gpa - region->region.guest_phys_addr;
+ fd_offset = region->region.guest_memfd_offset + offset;
+ len = min_t(uint64_t, end - gpa, region->region.memory_size - offset);
+
+ ret = fallocate(region->region.guest_memfd, mode, fd_offset, len);
+ TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx\n",
+ punch_hole ? "punch hole" : "allocate", gpa, len,
+ region->region.guest_memfd, mode, fd_offset);
+ }
+}
+
/* Returns the size of a vCPU's kvm_run structure. */
static int vcpu_mmap_sz(void)
{
@@ -1227,7 +1271,7 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
vcpu->vm = vm;
vcpu->id = vcpu_id;
vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id);
- TEST_ASSERT(vcpu->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu->fd));
+ TEST_ASSERT_VM_VCPU_IOCTL(vcpu->fd >= 0, KVM_CREATE_VCPU, vcpu->fd, vm);
TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size "
"smaller than expected, vcpu_mmap_sz: %i expected_min: %zi",
diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c
index df457452d146..d05487e5a371 100644
--- a/tools/testing/selftests/kvm/lib/memstress.c
+++ b/tools/testing/selftests/kvm/lib/memstress.c
@@ -168,7 +168,8 @@ struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus,
* The memory is also added to memslot 0, but that's a benign side
* effect as KVM allows aliasing HVAs in meslots.
*/
- vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages,
+ vm = __vm_create_with_vcpus(VM_SHAPE(mode), nr_vcpus,
+ slot0_pages + guest_num_pages,
memstress_guest_code, vcpus);
args->vm = vm;
diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c
index c8e0a6495a63..626a2b8a2037 100644
--- a/tools/testing/selftests/kvm/s390x/cmma_test.c
+++ b/tools/testing/selftests/kvm/s390x/cmma_test.c
@@ -94,11 +94,6 @@ static void guest_dirty_test_data(void)
);
}
-static struct kvm_vm *create_vm(void)
-{
- return ____vm_create(VM_MODE_DEFAULT);
-}
-
static void create_main_memslot(struct kvm_vm *vm)
{
int i;
@@ -157,7 +152,7 @@ static struct kvm_vm *create_vm_two_memslots(void)
{
struct kvm_vm *vm;
- vm = create_vm();
+ vm = vm_create_barebones();
create_memslots(vm);
@@ -276,7 +271,7 @@ static void assert_exit_was_hypercall(struct kvm_vcpu *vcpu)
static void test_migration_mode(void)
{
- struct kvm_vm *vm = create_vm();
+ struct kvm_vm *vm = vm_create_barebones();
struct kvm_vcpu *vcpu;
u64 orig_psw;
int rc;
@@ -670,7 +665,7 @@ struct testdef {
*/
static int machine_has_cmma(void)
{
- struct kvm_vm *vm = create_vm();
+ struct kvm_vm *vm = vm_create_barebones();
int r;
r = !__kvm_has_device_attr(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA);
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
index b32960189f5f..075b80dbe237 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -157,17 +157,17 @@ static void guest_code_move_memory_region(void)
*/
val = guest_spin_on_val(0);
__GUEST_ASSERT(val == 1 || val == MMIO_VAL,
- "Expected '1' or MMIO ('%llx'), got '%llx'", MMIO_VAL, val);
+ "Expected '1' or MMIO ('%lx'), got '%lx'", MMIO_VAL, val);
/* Spin until the misaligning memory region move completes. */
val = guest_spin_on_val(MMIO_VAL);
__GUEST_ASSERT(val == 1 || val == 0,
- "Expected '0' or '1' (no MMIO), got '%llx'", val);
+ "Expected '0' or '1' (no MMIO), got '%lx'", val);
/* Spin until the memory region starts to get re-aligned. */
val = guest_spin_on_val(0);
__GUEST_ASSERT(val == 1 || val == MMIO_VAL,
- "Expected '1' or MMIO ('%llx'), got '%llx'", MMIO_VAL, val);
+ "Expected '1' or MMIO ('%lx'), got '%lx'", MMIO_VAL, val);
/* Spin until the re-aligning memory region move completes. */
val = guest_spin_on_val(MMIO_VAL);
@@ -326,6 +326,55 @@ static void test_zero_memory_regions(void)
}
#endif /* __x86_64__ */
+static void test_invalid_memory_region_flags(void)
+{
+ uint32_t supported_flags = KVM_MEM_LOG_DIRTY_PAGES;
+ const uint32_t v2_only_flags = KVM_MEM_GUEST_MEMFD;
+ struct kvm_vm *vm;
+ int r, i;
+
+#if defined __aarch64__ || defined __x86_64__
+ supported_flags |= KVM_MEM_READONLY;
+#endif
+
+#ifdef __x86_64__
+ if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))
+ vm = vm_create_barebones_protected_vm();
+ else
+#endif
+ vm = vm_create_barebones();
+
+ if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES) & KVM_MEMORY_ATTRIBUTE_PRIVATE)
+ supported_flags |= KVM_MEM_GUEST_MEMFD;
+
+ for (i = 0; i < 32; i++) {
+ if ((supported_flags & BIT(i)) && !(v2_only_flags & BIT(i)))
+ continue;
+
+ r = __vm_set_user_memory_region(vm, 0, BIT(i),
+ 0, MEM_REGION_SIZE, NULL);
+
+ TEST_ASSERT(r && errno == EINVAL,
+ "KVM_SET_USER_MEMORY_REGION should have failed on v2 only flag 0x%lx", BIT(i));
+
+ if (supported_flags & BIT(i))
+ continue;
+
+ r = __vm_set_user_memory_region2(vm, 0, BIT(i),
+ 0, MEM_REGION_SIZE, NULL, 0, 0);
+ TEST_ASSERT(r && errno == EINVAL,
+ "KVM_SET_USER_MEMORY_REGION2 should have failed on unsupported flag 0x%lx", BIT(i));
+ }
+
+ if (supported_flags & KVM_MEM_GUEST_MEMFD) {
+ r = __vm_set_user_memory_region2(vm, 0,
+ KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_GUEST_MEMFD,
+ 0, MEM_REGION_SIZE, NULL, 0, 0);
+ TEST_ASSERT(r && errno == EINVAL,
+ "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported");
+ }
+}
+
/*
* Test it can be added memory slots up to KVM_CAP_NR_MEMSLOTS, then any
* tentative to add further slots should fail.
@@ -385,13 +434,105 @@ static void test_add_max_memory_regions(void)
kvm_vm_free(vm);
}
+
+#ifdef __x86_64__
+static void test_invalid_guest_memfd(struct kvm_vm *vm, int memfd,
+ size_t offset, const char *msg)
+{
+ int r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD,
+ MEM_REGION_GPA, MEM_REGION_SIZE,
+ 0, memfd, offset);
+ TEST_ASSERT(r == -1 && errno == EINVAL, "%s", msg);
+}
+
+static void test_add_private_memory_region(void)
+{
+ struct kvm_vm *vm, *vm2;
+ int memfd, i;
+
+ pr_info("Testing ADD of KVM_MEM_GUEST_MEMFD memory regions\n");
+
+ vm = vm_create_barebones_protected_vm();
+
+ test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail");
+ test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail");
+
+ memfd = kvm_memfd_alloc(MEM_REGION_SIZE, false);
+ test_invalid_guest_memfd(vm, memfd, 0, "Regular memfd() should fail");
+ close(memfd);
+
+ vm2 = vm_create_barebones_protected_vm();
+ memfd = vm_create_guest_memfd(vm2, MEM_REGION_SIZE, 0);
+ test_invalid_guest_memfd(vm, memfd, 0, "Other VM's guest_memfd() should fail");
+
+ vm_set_user_memory_region2(vm2, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD,
+ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0);
+ close(memfd);
+ kvm_vm_free(vm2);
+
+ memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0);
+ for (i = 1; i < PAGE_SIZE; i++)
+ test_invalid_guest_memfd(vm, memfd, i, "Unaligned offset should fail");
+
+ vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD,
+ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0);
+ close(memfd);
+
+ kvm_vm_free(vm);
+}
+
+static void test_add_overlapping_private_memory_regions(void)
+{
+ struct kvm_vm *vm;
+ int memfd;
+ int r;
+
+ pr_info("Testing ADD of overlapping KVM_MEM_GUEST_MEMFD memory regions\n");
+
+ vm = vm_create_barebones_protected_vm();
+
+ memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 4, 0);
+
+ vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD,
+ MEM_REGION_GPA, MEM_REGION_SIZE * 2, 0, memfd, 0);
+
+ vm_set_user_memory_region2(vm, MEM_REGION_SLOT + 1, KVM_MEM_GUEST_MEMFD,
+ MEM_REGION_GPA * 2, MEM_REGION_SIZE * 2,
+ 0, memfd, MEM_REGION_SIZE * 2);
+
+ /*
+ * Delete the first memslot, and then attempt to recreate it except
+ * with a "bad" offset that results in overlap in the guest_memfd().
+ */
+ vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD,
+ MEM_REGION_GPA, 0, NULL, -1, 0);
+
+ /* Overlap the front half of the other slot. */
+ r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD,
+ MEM_REGION_GPA * 2 - MEM_REGION_SIZE,
+ MEM_REGION_SIZE * 2,
+ 0, memfd, 0);
+ TEST_ASSERT(r == -1 && errno == EEXIST, "%s",
+ "Overlapping guest_memfd() bindings should fail with EEXIST");
+
+ /* And now the back half of the other slot. */
+ r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD,
+ MEM_REGION_GPA * 2 + MEM_REGION_SIZE,
+ MEM_REGION_SIZE * 2,
+ 0, memfd, 0);
+ TEST_ASSERT(r == -1 && errno == EEXIST, "%s",
+ "Overlapping guest_memfd() bindings should fail with EEXIST");
+
+ close(memfd);
+ kvm_vm_free(vm);
+}
+#endif
+
int main(int argc, char *argv[])
{
#ifdef __x86_64__
int i, loops;
-#endif
-#ifdef __x86_64__
/*
* FIXME: the zero-memslot test fails on aarch64 and s390x because
* KVM_RUN fails with ENOEXEC or EFAULT.
@@ -399,9 +540,19 @@ int main(int argc, char *argv[])
test_zero_memory_regions();
#endif
+ test_invalid_memory_region_flags();
+
test_add_max_memory_regions();
#ifdef __x86_64__
+ if (kvm_has_cap(KVM_CAP_GUEST_MEMFD) &&
+ (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))) {
+ test_add_private_memory_region();
+ test_add_overlapping_private_memory_regions();
+ } else {
+ pr_info("Skipping tests for KVM_MEM_GUEST_MEMFD memory regions\n");
+ }
+
if (argc > 1)
loops = atoi_positive("Number of iterations", argv[1]);
else
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
index 9f28aa276c4e..29f6bdbce817 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -55,18 +55,18 @@ static void guest_msr(struct msr_data *msr)
if (msr->fault_expected)
__GUEST_ASSERT(vector == GP_VECTOR,
"Expected #GP on %sMSR(0x%x), got vector '0x%x'",
- msr->idx, msr->write ? "WR" : "RD", vector);
+ msr->write ? "WR" : "RD", msr->idx, vector);
else
__GUEST_ASSERT(!vector,
"Expected success on %sMSR(0x%x), got vector '0x%x'",
- msr->idx, msr->write ? "WR" : "RD", vector);
+ msr->write ? "WR" : "RD", msr->idx, vector);
if (vector || is_write_only_msr(msr->idx))
goto done;
if (msr->write)
__GUEST_ASSERT(!vector,
- "WRMSR(0x%x) to '0x%llx', RDMSR read '0x%llx'",
+ "WRMSR(0x%x) to '0x%lx', RDMSR read '0x%lx'",
msr->idx, msr->write_val, msr_val);
/* Invariant TSC bit appears when TSC invariant control MSR is written to */
@@ -102,11 +102,11 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall)
vector = __hyperv_hypercall(hcall->control, input, output, &res);
if (hcall->ud_expected) {
__GUEST_ASSERT(vector == UD_VECTOR,
- "Expected #UD for control '%u', got vector '0x%x'",
+ "Expected #UD for control '%lu', got vector '0x%x'",
hcall->control, vector);
} else {
__GUEST_ASSERT(!vector,
- "Expected no exception for control '%u', got vector '0x%x'",
+ "Expected no exception for control '%lu', got vector '0x%x'",
hcall->control, vector);
GUEST_ASSERT_EQ(res, hcall->expect);
}
diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
deleted file mode 100644
index ce1ccc4c1503..000000000000
--- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * mmio_warning_test
- *
- * Copyright (C) 2019, Google LLC.
- *
- * This work is licensed under the terms of the GNU GPL, version 2.
- *
- * Test that we don't get a kernel warning when we call KVM_RUN after a
- * triple fault occurs. To get the triple fault to occur we call KVM_RUN
- * on a VCPU that hasn't been properly setup.
- *
- */
-
-#define _GNU_SOURCE
-#include <fcntl.h>
-#include <kvm_util.h>
-#include <linux/kvm.h>
-#include <processor.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <test_util.h>
-#include <unistd.h>
-
-#define NTHREAD 4
-#define NPROCESS 5
-
-struct thread_context {
- int kvmcpu;
- struct kvm_run *run;
-};
-
-void *thr(void *arg)
-{
- struct thread_context *tc = (struct thread_context *)arg;
- int res;
- int kvmcpu = tc->kvmcpu;
- struct kvm_run *run = tc->run;
-
- res = ioctl(kvmcpu, KVM_RUN, 0);
- pr_info("ret1=%d exit_reason=%d suberror=%d\n",
- res, run->exit_reason, run->internal.suberror);
-
- return 0;
-}
-
-void test(void)
-{
- int i, kvm, kvmvm, kvmcpu;
- pthread_t th[NTHREAD];
- struct kvm_run *run;
- struct thread_context tc;
-
- kvm = open("/dev/kvm", O_RDWR);
- TEST_ASSERT(kvm != -1, "failed to open /dev/kvm");
- kvmvm = __kvm_ioctl(kvm, KVM_CREATE_VM, NULL);
- TEST_ASSERT(kvmvm > 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, kvmvm));
- kvmcpu = ioctl(kvmvm, KVM_CREATE_VCPU, 0);
- TEST_ASSERT(kvmcpu != -1, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, kvmcpu));
- run = (struct kvm_run *)mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_SHARED,
- kvmcpu, 0);
- tc.kvmcpu = kvmcpu;
- tc.run = run;
- srand(getpid());
- for (i = 0; i < NTHREAD; i++) {
- pthread_create(&th[i], NULL, thr, (void *)(uintptr_t)&tc);
- usleep(rand() % 10000);
- }
- for (i = 0; i < NTHREAD; i++)
- pthread_join(th[i], NULL);
-}
-
-int get_warnings_count(void)
-{
- int warnings;
- FILE *f;
-
- f = popen("dmesg | grep \"WARNING:\" | wc -l", "r");
- if (fscanf(f, "%d", &warnings) < 1)
- warnings = 0;
- pclose(f);
-
- return warnings;
-}
-
-int main(void)
-{
- int warnings_before, warnings_after;
-
- TEST_REQUIRE(host_cpu_is_intel);
-
- TEST_REQUIRE(!vm_is_unrestricted_guest(NULL));
-
- warnings_before = get_warnings_count();
-
- for (int i = 0; i < NPROCESS; ++i) {
- int status;
- int pid = fork();
-
- if (pid < 0)
- exit(1);
- if (pid == 0) {
- test();
- exit(0);
- }
- while (waitpid(pid, &status, __WALL) != pid)
- ;
- }
-
- warnings_after = get_warnings_count();
- TEST_ASSERT(warnings_before == warnings_after,
- "Warnings found in kernel. Run 'dmesg' to inspect them.");
-
- return 0;
-}
diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c
index 80aa3d8b18f8..853802641e1e 100644
--- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c
+++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c
@@ -27,10 +27,12 @@ do { \
\
if (fault_wanted) \
__GUEST_ASSERT((vector) == UD_VECTOR, \
- "Expected #UD on " insn " for testcase '0x%x', got '0x%x'", vector); \
+ "Expected #UD on " insn " for testcase '0x%x', got '0x%x'", \
+ testcase, vector); \
else \
__GUEST_ASSERT(!(vector), \
- "Expected success on " insn " for testcase '0x%x', got '0x%x'", vector); \
+ "Expected success on " insn " for testcase '0x%x', got '0x%x'", \
+ testcase, vector); \
} while (0)
static void guest_monitor_wait(int testcase)
diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
new file mode 100644
index 000000000000..65ad38b6be1f
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022, Google LLC.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <limits.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/kvm_para.h>
+#include <linux/memfd.h>
+#include <linux/sizes.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#define BASE_DATA_SLOT 10
+#define BASE_DATA_GPA ((uint64_t)(1ull << 32))
+#define PER_CPU_DATA_SIZE ((uint64_t)(SZ_2M + PAGE_SIZE))
+
+/* Horrific macro so that the line info is captured accurately :-( */
+#define memcmp_g(gpa, pattern, size) \
+do { \
+ uint8_t *mem = (uint8_t *)gpa; \
+ size_t i; \
+ \
+ for (i = 0; i < size; i++) \
+ __GUEST_ASSERT(mem[i] == pattern, \
+ "Guest expected 0x%x at offset %lu (gpa 0x%lx), got 0x%x", \
+ pattern, i, gpa + i, mem[i]); \
+} while (0)
+
+static void memcmp_h(uint8_t *mem, uint64_t gpa, uint8_t pattern, size_t size)
+{
+ size_t i;
+
+ for (i = 0; i < size; i++)
+ TEST_ASSERT(mem[i] == pattern,
+ "Host expected 0x%x at gpa 0x%lx, got 0x%x",
+ pattern, gpa + i, mem[i]);
+}
+
+/*
+ * Run memory conversion tests with explicit conversion:
+ * Execute KVM hypercall to map/unmap gpa range which will cause userspace exit
+ * to back/unback private memory. Subsequent accesses by guest to the gpa range
+ * will not cause exit to userspace.
+ *
+ * Test memory conversion scenarios with following steps:
+ * 1) Access private memory using private access and verify that memory contents
+ * are not visible to userspace.
+ * 2) Convert memory to shared using explicit conversions and ensure that
+ * userspace is able to access the shared regions.
+ * 3) Convert memory back to private using explicit conversions and ensure that
+ * userspace is again not able to access converted private regions.
+ */
+
+#define GUEST_STAGE(o, s) { .offset = o, .size = s }
+
+enum ucall_syncs {
+ SYNC_SHARED,
+ SYNC_PRIVATE,
+};
+
+static void guest_sync_shared(uint64_t gpa, uint64_t size,
+ uint8_t current_pattern, uint8_t new_pattern)
+{
+ GUEST_SYNC5(SYNC_SHARED, gpa, size, current_pattern, new_pattern);
+}
+
+static void guest_sync_private(uint64_t gpa, uint64_t size, uint8_t pattern)
+{
+ GUEST_SYNC4(SYNC_PRIVATE, gpa, size, pattern);
+}
+
+/* Arbitrary values, KVM doesn't care about the attribute flags. */
+#define MAP_GPA_SET_ATTRIBUTES BIT(0)
+#define MAP_GPA_SHARED BIT(1)
+#define MAP_GPA_DO_FALLOCATE BIT(2)
+
+static void guest_map_mem(uint64_t gpa, uint64_t size, bool map_shared,
+ bool do_fallocate)
+{
+ uint64_t flags = MAP_GPA_SET_ATTRIBUTES;
+
+ if (map_shared)
+ flags |= MAP_GPA_SHARED;
+ if (do_fallocate)
+ flags |= MAP_GPA_DO_FALLOCATE;
+ kvm_hypercall_map_gpa_range(gpa, size, flags);
+}
+
+static void guest_map_shared(uint64_t gpa, uint64_t size, bool do_fallocate)
+{
+ guest_map_mem(gpa, size, true, do_fallocate);
+}
+
+static void guest_map_private(uint64_t gpa, uint64_t size, bool do_fallocate)
+{
+ guest_map_mem(gpa, size, false, do_fallocate);
+}
+
+struct {
+ uint64_t offset;
+ uint64_t size;
+} static const test_ranges[] = {
+ GUEST_STAGE(0, PAGE_SIZE),
+ GUEST_STAGE(0, SZ_2M),
+ GUEST_STAGE(PAGE_SIZE, PAGE_SIZE),
+ GUEST_STAGE(PAGE_SIZE, SZ_2M),
+ GUEST_STAGE(SZ_2M, PAGE_SIZE),
+};
+
+static void guest_test_explicit_conversion(uint64_t base_gpa, bool do_fallocate)
+{
+ const uint8_t def_p = 0xaa;
+ const uint8_t init_p = 0xcc;
+ uint64_t j;
+ int i;
+
+ /* Memory should be shared by default. */
+ memset((void *)base_gpa, def_p, PER_CPU_DATA_SIZE);
+ memcmp_g(base_gpa, def_p, PER_CPU_DATA_SIZE);
+ guest_sync_shared(base_gpa, PER_CPU_DATA_SIZE, def_p, init_p);
+
+ memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE);
+
+ for (i = 0; i < ARRAY_SIZE(test_ranges); i++) {
+ uint64_t gpa = base_gpa + test_ranges[i].offset;
+ uint64_t size = test_ranges[i].size;
+ uint8_t p1 = 0x11;
+ uint8_t p2 = 0x22;
+ uint8_t p3 = 0x33;
+ uint8_t p4 = 0x44;
+
+ /*
+ * Set the test region to pattern one to differentiate it from
+ * the data range as a whole (contains the initial pattern).
+ */
+ memset((void *)gpa, p1, size);
+
+ /*
+ * Convert to private, set and verify the private data, and
+ * then verify that the rest of the data (map shared) still
+ * holds the initial pattern, and that the host always sees the
+ * shared memory (initial pattern). Unlike shared memory,
+ * punching a hole in private memory is destructive, i.e.
+ * previous values aren't guaranteed to be preserved.
+ */
+ guest_map_private(gpa, size, do_fallocate);
+
+ if (size > PAGE_SIZE) {
+ memset((void *)gpa, p2, PAGE_SIZE);
+ goto skip;
+ }
+
+ memset((void *)gpa, p2, size);
+ guest_sync_private(gpa, size, p1);
+
+ /*
+ * Verify that the private memory was set to pattern two, and
+ * that shared memory still holds the initial pattern.
+ */
+ memcmp_g(gpa, p2, size);
+ if (gpa > base_gpa)
+ memcmp_g(base_gpa, init_p, gpa - base_gpa);
+ if (gpa + size < base_gpa + PER_CPU_DATA_SIZE)
+ memcmp_g(gpa + size, init_p,
+ (base_gpa + PER_CPU_DATA_SIZE) - (gpa + size));
+
+ /*
+ * Convert odd-number page frames back to shared to verify KVM
+ * also correctly handles holes in private ranges.
+ */
+ for (j = 0; j < size; j += PAGE_SIZE) {
+ if ((j >> PAGE_SHIFT) & 1) {
+ guest_map_shared(gpa + j, PAGE_SIZE, do_fallocate);
+ guest_sync_shared(gpa + j, PAGE_SIZE, p1, p3);
+
+ memcmp_g(gpa + j, p3, PAGE_SIZE);
+ } else {
+ guest_sync_private(gpa + j, PAGE_SIZE, p1);
+ }
+ }
+
+skip:
+ /*
+ * Convert the entire region back to shared, explicitly write
+ * pattern three to fill in the even-number frames before
+ * asking the host to verify (and write pattern four).
+ */
+ guest_map_shared(gpa, size, do_fallocate);
+ memset((void *)gpa, p3, size);
+ guest_sync_shared(gpa, size, p3, p4);
+ memcmp_g(gpa, p4, size);
+
+ /* Reset the shared memory back to the initial pattern. */
+ memset((void *)gpa, init_p, size);
+
+ /*
+ * Free (via PUNCH_HOLE) *all* private memory so that the next
+ * iteration starts from a clean slate, e.g. with respect to
+ * whether or not there are pages/folios in guest_mem.
+ */
+ guest_map_shared(base_gpa, PER_CPU_DATA_SIZE, true);
+ }
+}
+
+static void guest_punch_hole(uint64_t gpa, uint64_t size)
+{
+ /* "Mapping" memory shared via fallocate() is done via PUNCH_HOLE. */
+ uint64_t flags = MAP_GPA_SHARED | MAP_GPA_DO_FALLOCATE;
+
+ kvm_hypercall_map_gpa_range(gpa, size, flags);
+}
+
+/*
+ * Test that PUNCH_HOLE actually frees memory by punching holes without doing a
+ * proper conversion. Freeing (PUNCH_HOLE) should zap SPTEs, and reallocating
+ * (subsequent fault) should zero memory.
+ */
+static void guest_test_punch_hole(uint64_t base_gpa, bool precise)
+{
+ const uint8_t init_p = 0xcc;
+ int i;
+
+ /*
+ * Convert the entire range to private, this testcase is all about
+ * punching holes in guest_memfd, i.e. shared mappings aren't needed.
+ */
+ guest_map_private(base_gpa, PER_CPU_DATA_SIZE, false);
+
+ for (i = 0; i < ARRAY_SIZE(test_ranges); i++) {
+ uint64_t gpa = base_gpa + test_ranges[i].offset;
+ uint64_t size = test_ranges[i].size;
+
+ /*
+ * Free all memory before each iteration, even for the !precise
+ * case where the memory will be faulted back in. Freeing and
+ * reallocating should obviously work, and freeing all memory
+ * minimizes the probability of cross-testcase influence.
+ */
+ guest_punch_hole(base_gpa, PER_CPU_DATA_SIZE);
+
+ /* Fault-in and initialize memory, and verify the pattern. */
+ if (precise) {
+ memset((void *)gpa, init_p, size);
+ memcmp_g(gpa, init_p, size);
+ } else {
+ memset((void *)base_gpa, init_p, PER_CPU_DATA_SIZE);
+ memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE);
+ }
+
+ /*
+ * Punch a hole at the target range and verify that reads from
+ * the guest succeed and return zeroes.
+ */
+ guest_punch_hole(gpa, size);
+ memcmp_g(gpa, 0, size);
+ }
+}
+
+static void guest_code(uint64_t base_gpa)
+{
+ /*
+ * Run the conversion test twice, with and without doing fallocate() on
+ * the guest_memfd backing when converting between shared and private.
+ */
+ guest_test_explicit_conversion(base_gpa, false);
+ guest_test_explicit_conversion(base_gpa, true);
+
+ /*
+ * Run the PUNCH_HOLE test twice too, once with the entire guest_memfd
+ * faulted in, once with only the target range faulted in.
+ */
+ guest_test_punch_hole(base_gpa, false);
+ guest_test_punch_hole(base_gpa, true);
+ GUEST_DONE();
+}
+
+static void handle_exit_hypercall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ uint64_t gpa = run->hypercall.args[0];
+ uint64_t size = run->hypercall.args[1] * PAGE_SIZE;
+ bool set_attributes = run->hypercall.args[2] & MAP_GPA_SET_ATTRIBUTES;
+ bool map_shared = run->hypercall.args[2] & MAP_GPA_SHARED;
+ bool do_fallocate = run->hypercall.args[2] & MAP_GPA_DO_FALLOCATE;
+ struct kvm_vm *vm = vcpu->vm;
+
+ TEST_ASSERT(run->hypercall.nr == KVM_HC_MAP_GPA_RANGE,
+ "Wanted MAP_GPA_RANGE (%u), got '%llu'",
+ KVM_HC_MAP_GPA_RANGE, run->hypercall.nr);
+
+ if (do_fallocate)
+ vm_guest_mem_fallocate(vm, gpa, size, map_shared);
+
+ if (set_attributes)
+ vm_set_memory_attributes(vm, gpa, size,
+ map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE);
+ run->hypercall.ret = 0;
+}
+
+static bool run_vcpus;
+
+static void *__test_mem_conversions(void *__vcpu)
+{
+ struct kvm_vcpu *vcpu = __vcpu;
+ struct kvm_run *run = vcpu->run;
+ struct kvm_vm *vm = vcpu->vm;
+ struct ucall uc;
+
+ while (!READ_ONCE(run_vcpus))
+ ;
+
+ for ( ;; ) {
+ vcpu_run(vcpu);
+
+ if (run->exit_reason == KVM_EXIT_HYPERCALL) {
+ handle_exit_hypercall(vcpu);
+ continue;
+ }
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Wanted KVM_EXIT_IO, got exit reason: %u (%s)",
+ run->exit_reason, exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ case UCALL_SYNC: {
+ uint64_t gpa = uc.args[1];
+ size_t size = uc.args[2];
+ size_t i;
+
+ TEST_ASSERT(uc.args[0] == SYNC_SHARED ||
+ uc.args[0] == SYNC_PRIVATE,
+ "Unknown sync command '%ld'", uc.args[0]);
+
+ for (i = 0; i < size; i += vm->page_size) {
+ size_t nr_bytes = min_t(size_t, vm->page_size, size - i);
+ uint8_t *hva = addr_gpa2hva(vm, gpa + i);
+
+ /* In all cases, the host should observe the shared data. */
+ memcmp_h(hva, gpa + i, uc.args[3], nr_bytes);
+
+ /* For shared, write the new pattern to guest memory. */
+ if (uc.args[0] == SYNC_SHARED)
+ memset(hva, uc.args[4], nr_bytes);
+ }
+ break;
+ }
+ case UCALL_DONE:
+ return NULL;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+ }
+}
+
+static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t nr_vcpus,
+ uint32_t nr_memslots)
+{
+ /*
+ * Allocate enough memory so that each vCPU's chunk of memory can be
+ * naturally aligned with respect to the size of the backing store.
+ */
+ const size_t alignment = max_t(size_t, SZ_2M, get_backing_src_pagesz(src_type));
+ const size_t per_cpu_size = align_up(PER_CPU_DATA_SIZE, alignment);
+ const size_t memfd_size = per_cpu_size * nr_vcpus;
+ const size_t slot_size = memfd_size / nr_memslots;
+ struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+ pthread_t threads[KVM_MAX_VCPUS];
+ struct kvm_vm *vm;
+ int memfd, i, r;
+
+ const struct vm_shape shape = {
+ .mode = VM_MODE_DEFAULT,
+ .type = KVM_X86_SW_PROTECTED_VM,
+ };
+
+ TEST_ASSERT(slot_size * nr_memslots == memfd_size,
+ "The memfd size (0x%lx) needs to be cleanly divisible by the number of memslots (%u)",
+ memfd_size, nr_memslots);
+ vm = __vm_create_with_vcpus(shape, nr_vcpus, 0, guest_code, vcpus);
+
+ vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE));
+
+ memfd = vm_create_guest_memfd(vm, memfd_size, 0);
+
+ for (i = 0; i < nr_memslots; i++)
+ vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i,
+ BASE_DATA_SLOT + i, slot_size / vm->page_size,
+ KVM_MEM_GUEST_MEMFD, memfd, slot_size * i);
+
+ for (i = 0; i < nr_vcpus; i++) {
+ uint64_t gpa = BASE_DATA_GPA + i * per_cpu_size;
+
+ vcpu_args_set(vcpus[i], 1, gpa);
+
+ /*
+ * Map only what is needed so that an out-of-bounds access
+ * results #PF => SHUTDOWN instead of data corruption.
+ */
+ virt_map(vm, gpa, gpa, PER_CPU_DATA_SIZE / vm->page_size);
+
+ pthread_create(&threads[i], NULL, __test_mem_conversions, vcpus[i]);
+ }
+
+ WRITE_ONCE(run_vcpus, true);
+
+ for (i = 0; i < nr_vcpus; i++)
+ pthread_join(threads[i], NULL);
+
+ kvm_vm_free(vm);
+
+ /*
+ * Allocate and free memory from the guest_memfd after closing the VM
+ * fd. The guest_memfd is gifted a reference to its owning VM, i.e.
+ * should prevent the VM from being fully destroyed until the last
+ * reference to the guest_memfd is also put.
+ */
+ r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size);
+ TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r));
+
+ r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size);
+ TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r));
+}
+
+static void usage(const char *cmd)
+{
+ puts("");
+ printf("usage: %s [-h] [-m nr_memslots] [-s mem_type] [-n nr_vcpus]\n", cmd);
+ puts("");
+ backing_src_help("-s");
+ puts("");
+ puts(" -n: specify the number of vcpus (default: 1)");
+ puts("");
+ puts(" -m: specify the number of memslots (default: 1)");
+ puts("");
+}
+
+int main(int argc, char *argv[])
+{
+ enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC;
+ uint32_t nr_memslots = 1;
+ uint32_t nr_vcpus = 1;
+ int opt;
+
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
+
+ while ((opt = getopt(argc, argv, "hm:s:n:")) != -1) {
+ switch (opt) {
+ case 's':
+ src_type = parse_backing_src_type(optarg);
+ break;
+ case 'n':
+ nr_vcpus = atoi_positive("nr_vcpus", optarg);
+ break;
+ case 'm':
+ nr_memslots = atoi_positive("nr_memslots", optarg);
+ break;
+ case 'h':
+ default:
+ usage(argv[0]);
+ exit(0);
+ }
+ }
+
+ test_mem_conversions(src_type, nr_vcpus, nr_memslots);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c
new file mode 100644
index 000000000000..13e72fcec8dd
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023, Google LLC.
+ */
+#include <linux/kvm.h>
+#include <pthread.h>
+#include <stdint.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+/* Arbitrarily selected to avoid overlaps with anything else */
+#define EXITS_TEST_GVA 0xc0000000
+#define EXITS_TEST_GPA EXITS_TEST_GVA
+#define EXITS_TEST_NPAGES 1
+#define EXITS_TEST_SIZE (EXITS_TEST_NPAGES * PAGE_SIZE)
+#define EXITS_TEST_SLOT 10
+
+static uint64_t guest_repeatedly_read(void)
+{
+ volatile uint64_t value;
+
+ while (true)
+ value = *((uint64_t *) EXITS_TEST_GVA);
+
+ return value;
+}
+
+static uint32_t run_vcpu_get_exit_reason(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = _vcpu_run(vcpu);
+ if (r) {
+ TEST_ASSERT(errno == EFAULT, KVM_IOCTL_ERROR(KVM_RUN, r));
+ TEST_ASSERT_EQ(vcpu->run->exit_reason, KVM_EXIT_MEMORY_FAULT);
+ }
+ return vcpu->run->exit_reason;
+}
+
+const struct vm_shape protected_vm_shape = {
+ .mode = VM_MODE_DEFAULT,
+ .type = KVM_X86_SW_PROTECTED_VM,
+};
+
+static void test_private_access_memslot_deleted(void)
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+ pthread_t vm_thread;
+ void *thread_return;
+ uint32_t exit_reason;
+
+ vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu,
+ guest_repeatedly_read);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ EXITS_TEST_GPA, EXITS_TEST_SLOT,
+ EXITS_TEST_NPAGES,
+ KVM_MEM_GUEST_MEMFD);
+
+ virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES);
+
+ /* Request to access page privately */
+ vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE);
+
+ pthread_create(&vm_thread, NULL,
+ (void *(*)(void *))run_vcpu_get_exit_reason,
+ (void *)vcpu);
+
+ vm_mem_region_delete(vm, EXITS_TEST_SLOT);
+
+ pthread_join(vm_thread, &thread_return);
+ exit_reason = (uint32_t)(uint64_t)thread_return;
+
+ TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE);
+
+ kvm_vm_free(vm);
+}
+
+static void test_private_access_memslot_not_private(void)
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+ uint32_t exit_reason;
+
+ vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu,
+ guest_repeatedly_read);
+
+ /* Add a non-private memslot (flags = 0) */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ EXITS_TEST_GPA, EXITS_TEST_SLOT,
+ EXITS_TEST_NPAGES, 0);
+
+ virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES);
+
+ /* Request to access page privately */
+ vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE);
+
+ exit_reason = run_vcpu_get_exit_reason(vcpu);
+
+ TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE);
+
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
+
+ test_private_access_memslot_deleted();
+ test_private_access_memslot_not_private();
+}
diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c
index 7ee44496cf97..0c7ce3d4e83a 100644
--- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c
+++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c
@@ -103,7 +103,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i
run_guest(vmcb, svm->vmcb_gpa);
__GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL,
- "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%llx, info2 = '0x%llx'",
+ "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'",
vmcb->control.exit_code,
vmcb->control.exit_info_1, vmcb->control.exit_info_2);
@@ -133,7 +133,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i
run_guest(vmcb, svm->vmcb_gpa);
__GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT,
- "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%llx, info2 = '0x%llx'",
+ "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'",
vmcb->control.exit_code,
vmcb->control.exit_info_1, vmcb->control.exit_info_2);
diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c
index 85f34ca7e49e..0ed32ec903d0 100644
--- a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c
+++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c
@@ -271,7 +271,7 @@ int main(int argc, char *argv[])
kvm_check_cap(KVM_CAP_MCE);
- vm = __vm_create(VM_MODE_DEFAULT, 3, 0);
+ vm = __vm_create(VM_SHAPE_DEFAULT, 3, 0);
kvm_ioctl(vm->kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED,
&supported_mcg_caps);
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
index ebbcb0a3f743..2a8d4ac2f020 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
@@ -56,7 +56,7 @@ static void guest_test_perf_capabilities_gp(uint64_t val)
uint8_t vector = wrmsr_safe(MSR_IA32_PERF_CAPABILITIES, val);
__GUEST_ASSERT(vector == GP_VECTOR,
- "Expected #GP for value '0x%llx', got vector '0x%x'",
+ "Expected #GP for value '0x%lx', got vector '0x%x'",
val, vector);
}
diff --git a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
index 77d04a7bdadd..dc6217440db3 100644
--- a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
@@ -25,7 +25,7 @@ do { \
\
__GUEST_ASSERT((__supported & (xfeatures)) != (xfeatures) || \
__supported == ((xfeatures) | (dependencies)), \
- "supported = 0x%llx, xfeatures = 0x%llx, dependencies = 0x%llx", \
+ "supported = 0x%lx, xfeatures = 0x%llx, dependencies = 0x%llx", \
__supported, (xfeatures), (dependencies)); \
} while (0)
@@ -42,7 +42,7 @@ do { \
uint64_t __supported = (supported_xcr0) & (xfeatures); \
\
__GUEST_ASSERT(!__supported || __supported == (xfeatures), \
- "supported = 0x%llx, xfeatures = 0x%llx", \
+ "supported = 0x%lx, xfeatures = 0x%llx", \
__supported, (xfeatures)); \
} while (0)
@@ -81,7 +81,7 @@ static void guest_code(void)
vector = xsetbv_safe(0, supported_xcr0);
__GUEST_ASSERT(!vector,
- "Expected success on XSETBV(0x%llx), got vector '0x%x'",
+ "Expected success on XSETBV(0x%lx), got vector '0x%x'",
supported_xcr0, vector);
for (i = 0; i < 64; i++) {
@@ -90,7 +90,7 @@ static void guest_code(void)
vector = xsetbv_safe(0, supported_xcr0 | BIT_ULL(i));
__GUEST_ASSERT(vector == GP_VECTOR,
- "Expected #GP on XSETBV(0x%llx), supported XCR0 = %llx, got vector '0x%x'",
+ "Expected #GP on XSETBV(0x%llx), supported XCR0 = %lx, got vector '0x%x'",
BIT_ULL(i), supported_xcr0, vector);
}
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 484d0873061c..6793211a0b64 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -3,6 +3,7 @@
config HAVE_KVM
bool
+ select EVENTFD
config HAVE_KVM_PFNCACHE
bool
@@ -10,9 +11,6 @@ config HAVE_KVM_PFNCACHE
config HAVE_KVM_IRQCHIP
bool
-config HAVE_KVM_IRQFD
- bool
-
config HAVE_KVM_IRQ_ROUTING
bool
@@ -39,10 +37,6 @@ config NEED_KVM_DIRTY_RING_WITH_BITMAP
bool
depends on HAVE_KVM_DIRTY_RING
-config HAVE_KVM_EVENTFD
- bool
- select EVENTFD
-
config KVM_MMIO
bool
@@ -92,3 +86,20 @@ config HAVE_KVM_PM_NOTIFIER
config KVM_GENERIC_HARDWARE_ENABLING
bool
+
+config KVM_GENERIC_MMU_NOTIFIER
+ select MMU_NOTIFIER
+ bool
+
+config KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_GENERIC_MMU_NOTIFIER
+ bool
+
+config KVM_PRIVATE_MEM
+ select XARRAY_MULTI
+ bool
+
+config KVM_GENERIC_PRIVATE_MEM
+ select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_PRIVATE_MEM
+ bool
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index 2c27d5d0c367..724c89af78af 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
+kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index c1cd7dfe4a90..86d267db87bb 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -58,7 +58,7 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
as_id = slot >> 16;
id = (u16)slot;
- if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+ if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
return;
memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 89912a17f5d5..d516e6d33f56 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -28,7 +28,7 @@
#include <kvm/iodev.h>
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
static struct workqueue_struct *irqfd_cleanup_wq;
@@ -526,21 +526,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
synchronize_srcu(&kvm->irq_srcu);
kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
-#endif
-
-void
-kvm_eventfd_init(struct kvm *kvm)
-{
-#ifdef CONFIG_HAVE_KVM_IRQFD
- spin_lock_init(&kvm->irqfds.lock);
- INIT_LIST_HEAD(&kvm->irqfds.items);
- INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
- mutex_init(&kvm->irqfds.resampler_lock);
-#endif
- INIT_LIST_HEAD(&kvm->ioeventfds);
-}
-#ifdef CONFIG_HAVE_KVM_IRQFD
/*
* shutdown any irqfd's that match fd+gsi
*/
@@ -1012,3 +998,15 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
return kvm_assign_ioeventfd(kvm, args);
}
+
+void
+kvm_eventfd_init(struct kvm *kvm)
+{
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+ spin_lock_init(&kvm->irqfds.lock);
+ INIT_LIST_HEAD(&kvm->irqfds.items);
+ INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
+ mutex_init(&kvm->irqfds.resampler_lock);
+#endif
+ INIT_LIST_HEAD(&kvm->ioeventfds);
+}
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
new file mode 100644
index 000000000000..c2e2371720a9
--- /dev/null
+++ b/virt/kvm/guest_memfd.c
@@ -0,0 +1,532 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
+#include <linux/anon_inodes.h>
+
+#include "kvm_mm.h"
+
+struct kvm_gmem {
+ struct kvm *kvm;
+ struct xarray bindings;
+ struct list_head entry;
+};
+
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+{
+ struct folio *folio;
+
+ /* TODO: Support huge pages. */
+ folio = filemap_grab_folio(inode->i_mapping, index);
+ if (IS_ERR_OR_NULL(folio))
+ return NULL;
+
+ /*
+ * Use the up-to-date flag to track whether or not the memory has been
+ * zeroed before being handed off to the guest. There is no backing
+ * storage for the memory, so the folio will remain up-to-date until
+ * it's removed.
+ *
+ * TODO: Skip clearing pages when trusted firmware will do it when
+ * assigning memory to the guest.
+ */
+ if (!folio_test_uptodate(folio)) {
+ unsigned long nr_pages = folio_nr_pages(folio);
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; i++)
+ clear_highpage(folio_page(folio, i));
+
+ folio_mark_uptodate(folio);
+ }
+
+ /*
+ * Ignore accessed, referenced, and dirty flags. The memory is
+ * unevictable and there is no storage to write back to.
+ */
+ return folio;
+}
+
+static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
+ pgoff_t end)
+{
+ bool flush = false, found_memslot = false;
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = gmem->kvm;
+ unsigned long index;
+
+ xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+ pgoff_t pgoff = slot->gmem.pgoff;
+
+ struct kvm_gfn_range gfn_range = {
+ .start = slot->base_gfn + max(pgoff, start) - pgoff,
+ .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
+ .slot = slot,
+ .may_block = true,
+ };
+
+ if (!found_memslot) {
+ found_memslot = true;
+
+ KVM_MMU_LOCK(kvm);
+ kvm_mmu_invalidate_begin(kvm);
+ }
+
+ flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
+ }
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ if (found_memslot)
+ KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
+ pgoff_t end)
+{
+ struct kvm *kvm = gmem->kvm;
+
+ if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
+ KVM_MMU_LOCK(kvm);
+ kvm_mmu_invalidate_end(kvm);
+ KVM_MMU_UNLOCK(kvm);
+ }
+}
+
+static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct list_head *gmem_list = &inode->i_mapping->private_list;
+ pgoff_t start = offset >> PAGE_SHIFT;
+ pgoff_t end = (offset + len) >> PAGE_SHIFT;
+ struct kvm_gmem *gmem;
+
+ /*
+ * Bindings must be stable across invalidation to ensure the start+end
+ * are balanced.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+
+ list_for_each_entry(gmem, gmem_list, entry)
+ kvm_gmem_invalidate_begin(gmem, start, end);
+
+ truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+
+ list_for_each_entry(gmem, gmem_list, entry)
+ kvm_gmem_invalidate_end(gmem, start, end);
+
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ return 0;
+}
+
+static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t start, index, end;
+ int r;
+
+ /* Dedicated guest is immutable by default. */
+ if (offset + len > i_size_read(inode))
+ return -EINVAL;
+
+ filemap_invalidate_lock_shared(mapping);
+
+ start = offset >> PAGE_SHIFT;
+ end = (offset + len) >> PAGE_SHIFT;
+
+ r = 0;
+ for (index = start; index < end; ) {
+ struct folio *folio;
+
+ if (signal_pending(current)) {
+ r = -EINTR;
+ break;
+ }
+
+ folio = kvm_gmem_get_folio(inode, index);
+ if (!folio) {
+ r = -ENOMEM;
+ break;
+ }
+
+ index = folio_next_index(folio);
+
+ folio_unlock(folio);
+ folio_put(folio);
+
+ /* 64-bit only, wrapping the index should be impossible. */
+ if (WARN_ON_ONCE(!index))
+ break;
+
+ cond_resched();
+ }
+
+ filemap_invalidate_unlock_shared(mapping);
+
+ return r;
+}
+
+static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ int ret;
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ return -EOPNOTSUPP;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+ return -EINVAL;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
+ else
+ ret = kvm_gmem_allocate(file_inode(file), offset, len);
+
+ if (!ret)
+ file_modified(file);
+ return ret;
+}
+
+static int kvm_gmem_release(struct inode *inode, struct file *file)
+{
+ struct kvm_gmem *gmem = file->private_data;
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = gmem->kvm;
+ unsigned long index;
+
+ /*
+ * Prevent concurrent attempts to *unbind* a memslot. This is the last
+ * reference to the file and thus no new bindings can be created, but
+ * dereferencing the slot for existing bindings needs to be protected
+ * against memslot updates, specifically so that unbind doesn't race
+ * and free the memslot (kvm_gmem_get_file() will return NULL).
+ */
+ mutex_lock(&kvm->slots_lock);
+
+ filemap_invalidate_lock(inode->i_mapping);
+
+ xa_for_each(&gmem->bindings, index, slot)
+ rcu_assign_pointer(slot->gmem.file, NULL);
+
+ synchronize_rcu();
+
+ /*
+ * All in-flight operations are gone and new bindings can be created.
+ * Zap all SPTEs pointed at by this file. Do not free the backing
+ * memory, as its lifetime is associated with the inode, not the file.
+ */
+ kvm_gmem_invalidate_begin(gmem, 0, -1ul);
+ kvm_gmem_invalidate_end(gmem, 0, -1ul);
+
+ list_del(&gmem->entry);
+
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ mutex_unlock(&kvm->slots_lock);
+
+ xa_destroy(&gmem->bindings);
+ kfree(gmem);
+
+ kvm_put_kvm(kvm);
+
+ return 0;
+}
+
+static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
+{
+ /*
+ * Do not return slot->gmem.file if it has already been closed;
+ * there might be some time between the last fput() and when
+ * kvm_gmem_release() clears slot->gmem.file, and you do not
+ * want to spin in the meanwhile.
+ */
+ return get_file_active(&slot->gmem.file);
+}
+
+static struct file_operations kvm_gmem_fops = {
+ .open = generic_file_open,
+ .release = kvm_gmem_release,
+ .fallocate = kvm_gmem_fallocate,
+};
+
+void kvm_gmem_init(struct module *module)
+{
+ kvm_gmem_fops.owner = module;
+}
+
+static int kvm_gmem_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
+{
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
+
+static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
+{
+ struct list_head *gmem_list = &mapping->private_list;
+ struct kvm_gmem *gmem;
+ pgoff_t start, end;
+
+ filemap_invalidate_lock_shared(mapping);
+
+ start = page->index;
+ end = start + thp_nr_pages(page);
+
+ list_for_each_entry(gmem, gmem_list, entry)
+ kvm_gmem_invalidate_begin(gmem, start, end);
+
+ /*
+ * Do not truncate the range, what action is taken in response to the
+ * error is userspace's decision (assuming the architecture supports
+ * gracefully handling memory errors). If/when the guest attempts to
+ * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
+ * at which point KVM can either terminate the VM or propagate the
+ * error to userspace.
+ */
+
+ list_for_each_entry(gmem, gmem_list, entry)
+ kvm_gmem_invalidate_end(gmem, start, end);
+
+ filemap_invalidate_unlock_shared(mapping);
+
+ return MF_DELAYED;
+}
+
+static const struct address_space_operations kvm_gmem_aops = {
+ .dirty_folio = noop_dirty_folio,
+ .migrate_folio = kvm_gmem_migrate_folio,
+ .error_remove_page = kvm_gmem_error_page,
+};
+
+static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = path->dentry->d_inode;
+
+ generic_fillattr(idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EINVAL;
+}
+static const struct inode_operations kvm_gmem_iops = {
+ .getattr = kvm_gmem_getattr,
+ .setattr = kvm_gmem_setattr,
+};
+
+static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
+{
+ const char *anon_name = "[kvm-gmem]";
+ struct kvm_gmem *gmem;
+ struct inode *inode;
+ struct file *file;
+ int fd, err;
+
+ fd = get_unused_fd_flags(0);
+ if (fd < 0)
+ return fd;
+
+ gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
+ if (!gmem) {
+ err = -ENOMEM;
+ goto err_fd;
+ }
+
+ file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
+ O_RDWR, NULL);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_gmem;
+ }
+
+ file->f_flags |= O_LARGEFILE;
+
+ inode = file->f_inode;
+ WARN_ON(file->f_mapping != inode->i_mapping);
+
+ inode->i_private = (void *)(unsigned long)flags;
+ inode->i_op = &kvm_gmem_iops;
+ inode->i_mapping->a_ops = &kvm_gmem_aops;
+ inode->i_mode |= S_IFREG;
+ inode->i_size = size;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+ mapping_set_unmovable(inode->i_mapping);
+ /* Unmovable mappings are supposed to be marked unevictable as well. */
+ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+ kvm_get_kvm(kvm);
+ gmem->kvm = kvm;
+ xa_init(&gmem->bindings);
+ list_add(&gmem->entry, &inode->i_mapping->private_list);
+
+ fd_install(fd, file);
+ return fd;
+
+err_gmem:
+ kfree(gmem);
+err_fd:
+ put_unused_fd(fd);
+ return err;
+}
+
+int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
+{
+ loff_t size = args->size;
+ u64 flags = args->flags;
+ u64 valid_flags = 0;
+
+ if (flags & ~valid_flags)
+ return -EINVAL;
+
+ if (size <= 0 || !PAGE_ALIGNED(size))
+ return -EINVAL;
+
+ return __kvm_gmem_create(kvm, size, flags);
+}
+
+int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned int fd, loff_t offset)
+{
+ loff_t size = slot->npages << PAGE_SHIFT;
+ unsigned long start, end;
+ struct kvm_gmem *gmem;
+ struct inode *inode;
+ struct file *file;
+ int r = -EINVAL;
+
+ BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ if (file->f_op != &kvm_gmem_fops)
+ goto err;
+
+ gmem = file->private_data;
+ if (gmem->kvm != kvm)
+ goto err;
+
+ inode = file_inode(file);
+
+ if (offset < 0 || !PAGE_ALIGNED(offset) ||
+ offset + size > i_size_read(inode))
+ goto err;
+
+ filemap_invalidate_lock(inode->i_mapping);
+
+ start = offset >> PAGE_SHIFT;
+ end = start + slot->npages;
+
+ if (!xa_empty(&gmem->bindings) &&
+ xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
+ filemap_invalidate_unlock(inode->i_mapping);
+ goto err;
+ }
+
+ /*
+ * No synchronize_rcu() needed, any in-flight readers are guaranteed to
+ * be see either a NULL file or this new file, no need for them to go
+ * away.
+ */
+ rcu_assign_pointer(slot->gmem.file, file);
+ slot->gmem.pgoff = start;
+
+ xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ /*
+ * Drop the reference to the file, even on success. The file pins KVM,
+ * not the other way 'round. Active bindings are invalidated if the
+ * file is closed before memslots are destroyed.
+ */
+ r = 0;
+err:
+ fput(file);
+ return r;
+}
+
+void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+ unsigned long start = slot->gmem.pgoff;
+ unsigned long end = start + slot->npages;
+ struct kvm_gmem *gmem;
+ struct file *file;
+
+ /*
+ * Nothing to do if the underlying file was already closed (or is being
+ * closed right now), kvm_gmem_release() invalidates all bindings.
+ */
+ file = kvm_gmem_get_file(slot);
+ if (!file)
+ return;
+
+ gmem = file->private_data;
+
+ filemap_invalidate_lock(file->f_mapping);
+ xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
+ rcu_assign_pointer(slot->gmem.file, NULL);
+ synchronize_rcu();
+ filemap_invalidate_unlock(file->f_mapping);
+
+ fput(file);
+}
+
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+{
+ pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
+ struct kvm_gmem *gmem;
+ struct folio *folio;
+ struct page *page;
+ struct file *file;
+ int r;
+
+ file = kvm_gmem_get_file(slot);
+ if (!file)
+ return -EFAULT;
+
+ gmem = file->private_data;
+
+ if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
+ r = -EIO;
+ goto out_fput;
+ }
+
+ folio = kvm_gmem_get_folio(file_inode(file), index);
+ if (!folio) {
+ r = -ENOMEM;
+ goto out_fput;
+ }
+
+ if (folio_test_hwpoison(folio)) {
+ r = -EHWPOISON;
+ goto out_unlock;
+ }
+
+ page = folio_file_page(folio, index);
+
+ *pfn = page_to_pfn(page);
+ if (max_order)
+ *max_order = 0;
+
+ r = 0;
+
+out_unlock:
+ folio_unlock(folio);
+out_fput:
+ fput(file);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7db96875ac46..8d59f83458ca 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -533,31 +533,44 @@ void kvm_destroy_vcpus(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
return container_of(mn, struct kvm, mmu_notifier);
}
-typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
+typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
-typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
- unsigned long end);
+typedef void (*on_lock_fn_t)(struct kvm *kvm);
-typedef void (*on_unlock_fn_t)(struct kvm *kvm);
-
-struct kvm_hva_range {
- unsigned long start;
- unsigned long end;
+struct kvm_mmu_notifier_range {
+ /*
+ * 64-bit addresses, as KVM notifiers can operate on host virtual
+ * addresses (unsigned long) and guest physical addresses (64-bit).
+ */
+ u64 start;
+ u64 end;
union kvm_mmu_notifier_arg arg;
- hva_handler_t handler;
+ gfn_handler_t handler;
on_lock_fn_t on_lock;
- on_unlock_fn_t on_unlock;
bool flush_on_ret;
bool may_block;
};
/*
+ * The inner-most helper returns a tuple containing the return value from the
+ * arch- and action-specific handler, plus a flag indicating whether or not at
+ * least one memslot was found, i.e. if the handler found guest memory.
+ *
+ * Note, most notifiers are averse to booleans, so even though KVM tracks the
+ * return from arch code as a bool, outer helpers will cast it to an int. :-(
+ */
+typedef struct kvm_mmu_notifier_return {
+ bool ret;
+ bool found_memslot;
+} kvm_mn_ret_t;
+
+/*
* Use a dedicated stub instead of NULL to indicate that there is no callback
* function/handler. The compiler technically can't guarantee that a real
* function will have a non-zero address, and so it will generate code to
@@ -578,26 +591,29 @@ static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
node; \
node = interval_tree_iter_next(node, start, last)) \
-static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
- const struct kvm_hva_range *range)
+static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
+ const struct kvm_mmu_notifier_range *range)
{
- bool ret = false, locked = false;
+ struct kvm_mmu_notifier_return r = {
+ .ret = false,
+ .found_memslot = false,
+ };
struct kvm_gfn_range gfn_range;
struct kvm_memory_slot *slot;
struct kvm_memslots *slots;
int i, idx;
if (WARN_ON_ONCE(range->end <= range->start))
- return 0;
+ return r;
/* A null handler is allowed if and only if on_lock() is provided. */
if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
IS_KVM_NULL_FN(range->handler)))
- return 0;
+ return r;
idx = srcu_read_lock(&kvm->srcu);
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
struct interval_tree_node *node;
slots = __kvm_memslots(kvm, i);
@@ -606,9 +622,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
unsigned long hva_start, hva_end;
slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
- hva_start = max(range->start, slot->userspace_addr);
- hva_end = min(range->end, slot->userspace_addr +
- (slot->npages << PAGE_SHIFT));
+ hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
+ hva_end = min_t(unsigned long, range->end,
+ slot->userspace_addr + (slot->npages << PAGE_SHIFT));
/*
* To optimize for the likely case where the address
@@ -627,71 +643,66 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
gfn_range.slot = slot;
- if (!locked) {
- locked = true;
+ if (!r.found_memslot) {
+ r.found_memslot = true;
KVM_MMU_LOCK(kvm);
if (!IS_KVM_NULL_FN(range->on_lock))
- range->on_lock(kvm, range->start, range->end);
+ range->on_lock(kvm);
+
if (IS_KVM_NULL_FN(range->handler))
break;
}
- ret |= range->handler(kvm, &gfn_range);
+ r.ret |= range->handler(kvm, &gfn_range);
}
}
- if (range->flush_on_ret && ret)
+ if (range->flush_on_ret && r.ret)
kvm_flush_remote_tlbs(kvm);
- if (locked) {
+ if (r.found_memslot)
KVM_MMU_UNLOCK(kvm);
- if (!IS_KVM_NULL_FN(range->on_unlock))
- range->on_unlock(kvm);
- }
srcu_read_unlock(&kvm->srcu, idx);
- /* The notifiers are averse to booleans. :-( */
- return (int)ret;
+ return r;
}
static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
union kvm_mmu_notifier_arg arg,
- hva_handler_t handler)
+ gfn_handler_t handler)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- const struct kvm_hva_range range = {
+ const struct kvm_mmu_notifier_range range = {
.start = start,
.end = end,
.arg = arg,
.handler = handler,
.on_lock = (void *)kvm_null_fn,
- .on_unlock = (void *)kvm_null_fn,
.flush_on_ret = true,
.may_block = false,
};
- return __kvm_handle_hva_range(kvm, &range);
+ return __kvm_handle_hva_range(kvm, &range).ret;
}
static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
unsigned long start,
unsigned long end,
- hva_handler_t handler)
+ gfn_handler_t handler)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- const struct kvm_hva_range range = {
+ const struct kvm_mmu_notifier_range range = {
.start = start,
.end = end,
.handler = handler,
.on_lock = (void *)kvm_null_fn,
- .on_unlock = (void *)kvm_null_fn,
.flush_on_ret = false,
.may_block = false,
};
- return __kvm_handle_hva_range(kvm, &range);
+ return __kvm_handle_hva_range(kvm, &range).ret;
}
static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -736,16 +747,29 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
}
-void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
- unsigned long end)
+void kvm_mmu_invalidate_begin(struct kvm *kvm)
{
+ lockdep_assert_held_write(&kvm->mmu_lock);
/*
* The count increase must become visible at unlock time as no
* spte can be established without taking the mmu_lock and
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_invalidate_in_progress++;
+
if (likely(kvm->mmu_invalidate_in_progress == 1)) {
+ kvm->mmu_invalidate_range_start = INVALID_GPA;
+ kvm->mmu_invalidate_range_end = INVALID_GPA;
+ }
+}
+
+void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
+
+ if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
kvm->mmu_invalidate_range_start = start;
kvm->mmu_invalidate_range_end = end;
} else {
@@ -765,16 +789,21 @@ void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
}
}
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
+ return kvm_unmap_gfn_range(kvm, range);
+}
+
static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
const struct mmu_notifier_range *range)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- const struct kvm_hva_range hva_range = {
+ const struct kvm_mmu_notifier_range hva_range = {
.start = range->start,
.end = range->end,
- .handler = kvm_unmap_gfn_range,
+ .handler = kvm_mmu_unmap_gfn_range,
.on_lock = kvm_mmu_invalidate_begin,
- .on_unlock = kvm_arch_guest_memory_reclaimed,
.flush_on_ret = true,
.may_block = mmu_notifier_range_blockable(range),
};
@@ -806,14 +835,21 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
hva_range.may_block);
- __kvm_handle_hva_range(kvm, &hva_range);
+ /*
+ * If one or more memslots were found and thus zapped, notify arch code
+ * that guest memory has been reclaimed. This needs to be done *after*
+ * dropping mmu_lock, as x86's reclaim path is slooooow.
+ */
+ if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
+ kvm_arch_guest_memory_reclaimed(kvm);
return 0;
}
-void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
- unsigned long end)
+void kvm_mmu_invalidate_end(struct kvm *kvm)
{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
/*
* This sequence increase will notify the kvm page fault that
* the page that is going to be mapped in the spte could have
@@ -827,18 +863,24 @@ void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
* in conjunction with the smp_rmb in mmu_invalidate_retry().
*/
kvm->mmu_invalidate_in_progress--;
+ KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
+
+ /*
+ * Assert that at least one range was added between start() and end().
+ * Not adding a range isn't fatal, but it is a KVM bug.
+ */
+ WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
}
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
const struct mmu_notifier_range *range)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- const struct kvm_hva_range hva_range = {
+ const struct kvm_mmu_notifier_range hva_range = {
.start = range->start,
.end = range->end,
.handler = (void *)kvm_null_fn,
.on_lock = kvm_mmu_invalidate_end,
- .on_unlock = (void *)kvm_null_fn,
.flush_on_ret = false,
.may_block = mmu_notifier_range_blockable(range),
};
@@ -857,8 +899,6 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
*/
if (wake)
rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
-
- BUG_ON(kvm->mmu_invalidate_in_progress < 0);
}
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -932,14 +972,14 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}
-#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
+#else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
return 0;
}
-#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
static int kvm_pm_notifier_call(struct notifier_block *bl,
@@ -985,6 +1025,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
/* This does not remove the slot from struct kvm_memslots data structures */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
+ if (slot->flags & KVM_MEM_GUEST_MEMFD)
+ kvm_gmem_unbind(slot);
+
kvm_destroy_dirty_bitmap(slot);
kvm_arch_free_memslot(kvm, slot);
@@ -1166,6 +1209,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
spin_lock_init(&kvm->mn_invalidate_lock);
rcuwait_init(&kvm->mn_memslots_update_rcuwait);
xa_init(&kvm->vcpu_array);
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+ xa_init(&kvm->mem_attr_array);
+#endif
INIT_LIST_HEAD(&kvm->gpc_list);
spin_lock_init(&kvm->gpc_lock);
@@ -1190,7 +1236,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
goto out_err_no_irq_srcu;
refcount_set(&kvm->users_count, 1);
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
for (j = 0; j < 2; j++) {
slots = &kvm->__memslots[i][j];
@@ -1222,7 +1268,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
if (r)
goto out_err_no_disable;
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif
@@ -1256,7 +1302,7 @@ out_err:
out_err_no_debugfs:
kvm_coalesced_mmio_free(kvm);
out_no_coalesced_mmio:
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
if (kvm->mmu_notifier.ops)
mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
@@ -1315,7 +1361,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm->buses[i] = NULL;
}
kvm_coalesced_mmio_free(kvm);
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
/*
* At this point, pending calls to invalidate_range_start()
@@ -1324,20 +1370,30 @@ static void kvm_destroy_vm(struct kvm *kvm)
* No threads can be waiting in kvm_swap_active_memslots() as the
* last reference on KVM has been dropped, but freeing
* memslots would deadlock without this manual intervention.
+ *
+ * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
+ * notifier between a start() and end(), then there shouldn't be any
+ * in-progress invalidations.
*/
WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
- kvm->mn_active_invalidate_count = 0;
+ if (kvm->mn_active_invalidate_count)
+ kvm->mn_active_invalidate_count = 0;
+ else
+ WARN_ON(kvm->mmu_invalidate_in_progress);
#else
kvm_flush_shadow_all(kvm);
#endif
kvm_arch_destroy_vm(kvm);
kvm_destroy_devices(kvm);
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
}
cleanup_srcu_struct(&kvm->irq_srcu);
cleanup_srcu_struct(&kvm->srcu);
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+ xa_destroy(&kvm->mem_attr_array);
+#endif
kvm_arch_free_vm(kvm);
preempt_notifier_dec();
hardware_disable_all();
@@ -1538,10 +1594,26 @@ static void kvm_replace_memslot(struct kvm *kvm,
}
}
-static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
+/*
+ * Flags that do not access any of the extra space of struct
+ * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS
+ * only allows these.
+ */
+#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
+ (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
+
+static int check_memory_region_flags(struct kvm *kvm,
+ const struct kvm_userspace_memory_region2 *mem)
{
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+ if (kvm_arch_has_private_mem(kvm))
+ valid_flags |= KVM_MEM_GUEST_MEMFD;
+
+ /* Dirty logging private memory is not currently supported. */
+ if (mem->flags & KVM_MEM_GUEST_MEMFD)
+ valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
#ifdef __KVM_HAVE_READONLY_MEM
valid_flags |= KVM_MEM_READONLY;
#endif
@@ -1603,7 +1675,7 @@ static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
* space 0 will use generations 0, 2, 4, ... while address space 1 will
* use generations 1, 3, 5, ...
*/
- gen += KVM_ADDRESS_SPACE_NUM;
+ gen += kvm_arch_nr_memslot_as_ids(kvm);
kvm_arch_memslots_updated(kvm, gen);
@@ -1940,7 +2012,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
* Must be called holding kvm->slots_lock for write.
*/
int __kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem)
+ const struct kvm_userspace_memory_region2 *mem)
{
struct kvm_memory_slot *old, *new;
struct kvm_memslots *slots;
@@ -1950,7 +2022,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
int as_id, id;
int r;
- r = check_memory_region_flags(mem);
+ r = check_memory_region_flags(kvm, mem);
if (r)
return r;
@@ -1969,7 +2041,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
!access_ok((void __user *)(unsigned long)mem->userspace_addr,
mem->memory_size))
return -EINVAL;
- if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
+ if (mem->flags & KVM_MEM_GUEST_MEMFD &&
+ (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
+ mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
+ return -EINVAL;
+ if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
return -EINVAL;
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
return -EINVAL;
@@ -2007,6 +2083,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
return -EINVAL;
} else { /* Modify an existing slot. */
+ /* Private memslots are immutable, they can only be deleted. */
+ if (mem->flags & KVM_MEM_GUEST_MEMFD)
+ return -EINVAL;
if ((mem->userspace_addr != old->userspace_addr) ||
(npages != old->npages) ||
((mem->flags ^ old->flags) & KVM_MEM_READONLY))
@@ -2035,16 +2114,29 @@ int __kvm_set_memory_region(struct kvm *kvm,
new->npages = npages;
new->flags = mem->flags;
new->userspace_addr = mem->userspace_addr;
+ if (mem->flags & KVM_MEM_GUEST_MEMFD) {
+ r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
+ if (r)
+ goto out;
+ }
r = kvm_set_memslot(kvm, old, new, change);
if (r)
- kfree(new);
+ goto out_unbind;
+
+ return 0;
+
+out_unbind:
+ if (mem->flags & KVM_MEM_GUEST_MEMFD)
+ kvm_gmem_unbind(new);
+out:
+ kfree(new);
return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
int kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem)
+ const struct kvm_userspace_memory_region2 *mem)
{
int r;
@@ -2056,7 +2148,7 @@ int kvm_set_memory_region(struct kvm *kvm,
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem)
+ struct kvm_userspace_memory_region2 *mem)
{
if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
return -EINVAL;
@@ -2089,7 +2181,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
as_id = log->slot >> 16;
id = (u16)log->slot;
- if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+ if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
return -EINVAL;
slots = __kvm_memslots(kvm, as_id);
@@ -2151,7 +2243,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
as_id = log->slot >> 16;
id = (u16)log->slot;
- if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+ if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
return -EINVAL;
slots = __kvm_memslots(kvm, as_id);
@@ -2263,7 +2355,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
as_id = log->slot >> 16;
id = (u16)log->slot;
- if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+ if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
return -EINVAL;
if (log->first_page & 63)
@@ -2335,6 +2427,200 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+/*
+ * Returns true if _all_ gfns in the range [@start, @end) have attributes
+ * matching @attrs.
+ */
+bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+ unsigned long attrs)
+{
+ XA_STATE(xas, &kvm->mem_attr_array, start);
+ unsigned long index;
+ bool has_attrs;
+ void *entry;
+
+ rcu_read_lock();
+
+ if (!attrs) {
+ has_attrs = !xas_find(&xas, end - 1);
+ goto out;
+ }
+
+ has_attrs = true;
+ for (index = start; index < end; index++) {
+ do {
+ entry = xas_next(&xas);
+ } while (xas_retry(&xas, entry));
+
+ if (xas.xa_index != index || xa_to_value(entry) != attrs) {
+ has_attrs = false;
+ break;
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ return has_attrs;
+}
+
+static u64 kvm_supported_mem_attributes(struct kvm *kvm)
+{
+ if (!kvm || kvm_arch_has_private_mem(kvm))
+ return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+ return 0;
+}
+
+static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
+ struct kvm_mmu_notifier_range *range)
+{
+ struct kvm_gfn_range gfn_range;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ struct kvm_memslot_iter iter;
+ bool found_memslot = false;
+ bool ret = false;
+ int i;
+
+ gfn_range.arg = range->arg;
+ gfn_range.may_block = range->may_block;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
+ slot = iter.slot;
+ gfn_range.slot = slot;
+
+ gfn_range.start = max(range->start, slot->base_gfn);
+ gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
+ if (gfn_range.start >= gfn_range.end)
+ continue;
+
+ if (!found_memslot) {
+ found_memslot = true;
+ KVM_MMU_LOCK(kvm);
+ if (!IS_KVM_NULL_FN(range->on_lock))
+ range->on_lock(kvm);
+ }
+
+ ret |= range->handler(kvm, &gfn_range);
+ }
+ }
+
+ if (range->flush_on_ret && ret)
+ kvm_flush_remote_tlbs(kvm);
+
+ if (found_memslot)
+ KVM_MMU_UNLOCK(kvm);
+}
+
+static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ /*
+ * Unconditionally add the range to the invalidation set, regardless of
+ * whether or not the arch callback actually needs to zap SPTEs. E.g.
+ * if KVM supports RWX attributes in the future and the attributes are
+ * going from R=>RW, zapping isn't strictly necessary. Unconditionally
+ * adding the range allows KVM to require that MMU invalidations add at
+ * least one range between begin() and end(), e.g. allows KVM to detect
+ * bugs where the add() is missed. Relaxing the rule *might* be safe,
+ * but it's not obvious that allowing new mappings while the attributes
+ * are in flux is desirable or worth the complexity.
+ */
+ kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
+
+ return kvm_arch_pre_set_memory_attributes(kvm, range);
+}
+
+/* Set @attributes for the gfn range [@start, @end). */
+static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+ unsigned long attributes)
+{
+ struct kvm_mmu_notifier_range pre_set_range = {
+ .start = start,
+ .end = end,
+ .handler = kvm_pre_set_memory_attributes,
+ .on_lock = kvm_mmu_invalidate_begin,
+ .flush_on_ret = true,
+ .may_block = true,
+ };
+ struct kvm_mmu_notifier_range post_set_range = {
+ .start = start,
+ .end = end,
+ .arg.attributes = attributes,
+ .handler = kvm_arch_post_set_memory_attributes,
+ .on_lock = kvm_mmu_invalidate_end,
+ .may_block = true,
+ };
+ unsigned long i;
+ void *entry;
+ int r = 0;
+
+ entry = attributes ? xa_mk_value(attributes) : NULL;
+
+ mutex_lock(&kvm->slots_lock);
+
+ /* Nothing to do if the entire range as the desired attributes. */
+ if (kvm_range_has_memory_attributes(kvm, start, end, attributes))
+ goto out_unlock;
+
+ /*
+ * Reserve memory ahead of time to avoid having to deal with failures
+ * partway through setting the new attributes.
+ */
+ for (i = start; i < end; i++) {
+ r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
+ if (r)
+ goto out_unlock;
+ }
+
+ kvm_handle_gfn_range(kvm, &pre_set_range);
+
+ for (i = start; i < end; i++) {
+ r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
+ GFP_KERNEL_ACCOUNT));
+ KVM_BUG_ON(r, kvm);
+ }
+
+ kvm_handle_gfn_range(kvm, &post_set_range);
+
+out_unlock:
+ mutex_unlock(&kvm->slots_lock);
+
+ return r;
+}
+static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
+ struct kvm_memory_attributes *attrs)
+{
+ gfn_t start, end;
+
+ /* flags is currently not used. */
+ if (attrs->flags)
+ return -EINVAL;
+ if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
+ return -EINVAL;
+ if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
+ return -EINVAL;
+ if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
+ return -EINVAL;
+
+ start = attrs->address >> PAGE_SHIFT;
+ end = (attrs->address + attrs->size) >> PAGE_SHIFT;
+
+ /*
+ * xarray tracks data using "unsigned long", and as a result so does
+ * KVM. For simplicity, supports generic attributes only on 64-bit
+ * architectures.
+ */
+ BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
+
+ return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
+}
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -4527,13 +4813,14 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
{
switch (arg) {
case KVM_CAP_USER_MEMORY:
+ case KVM_CAP_USER_MEMORY2:
case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
case KVM_CAP_INTERNAL_ERROR_DATA:
#ifdef CONFIG_HAVE_KVM_MSI
case KVM_CAP_SIGNAL_MSI:
#endif
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
case KVM_CAP_IRQFD:
#endif
case KVM_CAP_IOEVENTFD_ANY_LENGTH:
@@ -4555,9 +4842,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_IRQ_ROUTING:
return KVM_MAX_IRQ_ROUTES;
#endif
-#if KVM_ADDRESS_SPACE_NUM > 1
+#if KVM_MAX_NR_ADDRESS_SPACES > 1
case KVM_CAP_MULTI_ADDRESS_SPACE:
- return KVM_ADDRESS_SPACE_NUM;
+ if (kvm)
+ return kvm_arch_nr_memslot_as_ids(kvm);
+ return KVM_MAX_NR_ADDRESS_SPACES;
#endif
case KVM_CAP_NR_MEMSLOTS:
return KVM_USER_MEM_SLOTS;
@@ -4579,6 +4868,14 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_BINARY_STATS_FD:
case KVM_CAP_SYSTEM_EVENT_DATA:
return 1;
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+ case KVM_CAP_MEMORY_ATTRIBUTES:
+ return kvm_supported_mem_attributes(kvm);
+#endif
+#ifdef CONFIG_KVM_PRIVATE_MEM
+ case KVM_CAP_GUEST_MEMFD:
+ return !kvm || kvm_arch_has_private_mem(kvm);
+#endif
default:
break;
}
@@ -4657,7 +4954,7 @@ bool kvm_are_all_memslots_empty(struct kvm *kvm)
lockdep_assert_held(&kvm->slots_lock);
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
return false;
}
@@ -4783,6 +5080,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
return fd;
}
+#define SANITY_CHECK_MEM_REGION_FIELD(field) \
+do { \
+ BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
+ offsetof(struct kvm_userspace_memory_region2, field)); \
+ BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
+ sizeof_field(struct kvm_userspace_memory_region2, field)); \
+} while (0)
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4805,15 +5110,39 @@ static long kvm_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
break;
}
+ case KVM_SET_USER_MEMORY_REGION2:
case KVM_SET_USER_MEMORY_REGION: {
- struct kvm_userspace_memory_region kvm_userspace_mem;
+ struct kvm_userspace_memory_region2 mem;
+ unsigned long size;
+
+ if (ioctl == KVM_SET_USER_MEMORY_REGION) {
+ /*
+ * Fields beyond struct kvm_userspace_memory_region shouldn't be
+ * accessed, but avoid leaking kernel memory in case of a bug.
+ */
+ memset(&mem, 0, sizeof(mem));
+ size = sizeof(struct kvm_userspace_memory_region);
+ } else {
+ size = sizeof(struct kvm_userspace_memory_region2);
+ }
+
+ /* Ensure the common parts of the two structs are identical. */
+ SANITY_CHECK_MEM_REGION_FIELD(slot);
+ SANITY_CHECK_MEM_REGION_FIELD(flags);
+ SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
+ SANITY_CHECK_MEM_REGION_FIELD(memory_size);
+ SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
r = -EFAULT;
- if (copy_from_user(&kvm_userspace_mem, argp,
- sizeof(kvm_userspace_mem)))
+ if (copy_from_user(&mem, argp, size))
goto out;
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
+ r = -EINVAL;
+ if (ioctl == KVM_SET_USER_MEMORY_REGION &&
+ (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
+ goto out;
+
+ r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
break;
}
case KVM_GET_DIRTY_LOG: {
@@ -4941,6 +5270,18 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+ case KVM_SET_MEMORY_ATTRIBUTES: {
+ struct kvm_memory_attributes attrs;
+
+ r = -EFAULT;
+ if (copy_from_user(&attrs, argp, sizeof(attrs)))
+ goto out;
+
+ r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
+ break;
+ }
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
case KVM_CREATE_DEVICE: {
struct kvm_create_device cd;
@@ -4968,6 +5309,18 @@ static long kvm_vm_ioctl(struct file *filp,
case KVM_GET_STATS_FD:
r = kvm_vm_ioctl_get_stats_fd(kvm);
break;
+#ifdef CONFIG_KVM_PRIVATE_MEM
+ case KVM_CREATE_GUEST_MEMFD: {
+ struct kvm_create_guest_memfd guest_memfd;
+
+ r = -EFAULT;
+ if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
+ goto out;
+
+ r = kvm_gmem_create(kvm, &guest_memfd);
+ break;
+ }
+#endif
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
}
@@ -5139,11 +5492,6 @@ static long kvm_dev_ioctl(struct file *filp,
r += PAGE_SIZE; /* coalesced mmio ring page */
#endif
break;
- case KVM_TRACE_ENABLE:
- case KVM_TRACE_PAUSE:
- case KVM_TRACE_DISABLE:
- r = -EOPNOTSUPP;
- break;
default:
return kvm_arch_dev_ioctl(filp, ioctl, arg);
}
@@ -6104,6 +6452,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
if (WARN_ON_ONCE(r))
goto err_vfio;
+ kvm_gmem_init(module);
+
/*
* Registration _must_ be the very last thing done, as this exposes
* /dev/kvm to userspace, i.e. all infrastructure must be setup!
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 180f1a09e6ba..ecefc7ec51af 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -37,4 +37,30 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
}
#endif /* HAVE_KVM_PFNCACHE */
+#ifdef CONFIG_KVM_PRIVATE_MEM
+void kvm_gmem_init(struct module *module);
+int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
+int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned int fd, loff_t offset);
+void kvm_gmem_unbind(struct kvm_memory_slot *slot);
+#else
+static inline void kvm_gmem_init(struct module *module)
+{
+
+}
+
+static inline int kvm_gmem_bind(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned int fd, loff_t offset)
+{
+ WARN_ON_ONCE(1);
+ return -EIO;
+}
+
+static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+ WARN_ON_ONCE(1);
+}
+#endif /* CONFIG_KVM_PRIVATE_MEM */
+
#endif /* __KVM_MM_H__ */