summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/svm/sev.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/svm/sev.c')
-rw-r--r--arch/x86/kvm/svm/sev.c1940
1 files changed, 1840 insertions, 100 deletions
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 759581bb2128..0b851ef937f2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -19,11 +19,14 @@
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
+#include <uapi/linux/sev-guest.h>
#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
+#include <asm/sev.h>
#include "mmu.h"
#include "x86.h"
@@ -32,22 +35,12 @@
#include "cpuid.h"
#include "trace.h"
-#ifndef CONFIG_KVM_AMD_SEV
-/*
- * When this config is not defined, SEV feature is not supported and APIs in
- * this file are not used but this file still gets compiled into the KVM AMD
- * module.
- *
- * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
- * misc_res_type {} defined in linux/misc_cgroup.h.
- *
- * Below macros allow compilation to succeed.
- */
-#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
-#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
-#endif
+#define GHCB_VERSION_MAX 2ULL
+#define GHCB_VERSION_DEFAULT 2ULL
+#define GHCB_VERSION_MIN 1ULL
+
+#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
-#ifdef CONFIG_KVM_AMD_SEV
/* enable/disable SEV support */
static bool sev_enabled = true;
module_param_named(sev, sev_enabled, bool, 0444);
@@ -56,14 +49,35 @@ module_param_named(sev, sev_enabled, bool, 0444);
static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
+/* enable/disable SEV-SNP support */
+static bool sev_snp_enabled = true;
+module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
+
/* enable/disable SEV-ES DebugSwap support */
-static bool sev_es_debug_swap_enabled = false;
+static bool sev_es_debug_swap_enabled = true;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
-#else
-#define sev_enabled false
-#define sev_es_enabled false
-#define sev_es_debug_swap_enabled false
-#endif /* CONFIG_KVM_AMD_SEV */
+static u64 sev_supported_vmsa_features;
+
+#define AP_RESET_HOLD_NONE 0
+#define AP_RESET_HOLD_NAE_EVENT 1
+#define AP_RESET_HOLD_MSR_PROTO 2
+
+/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
+#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
+#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
+#define SNP_POLICY_MASK_SMT BIT_ULL(16)
+#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
+#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
+#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
+
+#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
+ SNP_POLICY_MASK_API_MAJOR | \
+ SNP_POLICY_MASK_SMT | \
+ SNP_POLICY_MASK_RSVD_MBO | \
+ SNP_POLICY_MASK_DEBUG | \
+ SNP_POLICY_MASK_SINGLE_SOCKET)
+
+#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
static u8 sev_enc_bit;
static DECLARE_RWSEM(sev_deactivate_lock);
@@ -75,6 +89,8 @@ static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
+static int snp_decommission_context(struct kvm *kvm);
+
struct enc_region {
struct list_head list;
unsigned long npages;
@@ -101,19 +117,32 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
down_write(&sev_deactivate_lock);
wbinvd_on_all_cpus();
- ret = sev_guest_df_flush(&error);
+
+ if (sev_snp_enabled)
+ ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
+ else
+ ret = sev_guest_df_flush(&error);
up_write(&sev_deactivate_lock);
if (ret)
- pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+ pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
+ sev_snp_enabled ? "-SNP" : "", ret, error);
return ret;
}
static inline bool is_mirroring_enc_context(struct kvm *kvm)
{
- return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+ return !!to_kvm_sev_info(kvm)->enc_context_owner;
+}
+
+static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
+ return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
}
/* Must be called with the sev_bitmap_lock held */
@@ -234,6 +263,53 @@ static void sev_decommission(unsigned int handle)
sev_guest_decommission(&decommission, NULL);
}
+/*
+ * Transition a page to hypervisor-owned/shared state in the RMP table. This
+ * should not fail under normal conditions, but leak the page should that
+ * happen since it will no longer be usable by the host due to RMP protections.
+ */
+static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
+{
+ if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
+ snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * Certain page-states, such as Pre-Guest and Firmware pages (as documented
+ * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
+ * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
+ * unless they are reclaimed first.
+ *
+ * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
+ * might not be usable by the host due to being set as immutable or still
+ * being associated with a guest ASID.
+ *
+ * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
+ * converted back to shared, as the page is no longer usable due to RMP
+ * protections, and it's infeasible for the guest to continue on.
+ */
+static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
+{
+ struct sev_data_snp_page_reclaim data = {0};
+ int fw_err, rc;
+
+ data.paddr = __sme_set(pfn << PAGE_SHIFT);
+ rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
+ if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
+ snp_leak_pages(pfn, 1);
+ return -EIO;
+ }
+
+ if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
+ return -EIO;
+
+ return rc;
+}
+
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
struct sev_data_deactivate deactivate;
@@ -251,20 +327,119 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
sev_decommission(handle);
}
-static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+/*
+ * This sets up bounce buffers/firmware pages to handle SNP Guest Request
+ * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
+ * 2.0 specification for more details.
+ *
+ * Technically, when an SNP Guest Request is issued, the guest will provide its
+ * own request/response pages, which could in theory be passed along directly
+ * to firmware rather than using bounce pages. However, these pages would need
+ * special care:
+ *
+ * - Both pages are from shared guest memory, so they need to be protected
+ * from migration/etc. occurring while firmware reads/writes to them. At a
+ * minimum, this requires elevating the ref counts and potentially needing
+ * an explicit pinning of the memory. This places additional restrictions
+ * on what type of memory backends userspace can use for shared guest
+ * memory since there is some reliance on using refcounted pages.
+ *
+ * - The response page needs to be switched to Firmware-owned[1] state
+ * before the firmware can write to it, which can lead to potential
+ * host RMP #PFs if the guest is misbehaved and hands the host a
+ * guest page that KVM might write to for other reasons (e.g. virtio
+ * buffers/etc.).
+ *
+ * Both of these issues can be avoided completely by using separately-allocated
+ * bounce pages for both the request/response pages and passing those to
+ * firmware instead. So that's what is being set up here.
+ *
+ * Guest requests rely on message sequence numbers to ensure requests are
+ * issued to firmware in the order the guest issues them, so concurrent guest
+ * requests generally shouldn't happen. But a misbehaved guest could issue
+ * concurrent guest requests in theory, so a mutex is used to serialize
+ * access to the bounce buffers.
+ *
+ * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
+ * details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
+ * in the APM for details on the related RMP restrictions.
+ */
+static int snp_guest_req_init(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct page *req_page;
+
+ req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!req_page)
+ return -ENOMEM;
+
+ sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!sev->guest_resp_buf) {
+ __free_page(req_page);
+ return -EIO;
+ }
+
+ sev->guest_req_buf = page_address(req_page);
+ mutex_init(&sev->guest_req_mutex);
+
+ return 0;
+}
+
+static void snp_guest_req_cleanup(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ if (sev->guest_resp_buf)
+ snp_free_firmware_page(sev->guest_resp_buf);
+
+ if (sev->guest_req_buf)
+ __free_page(virt_to_page(sev->guest_req_buf));
+
+ sev->guest_req_buf = NULL;
+ sev->guest_resp_buf = NULL;
+}
+
+static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_init *data,
+ unsigned long vm_type)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_platform_init_args init_args = {0};
+ bool es_active = vm_type != KVM_X86_SEV_VM;
+ u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
int ret;
if (kvm->created_vcpus)
return -EINVAL;
+ if (data->flags)
+ return -EINVAL;
+
+ if (data->vmsa_features & ~valid_vmsa_features)
+ return -EINVAL;
+
+ if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
+ return -EINVAL;
+
if (unlikely(sev->active))
return -EINVAL;
sev->active = true;
- sev->es_active = argp->id == KVM_SEV_ES_INIT;
+ sev->es_active = es_active;
+ sev->vmsa_features = data->vmsa_features;
+ sev->ghcb_version = data->ghcb_version;
+
+ /*
+ * Currently KVM supports the full range of mandatory features defined
+ * by version 2 of the GHCB protocol, so default to that for SEV-ES
+ * guests created via KVM_SEV_INIT2.
+ */
+ if (sev->es_active && !sev->ghcb_version)
+ sev->ghcb_version = GHCB_VERSION_DEFAULT;
+
+ if (vm_type == KVM_X86_SNP_VM)
+ sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
+
ret = sev_asid_new(sev);
if (ret)
goto e_no_asid;
@@ -274,8 +449,13 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (ret)
goto e_free;
+ /* This needs to happen after SEV/SNP firmware initialization. */
+ if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm))
+ goto e_free;
+
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
+ sev->need_init = false;
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
@@ -286,11 +466,54 @@ e_free:
sev_asid_free(sev);
sev->asid = 0;
e_no_asid:
+ sev->vmsa_features = 0;
sev->es_active = false;
sev->active = false;
return ret;
}
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_init data = {
+ .vmsa_features = 0,
+ .ghcb_version = 0,
+ };
+ unsigned long vm_type;
+
+ if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
+ return -EINVAL;
+
+ vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);
+
+ /*
+ * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
+ * continue to only ever support the minimal GHCB protocol version.
+ */
+ if (vm_type == KVM_X86_SEV_ES_VM)
+ data.ghcb_version = GHCB_VERSION_MIN;
+
+ return __sev_guest_init(kvm, argp, &data, vm_type);
+}
+
+static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_init data;
+
+ if (!sev->need_init)
+ return -EINVAL;
+
+ if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
+ kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
+ kvm->arch.vm_type != KVM_X86_SNP_VM)
+ return -EINVAL;
+
+ if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
+ return -EFAULT;
+
+ return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
+}
+
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
unsigned int asid = sev_get_asid(kvm);
@@ -311,10 +534,10 @@ static int __sev_issue_cmd(int fd, int id, void *data, int *error)
int ret;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- ret = sev_issue_cmd_external_user(f.file, id, data, error);
+ ret = sev_issue_cmd_external_user(fd_file(f), id, data, error);
fdput(f);
return ret;
@@ -339,7 +562,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
memset(&start, 0, sizeof(start));
@@ -383,7 +606,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* return handle to userspace */
params.handle = start.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) {
sev_unbind_asid(kvm, start.handle);
ret = -EFAULT;
goto e_free_session;
@@ -522,7 +745,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
vaddr = params.uaddr;
@@ -580,7 +803,13 @@ e_unpin:
static int sev_es_sync_vmsa(struct vcpu_svm *svm)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
struct sev_es_save_area *save = svm->sev_es.vmsa;
+ struct xregs_state *xsave;
+ const u8 *s;
+ u8 *d;
+ int i;
/* Check some debug related fields before encrypting the VMSA */
if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
@@ -621,10 +850,44 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
- if (sev_es_debug_swap_enabled) {
- save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
- pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. "
- "This will not work starting with Linux 6.10\n");
+ save->sev_features = sev->vmsa_features;
+
+ /*
+ * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
+ * breaking older measurements.
+ */
+ if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) {
+ xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave;
+ save->x87_dp = xsave->i387.rdp;
+ save->mxcsr = xsave->i387.mxcsr;
+ save->x87_ftw = xsave->i387.twd;
+ save->x87_fsw = xsave->i387.swd;
+ save->x87_fcw = xsave->i387.cwd;
+ save->x87_fop = xsave->i387.fop;
+ save->x87_ds = 0;
+ save->x87_cs = 0;
+ save->x87_rip = xsave->i387.rip;
+
+ for (i = 0; i < 8; i++) {
+ /*
+ * The format of the x87 save area is undocumented and
+ * definitely not what you would expect. It consists of
+ * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes
+ * area with bytes 8-9 of each register.
+ */
+ d = save->fpreg_x87 + i * 8;
+ s = ((u8 *)xsave->i387.st_space) + i * 16;
+ memcpy(d, s, 8);
+ save->fpreg_x87[64 + i * 2] = s[8];
+ save->fpreg_x87[64 + i * 2 + 1] = s[9];
+ }
+ memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);
+
+ s = get_xsave_addr(xsave, XFEATURE_YMM);
+ if (s)
+ memcpy(save->fpreg_ymm, s, 256);
+ else
+ memset(save->fpreg_ymm, 0, 256);
}
pr_debug("Virtual Machine Save Area (VMSA):\n");
@@ -658,14 +921,29 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
vmsa.reserved = 0;
- vmsa.handle = to_kvm_svm(kvm)->sev_info.handle;
+ vmsa.handle = to_kvm_sev_info(kvm)->handle;
vmsa.address = __sme_pa(svm->sev_es.vmsa);
vmsa.len = PAGE_SIZE;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
if (ret)
return ret;
+ /*
+ * SEV-ES guests maintain an encrypted version of their FPU
+ * state which is restored and saved on VMRUN and VMEXIT.
+ * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
+ * do xsave/xrstor on it.
+ */
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
vcpu->arch.guest_state_protected = true;
+
+ /*
+ * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it
+ * only after setting guest_state_protected because KVM_SET_MSRS allows
+ * dynamic toggling of LBRV (for performance reason) on write access to
+ * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
+ */
+ svm_enable_lbrv(vcpu);
return 0;
}
@@ -695,7 +973,7 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- void __user *measure = (void __user *)(uintptr_t)argp->data;
+ void __user *measure = u64_to_user_ptr(argp->data);
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_measure data;
struct kvm_sev_launch_measure params;
@@ -715,7 +993,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!params.len)
goto cmd;
- p = (void __user *)(uintptr_t)params.uaddr;
+ p = u64_to_user_ptr(params.uaddr);
if (p) {
if (params.len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
@@ -788,7 +1066,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
params.state = data.state;
params.handle = data.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
ret = -EFAULT;
return ret;
@@ -953,7 +1231,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+ if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
return -EFAULT;
if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
@@ -1037,7 +1315,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
@@ -1101,7 +1379,7 @@ e_unpin_memory:
static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- void __user *report = (void __user *)(uintptr_t)argp->data;
+ void __user *report = u64_to_user_ptr(argp->data);
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_attestation_report data;
struct kvm_sev_attestation_report params;
@@ -1112,7 +1390,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
memset(&data, 0, sizeof(data));
@@ -1121,7 +1399,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!params.len)
goto cmd;
- p = (void __user *)(uintptr_t)params.uaddr;
+ p = u64_to_user_ptr(params.uaddr);
if (p) {
if (params.len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
@@ -1174,7 +1452,7 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
params->session_len = data.session_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
sizeof(struct kvm_sev_send_start)))
ret = -EFAULT;
@@ -1193,7 +1471,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_send_start)))
return -EFAULT;
@@ -1248,7 +1526,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
- if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+ if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr),
session_data, params.session_len)) {
ret = -EFAULT;
goto e_free_amd_cert;
@@ -1256,7 +1534,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
params.policy = data.policy;
params.session_len = data.session_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params,
sizeof(struct kvm_sev_send_start)))
ret = -EFAULT;
@@ -1287,7 +1565,7 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
params->hdr_len = data.hdr_len;
params->trans_len = data.trans_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
sizeof(struct kvm_sev_send_update_data)))
ret = -EFAULT;
@@ -1307,7 +1585,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_send_update_data)))
return -EFAULT;
@@ -1358,14 +1636,14 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free_trans_data;
/* copy transport buffer to user space */
- if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+ if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
trans_data, params.trans_len)) {
ret = -EFAULT;
goto e_free_trans_data;
}
/* Copy packet header to userspace. */
- if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+ if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
params.hdr_len))
ret = -EFAULT;
@@ -1417,7 +1695,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -ENOTTY;
/* Get parameter from the userspace */
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_receive_start)))
return -EFAULT;
@@ -1459,7 +1737,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
}
params.handle = start.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data,
+ if (copy_to_user(u64_to_user_ptr(argp->data),
&params, sizeof(struct kvm_sev_receive_start))) {
ret = -EFAULT;
sev_unbind_asid(kvm, start.handle);
@@ -1490,7 +1768,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -EINVAL;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_receive_update_data)))
return -EFAULT;
@@ -1705,6 +1983,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
dst->pages_locked = src->pages_locked;
dst->enc_context_owner = src->enc_context_owner;
dst->es_active = src->es_active;
+ dst->vmsa_features = src->vmsa_features;
src->asid = 0;
src->active = false;
@@ -1799,20 +2078,21 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
bool charged = false;
int ret;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!file_is_kvm(f.file)) {
+ if (!file_is_kvm(fd_file(f))) {
ret = -EBADF;
goto out_fput;
}
- source_kvm = f.file->private_data;
+ source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto out_fput;
- if (sev_guest(kvm) || !sev_guest(source_kvm)) {
+ if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
+ sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -1861,6 +2141,427 @@ out_fput:
return ret;
}
+int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
+{
+ if (group != KVM_X86_GRP_SEV)
+ return -ENXIO;
+
+ switch (attr) {
+ case KVM_X86_SEV_VMSA_FEATURES:
+ *val = sev_supported_vmsa_features;
+ return 0;
+
+ default:
+ return -ENXIO;
+ }
+}
+
+/*
+ * The guest context contains all the information, keys and metadata
+ * associated with the guest that the firmware tracks to implement SEV
+ * and SNP features. The firmware stores the guest context in hypervisor
+ * provide page via the SNP_GCTX_CREATE command.
+ */
+static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_snp_addr data = {};
+ void *context;
+ int rc;
+
+ /* Allocate memory for context page */
+ context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
+ if (!context)
+ return NULL;
+
+ data.address = __psp_pa(context);
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
+ if (rc) {
+ pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
+ rc, argp->error);
+ snp_free_firmware_page(context);
+ return NULL;
+ }
+
+ return context;
+}
+
+static int snp_bind_asid(struct kvm *kvm, int *error)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_activate data = {0};
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.asid = sev_get_asid(kvm);
+ return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
+}
+
+static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_launch_start start = {0};
+ struct kvm_sev_snp_launch_start params;
+ int rc;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ /* Don't allow userspace to allocate memory for more than 1 SNP context. */
+ if (sev->snp_context)
+ return -EINVAL;
+
+ sev->snp_context = snp_context_create(kvm, argp);
+ if (!sev->snp_context)
+ return -ENOTTY;
+
+ if (params.flags)
+ return -EINVAL;
+
+ if (params.policy & ~SNP_POLICY_MASK_VALID)
+ return -EINVAL;
+
+ /* Check for policy bits that must be set */
+ if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) ||
+ !(params.policy & SNP_POLICY_MASK_SMT))
+ return -EINVAL;
+
+ if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
+ return -EINVAL;
+
+ start.gctx_paddr = __psp_pa(sev->snp_context);
+ start.policy = params.policy;
+ memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
+ if (rc) {
+ pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ sev->fd = argp->sev_fd;
+ rc = snp_bind_asid(kvm, &argp->error);
+ if (rc) {
+ pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ return 0;
+
+e_free_context:
+ snp_decommission_context(kvm);
+
+ return rc;
+}
+
+struct sev_gmem_populate_args {
+ __u8 type;
+ int sev_fd;
+ int fw_error;
+};
+
+static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
+ void __user *src, int order, void *opaque)
+{
+ struct sev_gmem_populate_args *sev_populate_args = opaque;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ int n_private = 0, ret, i;
+ int npages = (1 << order);
+ gfn_t gfn;
+
+ if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
+ return -EINVAL;
+
+ for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
+ struct sev_data_snp_launch_update fw_args = {0};
+ bool assigned = false;
+ int level;
+
+ ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
+ if (ret || assigned) {
+ pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
+ __func__, gfn, ret, assigned);
+ ret = ret ? -EINVAL : -EEXIST;
+ goto err;
+ }
+
+ if (src) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
+ ret = -EFAULT;
+ goto err;
+ }
+ kunmap_local(vaddr);
+ }
+
+ ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
+ sev_get_asid(kvm), true);
+ if (ret)
+ goto err;
+
+ n_private++;
+
+ fw_args.gctx_paddr = __psp_pa(sev->snp_context);
+ fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
+ fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
+ fw_args.page_type = sev_populate_args->type;
+
+ ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &fw_args, &sev_populate_args->fw_error);
+ if (ret)
+ goto fw_err;
+ }
+
+ return 0;
+
+fw_err:
+ /*
+ * If the firmware command failed handle the reclaim and cleanup of that
+ * PFN specially vs. prior pages which can be cleaned up below without
+ * needing to reclaim in advance.
+ *
+ * Additionally, when invalid CPUID function entries are detected,
+ * firmware writes the expected values into the page and leaves it
+ * unencrypted so it can be used for debugging and error-reporting.
+ *
+ * Copy this page back into the source buffer so userspace can use this
+ * information to provide information on which CPUID leaves/fields
+ * failed CPUID validation.
+ */
+ if (!snp_page_reclaim(kvm, pfn + i) &&
+ sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
+ sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
+ pr_debug("Failed to write CPUID page back to userspace\n");
+
+ kunmap_local(vaddr);
+ }
+
+ /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
+ n_private--;
+
+err:
+ pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
+ __func__, ret, sev_populate_args->fw_error, n_private);
+ for (i = 0; i < n_private; i++)
+ kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
+
+ return ret;
+}
+
+static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_gmem_populate_args sev_populate_args = {0};
+ struct kvm_sev_snp_launch_update params;
+ struct kvm_memory_slot *memslot;
+ long npages, count;
+ void __user *src;
+ int ret = 0;
+
+ if (!sev_snp_guest(kvm) || !sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
+ params.gfn_start, params.len, params.type, params.flags);
+
+ if (!PAGE_ALIGNED(params.len) || params.flags ||
+ (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
+ return -EINVAL;
+
+ npages = params.len / PAGE_SIZE;
+
+ /*
+ * For each GFN that's being prepared as part of the initial guest
+ * state, the following pre-conditions are verified:
+ *
+ * 1) The backing memslot is a valid private memslot.
+ * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
+ * beforehand.
+ * 3) The PFN of the guest_memfd has not already been set to private
+ * in the RMP table.
+ *
+ * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
+ * faults if there's a race between a fault and an attribute update via
+ * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
+ * here. However, kvm->slots_lock guards against both this as well as
+ * concurrent memslot updates occurring while these checks are being
+ * performed, so use that here to make it easier to reason about the
+ * initial expected state and better guard against unexpected
+ * situations.
+ */
+ mutex_lock(&kvm->slots_lock);
+
+ memslot = gfn_to_memslot(kvm, params.gfn_start);
+ if (!kvm_slot_can_be_private(memslot)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sev_populate_args.sev_fd = argp->sev_fd;
+ sev_populate_args.type = params.type;
+ src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
+
+ count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
+ sev_gmem_post_populate, &sev_populate_args);
+ if (count < 0) {
+ argp->error = sev_populate_args.fw_error;
+ pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
+ __func__, count, argp->error);
+ ret = -EIO;
+ } else {
+ params.gfn_start += count;
+ params.len -= count * PAGE_SIZE;
+ if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
+ params.uaddr += count * PAGE_SIZE;
+
+ ret = 0;
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
+ ret = -EFAULT;
+ }
+
+out:
+ mutex_unlock(&kvm->slots_lock);
+
+ return ret;
+}
+
+static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_launch_update data = {};
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.page_type = SNP_PAGE_TYPE_VMSA;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /* Transition the VMSA page to a firmware state. */
+ ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
+ if (ret)
+ return ret;
+
+ /* Issue the SNP command to encrypt the VMSA */
+ data.address = __sme_pa(svm->sev_es.vmsa);
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &data, &argp->error);
+ if (ret) {
+ snp_page_reclaim(kvm, pfn);
+
+ return ret;
+ }
+
+ svm->vcpu.arch.guest_state_protected = true;
+ /*
+ * SEV-ES (and thus SNP) guest mandates LBR Virtualization to
+ * be _always_ ON. Enable it only after setting
+ * guest_state_protected because KVM_SET_MSRS allows dynamic
+ * toggling of LBRV (for performance reason) on write access to
+ * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
+ */
+ svm_enable_lbrv(vcpu);
+ }
+
+ return 0;
+}
+
+static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_snp_launch_finish params;
+ struct sev_data_snp_launch_finish *data;
+ void *id_block = NULL, *id_auth = NULL;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (!sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ if (params.flags)
+ return -EINVAL;
+
+ /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
+ ret = snp_launch_update_vmsa(kvm, argp);
+ if (ret)
+ return ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ if (params.id_block_en) {
+ id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
+ if (IS_ERR(id_block)) {
+ ret = PTR_ERR(id_block);
+ goto e_free;
+ }
+
+ data->id_block_en = 1;
+ data->id_block_paddr = __sme_pa(id_block);
+
+ id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
+ if (IS_ERR(id_auth)) {
+ ret = PTR_ERR(id_auth);
+ goto e_free_id_block;
+ }
+
+ data->id_auth_paddr = __sme_pa(id_auth);
+
+ if (params.auth_key_en)
+ data->auth_key_en = 1;
+ }
+
+ data->vcek_disabled = params.vcek_disabled;
+
+ memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
+ data->gctx_paddr = __psp_pa(sev->snp_context);
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+
+ /*
+ * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
+ * can be given to the guest simply by marking the RMP entry as private.
+ * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
+ */
+ if (!ret)
+ kvm->arch.pre_fault_allowed = true;
+
+ kfree(id_auth);
+
+e_free_id_block:
+ kfree(id_block);
+
+e_free:
+ kfree(data);
+
+ return ret;
+}
+
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -1884,6 +2585,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
goto out;
}
+ /*
+ * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
+ * allow the use of SNP-specific commands.
+ */
+ if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
+ r = -EPERM;
+ goto out;
+ }
+
switch (sev_cmd.id) {
case KVM_SEV_ES_INIT:
if (!sev_es_enabled) {
@@ -1894,6 +2604,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
+ case KVM_SEV_INIT2:
+ r = sev_guest_init2(kvm, &sev_cmd);
+ break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
@@ -1945,6 +2658,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
case KVM_SEV_RECEIVE_FINISH:
r = sev_receive_finish(kvm, &sev_cmd);
break;
+ case KVM_SEV_SNP_LAUNCH_START:
+ r = snp_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_UPDATE:
+ r = snp_launch_update(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_FINISH:
+ r = snp_launch_finish(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
@@ -2081,15 +2803,15 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!file_is_kvm(f.file)) {
+ if (!file_is_kvm(fd_file(f))) {
ret = -EBADF;
goto e_source_fput;
}
- source_kvm = f.file->private_data;
+ source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto e_source_fput;
@@ -2121,6 +2843,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
mirror_sev->asid = source_sev->asid;
mirror_sev->fd = source_sev->fd;
mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->need_init = false;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
INIT_LIST_HEAD(&mirror_sev->mirror_vms);
@@ -2139,6 +2862,31 @@ e_source_fput:
return ret;
}
+static int snp_decommission_context(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_addr data = {};
+ int ret;
+
+ /* If context is not created then do nothing */
+ if (!sev->snp_context)
+ return 0;
+
+ /* Do the decommision, which will unbind the ASID from the SNP context */
+ data.address = __sme_pa(sev->snp_context);
+ down_write(&sev_deactivate_lock);
+ ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
+ up_write(&sev_deactivate_lock);
+
+ if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
+ return ret;
+
+ snp_free_firmware_page(sev->snp_context);
+ sev->snp_context = NULL;
+
+ return 0;
+}
+
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -2180,22 +2928,42 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
- sev_unbind_asid(kvm, sev->handle);
+ if (sev_snp_guest(kvm)) {
+ snp_guest_req_cleanup(kvm);
+
+ /*
+ * Decomission handles unbinding of the ASID. If it fails for
+ * some unexpected reason, just leak the ASID.
+ */
+ if (snp_decommission_context(kvm))
+ return;
+ } else {
+ sev_unbind_asid(kvm, sev->handle);
+ }
+
sev_asid_free(sev);
}
void __init sev_set_cpu_caps(void)
{
- if (!sev_enabled)
- kvm_cpu_cap_clear(X86_FEATURE_SEV);
- if (!sev_es_enabled)
- kvm_cpu_cap_clear(X86_FEATURE_SEV_ES);
+ if (sev_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
+ }
+ if (sev_es_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
+ }
+ if (sev_snp_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
+ }
}
void __init sev_hardware_setup(void)
{
-#ifdef CONFIG_KVM_AMD_SEV
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
+ bool sev_snp_supported = false;
bool sev_es_supported = false;
bool sev_supported = false;
@@ -2269,6 +3037,12 @@ void __init sev_hardware_setup(void)
if (!boot_cpu_has(X86_FEATURE_SEV_ES))
goto out;
+ if (!lbrv) {
+ WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV),
+ "LBRV must be present for SEV-ES support");
+ goto out;
+ }
+
/* Has the system been allocated ASIDs for SEV-ES? */
if (min_sev_asid == 1)
goto out;
@@ -2276,6 +3050,7 @@ void __init sev_hardware_setup(void)
sev_es_asid_count = min_sev_asid - 1;
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
sev_es_supported = true;
+ sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
out:
if (boot_cpu_has(X86_FEATURE_SEV))
@@ -2288,13 +3063,22 @@ out:
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
sev_es_supported ? "enabled" : "disabled",
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
+ if (boot_cpu_has(X86_FEATURE_SEV_SNP))
+ pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
+ sev_snp_supported ? "enabled" : "disabled",
+ min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
sev_enabled = sev_supported;
sev_es_enabled = sev_es_supported;
+ sev_snp_enabled = sev_snp_supported;
+
if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
!cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
sev_es_debug_swap_enabled = false;
-#endif
+
+ sev_supported_vmsa_features = 0;
+ if (sev_es_debug_swap_enabled)
+ sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
}
void sev_hardware_unsetup(void)
@@ -2366,7 +3150,13 @@ do_wbinvd:
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
- if (!sev_guest(kvm))
+ /*
+ * With SNP+gmem, private/encrypted memory is unreachable via the
+ * hva-based mmu notifiers, so these events are only actually
+ * pertaining to shared pages where there is no need to perform
+ * the WBINVD to flush associated caches.
+ */
+ if (!sev_guest(kvm) || sev_snp_guest(kvm))
return;
wbinvd_on_all_cpus();
@@ -2381,11 +3171,24 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
+ /*
+ * If it's an SNP guest, then the VMSA was marked in the RMP table as
+ * a guest-owned page. Transition the page to hypervisor state before
+ * releasing it back to the system.
+ */
+ if (sev_snp_guest(vcpu->kvm)) {
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
+ goto skip_vmsa_free;
+ }
+
if (vcpu->arch.guest_state_protected)
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
__free_page(virt_to_page(svm->sev_es.vmsa));
+skip_vmsa_free:
if (svm->sev_es.ghcb_sa_free)
kvfree(svm->sev_es.ghcb_sa);
}
@@ -2581,10 +3384,31 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
break;
+ case SVM_VMGEXIT_AP_CREATION:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto vmgexit_err;
+ if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ break;
case SVM_VMGEXIT_NMI_COMPLETE:
case SVM_VMGEXIT_AP_HLT_LOOP:
case SVM_VMGEXIT_AP_JUMP_TABLE:
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ case SVM_VMGEXIT_HV_FEATURES:
+ case SVM_VMGEXIT_TERM_REQUEST:
+ break;
+ case SVM_VMGEXIT_PSC:
+ if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ if (!sev_snp_guest(vcpu->kvm) ||
+ !PAGE_ALIGNED(control->exit_info_1) ||
+ !PAGE_ALIGNED(control->exit_info_2) ||
+ control->exit_info_1 == control->exit_info_2)
+ goto vmgexit_err;
break;
default:
reason = GHCB_ERR_INVALID_EVENT;
@@ -2615,6 +3439,9 @@ vmgexit_err:
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{
+ /* Clear any indication that the vCPU is in a type of AP Reset Hold */
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
+
if (!svm->sev_es.ghcb)
return;
@@ -2770,10 +3597,539 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
svm->vmcb->control.ghcb_gpa = value;
}
+static int snp_rmptable_psmash(kvm_pfn_t pfn)
+{
+ int ret;
+
+ pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
+
+ /*
+ * PSMASH_FAIL_INUSE indicates another processor is modifying the
+ * entry, so retry until that's no longer the case.
+ */
+ do {
+ ret = psmash(pfn);
+ } while (ret == PSMASH_FAIL_INUSE);
+
+ return ret;
+}
+
+static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->run->hypercall.ret)
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ else
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
+
+ return 1; /* resume guest */
+}
+
+static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
+{
+ u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
+ u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = 1;
+ vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+
+ vcpu->arch.complete_userspace_io = snp_complete_psc_msr;
+
+ return 0; /* forward request to userspace */
+}
+
+struct psc_buffer {
+ struct psc_hdr hdr;
+ struct psc_entry entries[];
+} __packed;
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
+
+static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
+{
+ svm->sev_es.psc_inflight = 0;
+ svm->sev_es.psc_idx = 0;
+ svm->sev_es.psc_2m = false;
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
+}
+
+static void __snp_complete_one_psc(struct vcpu_svm *svm)
+{
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+ struct psc_entry *entries = psc->entries;
+ struct psc_hdr *hdr = &psc->hdr;
+ __u16 idx;
+
+ /*
+ * Everything in-flight has been processed successfully. Update the
+ * corresponding entries in the guest's PSC buffer and zero out the
+ * count of in-flight PSC entries.
+ */
+ for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
+ svm->sev_es.psc_inflight--, idx++) {
+ struct psc_entry *entry = &entries[idx];
+
+ entry->cur_page = entry->pagesize ? 512 : 1;
+ }
+
+ hdr->cur_entry = idx;
+}
+
+static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+
+ if (vcpu->run->hypercall.ret) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1; /* resume guest */
+ }
+
+ __snp_complete_one_psc(svm);
+
+ /* Handle the next range (if any). */
+ return snp_begin_psc(svm, psc);
+}
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
+{
+ struct psc_entry *entries = psc->entries;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct psc_hdr *hdr = &psc->hdr;
+ struct psc_entry entry_start;
+ u16 idx, idx_start, idx_end;
+ int npages;
+ bool huge;
+ u64 gfn;
+
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+next_range:
+ /* There should be no other PSCs in-flight at this point. */
+ if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+ /*
+ * The PSC descriptor buffer can be modified by a misbehaved guest after
+ * validation, so take care to only use validated copies of values used
+ * for things like array indexing.
+ */
+ idx_start = hdr->cur_entry;
+ idx_end = hdr->end_entry;
+
+ if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
+ return 1;
+ }
+
+ /* Find the start of the next range which needs processing. */
+ for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
+ entry_start = entries[idx];
+
+ gfn = entry_start.gfn;
+ huge = entry_start.pagesize;
+ npages = huge ? 512 : 1;
+
+ if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
+ return 1;
+ }
+
+ if (entry_start.cur_page) {
+ /*
+ * If this is a partially-completed 2M range, force 4K handling
+ * for the remaining pages since they're effectively split at
+ * this point. Subsequent code should ensure this doesn't get
+ * combined with adjacent PSC entries where 2M handling is still
+ * possible.
+ */
+ npages -= entry_start.cur_page;
+ gfn += entry_start.cur_page;
+ huge = false;
+ }
+
+ if (npages)
+ break;
+ }
+
+ if (idx > idx_end) {
+ /* Nothing more to process. */
+ snp_complete_psc(svm, 0);
+ return 1;
+ }
+
+ svm->sev_es.psc_2m = huge;
+ svm->sev_es.psc_idx = idx;
+ svm->sev_es.psc_inflight = 1;
+
+ /*
+ * Find all subsequent PSC entries that contain adjacent GPA
+ * ranges/operations and can be combined into a single
+ * KVM_HC_MAP_GPA_RANGE exit.
+ */
+ while (++idx <= idx_end) {
+ struct psc_entry entry = entries[idx];
+
+ if (entry.operation != entry_start.operation ||
+ entry.gfn != entry_start.gfn + npages ||
+ entry.cur_page || !!entry.pagesize != huge)
+ break;
+
+ svm->sev_es.psc_inflight++;
+ npages += huge ? 512 : 1;
+ }
+
+ switch (entry_start.operation) {
+ case VMGEXIT_PSC_OP_PRIVATE:
+ case VMGEXIT_PSC_OP_SHARED:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= entry_start.pagesize
+ ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
+ : KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+ vcpu->arch.complete_userspace_io = snp_complete_one_psc;
+ return 0; /* forward request to userspace */
+ default:
+ /*
+ * Only shared/private PSC operations are currently supported, so if the
+ * entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
+ * then consider the entire range completed and avoid exiting to
+ * userspace. In theory snp_complete_psc() can always be called directly
+ * at this point to complete the current range and start the next one,
+ * but that could lead to unexpected levels of recursion.
+ */
+ __snp_complete_one_psc(svm);
+ goto next_range;
+ }
+
+ unreachable();
+}
+
+static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex));
+
+ /* Mark the vCPU as offline and not runnable */
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+
+ /* Clear use of the VMSA */
+ svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+
+ if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
+ gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
+ struct kvm_memory_slot *slot;
+ kvm_pfn_t pfn;
+
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot)
+ return -EINVAL;
+
+ /*
+ * The new VMSA will be private memory guest memory, so
+ * retrieve the PFN from the gmem backend.
+ */
+ if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL))
+ return -EINVAL;
+
+ /*
+ * From this point forward, the VMSA will always be a
+ * guest-mapped page rather than the initial one allocated
+ * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa
+ * could be free'd and cleaned up here, but that involves
+ * cleanups like wbinvd_on_all_cpus() which would ideally
+ * be handled during teardown rather than guest boot.
+ * Deferring that also allows the existing logic for SEV-ES
+ * VMSAs to be re-used with minimal SNP-specific changes.
+ */
+ svm->sev_es.snp_has_guest_vmsa = true;
+
+ /* Use the new VMSA */
+ svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
+
+ /* Mark the vCPU as runnable */
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+
+ /*
+ * gmem pages aren't currently migratable, but if this ever
+ * changes then care should be taken to ensure
+ * svm->sev_es.vmsa is pinned through some other means.
+ */
+ kvm_release_pfn_clean(pfn);
+ }
+
+ /*
+ * When replacing the VMSA during SEV-SNP AP creation,
+ * mark the VMCB dirty so that full state is always reloaded.
+ */
+ vmcb_mark_all_dirty(svm->vmcb);
+
+ return 0;
+}
+
+/*
+ * Invoked as part of svm_vcpu_reset() processing of an init event.
+ */
+void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (!sev_snp_guest(vcpu->kvm))
+ return;
+
+ mutex_lock(&svm->sev_es.snp_vmsa_mutex);
+
+ if (!svm->sev_es.snp_ap_waiting_for_reset)
+ goto unlock;
+
+ svm->sev_es.snp_ap_waiting_for_reset = false;
+
+ ret = __sev_snp_update_protected_guest_state(vcpu);
+ if (ret)
+ vcpu_unimpl(vcpu, "snp: AP state update on init failed\n");
+
+unlock:
+ mutex_unlock(&svm->sev_es.snp_vmsa_mutex);
+}
+
+static int sev_snp_ap_creation(struct vcpu_svm *svm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_vcpu *target_vcpu;
+ struct vcpu_svm *target_svm;
+ unsigned int request;
+ unsigned int apic_id;
+ bool kick;
+ int ret;
+
+ request = lower_32_bits(svm->vmcb->control.exit_info_1);
+ apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
+
+ /* Validate the APIC ID */
+ target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
+ if (!target_vcpu) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
+ apic_id);
+ return -EINVAL;
+ }
+
+ ret = 0;
+
+ target_svm = to_svm(target_vcpu);
+
+ /*
+ * The target vCPU is valid, so the vCPU will be kicked unless the
+ * request is for CREATE_ON_INIT. For any errors at this stage, the
+ * kick will place the vCPU in an non-runnable state.
+ */
+ kick = true;
+
+ mutex_lock(&target_svm->sev_es.snp_vmsa_mutex);
+
+ target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+ target_svm->sev_es.snp_ap_waiting_for_reset = true;
+
+ /* Interrupt injection mode shouldn't change for AP creation */
+ if (request < SVM_VMGEXIT_AP_DESTROY) {
+ u64 sev_features;
+
+ sev_features = vcpu->arch.regs[VCPU_REGS_RAX];
+ sev_features ^= sev->vmsa_features;
+
+ if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n",
+ vcpu->arch.regs[VCPU_REGS_RAX]);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ switch (request) {
+ case SVM_VMGEXIT_AP_CREATE_ON_INIT:
+ kick = false;
+ fallthrough;
+ case SVM_VMGEXIT_AP_CREATE:
+ if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
+ svm->vmcb->control.exit_info_2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Malicious guest can RMPADJUST a large page into VMSA which
+ * will hit the SNP erratum where the CPU will incorrectly signal
+ * an RMP violation #PF if a hugepage collides with the RMP entry
+ * of VMSA page, reject the AP CREATE request if VMSA address from
+ * guest is 2M aligned.
+ */
+ if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
+ vcpu_unimpl(vcpu,
+ "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
+ svm->vmcb->control.exit_info_2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
+ break;
+ case SVM_VMGEXIT_AP_DESTROY:
+ break;
+ default:
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
+ request);
+ ret = -EINVAL;
+ break;
+ }
+
+out:
+ if (kick) {
+ kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
+ kvm_vcpu_kick(target_vcpu);
+ }
+
+ mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex);
+
+ return ret;
+}
+
+static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct sev_data_snp_guest_request data = {0};
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ sev_ret_code fw_err = 0;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ mutex_lock(&sev->guest_req_mutex);
+
+ if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.req_paddr = __psp_pa(sev->guest_req_buf);
+ data.res_paddr = __psp_pa(sev->guest_resp_buf);
+
+ /*
+ * Firmware failures are propagated on to guest, but any other failure
+ * condition along the way should be reported to userspace. E.g. if
+ * the PSP is dead and commands are timing out.
+ */
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
+ if (ret && !fw_err)
+ goto out_unlock;
+
+ if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err));
+
+ ret = 1; /* resume guest */
+
+out_unlock:
+ mutex_unlock(&sev->guest_req_mutex);
+ return ret;
+}
+
+static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct kvm *kvm = svm->vcpu.kvm;
+ u8 msg_type;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
+ &msg_type, 1))
+ return -EIO;
+
+ /*
+ * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
+ * additional certificate data to be provided alongside the attestation
+ * report via the guest-provided data pages indicated by RAX/RBX. The
+ * certificate data is optional and requires additional KVM enablement
+ * to provide an interface for userspace to provide it, but KVM still
+ * needs to be able to handle extended guest requests either way. So
+ * provide a stub implementation that will always return an empty
+ * certificate table in the guest-provided data pages.
+ */
+ if (msg_type == SNP_MSG_REPORT_REQ) {
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 data_npages;
+ gpa_t data_gpa;
+
+ if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
+ goto request_invalid;
+
+ data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
+ data_npages = vcpu->arch.regs[VCPU_REGS_RBX];
+
+ if (!PAGE_ALIGNED(data_gpa))
+ goto request_invalid;
+
+ /*
+ * As per GHCB spec (see "SNP Extended Guest Request"), the
+ * certificate table is terminated by 24-bytes of zeroes.
+ */
+ if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
+ return -EIO;
+ }
+
+ return snp_handle_guest_req(svm, req_gpa, resp_gpa);
+
+request_invalid:
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ return 1; /* resume guest */
+}
+
static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
u64 ghcb_info;
int ret = 1;
@@ -2784,7 +4140,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
switch (ghcb_info) {
case GHCB_MSR_SEV_INFO_REQ:
- set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
break;
@@ -2826,6 +4182,60 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
GHCB_MSR_INFO_POS);
break;
}
+ case GHCB_MSR_AP_RESET_HOLD_REQ:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
+ ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+
+ /*
+ * Preset the result to a non-SIPI return and then only set
+ * the result to non-zero when delivering a SIPI.
+ */
+ set_ghcb_msr_bits(svm, 0,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_HV_FT_REQ:
+ set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
+ GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
+ GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_PREF_GPA_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_REG_GPA_REQ: {
+ u64 gfn;
+
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+
+ svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
+
+ set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_PSC_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
+ break;
case GHCB_MSR_TERM_REQ: {
u64 reason_set, reason_code;
@@ -2838,12 +4248,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
- vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
- vcpu->run->system_event.ndata = 1;
- vcpu->run->system_event.data[0] = control->ghcb_gpa;
-
- return 0;
+ goto out_terminate;
}
default:
/* Error, keep GHCB MSR value as-is */
@@ -2854,6 +4259,14 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
control->ghcb_gpa, ret);
return ret;
+
+out_terminate:
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
}
int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
@@ -2889,6 +4302,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
sev_es_sync_from_ghcb(svm);
+
+ /* SEV-SNP guest requires that the GHCB GPA must be registered */
+ if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
+ vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
+ return -EINVAL;
+ }
+
ret = sev_es_validate_vmgexit(svm);
if (ret)
return ret;
@@ -2925,6 +4345,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = 1;
break;
case SVM_VMGEXIT_AP_HLT_LOOP:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
ret = kvm_emulate_ap_reset_hold(vcpu);
break;
case SVM_VMGEXIT_AP_JUMP_TABLE: {
@@ -2949,6 +4370,41 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = 1;
break;
}
+ case SVM_VMGEXIT_HV_FEATURES:
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED);
+
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_TERM_REQUEST:
+ pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+ break;
+ case SVM_VMGEXIT_PSC:
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_AP_CREATION:
+ ret = sev_snp_ap_creation(svm);
+ if (ret) {
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ }
+
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
@@ -3034,7 +4490,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
- svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
/*
* An SEV-ES guest requires a VMSA area that is a separate from the
@@ -3043,7 +4498,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
* the VMSA will be NULL if this vCPU is the destination for intrahost
* migration, and will be copied later.
*/
- if (svm->sev_es.vmsa)
+ if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa)
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
/* Can't intercept CR register access, HV can't modify CR registers */
@@ -3063,7 +4518,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, TRAP_CR8_WRITE);
vmcb->control.intercepts[INTERCEPT_DR] = 0;
- if (!sev_es_debug_swap_enabled) {
+ if (!sev_vcpu_has_debug_swap(svm)) {
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
@@ -3086,10 +4541,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
/* Clear intercepts on selected MSRs */
set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}
void sev_init_vmcb(struct vcpu_svm *svm)
@@ -3109,16 +4560,21 @@ void sev_init_vmcb(struct vcpu_svm *svm)
void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
/*
* Set the GHCB MSR value as per the GHCB specification when emulating
* vCPU RESET for an SEV-ES guest.
*/
- set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
+
+ mutex_init(&svm->sev_es.snp_vmsa_mutex);
}
-void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
+void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
{
/*
* All host state for SEV-ES guests is categorized into three swap types
@@ -3137,16 +4593,16 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
* isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
* by common SVM code).
*/
- hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ hostsa->xcr0 = kvm_host.xcr0;
hostsa->pkru = read_pkru();
- hostsa->xss = host_xss;
+ hostsa->xss = kvm_host.xss;
/*
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
* saves and loads debug registers (Type-A).
*/
- if (sev_es_debug_swap_enabled) {
+ if (sev_vcpu_has_debug_swap(svm)) {
hostsa->dr0 = native_get_debugreg(0);
hostsa->dr1 = native_get_debugreg(1);
hostsa->dr2 = native_get_debugreg(2);
@@ -3168,24 +4624,40 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
return;
}
- /*
- * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where
- * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
- * non-zero value.
- */
- if (!svm->sev_es.ghcb)
- return;
+ /* Subsequent SIPI */
+ switch (svm->sev_es.ap_reset_hold_type) {
+ case AP_RESET_HOLD_NAE_EVENT:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
+ */
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+ break;
+ case AP_RESET_HOLD_MSR_PROTO:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set GHCB data field to a non-zero value.
+ */
+ set_ghcb_msr_bits(svm, 1,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ default:
+ break;
+ }
}
-struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
+struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
unsigned long pfn;
struct page *p;
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
- return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
/*
* Allocate an SNP-safe page to workaround the SNP erratum where
@@ -3196,7 +4668,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
* Allocate one extra page, choose a page which is not
* 2MB-aligned, and free the other.
*/
- p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
if (!p)
return NULL;
@@ -3210,3 +4682,271 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
return p;
}
+
+void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = vcpu->kvm;
+ int order, rmp_level, ret;
+ bool assigned;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+
+ gfn = gpa >> PAGE_SHIFT;
+
+ /*
+ * The only time RMP faults occur for shared pages is when the guest is
+ * triggering an RMP fault for an implicit page-state change from
+ * shared->private. Implicit page-state changes are forwarded to
+ * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
+ * for shared pages should not end up here.
+ */
+ if (!kvm_mem_is_private(kvm, gfn)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!kvm_slot_can_be_private(slot)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret || !assigned) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
+ gpa, pfn, ret);
+ goto out_no_trace;
+ }
+
+ /*
+ * There are 2 cases where a PSMASH may be needed to resolve an #NPF
+ * with PFERR_GUEST_RMP_BIT set:
+ *
+ * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
+ * bit set if the guest issues them with a smaller granularity than
+ * what is indicated by the page-size bit in the 2MB RMP entry for
+ * the PFN that backs the GPA.
+ *
+ * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
+ * smaller than what is indicated by the 2MB RMP entry for the PFN
+ * that backs the GPA.
+ *
+ * In both these cases, the corresponding 2M RMP entry needs to
+ * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
+ * split into 4K RMP entries, then this is likely a spurious case which
+ * can occur when there are concurrent accesses by the guest to a 2MB
+ * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in
+ * the process of being PMASH'd into 4K entries. These cases should
+ * resolve automatically on subsequent accesses, so just ignore them
+ * here.
+ */
+ if (rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ ret = snp_rmptable_psmash(pfn);
+ if (ret) {
+ /*
+ * Look it up again. If it's 4K now then the PSMASH may have
+ * raced with another process and the issue has already resolved
+ * itself.
+ */
+ if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
+ assigned && rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
+ gpa, pfn, ret);
+ }
+
+ kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
+out:
+ trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
+out_no_trace:
+ put_page(pfn_to_page(pfn));
+}
+
+static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn = start;
+
+ while (pfn < end) {
+ int ret, rmp_level;
+ bool assigned;
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
+ pfn, start, end, rmp_level, ret);
+ return false;
+ }
+
+ if (assigned) {
+ pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
+ __func__, pfn, start, end, rmp_level);
+ return false;
+ }
+
+ pfn++;
+ }
+
+ return true;
+}
+
+static u8 max_level_for_order(int order)
+{
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
+{
+ kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+
+ /*
+ * If this is a large folio, and the entire 2M range containing the
+ * PFN is currently shared, then the entire 2M-aligned range can be
+ * set to private via a single 2M RMP entry.
+ */
+ if (max_level_for_order(order) > PG_LEVEL_4K &&
+ is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
+ return true;
+
+ return false;
+}
+
+int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ kvm_pfn_t pfn_aligned;
+ gfn_t gfn_aligned;
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
+ gfn, pfn, rc);
+ return -ENOENT;
+ }
+
+ if (assigned) {
+ pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
+ __func__, gfn, pfn, max_order, level);
+ return 0;
+ }
+
+ if (is_large_rmp_possible(kvm, pfn, max_order)) {
+ level = PG_LEVEL_2M;
+ pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+ gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
+ } else {
+ level = PG_LEVEL_4K;
+ pfn_aligned = pfn;
+ gfn_aligned = gfn;
+ }
+
+ rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
+ gfn, pfn, level, rc);
+ return -EINVAL;
+ }
+
+ pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
+ __func__, gfn, pfn, pfn_aligned, max_order, level);
+
+ return 0;
+}
+
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
+ pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
+
+ for (pfn = start; pfn < end;) {
+ bool use_2m_update = false;
+ int rc, rmp_level;
+ bool assigned;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (rc || !assigned)
+ goto next_pfn;
+
+ use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
+ end >= (pfn + PTRS_PER_PMD) &&
+ rmp_level > PG_LEVEL_4K;
+
+ /*
+ * If an unaligned PFN corresponds to a 2M region assigned as a
+ * large page in the RMP table, PSMASH the region into individual
+ * 4K RMP entries before attempting to convert a 4K sub-page.
+ */
+ if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
+ /*
+ * This shouldn't fail, but if it does, report it, but
+ * still try to update RMP entry to shared and pray this
+ * was a spurious error that can be addressed later.
+ */
+ rc = snp_rmptable_psmash(pfn);
+ WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc);
+ }
+
+ rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
+ if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc))
+ goto next_pfn;
+
+ /*
+ * SEV-ES avoids host/guest cache coherency issues through
+ * WBINVD hooks issued via MMU notifiers during run-time, and
+ * KVM's VM destroy path at shutdown. Those MMU notifier events
+ * don't cover gmem since there is no requirement to map pages
+ * to a HVA in order to use them for a running guest. While the
+ * shutdown path would still likely cover things for SNP guests,
+ * userspace may also free gmem pages during run-time via
+ * hole-punching operations on the guest_memfd, so flush the
+ * cache entries for these pages before free'ing them back to
+ * the host.
+ */
+ clflush_cache_range(__va(pfn_to_hpa(pfn)),
+ use_2m_update ? PMD_SIZE : PAGE_SIZE);
+next_pfn:
+ pfn += use_2m_update ? PTRS_PER_PMD : 1;
+ cond_resched();
+ }
+}
+
+int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc || !assigned)
+ return PG_LEVEL_4K;
+
+ return level;
+}