From 1ed576a20cd5c93295f57d6b7400357bd8d01b21 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 20 Oct 2020 06:12:07 -0400 Subject: KVM: s390: pv: Mark mm as protected after the set secure parameters and improve cleanup We can only have protected guest pages after a successful set secure parameters call as only then the UV allows imports and unpacks. By moving the test we can now also check for it in s390_reset_acc() and do an early return if it is 0. Signed-off-by: Janosch Frank Fixes: 29b40f105ec8 ("KVM: s390: protvirt: Add initial vm and cpu lifecycle handling") Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 +- arch/s390/kvm/pv.c | 3 ++- arch/s390/mm/gmap.c | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6b74b92c1a58..08ea6c4735cd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2312,7 +2312,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) struct kvm_s390_pv_unp unp = {}; r = -EINVAL; - if (!kvm_s390_pv_is_protected(kvm)) + if (!kvm_s390_pv_is_protected(kvm) || !mm_is_protected(kvm->mm)) break; r = -EFAULT; diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index eb99e2f95ebe..f5847f9dec7c 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -208,7 +208,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return -EIO; } kvm->arch.gmap->guest_handle = uvcb.guest_handle; - atomic_set(&kvm->mm->context.is_protected, 1); return 0; } @@ -228,6 +227,8 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, *rrc = uvcb.header.rrc; KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", *rc, *rrc); + if (!cc) + atomic_set(&kvm->mm->context.is_protected, 1); return cc ? -EINVAL : 0; } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index cfb0017f33a7..64795d034926 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2690,6 +2690,8 @@ static const struct mm_walk_ops reset_acc_walk_ops = { #include void s390_reset_acc(struct mm_struct *mm) { + if (!mm_is_protected(mm)) + return; /* * we might be called during * reset: we walk the pages and clear -- cgit v1.2.3 From 6cbf1e960fa52e4c63a6dfa4cda8736375b34ccc Mon Sep 17 00:00:00 2001 From: Collin Walling Date: Wed, 4 Nov 2020 13:10:32 -0500 Subject: KVM: s390: remove diag318 reset code The diag318 data must be set to 0 by VM-wide reset events triggered by diag308. As such, KVM should not handle resetting this data via the VCPU ioctls. 
Fixes: 23a60f834406 ("s390/kvm: diagnose 0x318 sync and reset")
Signed-off-by: Collin Walling
Reviewed-by: Christian Borntraeger
Reviewed-by: Janosch Frank
Acked-by: Cornelia Huck
Signed-off-by: Christian Borntraeger
Link: https://lore.kernel.org/r/20201104181032.109800-1-walling@linux.ibm.com
---
 arch/s390/kvm/kvm-s390.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 08ea6c4735cd..425d3d75320b 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -3564,7 +3564,6 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 		vcpu->arch.sie_block->pp = 0;
 		vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
 		vcpu->arch.sie_block->todpr = 0;
-		vcpu->arch.sie_block->cpnc = 0;
 	}
 }
 
@@ -3582,7 +3581,6 @@ static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu)
 
 	regs->etoken = 0;
 	regs->etoken_extension = 0;
-	regs->diag318 = 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
--
cgit v1.2.3


From 7bab16a6075b7b94999666355ab532c3dabb94f9 Mon Sep 17 00:00:00 2001
From: Jamie Iles
Date: Fri, 13 Nov 2020 15:04:06 +0000
Subject: KVM: arm64: Correctly align nVHE percpu data

The nVHE percpu data is partially linked, but the nVHE linker script did
not align the percpu section. The PERCPU_INPUT macro would then align
the data to a page boundary:

  #define PERCPU_INPUT(cacheline)				\
	__per_cpu_start = .;					\
	*(.data..percpu..first)					\
	. = ALIGN(PAGE_SIZE);					\
	*(.data..percpu..page_aligned)				\
	. = ALIGN(cacheline);					\
	*(.data..percpu..read_mostly)				\
	. = ALIGN(cacheline);					\
	*(.data..percpu)					\
	*(.data..percpu..shared_aligned)			\
	PERCPU_DECRYPTED_SECTION				\
	__per_cpu_end = .;

but then when the final vmlinux linking happens, the hypervisor percpu
data is included after page alignment and so the offsets potentially
don't match. On my build I saw that the .hyp.data..percpu section was
at address 0x20 and then the percpu data would begin at 0x1000 (because
of the page alignment in PERCPU_INPUT), but when linked into vmlinux,
everything would be shifted down by 0x20 bytes.

This manifests as one of the CPUs getting lost when running
kvm-unit-tests or starting any VM, and a subsequent soft lockup on a
Cortex A72 device.

Fixes: 30c953911c43 ("kvm: arm64: Set up hyp percpu data for nVHE")
Signed-off-by: Jamie Iles
Signed-off-by: Marc Zyngier
Acked-by: David Brazdil
Cc: David Brazdil
Cc: Marc Zyngier
Cc: Will Deacon
Link: https://lore.kernel.org/r/20201113150406.14314-1-jamie@nuviainc.com
---
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch')

diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index bb2d986ff696..a797abace13f 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -13,6 +13,11 @@ SECTIONS {
 	HYP_SECTION(.text)
 
+	/*
+	 * .hyp..data..percpu needs to be page aligned to maintain the same
+	 * alignment for when linking into vmlinux.
+	 */
+	. = ALIGN(PAGE_SIZE);
 	HYP_SECTION_NAME(.data..percpu) : {
 		PERCPU_INPUT(L1_CACHE_BYTES)
 	}
--
cgit v1.2.3


From 854c57f02bc718b0653bc467073b4541b8155a36 Mon Sep 17 00:00:00 2001
From: Ashish Kalra
Date: Tue, 10 Nov 2020 22:42:05 +0000
Subject: KVM: SVM: Fix offset computation bug in __sev_dbg_decrypt().

Fix the offset computation in __sev_dbg_decrypt() to include the source
paddr before it is rounded down to be aligned to 16 bytes, as required
by the SEV API. This fixes incorrect guest memory dumps observed when
using the qemu monitor.
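
For illustration (hypothetical values, not part of the patch): with
src_paddr = 0x1005 and a 32-byte read, the fixed ordering computes

	offset    = 0x1005 & 15;		/* 5 */
	src_paddr = round_down(0x1005, 16);	/* 0x1000 */
	sz        = round_up(32 + 5, 16);	/* 48 */

whereas rounding down first would force offset to 0, so the data would
be read from the start of the 16-byte-aligned block instead of the
requested address.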
Signed-off-by: Ashish Kalra
Message-Id: <20201110224205.29444-1-Ashish.Kalra@amd.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/svm/sev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index c0b14106258a..566f4d18185b 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -642,8 +642,8 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
 	 * Its safe to read more than we are asked, caller should ensure that
 	 * destination has enough space.
 	 */
-	src_paddr = round_down(src_paddr, 16);
 	offset = src_paddr & 15;
+	src_paddr = round_down(src_paddr, 16);
 	sz = round_up(sz + offset, 16);
 
 	return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
--
cgit v1.2.3


From 054409ab253d9f31bec5760105144166b4b71e22 Mon Sep 17 00:00:00 2001
From: Chen Zhou
Date: Tue, 17 Nov 2020 10:54:26 +0800
Subject: KVM: SVM: fix error return code in svm_create_vcpu()

Return a negative error code from the error handling path instead of 0
in svm_create_vcpu(), as is done elsewhere in this function.

Fixes: f4c847a95654 ("KVM: SVM: refactor msr permission bitmap allocation")
Reported-by: Hulk Robot
Signed-off-by: Chen Zhou
Message-Id: <20201117025426.167824-1-chenzhou10@huawei.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/svm/svm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 1e81cfebd491..79b3a564f1c9 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1309,8 +1309,10 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 	svm->avic_is_running = true;
 
 	svm->msrpm = svm_vcpu_alloc_msrpm();
-	if (!svm->msrpm)
+	if (!svm->msrpm) {
+		err = -ENOMEM;
 		goto error_free_vmcb_page;
+	}
 
 	svm_vcpu_init_msrpm(vcpu, svm->msrpm);
 
--
cgit v1.2.3


From 23bde34771f1ea92fb5e6682c0d8c04304d34b3b Mon Sep 17 00:00:00 2001
From: Zenghui Yu
Date: Tue, 17 Nov 2020 23:16:29 +0800
Subject: KVM: arm64: vgic-v3: Drop the reporting of GICR_TYPER.Last for userspace

It was recently reported that if GICR_TYPER is accessed before the RD
base address is set, we'll dereference the unset @rdreg. Oops...

	gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
			(rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;

It's "expected" that users will access registers in the redistributor
only if the RD has been properly configured (e.g., the RD base address
is set). But this expectation hasn't yet been covered by the existing
documentation.

Per discussion on the list [1], the reporting of the GICR_TYPER.Last
bit for userspace never actually worked. And it's difficult for us to
emulate it correctly given that userspace has the flexibility to access
it any time.
Let's just drop the reporting of the Last bit for userspace for now
(userspace should have full knowledge about it anyway), and it at least
prevents the kernel from panicking ;-)

[1] https://lore.kernel.org/kvmarm/c20865a267e44d1e2c0d52ce4e012263@kernel.org/

Fixes: ba7b3f1275fd ("KVM: arm/arm64: Revisit Redistributor TYPER last bit computation")
Reported-by: Keqian Zhu
Signed-off-by: Zenghui Yu
Signed-off-by: Marc Zyngier
Reviewed-by: Eric Auger
Link: https://lore.kernel.org/r/20201117151629.1738-1-yuzenghui@huawei.com
Cc: stable@vger.kernel.org
---
 arch/arm64/kvm/vgic/vgic-mmio-v3.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 52d6f24f65dc..15a6c98ee92f 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -273,6 +273,23 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
 	return extract_bytes(value, addr & 7, len);
 }
 
+static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu,
+						 gpa_t addr, unsigned int len)
+{
+	unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+	int target_vcpu_id = vcpu->vcpu_id;
+	u64 value;
+
+	value = (u64)(mpidr & GENMASK(23, 0)) << 32;
+	value |= ((target_vcpu_id & 0xffff) << 8);
+
+	if (vgic_has_its(vcpu->kvm))
+		value |= GICR_TYPER_PLPIS;
+
+	/* reporting of the Last bit is not supported for userspace */
+	return extract_bytes(value, addr & 7, len);
+}
+
 static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
 					     gpa_t addr, unsigned int len)
 {
@@ -593,8 +610,9 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
 	REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
 		vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
 		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
-		vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
+	REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER,
+		vgic_mmio_read_v3r_typer, vgic_mmio_write_wi,
+		vgic_uaccess_read_v3r_typer, vgic_mmio_uaccess_write_wi, 8,
 		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
 		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
--
cgit v1.2.3


From 4c80d05714d347405865802b7098f1c97362cbef Mon Sep 17 00:00:00 2001
From: Christian Borntraeger
Date: Tue, 17 Nov 2020 20:00:33 +0100
Subject: s390/uv: handle destroy page legacy interface

Older firmware can return rc=0x107 rrc=0xd for destroy page if the
page is already non-secure. This should be handled as a success, as
newer firmware already does.

Signed-off-by: Christian Borntraeger
Fixes: 1a80b54d1ce1 ("s390/uv: add destroy page call")
Reviewed-by: David Hildenbrand
Acked-by: Cornelia Huck
Reviewed-by: Janosch Frank
---
 arch/s390/kernel/uv.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 14bd9d58edc9..883bfed9f5c2 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -129,8 +129,15 @@ int uv_destroy_page(unsigned long paddr)
 		.paddr = paddr
 	};
 
-	if (uv_call(0, (u64)&uvcb))
+	if (uv_call(0, (u64)&uvcb)) {
+		/*
+		 * Older firmware uses 107/d as an indication of a non secure
+		 * page. Let us emulate the newer variant (no-op).
+ */ + if (uvcb.header.rc == 0x107 && uvcb.header.rrc == 0xd) + return 0; return -EINVAL; + } return 0; } -- cgit v1.2.3 From 72c3bcdcda494cbd600712a32e67702cdee60c07 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Nov 2020 08:53:52 +0100 Subject: KVM: x86: handle !lapic_in_kernel case in kvm_cpu_*_extint Centralize handling of interrupts from the userspace APIC in kvm_cpu_has_extint and kvm_cpu_get_extint, since userspace APIC interrupts are handled more or less the same as ExtINTs are with split irqchip. This removes duplicated code from kvm_cpu_has_injectable_intr and kvm_cpu_has_interrupt, and makes the code more similar between kvm_cpu_has_{extint,interrupt} on one side and kvm_cpu_get_{extint,interrupt} on the other. Cc: stable@vger.kernel.org Reviewed-by: Filippo Sironi Reviewed-by: David Woodhouse Tested-by: David Woodhouse Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq.c | 83 +++++++++++++++++++++------------------------------- arch/x86/kvm/lapic.c | 2 +- 2 files changed, 34 insertions(+), 51 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 99d118ffc67d..ee94671272e3 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -41,28 +41,9 @@ static int pending_userspace_extint(struct kvm_vcpu *v) * non-APIC source without intack. */ static int kvm_cpu_has_extint(struct kvm_vcpu *v) -{ - u8 accept = kvm_apic_accept_pic_intr(v); - - if (accept) { - if (irqchip_split(v->kvm)) - return pending_userspace_extint(v); - else - return v->kvm->arch.vpic->output; - } else - return 0; -} - -/* - * check if there is injectable interrupt: - * when virtual interrupt delivery enabled, - * interrupt from apic will handled by hardware, - * we don't need to check it here. - */ -int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) { /* - * FIXME: interrupt.injected represents an interrupt that it's + * FIXME: interrupt.injected represents an interrupt whose * side-effects have already been applied (e.g. bit from IRR * already moved to ISR). Therefore, it is incorrect to rely * on interrupt.injected to know if there is a pending @@ -75,6 +56,23 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) if (!lapic_in_kernel(v)) return v->arch.interrupt.injected; + if (!kvm_apic_accept_pic_intr(v)) + return 0; + + if (irqchip_split(v->kvm)) + return pending_userspace_extint(v); + else + return v->kvm->arch.vpic->output; +} + +/* + * check if there is injectable interrupt: + * when virtual interrupt delivery enabled, + * interrupt from apic will handled by hardware, + * we don't need to check it here. + */ +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) +{ if (kvm_cpu_has_extint(v)) return 1; @@ -91,20 +89,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_injectable_intr); */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - /* - * FIXME: interrupt.injected represents an interrupt that it's - * side-effects have already been applied (e.g. bit from IRR - * already moved to ISR). Therefore, it is incorrect to rely - * on interrupt.injected to know if there is a pending - * interrupt in the user-mode LAPIC. - * This leads to nVMX/nSVM not be able to distinguish - * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on - * pending interrupt or should re-inject an injected - * interrupt. 
- */ - if (!lapic_in_kernel(v)) - return v->arch.interrupt.injected; - if (kvm_cpu_has_extint(v)) return 1; @@ -118,16 +102,21 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); */ static int kvm_cpu_get_extint(struct kvm_vcpu *v) { - if (kvm_cpu_has_extint(v)) { - if (irqchip_split(v->kvm)) { - int vector = v->arch.pending_external_vector; - - v->arch.pending_external_vector = -1; - return vector; - } else - return kvm_pic_read_irq(v->kvm); /* PIC */ - } else + if (!kvm_cpu_has_extint(v)) { + WARN_ON(!lapic_in_kernel(v)); return -1; + } + + if (!lapic_in_kernel(v)) + return v->arch.interrupt.nr; + + if (irqchip_split(v->kvm)) { + int vector = v->arch.pending_external_vector; + + v->arch.pending_external_vector = -1; + return vector; + } else + return kvm_pic_read_irq(v->kvm); /* PIC */ } /* @@ -135,13 +124,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v) */ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { - int vector; - - if (!lapic_in_kernel(v)) - return v->arch.interrupt.nr; - - vector = kvm_cpu_get_extint(v); - + int vector = kvm_cpu_get_extint(v); if (vector != -1) return vector; /* PIC */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 105e7859d1f2..86c33d53c90a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2465,7 +2465,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; - if (!kvm_apic_hw_enabled(apic)) + if (!kvm_apic_present(vcpu)) return -1; __apic_update_ppr(apic, &ppr); -- cgit v1.2.3 From 71cc849b7093bb83af966c0e60cb11b7f35cd746 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Nov 2020 09:18:20 +0100 Subject: KVM: x86: Fix split-irqchip vs interrupt injection window request kvm_cpu_accept_dm_intr and kvm_vcpu_ready_for_interrupt_injection are a hodge-podge of conditions, hacked together to get something that more or less works. But what is actually needed is much simpler; in both cases the fundamental question is, do we have a place to stash an interrupt if userspace does KVM_INTERRUPT? In userspace irqchip mode, that is !vcpu->arch.interrupt.injected. Currently kvm_event_needs_reinjection(vcpu) covers it, but it is unnecessarily restrictive. In split irqchip mode it's a bit more complicated, we need to check kvm_apic_accept_pic_intr(vcpu) (the IRQ window exit is basically an INTACK cycle and thus requires ExtINTs not to be masked) as well as !pending_userspace_extint(vcpu). However, there is no need to check kvm_event_needs_reinjection(vcpu), since split irqchip keeps pending ExtINT state separate from event injection state, and checking kvm_cpu_has_interrupt(vcpu) is wrong too since ExtINT has higher priority than APIC interrupts. In fact the latter fixes a bug: when userspace requests an IRQ window vmexit, an interrupt in the local APIC can cause kvm_cpu_has_interrupt() to be true and thus kvm_vcpu_ready_for_interrupt_injection() to return false. When this happens, vcpu_run does not exit to userspace but the interrupt window vmexits keep occurring. The VM loops without any hope of making progress. Once we try to fix these with something like return kvm_arch_interrupt_allowed(vcpu) && - !kvm_cpu_has_interrupt(vcpu) && - !kvm_event_needs_reinjection(vcpu) && - kvm_cpu_accept_dm_intr(vcpu); + (!lapic_in_kernel(vcpu) + ? !vcpu->arch.interrupt.injected + : (kvm_apic_accept_pic_intr(vcpu) + && !pending_userspace_extint(v))); we realize two things. First, thanks to the previous patch the complex conditional can reuse !kvm_cpu_has_extint(vcpu). 
Second, the interrupt window request in vcpu_enter_guest() bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); should be kept in sync with kvm_vcpu_ready_for_interrupt_injection(): it is unnecessary to ask the processor for an interrupt window if we would not be able to return to userspace. Therefore, kvm_cpu_accept_dm_intr(vcpu) is basically !kvm_cpu_has_extint(vcpu) ANDed with the existing check for masked ExtINT. It all makes sense: - we can accept an interrupt from userspace if there is a place to stash it (and, for irqchip split, ExtINTs are not masked). Interrupts from userspace _can_ be accepted even if right now EFLAGS.IF=0. - in order to tell userspace we will inject its interrupt ("IRQ window open" i.e. kvm_vcpu_ready_for_interrupt_injection), both KVM and the vCPU need to be ready to accept the interrupt. ... and this is what the patch implements. Reported-by: David Woodhouse Analyzed-by: David Woodhouse Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini Reviewed-by: Nikos Tsironis Reviewed-by: David Woodhouse Tested-by: David Woodhouse --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/x86.c | 18 ++++++++++-------- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 324ddd7fd0aa..7e5f33a0d0e2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1656,6 +1656,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index ee94671272e3..814698e5b152 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -40,7 +40,7 @@ static int pending_userspace_extint(struct kvm_vcpu *v) * check if there is pending interrupt from * non-APIC source without intack. */ -static int kvm_cpu_has_extint(struct kvm_vcpu *v) +int kvm_cpu_has_extint(struct kvm_vcpu *v) { /* * FIXME: interrupt.injected represents an interrupt whose diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 078a39d489fe..e545a8a613b1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4051,21 +4051,23 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) { + /* + * We can accept userspace's request for interrupt injection + * as long as we have a place to store the interrupt number. + * The actual injection will happen when the CPU is able to + * deliver the interrupt. + */ + if (kvm_cpu_has_extint(vcpu)) + return false; + + /* Acknowledging ExtINT does not happen if LINT0 is masked. */ return (!lapic_in_kernel(vcpu) || kvm_apic_accept_pic_intr(vcpu)); } -/* - * if userspace requested an interrupt window, check that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. 
- */ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) { return kvm_arch_interrupt_allowed(vcpu) && - !kvm_cpu_has_interrupt(vcpu) && - !kvm_event_needs_reinjection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); } -- cgit v1.2.3 From 9a2a0d3ca163fc645991804b8b032f7d59326bb5 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 26 Nov 2020 12:02:06 +0100 Subject: kvm: x86/mmu: Fix get_mmio_spte() on CPUs supporting 5-level PT Commit 95fb5b0258b7 ("kvm: x86/mmu: Support MMIO in the TDP MMU") caused the following WARNING on an Intel Ice Lake CPU: get_mmio_spte: detect reserved bits on spte, addr 0xb80a0, dump hierarchy: ------ spte 0xb80a0 level 5. ------ spte 0xfcd210107 level 4. ------ spte 0x1004c40107 level 3. ------ spte 0x1004c41107 level 2. ------ spte 0x1db00000000b83b6 level 1. WARNING: CPU: 109 PID: 10254 at arch/x86/kvm/mmu/mmu.c:3569 kvm_mmu_page_fault.cold.150+0x54/0x22f [kvm] ... Call Trace: ? kvm_io_bus_get_first_dev+0x55/0x110 [kvm] vcpu_enter_guest+0xaa1/0x16a0 [kvm] ? vmx_get_cs_db_l_bits+0x17/0x30 [kvm_intel] ? skip_emulated_instruction+0xaa/0x150 [kvm_intel] kvm_arch_vcpu_ioctl_run+0xca/0x520 [kvm] The guest triggering this crashes. Note, this happens with the traditional MMU and EPT enabled, not with the newly introduced TDP MMU. Turns out, there was a subtle change in the above mentioned commit. Previously, walk_shadow_page_get_mmio_spte() was setting 'root' to 'iterator.level' which is returned by shadow_walk_init() and this equals to 'vcpu->arch.mmu->shadow_root_level'. Now, get_mmio_spte() sets it to 'int root = vcpu->arch.mmu->root_level'. The difference between 'root_level' and 'shadow_root_level' on CPUs supporting 5-level page tables is that in some case we don't want to use 5-level, in particular when 'cpuid_maxphyaddr(vcpu) <= 48' kvm_mmu_get_tdp_level() returns '4'. In case upper layer is not used, the corresponding SPTE will fail '__is_rsvd_bits_set()' check. Revert to using 'shadow_root_level'. Fixes: 95fb5b0258b7 ("kvm: x86/mmu: Support MMIO in the TDP MMU") Signed-off-by: Vitaly Kuznetsov Message-Id: <20201126110206.2118959-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5bb1939b65d8..7a6ae9e90bd7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3517,7 +3517,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { u64 sptes[PT64_ROOT_MAX_LEVEL]; struct rsvd_bits_validate *rsvd_check; - int root = vcpu->arch.mmu->root_level; + int root = vcpu->arch.mmu->shadow_root_level; int leaf; int level; bool reserved = false; -- cgit v1.2.3