From 8e5647a723c49d73b9f108a8bb38e8c29d3948ea Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 22 Nov 2023 10:37:00 -0600 Subject: x86/mm: Ensure input to pfn_to_kaddr() is treated as a 64-bit type On 64-bit platforms, the pfn_to_kaddr() macro requires that the input value is 64 bits in order to ensure that valid address bits don't get lost when shifting that input by PAGE_SHIFT to calculate the physical address to provide a virtual address for. One such example is in pvalidate_pages() (used by SEV-SNP guests), where the GFN in the struct used for page-state change requests is a 40-bit bit-field, so attempts to pass this GFN field directly into pfn_to_kaddr() ends up causing guest crashes when dealing with addresses above the 1TB range due to the above. Fix this issue with SEV-SNP guests, as well as any similar cases that might cause issues in current/future code, by using an inline function, instead of a macro, so that the input is implicitly cast to the expected 64-bit input type prior to performing the shift operation. While it might be argued that the issue is on the caller side, other archs/macros have taken similar approaches to deal with instances like this, such as ARM explicitly casting the input to phys_addr_t: e48866647b48 ("ARM: 8396/1: use phys_addr_t in pfn_to_kaddr()") A C inline function is even better though. [ mingo: Refined the changelog some more & added __always_inline. ] Fixes: 6c3211796326 ("x86/sev: Add SNP-specific unaccepted memory support") Suggested-by: Dave Hansen Suggested-by: H. Peter Anvin Signed-off-by: Michael Roth Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231122163700.400507-1-michael.roth@amd.com Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Rik van Riel Cc: Linus Torvalds --- arch/x86/include/asm/page.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index d18e5c332cb9..1b93ff80b43b 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -66,10 +66,14 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) +static __always_inline void *pfn_to_kaddr(unsigned long pfn) +{ + return __va(pfn << PAGE_SHIFT); +} + static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); -- cgit v1.2.3 From 8f588afe6256c50b3d1f8a671828fc4aab421c05 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 Jan 2024 09:34:57 -0800 Subject: x86/mm: Get rid of conditional IF flag handling in page fault path We had this nonsensical code that would happily handle kernel page faults with interrupts disabled, which makes no sense at all. It turns out that this is legacy code that _used_ to make sense, back when we enabled IRQs as early as possible, and we used to have this code sequence essentially immediately after reading the faulting address from the %cr2 register. Back then, we could have kernel page faults to populate the vmalloc area with interrupts disabled, and they would need to stay disabled for that case. However, the code in question has been moved down in the page fault handling, and is now in the "handle faults in user addresses" section, and apparently nobody ever noticed that it no longer makes sense to handle these page faults with interrupts conditionally disabled. So replace the conditional IRQ enable: if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); with an unconditional one, and add a temporary WARN_ON_ONCE() if some codepath actually does do page faults with interrupts disabled (without also doing a pagefault_disable(), of course). NOTE! We used to allow user space to disable interrupts with iopl(3). That is no longer true since commits: a24ca9976843 ("x86/iopl: Remove legacy IOPL option") b968e84b509d ("x86/iopl: Fake iopl(3) CLI/STI usage") so the WARN_ON_ONCE() is valid for both the kernel and user situation. For some of the history relevant to this code, see particularly commit 8c914cb704a1 ("x86_64: actively synchronize vmalloc area when registering certain callbacks"), which moved this below the vmalloc fault handling. Now that the user_mode() check is irrelevant, we can also move the FAULT_FLAG_USER flag setting down to where the other flag settings are done. Signed-off-by: Linus Torvalds Signed-off-by: Ingo Molnar Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Uros Bizjak Cc: Sean Christopherson Link: https://lore.kernel.org/r/20240125173457.1281880-1-torvalds@linux-foundation.org --- arch/x86/mm/fault.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 679b09cfe241..150e002e0884 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1302,21 +1302,14 @@ void do_user_addr_fault(struct pt_regs *regs, return; } - /* - * It's safe to allow irq's after cr2 has been saved and the - * vmalloc fault has been handled. - * - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet: - */ - if (user_mode(regs)) { - local_irq_enable(); - flags |= FAULT_FLAG_USER; - } else { - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + /* Legacy check - remove this after verifying that it doesn't trigger */ + if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) { + bad_area_nosemaphore(regs, error_code, address); + return; } + local_irq_enable(); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* @@ -1332,6 +1325,14 @@ void do_user_addr_fault(struct pt_regs *regs, if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; + /* + * We set FAULT_FLAG_USER based on the register state, not + * based on X86_PF_USER. User space accesses that cause + * system page faults are still user accesses. + */ + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. The -- cgit v1.2.3 From 82ace185017fbbe48342bf7d8a9fd795f9c711cd Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 22 Jan 2024 10:40:03 -0800 Subject: x86/mm/cpa: Warn for set_memory_XXcrypted() VMM fails On TDX it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. In terms of security, the problematic case is guest PTEs mapping the shared alias GFNs, since the VMM has control of the shared mapping in the EPT/NPT. Such conversion errors may herald future system instability, but are temporarily survivable with proper handling in the caller. The kernel traditionally makes every effort to keep running, but it is expected that some coco guests may prefer to play it safe security-wise, and panic in this case. To accommodate both cases, warn when the arch breakouts for converting memory at the VMM layer return an error to CPA. Security focused users can rely on panic_on_warn to defend against bugs in the callers. Some VMMs are not known to behave in the troublesome way, so users that would like to terminate on any unusual behavior by the VMM around this will be covered as well. Since the arch breakouts host the logic for handling coco implementation specific errors, an error returned from them means that the set_memory() call is out of options for handling the error internally. Make this the condition to warn about. It is possible that very rarely these functions could fail due to guest memory pressure (in the case of failing to allocate a huge page when splitting a page table). Don't warn in this case because it is a lot less likely to indicate an attack by the host and it is not clear which set_memory() calls should get the same treatment. That corner should be addressed by future work that considers the more general problem and not just papers over a single set_memory() variant. Suggested-by: Michael Kelley (LINUX) Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Tom Lendacky Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Kirill A. Shutemov Reviewed-by: Michael Kelley Link: https://lore.kernel.org/all/20240122184003.129104-1-rick.p.edgecombe%40intel.com --- arch/x86/mm/pat/set_memory.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index e9b448d1b1b7..47af3815b663 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2153,7 +2153,7 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) /* Notify hypervisor that we are about to set/clr encryption attribute. */ if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc)) - return -EIO; + goto vmm_fail; ret = __change_page_attr_set_clr(&cpa, 1); @@ -2166,13 +2166,20 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) */ cpa_flush(&cpa, 0); + if (ret) + return ret; + /* Notify hypervisor that we have successfully set/clr encryption attribute. */ - if (!ret) { - if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc)) - ret = -EIO; - } + if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc)) + goto vmm_fail; - return ret; + return 0; + +vmm_fail: + WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n", + (void *)addr, numpages, enc ? "private" : "shared"); + + return -EIO; } static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) -- cgit v1.2.3