Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile           |  2
-rw-r--r--  arch/x86/mm/extable.c          |  9
-rw-r--r--  arch/x86/mm/fault.c            | 64
-rw-r--r--  arch/x86/mm/hugetlbpage.c      | 35
-rw-r--r--  arch/x86/mm/ident_map.c        | 23
-rw-r--r--  arch/x86/mm/init.c             | 92
-rw-r--r--  arch/x86/mm/mem_encrypt.c      |  7
-rw-r--r--  arch/x86/mm/mem_encrypt_amd.c  | 18
-rw-r--r--  arch/x86/mm/mmap.c             |  4
-rw-r--r--  arch/x86/mm/numa.c             |  4
-rw-r--r--  arch/x86/mm/numa_32.c          |  2
-rw-r--r--  arch/x86/mm/pat/memtype.c      | 73
-rw-r--r--  arch/x86/mm/pat/set_memory.c   | 68
-rw-r--r--  arch/x86/mm/pgtable.c          |  6
14 files changed, 212 insertions(+), 195 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 428048e73bd2..8d3a00e5c528 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -34,7 +34,7 @@ obj-y += pat/
CFLAGS_physaddr.o := -fno-stack-protector
CFLAGS_mem_encrypt_identity.o := -fno-stack-protector
-CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace
+CFLAGS_fault.o := -I $(src)/../include/asm/trace
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index b522933bfa56..51986e8a9d35 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -164,13 +164,6 @@ static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
return ex_handler_default(fixup, regs);
}
-static bool ex_handler_copy(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
-{
- WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
- return ex_handler_fault(fixup, regs, trapnr);
-}
-
static bool ex_handler_msr(const struct exception_table_entry *fixup,
struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
@@ -341,8 +334,6 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
return ex_handler_fault(e, regs, trapnr);
case EX_TYPE_UACCESS:
return ex_handler_uaccess(e, regs, trapnr, fault_addr);
- case EX_TYPE_COPY:
- return ex_handler_copy(e, regs, trapnr);
case EX_TYPE_CLEAR_FS:
return ex_handler_clear_fs(e, regs);
case EX_TYPE_FPU_RESTORE:
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 622d12ec7f08..e6c469b323cc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,7 @@
#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <linux/mm.h> /* find_and_lock_vma() */
+#include <linux/vmalloc.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -514,18 +515,19 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
if (error_code & X86_PF_INSTR) {
unsigned int level;
+ bool nx, rw;
pgd_t *pgd;
pte_t *pte;
pgd = __va(read_cr3_pa());
pgd += pgd_index(address);
- pte = lookup_address_in_pgd(pgd, address, &level);
+ pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw);
- if (pte && pte_present(*pte) && !pte_exec(*pte))
+ if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx))
pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
from_kuid(&init_user_ns, current_uid()));
- if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ if (pte && pte_present(*pte) && pte_exec(*pte) && !nx &&
(pgd_flags(*pgd) & _PAGE_USER) &&
(__read_cr4() & X86_CR4_SMEP))
pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
@@ -723,39 +725,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
WARN_ON_ONCE(user_mode(regs));
/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- if (si_code == SEGV_PKUERR) {
- force_sig_pkuerr((void __user *)address, pkey);
- } else {
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
return;
- }
/*
* AMD erratum #91 manifests as a spurious page fault on a PREFETCH
@@ -865,14 +836,17 @@ bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 pkey, int si_code)
+ unsigned long address, struct mm_struct *mm,
+ struct vm_area_struct *vma, u32 pkey, int si_code)
{
- struct mm_struct *mm = current->mm;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
- mmap_read_unlock(mm);
+ if (mm)
+ mmap_read_unlock(mm);
+ else
+ vma_end_read(vma);
__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}
@@ -896,7 +870,8 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, struct vm_area_struct *vma)
+ unsigned long address, struct mm_struct *mm,
+ struct vm_area_struct *vma)
{
/*
* This OSPKE check is not strictly necessary at runtime.
@@ -926,9 +901,9 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
*/
u32 pkey = vma_pkey(vma);
- __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
+ __bad_area(regs, error_code, address, mm, vma, pkey, SEGV_PKUERR);
} else {
- __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
+ __bad_area(regs, error_code, address, mm, vma, 0, SEGV_ACCERR);
}
}
@@ -1356,8 +1331,9 @@ void do_user_addr_fault(struct pt_regs *regs,
goto lock_mmap;
if (unlikely(access_error(error_code, vma))) {
- vma_end_read(vma);
- goto lock_mmap;
+ bad_area_access_error(regs, error_code, address, NULL, vma);
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return;
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
@@ -1393,7 +1369,7 @@ retry:
* we can handle it..
*/
if (unlikely(access_error(error_code, vma))) {
- bad_area_access_error(regs, error_code, address, vma);
+ bad_area_access_error(regs, error_code, address, mm, vma);
return;
}
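Note the pairing rule the reworked paths rely on: when the fault was handled under the per-VMA lock, callers pass mm == NULL and __bad_area() releases the VMA lock; otherwise it drops the mmap read lock as before. A hedged sketch of that convention (kernel context assumed; the helper name is hypothetical):

    /* mm == NULL signals "called from the per-VMA-lock fast path" */
    static void drop_fault_lock(struct mm_struct *mm, struct vm_area_struct *vma)
    {
        if (mm)
            mmap_read_unlock(mm);   /* classic mmap_lock path */
        else
            vma_end_read(vma);      /* lockless per-VMA path */
    }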
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 5804bbae4f01..807a5859a3c4 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -19,41 +19,14 @@
#include <asm/tlbflush.h>
#include <asm/elf.h>
-/*
- * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
- * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
- * Otherwise, returns 0.
- */
-int pmd_huge(pmd_t pmd)
-{
- return !pmd_none(pmd) &&
- (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
-}
-
-/*
- * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal
- * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
- * Otherwise, returns 0.
- */
-int pud_huge(pud_t pud)
-{
-#if CONFIG_PGTABLE_LEVELS > 2
- return !pud_none(pud) &&
- (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
-#else
- return 0;
-#endif
-}
-
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
- info.flags = 0;
info.length = len;
info.low_limit = get_mmap_base(1);
@@ -65,7 +38,6 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
return vm_unmapped_area(&info);
}
@@ -74,7 +46,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
@@ -89,7 +61,6 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
@@ -141,7 +112,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
get_unmapped_area:
- if (mm->get_unmapped_area == arch_get_unmapped_area)
+ if (!test_bit(MMF_TOPDOWN, &mm->flags))
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
else
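The `= {}` conversion leans on the GNU C (and C23) empty initializer: all members not explicitly named are zero-initialized, so the explicit `info.flags = 0;` and `info.align_offset = 0;` assignments become redundant, and fields added to the struct later cannot be left uninitialized. A standalone illustration with a stand-in struct, not the kernel definition:

    #include <assert.h>

    struct info { unsigned long flags, length, align_mask, align_offset; };

    int main(void)
    {
        struct info a = {};     /* empty initializer: every field is 0 */

        a.length = 4096;        /* set only what differs from zero */
        assert(a.flags == 0 && a.align_offset == 0);
        return 0;
    }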
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index a204a332c71f..968d7005f4a7 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -26,31 +26,18 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
for (; addr < end; addr = next) {
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
- bool use_gbpage;
next = (addr & PUD_MASK) + PUD_SIZE;
if (next > end)
next = end;
- /* if this is already a gbpage, this portion is already mapped */
- if (pud_leaf(*pud))
- continue;
-
- /* Is using a gbpage allowed? */
- use_gbpage = info->direct_gbpages;
-
- /* Don't use gbpage if it maps more than the requested region. */
- /* at the begining: */
- use_gbpage &= ((addr & ~PUD_MASK) == 0);
- /* ... or at the end: */
- use_gbpage &= ((next & ~PUD_MASK) == 0);
-
- /* Never overwrite existing mappings */
- use_gbpage &= !pud_present(*pud);
-
- if (use_gbpage) {
+ if (info->direct_gbpages) {
pud_t pudval;
+ if (pud_present(*pud))
+ continue;
+
+ addr &= PUD_MASK;
pudval = __pud((addr - info->offset) | info->page_flag);
set_pud(pud, pudval);
continue;
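Behavioral note: rather than declining a gbpage whenever the request is not PUD-aligned at both ends, the rewritten loop rounds the start down with `addr &= PUD_MASK` and maps the entire 1 GiB page, which identity mappings tolerate. A quick demonstration of the rounding, assuming 4-level paging (PUD_SHIFT == 30):

    #include <stdio.h>
    #include <stdint.h>

    #define PUD_SHIFT 30
    #define PUD_SIZE  (1ULL << PUD_SHIFT)   /* 1 GiB */
    #define PUD_MASK  (~(PUD_SIZE - 1))

    int main(void)
    {
        uint64_t addr = 0x40200000ULL;      /* request starts mid-gbpage */

        printf("maps gbpage at %#llx\n",    /* prints 0x40000000 */
               (unsigned long long)(addr & PUD_MASK));
        return 0;
    }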
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 679893ea5e68..eb503f53c319 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -7,6 +7,7 @@
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <linux/sched/task.h>
+#include <linux/execmem.h>
#include <asm/set_memory.h>
#include <asm/cpu_device_id.h>
@@ -261,21 +262,17 @@ static void __init probe_page_size_mask(void)
}
}
-#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \
- .family = 6, \
- .model = _model, \
- }
/*
* INVLPG may not properly flush Global entries
* on these CPUs when PCIDs are enabled.
*/
static const struct x86_cpu_id invlpg_miss_ids[] = {
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE ),
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
- INTEL_MATCH(INTEL_FAM6_ATOM_GRACEMONT ),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, 0),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0),
{}
};
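The X86_MATCH_VFM() entries carry a vendor-family-model key plus (here unused) driver data; tables like this are matched with x86_match_cpu(), which returns the first matching entry or NULL. A hedged sketch of the consumer, reflecting how this file uses the table but paraphrased from memory:

    if (x86_match_cpu(invlpg_miss_ids)) {
        pr_info("Incomplete global flushes, disabling PCID\n");
        setup_clear_cpu_cap(X86_FEATURE_PCID);
        return;
    }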
@@ -990,53 +987,6 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
}
#endif
-/*
- * Calculate the precise size of the DMA zone (first 16 MB of RAM),
- * and pass it to the MM layer - to help it set zone watermarks more
- * accurately.
- *
- * Done on 64-bit systems only for the time being, although 32-bit systems
- * might benefit from this as well.
- */
-void __init memblock_find_dma_reserve(void)
-{
-#ifdef CONFIG_X86_64
- u64 nr_pages = 0, nr_free_pages = 0;
- unsigned long start_pfn, end_pfn;
- phys_addr_t start_addr, end_addr;
- int i;
- u64 u;
-
- /*
- * Iterate over all memory ranges (free and reserved ones alike),
- * to calculate the total number of pages in the first 16 MB of RAM:
- */
- nr_pages = 0;
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
- start_pfn = min(start_pfn, MAX_DMA_PFN);
- end_pfn = min(end_pfn, MAX_DMA_PFN);
-
- nr_pages += end_pfn - start_pfn;
- }
-
- /*
- * Iterate over free memory ranges to calculate the number of free
- * pages in the DMA zone, while not counting potential partial
- * pages at the beginning or the end of the range:
- */
- nr_free_pages = 0;
- for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
- start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
- end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);
-
- if (start_pfn < end_pfn)
- nr_free_pages += end_pfn - start_pfn;
- }
-
- set_dma_reserve(nr_pages - nr_free_pages);
-#endif
-}
-
void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -1099,3 +1049,31 @@ unsigned long arch_max_swapfile_size(void)
return pages;
}
#endif
+
+#ifdef CONFIG_EXECMEM
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+ unsigned long start, offset = 0;
+
+ if (kaslr_enabled())
+ offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
+
+ start = MODULES_VADDR + offset;
+
+ execmem_info = (struct execmem_info){
+ .ranges = {
+ [EXECMEM_DEFAULT] = {
+ .flags = EXECMEM_KASAN_SHADOW,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = MODULE_ALIGN,
+ },
+ },
+ };
+
+ return &execmem_info;
+}
+#endif /* CONFIG_EXECMEM */
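Callers reach this range through the generic execmem API rather than an arch-specific module_alloc(). A hedged usage sketch (kernel context assumed; the allocation draws from the EXECMEM_DEFAULT range registered above):

    void *p = execmem_alloc(EXECMEM_DEFAULT, PAGE_SIZE);

    if (p)
        execmem_free(p);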
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 6f3b3e028718..0a120d85d7bb 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -102,6 +102,13 @@ void __init mem_encrypt_setup_arch(void)
phys_addr_t total_mem = memblock_phys_mem_size();
unsigned long size;
+ /*
+ * Do RMP table fixups after the e820 tables have been setup by
+ * e820__memory_setup().
+ */
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ snp_fixup_e820_tables();
+
if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 70b91de2e053..422602f6039b 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -492,6 +492,24 @@ void __init sme_early_init(void)
*/
if (sev_status & MSR_AMD64_SEV_ENABLED)
ia32_disable();
+
+ /*
+ * Override init functions that scan the ROM region in SEV-SNP guests,
+ * as this memory is not pre-validated and would thus cause a crash.
+ */
+ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) {
+ x86_init.mpparse.find_mptable = x86_init_noop;
+ x86_init.pci.init_irq = x86_init_noop;
+ x86_init.resources.probe_roms = x86_init_noop;
+
+ /*
+ * DMI setup behavior for SEV-SNP guests depends on
+ * efi_enabled(EFI_CONFIG_TABLES), which hasn't been
+ * parsed yet. snp_dmi_setup() will run after that
+ * parsing has happened.
+ */
+ x86_init.resources.dmi_setup = snp_dmi_setup;
+ }
}
void __init mem_encrypt_free_decrypted_mem(void)
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c90c20904a60..a2cabb1c81e1 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -129,9 +129,9 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
if (mmap_is_legacy())
- mm->get_unmapped_area = arch_get_unmapped_area;
+ clear_bit(MMF_TOPDOWN, &mm->flags);
else
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ set_bit(MMF_TOPDOWN, &mm->flags);
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
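Replacing the mm_struct function pointer with the MMF_TOPDOWN flag bit means consumers test a bit instead of comparing function pointers, exactly as the hugetlbpage.c hunk above now does. A hedged sketch of the pattern (kernel context assumed; the helper name is hypothetical):

    static bool mmap_is_topdown(struct mm_struct *mm)
    {
        return test_bit(MMF_TOPDOWN, &mm->flags);
    }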
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 65e9a6e391c0..ce84ba86e69e 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -929,6 +929,8 @@ int memory_add_physaddr_to_nid(u64 start)
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
static int __init cmp_memblk(const void *a, const void *b)
{
const struct numa_memblk *ma = *(const struct numa_memblk **)a;
@@ -1001,5 +1003,3 @@ int __init numa_fill_memblks(u64 start, u64 end)
}
return 0;
}
-
-#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 104544359d69..65fda406e6f2 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -24,6 +24,8 @@
#include <linux/memblock.h>
#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <asm/pgtable_areas.h>
#include "numa_internal.h"
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 0d72183b5dd0..bdc2a240c2aa 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -39,6 +39,7 @@
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rbtree.h>
@@ -947,6 +948,61 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
+static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
+ resource_size_t *phys)
+{
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+
+ if (follow_pte(vma, vma->vm_start, &ptep, &ptl))
+ return -EINVAL;
+
+ pte = ptep_get(ptep);
+
+ /* Never return PFNs of anon folios in COW mappings. */
+ if (vm_normal_folio(vma, vma->vm_start, pte)) {
+ pte_unmap_unlock(ptep, ptl);
+ return -EINVAL;
+ }
+
+ *prot = pgprot_val(pte_pgprot(pte));
+ *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+
+static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
+ pgprot_t *pgprot)
+{
+ unsigned long prot;
+
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
+
+ /*
+ * We need the starting PFN and cachemode used for track_pfn_remap()
+ * that covered the whole VMA. For most mappings, we can obtain that
+ * information from the page tables. For COW mappings, we might now
+ * suddenly have anon folios mapped and follow_phys() will fail.
+ *
+ * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
+ * detect the PFN. If we need the cachemode as well, we're out of luck
+ * for now and have to fail fork().
+ */
+ if (!follow_phys(vma, &prot, paddr)) {
+ if (pgprot)
+ *pgprot = __pgprot(prot);
+ return 0;
+ }
+ if (is_cow_mapping(vma->vm_flags)) {
+ if (pgprot)
+ return -EINVAL;
+ *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+ return 0;
+ }
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
+
/*
* track_pfn_copy is called when vma that is covering the pfnmap gets
* copied through copy_page_range().
@@ -957,20 +1013,13 @@ static void free_pfn_range(u64 paddr, unsigned long size)
int track_pfn_copy(struct vm_area_struct *vma)
{
resource_size_t paddr;
- unsigned long prot;
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
if (vma->vm_flags & VM_PAT) {
- /*
- * reserve the whole chunk covered by vma. We need the
- * starting address and protection from pte.
- */
- if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
- WARN_ON_ONCE(1);
+ if (get_pat_info(vma, &paddr, &pgprot))
return -EINVAL;
- }
- pgprot = __pgprot(prot);
+ /* reserve the whole chunk covered by vma. */
return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
}
@@ -1045,7 +1094,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size, bool mm_wr_locked)
{
resource_size_t paddr;
- unsigned long prot;
if (vma && !(vma->vm_flags & VM_PAT))
return;
@@ -1053,11 +1101,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
/* free the chunk starting from pfn or the whole chunk */
paddr = (resource_size_t)pfn << PAGE_SHIFT;
if (!paddr && !size) {
- if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
- WARN_ON_ONCE(1);
+ if (get_pat_info(vma, &paddr, NULL))
return;
- }
-
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 80c9037ffadf..19fdfbb171ed 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -619,7 +619,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
* Validate strict W^X semantics.
*/
static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
- unsigned long pfn, unsigned long npg)
+ unsigned long pfn, unsigned long npg,
+ bool nx, bool rw)
{
unsigned long end;
@@ -641,6 +642,10 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
return new;
+ /* Non-leaf translation entries can disable writing or execution. */
+ if (!rw || nx)
+ return new;
+
end = start + npg * PAGE_SIZE - 1;
WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
(unsigned long long)pgprot_val(old),
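verify_rwx() thus only flags a violation when the mapping is *effectively* writable and executable: a non-leaf entry that clears RW or sets NX already neutralizes a permissive leaf. A compact standalone model of the decision, using the real x86 bit positions but illustrative types:

    #include <stdbool.h>

    #define _PAGE_RW (1ULL << 1)
    #define _PAGE_NX (1ULL << 63)

    /* true => the new leaf protections create a real W+X mapping */
    static bool wx_violation(unsigned long long new_prot,
                             bool upper_rw, bool upper_nx)
    {
        bool w = new_prot & _PAGE_RW;
        bool x = !(new_prot & _PAGE_NX);

        /* an upper level can already disable writing or execution */
        return w && x && upper_rw && !upper_nx;
    }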
@@ -657,20 +662,26 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
/*
* Lookup the page table entry for a virtual address in a specific pgd.
- * Return a pointer to the entry and the level of the mapping.
+ * Return a pointer to the entry, the level of the mapping, and the effective
+ * NX and RW bits of all page table levels.
*/
-pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
- unsigned int *level)
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
*level = PG_LEVEL_NONE;
+ *nx = false;
+ *rw = true;
if (pgd_none(*pgd))
return NULL;
+ *nx |= pgd_flags(*pgd) & _PAGE_NX;
+ *rw &= pgd_flags(*pgd) & _PAGE_RW;
+
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d))
return NULL;
@@ -679,6 +690,9 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (p4d_leaf(*p4d) || !p4d_present(*p4d))
return (pte_t *)p4d;
+ *nx |= p4d_flags(*p4d) & _PAGE_NX;
+ *rw &= p4d_flags(*p4d) & _PAGE_RW;
+
pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
@@ -687,6 +701,9 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (pud_leaf(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+ *nx |= pud_flags(*pud) & _PAGE_NX;
+ *rw &= pud_flags(*pud) & _PAGE_RW;
+
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return NULL;
@@ -695,12 +712,27 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (pmd_leaf(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
+ *nx |= pmd_flags(*pmd) & _PAGE_NX;
+ *rw &= pmd_flags(*pmd) & _PAGE_RW;
+
*level = PG_LEVEL_4K;
return pte_offset_kernel(pmd, address);
}
/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
+{
+ bool nx, rw;
+
+ return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw);
+}
+
+/*
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
@@ -715,13 +747,16 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
- unsigned int *level)
+ unsigned int *level, bool *nx, bool *rw)
{
- if (cpa->pgd)
- return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
- address, level);
+ pgd_t *pgd;
+
+ if (!cpa->pgd)
+ pgd = pgd_offset_k(address);
+ else
+ pgd = cpa->pgd + pgd_index(address);
- return lookup_address(address, level);
+ return lookup_address_in_pgd_attr(pgd, address, level, nx, rw);
}
/*
@@ -849,12 +884,13 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, *tmp;
enum pg_level level;
+ bool nx, rw;
/*
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte)
return 1;
@@ -965,7 +1001,8 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);
- new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
+ nx, rw);
/*
* If there is a conflict, split the large page.
@@ -1046,6 +1083,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
pte_t *pbase = (pte_t *)page_address(base);
unsigned int i, level;
pgprot_t ref_prot;
+ bool nx, rw;
pte_t *tmp;
spin_lock(&pgd_lock);
@@ -1053,7 +1091,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up for us already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte) {
spin_unlock(&pgd_lock);
return 1;
@@ -1594,10 +1632,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ bool nx, rw;
address = __cpa_addr(cpa, cpa->curpage);
repeat:
- kpte = _lookup_address_cpa(cpa, address, &level);
+ kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@@ -1619,7 +1658,8 @@ repeat:
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);
- new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);
+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
+ nx, rw);
new_prot = pgprot_clear_protnone_bits(new_prot);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d007591b8059..93e54ba91fbf 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -631,6 +631,8 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+
/*
* No flush is necessary. Once an invalid PTE is established, the PTE's
* access and dirty bits cannot be updated.
@@ -731,7 +733,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
return 0;
/* Bail out if we are we on a populated non-leaf entry: */
- if (pud_present(*pud) && !pud_huge(*pud))
+ if (pud_present(*pud) && !pud_leaf(*pud))
return 0;
set_pte((pte_t *)pud, pfn_pte(
@@ -760,7 +762,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
}
/* Bail out if we are we on a populated non-leaf entry: */
- if (pmd_present(*pmd) && !pmd_huge(*pmd))
+ if (pmd_present(*pmd) && !pmd_leaf(*pmd))
return 0;
set_pte((pte_t *)pmd, pfn_pte(
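The pud_leaf()/pmd_leaf() predicates demand Present and PSE together, whereas the removed pmd_huge()/pud_huge() (see the hugetlbpage.c hunk above) also matched non-present migration or hwpoisoned hugetlb entries; these set_huge() bail-outs only care about live non-leaf tables, so leaf is the right test. A standalone model of both predicates, with the real x86 bit positions (the leaf form mirrors the x86 pud_leaf() definition, from memory):

    #include <stdbool.h>

    #define _PAGE_PRESENT (1UL << 0)
    #define _PAGE_PSE     (1UL << 7)

    /* pud_leaf()-style: a live huge mapping */
    static bool leaf(unsigned long e)
    {
        return (e & (_PAGE_PRESENT | _PAGE_PSE)) ==
               (_PAGE_PRESENT | _PAGE_PSE);
    }

    /* removed pud_huge()-style: also true for non-present (migration
     * or hwpoisoned) hugetlb entries */
    static bool old_huge(unsigned long e)
    {
        return e && (e & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT;
    }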