summaryrefslogtreecommitdiff
path: root/arch/x86/include/asm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/include/asm')
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/disabled-features.h16
-rw-r--r--arch/x86/include/asm/fpu/api.h9
-rw-r--r--arch/x86/include/asm/fpu/regset.h7
-rw-r--r--arch/x86/include/asm/fpu/sched.h3
-rw-r--r--arch/x86/include/asm/fpu/types.h16
-rw-r--r--arch/x86/include/asm/fpu/xstate.h6
-rw-r--r--arch/x86/include/asm/idtentry.h2
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/pgtable.h302
-rw-r--r--arch/x86/include/asm/pgtable_types.h44
-rw-r--r--arch/x86/include/asm/processor.h8
-rw-r--r--arch/x86/include/asm/shstk.h38
-rw-r--r--arch/x86/include/asm/special_insns.h13
-rw-r--r--arch/x86/include/asm/tlbflush.h3
-rw-r--r--arch/x86/include/asm/trap_pf.h2
-rw-r--r--arch/x86/include/asm/traps.h15
17 files changed, 431 insertions, 57 deletions
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 7b4ecbf78d8b..2061ed1c398f 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -307,6 +307,7 @@
#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
+#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */
#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
@@ -383,6 +384,7 @@
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */
#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK (16*32+ 7) /* "" Shadow stack */
#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index fafe9be7a6f4..702d93fdd10e 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -105,6 +105,18 @@
# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31))
#endif
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+#define DISABLE_USER_SHSTK 0
+#else
+#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31))
+#endif
+
+#ifdef CONFIG_X86_KERNEL_IBT
+#define DISABLE_IBT 0
+#else
+#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -120,7 +132,7 @@
#define DISABLED_MASK9 (DISABLE_SGX)
#define DISABLED_MASK10 0
#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
- DISABLE_CALL_DEPTH_TRACKING)
+ DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK)
#define DISABLED_MASK12 (DISABLE_LAM)
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
@@ -128,7 +140,7 @@
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
DISABLE_ENQCMD)
#define DISABLED_MASK17 0
-#define DISABLED_MASK18 0
+#define DISABLED_MASK18 (DISABLE_IBT)
#define DISABLED_MASK19 0
#define DISABLED_MASK20 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index b475d9a582b8..31089b851c4f 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -82,6 +82,15 @@ static inline void fpregs_unlock(void)
preempt_enable();
}
+/*
+ * FPU state gets lazily restored before returning to userspace. So when in the
+ * kernel, the valid FPU state may be kept in the buffer. This function will force
+ * restore all the fpu state to the registers early if needed, and lock them from
+ * being automatically saved/restored. Then FPU state can be modified safely in the
+ * registers, before unlocking with fpregs_unlock().
+ */
+void fpregs_lock_and_load(void);
+
#ifdef CONFIG_X86_DEBUG_FPU
extern void fpregs_assert_state_consistent(void);
#else
diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h
index 4f928d6a367b..697b77e96025 100644
--- a/arch/x86/include/asm/fpu/regset.h
+++ b/arch/x86/include/asm/fpu/regset.h
@@ -7,11 +7,12 @@
#include <linux/regset.h>
-extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active;
+extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active,
+ ssp_active;
extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get,
- xstateregs_get;
+ xstateregs_get, ssp_get;
extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
- xstateregs_set;
+ xstateregs_set, ssp_set;
/*
* xstateregs_active == regset_fpregs_active. Please refer to the comment
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
index 78fcde7b1f07..ca6e5e5f16b2 100644
--- a/arch/x86/include/asm/fpu/sched.h
+++ b/arch/x86/include/asm/fpu/sched.h
@@ -11,7 +11,8 @@
extern void save_fpregs_to_fpstate(struct fpu *fpu);
extern void fpu__drop(struct fpu *fpu);
-extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal);
+extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
+ unsigned long shstk_addr);
extern void fpu_flush_thread(void);
/*
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 7f6d858ff47a..eb810074f1e7 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -115,8 +115,8 @@ enum xfeature {
XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
XFEATURE_PKRU,
XFEATURE_PASID,
- XFEATURE_RSRVD_COMP_11,
- XFEATURE_RSRVD_COMP_12,
+ XFEATURE_CET_USER,
+ XFEATURE_CET_KERNEL_UNUSED,
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
@@ -138,6 +138,8 @@ enum xfeature {
#define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID)
+#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER)
+#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED)
#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR)
#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG)
#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA)
@@ -253,6 +255,16 @@ struct pkru_state {
} __packed;
/*
+ * State component 11 is Control-flow Enforcement user states
+ */
+struct cet_user_state {
+ /* user control-flow settings */
+ u64 user_cet;
+ /* user shadow stack pointer */
+ u64 user_ssp;
+};
+
+/*
* State component 15: Architectural LBR configuration state.
* The size of Arch LBR state depends on the number of LBRs (lbr_depth).
*/
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index cd3dd170e23a..d4427b88ee12 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -50,7 +50,8 @@
#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
/* All currently supported supervisor features */
-#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
+#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_CET_USER)
/*
* A supervisor state component may not always contain valuable information,
@@ -77,7 +78,8 @@
* Unsupported supervisor features. When a supervisor feature in this mask is
* supported in the future, move it to the supported supervisor feature mask.
*/
-#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)
+#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
+ XFEATURE_MASK_CET_KERNEL)
/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index cd5c10a74071..05fd175cec7d 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -614,7 +614,7 @@ DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
#endif
/* #CP */
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection);
#endif
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 1d29dc791f5a..416901d406f8 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -186,6 +186,8 @@ do { \
#else
#define deactivate_mm(tsk, mm) \
do { \
+ if (!tsk->vfork_done) \
+ shstk_free(tsk); \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index dbf8af70b7c2..d6ad98ca1288 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -125,9 +125,15 @@ extern pmdval_t early_pmd_flags;
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
{
- return pte_flags(pte) & _PAGE_DIRTY;
+ return pte_flags(pte) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pte_shstk(pte_t pte)
+{
+ return cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
}
static inline int pte_young(pte_t pte)
@@ -135,9 +141,16 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
-static inline int pmd_dirty(pmd_t pmd)
+static inline bool pmd_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pmd_shstk(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_DIRTY;
+ return cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) ==
+ (_PAGE_DIRTY | _PAGE_PSE);
}
#define pmd_young pmd_young
@@ -146,9 +159,9 @@ static inline int pmd_young(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_ACCESSED;
}
-static inline int pud_dirty(pud_t pud)
+static inline bool pud_dirty(pud_t pud)
{
- return pud_flags(pud) & _PAGE_DIRTY;
+ return pud_flags(pud) & _PAGE_DIRTY_BITS;
}
static inline int pud_young(pud_t pud)
@@ -158,7 +171,27 @@ static inline int pud_young(pud_t pud)
static inline int pte_write(pte_t pte)
{
- return pte_flags(pte) & _PAGE_RW;
+ /*
+ * Shadow stack pages are logically writable, but do not have
+ * _PAGE_RW. Check for them separately from _PAGE_RW itself.
+ */
+ return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte);
+}
+
+#define pmd_write pmd_write
+static inline int pmd_write(pmd_t pmd)
+{
+ /*
+ * Shadow stack pages are logically writable, but do not have
+ * _PAGE_RW. Check for them separately from _PAGE_RW itself.
+ */
+ return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd);
+}
+
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_RW;
}
static inline int pte_huge(pte_t pte)
@@ -292,9 +325,63 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+/*
+ * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the
+ * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So
+ * when creating dirty, write-protected memory, a software bit is used:
+ * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the
+ * Dirty bit to SavedDirty, and vice-vesra.
+ *
+ * This shifting is only done if needed. In the case of shifting
+ * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of
+ * shifting SavedDirty->Dirty, the condition is Write=1.
+ */
+static inline pgprotval_t mksaveddirty_shift(pgprotval_t v)
+{
+ pgprotval_t cond = (~v >> _PAGE_BIT_RW) & 1;
+
+ v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY;
+ v &= ~(cond << _PAGE_BIT_DIRTY);
+
+ return v;
+}
+
+static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v)
+{
+ pgprotval_t cond = (v >> _PAGE_BIT_RW) & 1;
+
+ v |= ((v >> _PAGE_BIT_SAVED_DIRTY) & cond) << _PAGE_BIT_DIRTY;
+ v &= ~(cond << _PAGE_BIT_SAVED_DIRTY);
+
+ return v;
+}
+
+static inline pte_t pte_mksaveddirty(pte_t pte)
+{
+ pteval_t v = native_pte_val(pte);
+
+ v = mksaveddirty_shift(v);
+ return native_make_pte(v);
+}
+
+static inline pte_t pte_clear_saveddirty(pte_t pte)
+{
+ pteval_t v = native_pte_val(pte);
+
+ v = clear_saveddirty_shift(v);
+ return native_make_pte(v);
+}
+
static inline pte_t pte_wrprotect(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_RW);
+ pte = pte_clear_flags(pte, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PTE (Write=0,Dirty=1). Move the hardware
+ * dirty value to the software bit, if present.
+ */
+ return pte_mksaveddirty(pte);
}
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
@@ -332,7 +419,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte)
static inline pte_t pte_mkclean(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
}
static inline pte_t pte_mkold(pte_t pte)
@@ -347,7 +434,16 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+
+ return pte_mksaveddirty(pte);
+}
+
+static inline pte_t pte_mkwrite_shstk(pte_t pte)
+{
+ pte = pte_clear_flags(pte, _PAGE_RW);
+
+ return pte_set_flags(pte, _PAGE_DIRTY);
}
static inline pte_t pte_mkyoung(pte_t pte)
@@ -355,11 +451,15 @@ static inline pte_t pte_mkyoung(pte_t pte)
return pte_set_flags(pte, _PAGE_ACCESSED);
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite_novma(pte_t pte)
{
return pte_set_flags(pte, _PAGE_RW);
}
+struct vm_area_struct;
+pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma);
+#define pte_mkwrite pte_mkwrite
+
static inline pte_t pte_mkhuge(pte_t pte)
{
return pte_set_flags(pte, _PAGE_PSE);
@@ -404,9 +504,34 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+/* See comments above mksaveddirty_shift() */
+static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ v = mksaveddirty_shift(v);
+ return native_make_pmd(v);
+}
+
+/* See comments above mksaveddirty_shift() */
+static inline pmd_t pmd_clear_saveddirty(pmd_t pmd)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ v = clear_saveddirty_shift(v);
+ return native_make_pmd(v);
+}
+
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_RW);
+ pmd = pmd_clear_flags(pmd, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PMD (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ return pmd_mksaveddirty(pmd);
}
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
@@ -433,12 +558,21 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_DIRTY);
+ return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pmd = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+
+ return pmd_mksaveddirty(pmd);
+}
+
+static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
+{
+ pmd = pmd_clear_flags(pmd, _PAGE_RW);
+
+ return pmd_set_flags(pmd, _PAGE_DIRTY);
}
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
@@ -456,11 +590,14 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_ACCESSED);
}
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_RW);
}
+pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+#define pmd_mkwrite pmd_mkwrite
+
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
pudval_t v = native_pud_val(pud);
@@ -475,6 +612,24 @@ static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
return native_make_pud(v & ~clear);
}
+/* See comments above mksaveddirty_shift() */
+static inline pud_t pud_mksaveddirty(pud_t pud)
+{
+ pudval_t v = native_pud_val(pud);
+
+ v = mksaveddirty_shift(v);
+ return native_make_pud(v);
+}
+
+/* See comments above mksaveddirty_shift() */
+static inline pud_t pud_clear_saveddirty(pud_t pud)
+{
+ pudval_t v = native_pud_val(pud);
+
+ v = clear_saveddirty_shift(v);
+ return native_make_pud(v);
+}
+
static inline pud_t pud_mkold(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_ACCESSED);
@@ -482,17 +637,26 @@ static inline pud_t pud_mkold(pud_t pud)
static inline pud_t pud_mkclean(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_DIRTY);
+ return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
}
static inline pud_t pud_wrprotect(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_RW);
+ pud = pud_clear_flags(pud, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PUD (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ return pud_mksaveddirty(pud);
}
static inline pud_t pud_mkdirty(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pud = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+
+ return pud_mksaveddirty(pud);
}
static inline pud_t pud_mkdevmap(pud_t pud)
@@ -512,7 +676,9 @@ static inline pud_t pud_mkyoung(pud_t pud)
static inline pud_t pud_mkwrite(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_RW);
+ pud = pud_set_flags(pud, _PAGE_RW);
+
+ return pud_clear_saveddirty(pud);
}
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -629,6 +795,7 @@ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pteval_t val = pte_val(pte), oldval = val;
+ pte_t pte_result;
/*
* Chop off the NX bit (if present), and add the NX portion of
@@ -637,17 +804,54 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
val &= _PAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
- return __pte(val);
+
+ pte_result = __pte(val);
+
+ /*
+ * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid:
+ * 1. Marking Write=0 PTEs Dirty=1
+ * 2. Marking Dirty=1 PTEs Write=0
+ *
+ * The first case cannot happen because the _PAGE_CHG_MASK will filter
+ * out any Dirty bit passed in newprot. Handle the second case by
+ * going through the mksaveddirty exercise. Only do this if the old
+ * value was Write=1 to avoid doing this on Shadow Stack PTEs.
+ */
+ if (oldval & _PAGE_RW)
+ pte_result = pte_mksaveddirty(pte_result);
+ else
+ pte_result = pte_clear_saveddirty(pte_result);
+
+ return pte_result;
}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
pmdval_t val = pmd_val(pmd), oldval = val;
+ pmd_t pmd_result;
- val &= _HPAGE_CHG_MASK;
+ val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY);
val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
- return __pmd(val);
+
+ pmd_result = __pmd(val);
+
+ /*
+ * To avoid creating Write=0,Dirty=1 PMDs, pte_modify() needs to avoid:
+ * 1. Marking Write=0 PMDs Dirty=1
+ * 2. Marking Dirty=1 PMDs Write=0
+ *
+ * The first case cannot happen because the _PAGE_CHG_MASK will filter
+ * out any Dirty bit passed in newprot. Handle the second case by
+ * going through the mksaveddirty exercise. Only do this if the old
+ * value was Write=1 to avoid doing this on Shadow Stack PTEs.
+ */
+ if (oldval & _PAGE_RW)
+ pmd_result = pmd_mksaveddirty(pmd_result);
+ else
+ pmd_result = pmd_clear_saveddirty(pmd_result);
+
+ return pmd_result;
}
/*
@@ -831,7 +1035,14 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
* (Currently stuck as a macro because of indirect forward reference
* to linux/mm.h:page_to_nid())
*/
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+#define mk_pte(page, pgprot) \
+({ \
+ pgprot_t __pgprot = pgprot; \
+ \
+ WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \
+ _PAGE_DIRTY); \
+ pfn_pte(page_to_pfn(page), __pgprot); \
+})
static inline int pmd_bad(pmd_t pmd)
{
@@ -1090,7 +1301,17 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
+ /*
+ * Avoid accidentally creating shadow stack PTEs
+ * (Write=0,Dirty=1). Use cmpxchg() to prevent races with
+ * the hardware setting Dirty=1.
+ */
+ pte_t old_pte, new_pte;
+
+ old_pte = READ_ONCE(*ptep);
+ do {
+ new_pte = pte_wrprotect(old_pte);
+ } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
}
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
@@ -1116,12 +1337,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
-#define pmd_write pmd_write
-static inline int pmd_write(pmd_t pmd)
-{
- return pmd_flags(pmd) & _PAGE_RW;
-}
-
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
@@ -1148,13 +1363,17 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
-}
+ /*
+ * Avoid accidentally creating shadow stack PTEs
+ * (Write=0,Dirty=1). Use cmpxchg() to prevent races with
+ * the hardware setting Dirty=1.
+ */
+ pmd_t old_pmd, new_pmd;
-#define pud_write pud_write
-static inline int pud_write(pud_t pud)
-{
- return pud_flags(pud) & _PAGE_RW;
+ old_pmd = READ_ONCE(*pmdp);
+ do {
+ new_pmd = pmd_wrprotect(old_pmd);
+ } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd));
}
#ifndef pmdp_establish
@@ -1412,6 +1631,11 @@ static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+ /*
+ * Write=0,Dirty=1 PTEs are shadow stack, which the kernel
+ * shouldn't generally allow access to, but since they
+ * are already Write=0, the below logic covers both cases.
+ */
if (write)
need_pte_bits |= _PAGE_RW;
@@ -1453,6 +1677,12 @@ static inline bool arch_has_hw_pte_young(void)
return true;
}
+#define arch_check_zapped_pte arch_check_zapped_pte
+void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);
+
+#define arch_check_zapped_pmd arch_check_zapped_pmd
+void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd);
+
#ifdef CONFIG_XEN_PV
#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
static inline bool arch_has_hw_nonleaf_pmd_young(void)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index a6deb67cfbb2..0b748ee16b3d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -21,7 +21,8 @@
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
-#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
+#define _PAGE_BIT_SOFTW4 57 /* available for programmer */
+#define _PAGE_BIT_SOFTW5 58 /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
@@ -34,6 +35,13 @@
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
+#ifdef CONFIG_X86_64
+#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit */
+#else
+/* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */
+#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit */
+#endif
+
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -117,6 +125,18 @@
#define _PAGE_SOFTW4 (_AT(pteval_t, 0))
#endif
+/*
+ * The hardware requires shadow stack to be Write=0,Dirty=1. However,
+ * there are valid cases where the kernel might create read-only PTEs that
+ * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In
+ * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit,
+ * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have
+ * (Write=0,SavedDirty=1,Dirty=0) set.
+ */
+#define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY)
+
+#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY)
+
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
/*
@@ -125,10 +145,10 @@
* instance, and is *not* included in this mask since
* pte_modify() does modify it.
*/
-#define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |\
- _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \
- _PAGE_UFFD_WP)
+#define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | \
+ _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \
+ _PAGE_DEVMAP | _PAGE_ENC | _PAGE_UFFD_WP)
#define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)
@@ -189,14 +209,22 @@ enum page_cache_mode {
#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
+
+/*
+ * Page tables needs to have Write=1 in order for any lower PTEs to be
+ * writable. This includes shadow stack memory (Write=0, Dirty=1)
+ */
#define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0)
#define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC)
#define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0)
#define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC)
-#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G)
-#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0|___D| 0|___G)
+
+#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G)
+#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G)
+#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
#define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC)
-#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G)
#define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G)
#define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G)
#define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cbb943010c1e..0086920cda06 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -28,6 +28,7 @@ struct vm86;
#include <asm/unwind_hints.h>
#include <asm/vmxfeatures.h>
#include <asm/vdso/processor.h>
+#include <asm/shstk.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -474,6 +475,13 @@ struct thread_struct {
*/
u32 pkru;
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ unsigned long features;
+ unsigned long features_locked;
+
+ struct thread_shstk shstk;
+#endif
+
/* Floating point and extended processor state */
struct fpu fpu;
/*
diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
new file mode 100644
index 000000000000..42fee8959df7
--- /dev/null
+++ b/arch/x86/include/asm/shstk.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHSTK_H
+#define _ASM_X86_SHSTK_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+struct task_struct;
+struct ksignal;
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+struct thread_shstk {
+ u64 base;
+ u64 size;
+};
+
+long shstk_prctl(struct task_struct *task, int option, unsigned long arg2);
+void reset_thread_features(void);
+unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags,
+ unsigned long stack_size);
+void shstk_free(struct task_struct *p);
+int setup_signal_shadow_stack(struct ksignal *ksig);
+int restore_signal_shadow_stack(void);
+#else
+static inline long shstk_prctl(struct task_struct *task, int option,
+ unsigned long arg2) { return -EINVAL; }
+static inline void reset_thread_features(void) {}
+static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
+ unsigned long clone_flags,
+ unsigned long stack_size) { return 0; }
+static inline void shstk_free(struct task_struct *p) {}
+static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
+static inline int restore_signal_shadow_stack(void) { return 0; }
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_SHSTK_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index de48d1389936..d6cd9344f6c7 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -202,6 +202,19 @@ static inline void clwb(volatile void *__p)
: [pax] "a" (p));
}
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+static inline int write_user_shstk_64(u64 __user *addr, u64 val)
+{
+ asm_volatile_goto("1: wrussq %[val], (%[addr])\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: [addr] "r" (addr), [val] "r" (val)
+ :: fail);
+ return 0;
+fail:
+ return -EFAULT;
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
#define nop() asm volatile ("nop")
static inline void serialize(void)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6ab42caaa67a..25726893c6f4 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -306,7 +306,8 @@ static inline bool pte_flags_need_flush(unsigned long oldflags,
const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT |
_PAGE_ACCESSED;
const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 |
- _PAGE_SOFTW3 | _PAGE_SOFTW4;
+ _PAGE_SOFTW3 | _PAGE_SOFTW4 |
+ _PAGE_SAVED_DIRTY;
const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT |
_PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT |
_PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 |
diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
index 10b1de500ab1..afa524325e55 100644
--- a/arch/x86/include/asm/trap_pf.h
+++ b/arch/x86/include/asm/trap_pf.h
@@ -11,6 +11,7 @@
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
+ * bit 6 == 1: shadow stack access fault
* bit 15 == 1: SGX MMU page-fault
*/
enum x86_pf_error_code {
@@ -20,6 +21,7 @@ enum x86_pf_error_code {
X86_PF_RSVD = 1 << 3,
X86_PF_INSTR = 1 << 4,
X86_PF_PK = 1 << 5,
+ X86_PF_SHSTK = 1 << 6,
X86_PF_SGX = 1 << 15,
};
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 47ecfff2c83d..b1c9cea6ba88 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -18,7 +18,8 @@ void __init trap_init(void);
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif
-extern bool ibt_selftest(void);
+extern int ibt_selftest(void);
+extern int ibt_selftest_noendbr(void);
#ifdef CONFIG_X86_F00F_BUG
/* For handling the FOOF bug */
@@ -47,4 +48,16 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs,
struct stack_info *info);
#endif
+static inline void cond_local_irq_enable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+}
+
+static inline void cond_local_irq_disable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_disable();
+}
+
#endif /* _ASM_X86_TRAPS_H */