summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig16
-rw-r--r--arch/x86/include/asm/cpufeature.h55
-rw-r--r--arch/x86/include/asm/cpufeatures.h6
-rw-r--r--arch/x86/include/asm/disabled-features.h15
-rw-r--r--arch/x86/include/asm/fpu/internal.h2
-rw-r--r--arch/x86/include/asm/fpu/types.h12
-rw-r--r--arch/x86/include/asm/fpu/xstate.h3
-rw-r--r--arch/x86/include/asm/mmu_context.h85
-rw-r--r--arch/x86/include/asm/pgtable.h38
-rw-r--r--arch/x86/include/asm/pgtable_types.h39
-rw-r--r--arch/x86/include/asm/pkeys.h34
-rw-r--r--arch/x86/include/asm/required-features.h7
-rw-r--r--arch/x86/include/asm/special_insns.h22
-rw-r--r--arch/x86/include/uapi/asm/mman.h22
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h2
-rw-r--r--arch/x86/kernel/cpu/common.c44
-rw-r--r--arch/x86/kernel/fpu/core.c63
-rw-r--r--arch/x86/kernel/fpu/xstate.c185
-rw-r--r--arch/x86/kernel/ldt.c4
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/setup.c9
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c150
-rw-r--r--arch/x86/mm/gup.c45
-rw-r--r--arch/x86/mm/mpx.c4
-rw-r--r--arch/x86/mm/pkeys.c101
26 files changed, 888 insertions, 79 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d07cca6ad37b..8b680a5cb25b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -156,6 +156,8 @@ config X86
select X86_DEV_DMA_OPS if X86_64
select X86_FEATURE_NAMES if PROC_FS
select HAVE_STACK_VALIDATION if X86_64
+ select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS
+ select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS
config INSTRUCTION_DECODER
def_bool y
@@ -1719,6 +1721,20 @@ config X86_INTEL_MPX
If unsure, say N.
+config X86_INTEL_MEMORY_PROTECTION_KEYS
+ prompt "Intel Memory Protection Keys"
+ def_bool y
+ # Note: only available in 64-bit mode
+ depends on CPU_SUP_INTEL && X86_64
+ ---help---
+ Memory Protection Keys provides a mechanism for enforcing
+ page-based protections, but without requiring modification of the
+ page tables when an application changes protection domains.
+
+ For details, see Documentation/x86/protection-keys.txt
+
+ If unsure, say y.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 68e4e8258b84..3636ec06c887 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -26,6 +26,7 @@ enum cpuid_leafs
CPUID_8000_0008_EBX,
CPUID_6_EAX,
CPUID_8000_000A_EDX,
+ CPUID_7_ECX,
};
#ifdef CONFIG_X86_FEATURE_NAMES
@@ -48,28 +49,42 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
test_bit(bit, (unsigned long *)((c)->x86_capability))
#define REQUIRED_MASK_BIT_SET(bit) \
- ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
- (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
- (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
- (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \
- (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
- (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
- (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
- (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
- (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
- (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0 )) || \
+ (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1 )) || \
+ (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2 )) || \
+ (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3 )) || \
+ (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4 )) || \
+ (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5 )) || \
+ (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6 )) || \
+ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7 )) || \
+ (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8 )) || \
+ (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9 )) || \
+ (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) || \
+ (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) || \
+ (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) || \
+ (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
#define DISABLED_MASK_BIT_SET(bit) \
- ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0)) || \
- (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1)) || \
- (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2)) || \
- (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3)) || \
- (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4)) || \
- (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5)) || \
- (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6)) || \
- (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7)) || \
- (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8)) || \
- (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9)) )
+ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0 )) || \
+ (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1 )) || \
+ (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2 )) || \
+ (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3 )) || \
+ (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4 )) || \
+ (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5 )) || \
+ (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6 )) || \
+ (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7 )) || \
+ (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8 )) || \
+ (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9 )) || \
+ (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) || \
+ (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) || \
+ (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) || \
+ (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 074b7604bd51..3d1a84383162 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -12,7 +12,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 16 /* N 32-bit words worth of info */
+#define NCAPINTS 17 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -274,6 +274,10 @@
#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
+#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+
/*
* BUG word(s)
*/
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index f226df064660..39343be7d4f4 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -28,6 +28,14 @@
# define DISABLE_CENTAUR_MCR 0
#endif /* CONFIG_X86_64 */
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+# define DISABLE_PKU (1<<(X86_FEATURE_PKU))
+# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE))
+#else
+# define DISABLE_PKU 0
+# define DISABLE_OSPKE 0
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
+
/*
* Make sure to add features to the correct mask
*/
@@ -41,5 +49,12 @@
#define DISABLED_MASK7 0
#define DISABLED_MASK8 0
#define DISABLED_MASK9 (DISABLE_MPX)
+#define DISABLED_MASK10 0
+#define DISABLED_MASK11 0
+#define DISABLED_MASK12 0
+#define DISABLED_MASK13 0
+#define DISABLED_MASK14 0
+#define DISABLED_MASK15 0
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index a2124343edf5..31ac8e6d9f36 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -25,6 +25,8 @@
extern void fpu__activate_curr(struct fpu *fpu);
extern void fpu__activate_fpstate_read(struct fpu *fpu);
extern void fpu__activate_fpstate_write(struct fpu *fpu);
+extern void fpu__current_fpstate_write_begin(void);
+extern void fpu__current_fpstate_write_end(void);
extern void fpu__save(struct fpu *fpu);
extern void fpu__restore(struct fpu *fpu);
extern int fpu__restore_sig(void __user *buf, int ia32_frame);
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 1c6f6ac52ad0..36b90bbfc69f 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -108,6 +108,8 @@ enum xfeature {
XFEATURE_OPMASK,
XFEATURE_ZMM_Hi256,
XFEATURE_Hi16_ZMM,
+ XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
+ XFEATURE_PKRU,
XFEATURE_MAX,
};
@@ -120,6 +122,7 @@ enum xfeature {
#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK)
#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256)
#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM)
+#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \
@@ -212,6 +215,15 @@ struct avx_512_hi16_state {
struct reg_512_bit hi16_zmm[16];
} __packed;
+/*
+ * State component 9: 32-bit PKRU register. The state is
+ * 8 bytes long but only 4 bytes is used currently.
+ */
+struct pkru_state {
+ u32 pkru;
+ u32 pad;
+} __packed;
+
struct xstate_header {
u64 xfeatures;
u64 xcomp_bv;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index f23cd8c80b1c..38951b0fcc5a 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -24,7 +24,8 @@
XFEATURE_MASK_YMM | \
XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
- XFEATURE_MASK_Hi16_ZMM)
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU)
/* Supported features which require eager state saving */
#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index bfd9b2a35a0b..84280029cafd 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -52,15 +52,15 @@ struct ldt_struct {
/*
* Used for LDT copy/destruction.
*/
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
+int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context_ldt(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
-static inline int init_new_context(struct task_struct *tsk,
- struct mm_struct *mm)
+static inline int init_new_context_ldt(struct task_struct *tsk,
+ struct mm_struct *mm)
{
return 0;
}
-static inline void destroy_context(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) {}
#endif
static inline void load_mm_ldt(struct mm_struct *mm)
@@ -104,6 +104,17 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
#endif
}
+static inline int init_new_context(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ init_new_context_ldt(tsk, mm);
+ return 0;
+}
+static inline void destroy_context(struct mm_struct *mm)
+{
+ destroy_context_ldt(mm);
+}
+
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -275,4 +286,68 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
mpx_notify_unmap(mm, vma, start, end);
}
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+ u16 pkey = 0;
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
+ VM_PKEY_BIT2 | VM_PKEY_BIT3;
+ pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+#endif
+ return pkey;
+}
+
+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+{
+ u32 pkru = read_pkru();
+
+ if (!__pkru_allows_read(pkru, pkey))
+ return false;
+ if (write && !__pkru_allows_write(pkru, pkey))
+ return false;
+
+ return true;
+}
+
+/*
+ * We only want to enforce protection keys on the current process
+ * because we effectively have no access to PKRU for other
+ * processes or any way to tell *which * PKRU in a threaded
+ * process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current
+ * mm, or if we are in a kernel thread.
+ */
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+ if (!current->mm)
+ return true;
+ /*
+ * Should PKRU be enforced on the access to this VMA? If
+ * the VMA is from another process, then PKRU has no
+ * relevance and should not be enforced.
+ */
+ if (current->mm != vma->vm_mm)
+ return true;
+
+ return false;
+}
+
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+ bool write, bool execute, bool foreign)
+{
+ /* pkeys never affect instruction fetches */
+ if (execute)
+ return true;
+ /* allow access if the VMA is not one from this process */
+ if (foreign || vma_is_foreign(vma))
+ return true;
+ return __pkru_allows_pkey(vma_pkey(vma), write);
+}
+
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+ return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
+}
+
#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0687c4748b8f..1ff49ec29ece 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -99,6 +99,14 @@ static inline int pte_dirty(pte_t pte)
return pte_flags(pte) & _PAGE_DIRTY;
}
+
+static inline u32 read_pkru(void)
+{
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ return __read_pkru();
+ return 0;
+}
+
static inline int pte_young(pte_t pte)
{
return pte_flags(pte) & _PAGE_ACCESSED;
@@ -911,6 +919,36 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
}
#endif
+#define PKRU_AD_BIT 0x1
+#define PKRU_WD_BIT 0x2
+#define PKRU_BITS_PER_PKEY 2
+
+static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
+}
+
+static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ /*
+ * Access-disable disables writes too so we need to check
+ * both bits here.
+ */
+ return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
+}
+
+static inline u16 pte_flags_pkey(unsigned long pte_flags)
+{
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /* ifdef to avoid doing 59-bit shift on 32-bit values */
+ return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
+#else
+ return 0;
+#endif
+}
+
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4432ab7f407c..7b5efe264eff 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -20,13 +20,18 @@
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
+#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
+#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
+#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
+#define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
-#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
-#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -47,8 +52,24 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
+#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
+#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
+#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
+#else
+#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0))
+#endif
#define __HAVE_ARCH_PTE_SPECIAL
+#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
+ _PAGE_PKEY_BIT1 | \
+ _PAGE_PKEY_BIT2 | \
+ _PAGE_PKEY_BIT3)
+
#ifdef CONFIG_KMEMCHECK
#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
#else
@@ -99,7 +120,12 @@
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
_PAGE_DIRTY)
-/* Set of bits not changed in pte_modify */
+/*
+ * Set of bits not changed in pte_modify. The pte's
+ * protection key is treated like _PAGE_RW, for
+ * instance, and is *not* included in this mask since
+ * pte_modify() does modify it.
+ */
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
_PAGE_SOFT_DIRTY)
@@ -215,7 +241,10 @@ enum page_cache_mode {
/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
-/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */
+/*
+ * Extracts the flags from a (pte|pmd|pud|pgd)val_t
+ * This includes the protection key value.
+ */
#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
new file mode 100644
index 000000000000..7b84565c916c
--- /dev/null
+++ b/arch/x86/include/asm/pkeys.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_PKEYS_H
+#define _ASM_X86_PKEYS_H
+
+#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
+
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val);
+
+/*
+ * Try to dedicate one of the protection keys to be used as an
+ * execute-only protection key.
+ */
+#define PKEY_DEDICATED_EXECUTE_ONLY 15
+extern int __execute_only_pkey(struct mm_struct *mm);
+static inline int execute_only_pkey(struct mm_struct *mm)
+{
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return 0;
+
+ return __execute_only_pkey(mm);
+}
+
+extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
+ int prot, int pkey);
+static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
+ int prot, int pkey)
+{
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return 0;
+
+ return __arch_override_mprotect_pkey(vma, prot, pkey);
+}
+
+#endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 5c6e4fb370f5..4916144e3c42 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -92,5 +92,12 @@
#define REQUIRED_MASK7 0
#define REQUIRED_MASK8 0
#define REQUIRED_MASK9 0
+#define REQUIRED_MASK10 0
+#define REQUIRED_MASK11 0
+#define REQUIRED_MASK12 0
+#define REQUIRED_MASK13 0
+#define REQUIRED_MASK14 0
+#define REQUIRED_MASK15 0
+#define REQUIRED_MASK16 0
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 2270e41b32fd..aee6e76e561e 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -98,6 +98,28 @@ static inline void native_write_cr8(unsigned long val)
}
#endif
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static inline u32 __read_pkru(void)
+{
+ u32 ecx = 0;
+ u32 edx, pkru;
+
+ /*
+ * "rdpkru" instruction. Places PKRU contents in to EAX,
+ * clears EDX and requires that ecx=0.
+ */
+ asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a" (pkru), "=d" (edx)
+ : "c" (ecx));
+ return pkru;
+}
+#else
+static inline u32 __read_pkru(void)
+{
+ return 0;
+}
+#endif
+
static inline void native_wbinvd(void)
{
asm volatile("wbinvd": : :"memory");
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 513b05f15bb4..39bca7fac087 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -6,6 +6,28 @@
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/*
+ * Take the 4 protection key bits out of the vma->vm_flags
+ * value and turn them in to the bits that we can put in
+ * to a pte.
+ *
+ * Only override these if Protection Keys are available
+ * (which is only on 64-bit).
+ */
+#define arch_vm_get_page_prot(vm_flags) __pgprot( \
+ ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
+ ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
+ ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
+ ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+#define arch_calc_vm_prot_bits(prot, key) ( \
+ ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \
+ ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \
+ ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \
+ ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#endif
+
#include <asm-generic/mman.h>
#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 79887abcb5e1..567de50a4c2a 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -118,6 +118,8 @@
#define X86_CR4_SMEP _BITUL(X86_CR4_SMEP_BIT)
#define X86_CR4_SMAP_BIT 21 /* enable SMAP support */
#define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT)
+#define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */
+#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
/*
* x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 249461f95851..06ad72383b4e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -304,6 +304,48 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
/*
+ * Protection Keys are not available in 32-bit mode.
+ */
+static bool pku_disabled;
+
+static __always_inline void setup_pku(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_PKU))
+ return;
+ if (pku_disabled)
+ return;
+
+ cr4_set_bits(X86_CR4_PKE);
+ /*
+ * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
+ * cpuid bit to be set. We need to ensure that we
+ * update that bit in this CPU's "cpu_info".
+ */
+ get_cpu_cap(c);
+}
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static __init int setup_disable_pku(char *arg)
+{
+ /*
+ * Do not clear the X86_FEATURE_PKU bit. All of the
+ * runtime checks are against OSPKE so clearing the
+ * bit does nothing.
+ *
+ * This way, we will see "pku" in cpuinfo, but not
+ * "ospke", which is exactly what we want. It shows
+ * that the CPU has PKU, but the OS has not enabled it.
+ * This happens to be exactly how a system would look
+ * if we disabled the config option.
+ */
+ pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n");
+ pku_disabled = true;
+ return 1;
+}
+__setup("nopku", setup_disable_pku);
+#endif /* CONFIG_X86_64 */
+
+/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
* software. Add those features to this table to auto-disable them.
@@ -625,6 +667,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_7_0_EBX] = ebx;
c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+ c->x86_capability[CPUID_7_ECX] = ecx;
}
/* Extended state features: level 0x0000000d */
@@ -982,6 +1025,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
init_hypervisor(c);
x86_init_rdrand(c);
x86_init_cache_qos(c);
+ setup_pku(c);
/*
* Clear/Set all flags overriden by options, need do it
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 0b1b9abd4d5f..8e37cc8a539a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -354,6 +354,69 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
}
/*
+ * This function must be called before we write the current
+ * task's fpstate.
+ *
+ * This call gets the current FPU register state and moves
+ * it in to the 'fpstate'. Preemption is disabled so that
+ * no writes to the 'fpstate' can occur from context
+ * swiches.
+ *
+ * Must be followed by a fpu__current_fpstate_write_end().
+ */
+void fpu__current_fpstate_write_begin(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+
+ /*
+ * Ensure that the context-switching code does not write
+ * over the fpstate while we are doing our update.
+ */
+ preempt_disable();
+
+ /*
+ * Move the fpregs in to the fpu's 'fpstate'.
+ */
+ fpu__activate_fpstate_read(fpu);
+
+ /*
+ * The caller is about to write to 'fpu'. Ensure that no
+ * CPU thinks that its fpregs match the fpstate. This
+ * ensures we will not be lazy and skip a XRSTOR in the
+ * future.
+ */
+ fpu->last_cpu = -1;
+}
+
+/*
+ * This function must be paired with fpu__current_fpstate_write_begin()
+ *
+ * This will ensure that the modified fpstate gets placed back in
+ * the fpregs if necessary.
+ *
+ * Note: This function may be called whether or not an _actual_
+ * write to the fpstate occurred.
+ */
+void fpu__current_fpstate_write_end(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+
+ /*
+ * 'fpu' now has an updated copy of the state, but the
+ * registers may still be out of date. Update them with
+ * an XRSTOR if they are active.
+ */
+ if (fpregs_active())
+ copy_kernel_to_fpregs(&fpu->state);
+
+ /*
+ * Our update is done and the fpregs/fpstate are in sync
+ * if necessary. Context switches can happen again.
+ */
+ preempt_enable();
+}
+
+/*
* 'fpu__restore()' is called to copy FPU registers from
* the FPU fpstate to the live hw registers and to activate
* access to the hardware registers, so that FPU instructions
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 6e8354f5a593..b48ef35b28d4 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -5,6 +5,7 @@
*/
#include <linux/compat.h>
#include <linux/cpu.h>
+#include <linux/pkeys.h>
#include <asm/fpu/api.h>
#include <asm/fpu/internal.h>
@@ -13,6 +14,11 @@
#include <asm/tlbflush.h>
+/*
+ * Although we spell it out in here, the Processor Trace
+ * xfeature is completely unused. We use other mechanisms
+ * to save/restore PT state in Linux.
+ */
static const char *xfeature_names[] =
{
"x87 floating point registers" ,
@@ -23,6 +29,8 @@ static const char *xfeature_names[] =
"AVX-512 opmask" ,
"AVX-512 Hi256" ,
"AVX-512 ZMM_Hi256" ,
+ "Processor Trace (unused)" ,
+ "Protection Keys User registers",
"unknown xstate feature" ,
};
@@ -56,6 +64,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
+ setup_clear_cpu_cap(X86_FEATURE_PKU);
}
/*
@@ -234,7 +243,7 @@ static void __init print_xstate_feature(u64 xstate_mask)
const char *feature_name;
if (cpu_has_xfeatures(xstate_mask, &feature_name))
- pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name);
+ pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
}
/*
@@ -250,6 +259,7 @@ static void __init print_xstate_features(void)
print_xstate_feature(XFEATURE_MASK_OPMASK);
print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
+ print_xstate_feature(XFEATURE_MASK_PKRU);
}
/*
@@ -466,6 +476,7 @@ static void check_xstate_against_struct(int nr)
XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
+ XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
/*
* Make *SURE* to add any feature numbers in below if
@@ -473,7 +484,8 @@ static void check_xstate_against_struct(int nr)
* numbers.
*/
if ((nr < XFEATURE_YMM) ||
- (nr >= XFEATURE_MAX)) {
+ (nr >= XFEATURE_MAX) ||
+ (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
WARN_ONCE(1, "no structure for xstate: %d\n", nr);
XSTATE_WARN_ON(1);
}
@@ -671,6 +683,19 @@ void fpu__resume_cpu(void)
}
/*
+ * Given an xstate feature mask, calculate where in the xsave
+ * buffer the state is. Callers should ensure that the buffer
+ * is valid.
+ *
+ * Note: does not work for compacted buffers.
+ */
+void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
+{
+ int feature_nr = fls64(xstate_feature_mask) - 1;
+
+ return (void *)xsave + xstate_comp_offsets[feature_nr];
+}
+/*
* Given the xsave area and a state inside, this function returns the
* address of the state.
*
@@ -690,7 +715,6 @@ void fpu__resume_cpu(void)
*/
void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
{
- int feature_nr = fls64(xstate_feature) - 1;
/*
* Do we even *have* xsave state?
*/
@@ -718,7 +742,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
if (!(xsave->header.xfeatures & xstate_feature))
return NULL;
- return (void *)xsave + xstate_comp_offsets[feature_nr];
+ return __raw_xsave_addr(xsave, xstate_feature);
}
EXPORT_SYMBOL_GPL(get_xsave_addr);
@@ -753,3 +777,156 @@ const void *get_xsave_field_ptr(int xsave_state)
return get_xsave_addr(&fpu->state.xsave, xsave_state);
}
+
+
+/*
+ * Set xfeatures (aka XSTATE_BV) bit for a feature that we want
+ * to take out of its "init state". This will ensure that an
+ * XRSTOR actually restores the state.
+ */
+static void fpu__xfeature_set_non_init(struct xregs_state *xsave,
+ int xstate_feature_mask)
+{
+ xsave->header.xfeatures |= xstate_feature_mask;
+}
+
+/*
+ * This function is safe to call whether the FPU is in use or not.
+ *
+ * Note that this only works on the current task.
+ *
+ * Inputs:
+ * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
+ * XFEATURE_MASK_SSE, etc...)
+ * @xsave_state_ptr: a pointer to a copy of the state that you would
+ * like written in to the current task's FPU xsave state. This pointer
+ * must not be located in the current tasks's xsave area.
+ * Output:
+ * address of the state in the xsave area or NULL if the state
+ * is not present or is in its 'init state'.
+ */
+static void fpu__xfeature_set_state(int xstate_feature_mask,
+ void *xstate_feature_src, size_t len)
+{
+ struct xregs_state *xsave = &current->thread.fpu.state.xsave;
+ struct fpu *fpu = &current->thread.fpu;
+ void *dst;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
+ WARN_ONCE(1, "%s() attempted with no xsave support", __func__);
+ return;
+ }
+
+ /*
+ * Tell the FPU code that we need the FPU state to be in
+ * 'fpu' (not in the registers), and that we need it to
+ * be stable while we write to it.
+ */
+ fpu__current_fpstate_write_begin();
+
+ /*
+ * This method *WILL* *NOT* work for compact-format
+ * buffers. If the 'xstate_feature_mask' is unset in
+ * xcomp_bv then we may need to move other feature state
+ * "up" in the buffer.
+ */
+ if (xsave->header.xcomp_bv & xstate_feature_mask) {
+ WARN_ON_ONCE(1);
+ goto out;
+ }
+
+ /* find the location in the xsave buffer of the desired state */
+ dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask);
+
+ /*
+ * Make sure that the pointer being passed in did not
+ * come from the xsave buffer itself.
+ */
+ WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself");
+
+ /* put the caller-provided data in the location */
+ memcpy(dst, xstate_feature_src, len);
+
+ /*
+ * Mark the xfeature so that the CPU knows there is state
+ * in the buffer now.
+ */
+ fpu__xfeature_set_non_init(xsave, xstate_feature_mask);
+out:
+ /*
+ * We are done writing to the 'fpu'. Reenable preeption
+ * and (possibly) move the fpstate back in to the fpregs.
+ */
+ fpu__current_fpstate_write_end();
+}
+
+#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
+#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
+
+/*
+ * This will go out and modify the XSAVE buffer so that PKRU is
+ * set to a particular state for access to 'pkey'.
+ *
+ * PKRU state does affect kernel access to user memory. We do
+ * not modfiy PKRU *itself* here, only the XSAVE state that will
+ * be restored in to PKRU when we return back to userspace.
+ */
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val)
+{
+ struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct pkru_state *old_pkru_state;
+ struct pkru_state new_pkru_state;
+ int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
+ u32 new_pkru_bits = 0;
+
+ /*
+ * This check implies XSAVE support. OSPKE only gets
+ * set if we enable XSAVE and we enable PKU in XCR0.
+ */
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return -EINVAL;
+
+ /* Set the bits we need in PKRU */
+ if (init_val & PKEY_DISABLE_ACCESS)
+ new_pkru_bits |= PKRU_AD_BIT;
+ if (init_val & PKEY_DISABLE_WRITE)
+ new_pkru_bits |= PKRU_WD_BIT;
+
+ /* Shift the bits in to the correct place in PKRU for pkey. */
+ new_pkru_bits <<= pkey_shift;
+
+ /* Locate old copy of the state in the xsave buffer */
+ old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU);
+
+ /*
+ * When state is not in the buffer, it is in the init
+ * state, set it manually. Otherwise, copy out the old
+ * state.
+ */
+ if (!old_pkru_state)
+ new_pkru_state.pkru = 0;
+ else
+ new_pkru_state.pkru = old_pkru_state->pkru;
+
+ /* mask off any old bits in place */
+ new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
+ /* Set the newly-requested bits */
+ new_pkru_state.pkru |= new_pkru_bits;
+
+ /*
+ * We could theoretically live without zeroing pkru.pad.
+ * The current XSAVE feature state definition says that
+ * only bytes 0->3 are used. But we do not want to
+ * chance leaking kernel stack out to userspace in case a
+ * memcpy() of the whole xsave buffer was done.
+ *
+ * They're in the same cacheline anyway.
+ */
+ new_pkru_state.pad = 0;
+
+ fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state,
+ sizeof(new_pkru_state));
+
+ return 0;
+}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 6acc9dd91f36..6707039b9032 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -103,7 +103,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
* we do not have to muck with descriptors here, that is
* done in switch_mm() as needed.
*/
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
{
struct ldt_struct *new_ldt;
struct mm_struct *old_mm;
@@ -144,7 +144,7 @@ out_unlock:
*
* 64bit: Don't touch the LDT register - we're already in the next thread.
*/
-void destroy_context(struct mm_struct *mm)
+void destroy_context_ldt(struct mm_struct *mm)
{
free_ldt_struct(mm->context.ldt);
mm->context.ldt = NULL;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b9d99e0f82c4..776229e98202 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -116,6 +116,8 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
}
void release_thread(struct task_struct *dead_task)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index aa52c1009475..2367ae07eb76 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,7 @@
#include <asm/alternative.h>
#include <asm/prom.h>
#include <asm/microcode.h>
+#include <asm/mmu_context.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -1282,3 +1283,11 @@ static int __init register_kernel_offset_dumper(void)
return 0;
}
__initcall(register_kernel_offset_dumper);
+
+void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
+{
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return;
+
+ seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
+}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index f9d38a48e3c8..67cf2e1e557b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -34,3 +34,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 03898aea6e0f..5ce1ed02f7e8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -15,12 +15,14 @@
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
+#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
+#include <asm/mmu_context.h> /* vma_pkey() */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -33,6 +35,7 @@
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
+ * bit 5 == 1: protection keys block access
*/
enum x86_pf_error_code {
@@ -41,6 +44,7 @@ enum x86_pf_error_code {
PF_USER = 1 << 2,
PF_RSVD = 1 << 3,
PF_INSTR = 1 << 4,
+ PF_PK = 1 << 5,
};
/*
@@ -167,9 +171,60 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
return prefetch;
}
+/*
+ * A protection key fault means that the PKRU value did not allow
+ * access to some PTE. Userspace can figure out what PKRU was
+ * from the XSAVE state, and this function fills out a field in
+ * siginfo so userspace can discover which protection key was set
+ * on the PTE.
+ *
+ * If we get here, we know that the hardware signaled a PF_PK
+ * fault and that there was a VMA once we got in the fault
+ * handler. It does *not* guarantee that the VMA we find here
+ * was the one that we faulted on.
+ *
+ * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+ * 2. T1 : set PKRU to deny access to pkey=4, touches page
+ * 3. T1 : faults...
+ * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+ * 5. T1 : enters fault handler, takes mmap_sem, etc...
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+ * faulted on a pte with its pkey=4.
+ */
+static void fill_sig_info_pkey(int si_code, siginfo_t *info,
+ struct vm_area_struct *vma)
+{
+ /* This is effectively an #ifdef */
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return;
+
+ /* Fault not from Protection Keys: nothing to do */
+ if (si_code != SEGV_PKUERR)
+ return;
+ /*
+ * force_sig_info_fault() is called from a number of
+ * contexts, some of which have a VMA and some of which
+ * do not. The PF_PK handing happens after we have a
+ * valid VMA, so we should never reach this without a
+ * valid VMA.
+ */
+ if (!vma) {
+ WARN_ONCE(1, "PKU fault with no VMA passed in");
+ info->si_pkey = 0;
+ return;
+ }
+ /*
+ * si_pkey should be thought of as a strong hint, but not
+ * absolutely guranteed to be 100% accurate because of
+ * the race explained above.
+ */
+ info->si_pkey = vma_pkey(vma);
+}
+
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
- struct task_struct *tsk, int fault)
+ struct task_struct *tsk, struct vm_area_struct *vma,
+ int fault)
{
unsigned lsb = 0;
siginfo_t info;
@@ -184,6 +239,8 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
+ fill_sig_info_pkey(si_code, &info, vma);
+
force_sig_info(si_signo, &info, tsk);
}
@@ -661,6 +718,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
unsigned long flags;
int sig;
+ /* No context means no VMA to pass down */
+ struct vm_area_struct *vma = NULL;
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -684,7 +743,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
- force_sig_info_fault(signal, si_code, address, tsk, 0);
+ force_sig_info_fault(signal, si_code, address,
+ tsk, vma, 0);
}
/*
@@ -761,7 +821,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int si_code)
+ unsigned long address, struct vm_area_struct *vma,
+ int si_code)
{
struct task_struct *tsk = current;
@@ -804,7 +865,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
return;
}
@@ -817,14 +878,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+ unsigned long address, struct vm_area_struct *vma)
{
- __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+ __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
}
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int si_code)
+ unsigned long address, struct vm_area_struct *vma, int si_code)
{
struct mm_struct *mm = current->mm;
@@ -834,25 +895,50 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
*/
up_read(&mm->mmap_sem);
- __bad_area_nosemaphore(regs, error_code, address, si_code);
+ __bad_area_nosemaphore(regs, error_code, address, vma, si_code);
}
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
- __bad_area(regs, error_code, address, SEGV_MAPERR);
+ __bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
+}
+
+static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+ struct vm_area_struct *vma)
+{
+ /* This code is always called on the current mm */
+ bool foreign = false;
+
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return false;
+ if (error_code & PF_PK)
+ return true;
+ /* this checks permission keys on the VMA: */
+ if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
+ (error_code & PF_INSTR), foreign))
+ return true;
+ return false;
}
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+ unsigned long address, struct vm_area_struct *vma)
{
- __bad_area(regs, error_code, address, SEGV_ACCERR);
+ /*
+ * This OSPKE check is not strictly necessary at runtime.
+ * But, doing it this way allows compiler optimizations
+ * if pkeys are compiled out.
+ */
+ if (bad_area_access_from_pkeys(error_code, vma))
+ __bad_area(regs, error_code, address, vma, SEGV_PKUERR);
+ else
+ __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
}
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
- unsigned int fault)
+ struct vm_area_struct *vma, unsigned int fault)
{
struct task_struct *tsk = current;
int code = BUS_ADRERR;
@@ -879,12 +965,13 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
code = BUS_MCEERR_AR;
}
#endif
- force_sig_info_fault(SIGBUS, code, address, tsk, fault);
+ force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
}
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, unsigned int fault)
+ unsigned long address, struct vm_area_struct *vma,
+ unsigned int fault)
{
if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@@ -908,9 +995,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, fault);
+ do_sigbus(regs, error_code, address, vma, fault);
else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, vma);
else
BUG();
}
@@ -923,6 +1010,12 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
if ((error_code & PF_INSTR) && !pte_exec(*pte))
return 0;
+ /*
+ * Note: We do not do lazy flushing on protection key
+ * changes, so no spurious fault will ever set PF_PK.
+ */
+ if ((error_code & PF_PK))
+ return 1;
return 1;
}
@@ -1012,6 +1105,17 @@ int show_unhandled_signals = 1;
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
+ /* This is only called for the current mm, so: */
+ bool foreign = false;
+ /*
+ * Make sure to check the VMA so that we do not perform
+ * faults just to hit a PF_PK as soon as we fill in a
+ * page.
+ */
+ if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
+ (error_code & PF_INSTR), foreign))
+ return 1;
+
if (error_code & PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
@@ -1118,7 +1222,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1131,7 +1235,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1140,7 +1244,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1164,6 +1268,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (error_code & PF_WRITE)
flags |= FAULT_FLAG_WRITE;
+ if (error_code & PF_INSTR)
+ flags |= FAULT_FLAG_INSTRUCTION;
/*
* When running in the kernel we expect faults to occur only to
@@ -1184,7 +1290,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
retry:
@@ -1232,7 +1338,7 @@ retry:
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
- bad_area_access_error(regs, error_code, address);
+ bad_area_access_error(regs, error_code, address, vma);
return;
}
@@ -1270,7 +1376,7 @@ good_area:
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, error_code, address, fault);
+ mm_fault_error(regs, error_code, address, vma, fault);
return;
}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index f8d0b5e8bdfd..b8b6a60b32cf 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -11,6 +11,7 @@
#include <linux/swap.h>
#include <linux/memremap.h>
+#include <asm/mmu_context.h>
#include <asm/pgtable.h>
static inline pte_t gup_get_pte(pte_t *ptep)
@@ -75,6 +76,28 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
/*
+ * 'pteval' can come from a pte, pmd or pud. We only check
+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
+ * same value on all 3 types.
+ */
+static inline int pte_allows_gup(unsigned long pteval, int write)
+{
+ unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+
+ if (write)
+ need_pte_bits |= _PAGE_RW;
+
+ if ((pteval & need_pte_bits) != need_pte_bits)
+ return 0;
+
+ /* Check memory protection keys permissions. */
+ if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
+ return 0;
+
+ return 1;
+}
+
+/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
* register pressure.
@@ -83,14 +106,9 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
- unsigned long mask;
int nr_start = *nr;
pte_t *ptep;
- mask = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
-
ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
@@ -109,7 +127,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
pte_unmap(ptep);
return 0;
}
- } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ } else if (!pte_allows_gup(pte_val(pte), write) ||
+ pte_special(pte)) {
pte_unmap(ptep);
return 0;
}
@@ -164,14 +183,10 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- unsigned long mask;
struct page *head, *page;
int refs;
- mask = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
- if ((pmd_flags(pmd) & mask) != mask)
+ if (!pte_allows_gup(pmd_val(pmd), write))
return 0;
VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
@@ -231,14 +246,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- unsigned long mask;
struct page *head, *page;
int refs;
- mask = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
- if ((pud_flags(pud) & mask) != mask)
+ if (!pte_allows_gup(pud_val(pud), write))
return 0;
/* hugepages are never "special" */
VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
@@ -422,7 +433,7 @@ slow_irqon:
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(current, mm, start,
+ ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT,
write, 0, pages);
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index ef05755a1900..a0a0b9861902 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -546,8 +546,8 @@ static int mpx_resolve_fault(long __user *addr, int write)
int nr_pages = 1;
int force = 0;
- gup_ret = get_user_pages(current, current->mm, (unsigned long)addr,
- nr_pages, write, force, NULL, NULL);
+ gup_ret = get_user_pages((unsigned long)addr, nr_pages, write,
+ force, NULL, NULL);
/*
* get_user_pages() returns number of pages gotten.
* 0 means we failed to fault in and get anything,
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
new file mode 100644
index 000000000000..e8c474451928
--- /dev/null
+++ b/arch/x86/mm/pkeys.c
@@ -0,0 +1,101 @@
+/*
+ * Intel Memory Protection Keys management
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/mm_types.h> /* mm_struct, vma, etc... */
+#include <linux/pkeys.h> /* PKEY_* */
+#include <uapi/asm-generic/mman-common.h>
+
+#include <asm/cpufeature.h> /* boot_cpu_has, ... */
+#include <asm/mmu_context.h> /* vma_pkey() */
+#include <asm/fpu/internal.h> /* fpregs_active() */
+
+int __execute_only_pkey(struct mm_struct *mm)
+{
+ int ret;
+
+ /*
+ * We do not want to go through the relatively costly
+ * dance to set PKRU if we do not need to. Check it
+ * first and assume that if the execute-only pkey is
+ * write-disabled that we do not have to set it
+ * ourselves. We need preempt off so that nobody
+ * can make fpregs inactive.
+ */
+ preempt_disable();
+ if (fpregs_active() &&
+ !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) {
+ preempt_enable();
+ return PKEY_DEDICATED_EXECUTE_ONLY;
+ }
+ preempt_enable();
+ ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY,
+ PKEY_DISABLE_ACCESS);
+ /*
+ * If the PKRU-set operation failed somehow, just return
+ * 0 and effectively disable execute-only support.
+ */
+ if (ret)
+ return 0;
+
+ return PKEY_DEDICATED_EXECUTE_ONLY;
+}
+
+static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
+{
+ /* Do this check first since the vm_flags should be hot */
+ if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+ return false;
+ if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is only called for *plain* mprotect calls.
+ */
+int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey)
+{
+ /*
+ * Is this an mprotect_pkey() call? If so, never
+ * override the value that came from the user.
+ */
+ if (pkey != -1)
+ return pkey;
+ /*
+ * Look for a protection-key-drive execute-only mapping
+ * which is now being given permissions that are not
+ * execute-only. Move it back to the default pkey.
+ */
+ if (vma_is_pkey_exec_only(vma) &&
+ (prot & (PROT_READ|PROT_WRITE))) {
+ return 0;
+ }
+ /*
+ * The mapping is execute-only. Go try to get the
+ * execute-only protection key. If we fail to do that,
+ * fall through as if we do not have execute-only
+ * support.
+ */
+ if (prot == PROT_EXEC) {
+ pkey = execute_only_pkey(vma->vm_mm);
+ if (pkey > 0)
+ return pkey;
+ }
+ /*
+ * This is a vanilla, non-pkey mprotect (or we failed to
+ * setup execute-only), inherit the pkey from the VMA we
+ * are working on.
+ */
+ return vma_pkey(vma);
+}