author     Christian Brauner <brauner@kernel.org>   2024-11-26 18:15:06 +0100
committer  Christian Brauner <brauner@kernel.org>   2024-11-26 18:15:06 +0100
commit     cf87766dd6f9ddcceaa8ee26e3cbd7538e42dd19 (patch)
tree       8531685628a090333db2f874688ac07624b51072 /arch/x86
parent     c66f759832a83cb273ba5a55c66dcc99384efa74 (diff)
parent     2957fa4931a3b658d8e54eda9439d4c57967e8ad (diff)
Merge branch 'ovl.fixes'
Bring in an overlayfs fix for v6.13-rc1 that fixes a bug introduced by the
overlayfs changes merged for v6.13.

Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig                      |  17
-rw-r--r--  arch/x86/coco/tdx/tdx.c               | 138
-rw-r--r--  arch/x86/include/asm/cpuid.h          |   8
-rw-r--r--  arch/x86/include/asm/ftrace.h         |  30
-rw-r--r--  arch/x86/include/asm/io.h             |   5
-rw-r--r--  arch/x86/include/asm/shared/tdx.h     |  13
-rw-r--r--  arch/x86/include/uapi/asm/amd_hsmp.h  |   3
-rw-r--r--  arch/x86/kernel/cpu/common.c          |  39
-rw-r--r--  arch/x86/kernel/cpu/sgx/main.c        |   2
-rw-r--r--  arch/x86/kernel/devicetree.c          |   2
-rw-r--r--  arch/x86/kernel/early-quirks.c        |   2
-rw-r--r--  arch/x86/kernel/ftrace.c              |   2
-rw-r--r--  arch/x86/kernel/kprobes/ftrace.c      |  19
-rw-r--r--  arch/x86/net/bpf_jit_comp.c           | 149
-rw-r--r--  arch/x86/platform/efi/efi.c           |  20
-rw-r--r--  arch/x86/platform/efi/efi_64.c        |  42
16 files changed, 328 insertions, 163 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a3c31b784edc..948707a3615e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2564,15 +2564,14 @@ config MITIGATION_CALL_DEPTH_TRACKING
default y
help
Compile the kernel with call depth tracking to mitigate the Intel
- SKL Return-Speculation-Buffer (RSB) underflow issue. The
- mitigation is off by default and needs to be enabled on the
- kernel command line via the retbleed=stuff option. For
- non-affected systems the overhead of this option is marginal as
- the call depth tracking is using run-time generated call thunks
- in a compiler generated padding area and call patching. This
- increases text size by ~5%. For non affected systems this space
- is unused. On affected SKL systems this results in a significant
- performance gain over the IBRS mitigation.
+ SKL Return-Stack-Buffer (RSB) underflow issue. The mitigation is off
+ by default and needs to be enabled on the kernel command line via the
+ retbleed=stuff option. For non-affected systems the overhead of this
+ option is marginal as the call depth tracking is using run-time
+ generated call thunks in a compiler generated padding area and call
+ patching. This increases text size by ~5%. For non affected systems
+ this space is unused. On affected SKL systems this results in a
+ significant performance gain over the IBRS mitigation.
config CALL_THUNKS_DEBUG
bool "Enable call thunks and call depth tracking debugging"
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 327c45c5013f..0d9b090b4880 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -78,6 +78,32 @@ static inline void tdcall(u64 fn, struct tdx_module_args *args)
panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}
+/* Read TD-scoped metadata */
+static inline u64 tdg_vm_rd(u64 field, u64 *value)
+{
+ struct tdx_module_args args = {
+ .rdx = field,
+ };
+ u64 ret;
+
+ ret = __tdcall_ret(TDG_VM_RD, &args);
+ *value = args.r8;
+
+ return ret;
+}
+
+/* Write TD-scoped metadata */
+static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask)
+{
+ struct tdx_module_args args = {
+ .rdx = field,
+ .r8 = value,
+ .r9 = mask,
+ };
+
+ return __tdcall(TDG_VM_WR, &args);
+}
+
/**
* tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
* subtype 0) using TDG.MR.REPORT TDCALL.
@@ -168,7 +194,87 @@ static void __noreturn tdx_panic(const char *msg)
__tdx_hypercall(&args);
}
-static void tdx_parse_tdinfo(u64 *cc_mask)
+/*
+ * The kernel cannot handle #VEs when accessing normal kernel memory. Ensure
+ * that no #VE will be delivered for accesses to TD-private memory.
+ *
+ * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
+ * controls if the guest will receive such #VE with TD attribute
+ * ATTR_SEPT_VE_DISABLE.
+ *
+ * Newer TDX modules allow the guest to control if it wants to receive SEPT
+ * violation #VEs.
+ *
+ * Check if the feature is available and disable SEPT #VE if possible.
+ *
+ * If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE
+ * attribute is no longer reliable. It reflects the initial state of the
+ * control for the TD, but it will not be updated if someone (e.g. bootloader)
+ * changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
+ * determine if SEPT #VEs are enabled or disabled.
+ */
+static void disable_sept_ve(u64 td_attr)
+{
+ const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
+ bool debug = td_attr & ATTR_DEBUG;
+ u64 config, controls;
+
+ /* Is this TD allowed to disable SEPT #VE */
+ tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
+ if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
+ /* No SEPT #VE controls for the guest: check the attribute */
+ if (td_attr & ATTR_SEPT_VE_DISABLE)
+ return;
+
+ /* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
+ if (debug)
+ pr_warn("%s\n", msg);
+ else
+ tdx_panic(msg);
+ return;
+ }
+
+ /* Check if SEPT #VE has been disabled before us */
+ tdg_vm_rd(TDCS_TD_CTLS, &controls);
+ if (controls & TD_CTLS_PENDING_VE_DISABLE)
+ return;
+
+ /* Keep #VEs enabled for splats in debugging environments */
+ if (debug)
+ return;
+
+ /* Disable SEPT #VEs */
+ tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE,
+ TD_CTLS_PENDING_VE_DISABLE);
+}
+
+/*
+ * TDX 1.0 generates a #VE when accessing topology-related CPUID leafs (0xB and
+ * 0x1F) and the X2APIC_APICID MSR. The kernel returns all zeros on CPUID #VEs.
+ * In practice, this means that the kernel can only boot with a plain topology.
+ * Any complications will cause problems.
+ *
+ * The ENUM_TOPOLOGY feature allows the VMM to provide topology information.
+ * Enabling the feature eliminates topology-related #VEs: the TDX module
+ * virtualizes accesses to the CPUID leafs and the MSR.
+ *
+ * Enable ENUM_TOPOLOGY if it is available.
+ */
+static void enable_cpu_topology_enumeration(void)
+{
+ u64 configured;
+
+ /* Has the VMM provided a valid topology configuration? */
+ tdg_vm_rd(TDCS_TOPOLOGY_ENUM_CONFIGURED, &configured);
+ if (!configured) {
+ pr_err("VMM did not configure X2APIC_IDs properly\n");
+ return;
+ }
+
+ tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
+}
+
+static void tdx_setup(u64 *cc_mask)
{
struct tdx_module_args args = {};
unsigned int gpa_width;
@@ -193,21 +299,13 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
gpa_width = args.rcx & GENMASK(5, 0);
*cc_mask = BIT_ULL(gpa_width - 1);
- /*
- * The kernel can not handle #VE's when accessing normal kernel
- * memory. Ensure that no #VE will be delivered for accesses to
- * TD-private memory. Only VMM-shared memory (MMIO) will #VE.
- */
td_attr = args.rdx;
- if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
- const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
- /* Relax SEPT_VE_DISABLE check for debug TD. */
- if (td_attr & ATTR_DEBUG)
- pr_warn("%s\n", msg);
- else
- tdx_panic(msg);
- }
+ /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
+ tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
+
+ disable_sept_ve(td_attr);
+ enable_cpu_topology_enumeration();
}
/*
@@ -929,10 +1027,6 @@ static void tdx_kexec_finish(void)
void __init tdx_early_init(void)
{
- struct tdx_module_args args = {
- .rdx = TDCS_NOTIFY_ENABLES,
- .r9 = -1ULL,
- };
u64 cc_mask;
u32 eax, sig[3];
@@ -947,11 +1041,11 @@ void __init tdx_early_init(void)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
cc_vendor = CC_VENDOR_INTEL;
- tdx_parse_tdinfo(&cc_mask);
- cc_set_mask(cc_mask);
- /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
- tdcall(TDG_VM_WR, &args);
+ /* Configure the TD */
+ tdx_setup(&cc_mask);
+
+ cc_set_mask(cc_mask);
/*
* All bits above GPA width are reserved and kernel treats shared bit
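
A note on the tdg_vm_rd()/tdg_vm_wr() helpers introduced in this file: as the patch uses them, the third argument of tdg_vm_wr() is a write mask, so only the field bits selected by the mask are modified. That is why the NOTIFY_ENABLES write passes value 0 with a -1ULL mask (overwrite the whole field) while the TD_CTLS updates pass the same single bit as both value and mask (flip one control without disturbing the others). A minimal sketch reusing the helpers and constants added above (return values ignored for brevity):

        u64 ctls;

        tdg_vm_rd(TDCS_TD_CTLS, &ctls);                 /* read the current TD controls */
        if (!(ctls & TD_CTLS_PENDING_VE_DISABLE))       /* set one bit, leave the rest untouched */
                tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE,
                          TD_CTLS_PENDING_VE_DISABLE);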
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
index ca4243318aad..239b9ba5c398 100644
--- a/arch/x86/include/asm/cpuid.h
+++ b/arch/x86/include/asm/cpuid.h
@@ -6,6 +6,8 @@
#ifndef _ASM_X86_CPUID_H
#define _ASM_X86_CPUID_H
+#include <linux/types.h>
+
#include <asm/string.h>
struct cpuid_regs {
@@ -20,11 +22,11 @@ enum cpuid_regs_idx {
};
#ifdef CONFIG_X86_32
-extern int have_cpuid_p(void);
+bool have_cpuid_p(void);
#else
-static inline int have_cpuid_p(void)
+static inline bool have_cpuid_p(void)
{
- return 1;
+ return true;
}
#endif
static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index b4d719de2c84..6e8cf0fa48fc 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -35,37 +35,21 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
}
#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
-struct ftrace_regs {
- struct pt_regs regs;
-};
+
+#include <linux/ftrace_regs.h>
static __always_inline struct pt_regs *
arch_ftrace_get_regs(struct ftrace_regs *fregs)
{
/* Only when FL_SAVE_REGS is set, cs will be non zero */
- if (!fregs->regs.cs)
+ if (!arch_ftrace_regs(fregs)->regs.cs)
return NULL;
- return &fregs->regs;
+ return &arch_ftrace_regs(fregs)->regs;
}
#define ftrace_regs_set_instruction_pointer(fregs, _ip) \
- do { (fregs)->regs.ip = (_ip); } while (0)
-
-#define ftrace_regs_get_instruction_pointer(fregs) \
- ((fregs)->regs.ip)
-
-#define ftrace_regs_get_argument(fregs, n) \
- regs_get_kernel_argument(&(fregs)->regs, n)
-#define ftrace_regs_get_stack_pointer(fregs) \
- kernel_stack_pointer(&(fregs)->regs)
-#define ftrace_regs_return_value(fregs) \
- regs_return_value(&(fregs)->regs)
-#define ftrace_regs_set_return_value(fregs, ret) \
- regs_set_return_value(&(fregs)->regs, ret)
-#define ftrace_override_function_with_return(fregs) \
- override_function_with_return(&(fregs)->regs)
-#define ftrace_regs_query_register_offset(name) \
- regs_query_register_offset(name)
+ do { arch_ftrace_regs(fregs)->regs.ip = (_ip); } while (0)
+
struct ftrace_ops;
#define ftrace_graph_func ftrace_graph_func
@@ -90,7 +74,7 @@ __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
regs->orig_ax = addr;
}
#define arch_ftrace_set_direct_caller(fregs, addr) \
- __arch_ftrace_set_direct_caller(&(fregs)->regs, addr)
+ __arch_ftrace_set_direct_caller(&arch_ftrace_regs(fregs)->regs, addr)
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
#ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 1d60427379c9..ed580c7f9d0a 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -152,11 +152,6 @@ static inline void *phys_to_virt(phys_addr_t address)
#define phys_to_virt phys_to_virt
/*
- * Change "struct page" to physical address.
- */
-#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
-
-/*
* ISA I/O bus memory addresses are 1:1 with the physical address.
* However, we truncate the address to unsigned int to avoid undesirable
* promotions in legacy drivers.
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index fdfd41511b02..89f7fcade8ae 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -16,10 +16,21 @@
#define TDG_VP_VEINFO_GET 3
#define TDG_MR_REPORT 4
#define TDG_MEM_PAGE_ACCEPT 6
+#define TDG_VM_RD 7
#define TDG_VM_WR 8
-/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
+/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */
+#define TDCS_CONFIG_FLAGS 0x1110000300000016
+#define TDCS_TD_CTLS 0x1110000300000017
#define TDCS_NOTIFY_ENABLES 0x9100000000000010
+#define TDCS_TOPOLOGY_ENUM_CONFIGURED 0x9100000000000019
+
+/* TDCS_CONFIG_FLAGS bits */
+#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1)
+
+/* TDCS_TD_CTLS bits */
+#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0)
+#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(1)
/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001
diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h
index e5d182c7373c..4a7cace06204 100644
--- a/arch/x86/include/uapi/asm/amd_hsmp.h
+++ b/arch/x86/include/uapi/asm/amd_hsmp.h
@@ -88,7 +88,8 @@ struct hsmp_msg_desc {
*
* Not supported messages would return -ENOMSG.
*/
-static const struct hsmp_msg_desc hsmp_msg_desc_table[] = {
+static const struct hsmp_msg_desc hsmp_msg_desc_table[]
+ __attribute__((unused)) = {
/* RESERVED */
{0, 0, HSMP_RSVD},
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 02637365d1a9..06a516f6795b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -276,21 +276,13 @@ static int __init x86_noinvpcid_setup(char *s)
}
early_param("noinvpcid", x86_noinvpcid_setup);
-#ifdef CONFIG_X86_32
-static int cachesize_override = -1;
-static int disable_x86_serial_nr = 1;
-
-static int __init cachesize_setup(char *str)
-{
- get_option(&str, &cachesize_override);
- return 1;
-}
-__setup("cachesize=", cachesize_setup);
-
/* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
+static inline bool flag_is_changeable_p(unsigned long flag)
{
- u32 f1, f2;
+ unsigned long f1, f2;
+
+ if (!IS_ENABLED(CONFIG_X86_32))
+ return true;
/*
* Cyrix and IDT cpus allow disabling of CPUID
@@ -313,11 +305,22 @@ static inline int flag_is_changeable_p(u32 flag)
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
- return ((f1^f2) & flag) != 0;
+ return (f1 ^ f2) & flag;
}
+#ifdef CONFIG_X86_32
+static int cachesize_override = -1;
+static int disable_x86_serial_nr = 1;
+
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
/* Probe for the CPUID instruction */
-int have_cpuid_p(void)
+bool have_cpuid_p(void)
{
return flag_is_changeable_p(X86_EFLAGS_ID);
}
@@ -349,10 +352,6 @@ static int __init x86_serial_nr_setup(char *s)
}
__setup("serialnumber", x86_serial_nr_setup);
#else
-static inline int flag_is_changeable_p(u32 flag)
-{
- return 1;
-}
static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
}
@@ -1088,7 +1087,6 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c)
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_32
int i;
/*
@@ -1109,7 +1107,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
break;
}
}
-#endif
}
#define NO_SPECULATION BIT(0)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index eb5848d1851a..8ce352fc72ac 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -630,7 +630,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
if (!section->virt_addr)
return false;
- section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
+ section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));
if (!section->pages) {
memunmap(section->virt_addr);
return false;
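
The vmalloc() to vmalloc_array() switch above is an overflow-hardening cleanup: vmalloc_array(n, size) performs the n * size multiplication with overflow checking and returns NULL if the product would wrap, whereas the open-coded multiplication is unchecked. The two forms are otherwise equivalent here:

        /* new: overflow-checked allocation of nr_pages elements */
        section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));

        /* old: the nr_pages * sizeof(...) multiplication itself was unchecked */
        section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));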
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 64280879c68c..59d23cdf4ed0 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -305,7 +305,7 @@ void __init x86_flattree_get_config(void)
map_len = size;
}
- early_init_dt_verify(dt);
+ early_init_dt_verify(dt, __pa(dt));
}
unflatten_and_copy_device_tree();
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 29d1f9104e94..6b6f32f40cbe 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,7 +18,7 @@
#include <linux/bcma/bcma_regs.h>
#include <linux/platform_data/x86/apple.h>
#include <drm/intel/i915_drm.h>
-#include <drm/intel/i915_pciids.h>
+#include <drm/intel/pciids.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/io_apic.h>
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8da0e66ca22d..adb09f78edb2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -647,7 +647,7 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
- struct pt_regs *regs = &fregs->regs;
+ struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs;
unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
prepare_ftrace_return(ip, (unsigned long *)stack, 0);
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 15af7e98e161..2be55ec3f392 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -9,6 +9,7 @@
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/ftrace.h>
+#include <asm/text-patching.h>
#include "common.h"
@@ -36,23 +37,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
if (kprobe_running()) {
kprobes_inc_nmissed_count(p);
} else {
- unsigned long orig_ip = regs->ip;
+ unsigned long orig_ip = instruction_pointer(regs);
+
/* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
- regs->ip = ip + sizeof(kprobe_opcode_t);
+ instruction_pointer_set(regs, ip + INT3_INSN_SIZE);
__this_cpu_write(current_kprobe, p);
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
if (!p->pre_handler || !p->pre_handler(p, regs)) {
- /*
- * Emulate singlestep (and also recover regs->ip)
- * as if there is a 5byte nop
- */
- regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
if (unlikely(p->post_handler)) {
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there is a 5byte nop
+ */
+ instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE);
kcb->kprobe_status = KPROBE_HIT_SSDONE;
p->post_handler(p, regs, 0);
}
- regs->ip = orig_ip;
+ /* Recover IP address */
+ instruction_pointer_set(regs, orig_ip);
}
/*
* If pre_handler returns !0, it changes regs->ip. We have to
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 06b080b61aa5..a43fc5af973d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -325,6 +325,22 @@ struct jit_context {
/* Number of bytes that will be skipped on tailcall */
#define X86_TAIL_CALL_OFFSET (12 + ENDBR_INSN_SIZE)
+static void push_r9(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT2(0x41, 0x51); /* push r9 */
+ *pprog = prog;
+}
+
+static void pop_r9(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT2(0x41, 0x59); /* pop r9 */
+ *pprog = prog;
+}
+
static void push_r12(u8 **pprog)
{
u8 *prog = *pprog;
@@ -1404,6 +1420,24 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
*pprog = prog;
}
+static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
+{
+ u8 *prog = *pprog;
+
+ /* movabs r9, priv_frame_ptr */
+ emit_mov_imm64(&prog, X86_REG_R9, (__force long) priv_frame_ptr >> 32,
+ (u32) (__force long) priv_frame_ptr);
+
+#ifdef CONFIG_SMP
+ /* add <r9>, gs:[<off>] */
+ EMIT2(0x65, 0x4c);
+ EMIT3(0x03, 0x0c, 0x25);
+ EMIT((u32)(unsigned long)&this_cpu_off, 4);
+#endif
+
+ *pprog = prog;
+}
+
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
#define __LOAD_TCC_PTR(off) \
@@ -1412,6 +1446,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
#define LOAD_TAIL_CALL_CNT_PTR(stack) \
__LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
+/* Memory size/value to protect private stack overflow/underflow */
+#define PRIV_STACK_GUARD_SZ 8
+#define PRIV_STACK_GUARD_VAL 0xEB9F12345678eb9fULL
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
@@ -1421,18 +1459,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
int insn_cnt = bpf_prog->len;
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+ void __percpu *priv_frame_ptr = NULL;
u64 arena_vm_start, user_vm_start;
+ void __percpu *priv_stack_ptr;
int i, excnt = 0;
int ilen, proglen = 0;
u8 *prog = temp;
+ u32 stack_depth;
int err;
+ stack_depth = bpf_prog->aux->stack_depth;
+ priv_stack_ptr = bpf_prog->aux->priv_stack_ptr;
+ if (priv_stack_ptr) {
+ priv_frame_ptr = priv_stack_ptr + PRIV_STACK_GUARD_SZ + round_up(stack_depth, 8);
+ stack_depth = 0;
+ }
+
arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
detect_reg_usage(insn, insn_cnt, callee_regs_used);
- emit_prologue(&prog, bpf_prog->aux->stack_depth,
+ emit_prologue(&prog, stack_depth,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
/* Exception callback will clobber callee regs for its own use, and
@@ -1454,6 +1502,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
emit_mov_imm64(&prog, X86_REG_R12,
arena_vm_start >> 32, (u32) arena_vm_start);
+ if (priv_frame_ptr)
+ emit_priv_frame_ptr(&prog, priv_frame_ptr);
+
ilen = prog - temp;
if (rw_image)
memcpy(rw_image + proglen, temp, ilen);
@@ -1473,6 +1524,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
u8 *func;
int nops;
+ if (priv_frame_ptr) {
+ if (src_reg == BPF_REG_FP)
+ src_reg = X86_REG_R9;
+
+ if (dst_reg == BPF_REG_FP)
+ dst_reg = X86_REG_R9;
+ }
+
switch (insn->code) {
/* ALU */
case BPF_ALU | BPF_ADD | BPF_X:
@@ -2127,15 +2186,21 @@ populate_extable:
u8 *ip = image + addrs[i - 1];
func = (u8 *) __bpf_call_base + imm32;
- if (tail_call_reachable) {
- LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
+ if (src_reg == BPF_PSEUDO_CALL && tail_call_reachable) {
+ LOAD_TAIL_CALL_CNT_PTR(stack_depth);
ip += 7;
}
if (!imm32)
return -EINVAL;
+ if (priv_frame_ptr) {
+ push_r9(&prog);
+ ip += 2;
+ }
ip += x86_call_depth_emit_accounting(&prog, func, ip);
if (emit_call(&prog, func, ip))
return -EINVAL;
+ if (priv_frame_ptr)
+ pop_r9(&prog);
break;
}
@@ -2145,13 +2210,13 @@ populate_extable:
&bpf_prog->aux->poke_tab[imm32 - 1],
&prog, image + addrs[i - 1],
callee_regs_used,
- bpf_prog->aux->stack_depth,
+ stack_depth,
ctx);
else
emit_bpf_tail_call_indirect(bpf_prog,
&prog,
callee_regs_used,
- bpf_prog->aux->stack_depth,
+ stack_depth,
image + addrs[i - 1],
ctx);
break;
@@ -3303,6 +3368,42 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
}
+static const char *bpf_get_prog_name(struct bpf_prog *prog)
+{
+ if (prog->aux->ksym.prog)
+ return prog->aux->ksym.name;
+ return prog->aux->name;
+}
+
+static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
+{
+ int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+ u64 *stack_ptr;
+
+ for_each_possible_cpu(cpu) {
+ stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+ stack_ptr[0] = PRIV_STACK_GUARD_VAL;
+ stack_ptr[underflow_idx] = PRIV_STACK_GUARD_VAL;
+ }
+}
+
+static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size,
+ struct bpf_prog *prog)
+{
+ int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+ u64 *stack_ptr;
+
+ for_each_possible_cpu(cpu) {
+ stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+ if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
+ stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) {
+ pr_err("BPF private stack overflow/underflow detected for prog %sx\n",
+ bpf_get_prog_name(prog));
+ break;
+ }
+ }
+}
+
struct x64_jit_data {
struct bpf_binary_header *rw_header;
struct bpf_binary_header *header;
@@ -3320,7 +3421,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
struct bpf_binary_header *rw_header = NULL;
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
+ void __percpu *priv_stack_ptr = NULL;
struct x64_jit_data *jit_data;
+ int priv_stack_alloc_sz;
int proglen, oldproglen = 0;
struct jit_context ctx = {};
bool tmp_blinded = false;
@@ -3356,6 +3459,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
prog->aux->jit_data = jit_data;
}
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+ /* Allocate actual private stack size with verifier-calculated
+ * stack size plus two memory guards to protect overflow and
+ * underflow.
+ */
+ priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) +
+ 2 * PRIV_STACK_GUARD_SZ;
+ priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 8, GFP_KERNEL);
+ if (!priv_stack_ptr) {
+ prog = orig_prog;
+ goto out_priv_stack;
+ }
+
+ priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz);
+ prog->aux->priv_stack_ptr = priv_stack_ptr;
+ }
addrs = jit_data->addrs;
if (addrs) {
ctx = jit_data->ctx;
@@ -3491,6 +3611,11 @@ out_image:
bpf_prog_fill_jited_linfo(prog, addrs + 1);
out_addrs:
kvfree(addrs);
+ if (!image && priv_stack_ptr) {
+ free_percpu(priv_stack_ptr);
+ prog->aux->priv_stack_ptr = NULL;
+ }
+out_priv_stack:
kfree(jit_data);
prog->aux->jit_data = NULL;
}
@@ -3529,6 +3654,8 @@ void bpf_jit_free(struct bpf_prog *prog)
if (prog->jited) {
struct x64_jit_data *jit_data = prog->aux->jit_data;
struct bpf_binary_header *hdr;
+ void __percpu *priv_stack_ptr;
+ int priv_stack_alloc_sz;
/*
* If we fail the final pass of JIT (from jit_subprogs),
@@ -3544,6 +3671,13 @@ void bpf_jit_free(struct bpf_prog *prog)
prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (priv_stack_ptr) {
+ priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) +
+ 2 * PRIV_STACK_GUARD_SZ;
+ priv_stack_check_guard(priv_stack_ptr, priv_stack_alloc_sz, prog);
+ free_percpu(prog->aux->priv_stack_ptr);
+ }
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
}
@@ -3559,6 +3693,11 @@ bool bpf_jit_supports_exceptions(void)
return IS_ENABLED(CONFIG_UNWINDER_ORC);
}
+bool bpf_jit_supports_private_stack(void)
+{
+ return true;
+}
+
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
#if defined(CONFIG_UNWINDER_ORC)
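
To make the private stack guard layout from the bpf_jit_comp.c changes concrete, a worked example with an assumed stack_depth of 64 (the number is illustrative only): round_up(64, 8) = 64, so priv_stack_alloc_sz = 64 + 2 * PRIV_STACK_GUARD_SZ = 80 bytes per CPU, and priv_frame_ptr = priv_stack_ptr + 8 + 64, i.e. byte offset 72 into the per-CPU area. Each CPU's copy then looks like:

        bytes  0..7    stack_ptr[0]               overflow guard  (PRIV_STACK_GUARD_VAL)
        bytes  8..71   BPF stack frame            addressed as [r9 - 1] .. [r9 - 64]
        bytes 72..79   stack_ptr[(80 - 8) >> 3]   underflow guard (PRIV_STACK_GUARD_VAL)

emit_priv_frame_ptr() turns the per-CPU offset into the current CPU's address by adding gs:[this_cpu_off] to r9, so in-bounds frame accesses stay within bytes 8..71; a write below that range corrupts word 0 and a write at or above r9 corrupts word 9, which priv_stack_check_guard() reports when the program is freed.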
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 88a96816de9a..a7ff189421c3 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -54,14 +54,12 @@
#include <asm/uv/uv.h>
static unsigned long efi_systab_phys __initdata;
-static unsigned long prop_phys = EFI_INVALID_TABLE_ADDR;
static unsigned long uga_phys = EFI_INVALID_TABLE_ADDR;
static unsigned long efi_runtime, efi_nr_tables;
unsigned long efi_fw_vendor, efi_config_table;
static const efi_config_table_type_t arch_tables[] __initconst = {
- {EFI_PROPERTIES_TABLE_GUID, &prop_phys, "PROP" },
{UGA_IO_PROTOCOL_GUID, &uga_phys, "UGA" },
#ifdef CONFIG_X86_UV
{UV_SYSTEM_TABLE_GUID, &uv_systab_phys, "UVsystab" },
@@ -82,7 +80,6 @@ static const unsigned long * const efi_tables[] = {
&efi_runtime,
&efi_config_table,
&efi.esrt,
- &prop_phys,
&efi_mem_attr_table,
#ifdef CONFIG_EFI_RCI2_TABLE
&rci2_table_phys,
@@ -502,22 +499,6 @@ void __init efi_init(void)
return;
}
- /* Parse the EFI Properties table if it exists */
- if (prop_phys != EFI_INVALID_TABLE_ADDR) {
- efi_properties_table_t *tbl;
-
- tbl = early_memremap_ro(prop_phys, sizeof(*tbl));
- if (tbl == NULL) {
- pr_err("Could not map Properties table!\n");
- } else {
- if (tbl->memory_protection_attribute &
- EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA)
- set_bit(EFI_NX_PE_DATA, &efi.flags);
-
- early_memunmap(tbl, sizeof(*tbl));
- }
- }
-
set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
efi_clean_memmap();
@@ -784,6 +765,7 @@ static void __init kexec_enter_virtual_mode(void)
efi_sync_low_kernel_mappings();
efi_native_runtime_setup();
+ efi_runtime_update_mappings();
#endif
}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 91d31ac422d6..ac57259a432b 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -412,51 +412,9 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m
void __init efi_runtime_update_mappings(void)
{
- efi_memory_desc_t *md;
-
- /*
- * Use the EFI Memory Attribute Table for mapping permissions if it
- * exists, since it is intended to supersede EFI_PROPERTIES_TABLE.
- */
if (efi_enabled(EFI_MEM_ATTR)) {
efi_disable_ibt_for_runtime = false;
efi_memattr_apply_permissions(NULL, efi_update_mem_attr);
- return;
- }
-
- /*
- * EFI_MEMORY_ATTRIBUTES_TABLE is intended to replace
- * EFI_PROPERTIES_TABLE. So, use EFI_PROPERTIES_TABLE to update
- * permissions only if EFI_MEMORY_ATTRIBUTES_TABLE is not
- * published by the firmware. Even if we find a buggy implementation of
- * EFI_MEMORY_ATTRIBUTES_TABLE, don't fall back to
- * EFI_PROPERTIES_TABLE, because of the same reason.
- */
-
- if (!efi_enabled(EFI_NX_PE_DATA))
- return;
-
- for_each_efi_memory_desc(md) {
- unsigned long pf = 0;
-
- if (!(md->attribute & EFI_MEMORY_RUNTIME))
- continue;
-
- if (!(md->attribute & EFI_MEMORY_WB))
- pf |= _PAGE_PCD;
-
- if ((md->attribute & EFI_MEMORY_XP) ||
- (md->type == EFI_RUNTIME_SERVICES_DATA))
- pf |= _PAGE_NX;
-
- if (!(md->attribute & EFI_MEMORY_RO) &&
- (md->type != EFI_RUNTIME_SERVICES_CODE))
- pf |= _PAGE_RW;
-
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- pf |= _PAGE_ENC;
-
- efi_update_mappings(md, pf);
}
}