author     Christian Brauner <brauner@kernel.org>  2024-11-26 18:15:06 +0100
committer  Christian Brauner <brauner@kernel.org>  2024-11-26 18:15:06 +0100
commit     cf87766dd6f9ddcceaa8ee26e3cbd7538e42dd19 (patch)
tree       8531685628a090333db2f874688ac07624b51072 /arch/x86
parent     c66f759832a83cb273ba5a55c66dcc99384efa74 (diff)
parent     2957fa4931a3b658d8e54eda9439d4c57967e8ad (diff)
Merge branch 'ovl.fixes'
Bring in an overlayfs fix for v6.13-rc1 that fixes a bug introduced by
the overlayfs changes merged for v6.13.
Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'arch/x86')
-rw-r--r--   arch/x86/Kconfig                        17
-rw-r--r--   arch/x86/coco/tdx/tdx.c                138
-rw-r--r--   arch/x86/include/asm/cpuid.h             8
-rw-r--r--   arch/x86/include/asm/ftrace.h           30
-rw-r--r--   arch/x86/include/asm/io.h                5
-rw-r--r--   arch/x86/include/asm/shared/tdx.h       13
-rw-r--r--   arch/x86/include/uapi/asm/amd_hsmp.h     3
-rw-r--r--   arch/x86/kernel/cpu/common.c            39
-rw-r--r--   arch/x86/kernel/cpu/sgx/main.c           2
-rw-r--r--   arch/x86/kernel/devicetree.c             2
-rw-r--r--   arch/x86/kernel/early-quirks.c           2
-rw-r--r--   arch/x86/kernel/ftrace.c                 2
-rw-r--r--   arch/x86/kernel/kprobes/ftrace.c        19
-rw-r--r--   arch/x86/net/bpf_jit_comp.c            149
-rw-r--r--   arch/x86/platform/efi/efi.c             20
-rw-r--r--   arch/x86/platform/efi/efi_64.c          42
16 files changed, 328 insertions, 163 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a3c31b784edc..948707a3615e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2564,15 +2564,14 @@ config MITIGATION_CALL_DEPTH_TRACKING
 	default y
 	help
 	  Compile the kernel with call depth tracking to mitigate the Intel
-	  SKL Return-Speculation-Buffer (RSB) underflow issue. The
-	  mitigation is off by default and needs to be enabled on the
-	  kernel command line via the retbleed=stuff option. For
-	  non-affected systems the overhead of this option is marginal as
-	  the call depth tracking is using run-time generated call thunks
-	  in a compiler generated padding area and call patching. This
-	  increases text size by ~5%. For non affected systems this space
-	  is unused. On affected SKL systems this results in a significant
-	  performance gain over the IBRS mitigation.
+	  SKL Return-Stack-Buffer (RSB) underflow issue. The mitigation is off
+	  by default and needs to be enabled on the kernel command line via the
+	  retbleed=stuff option. For non-affected systems the overhead of this
+	  option is marginal as the call depth tracking is using run-time
+	  generated call thunks in a compiler generated padding area and call
+	  patching. This increases text size by ~5%. For non affected systems
+	  this space is unused. On affected SKL systems this results in a
+	  significant performance gain over the IBRS mitigation.
 
 config CALL_THUNKS_DEBUG
 	bool "Enable call thunks and call depth tracking debugging"
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 327c45c5013f..0d9b090b4880 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -78,6 +78,32 @@ static inline void tdcall(u64 fn, struct tdx_module_args *args)
 		panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
 }
 
+/* Read TD-scoped metadata */
+static inline u64 tdg_vm_rd(u64 field, u64 *value)
+{
+	struct tdx_module_args args = {
+		.rdx = field,
+	};
+	u64 ret;
+
+	ret = __tdcall_ret(TDG_VM_RD, &args);
+	*value = args.r8;
+
+	return ret;
+}
+
+/* Write TD-scoped metadata */
+static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask)
+{
+	struct tdx_module_args args = {
+		.rdx = field,
+		.r8 = value,
+		.r9 = mask,
+	};
+
+	return __tdcall(TDG_VM_WR, &args);
+}
+
 /**
  * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
  *			     subtype 0) using TDG.MR.REPORT TDCALL.
@@ -168,7 +194,87 @@ static void __noreturn tdx_panic(const char *msg)
 	__tdx_hypercall(&args);
 }
 
-static void tdx_parse_tdinfo(u64 *cc_mask)
+/*
+ * The kernel cannot handle #VEs when accessing normal kernel memory. Ensure
+ * that no #VE will be delivered for accesses to TD-private memory.
+ *
+ * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
+ * controls if the guest will receive such #VE with TD attribute
+ * ATTR_SEPT_VE_DISABLE.
+ *
+ * Newer TDX modules allow the guest to control if it wants to receive SEPT
+ * violation #VEs.
+ *
+ * Check if the feature is available and disable SEPT #VE if possible.
+ *
+ * If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE
+ * attribute is no longer reliable. It reflects the initial state of the
+ * control for the TD, but it will not be updated if someone (e.g. bootloader)
+ * changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
+ * determine if SEPT #VEs are enabled or disabled.
+ */
+static void disable_sept_ve(u64 td_attr)
+{
+	const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
+	bool debug = td_attr & ATTR_DEBUG;
+	u64 config, controls;
+
+	/* Is this TD allowed to disable SEPT #VE */
+	tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
+	if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
+		/* No SEPT #VE controls for the guest: check the attribute */
+		if (td_attr & ATTR_SEPT_VE_DISABLE)
+			return;
+
+		/* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
+		if (debug)
+			pr_warn("%s\n", msg);
+		else
+			tdx_panic(msg);
+		return;
+	}
+
+	/* Check if SEPT #VE has been disabled before us */
+	tdg_vm_rd(TDCS_TD_CTLS, &controls);
+	if (controls & TD_CTLS_PENDING_VE_DISABLE)
+		return;
+
+	/* Keep #VEs enabled for splats in debugging environments */
+	if (debug)
+		return;
+
+	/* Disable SEPT #VEs */
+	tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE,
+		  TD_CTLS_PENDING_VE_DISABLE);
+}
+
+/*
+ * TDX 1.0 generates a #VE when accessing topology-related CPUID leafs (0xB and
+ * 0x1F) and the X2APIC_APICID MSR. The kernel returns all zeros on CPUID #VEs.
+ * In practice, this means that the kernel can only boot with a plain topology.
+ * Any complications will cause problems.
+ *
+ * The ENUM_TOPOLOGY feature allows the VMM to provide topology information.
+ * Enabling the feature eliminates topology-related #VEs: the TDX module
+ * virtualizes accesses to the CPUID leafs and the MSR.
+ *
+ * Enable ENUM_TOPOLOGY if it is available.
+ */
+static void enable_cpu_topology_enumeration(void)
+{
+	u64 configured;
+
+	/* Has the VMM provided a valid topology configuration? */
+	tdg_vm_rd(TDCS_TOPOLOGY_ENUM_CONFIGURED, &configured);
+	if (!configured) {
+		pr_err("VMM did not configure X2APIC_IDs properly\n");
+		return;
+	}
+
+	tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
+}
+
+static void tdx_setup(u64 *cc_mask)
 {
 	struct tdx_module_args args = {};
 	unsigned int gpa_width;
@@ -193,21 +299,13 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
 	gpa_width = args.rcx & GENMASK(5, 0);
 	*cc_mask = BIT_ULL(gpa_width - 1);
 
-	/*
-	 * The kernel can not handle #VE's when accessing normal kernel
-	 * memory. Ensure that no #VE will be delivered for accesses to
-	 * TD-private memory. Only VMM-shared memory (MMIO) will #VE.
-	 */
 	td_attr = args.rdx;
-	if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
-		const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
-
-		/* Relax SEPT_VE_DISABLE check for debug TD. */
-		if (td_attr & ATTR_DEBUG)
-			pr_warn("%s\n", msg);
-		else
-			tdx_panic(msg);
-	}
+	/* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
+	tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
+
+	disable_sept_ve(td_attr);
+	enable_cpu_topology_enumeration();
 }
 
 /*
@@ -929,10 +1027,6 @@ static void tdx_kexec_finish(void)
 
 void __init tdx_early_init(void)
 {
-	struct tdx_module_args args = {
-		.rdx = TDCS_NOTIFY_ENABLES,
-		.r9 = -1ULL,
-	};
 	u64 cc_mask;
 	u32 eax, sig[3];
 
@@ -947,11 +1041,11 @@ void __init tdx_early_init(void)
 	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
 
 	cc_vendor = CC_VENDOR_INTEL;
-	tdx_parse_tdinfo(&cc_mask);
-	cc_set_mask(cc_mask);
 
-	/* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
-	tdcall(TDG_VM_WR, &args);
+	/* Configure the TD */
+	tdx_setup(&cc_mask);
+
+	cc_set_mask(cc_mask);
 
 	/*
 	 * All bits above GPA width are reserved and kernel treats shared bit
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
index ca4243318aad..239b9ba5c398 100644
--- a/arch/x86/include/asm/cpuid.h
+++ b/arch/x86/include/asm/cpuid.h
@@ -6,6 +6,8 @@
 #ifndef _ASM_X86_CPUID_H
 #define _ASM_X86_CPUID_H
 
+#include <linux/types.h>
+
 #include <asm/string.h>
 
 struct cpuid_regs {
@@ -20,11 +22,11 @@ enum cpuid_regs_idx {
 };
 
 #ifdef CONFIG_X86_32
-extern int have_cpuid_p(void);
+bool have_cpuid_p(void);
 #else
-static inline int have_cpuid_p(void)
+static inline bool have_cpuid_p(void)
 {
-	return 1;
+	return true;
 }
 #endif
 static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index b4d719de2c84..6e8cf0fa48fc 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -35,37 +35,21 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 }
 
 #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
-struct ftrace_regs {
-	struct pt_regs		regs;
-};
+
+#include <linux/ftrace_regs.h>
 
 static __always_inline struct pt_regs *
 arch_ftrace_get_regs(struct ftrace_regs *fregs)
 {
 	/* Only when FL_SAVE_REGS is set, cs will be non zero */
-	if (!fregs->regs.cs)
+	if (!arch_ftrace_regs(fregs)->regs.cs)
 		return NULL;
-	return &fregs->regs;
+	return &arch_ftrace_regs(fregs)->regs;
 }
 
 #define ftrace_regs_set_instruction_pointer(fregs, _ip)	\
-	do { (fregs)->regs.ip = (_ip); } while (0)
-
-#define ftrace_regs_get_instruction_pointer(fregs) \
-	((fregs)->regs.ip)
-
-#define ftrace_regs_get_argument(fregs, n) \
-	regs_get_kernel_argument(&(fregs)->regs, n)
-#define ftrace_regs_get_stack_pointer(fregs) \
-	kernel_stack_pointer(&(fregs)->regs)
-#define ftrace_regs_return_value(fregs) \
-	regs_return_value(&(fregs)->regs)
-#define ftrace_regs_set_return_value(fregs, ret) \
-	regs_set_return_value(&(fregs)->regs, ret)
-#define ftrace_override_function_with_return(fregs) \
-	override_function_with_return(&(fregs)->regs)
-#define ftrace_regs_query_register_offset(name) \
-	regs_query_register_offset(name)
+	do { arch_ftrace_regs(fregs)->regs.ip = (_ip); } while (0)
+
 struct ftrace_ops;
 #define ftrace_graph_func ftrace_graph_func
@@ -90,7 +74,7 @@ __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
 	regs->orig_ax = addr;
 }
 #define arch_ftrace_set_direct_caller(fregs, addr) \
-	__arch_ftrace_set_direct_caller(&(fregs)->regs, addr)
+	__arch_ftrace_set_direct_caller(&arch_ftrace_regs(fregs)->regs, addr)
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 1d60427379c9..ed580c7f9d0a 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -152,11 +152,6 @@ static inline void *phys_to_virt(phys_addr_t address)
 #define phys_to_virt phys_to_virt
 
 /*
- * Change "struct page" to physical address.
- */
-#define page_to_phys(page)    ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
-
-/*
  * ISA I/O bus memory addresses are 1:1 with the physical address.
  * However, we truncate the address to unsigned int to avoid undesirable
  * promotions in legacy drivers.
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index fdfd41511b02..89f7fcade8ae 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -16,10 +16,21 @@
 #define TDG_VP_VEINFO_GET		3
 #define TDG_MR_REPORT			4
 #define TDG_MEM_PAGE_ACCEPT		6
+#define TDG_VM_RD			7
 #define TDG_VM_WR			8
 
-/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
+/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */
+#define TDCS_CONFIG_FLAGS		0x1110000300000016
+#define TDCS_TD_CTLS			0x1110000300000017
 #define TDCS_NOTIFY_ENABLES		0x9100000000000010
+#define TDCS_TOPOLOGY_ENUM_CONFIGURED	0x9100000000000019
+
+/* TDCS_CONFIG_FLAGS bits */
+#define TDCS_CONFIG_FLEXIBLE_PENDING_VE	BIT_ULL(1)
+
+/* TDCS_TD_CTLS bits */
+#define TD_CTLS_PENDING_VE_DISABLE	BIT_ULL(0)
+#define TD_CTLS_ENUM_TOPOLOGY		BIT_ULL(1)
 
 /* TDX hypercall Leaf IDs */
 #define TDVMCALL_MAP_GPA		0x10001
diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h
index e5d182c7373c..4a7cace06204 100644
--- a/arch/x86/include/uapi/asm/amd_hsmp.h
+++ b/arch/x86/include/uapi/asm/amd_hsmp.h
@@ -88,7 +88,8 @@ struct hsmp_msg_desc {
  *
  * Not supported messages would return -ENOMSG.
  */
-static const struct hsmp_msg_desc hsmp_msg_desc_table[] = {
+static const struct hsmp_msg_desc hsmp_msg_desc_table[]
+			__attribute__((unused)) = {
 	/* RESERVED */
 	{0, 0, HSMP_RSVD},
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 02637365d1a9..06a516f6795b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -276,21 +276,13 @@ static int __init x86_noinvpcid_setup(char *s)
 }
 early_param("noinvpcid", x86_noinvpcid_setup);
 
-#ifdef CONFIG_X86_32
-static int cachesize_override = -1;
-static int disable_x86_serial_nr = 1;
-
-static int __init cachesize_setup(char *str)
-{
-	get_option(&str, &cachesize_override);
-	return 1;
-}
-__setup("cachesize=", cachesize_setup);
-
 /* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
+static inline bool flag_is_changeable_p(unsigned long flag)
 {
-	u32 f1, f2;
+	unsigned long f1, f2;
+
+	if (!IS_ENABLED(CONFIG_X86_32))
+		return true;
 
 	/*
 	 * Cyrix and IDT cpus allow disabling of CPUID
@@ -313,11 +305,22 @@ static inline int flag_is_changeable_p(u32 flag)
 		: "=&r" (f1), "=&r" (f2)
 		: "ir" (flag));
 
-	return ((f1^f2) & flag) != 0;
+	return (f1 ^ f2) & flag;
 }
 
+#ifdef CONFIG_X86_32
+static int cachesize_override = -1;
+static int disable_x86_serial_nr = 1;
+
+static int __init cachesize_setup(char *str)
+{
+	get_option(&str, &cachesize_override);
+	return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
 /* Probe for the CPUID instruction */
-int have_cpuid_p(void)
+bool have_cpuid_p(void)
 {
 	return flag_is_changeable_p(X86_EFLAGS_ID);
 }
@@ -349,10 +352,6 @@ static int __init x86_serial_nr_setup(char *s)
 }
 __setup("serialnumber", x86_serial_nr_setup);
 #else
-static inline int flag_is_changeable_p(u32 flag)
-{
-	return 1;
-}
 static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { }
@@ -1088,7 +1087,6 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c)
 
 static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_X86_32
 	int i;
 
 	/*
@@ -1109,7 +1107,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
 			break;
 		}
 	}
-#endif
 }
 
 #define NO_SPECULATION		BIT(0)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index eb5848d1851a..8ce352fc72ac 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -630,7 +630,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
 	if (!section->virt_addr)
 		return false;
 
-	section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
+	section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));
 	if (!section->pages) {
 		memunmap(section->virt_addr);
 		return false;
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 64280879c68c..59d23cdf4ed0 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -305,7 +305,7 @@ void __init x86_flattree_get_config(void)
 			map_len = size;
 		}
 
-		early_init_dt_verify(dt);
+		early_init_dt_verify(dt, __pa(dt));
 	}
 
 	unflatten_and_copy_device_tree();
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 29d1f9104e94..6b6f32f40cbe 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,7 +18,7 @@
 #include <linux/bcma/bcma_regs.h>
 #include <linux/platform_data/x86/apple.h>
 #include <drm/intel/i915_drm.h>
-#include <drm/intel/i915_pciids.h>
+#include <drm/intel/pciids.h>
 #include <asm/pci-direct.h>
 #include <asm/dma.h>
 #include <asm/io_apic.h>
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8da0e66ca22d..adb09f78edb2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -647,7 +647,7 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-	struct pt_regs *regs = &fregs->regs;
+	struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs;
 	unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
 
 	prepare_ftrace_return(ip, (unsigned long *)stack, 0);
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 15af7e98e161..2be55ec3f392 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -9,6 +9,7 @@
 #include <linux/hardirq.h>
 #include <linux/preempt.h>
 #include <linux/ftrace.h>
+#include <asm/text-patching.h>
 
 #include "common.h"
 
@@ -36,23 +37,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	if (kprobe_running()) {
 		kprobes_inc_nmissed_count(p);
 	} else {
-		unsigned long orig_ip = regs->ip;
+		unsigned long orig_ip = instruction_pointer(regs);
+
 		/* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
-		regs->ip = ip + sizeof(kprobe_opcode_t);
+		instruction_pointer_set(regs, ip + INT3_INSN_SIZE);
 
 		__this_cpu_write(current_kprobe, p);
 		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 		if (!p->pre_handler || !p->pre_handler(p, regs)) {
-			/*
-			 * Emulate singlestep (and also recover regs->ip)
-			 * as if there is a 5byte nop
-			 */
-			regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
 			if (unlikely(p->post_handler)) {
+				/*
+				 * Emulate singlestep (and also recover regs->ip)
+				 * as if there is a 5byte nop
+				 */
+				instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE);
 				kcb->kprobe_status = KPROBE_HIT_SSDONE;
 				p->post_handler(p, regs, 0);
 			}
-			regs->ip = orig_ip;
+			/* Recover IP address */
+			instruction_pointer_set(regs, orig_ip);
 		}
 		/*
 		 * If pre_handler returns !0, it changes regs->ip. We have to
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 06b080b61aa5..a43fc5af973d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -325,6 +325,22 @@ struct jit_context {
 /* Number of bytes that will be skipped on tailcall */
 #define X86_TAIL_CALL_OFFSET	(12 + ENDBR_INSN_SIZE)
 
+static void push_r9(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x51);   /* push r9 */
+	*pprog = prog;
+}
+
+static void pop_r9(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x59);   /* pop r9 */
+	*pprog = prog;
+}
+
 static void push_r12(u8 **pprog)
 {
 	u8 *prog = *pprog;
@@ -1404,6 +1420,24 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 	*pprog = prog;
 }
 
+static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
+{
+	u8 *prog = *pprog;
+
+	/* movabs r9, priv_frame_ptr */
+	emit_mov_imm64(&prog, X86_REG_R9, (__force long) priv_frame_ptr >> 32,
+		       (u32) (__force long) priv_frame_ptr);
+
+#ifdef CONFIG_SMP
+	/* add <r9>, gs:[<off>] */
+	EMIT2(0x65, 0x4c);
+	EMIT3(0x03, 0x0c, 0x25);
+	EMIT((u32)(unsigned long)&this_cpu_off, 4);
+#endif
+
+	*pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 #define __LOAD_TCC_PTR(off)			\
@@ -1412,6 +1446,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 #define LOAD_TAIL_CALL_CNT_PTR(stack)				\
 	__LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
 
+/* Memory size/value to protect private stack overflow/underflow */
+#define PRIV_STACK_GUARD_SZ	8
+#define PRIV_STACK_GUARD_VAL	0xEB9F12345678eb9fULL
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
 		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
@@ -1421,18 +1459,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	int insn_cnt = bpf_prog->len;
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	void __percpu *priv_frame_ptr = NULL;
 	u64 arena_vm_start, user_vm_start;
+	void __percpu *priv_stack_ptr;
 	int i, excnt = 0;
 	int ilen, proglen = 0;
 	u8 *prog = temp;
+	u32 stack_depth;
 	int err;
 
+	stack_depth = bpf_prog->aux->stack_depth;
+	priv_stack_ptr = bpf_prog->aux->priv_stack_ptr;
+	if (priv_stack_ptr) {
+		priv_frame_ptr = priv_stack_ptr + PRIV_STACK_GUARD_SZ + round_up(stack_depth, 8);
+		stack_depth = 0;
+	}
+
 	arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
 	user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
 
 	detect_reg_usage(insn, insn_cnt, callee_regs_used);
 
-	emit_prologue(&prog, bpf_prog->aux->stack_depth,
+	emit_prologue(&prog, stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
 		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
 	/* Exception callback will clobber callee regs for its own use, and
@@ -1454,6 +1502,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		emit_mov_imm64(&prog, X86_REG_R12,
 			       arena_vm_start >> 32, (u32) arena_vm_start);
 
+	if (priv_frame_ptr)
+		emit_priv_frame_ptr(&prog, priv_frame_ptr);
+
 	ilen = prog - temp;
 	if (rw_image)
 		memcpy(rw_image + proglen, temp, ilen);
@@ -1473,6 +1524,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		u8 *func;
 		int nops;
 
+		if (priv_frame_ptr) {
+			if (src_reg == BPF_REG_FP)
+				src_reg = X86_REG_R9;
+
+			if (dst_reg == BPF_REG_FP)
+				dst_reg = X86_REG_R9;
+		}
+
 		switch (insn->code) {
 			/* ALU */
 		case BPF_ALU | BPF_ADD | BPF_X:
@@ -2127,15 +2186,21 @@ populate_extable:
 			u8 *ip = image + addrs[i - 1];
 
 			func = (u8 *) __bpf_call_base + imm32;
-			if (tail_call_reachable) {
-				LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
+			if (src_reg == BPF_PSEUDO_CALL && tail_call_reachable) {
+				LOAD_TAIL_CALL_CNT_PTR(stack_depth);
 				ip += 7;
 			}
 			if (!imm32)
 				return -EINVAL;
+			if (priv_frame_ptr) {
+				push_r9(&prog);
+				ip += 2;
+			}
 			ip += x86_call_depth_emit_accounting(&prog, func, ip);
 			if (emit_call(&prog, func, ip))
 				return -EINVAL;
+			if (priv_frame_ptr)
+				pop_r9(&prog);
 			break;
 		}
 
@@ -2145,13 +2210,13 @@ populate_extable:
 					&bpf_prog->aux->poke_tab[imm32 - 1],
 					&prog, image + addrs[i - 1],
 					callee_regs_used,
-					bpf_prog->aux->stack_depth,
+					stack_depth,
 					ctx);
 			else
 				emit_bpf_tail_call_indirect(bpf_prog,
 							    &prog,
 							    callee_regs_used,
-							    bpf_prog->aux->stack_depth,
+							    stack_depth,
 							    image + addrs[i - 1],
 							    ctx);
 			break;
@@ -3303,6 +3368,42 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 	return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
 }
 
+static const char *bpf_get_prog_name(struct bpf_prog *prog)
+{
+	if (prog->aux->ksym.prog)
+		return prog->aux->ksym.name;
+	return prog->aux->name;
+}
+
+static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
+{
+	int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+	u64 *stack_ptr;
+
+	for_each_possible_cpu(cpu) {
+		stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+		stack_ptr[0] = PRIV_STACK_GUARD_VAL;
+		stack_ptr[underflow_idx] = PRIV_STACK_GUARD_VAL;
+	}
+}
+
+static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size,
+				   struct bpf_prog *prog)
+{
+	int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+	u64 *stack_ptr;
+
+	for_each_possible_cpu(cpu) {
+		stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+		if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
+		    stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) {
+			pr_err("BPF private stack overflow/underflow detected for prog %sx\n",
+			       bpf_get_prog_name(prog));
+			break;
+		}
+	}
+}
+
 struct x64_jit_data {
 	struct bpf_binary_header *rw_header;
 	struct bpf_binary_header *header;
@@ -3320,7 +3421,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	struct bpf_binary_header *rw_header = NULL;
 	struct bpf_binary_header *header = NULL;
 	struct bpf_prog *tmp, *orig_prog = prog;
+	void __percpu *priv_stack_ptr = NULL;
 	struct x64_jit_data *jit_data;
+	int priv_stack_alloc_sz;
 	int proglen, oldproglen = 0;
 	struct jit_context ctx = {};
 	bool tmp_blinded = false;
@@ -3356,6 +3459,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		}
 		prog->aux->jit_data = jit_data;
 	}
+	priv_stack_ptr = prog->aux->priv_stack_ptr;
+	if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+		/* Allocate actual private stack size with verifier-calculated
+		 * stack size plus two memory guards to protect overflow and
+		 * underflow.
+		 */
+		priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) +
+				      2 * PRIV_STACK_GUARD_SZ;
+		priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 8, GFP_KERNEL);
+		if (!priv_stack_ptr) {
+			prog = orig_prog;
+			goto out_priv_stack;
+		}
+
+		priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz);
+		prog->aux->priv_stack_ptr = priv_stack_ptr;
+	}
 	addrs = jit_data->addrs;
 	if (addrs) {
 		ctx = jit_data->ctx;
@@ -3491,6 +3611,11 @@ out_image:
 		bpf_prog_fill_jited_linfo(prog, addrs + 1);
 out_addrs:
 		kvfree(addrs);
+		if (!image && priv_stack_ptr) {
+			free_percpu(priv_stack_ptr);
+			prog->aux->priv_stack_ptr = NULL;
+		}
+out_priv_stack:
 		kfree(jit_data);
 		prog->aux->jit_data = NULL;
 	}
@@ -3529,6 +3654,8 @@ void bpf_jit_free(struct bpf_prog *prog)
 	if (prog->jited) {
 		struct x64_jit_data *jit_data = prog->aux->jit_data;
 		struct bpf_binary_header *hdr;
+		void __percpu *priv_stack_ptr;
+		int priv_stack_alloc_sz;
 
 		/*
 		 * If we fail the final pass of JIT (from jit_subprogs),
@@ -3544,6 +3671,13 @@ void bpf_jit_free(struct bpf_prog *prog)
 		prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
 		hdr = bpf_jit_binary_pack_hdr(prog);
 		bpf_jit_binary_pack_free(hdr, NULL);
+		priv_stack_ptr = prog->aux->priv_stack_ptr;
+		if (priv_stack_ptr) {
+			priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) +
+					      2 * PRIV_STACK_GUARD_SZ;
+			priv_stack_check_guard(priv_stack_ptr, priv_stack_alloc_sz, prog);
+			free_percpu(prog->aux->priv_stack_ptr);
+		}
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
 	}
 
@@ -3559,6 +3693,11 @@ bool bpf_jit_supports_exceptions(void)
 	return IS_ENABLED(CONFIG_UNWINDER_ORC);
 }
 
+bool bpf_jit_supports_private_stack(void)
+{
+	return true;
+}
+
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 {
 #if defined(CONFIG_UNWINDER_ORC)
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 88a96816de9a..a7ff189421c3 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -54,14 +54,12 @@
 #include <asm/uv/uv.h>
 
 static unsigned long efi_systab_phys __initdata;
-static unsigned long prop_phys = EFI_INVALID_TABLE_ADDR;
 static unsigned long uga_phys = EFI_INVALID_TABLE_ADDR;
 static unsigned long efi_runtime, efi_nr_tables;
 
 unsigned long efi_fw_vendor, efi_config_table;
 
 static const efi_config_table_type_t arch_tables[] __initconst = {
-	{EFI_PROPERTIES_TABLE_GUID,	&prop_phys,		"PROP"		},
 	{UGA_IO_PROTOCOL_GUID,		&uga_phys,		"UGA"		},
 #ifdef CONFIG_X86_UV
 	{UV_SYSTEM_TABLE_GUID,		&uv_systab_phys,	"UVsystab"	},
@@ -82,7 +80,6 @@ static const unsigned long * const efi_tables[] = {
 	&efi_runtime,
 	&efi_config_table,
 	&efi.esrt,
-	&prop_phys,
 	&efi_mem_attr_table,
 #ifdef CONFIG_EFI_RCI2_TABLE
 	&rci2_table_phys,
@@ -502,22 +499,6 @@ void __init efi_init(void)
 		return;
 	}
 
-	/* Parse the EFI Properties table if it exists */
-	if (prop_phys != EFI_INVALID_TABLE_ADDR) {
-		efi_properties_table_t *tbl;
-
-		tbl = early_memremap_ro(prop_phys, sizeof(*tbl));
-		if (tbl == NULL) {
-			pr_err("Could not map Properties table!\n");
-		} else {
-			if (tbl->memory_protection_attribute &
-			    EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA)
-				set_bit(EFI_NX_PE_DATA, &efi.flags);
-
-			early_memunmap(tbl, sizeof(*tbl));
-		}
-	}
-
 	set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
 
 	efi_clean_memmap();
@@ -784,6 +765,7 @@ static void __init kexec_enter_virtual_mode(void)
 
 	efi_sync_low_kernel_mappings();
 	efi_native_runtime_setup();
+	efi_runtime_update_mappings();
 #endif
 }
 
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 91d31ac422d6..ac57259a432b 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -412,51 +412,9 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m
 
 void __init efi_runtime_update_mappings(void)
 {
-	efi_memory_desc_t *md;
-
-	/*
-	 * Use the EFI Memory Attribute Table for mapping permissions if it
-	 * exists, since it is intended to supersede EFI_PROPERTIES_TABLE.
-	 */
 	if (efi_enabled(EFI_MEM_ATTR)) {
 		efi_disable_ibt_for_runtime = false;
 		efi_memattr_apply_permissions(NULL, efi_update_mem_attr);
-		return;
-	}
-
-	/*
-	 * EFI_MEMORY_ATTRIBUTES_TABLE is intended to replace
-	 * EFI_PROPERTIES_TABLE. So, use EFI_PROPERTIES_TABLE to update
-	 * permissions only if EFI_MEMORY_ATTRIBUTES_TABLE is not
-	 * published by the firmware. Even if we find a buggy implementation of
-	 * EFI_MEMORY_ATTRIBUTES_TABLE, don't fall back to
-	 * EFI_PROPERTIES_TABLE, because of the same reason.
-	 */
-
-	if (!efi_enabled(EFI_NX_PE_DATA))
-		return;
-
-	for_each_efi_memory_desc(md) {
-		unsigned long pf = 0;
-
-		if (!(md->attribute & EFI_MEMORY_RUNTIME))
-			continue;
-
-		if (!(md->attribute & EFI_MEMORY_WB))
-			pf |= _PAGE_PCD;
-
-		if ((md->attribute & EFI_MEMORY_XP) ||
-		    (md->type == EFI_RUNTIME_SERVICES_DATA))
-			pf |= _PAGE_NX;
-
-		if (!(md->attribute & EFI_MEMORY_RO) &&
-		    (md->type != EFI_RUNTIME_SERVICES_CODE))
-			pf |= _PAGE_RW;
-
-		if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
-			pf |= _PAGE_ENC;
-
-		efi_update_mappings(md, pf);
 	}
 }