diff options
Diffstat (limited to 'arch/x86')
212 files changed, 4510 insertions, 3195 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 53bab123a8ee..7422db409770 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -71,6 +71,7 @@ config X86 select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION + select ARCH_HAS_CPU_FINALIZE_INIT select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE @@ -216,6 +217,7 @@ config X86 select HAVE_FAST_GUP select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE) select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS @@ -274,8 +276,11 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select HAVE_GENERIC_VDSO + select HOTPLUG_PARALLEL if SMP && X86_64 select HOTPLUG_SMT if SMP + select HOTPLUG_SPLIT_STARTUP if SMP && X86_32 select IRQ_FORCED_THREADING + select LOCK_MM_AND_FIND_VMA select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK select NEED_SG_DMA_LENGTH @@ -291,7 +296,6 @@ config X86 select TRACE_IRQFLAGS_NMI_SUPPORT select USER_STACKTRACE_SUPPORT select HAVE_ARCH_KCSAN if X86_64 - select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX select FUNCTION_ALIGNMENT_16B if X86_64 || X86_ALIGNMENT_16 @@ -441,17 +445,6 @@ config SMP If you don't know what to do here, say N. -config X86_FEATURE_NAMES - bool "Processor feature human-readable names" if EMBEDDED - default y - help - This option compiles in a table of x86 feature bits and corresponding - names. This is required to support /proc/cpuinfo and a few kernel - messages. You can disable this to save space, at the expense of - making those few kernel messages show numeric feature bits instead. - - If in doubt, say Y. - config X86_X2APIC bool "Support x2apic" depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST) @@ -884,9 +877,11 @@ config INTEL_TDX_GUEST bool "Intel TDX (Trust Domain Extensions) - Guest Support" depends on X86_64 && CPU_SUP_INTEL depends on X86_X2APIC + depends on EFI_STUB select ARCH_HAS_CC_PLATFORM select X86_MEM_ENCRYPT select X86_MCE + select UNACCEPTED_MEMORY help Support running as a guest under Intel TDX. Without this support, the guest kernel can not boot or run under TDX. @@ -1541,11 +1536,13 @@ config X86_MEM_ENCRYPT config AMD_MEM_ENCRYPT bool "AMD Secure Memory Encryption (SME) support" depends on X86_64 && CPU_SUP_AMD + depends on EFI_STUB select DMA_COHERENT_POOL select ARCH_USE_MEMREMAP_PROT select INSTRUCTION_DECODER select ARCH_HAS_CC_PLATFORM select X86_MEM_ENCRYPT + select UNACCEPTED_MEMORY help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory @@ -2305,49 +2302,6 @@ config HOTPLUG_CPU def_bool y depends on SMP -config BOOTPARAM_HOTPLUG_CPU0 - bool "Set default setting of cpu0_hotpluggable" - depends on HOTPLUG_CPU - help - Set whether default state of cpu0_hotpluggable is on or off. - - Say Y here to enable CPU0 hotplug by default. If this switch - is turned on, there is no need to give cpu0_hotplug kernel - parameter and the CPU0 hotplug feature is enabled by default. - - Please note: there are two known CPU0 dependencies if you want - to enable the CPU0 hotplug feature either by this switch or by - cpu0_hotplug kernel parameter. - - First, resume from hibernate or suspend always starts from CPU0. - So hibernate and suspend are prevented if CPU0 is offline. - - Second dependency is PIC interrupts always go to CPU0. CPU0 can not - offline if any interrupt can not migrate out of CPU0. There may - be other CPU0 dependencies. - - Please make sure the dependencies are under your control before - you enable this feature. - - Say N if you don't want to enable CPU0 hotplug feature by default. - You still can enable the CPU0 hotplug feature at boot by kernel - parameter cpu0_hotplug. - -config DEBUG_HOTPLUG_CPU0 - def_bool n - prompt "Debug CPU0 hotplug" - depends on HOTPLUG_CPU - help - Enabling this option offlines CPU0 (if CPU0 can be offlined) as - soon as possible and boots up userspace with CPU0 offlined. User - can online CPU0 back after boot time. - - To debug CPU0 hotplug, you need to enable CPU0 offline/online - feature by either turning on CONFIG_BOOTPARAM_HOTPLUG_CPU0 during - compilation or giving cpu0_hotplug kernel parameter at boot. - - If unsure, say N. - config COMPAT_VDSO def_bool n prompt "Disable the 32-bit vDSO (needed for glibc 2.3.3)" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 542377cd419d..00468adf180f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -389,7 +389,7 @@ config IA32_FEAT_CTL config X86_VMX_FEATURE_NAMES def_bool y - depends on IA32_FEAT_CTL && X86_FEATURE_NAMES + depends on IA32_FEAT_CTL menuconfig PROCESSOR_SELECT bool "Supported processor vendors" if EXPERT diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 89a02e69be5f..95315d3474a2 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -305,6 +305,18 @@ ifeq ($(RETPOLINE_CFLAGS),) endif endif +ifdef CONFIG_UNWINDER_ORC +orc_hash_h := arch/$(SRCARCH)/include/generated/asm/orc_hash.h +orc_hash_sh := $(srctree)/scripts/orc_hash.sh +targets += $(orc_hash_h) +quiet_cmd_orc_hash = GEN $@ + cmd_orc_hash = mkdir -p $(dir $@); \ + $(CONFIG_SHELL) $(orc_hash_sh) < $< > $@ +$(orc_hash_h): $(srctree)/arch/x86/include/asm/orc_types.h $(orc_hash_sh) FORCE + $(call if_changed,orc_hash) +archprepare: $(orc_hash_h) +endif + archclean: $(Q)rm -rf $(objtree)/arch/i386 $(Q)rm -rf $(objtree)/arch/x86_64 diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink new file mode 100644 index 000000000000..936093d29160 --- /dev/null +++ b/arch/x86/Makefile.postlink @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0 +# =========================================================================== +# Post-link x86 pass +# =========================================================================== +# +# 1. Separate relocations from vmlinux into vmlinux.relocs. +# 2. Strip relocations from vmlinux. + +PHONY := __archpost +__archpost: + +-include include/config/auto.conf +include $(srctree)/scripts/Kbuild.include + +CMD_RELOCS = arch/x86/tools/relocs +OUT_RELOCS = arch/x86/boot/compressed +quiet_cmd_relocs = RELOCS $(OUT_RELOCS)/$@.relocs + cmd_relocs = \ + mkdir -p $(OUT_RELOCS); \ + $(CMD_RELOCS) $@ > $(OUT_RELOCS)/$@.relocs; \ + $(CMD_RELOCS) --abs-relocs $@ + +quiet_cmd_strip_relocs = RSTRIP $@ + cmd_strip_relocs = \ + $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' \ + --remove-section='.rela.*' --remove-section='.rela__*' $@ + +# `@true` prevents complaint when there is nothing to be done + +vmlinux: FORCE + @true +ifeq ($(CONFIG_X86_NEED_RELOCS),y) + $(call cmd,relocs) + $(call cmd,strip_relocs) +endif + +%.ko: FORCE + @true + +clean: + @rm -f $(OUT_RELOCS)/vmlinux.relocs + +PHONY += FORCE clean + +FORCE: + +.PHONY: $(PHONY) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 9e38ffaadb5d..f33e45ed1437 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -55,14 +55,12 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include \ -include include/generated/autoconf.h \ -D__EXPORTED_HEADERS__ -ifdef CONFIG_X86_FEATURE_NAMES $(obj)/cpu.o: $(obj)/cpustr.h quiet_cmd_cpustr = CPUSTR $@ cmd_cpustr = $(obj)/mkcpustr > $@ $(obj)/cpustr.h: $(obj)/mkcpustr FORCE $(call if_changed,cpustr) -endif targets += cpustr.h # --------------------------------------------------------------------------- diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6b6cfe607bdb..40d2ff503079 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -106,7 +106,8 @@ ifdef CONFIG_X86_64 endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o -vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o $(obj)/tdx-shared.o +vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o @@ -121,11 +122,9 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs -CMD_RELOCS = arch/x86/tools/relocs -quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux FORCE - $(call if_changed,relocs) +# vmlinux.relocs is created by the vmlinux postlink step. +$(obj)/vmlinux.relocs: vmlinux + @true vmlinux.bin.all-y := $(obj)/vmlinux.bin vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h index 7db2f41b54cd..866c0af8b5b9 100644 --- a/arch/x86/boot/compressed/efi.h +++ b/arch/x86/boot/compressed/efi.h @@ -16,6 +16,7 @@ typedef guid_t efi_guid_t __aligned(__alignof__(u32)); #define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) #define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81) #define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) +#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31) #define EFI32_LOADER_SIGNATURE "EL32" #define EFI64_LOADER_SIGNATURE "EL64" @@ -32,6 +33,7 @@ typedef struct { } efi_table_hdr_t; #define EFI_CONVENTIONAL_MEMORY 7 +#define EFI_UNACCEPTED_MEMORY 15 #define EFI_MEMORY_MORE_RELIABLE \ ((u64)0x0000000000010000ULL) /* higher reliability */ @@ -104,6 +106,14 @@ struct efi_setup_data { u64 reserved[8]; }; +struct efi_unaccepted_memory { + u32 version; + u32 unit_size; + u64 phys_base; + u64 size; + unsigned long bitmap[]; +}; + static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) { return memcmp(&left, &right, sizeof (efi_guid_t)); diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c index c881878e56d3..5313c5cb2b80 100644 --- a/arch/x86/boot/compressed/error.c +++ b/arch/x86/boot/compressed/error.c @@ -22,3 +22,22 @@ void error(char *m) while (1) asm("hlt"); } + +/* EFI libstub provides vsnprintf() */ +#ifdef CONFIG_EFI_STUB +void panic(const char *fmt, ...) +{ + static char buf[1024]; + va_list args; + int len; + + va_start(args, fmt); + len = vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + + error(buf); +} +#endif diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h index 1de5821184f1..86fe33b93715 100644 --- a/arch/x86/boot/compressed/error.h +++ b/arch/x86/boot/compressed/error.h @@ -6,5 +6,6 @@ void warn(char *m); void error(char *m) __noreturn; +void panic(const char *fmt, ...) __noreturn __cold; #endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 454757fbdfe5..9193acf0e9cd 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -672,6 +672,33 @@ static bool process_mem_region(struct mem_vector *region, } #ifdef CONFIG_EFI + +/* + * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are + * guaranteed to be free. + * + * Pick free memory more conservatively than the EFI spec allows: according to + * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus + * available to place the kernel image into, but in practice there's firmware + * where using that memory leads to crashes. Buggy vendor EFI code registers + * for an event that triggers on SetVirtualAddressMap(). The handler assumes + * that EFI_BOOT_SERVICES_DATA memory has not been touched by loader yet, which + * is probably true for Windows. + * + * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap(). + */ +static inline bool memory_type_is_free(efi_memory_desc_t *md) +{ + if (md->type == EFI_CONVENTIONAL_MEMORY) + return true; + + if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && + md->type == EFI_UNACCEPTED_MEMORY) + return true; + + return false; +} + /* * Returns true if we processed the EFI memmap, which we prefer over the E820 * table if it is available. @@ -716,18 +743,7 @@ process_efi_entries(unsigned long minimum, unsigned long image_size) for (i = 0; i < nr_desc; i++) { md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); - /* - * Here we are more conservative in picking free memory than - * the EFI spec allows: - * - * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also - * free memory and thus available to place the kernel image into, - * but in practice there's firmware where using that memory leads - * to crashes. - * - * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. - */ - if (md->type != EFI_CONVENTIONAL_MEMORY) + if (!memory_type_is_free(md)) continue; if (efi_soft_reserve_enabled() && diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c new file mode 100644 index 000000000000..3c1609245f2a --- /dev/null +++ b/arch/x86/boot/compressed/mem.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "error.h" +#include "misc.h" +#include "tdx.h" +#include "sev.h" +#include <asm/shared/tdx.h> + +/* + * accept_memory() and process_unaccepted_memory() called from EFI stub which + * runs before decompresser and its early_tdx_detect(). + * + * Enumerate TDX directly from the early users. + */ +static bool early_is_tdx_guest(void) +{ + static bool once; + static bool is_tdx; + + if (!IS_ENABLED(CONFIG_INTEL_TDX_GUEST)) + return false; + + if (!once) { + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, + &sig[0], &sig[2], &sig[1]); + is_tdx = !memcmp(TDX_IDENT, sig, sizeof(sig)); + once = true; + } + + return is_tdx; +} + +void arch_accept_memory(phys_addr_t start, phys_addr_t end) +{ + /* Platform-specific memory-acceptance call goes here */ + if (early_is_tdx_guest()) { + if (!tdx_accept_memory(start, end)) + panic("TDX: Failed to accept memory\n"); + } else if (sev_snp_enabled()) { + snp_accept_memory(start, end); + } else { + error("Cannot accept memory: unknown platform\n"); + } +} + +bool init_unaccepted_memory(void) +{ + guid_t guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID; + struct efi_unaccepted_memory *table; + unsigned long cfg_table_pa; + unsigned int cfg_table_len; + enum efi_type et; + int ret; + + et = efi_get_type(boot_params); + if (et == EFI_TYPE_NONE) + return false; + + ret = efi_get_conf_table(boot_params, &cfg_table_pa, &cfg_table_len); + if (ret) { + warn("EFI config table not found."); + return false; + } + + table = (void *)efi_find_vendor_table(boot_params, cfg_table_pa, + cfg_table_len, guid); + if (!table) + return false; + + if (table->version != 1) + error("Unknown version of unaccepted memory table\n"); + + /* + * In many cases unaccepted_table is already set by EFI stub, but it + * has to be initialized again to cover cases when the table is not + * allocated by EFI stub or EFI stub copied the kernel image with + * efi_relocate_kernel() before the variable is set. + * + * It must be initialized before the first usage of accept_memory(). + */ + unaccepted_table = table; + + return true; +} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 014ff222bf4b..94b7abcf624b 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -455,6 +455,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, #endif debug_putstr("\nDecompressing Linux... "); + + if (init_unaccepted_memory()) { + debug_putstr("Accepting memory... "); + accept_memory(__pa(output), __pa(output) + needed_size); + } + __decompress(input_data, input_len, NULL, NULL, output, output_len, NULL, error); entry_offset = parse_elf(output); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 2f155a0e3041..964fe903a1cd 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -247,4 +247,14 @@ static inline unsigned long efi_find_vendor_table(struct boot_params *bp, } #endif /* CONFIG_EFI */ +#ifdef CONFIG_UNACCEPTED_MEMORY +bool init_unaccepted_memory(void); +#else +static inline bool init_unaccepted_memory(void) { return false; } +#endif + +/* Defined in EFI stub */ +extern struct efi_unaccepted_memory *unaccepted_table; +void accept_memory(phys_addr_t start, phys_addr_t end); + #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 014b89c89088..09dc8c187b3c 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -115,7 +115,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, /* Include code for early handlers */ #include "../../kernel/sev-shared.c" -static inline bool sev_snp_enabled(void) +bool sev_snp_enabled(void) { return sev_status & MSR_AMD64_SEV_SNP_ENABLED; } @@ -181,6 +181,58 @@ static bool early_setup_ghcb(void) return true; } +static phys_addr_t __snp_accept_memory(struct snp_psc_desc *desc, + phys_addr_t pa, phys_addr_t pa_end) +{ + struct psc_hdr *hdr; + struct psc_entry *e; + unsigned int i; + + hdr = &desc->hdr; + memset(hdr, 0, sizeof(*hdr)); + + e = desc->entries; + + i = 0; + while (pa < pa_end && i < VMGEXIT_PSC_MAX_ENTRY) { + hdr->end_entry = i; + + e->gfn = pa >> PAGE_SHIFT; + e->operation = SNP_PAGE_STATE_PRIVATE; + if (IS_ALIGNED(pa, PMD_SIZE) && (pa_end - pa) >= PMD_SIZE) { + e->pagesize = RMP_PG_SIZE_2M; + pa += PMD_SIZE; + } else { + e->pagesize = RMP_PG_SIZE_4K; + pa += PAGE_SIZE; + } + + e++; + i++; + } + + if (vmgexit_psc(boot_ghcb, desc)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + pvalidate_pages(desc); + + return pa; +} + +void snp_accept_memory(phys_addr_t start, phys_addr_t end) +{ + struct snp_psc_desc desc = {}; + unsigned int i; + phys_addr_t pa; + + if (!boot_ghcb && !early_setup_ghcb()) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + pa = start; + while (pa < end) + pa = __snp_accept_memory(&desc, pa, end); +} + void sev_es_shutdown_ghcb(void) { if (!boot_ghcb) diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h new file mode 100644 index 000000000000..fc725a981b09 --- /dev/null +++ b/arch/x86/boot/compressed/sev.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD SEV header for early boot related functions. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#ifndef BOOT_COMPRESSED_SEV_H +#define BOOT_COMPRESSED_SEV_H + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +bool sev_snp_enabled(void); +void snp_accept_memory(phys_addr_t start, phys_addr_t end); + +#else + +static inline bool sev_snp_enabled(void) { return false; } +static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } + +#endif + +#endif diff --git a/arch/x86/boot/compressed/tdx-shared.c b/arch/x86/boot/compressed/tdx-shared.c new file mode 100644 index 000000000000..5ac43762fe13 --- /dev/null +++ b/arch/x86/boot/compressed/tdx-shared.c @@ -0,0 +1,2 @@ +#include "error.h" +#include "../../coco/tdx/tdx-shared.c" diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c index 2d81d3cc72a1..8841b945a1e2 100644 --- a/arch/x86/boot/compressed/tdx.c +++ b/arch/x86/boot/compressed/tdx.c @@ -20,7 +20,7 @@ static inline unsigned int tdx_io_in(int size, u16 port) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, - .r11 = EXIT_REASON_IO_INSTRUCTION, + .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), .r12 = size, .r13 = 0, .r14 = port, @@ -36,7 +36,7 @@ static inline void tdx_io_out(int size, u16 port, u32 value) { struct tdx_hypercall_args args = { .r10 = TDX_HYPERCALL_STANDARD, - .r11 = EXIT_REASON_IO_INSTRUCTION, + .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), .r12 = size, .r13 = 1, .r14 = port, diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 0bbf4f3707d2..feb6dbd7ca86 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -14,9 +14,7 @@ */ #include "boot.h" -#ifdef CONFIG_X86_FEATURE_NAMES #include "cpustr.h" -#endif static char *cpu_name(int level) { @@ -35,7 +33,6 @@ static char *cpu_name(int level) static void show_cap_strs(u32 *err_flags) { int i, j; -#ifdef CONFIG_X86_FEATURE_NAMES const unsigned char *msg_strs = (const unsigned char *)x86_cap_strs; for (i = 0; i < NCAPINTS; i++) { u32 e = err_flags[i]; @@ -58,16 +55,6 @@ static void show_cap_strs(u32 *err_flags) e >>= 1; } } -#else - for (i = 0; i < NCAPINTS; i++) { - u32 e = err_flags[i]; - for (j = 0; j < 32; j++) { - if (e & 1) - printf("%d:%d ", i, j); - e >>= 1; - } - } -#endif } int validate_cpu(void) diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 73f83233d25d..eeec9986570e 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -13,10 +13,10 @@ #include <asm/coco.h> #include <asm/processor.h> -enum cc_vendor cc_vendor __ro_after_init; +enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE; static u64 cc_mask __ro_after_init; -static bool intel_cc_platform_has(enum cc_attr attr) +static bool noinstr intel_cc_platform_has(enum cc_attr attr) { switch (attr) { case CC_ATTR_GUEST_UNROLL_STRING_IO: @@ -34,7 +34,7 @@ static bool intel_cc_platform_has(enum cc_attr attr) * the other levels of SME/SEV functionality, including C-bit * based SEV-SNP, are not enabled. */ -static __maybe_unused bool amd_cc_platform_vtom(enum cc_attr attr) +static __maybe_unused __always_inline bool amd_cc_platform_vtom(enum cc_attr attr) { switch (attr) { case CC_ATTR_GUEST_MEM_ENCRYPT: @@ -58,7 +58,7 @@ static __maybe_unused bool amd_cc_platform_vtom(enum cc_attr attr) * the trampoline area must be encrypted. */ -static bool amd_cc_platform_has(enum cc_attr attr) +static bool noinstr amd_cc_platform_has(enum cc_attr attr) { #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -97,7 +97,7 @@ static bool amd_cc_platform_has(enum cc_attr attr) #endif } -bool cc_platform_has(enum cc_attr attr) +bool noinstr cc_platform_has(enum cc_attr attr) { switch (cc_vendor) { case CC_VENDOR_AMD: diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile index 46c55998557d..2c7dcbf1458b 100644 --- a/arch/x86/coco/tdx/Makefile +++ b/arch/x86/coco/tdx/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += tdx.o tdcall.o +obj-y += tdx.o tdx-shared.o tdcall.o diff --git a/arch/x86/coco/tdx/tdx-shared.c b/arch/x86/coco/tdx/tdx-shared.c new file mode 100644 index 000000000000..ef20ddc37b58 --- /dev/null +++ b/arch/x86/coco/tdx/tdx-shared.c @@ -0,0 +1,71 @@ +#include <asm/tdx.h> +#include <asm/pgtable.h> + +static unsigned long try_accept_one(phys_addr_t start, unsigned long len, + enum pg_level pg_level) +{ + unsigned long accept_size = page_level_size(pg_level); + u64 tdcall_rcx; + u8 page_size; + + if (!IS_ALIGNED(start, accept_size)) + return 0; + + if (len < accept_size) + return 0; + + /* + * Pass the page physical address to the TDX module to accept the + * pending, private page. + * + * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G. + */ + switch (pg_level) { + case PG_LEVEL_4K: + page_size = 0; + break; + case PG_LEVEL_2M: + page_size = 1; + break; + case PG_LEVEL_1G: + page_size = 2; + break; + default: + return 0; + } + + tdcall_rcx = start | page_size; + if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) + return 0; + + return accept_size; +} + +bool tdx_accept_memory(phys_addr_t start, phys_addr_t end) +{ + /* + * For shared->private conversion, accept the page using + * TDX_ACCEPT_PAGE TDX module call. + */ + while (start < end) { + unsigned long len = end - start; + unsigned long accept_size; + + /* + * Try larger accepts first. It gives chance to VMM to keep + * 1G/2M Secure EPT entries where possible and speeds up + * process by cutting number of hypercalls (if successful). + */ + + accept_size = try_accept_one(start, len, PG_LEVEL_1G); + if (!accept_size) + accept_size = try_accept_one(start, len, PG_LEVEL_2M); + if (!accept_size) + accept_size = try_accept_one(start, len, PG_LEVEL_4K); + if (!accept_size) + return false; + start += accept_size; + } + + return true; +} diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index e146b599260f..1d6b863c42b0 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -14,20 +14,6 @@ #include <asm/insn-eval.h> #include <asm/pgtable.h> -/* TDX module Call Leaf IDs */ -#define TDX_GET_INFO 1 -#define TDX_GET_VEINFO 3 -#define TDX_GET_REPORT 4 -#define TDX_ACCEPT_PAGE 6 -#define TDX_WR 8 - -/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ -#define TDCS_NOTIFY_ENABLES 0x9100000000000010 - -/* TDX hypercall Leaf IDs */ -#define TDVMCALL_MAP_GPA 0x10001 -#define TDVMCALL_REPORT_FATAL_ERROR 0x10003 - /* MMIO direction */ #define EPT_READ 0 #define EPT_WRITE 1 @@ -51,24 +37,6 @@ #define TDREPORT_SUBTYPE_0 0 -/* - * Wrapper for standard use of __tdx_hypercall with no output aside from - * return code. - */ -static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) -{ - struct tdx_hypercall_args args = { - .r10 = TDX_HYPERCALL_STANDARD, - .r11 = fn, - .r12 = r12, - .r13 = r13, - .r14 = r14, - .r15 = r15, - }; - - return __tdx_hypercall(&args); -} - /* Called from __tdx_hypercall() for unrecoverable failure */ noinstr void __tdx_hypercall_failed(void) { @@ -76,17 +44,6 @@ noinstr void __tdx_hypercall_failed(void) panic("TDVMCALL failed. TDX module bug?"); } -/* - * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined - * independently from but are currently matched 1:1 with VMX EXIT_REASONs. - * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and - * guest sides of these calls. - */ -static __always_inline u64 hcall_func(u64 exit_reason) -{ - return exit_reason; -} - #ifdef CONFIG_KVM_GUEST long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4) @@ -745,47 +702,6 @@ static bool tdx_cache_flush_required(void) return true; } -static bool try_accept_one(phys_addr_t *start, unsigned long len, - enum pg_level pg_level) -{ - unsigned long accept_size = page_level_size(pg_level); - u64 tdcall_rcx; - u8 page_size; - - if (!IS_ALIGNED(*start, accept_size)) - return false; - - if (len < accept_size) - return false; - - /* - * Pass the page physical address to the TDX module to accept the - * pending, private page. - * - * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G. - */ - switch (pg_level) { - case PG_LEVEL_4K: - page_size = 0; - break; - case PG_LEVEL_2M: - page_size = 1; - break; - case PG_LEVEL_1G: - page_size = 2; - break; - default: - return false; - } - - tdcall_rcx = *start | page_size; - if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) - return false; - - *start += accept_size; - return true; -} - /* * Inform the VMM of the guest's intent for this physical page: shared with * the VMM or private to the guest. The VMM is expected to change its mapping @@ -810,33 +726,34 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc) if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0)) return false; - /* private->shared conversion requires only MapGPA call */ - if (!enc) - return true; + /* shared->private conversion requires memory to be accepted before use */ + if (enc) + return tdx_accept_memory(start, end); + + return true; +} +static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages, + bool enc) +{ /* - * For shared->private conversion, accept the page using - * TDX_ACCEPT_PAGE TDX module call. + * Only handle shared->private conversion here. + * See the comment in tdx_early_init(). */ - while (start < end) { - unsigned long len = end - start; - - /* - * Try larger accepts first. It gives chance to VMM to keep - * 1G/2M SEPT entries where possible and speeds up process by - * cutting number of hypercalls (if successful). - */ - - if (try_accept_one(&start, len, PG_LEVEL_1G)) - continue; - - if (try_accept_one(&start, len, PG_LEVEL_2M)) - continue; - - if (!try_accept_one(&start, len, PG_LEVEL_4K)) - return false; - } + if (enc) + return tdx_enc_status_changed(vaddr, numpages, enc); + return true; +} +static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages, + bool enc) +{ + /* + * Only handle private->shared conversion here. + * See the comment in tdx_early_init(). + */ + if (!enc) + return tdx_enc_status_changed(vaddr, numpages, enc); return true; } @@ -852,7 +769,7 @@ void __init tdx_early_init(void) setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); - cc_set_vendor(CC_VENDOR_INTEL); + cc_vendor = CC_VENDOR_INTEL; tdx_parse_tdinfo(&cc_mask); cc_set_mask(cc_mask); @@ -867,9 +784,41 @@ void __init tdx_early_init(void) */ physical_mask &= cc_mask - 1; - x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required; - x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required; - x86_platform.guest.enc_status_change_finish = tdx_enc_status_changed; + /* + * The kernel mapping should match the TDX metadata for the page. + * load_unaligned_zeropad() can touch memory *adjacent* to that which is + * owned by the caller and can catch even _momentary_ mismatches. Bad + * things happen on mismatch: + * + * - Private mapping => Shared Page == Guest shutdown + * - Shared mapping => Private Page == Recoverable #VE + * + * guest.enc_status_change_prepare() converts the page from + * shared=>private before the mapping becomes private. + * + * guest.enc_status_change_finish() converts the page from + * private=>shared after the mapping becomes private. + * + * In both cases there is a temporary shared mapping to a private page, + * which can result in a #VE. But, there is never a private mapping to + * a shared page. + */ + x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare; + x86_platform.guest.enc_status_change_finish = tdx_enc_status_change_finish; + + x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required; + x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required; + + /* + * TDX intercepts the RDMSR to read the X2APIC ID in the parallel + * bringup low level code. That raises #VE which cannot be handled + * there. + * + * Intel-TDX has a secure RDMSR hypercall, but that needs to be + * implemented seperately in the low level startup ASM code. + * Until that is in place, disable parallel bringup for TDX. + */ + x86_cpuinit.parallel_bringup = false; pr_info("Guest detected\n"); } diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 91397f58ac30..6e6af42e044a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -720,26 +720,6 @@ SYM_CODE_END(__switch_to_asm) .popsection /* - * The unwinder expects the last frame on the stack to always be at the same - * offset from the end of the page, which allows it to validate the stack. - * Calling schedule_tail() directly would break that convention because its an - * asmlinkage function so its argument has to be pushed on the stack. This - * wrapper creates a proper "end of stack" frame header before the call. - */ -.pushsection .text, "ax" -SYM_FUNC_START(schedule_tail_wrapper) - FRAME_BEGIN - - pushl %eax - call schedule_tail - popl %eax - - FRAME_END - RET -SYM_FUNC_END(schedule_tail_wrapper) -.popsection - -/* * A newly forked process directly context switches into this address. * * eax: prev task we switched from @@ -747,29 +727,22 @@ SYM_FUNC_END(schedule_tail_wrapper) * edi: kernel thread arg */ .pushsection .text, "ax" -SYM_CODE_START(ret_from_fork) - call schedule_tail_wrapper +SYM_CODE_START(ret_from_fork_asm) + movl %esp, %edx /* regs */ - testl %ebx, %ebx - jnz 1f /* kernel threads are uncommon */ + /* return address for the stack unwinder */ + pushl $.Lsyscall_32_done -2: - /* When we fork, we trace the syscall return in the child, too. */ - movl %esp, %eax - call syscall_exit_to_user_mode - jmp .Lsyscall_32_done + FRAME_BEGIN + /* prev already in EAX */ + movl %ebx, %ecx /* fn */ + pushl %edi /* fn_arg */ + call ret_from_fork + addl $4, %esp + FRAME_END - /* kernel thread */ -1: movl %edi, %eax - CALL_NOSPEC ebx - /* - * A kernel thread is allowed to return here after successfully - * calling kernel_execve(). Exit to userspace to complete the execve() - * syscall. - */ - movl $0, PT_EAX(%esp) - jmp 2b -SYM_CODE_END(ret_from_fork) + RET +SYM_CODE_END(ret_from_fork_asm) .popsection SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f31e286c2977..91f6818884fa 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -284,36 +284,19 @@ SYM_FUNC_END(__switch_to_asm) * r12: kernel thread arg */ .pushsection .text, "ax" - __FUNC_ALIGN -SYM_CODE_START_NOALIGN(ret_from_fork) - UNWIND_HINT_END_OF_STACK +SYM_CODE_START(ret_from_fork_asm) + UNWIND_HINT_REGS ANNOTATE_NOENDBR // copy_thread CALL_DEPTH_ACCOUNT - movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ - testq %rbx, %rbx /* from kernel_thread? */ - jnz 1f /* kernel threads are uncommon */ + movq %rax, %rdi /* prev */ + movq %rsp, %rsi /* regs */ + movq %rbx, %rdx /* fn */ + movq %r12, %rcx /* fn_arg */ + call ret_from_fork -2: - UNWIND_HINT_REGS - movq %rsp, %rdi - call syscall_exit_to_user_mode /* returns with IRQs disabled */ jmp swapgs_restore_regs_and_return_to_usermode - -1: - /* kernel thread */ - UNWIND_HINT_END_OF_STACK - movq %r12, %rdi - CALL_NOSPEC rbx - /* - * A kernel thread is allowed to return here after successfully - * calling kernel_execve(). Exit to userspace to complete the execve() - * syscall. - */ - movq $0, RAX(%rsp) - jmp 2b -SYM_CODE_END(ret_from_fork) +SYM_CODE_END(ret_from_fork_asm) .popsection .macro DEBUG_ENTRY_ASSERT_IRQS_OFF diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 320480a8db4f..bc0a3c941b35 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -455,3 +455,4 @@ 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node +451 i386 cachestat sys_cachestat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index c84d12608cd2..227538b0ce80 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -372,6 +372,7 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 5e37f41e5f14..27b5da2111ac 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -26,17 +26,7 @@ SYM_FUNC_START(\name) pushq %r11 call \func - jmp __thunk_restore -SYM_FUNC_END(\name) - _ASM_NOKPROBE(\name) - .endm - - THUNK preempt_schedule_thunk, preempt_schedule - THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace - EXPORT_SYMBOL(preempt_schedule_thunk) - EXPORT_SYMBOL(preempt_schedule_notrace_thunk) -SYM_CODE_START_LOCAL(__thunk_restore) popq %r11 popq %r10 popq %r9 @@ -48,5 +38,11 @@ SYM_CODE_START_LOCAL(__thunk_restore) popq %rdi popq %rbp RET - _ASM_NOKPROBE(__thunk_restore) -SYM_CODE_END(__thunk_restore) +SYM_FUNC_END(\name) + _ASM_NOKPROBE(\name) + .endm + +THUNK preempt_schedule_thunk, preempt_schedule +THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace +EXPORT_SYMBOL(preempt_schedule_thunk) +EXPORT_SYMBOL(preempt_schedule_notrace_thunk) diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c index 0a9007c24056..e4640306b2e3 100644 --- a/arch/x86/entry/vdso/vgetcpu.c +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/getcpu.h> #include <asm/segment.h> +#include <vdso/processor.h> notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index bccea57dee81..abadd5f23425 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -374,7 +374,7 @@ static int amd_pmu_hw_config(struct perf_event *event) /* pass precise event sampling to ibs: */ if (event->attr.precise_ip && get_ibs_caps()) - return -ENOENT; + return forward_event_to_ibs(event); if (has_branch_stack(event) && !x86_pmu.lbr_nr) return -EOPNOTSUPP; diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 64582954b5f6..371014802191 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -190,7 +190,7 @@ static struct perf_ibs *get_ibs_pmu(int type) } /* - * Use IBS for precise event sampling: + * core pmu config -> IBS config * * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count * perf record -a -e r076:p ... # same as -e cpu-cycles:p @@ -199,25 +199,9 @@ static struct perf_ibs *get_ibs_pmu(int type) * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, * MSRC001_1033) is used to select either cycle or micro-ops counting * mode. - * - * The rip of IBS samples has skid 0. Thus, IBS supports precise - * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the - * rip is invalid when IBS was not able to record the rip correctly. - * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. - * */ -static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +static int core_pmu_ibs_config(struct perf_event *event, u64 *config) { - switch (event->attr.precise_ip) { - case 0: - return -ENOENT; - case 1: - case 2: - break; - default: - return -EOPNOTSUPP; - } - switch (event->attr.type) { case PERF_TYPE_HARDWARE: switch (event->attr.config) { @@ -243,22 +227,37 @@ static int perf_ibs_precise_event(struct perf_event *event, u64 *config) return -EOPNOTSUPP; } +/* + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. + */ +int forward_event_to_ibs(struct perf_event *event) +{ + u64 config = 0; + + if (!event->attr.precise_ip || event->attr.precise_ip > 2) + return -EOPNOTSUPP; + + if (!core_pmu_ibs_config(event, &config)) { + event->attr.type = perf_ibs_op.pmu.type; + event->attr.config = config; + } + return -ENOENT; +} + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs; u64 max_cnt, config; - int ret; perf_ibs = get_ibs_pmu(event->attr.type); - if (perf_ibs) { - config = event->attr.config; - } else { - perf_ibs = &perf_ibs_op; - ret = perf_ibs_precise_event(event, &config); - if (ret) - return ret; - } + if (!perf_ibs) + return -ENOENT; + + config = event->attr.config; if (event->pmu != &perf_ibs->pmu) return -ENOENT; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 89b9c1cebb61..2a284ba951b7 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -349,6 +349,16 @@ static struct event_constraint intel_spr_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_gnr_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE), + INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); @@ -2451,7 +2461,7 @@ static void intel_pmu_disable_fixed(struct perf_event *event) intel_clear_masks(event, idx); - mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4); + mask = intel_fixed_bits_by_idx(idx - INTEL_PMC_IDX_FIXED, INTEL_FIXED_BITS_MASK); cpuc->fixed_ctrl_val &= ~mask; } @@ -2750,25 +2760,25 @@ static void intel_pmu_enable_fixed(struct perf_event *event) * if requested: */ if (!event->attr.precise_ip) - bits |= 0x8; + bits |= INTEL_FIXED_0_ENABLE_PMI; if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; + bits |= INTEL_FIXED_0_USER; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; + bits |= INTEL_FIXED_0_KERNEL; /* * ANY bit is supported in v3 and up */ if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) - bits |= 0x4; + bits |= INTEL_FIXED_0_ANYTHREAD; idx -= INTEL_PMC_IDX_FIXED; - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); + bits = intel_fixed_bits_by_idx(idx, bits); + mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { - bits |= ICL_FIXED_0_ADAPTIVE << (idx * 4); - mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4); + bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); + mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); } cpuc->fixed_ctrl_val &= ~mask; @@ -3983,6 +3993,13 @@ static int intel_pmu_hw_config(struct perf_event *event) struct perf_event *leader = event->group_leader; struct perf_event *sibling = NULL; + /* + * When this memload event is also the first event (no group + * exists yet), then there is no aux event before it. + */ + if (leader == event) + return -ENODATA; + if (!is_mem_loads_aux_event(leader)) { for_each_sibling_event(sibling, leader) { if (is_mem_loads_aux_event(sibling)) @@ -6496,6 +6513,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_EMERALDRAPIDS_X: x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + x86_pmu.extra_regs = intel_spr_extra_regs; fallthrough; case INTEL_FAM6_GRANITERAPIDS_X: case INTEL_FAM6_GRANITERAPIDS_D: @@ -6506,7 +6524,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_spr_event_constraints; x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; - x86_pmu.extra_regs = intel_spr_extra_regs; + if (!x86_pmu.extra_regs) + x86_pmu.extra_regs = intel_gnr_extra_regs; x86_pmu.limit_period = spr_limit_period; x86_pmu.pebs_ept = 1; x86_pmu.pebs_aliases = NULL; @@ -6650,6 +6669,7 @@ __init int intel_pmu_init(void) pmu->pebs_constraints = intel_grt_pebs_event_constraints; pmu->extra_regs = intel_grt_extra_regs; if (is_mtl(boot_cpu_data.x86_model)) { + x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].extra_regs = intel_gnr_extra_regs; x86_pmu.pebs_latency_data = mtl_latency_data_small; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index a5f9474f08e1..6c04b52f139b 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -416,7 +416,7 @@ void __init hyperv_init(void) goto free_vp_assist_page; } - cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online", hv_cpu_init, hv_cpu_die); if (cpuhp < 0) goto free_ghcb_page; diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 1ba5d3b99b16..85d38b9f3586 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -20,6 +20,8 @@ void __init hv_vtl_init_platform(void) { pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); + x86_platform.realmode_reserve = x86_init_noop; + x86_platform.realmode_init = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.timers.timer_init = x86_init_noop; diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index cc92388b7a99..14f46ad2ca64 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -17,6 +17,7 @@ #include <asm/mem_encrypt.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> +#include <asm/mtrr.h> #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -364,7 +365,7 @@ void __init hv_vtom_init(void) * Set it here to indicate a vTOM VM. */ sev_status = MSR_AMD64_SNP_VTOM; - cc_set_vendor(CC_VENDOR_AMD); + cc_vendor = CC_VENDOR_AMD; cc_set_mask(ms_hyperv.shared_gpa_boundary); physical_mask &= ms_hyperv.shared_gpa_boundary - 1; @@ -372,6 +373,9 @@ void __init hv_vtom_init(void) x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required; x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required; x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; + + /* Set WB as the default cache mode. */ + mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 1e51650b79d7..4f1ce5fc4e19 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +generated-y += orc_hash.h generated-y += syscalls_32.h generated-y += syscalls_64.h generated-y += syscalls_x32.h diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index d7da28fada87..9c4da699e11a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -96,7 +96,7 @@ extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void apply_retpolines(s32 *start, s32 *end); extern void apply_returns(s32 *start, s32 *end); -extern void apply_ibt_endbr(s32 *start, s32 *end); +extern void apply_seal_endbr(s32 *start, s32 *end); extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, s32 *start_cfi, s32 *end_cfi); @@ -113,7 +113,6 @@ extern void callthunks_patch_builtin_calls(void); extern void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); extern void *callthunks_translate_call_dest(void *dest); -extern bool is_callthunk(void *addr); extern int x86_call_depth_emit_accounting(u8 **pprog, void *func); #else static __always_inline void callthunks_patch_builtin_calls(void) {} @@ -124,10 +123,6 @@ static __always_inline void *callthunks_translate_call_dest(void *dest) { return dest; } -static __always_inline bool is_callthunk(void *addr) -{ - return false; -} static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, void *func) { diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3216da7074ba..98c32aa5963a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -55,6 +55,8 @@ extern int local_apic_timer_c2_ok; extern int disable_apic; extern unsigned int lapic_timer_period; +extern int cpuid_to_apicid[]; + extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { APIC_PIC, @@ -377,7 +379,6 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[]; * APIC functionality to boot other CPUs - only used on SMP: */ #ifdef CONFIG_SMP -extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); extern int lapic_can_unplug_cpu(void); #endif @@ -507,10 +508,8 @@ extern int default_check_phys_apicid_present(int phys_apicid); #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_SMP -bool apic_id_is_primary_thread(unsigned int id); void apic_smt_update(void); #else -static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } static inline void apic_smt_update(void) { } #endif diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 68d213e83fcc..4b125e5b3187 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_APICDEF_H #define _ASM_X86_APICDEF_H +#include <linux/bits.h> + /* * Constants for various Intel APICs. (local APIC, IOAPIC, etc.) * @@ -138,9 +140,10 @@ #define APIC_EILVT_MASKED (1 << 16) #define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) -#define APIC_BASE_MSR 0x800 -#define XAPIC_ENABLE (1UL << 11) -#define X2APIC_ENABLE (1UL << 10) +#define APIC_BASE_MSR 0x800 +#define APIC_X2APIC_ID_MSR 0x802 +#define XAPIC_ENABLE BIT(11) +#define X2APIC_ENABLE BIT(10) #ifdef CONFIG_X86_32 # define MAX_IO_APICS 64 @@ -162,6 +165,7 @@ #define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK) #define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT) +#ifndef __ASSEMBLY__ /* * the local APIC register structure, memory mapped. Not terribly well * tested, but we might eventually use this one in the future - the @@ -435,4 +439,5 @@ enum apic_delivery_modes { APIC_DELIVERY_MODE_EXTINT = 7, }; +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 5e754e895767..55a55ec04350 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -14,12 +14,6 @@ * resource counting etc.. */ -/** - * arch_atomic_read - read atomic variable - * @v: pointer of type atomic_t - * - * Atomically reads the value of @v. - */ static __always_inline int arch_atomic_read(const atomic_t *v) { /* @@ -29,25 +23,11 @@ static __always_inline int arch_atomic_read(const atomic_t *v) return __READ_ONCE((v)->counter); } -/** - * arch_atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value - * - * Atomically sets the value of @v to @i. - */ static __always_inline void arch_atomic_set(atomic_t *v, int i) { __WRITE_ONCE(v->counter, i); } -/** - * arch_atomic_add - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v. - */ static __always_inline void arch_atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" @@ -55,13 +35,6 @@ static __always_inline void arch_atomic_add(int i, atomic_t *v) : "ir" (i) : "memory"); } -/** - * arch_atomic_sub - subtract integer from atomic variable - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v. - */ static __always_inline void arch_atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" @@ -69,27 +42,12 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) : "ir" (i) : "memory"); } -/** - * arch_atomic_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i); } #define arch_atomic_sub_and_test arch_atomic_sub_and_test -/** - * arch_atomic_inc - increment atomic variable - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1. - */ static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" @@ -97,12 +55,6 @@ static __always_inline void arch_atomic_inc(atomic_t *v) } #define arch_atomic_inc arch_atomic_inc -/** - * arch_atomic_dec - decrement atomic variable - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1. - */ static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" @@ -110,69 +62,30 @@ static __always_inline void arch_atomic_dec(atomic_t *v) } #define arch_atomic_dec arch_atomic_dec -/** - * arch_atomic_dec_and_test - decrement and test - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e); } #define arch_atomic_dec_and_test arch_atomic_dec_and_test -/** - * arch_atomic_inc_and_test - increment and test - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e); } #define arch_atomic_inc_and_test arch_atomic_inc_and_test -/** - * arch_atomic_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i); } #define arch_atomic_add_negative arch_atomic_add_negative -/** - * arch_atomic_add_return - add integer and return - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns @i + @v - */ static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } #define arch_atomic_add_return arch_atomic_add_return -/** - * arch_atomic_sub_return - subtract integer and return - * @v: pointer of type atomic_t - * @i: integer value to subtract - * - * Atomically subtracts @i from @v and returns @v - @i - */ static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) { return arch_atomic_add_return(-i, v); diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 808b4eece251..3486d91b8595 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -61,30 +61,12 @@ ATOMIC64_DECL(add_unless); #undef __ATOMIC64_DECL #undef ATOMIC64_EXPORT -/** - * arch_atomic64_cmpxchg - cmpxchg atomic64 variable - * @v: pointer to type atomic64_t - * @o: expected value - * @n: new value - * - * Atomically sets @v to @n if it was equal to @o and returns - * the old value. - */ - static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) { return arch_cmpxchg64(&v->counter, o, n); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg -/** - * arch_atomic64_xchg - xchg atomic64 variable - * @v: pointer to type atomic64_t - * @n: value to assign - * - * Atomically xchgs the value of @v to @n and returns - * the old value. - */ static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { s64 o; @@ -97,13 +79,6 @@ static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) } #define arch_atomic64_xchg arch_atomic64_xchg -/** - * arch_atomic64_set - set atomic64 variable - * @v: pointer to type atomic64_t - * @i: value to assign - * - * Atomically sets the value of @v to @n. - */ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { unsigned high = (unsigned)(i >> 32); @@ -113,12 +88,6 @@ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) : "eax", "edx", "memory"); } -/** - * arch_atomic64_read - read atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically reads the value of @v and returns it. - */ static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { s64 r; @@ -126,13 +95,6 @@ static __always_inline s64 arch_atomic64_read(const atomic64_t *v) return r; } -/** - * arch_atomic64_add_return - add and return - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns @i + *@v - */ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { alternative_atomic64(add_return, @@ -142,9 +104,6 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) } #define arch_atomic64_add_return arch_atomic64_add_return -/* - * Other variants with different arithmetic operators: - */ static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { alternative_atomic64(sub_return, @@ -172,13 +131,6 @@ static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) } #define arch_atomic64_dec_return arch_atomic64_dec_return -/** - * arch_atomic64_add - add integer to atomic64 variable - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v. - */ static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, @@ -187,13 +139,6 @@ static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v) return i; } -/** - * arch_atomic64_sub - subtract the atomic64 variable - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v. - */ static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, @@ -202,12 +147,6 @@ static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) return i; } -/** - * arch_atomic64_inc - increment atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1. - */ static __always_inline void arch_atomic64_inc(atomic64_t *v) { __alternative_atomic64(inc, inc_return, /* no output */, @@ -215,12 +154,6 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v) } #define arch_atomic64_inc arch_atomic64_inc -/** - * arch_atomic64_dec - decrement atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1. - */ static __always_inline void arch_atomic64_dec(atomic64_t *v) { __alternative_atomic64(dec, dec_return, /* no output */, @@ -228,15 +161,6 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) } #define arch_atomic64_dec arch_atomic64_dec -/** - * arch_atomic64_add_unless - add unless the number is a given value - * @v: pointer of type atomic64_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as it was not @u. - * Returns non-zero if the add was done, zero otherwise. - */ static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { unsigned low = (unsigned)u; diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index c496595bf601..3165c0feedf7 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -10,37 +10,16 @@ #define ATOMIC64_INIT(i) { (i) } -/** - * arch_atomic64_read - read atomic64 variable - * @v: pointer of type atomic64_t - * - * Atomically reads the value of @v. - * Doesn't imply a read memory barrier. - */ static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { return __READ_ONCE((v)->counter); } -/** - * arch_atomic64_set - set atomic64 variable - * @v: pointer to type atomic64_t - * @i: required value - * - * Atomically sets the value of @v to @i. - */ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { __WRITE_ONCE(v->counter, i); } -/** - * arch_atomic64_add - add integer to atomic64 variable - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v. - */ static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" @@ -48,13 +27,6 @@ static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) : "er" (i), "m" (v->counter) : "memory"); } -/** - * arch_atomic64_sub - subtract the atomic64 variable - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v. - */ static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "subq %1,%0" @@ -62,27 +34,12 @@ static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) : "er" (i), "m" (v->counter) : "memory"); } -/** - * arch_atomic64_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } #define arch_atomic64_sub_and_test arch_atomic64_sub_and_test -/** - * arch_atomic64_inc - increment atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1. - */ static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" @@ -91,12 +48,6 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v) } #define arch_atomic64_inc arch_atomic64_inc -/** - * arch_atomic64_dec - decrement atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1. - */ static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" @@ -105,56 +56,24 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) } #define arch_atomic64_dec arch_atomic64_dec -/** - * arch_atomic64_dec_and_test - decrement and test - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e); } #define arch_atomic64_dec_and_test arch_atomic64_dec_and_test -/** - * arch_atomic64_inc_and_test - increment and test - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e); } #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test -/** - * arch_atomic64_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } #define arch_atomic64_add_negative arch_atomic64_add_negative -/** - * arch_atomic64_add_return - add and return - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns @i + @v - */ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return i + xadd(&v->counter, i); diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h index 92ae28389940..f25ca2d709d4 100644 --- a/arch/x86/include/asm/bugs.h +++ b/arch/x86/include/asm/bugs.h @@ -4,8 +4,6 @@ #include <asm/processor.h> -extern void check_bugs(void); - #if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32) int ppro_with_ram_bug(void); #else diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 540573f515b7..d53636506134 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -239,29 +239,4 @@ extern void __add_wrong_size(void) #define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock) #define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) -#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \ -({ \ - bool __ret; \ - __typeof__(*(p1)) __old1 = (o1), __new1 = (n1); \ - __typeof__(*(p2)) __old2 = (o2), __new2 = (n2); \ - BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \ - BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ - VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \ - VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \ - asm volatile(pfx "cmpxchg%c5b %1" \ - CC_SET(e) \ - : CC_OUT(e) (__ret), \ - "+m" (*(p1)), "+m" (*(p2)), \ - "+a" (__old1), "+d" (__old2) \ - : "i" (2 * sizeof(long)), \ - "b" (__new1), "c" (__new2)); \ - __ret; \ -}) - -#define arch_cmpxchg_double(p1, p2, o1, o2, n1, n2) \ - __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2) - -#define arch_cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ - __cmpxchg_double(, p1, p2, o1, o2, n1, n2) - #endif /* ASM_X86_CMPXCHG_H */ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 6ba80ce9438d..b5731c51f0f4 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -103,6 +103,6 @@ static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new) #endif -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8) +#define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8) #endif /* _ASM_X86_CMPXCHG_32_H */ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 0d3beb27b7fe..44b08b53ab32 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -20,6 +20,71 @@ arch_try_cmpxchg((ptr), (po), (n)); \ }) -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) +union __u128_halves { + u128 full; + struct { + u64 low, high; + }; +}; + +#define __arch_cmpxchg128(_ptr, _old, _new, _lock) \ +({ \ + union __u128_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(_lock "cmpxchg16b %[ptr]" \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + o.full; \ +}) + +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) +{ + return __arch_cmpxchg128(ptr, old, new, LOCK_PREFIX); +} +#define arch_cmpxchg128 arch_cmpxchg128 + +static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, u128 new) +{ + return __arch_cmpxchg128(ptr, old, new,); +} +#define arch_cmpxchg128_local arch_cmpxchg128_local + +#define __arch_try_cmpxchg128(_ptr, _oldp, _new, _lock) \ +({ \ + union __u128_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(_lock "cmpxchg16b %[ptr]" \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*ptr), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new) +{ + return __arch_try_cmpxchg128(ptr, oldp, new, LOCK_PREFIX); +} +#define arch_try_cmpxchg128 arch_try_cmpxchg128 + +static __always_inline bool arch_try_cmpxchg128_local(volatile u128 *ptr, u128 *oldp, u128 new) +{ + return __arch_try_cmpxchg128(ptr, oldp, new,); +} +#define arch_try_cmpxchg128_local arch_try_cmpxchg128_local + +#define system_has_cmpxchg128() boot_cpu_has(X86_FEATURE_CX16) #endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index eb08796002f3..6ae2d16a7613 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -10,30 +10,13 @@ enum cc_vendor { CC_VENDOR_INTEL, }; -#ifdef CONFIG_ARCH_HAS_CC_PLATFORM extern enum cc_vendor cc_vendor; -static inline enum cc_vendor cc_get_vendor(void) -{ - return cc_vendor; -} - -static inline void cc_set_vendor(enum cc_vendor vendor) -{ - cc_vendor = vendor; -} - +#ifdef CONFIG_ARCH_HAS_CC_PLATFORM void cc_set_mask(u64 mask); u64 cc_mkenc(u64 val); u64 cc_mkdec(u64 val); #else -static inline enum cc_vendor cc_get_vendor(void) -{ - return CC_VENDOR_NONE; -} - -static inline void cc_set_vendor(enum cc_vendor vendor) { } - static inline u64 cc_mkenc(u64 val) { return val; diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 78796b98a544..3a233ebff712 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -30,10 +30,7 @@ struct x86_cpu { #ifdef CONFIG_HOTPLUG_CPU extern int arch_register_cpu(int num); extern void arch_unregister_cpu(int); -extern void start_cpu0(void); -#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 -extern int _debug_hotplug_cpu(int cpu, int action); -#endif +extern void soft_restart_cpu(void); #endif extern void ap_init_aperfmperf(void); @@ -98,4 +95,6 @@ extern u64 x86_read_arch_cap_msr(void); int intel_find_matching_signature(void *mc, unsigned int csig, int cpf); int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type); +extern struct cpumask cpus_stop_mask; + #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index ce0c8f7d3218..a26bebbdff87 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -38,15 +38,10 @@ enum cpuid_leafs #define X86_CAP_FMT_NUM "%d:%d" #define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) -#ifdef CONFIG_X86_FEATURE_NAMES extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; #define X86_CAP_FMT "%s" #define x86_cap_flag(flag) x86_cap_flags[flag] -#else -#define X86_CAP_FMT X86_CAP_FMT_NUM -#define x86_cap_flag x86_cap_flag_num -#endif /* * In order to save room, we index into this array by doing diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h index c5aed9e9226c..4acfd57de8f1 100644 --- a/arch/x86/include/asm/cpumask.h +++ b/arch/x86/include/asm/cpumask.h @@ -4,11 +4,6 @@ #ifndef __ASSEMBLY__ #include <linux/cpumask.h> -extern cpumask_var_t cpu_callin_mask; -extern cpumask_var_t cpu_callout_mask; -extern cpumask_var_t cpu_initialized_mask; -extern cpumask_var_t cpu_sibling_setup_mask; - extern void setup_cpu_local_masks(void); /* diff --git a/arch/x86/include/asm/doublefault.h b/arch/x86/include/asm/doublefault.h index 54a6e4a2e132..de0e88b32207 100644 --- a/arch/x86/include/asm/doublefault.h +++ b/arch/x86/include/asm/doublefault.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_DOUBLEFAULT_H #define _ASM_X86_DOUBLEFAULT_H +#include <linux/linkage.h> + #ifdef CONFIG_X86_32 extern void doublefault_init_cpu_tss(void); #else @@ -10,4 +12,6 @@ static inline void doublefault_init_cpu_tss(void) } #endif +asmlinkage void __noreturn doublefault_shim(void); + #endif /* _ASM_X86_DOUBLEFAULT_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 419280d263d2..8b4be7cecdb8 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -31,6 +31,8 @@ extern unsigned long efi_mixed_mode_stack_pa; #define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF +#define EFI_UNACCEPTED_UNIT_SIZE PMD_SIZE + /* * The EFI services are called through variadic functions in many cases. These * functions are implemented in assembler and support only a fixed number of diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 503a577814b2..b475d9a582b8 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -109,7 +109,7 @@ extern void fpu_reset_from_exception_fixup(void); /* Boot, hotplug and resume */ extern void fpu__init_cpu(void); -extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_system(void); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 5061ac98ffa1..897cf02c20b1 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -106,6 +106,9 @@ struct dyn_arch_ftrace { #ifndef __ASSEMBLY__ +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, + unsigned long frame_pointer); + #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) extern void set_ftrace_ops_ro(void); #else @@ -147,4 +150,24 @@ static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) #endif /* !COMPILE_OFFSETS */ #endif /* !__ASSEMBLY__ */ +#ifndef __ASSEMBLY__ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +struct fgraph_ret_regs { + unsigned long ax; + unsigned long dx; + unsigned long bp; +}; + +static inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->ax; +} + +static inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->bp; +} +#endif /* ifdef CONFIG_FUNCTION_GRAPH_TRACER */ +#endif + #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index baae6b4fea23..1e59581d500c 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -34,7 +34,7 @@ /* * Create a dummy function pointer reference to prevent objtool from marking * the function as needing to be "sealed" (i.e. ENDBR converted to NOP by - * apply_ibt_endbr()). + * apply_seal_endbr()). */ #define IBT_NOSEAL(fname) \ ".pushsection .discard.ibt_endbr_noseal\n\t" \ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 768aa234cbb4..29e083b92813 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -40,8 +40,6 @@ extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern void init_ISA_irqs(void); -extern void __init init_IRQ(void); - #ifdef CONFIG_X86_LOCAL_APIC void arch_trigger_cpumask_backtrace(const struct cpumask *mask, bool exclude_self); diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index c17e3e96fc1d..6c98f4bb4228 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -13,7 +13,6 @@ BUILD_BUG_ON(1) * at the call sites. */ KVM_X86_PMU_OP(hw_event_available) -KVM_X86_PMU_OP(pmc_is_enabled) KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) KVM_X86_PMU_OP(msr_idx_to_pmc) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fb9d1f2d6136..28bd38303d70 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -523,7 +523,7 @@ struct kvm_pmu { u64 global_status; u64 counter_bitmask[2]; u64 global_ctrl_mask; - u64 global_ovf_ctrl_mask; + u64 global_status_mask; u64 reserved_bits; u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9646ed6e8c0b..180b1cbfcc4e 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -350,4 +350,7 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } + +unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len); + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index b7126701574c..7f97a8a97e24 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -17,6 +17,12 @@ #include <asm/bootparam.h> +#ifdef CONFIG_X86_MEM_ENCRYPT +void __init mem_encrypt_init(void); +#else +static inline void mem_encrypt_init(void) { } +#endif + #ifdef CONFIG_AMD_MEM_ENCRYPT extern u64 sme_me_mask; @@ -87,9 +93,6 @@ static inline void mem_encrypt_free_decrypted_mem(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ -/* Architecture __weak replacement functions */ -void __init mem_encrypt_init(void); - void add_encrypt_protection_map(void); /* diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 49bb4f2bd300..88d9ef98e087 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -257,6 +257,11 @@ void hv_set_register(unsigned int reg, u64 value); u64 hv_get_non_nested_register(unsigned int reg); void hv_set_non_nested_register(unsigned int reg, u64 value); +static __always_inline u64 hv_raw_get_register(unsigned int reg) +{ + return __rdmsr(reg); +} + #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline void hyperv_setup_mmu_ops(void) {} diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index f0eeaf6e5f5f..090d658a85a6 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -23,14 +23,43 @@ #ifndef _ASM_X86_MTRR_H #define _ASM_X86_MTRR_H +#include <linux/bits.h> #include <uapi/asm/mtrr.h> +/* Defines for hardware MTRR registers. */ +#define MTRR_CAP_VCNT GENMASK(7, 0) +#define MTRR_CAP_FIX BIT_MASK(8) +#define MTRR_CAP_WC BIT_MASK(10) + +#define MTRR_DEF_TYPE_TYPE GENMASK(7, 0) +#define MTRR_DEF_TYPE_FE BIT_MASK(10) +#define MTRR_DEF_TYPE_E BIT_MASK(11) + +#define MTRR_DEF_TYPE_ENABLE (MTRR_DEF_TYPE_FE | MTRR_DEF_TYPE_E) +#define MTRR_DEF_TYPE_DISABLE ~(MTRR_DEF_TYPE_TYPE | MTRR_DEF_TYPE_ENABLE) + +#define MTRR_PHYSBASE_TYPE GENMASK(7, 0) +#define MTRR_PHYSBASE_RSVD GENMASK(11, 8) + +#define MTRR_PHYSMASK_RSVD GENMASK(10, 0) +#define MTRR_PHYSMASK_V BIT_MASK(11) + +struct mtrr_state_type { + struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; + mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; + unsigned char enabled; + bool have_fixed; + mtrr_type def_type; +}; + /* * The following functions are for use by other drivers that cannot use * arch_phys_wc_add and arch_phys_wc_del. */ # ifdef CONFIG_MTRR void mtrr_bp_init(void); +void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type); extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); @@ -40,7 +69,6 @@ extern int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, bool increment); extern int mtrr_del(int reg, unsigned long base, unsigned long size); extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); -extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); @@ -48,12 +76,21 @@ void mtrr_disable(void); void mtrr_enable(void); void mtrr_generic_set_state(void); # else +static inline void mtrr_overwrite_state(struct mtrr_var_range *var, + unsigned int num_var, + mtrr_type def_type) +{ +} + static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { /* - * Return no-MTRRs: + * Return the default MTRR type, without any known other types in + * that range. */ - return MTRR_TYPE_INVALID; + *uniform = 1; + + return MTRR_TYPE_UNCACHABLE; } #define mtrr_save_fixed_ranges(arg) do {} while (0) #define mtrr_save_state() do {} while (0) @@ -79,9 +116,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) { return 0; } -static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) -{ -} #define mtrr_bp_init() do {} while (0) #define mtrr_bp_restore() do {} while (0) #define mtrr_disable() do {} while (0) @@ -121,7 +155,8 @@ struct mtrr_gentry32 { #endif /* CONFIG_COMPAT */ /* Bit fields for enabled in struct mtrr_state_type */ -#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01 -#define MTRR_STATE_MTRR_ENABLED 0x02 +#define MTRR_STATE_SHIFT 10 +#define MTRR_STATE_MTRR_FIXED_ENABLED (MTRR_DEF_TYPE_FE >> MTRR_STATE_SHIFT) +#define MTRR_STATE_MTRR_ENABLED (MTRR_DEF_TYPE_E >> MTRR_STATE_SHIFT) #endif /* _ASM_X86_MTRR_H */ diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index c5573eaa5bb9..1c1b7550fa55 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -34,6 +34,8 @@ #define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 #define BYTES_NOP8 0x3e,BYTES_NOP7 +#define ASM_NOP_MAX 8 + #else /* @@ -47,6 +49,9 @@ * 6: osp nopl 0x00(%eax,%eax,1) * 7: nopl 0x00000000(%eax) * 8: nopl 0x00000000(%eax,%eax,1) + * 9: cs nopl 0x00000000(%eax,%eax,1) + * 10: osp cs nopl 0x00000000(%eax,%eax,1) + * 11: osp osp cs nopl 0x00000000(%eax,%eax,1) */ #define BYTES_NOP1 0x90 #define BYTES_NOP2 0x66,BYTES_NOP1 @@ -56,6 +61,15 @@ #define BYTES_NOP6 0x66,BYTES_NOP5 #define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00 #define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +#define BYTES_NOP9 0x2e,BYTES_NOP8 +#define BYTES_NOP10 0x66,BYTES_NOP9 +#define BYTES_NOP11 0x66,BYTES_NOP10 + +#define ASM_NOP9 _ASM_BYTES(BYTES_NOP9) +#define ASM_NOP10 _ASM_BYTES(BYTES_NOP10) +#define ASM_NOP11 _ASM_BYTES(BYTES_NOP11) + +#define ASM_NOP_MAX 11 #endif /* CONFIG_64BIT */ @@ -68,8 +82,6 @@ #define ASM_NOP7 _ASM_BYTES(BYTES_NOP7) #define ASM_NOP8 _ASM_BYTES(BYTES_NOP8) -#define ASM_NOP_MAX 8 - #ifndef __ASSEMBLY__ extern const unsigned char * const x86_nops[]; #endif diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index edb2b0cb8efe..1a65cf4acb2b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -84,12 +84,12 @@ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); #define RESET_CALL_DEPTH \ - mov $0x80, %rax; \ - shl $56, %rax; \ + xor %eax, %eax; \ + bts $63, %rax; \ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); #define RESET_CALL_DEPTH_FROM_CALL \ - mov $0xfc, %rax; \ + movb $0xfc, %al; \ shl $56, %rax; \ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS @@ -234,6 +234,10 @@ * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 * attack. + * + * NOTE: these do not take kCFI into account and are thus not comparable to C + * indirect calls, take care when using. The target of these should be an ENDBR + * instruction irrespective of kCFI. */ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE diff --git a/arch/x86/include/asm/orc_header.h b/arch/x86/include/asm/orc_header.h new file mode 100644 index 000000000000..07bacf3e160e --- /dev/null +++ b/arch/x86/include/asm/orc_header.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ + +#ifndef _ORC_HEADER_H +#define _ORC_HEADER_H + +#include <linux/types.h> +#include <linux/compiler.h> +#include <asm/orc_hash.h> + +/* + * The header is currently a 20-byte hash of the ORC entry definition; see + * scripts/orc_hash.sh. + */ +#define ORC_HEADER \ + __used __section(".orc_header") __aligned(4) \ + static const u8 orc_header[] = { ORC_HASH } + +#endif /* _ORC_HEADER_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 13c0d63ed55e..34734d730463 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -210,6 +210,67 @@ do { \ (typeof(_var))(unsigned long) pco_old__; \ }) +#if defined(CONFIG_X86_32) && !defined(CONFIG_UML) +#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \ +({ \ + union { \ + u64 var; \ + struct { \ + u32 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = _oval; \ + new__.var = _nval; \ + \ + asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + : [var] "+m" (_var), \ + "+a" (old__.low), \ + "+d" (old__.high) \ + : "b" (new__.low), \ + "c" (new__.high) \ + : "memory", "esi"); \ + \ + old__.var; \ +}) + +#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval) +#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval) +#endif + +#ifdef CONFIG_X86_64 +#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval); +#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval); + +#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \ +({ \ + union { \ + u128 var; \ + struct { \ + u64 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = _oval; \ + new__.var = _nval; \ + \ + asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + : [var] "+m" (_var), \ + "+a" (old__.low), \ + "+d" (old__.high) \ + : "b" (new__.low), \ + "c" (new__.high) \ + : "memory", "rsi"); \ + \ + old__.var; \ +}) + +#define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval) +#define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval) +#endif + /* * this_cpu_read() makes gcc load the percpu variable every time it is * accessed while this_cpu_read_stable() allows the value to be cached. @@ -290,23 +351,6 @@ do { \ #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval) -#ifdef CONFIG_X86_CMPXCHG64 -#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ -({ \ - bool __ret; \ - typeof(pcp1) __o1 = (o1), __n1 = (n1); \ - typeof(pcp2) __o2 = (o2), __n2 = (n2); \ - asm volatile("cmpxchg8b "__percpu_arg(1) \ - CC_SET(z) \ - : CC_OUT(z) (__ret), "+m" (pcp1), "+m" (pcp2), "+a" (__o1), "+d" (__o2) \ - : "b" (__n1), "c" (__n2)); \ - __ret; \ -}) - -#define raw_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double -#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double -#endif /* CONFIG_X86_CMPXCHG64 */ - /* * Per cpu atomic 64 bit operations are only available under 64 bit. * 32 bit must fall back to generic operations. @@ -329,30 +373,6 @@ do { \ #define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val) #define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval) - -/* - * Pretty complex macro to generate cmpxchg16 instruction. The instruction - * is not supported on early AMD64 processors so we must be able to emulate - * it in software. The address used in the cmpxchg16 instruction must be - * aligned to a 16 byte boundary. - */ -#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2) \ -({ \ - bool __ret; \ - typeof(pcp1) __o1 = (o1), __n1 = (n1); \ - typeof(pcp2) __o2 = (o2), __n2 = (n2); \ - alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \ - "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \ - X86_FEATURE_CX16, \ - ASM_OUTPUT2("=a" (__ret), "+m" (pcp1), \ - "+m" (pcp2), "+d" (__o2)), \ - "b" (__n1), "c" (__n2), "a" (__o1) : "rsi"); \ - __ret; \ -}) - -#define raw_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double -#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double - #endif static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index abf09882f58b..85a9fd5a3ec3 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -32,11 +32,21 @@ #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL +#define INTEL_FIXED_BITS_MASK 0xFULL +#define INTEL_FIXED_BITS_STRIDE 4 +#define INTEL_FIXED_0_KERNEL (1ULL << 0) +#define INTEL_FIXED_0_USER (1ULL << 1) +#define INTEL_FIXED_0_ANYTHREAD (1ULL << 2) +#define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3) + #define HSW_IN_TX (1ULL << 32) #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) #define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) +#define intel_fixed_bits_by_idx(_idx, _bits) \ + ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) + #define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36) #define AMD64_EVENTSEL_GUESTONLY (1ULL << 40) #define AMD64_EVENTSEL_HOSTONLY (1ULL << 41) @@ -478,8 +488,10 @@ struct pebs_xmm { #ifdef CONFIG_X86_LOCAL_APIC extern u32 get_ibs_caps(void); +extern int forward_event_to_ibs(struct perf_event *event); #else static inline u32 get_ibs_caps(void) { return 0; } +static inline int forward_event_to_ibs(struct perf_event *event) { return -ENOENT; } #endif #ifdef CONFIG_PERF_EVENTS diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 15ae4d6ba476..5700bb337987 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -27,6 +27,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); +struct seq_file; void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 7929327abe00..a629b1b9f65a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -237,8 +237,8 @@ static inline void native_pgd_clear(pgd_t *pgd) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) -#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) -#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val }) +#define __swp_entry_to_pte(x) (__pte((x).val)) +#define __swp_entry_to_pmd(x) (__pmd((x).val)) extern void cleanup_highmap(void); diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 447d4bee25c4..ba3e2554799a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -513,9 +513,6 @@ extern void native_pagetable_init(void); #define native_pagetable_init paging_init #endif -struct seq_file; -extern void arch_report_meminfo(struct seq_file *m); - enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a1e4fa58b357..d46300e94f85 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -551,7 +551,6 @@ extern void switch_gdt_and_percpu_base(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void cpu_init(void); -extern void cpu_init_secondary(void); extern void cpu_init_exception_handling(void); extern void cr4_init(void); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index f6a1737c77be..87e5482acd0d 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -52,6 +52,7 @@ struct trampoline_header { u64 efer; u32 cr4; u32 flags; + u32 lock; #endif }; @@ -64,6 +65,8 @@ extern unsigned long initial_stack; extern unsigned long initial_vc_handler; #endif +extern u32 *trampoline_lock; + extern unsigned char real_mode_blob[]; extern unsigned char real_mode_relocs[]; diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 0759af9b1acf..b463fcbd4b90 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -106,8 +106,13 @@ enum psc_op { #define GHCB_HV_FT_SNP BIT_ULL(0) #define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1) -/* SNP Page State Change NAE event */ -#define VMGEXIT_PSC_MAX_ENTRY 253 +/* + * SNP Page State Change NAE event + * The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which + * is a local stack variable in set_pages_state(). Do not increase this value + * without evaluating the impact to stack usage. + */ +#define VMGEXIT_PSC_MAX_ENTRY 64 struct psc_hdr { u16 cur_entry; diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 13dc2a9d23c1..66c806784c52 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -14,6 +14,7 @@ #include <asm/insn.h> #include <asm/sev-common.h> #include <asm/bootparam.h> +#include <asm/coco.h> #define GHCB_PROTOCOL_MIN 1ULL #define GHCB_PROTOCOL_MAX 2ULL @@ -80,11 +81,15 @@ extern void vc_no_ghcb(void); extern void vc_boot_ghcb(void); extern bool handle_vc_boot_ghcb(struct pt_regs *regs); +/* PVALIDATE return codes */ +#define PVALIDATE_FAIL_SIZEMISMATCH 6 + /* Software defined (when rFlags.CF = 1) */ #define PVALIDATE_FAIL_NOUPDATE 255 /* RMP page size */ #define RMP_PG_SIZE_4K 0 +#define RMP_PG_SIZE_2M 1 #define RMPADJUST_VMSA_PAGE_BIT BIT(16) @@ -136,24 +141,26 @@ struct snp_secrets_page_layout { } __packed; #ifdef CONFIG_AMD_MEM_ENCRYPT -extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); extern void __sev_es_ist_exit(void); static __always_inline void sev_es_ist_enter(struct pt_regs *regs) { - if (static_branch_unlikely(&sev_es_enable_key)) + if (cc_vendor == CC_VENDOR_AMD && + cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) __sev_es_ist_enter(regs); } static __always_inline void sev_es_ist_exit(void) { - if (static_branch_unlikely(&sev_es_enable_key)) + if (cc_vendor == CC_VENDOR_AMD && + cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) __sev_es_ist_exit(); } extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh); extern void __sev_es_nmi_complete(void); static __always_inline void sev_es_nmi_complete(void) { - if (static_branch_unlikely(&sev_es_enable_key)) + if (cc_vendor == CC_VENDOR_AMD && + cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) __sev_es_nmi_complete(); } extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); @@ -192,16 +199,17 @@ struct snp_guest_request_ioctl; void setup_ghcb(void); void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, - unsigned int npages); + unsigned long npages); void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, - unsigned int npages); + unsigned long npages); void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); -void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); -void snp_set_memory_private(unsigned long vaddr, unsigned int npages); +void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); +void snp_set_memory_private(unsigned long vaddr, unsigned long npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); void __init __noreturn snp_abort(void); int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio); +void snp_accept_memory(phys_addr_t start, phys_addr_t end); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -212,12 +220,12 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } static inline void setup_ghcb(void) { } static inline void __init -early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } +early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { } static inline void __init -early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } +early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { } static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } -static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } -static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } +static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { } +static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } static inline void snp_abort(void) { } @@ -225,6 +233,8 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in { return -ENOTTY; } + +static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } #endif #endif diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 2631e01f6e0f..7513b3bb69b7 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -10,6 +10,20 @@ #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " +/* TDX module Call Leaf IDs */ +#define TDX_GET_INFO 1 +#define TDX_GET_VEINFO 3 +#define TDX_GET_REPORT 4 +#define TDX_ACCEPT_PAGE 6 +#define TDX_WR 8 + +/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ +#define TDCS_NOTIFY_ENABLES 0x9100000000000010 + +/* TDX hypercall Leaf IDs */ +#define TDVMCALL_MAP_GPA 0x10001 +#define TDVMCALL_REPORT_FATAL_ERROR 0x10003 + #ifndef __ASSEMBLY__ /* @@ -37,8 +51,58 @@ struct tdx_hypercall_args { u64 __tdx_hypercall(struct tdx_hypercall_args *args); u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); +/* + * Wrapper for standard use of __tdx_hypercall with no output aside from + * return code. + */ +static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = fn, + .r12 = r12, + .r13 = r13, + .r14 = r14, + .r15 = r15, + }; + + return __tdx_hypercall(&args); +} + + /* Called from __tdx_hypercall() for unrecoverable failure */ void __tdx_hypercall_failed(void); +/* + * Used in __tdx_module_call() to gather the output registers' values of the + * TDCALL instruction when requesting services from the TDX module. This is a + * software only structure and not part of the TDX module/VMM ABI + */ +struct tdx_module_output { + u64 rcx; + u64 rdx; + u64 r8; + u64 r9; + u64 r10; + u64 r11; +}; + +/* Used to communicate with the TDX module */ +u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + struct tdx_module_output *out); + +bool tdx_accept_memory(phys_addr_t start, phys_addr_t end); + +/* + * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined + * independently from but are currently matched 1:1 with VMX EXIT_REASONs. + * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and + * guest sides of these calls. + */ +static __always_inline u64 hcall_func(u64 exit_reason) +{ + return exit_reason; +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 5b1ed650b124..84eab2724875 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -85,6 +85,4 @@ struct rt_sigframe_x32 { #endif /* CONFIG_X86_64 */ -void __init init_sigframe_size(void); - #endif /* _ASM_X86_SIGFRAME_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4e91054c84be..600cf25dbfc6 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -38,7 +38,9 @@ struct smp_ops { void (*crash_stop_other_cpus)(void); void (*smp_send_reschedule)(int cpu); - int (*cpu_up)(unsigned cpu, struct task_struct *tidle); + void (*cleanup_dead_cpu)(unsigned cpu); + void (*poll_sync_state)(void); + int (*kick_ap_alive)(unsigned cpu, struct task_struct *tidle); int (*cpu_disable)(void); void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); @@ -78,11 +80,6 @@ static inline void smp_cpus_done(unsigned int max_cpus) smp_ops.smp_cpus_done(max_cpus); } -static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle) -{ - return smp_ops.cpu_up(cpu, tidle); -} - static inline int __cpu_disable(void) { return smp_ops.cpu_disable(); @@ -90,7 +87,8 @@ static inline int __cpu_disable(void) static inline void __cpu_die(unsigned int cpu) { - smp_ops.cpu_die(cpu); + if (smp_ops.cpu_die) + smp_ops.cpu_die(cpu); } static inline void __noreturn play_dead(void) @@ -121,22 +119,23 @@ void native_smp_prepare_cpus(unsigned int max_cpus); void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); -int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); +int native_kick_ap(unsigned int cpu, struct task_struct *tidle); int native_cpu_disable(void); -int common_cpu_die(unsigned int cpu); -void native_cpu_die(unsigned int cpu); void __noreturn hlt_play_dead(void); void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); -void cond_wakeup_cpu0(void); + +void smp_kick_mwait_play_dead(void); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); +bool smp_park_other_cpus_in_init(void); + void smp_store_boot_cpu_info(void); void smp_store_cpu_info(int id); @@ -201,7 +200,14 @@ extern void nmi_selftest(void); #endif extern unsigned int smpboot_control; +extern unsigned long apic_mmio_base; #endif /* !__ASSEMBLY__ */ +/* Control bits for startup_64 */ +#define STARTUP_READ_APICID 0x80000000 + +/* Top 8 bits are reserved for control */ +#define STARTUP_PARALLEL_MASK 0xFF000000 + #endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 5c91305d09d2..f42dbf17f52b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); -asmlinkage void ret_from_fork(void); +asmlinkage void ret_from_fork_asm(void); +__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, + int (*fn)(void *), void *fn_arg); /* * This is the structure pointed to by thread.sp for an inactive task. The diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 5b85987a5e97..4fb36fba4b5a 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -127,9 +127,11 @@ static inline int syscall_get_arch(struct task_struct *task) } void do_syscall_64(struct pt_regs *regs, int nr); -void do_int80_syscall_32(struct pt_regs *regs); -long do_fast_syscall_32(struct pt_regs *regs); #endif /* CONFIG_X86_32 */ +void do_int80_syscall_32(struct pt_regs *regs); +long do_fast_syscall_32(struct pt_regs *regs); +long do_SYSENTER_32(struct pt_regs *regs); + #endif /* _ASM_X86_SYSCALL_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 28d889c9aa16..603e6d1e9d4a 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -5,6 +5,8 @@ #include <linux/init.h> #include <linux/bits.h> + +#include <asm/errno.h> #include <asm/ptrace.h> #include <asm/shared/tdx.h> @@ -21,21 +23,6 @@ #ifndef __ASSEMBLY__ /* - * Used to gather the output registers values of the TDCALL and SEAMCALL - * instructions when requesting services from the TDX module. - * - * This is a software only structure and not part of the TDX module/VMM ABI. - */ -struct tdx_module_output { - u64 rcx; - u64 rdx; - u64 r8; - u64 r9; - u64 r10; - u64 r11; -}; - -/* * Used by the #VE exception handler to gather the #VE exception * info from the TDX module. This is a software only structure * and not part of the TDX module/VMM ABI. @@ -55,10 +42,6 @@ struct ve_info { void __init tdx_early_init(void); -/* Used to communicate with the TDX module */ -u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, - struct tdx_module_output *out); - void tdx_get_ve_info(struct ve_info *ve); bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f1cccba52eb9..d63b02940747 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -232,9 +232,6 @@ static inline int arch_within_stack_frames(const void * const stack, current_thread_info()->status & TS_COMPAT) #endif -extern void arch_task_cache_init(void); -extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); -extern void arch_release_task_struct(struct task_struct *tsk); extern void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index a53961c64a56..f360104ed172 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -6,7 +6,6 @@ #include <asm/mc146818rtc.h> extern void hpet_time_init(void); -extern void time_init(void); extern bool pit_timer_init(void); extern bool tsc_clocksource_watchdog_disabled(void); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 75bfaa421030..80450e1d5385 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -14,6 +14,8 @@ #include <asm/processor-flags.h> #include <asm/pgtable.h> +DECLARE_PER_CPU(u64, tlbstate_untag_mask); + void __flush_tlb_all(void); #define TLB_FLUSH_ALL -1UL @@ -54,15 +56,6 @@ static inline void cr4_clear_bits(unsigned long mask) local_irq_restore(flags); } -#ifdef CONFIG_ADDRESS_MASKING -DECLARE_PER_CPU(u64, tlbstate_untag_mask); - -static inline u64 current_untag_mask(void) -{ - return this_cpu_read(tlbstate_untag_mask); -} -#endif - #ifndef MODULE /* * 6 because 6 should be plenty and struct tlb_state will fit in two cache diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 458c891a8273..caf41c4869a0 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -31,9 +31,9 @@ * CONFIG_NUMA. */ #include <linux/numa.h> +#include <linux/cpumask.h> #ifdef CONFIG_NUMA -#include <linux/cpumask.h> #include <asm/mpspec.h> #include <asm/percpu.h> @@ -139,23 +139,31 @@ static inline int topology_max_smt_threads(void) int topology_update_package_map(unsigned int apicid, unsigned int cpu); int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); -int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); -bool topology_is_primary_thread(unsigned int cpu); bool topology_smt_supported(void); -#else + +extern struct cpumask __cpu_primary_thread_mask; +#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) + +/** + * topology_is_primary_thread - Check whether CPU is the primary SMT thread + * @cpu: CPU to check + */ +static inline bool topology_is_primary_thread(unsigned int cpu) +{ + return cpumask_test_cpu(cpu, cpu_primary_thread_mask); +} +#else /* CONFIG_SMP */ #define topology_max_packages() (1) static inline int topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } static inline int topology_update_die_map(unsigned int dieid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } -static inline int topology_phys_to_logical_die(unsigned int die, - unsigned int cpu) { return 0; } static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } static inline bool topology_smt_supported(void) { return false; } -#endif +#endif /* !CONFIG_SMP */ static inline void arch_fix_phys_package_id(int num, u32 slot) { diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index fbdc3d951494..594fce0ca744 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -32,7 +32,6 @@ extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); extern void tsc_early_init(void); extern void tsc_init(void); -extern unsigned long calibrate_delay_is_known(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); @@ -55,12 +54,10 @@ extern bool tsc_async_resets; #ifdef CONFIG_X86_TSC extern bool tsc_store_and_check_tsc_adjust(bool bootcpu); extern void tsc_verify_tsc_adjust(bool resume); -extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); #else static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; } static inline void tsc_verify_tsc_adjust(bool resume) { } -static inline void check_tsc_sync_source(int cpu) { } static inline void check_tsc_sync_target(void) { } #endif diff --git a/arch/x86/include/asm/unaccepted_memory.h b/arch/x86/include/asm/unaccepted_memory.h new file mode 100644 index 000000000000..f5937e9866ac --- /dev/null +++ b/arch/x86/include/asm/unaccepted_memory.h @@ -0,0 +1,27 @@ +#ifndef _ASM_X86_UNACCEPTED_MEMORY_H +#define _ASM_X86_UNACCEPTED_MEMORY_H + +#include <linux/efi.h> +#include <asm/tdx.h> +#include <asm/sev.h> + +static inline void arch_accept_memory(phys_addr_t start, phys_addr_t end) +{ + /* Platform-specific memory-acceptance call goes here */ + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + if (!tdx_accept_memory(start, end)) + panic("TDX: Failed to accept memory\n"); + } else if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + snp_accept_memory(start, end); + } else { + panic("Cannot accept memory: unknown platform\n"); + } +} + +static inline struct efi_unaccepted_memory *efi_get_unaccepted_table(void) +{ + if (efi.unaccepted == EFI_INVALID_TABLE_ADDR) + return NULL; + return __va(efi.unaccepted); +} +#endif diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 01cb9692b160..85cc57cb6539 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -76,9 +76,18 @@ #else +#define UNWIND_HINT_UNDEFINED \ + UNWIND_HINT(UNWIND_HINT_TYPE_UNDEFINED, 0, 0, 0) + #define UNWIND_HINT_FUNC \ UNWIND_HINT(UNWIND_HINT_TYPE_FUNC, ORC_REG_SP, 8, 0) +#define UNWIND_HINT_SAVE \ + UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0) + +#define UNWIND_HINT_RESTORE \ + UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0) + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index d3e3197917be..5fa76c2ced51 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -177,6 +177,7 @@ struct uv_hub_info_s { unsigned short nr_possible_cpus; unsigned short nr_online_cpus; short memory_nid; + unsigned short *node_to_socket; }; /* CPU specific info with a pointer to the hub common info struct */ @@ -519,25 +520,30 @@ static inline int uv_socket_to_node(int socket) return _uv_socket_to_node(socket, uv_hub_info->socket_to_node); } +static inline int uv_pnode_to_socket(int pnode) +{ + unsigned short *p2s = uv_hub_info->pnode_to_socket; + + return p2s ? p2s[pnode - uv_hub_info->min_pnode] : pnode; +} + /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { unsigned int m_val = uv_hub_info->m_val; unsigned long base; - unsigned short sockid, node, *p2s; + unsigned short sockid; if (m_val) return __va(((unsigned long)pnode << m_val) | offset); - p2s = uv_hub_info->pnode_to_socket; - sockid = p2s ? p2s[pnode - uv_hub_info->min_pnode] : pnode; - node = uv_socket_to_node(sockid); + sockid = uv_pnode_to_socket(pnode); /* limit address of previous socket is our base, except node 0 is 0 */ - if (!node) + if (sockid == 0) return __va((unsigned long)offset); - base = (unsigned long)(uv_hub_info->gr_table[node - 1].limit); + base = (unsigned long)(uv_hub_info->gr_table[sockid - 1].limit); return __va(base << UV_GAM_RANGE_SHFT | offset); } @@ -644,7 +650,7 @@ static inline int uv_cpu_blade_processor_id(int cpu) /* Blade number to Node number (UV2..UV4 is 1:1) */ static inline int uv_blade_to_node(int blade) { - return blade; + return uv_socket_to_node(blade); } /* Blade number of current cpu. Numnbered 0 .. <#blades -1> */ @@ -656,23 +662,27 @@ static inline int uv_numa_blade_id(void) /* * Convert linux node number to the UV blade number. * .. Currently for UV2 thru UV4 the node and the blade are identical. - * .. If this changes then you MUST check references to this function! + * .. UV5 needs conversion when sub-numa clustering is enabled. */ static inline int uv_node_to_blade_id(int nid) { - return nid; + unsigned short *n2s = uv_hub_info->node_to_socket; + + return n2s ? n2s[nid] : nid; } /* Convert a CPU number to the UV blade number */ static inline int uv_cpu_to_blade_id(int cpu) { - return uv_node_to_blade_id(cpu_to_node(cpu)); + return uv_cpu_hub_info(cpu)->numa_blade_id; } /* Convert a blade id to the PNODE of the blade */ static inline int uv_blade_to_pnode(int bid) { - return uv_hub_info_list(uv_blade_to_node(bid))->pnode; + unsigned short *s2p = uv_hub_info->socket_to_pnode; + + return s2p ? s2p[bid] : bid; } /* Nid of memory node on blade. -1 if no blade-local memory */ diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 57fa67373262..bb45812889dd 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -4199,6 +4199,13 @@ union uvh_rh_gam_mmioh_overlay_config1_u { #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0 #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000007fffUL +/* UVH common defines */ +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK ( \ + is_uv(UV4A) ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK : \ + is_uv(UV4) ? UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK : \ + is_uv(UV3) ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK : \ + 0) + union uvh_rh_gam_mmioh_redirect_config0_u { unsigned long v; @@ -4247,8 +4254,8 @@ union uvh_rh_gam_mmioh_redirect_config0_u { 0) /* UV4A unique defines */ -#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0 -#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000000fffUL +#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0 +#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x0000000000000fffUL /* UV4 unique defines */ #define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0 @@ -4258,6 +4265,13 @@ union uvh_rh_gam_mmioh_redirect_config0_u { #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0 #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x0000000000007fffUL +/* UVH common defines */ +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK ( \ + is_uv(UV4A) ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK : \ + is_uv(UV4) ? UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK : \ + is_uv(UV3) ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK : \ + 0) + union uvh_rh_gam_mmioh_redirect_config1_u { unsigned long v; diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index 4cf6794f9d68..c81858d903dc 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -231,14 +231,19 @@ static u64 vread_pvclock(void) ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); } while (pvclock_read_retry(pvti, version)); - return ret; + return ret & S64_MAX; } #endif #ifdef CONFIG_HYPERV_TIMER static u64 vread_hvclock(void) { - return hv_read_tsc_page(&hvclock_page); + u64 tsc, time; + + if (hv_read_tsc_page_tsc(&hvclock_page, &tsc, &time)) + return time & S64_MAX; + + return U64_MAX; } #endif @@ -246,7 +251,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd) { if (likely(clock_mode == VDSO_CLOCKMODE_TSC)) - return (u64)rdtsc_ordered(); + return (u64)rdtsc_ordered() & S64_MAX; /* * For any memory-mapped vclock type, we need to make sure that gcc * doesn't cleverly hoist a load before the mode check. Otherwise we @@ -284,6 +289,9 @@ static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd) * which can be invalidated asynchronously and indicate invalidation by * returning U64_MAX, which can be effectively tested by checking for a * negative value after casting it to s64. + * + * This effectively forces a S64_MAX mask on the calculations, unlike the + * U64_MAX mask normally used by x86 clocksources. */ static inline bool arch_vdso_cycles_ok(u64 cycles) { @@ -303,18 +311,29 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) * @last. If not then use @last, which is the base time of the current * conversion period. * - * This variant also removes the masking of the subtraction because the - * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX - * which would result in a pointless operation. The compiler cannot - * optimize it away as the mask comes from the vdso data and is not compile - * time constant. + * This variant also uses a custom mask because while the clocksource mask of + * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses + * U64_MASK as an exception value, additionally arch_vdso_cycles_ok() above + * declares everything with the MSB/Sign-bit set as invalid. Therefore the + * effective mask is S64_MAX. */ static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) { - if (cycles > last) - return (cycles - last) * mult; - return 0; + /* + * Due to the MSB/Sign-bit being used as invald marker (see + * arch_vdso_cycles_valid() above), the effective mask is S64_MAX. + */ + u64 delta = (cycles - last) & S64_MAX; + + /* + * Due to the above mentioned TSC wobbles, filter out negative motion. + * Per the above masking, the effective sign bit is now bit 62. + */ + if (unlikely(delta & (1ULL << 62))) + return 0; + + return delta * mult; } #define vdso_calc_delta vdso_calc_delta diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 88085f369ff6..5240d88db52a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -150,7 +150,7 @@ struct x86_init_acpi { * @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status */ struct x86_guest { - void (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc); + bool (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc); bool (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc); bool (*enc_tlb_flush_required)(bool enc); bool (*enc_cache_flush_required)(void); @@ -177,11 +177,14 @@ struct x86_init_ops { * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device * @early_percpu_clock_init: early init of the per cpu clock event device + * @fixup_cpu_id: fixup function for cpuinfo_x86::phys_proc_id + * @parallel_bringup: Parallel bringup control */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); void (*early_percpu_clock_init)(void); void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); + bool parallel_bringup; }; struct timespec64; diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h index 376563f2bac1..3a8a8eb8ac3a 100644 --- a/arch/x86/include/uapi/asm/mtrr.h +++ b/arch/x86/include/uapi/asm/mtrr.h @@ -81,14 +81,6 @@ typedef __u8 mtrr_type; #define MTRR_NUM_FIXED_RANGES 88 #define MTRR_MAX_VAR_RANGES 256 -struct mtrr_state_type { - struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; - mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; - unsigned char enabled; - unsigned char have_fixed; - mtrr_type def_type; -}; - #define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) @@ -115,9 +107,9 @@ struct mtrr_state_type { #define MTRR_NUM_TYPES 7 /* - * Invalid MTRR memory type. mtrr_type_lookup() returns this value when - * MTRRs are disabled. Note, this value is allocated from the reserved - * values (0x7-0xff) of the MTRR memory types. + * Invalid MTRR memory type. No longer used outside of MTRR code. + * Note, this value is allocated from the reserved values (0x7-0xff) of + * the MTRR memory types. */ #define MTRR_TYPE_INVALID 0xff diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 1328c221af30..6dfecb27b846 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,6 +16,7 @@ #include <asm/cacheflush.h> #include <asm/realmode.h> #include <asm/hypervisor.h> +#include <asm/smp.h> #include <linux/ftrace.h> #include "../../realmode/rm/wakeup.h" @@ -127,7 +128,13 @@ int x86_acpi_suspend_lowlevel(void) * value is in the actual %rsp register. */ current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack); - smpboot_control = smp_processor_id(); + /* + * Ensure the CPU knows which one it is when it comes back, if + * it isn't in parallel mode and expected to work that out for + * itself. + */ + if (!(smpboot_control & STARTUP_PARALLEL_MASK)) + smpboot_control = smp_processor_id(); #endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0L; diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index 171a40c74db6..054c15a2f860 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -12,7 +12,6 @@ extern int wakeup_pmode_return; extern u8 wake_sleep_flags; -extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); extern void do_suspend_lowlevel(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f615e0cb6d93..2dcf3a06af09 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -37,11 +37,23 @@ EXPORT_SYMBOL_GPL(alternatives_patched); #define MAX_PATCH_LEN (255-1) -static int __initdata_or_module debug_alternative; +#define DA_ALL (~0) +#define DA_ALT 0x01 +#define DA_RET 0x02 +#define DA_RETPOLINE 0x04 +#define DA_ENDBR 0x08 +#define DA_SMP 0x10 + +static unsigned int __initdata_or_module debug_alternative; static int __init debug_alt(char *str) { - debug_alternative = 1; + if (str && *str == '=') + str++; + + if (!str || kstrtouint(str, 0, &debug_alternative)) + debug_alternative = DA_ALL; + return 1; } __setup("debug-alternative", debug_alt); @@ -55,15 +67,15 @@ static int __init setup_noreplace_smp(char *str) } __setup("noreplace-smp", setup_noreplace_smp); -#define DPRINTK(fmt, args...) \ +#define DPRINTK(type, fmt, args...) \ do { \ - if (debug_alternative) \ + if (debug_alternative & DA_##type) \ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) -#define DUMP_BYTES(buf, len, fmt, args...) \ +#define DUMP_BYTES(type, buf, len, fmt, args...) \ do { \ - if (unlikely(debug_alternative)) { \ + if (unlikely(debug_alternative & DA_##type)) { \ int j; \ \ if (!(len)) \ @@ -86,6 +98,11 @@ static const unsigned char x86nops[] = BYTES_NOP6, BYTES_NOP7, BYTES_NOP8, +#ifdef CONFIG_64BIT + BYTES_NOP9, + BYTES_NOP10, + BYTES_NOP11, +#endif }; const unsigned char * const x86_nops[ASM_NOP_MAX+1] = @@ -99,19 +116,44 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = x86nops + 1 + 2 + 3 + 4 + 5, x86nops + 1 + 2 + 3 + 4 + 5 + 6, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +#ifdef CONFIG_64BIT + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10, +#endif }; -/* Use this to add nops to a buffer, then text_poke the whole buffer. */ -static void __init_or_module add_nops(void *insns, unsigned int len) +/* + * Fill the buffer with a single effective instruction of size @len. + * + * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) + * for every single-byte NOP, try to generate the maximally available NOP of + * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for + * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and + * *jump* over instead of executing long and daft NOPs. + */ +static void __init_or_module add_nop(u8 *instr, unsigned int len) { - while (len > 0) { - unsigned int noplen = len; - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - memcpy(insns, x86_nops[noplen], noplen); - insns += noplen; - len -= noplen; + u8 *target = instr + len; + + if (!len) + return; + + if (len <= ASM_NOP_MAX) { + memcpy(instr, x86_nops[len], len); + return; + } + + if (len < 128) { + __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE); + instr += JMP8_INSN_SIZE; + } else { + __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE); + instr += JMP32_INSN_SIZE; } + + for (;instr < target; instr++) + *instr = INT3_INSN_OPCODE; } extern s32 __retpoline_sites[], __retpoline_sites_end[]; @@ -123,133 +165,223 @@ extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); /* - * Are we looking at a near JMP with a 1 or 4-byte displacement. + * Matches NOP and NOPL, not any of the other possible NOPs. */ -static inline bool is_jmp(const u8 opcode) +static bool insn_is_nop(struct insn *insn) { - return opcode == 0xeb || opcode == 0xe9; + /* Anything NOP, but no REP NOP */ + if (insn->opcode.bytes[0] == 0x90 && + (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) + return true; + + /* NOPL */ + if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) + return true; + + /* TODO: more nops */ + + return false; } -static void __init_or_module -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) +/* + * Find the offset of the first non-NOP instruction starting at @offset + * but no further than @len. + */ +static int skip_nops(u8 *instr, int offset, int len) { - u8 *next_rip, *tgt_rip; - s32 n_dspl, o_dspl; - int repl_len; + struct insn insn; - if (a->replacementlen != 5) - return; + for (; offset < len; offset += insn.length) { + if (insn_decode_kernel(&insn, &instr[offset])) + break; - o_dspl = *(s32 *)(insn_buff + 1); + if (!insn_is_nop(&insn)) + break; + } - /* next_rip of the replacement JMP */ - next_rip = repl_insn + a->replacementlen; - /* target rip of the replacement JMP */ - tgt_rip = next_rip + o_dspl; - n_dspl = tgt_rip - orig_insn; + return offset; +} - DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); +/* + * Optimize a sequence of NOPs, possibly preceded by an unconditional jump + * to the end of the NOP sequence into a single NOP. + */ +static bool __init_or_module +__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) +{ + int i = *next - insn->length; - if (tgt_rip - orig_insn >= 0) { - if (n_dspl - 2 <= 127) - goto two_byte_jmp; - else - goto five_byte_jmp; - /* negative offset */ - } else { - if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) - goto two_byte_jmp; - else - goto five_byte_jmp; + switch (insn->opcode.bytes[0]) { + case JMP8_INSN_OPCODE: + case JMP32_INSN_OPCODE: + *prev = i; + *target = *next + insn->immediate.value; + return false; } -two_byte_jmp: - n_dspl -= 2; + if (insn_is_nop(insn)) { + int nop = i; - insn_buff[0] = 0xeb; - insn_buff[1] = (s8)n_dspl; - add_nops(insn_buff + 2, 3); + *next = skip_nops(instr, *next, len); + if (*target && *next == *target) + nop = *prev; - repl_len = 2; - goto done; + add_nop(instr + nop, *next - nop); + DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next); + return true; + } -five_byte_jmp: - n_dspl -= 5; + *target = 0; + return false; +} - insn_buff[0] = 0xe9; - *(s32 *)&insn_buff[1] = n_dspl; +/* + * "noinline" to cause control flow change and thus invalidate I$ and + * cause refetch after modification. + */ +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) +{ + int prev, target = 0; - repl_len = 5; + for (int next, i = 0; i < len; i = next) { + struct insn insn; -done: + if (insn_decode_kernel(&insn, &instr[i])) + return; + + next = i + insn.length; - DPRINTK("final displ: 0x%08x, JMP 0x%lx", - n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); + __optimize_nops(instr, len, &insn, &next, &prev, &target); + } } /* - * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) + * In this context, "source" is where the instructions are placed in the + * section .altinstr_replacement, for example during kernel build by the + * toolchain. + * "Destination" is where the instructions are being patched in by this + * machinery. + * + * The source offset is: + * + * src_imm = target - src_next_ip (1) * - * @instr: instruction byte stream - * @instrlen: length of the above - * @off: offset within @instr where the first NOP has been detected + * and the target offset is: * - * Return: number of NOPs found (and replaced). + * dst_imm = target - dst_next_ip (2) + * + * so rework (1) as an expression for target like: + * + * target = src_imm + src_next_ip (1a) + * + * and substitute in (2) to get: + * + * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3) + * + * Now, since the instruction stream is 'identical' at src and dst (it + * is being copied after all) it can be stated that: + * + * src_next_ip = src + ip_offset + * dst_next_ip = dst + ip_offset (4) + * + * Substitute (4) in (3) and observe ip_offset being cancelled out to + * obtain: + * + * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset) + * = src_imm + src - dst + ip_offset - ip_offset + * = src_imm + src - dst (5) + * + * IOW, only the relative displacement of the code block matters. */ -static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) -{ - unsigned long flags; - int i = off, nnops; - while (i < instrlen) { - if (instr[i] != 0x90) - break; +#define apply_reloc_n(n_, p_, d_) \ + do { \ + s32 v = *(s##n_ *)(p_); \ + v += (d_); \ + BUG_ON((v >> 31) != (v >> (n_-1))); \ + *(s##n_ *)(p_) = (s##n_)v; \ + } while (0) - i++; + +static __always_inline +void apply_reloc(int n, void *ptr, uintptr_t diff) +{ + switch (n) { + case 1: apply_reloc_n(8, ptr, diff); break; + case 2: apply_reloc_n(16, ptr, diff); break; + case 4: apply_reloc_n(32, ptr, diff); break; + default: BUG(); } +} - nnops = i - off; +static __always_inline +bool need_reloc(unsigned long offset, u8 *src, size_t src_len) +{ + u8 *target = src + offset; + /* + * If the target is inside the patched block, it's relative to the + * block itself and does not need relocation. + */ + return (target < src || target > src + src_len); +} - if (nnops <= 1) - return nnops; +static void __init_or_module noinline +apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +{ + int prev, target = 0; - local_irq_save(flags); - add_nops(instr + off, nnops); - local_irq_restore(flags); + for (int next, i = 0; i < len; i = next) { + struct insn insn; - DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); + if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) + return; - return nnops; -} + next = i + insn.length; -/* - * "noinline" to cause control flow change and thus invalidate I$ and - * cause refetch after modification. - */ -static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) -{ - struct insn insn; - int i = 0; + if (__optimize_nops(buf, len, &insn, &next, &prev, &target)) + continue; - /* - * Jump over the non-NOP insns and optimize single-byte NOPs into bigger - * ones. - */ - for (;;) { - if (insn_decode_kernel(&insn, &instr[i])) - return; + switch (insn.opcode.bytes[0]) { + case 0x0f: + if (insn.opcode.bytes[1] < 0x80 || + insn.opcode.bytes[1] > 0x8f) + break; - /* - * See if this and any potentially following NOPs can be - * optimized. - */ - if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - i += optimize_nops_range(instr, len, i); - else - i += insn.length; + fallthrough; /* Jcc.d32 */ + case 0x70 ... 0x7f: /* Jcc.d8 */ + case JMP8_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case CALL_INSN_OPCODE: + if (need_reloc(next + insn.immediate.value, src, src_len)) { + apply_reloc(insn.immediate.nbytes, + buf + i + insn_offset_immediate(&insn), + src - dest); + } - if (i >= len) - return; + /* + * Where possible, convert JMP.d32 into JMP.d8. + */ + if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { + s32 imm = insn.immediate.value; + imm += src - dest; + imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; + if ((imm >> 31) == (imm >> 7)) { + buf[i+0] = JMP8_INSN_OPCODE; + buf[i+1] = (s8)imm; + + memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2); + } + } + break; + } + + if (insn_rip_relative(&insn)) { + if (need_reloc(next + insn.displacement.value, src, src_len)) { + apply_reloc(insn.displacement.nbytes, + buf + i + insn_offset_displacement(&insn), + src - dest); + } + } } } @@ -270,7 +402,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insn_buff[MAX_PATCH_LEN]; - DPRINTK("alt table %px, -> %px", start, end); + DPRINTK(ALT, "alt table %px, -> %px", start, end); /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. @@ -294,47 +426,31 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * - feature not present but ALT_FLAG_NOT is set to mean, * patch if feature is *NOT* present. */ - if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) - goto next; + if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { + optimize_nops(instr, a->instrlen); + continue; + } - DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", + DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", (a->flags & ALT_FLAG_NOT) ? "!" : "", a->cpuid >> 5, a->cpuid & 0x1f, instr, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); - DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); - memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; - /* - * 0xe8 is a relative jump; fix the offset. - * - * Instruction length is checked before the opcode to avoid - * accessing uninitialized bytes for zero-length replacements. - */ - if (a->replacementlen == 5 && *insn_buff == 0xe8) { - *(s32 *)(insn_buff + 1) += replacement - instr; - DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", - *(s32 *)(insn_buff + 1), - (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); - } - - if (a->replacementlen && is_jmp(replacement[0])) - recompute_jump(a, instr, replacement, insn_buff); - for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); + apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen); - text_poke_early(instr, insn_buff, insn_buff_sz); + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); -next: - optimize_nops(instr, a->instrlen); + text_poke_early(instr, insn_buff, insn_buff_sz); } } @@ -555,15 +671,15 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) continue; } - DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(bytes, len); - DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); - DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } @@ -590,13 +706,12 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; + /* Patch the custom return thunks... */ if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { - if (x86_return_thunk == __x86_return_thunk) - return -1; - i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { + /* ... or patch them out if not needed. */ bytes[i++] = RET_INSN_OPCODE; } @@ -609,6 +724,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; + /* + * Do not patch out the default return thunks if those needed are the + * ones generated by the compiler. + */ + if (cpu_feature_enabled(X86_FEATURE_RETHUNK) && + (x86_return_thunk == __x86_return_thunk)) + return; + for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; struct insn insn; @@ -630,14 +753,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) addr, dest, 5, addr)) continue; - DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", + DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); - DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } @@ -655,7 +778,9 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_endbr(void *addr, bool warn) +static void poison_cfi(void *addr); + +static void __init_or_module poison_endbr(void *addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); @@ -667,20 +792,23 @@ static void poison_endbr(void *addr, bool warn) return; } - DPRINTK("ENDBR at: %pS (%px)", addr, addr); + DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); /* * When we have IBT, the lack of ENDBR will trigger #CP */ - DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); - DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); + DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); text_poke_early(addr, &poison, 4); } /* * Generated by: objtool --ibt + * + * Seal the functions for indirect calls by clobbering the ENDBR instructions + * and the kCFI hash value. */ -void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { s32 *s; @@ -689,13 +817,13 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) poison_endbr(addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_endbr(addr - 16, false); + poison_cfi(addr - 16); } } #else -void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif /* CONFIG_X86_KERNEL_IBT */ @@ -940,6 +1068,17 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) return 0; } +static void cfi_rewrite_endbr(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + + poison_endbr(addr+16, false); + } +} + /* .retpoline_sites */ static int cfi_rand_callers(s32 *start, s32 *end) { @@ -1034,14 +1173,19 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_FINEIBT: + /* place the FineIBT preamble at func()-16 */ ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; + /* rewrite the callers to target func()-16 */ ret = cfi_rewrite_callers(start_retpoline, end_retpoline); if (ret) goto err; + /* now that nobody targets func()+0, remove ENDBR there */ + cfi_rewrite_endbr(start_cfi, end_cfi); + if (builtin) pr_info("Using FineIBT CFI\n"); return; @@ -1054,6 +1198,41 @@ err: pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } +static inline void poison_hash(void *addr) +{ + *(u32 *)addr = 0; +} + +static void poison_cfi(void *addr) +{ + switch (cfi_mode) { + case CFI_FINEIBT: + /* + * __cfi_\func: + * osp nopl (%rax) + * subl $0, %r10d + * jz 1f + * ud2 + * 1: nop + */ + poison_endbr(addr, false); + poison_hash(addr + fineibt_preamble_hash); + break; + + case CFI_KCFI: + /* + * __cfi_\func: + * movl $0, %eax + * .skip 11, 0x90 + */ + poison_hash(addr + 1); + break; + + default: + break; + } +} + #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, @@ -1061,6 +1240,10 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, { } +#ifdef CONFIG_X86_KERNEL_IBT +static void poison_cfi(void *addr) { } +#endif + #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, @@ -1148,7 +1331,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; - DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); @@ -1225,6 +1408,20 @@ int alternatives_text_reserved(void *start, void *end) #endif /* CONFIG_SMP */ #ifdef CONFIG_PARAVIRT + +/* Use this to add nops to a buffer, then text_poke the whole buffer. */ +static void __init_or_module add_nops(void *insns, unsigned int len) +{ + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, x86_nops[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + void __init_or_module apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *end) { @@ -1332,6 +1529,35 @@ static noinline void __init int3_selftest(void) unregister_die_notifier(&int3_exception_nb); } +static __initdata int __alt_reloc_selftest_addr; + +__visible noinline void __init __alt_reloc_selftest(void *arg) +{ + WARN_ON(arg != &__alt_reloc_selftest_addr); +} + +static noinline void __init alt_reloc_selftest(void) +{ + /* + * Tests apply_relocation(). + * + * This has a relative immediate (CALL) in a place other than the first + * instruction and additionally on x86_64 we get a RIP-relative LEA: + * + * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c + * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 + * + * Getting this wrong will either crash and burn or tickle the WARN + * above. + */ + asm_inline volatile ( + ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) + : /* output */ + : [mem] "m" (__alt_reloc_selftest_addr) + : _ASM_ARG1 + ); +} + void __init alternative_instructions(void) { int3_selftest(); @@ -1399,7 +1625,10 @@ void __init alternative_instructions(void) */ callthunks_patch_builtin_calls(); - apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + /* + * Seal all functions that do not have their address taken. + */ + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ @@ -1419,6 +1648,8 @@ void __init alternative_instructions(void) restart_nmi(); alternatives_patched = 1; + + alt_reloc_selftest(); } /** @@ -1799,7 +2030,7 @@ struct bp_patching_desc *try_get_desc(void) { struct bp_patching_desc *desc = &bp_desc; - if (!arch_atomic_inc_not_zero(&desc->refs)) + if (!raw_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; @@ -1810,7 +2041,7 @@ static __always_inline void put_desc(void) struct bp_patching_desc *desc = &bp_desc; smp_mb__before_atomic(); - arch_atomic_dec(&desc->refs); + raw_atomic_dec(&desc->refs); } static __always_inline void *text_poke_addr(struct text_poke_loc *tp) @@ -1954,6 +2185,16 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries atomic_set_release(&bp_desc.refs, 1); /* + * Function tracing can enable thousands of places that need to be + * updated. This can take quite some time, and with full kernel debugging + * enabled, this could cause the softlockup watchdog to trigger. + * This function gets called every 256 entries added to be patched. + * Call cond_resched() here to make sure that other tasks can get scheduled + * while processing all the functions being patched. + */ + cond_resched(); + + /* * Corresponding read barrier in int3 notifier for making sure the * nr_entries and handler are correctly ordered wrt. patching. */ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 7e331e8f3692..035a3db5330b 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,28 +15,31 @@ #include <linux/pci_ids.h> #include <asm/amd_nb.h> -#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 -#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 -#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 -#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 -#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 -#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 -#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 -#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 -#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 -#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 -#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 -#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 -#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e -#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 -#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 -#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc +#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 +#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 +#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 +#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 +#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 +#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 +#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 +#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 +#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 +#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb + +#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 +#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec +#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c +#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 +#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 +#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 +#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 +#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d +#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e +#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 +#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 +#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc +#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); @@ -53,6 +56,7 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, {} }; @@ -81,6 +85,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, {} }; @@ -101,6 +106,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, {} }; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 770557110051..af49e24b46a4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -101,6 +101,9 @@ static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP; */ static bool virt_ext_dest_id __ro_after_init; +/* For parallel bootup. */ +unsigned long apic_mmio_base __ro_after_init; + /* * Map cpu index to physical APIC ID */ @@ -2163,6 +2166,7 @@ void __init register_lapic_address(unsigned long address) if (!x2apic_mode) { set_fixmap_nocache(FIX_APIC_BASE, address); + apic_mmio_base = APIC_BASE; apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", APIC_BASE, address); } @@ -2376,7 +2380,7 @@ static int nr_logical_cpuids = 1; /* * Used to store mapping between logical CPU IDs and APIC IDs. */ -static int cpuid_to_apicid[] = { +int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; @@ -2386,20 +2390,31 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id) } #ifdef CONFIG_SMP -/** - * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread - * @apicid: APIC ID to check +static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) +{ + /* Isolate the SMT bit(s) in the APICID and check for 0 */ + u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1; + + if (smp_num_siblings == 1 || !(apicid & mask)) + cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); +} + +/* + * Due to the utter mess of CPUID evaluation smp_num_siblings is not valid + * during early boot. Initialize the primary thread mask before SMP + * bringup. */ -bool apic_id_is_primary_thread(unsigned int apicid) +static int __init smp_init_primary_thread_mask(void) { - u32 mask; + unsigned int cpu; - if (smp_num_siblings == 1) - return true; - /* Isolate the SMT bit(s) in the APICID and check for 0 */ - mask = (1U << (fls(smp_num_siblings) - 1)) - 1; - return !(apicid & mask); + for (cpu = 0; cpu < nr_logical_cpuids; cpu++) + cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]); + return 0; } +early_initcall(smp_init_primary_thread_mask); +#else +static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } #endif /* @@ -2544,6 +2559,9 @@ int generic_processor_info(int apicid, int version) set_cpu_present(cpu, true); num_processors++; + if (system_state != SYSTEM_BOOTING) + cpu_mark_primary_thread(cpu, apicid); + return cpu; } diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6bde05a86b4e..896bc41cb2ba 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -97,7 +97,10 @@ static void init_x2apic_ldr(void) static int x2apic_phys_probe(void) { - if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) + if (!x2apic_mode) + return 0; + + if (x2apic_phys || x2apic_fadt_phys()) return 1; return apic == &apic_x2apic_phys; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 482855227964..d9384d5b4b8e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -546,7 +546,6 @@ unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); /* The following values are used for the per node hub info struct */ -static __initdata unsigned short *_node_to_pnode; static __initdata unsigned short _min_socket, _max_socket; static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; static __initdata struct uv_gam_range_entry *uv_gre_table; @@ -554,6 +553,7 @@ static __initdata struct uv_gam_parameters *uv_gp_table; static __initdata unsigned short *_socket_to_node; static __initdata unsigned short *_socket_to_pnode; static __initdata unsigned short *_pnode_to_socket; +static __initdata unsigned short *_node_to_socket; static __initdata struct uv_gam_range_s *_gr_table; @@ -617,7 +617,8 @@ static __init void build_uv_gr_table(void) bytes = _gr_table_len * sizeof(struct uv_gam_range_s); grt = kzalloc(bytes, GFP_KERNEL); - BUG_ON(!grt); + if (WARN_ON_ONCE(!grt)) + return; _gr_table = grt; for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { @@ -1022,7 +1023,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index, switch (index) { case UVY_MMIOH0: mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0; - nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK; n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; min_nasid = min_pnode; max_nasid = max_pnode; @@ -1030,7 +1031,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index, break; case UVY_MMIOH1: mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1; - nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK; n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; min_nasid = min_pnode; max_nasid = max_pnode; @@ -1038,7 +1039,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index, break; case UVX_MMIOH0: mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0; - nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK; n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; min_nasid = min_pnode * 2; max_nasid = max_pnode * 2; @@ -1046,7 +1047,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index, break; case UVX_MMIOH1: mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1; - nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK; n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; min_nasid = min_pnode * 2; max_nasid = max_pnode * 2; @@ -1072,8 +1073,9 @@ static void __init calc_mmioh_map(enum mmioh_arch index, /* Invalid NASID check */ if (nasid < min_nasid || max_nasid < nasid) { - pr_err("UV:%s:Invalid NASID:%x (range:%x..%x)\n", - __func__, index, min_nasid, max_nasid); + /* Not an error: unused table entries get "poison" values */ + pr_debug("UV:%s:Invalid NASID(%x):%x (range:%x..%x)\n", + __func__, index, nasid, min_nasid, max_nasid); nasid = -1; } @@ -1292,6 +1294,7 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->nasid_shift = uv_cpuid.nasid_shift; hi->min_pnode = _min_pnode; hi->min_socket = _min_socket; + hi->node_to_socket = _node_to_socket; hi->pnode_to_socket = _pnode_to_socket; hi->socket_to_node = _socket_to_node; hi->socket_to_pnode = _socket_to_pnode; @@ -1348,7 +1351,7 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; unsigned long lgre = 0, gend = 0; int index = 0; - int sock_min = 999999, pnode_min = 99999; + int sock_min = INT_MAX, pnode_min = INT_MAX; int sock_max = -1, pnode_max = -1; uv_gre_table = gre; @@ -1459,11 +1462,37 @@ static int __init decode_uv_systab(void) return 0; } +/* + * Given a bitmask 'bits' representing presnt blades, numbered + * starting at 'base', masking off unused high bits of blade number + * with 'mask', update the minimum and maximum blade numbers that we + * have found. (Masking with 'mask' necessary because of BIOS + * treatment of system partitioning when creating this table we are + * interpreting.) + */ +static inline void blade_update_min_max(unsigned long bits, int base, int mask, int *min, int *max) +{ + int first, last; + + if (!bits) + return; + first = (base + __ffs(bits)) & mask; + last = (base + __fls(bits)) & mask; + + if (*min > first) + *min = first; + if (*max < last) + *max = last; +} + /* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */ static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) { unsigned long np; int i, uv_pb = 0; + int sock_min = INT_MAX, sock_max = -1, s_mask; + + s_mask = (1 << uv_cpuid.n_skt) - 1; if (UVH_NODE_PRESENT_TABLE) { pr_info("UV: NODE_PRESENT_DEPTH = %d\n", @@ -1471,35 +1500,82 @@ static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np); - uv_pb += hweight64(np); + blade_update_min_max(np, i * 64, s_mask, &sock_min, &sock_max); } } if (UVH_NODE_PRESENT_0) { np = uv_read_local_mmr(UVH_NODE_PRESENT_0); pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np); - uv_pb += hweight64(np); + blade_update_min_max(np, 0, s_mask, &sock_min, &sock_max); } if (UVH_NODE_PRESENT_1) { np = uv_read_local_mmr(UVH_NODE_PRESENT_1); pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np); - uv_pb += hweight64(np); + blade_update_min_max(np, 64, s_mask, &sock_min, &sock_max); + } + + /* Only update if we actually found some bits indicating blades present */ + if (sock_max >= sock_min) { + _min_socket = sock_min; + _max_socket = sock_max; + uv_pb = sock_max - sock_min + 1; } if (uv_possible_blades != uv_pb) uv_possible_blades = uv_pb; - pr_info("UV: number nodes/possible blades %d\n", uv_pb); + pr_info("UV: number nodes/possible blades %d (%d - %d)\n", + uv_pb, sock_min, sock_max); +} + +static int __init alloc_conv_table(int num_elem, unsigned short **table) +{ + int i; + size_t bytes; + + bytes = num_elem * sizeof(*table[0]); + *table = kmalloc(bytes, GFP_KERNEL); + if (WARN_ON_ONCE(!*table)) + return -ENOMEM; + for (i = 0; i < num_elem; i++) + ((unsigned short *)*table)[i] = SOCK_EMPTY; + return 0; } +/* Remove conversion table if it's 1:1 */ +#define FREE_1_TO_1_TABLE(tbl, min, max, max2) free_1_to_1_table(&tbl, #tbl, min, max, max2) + +static void __init free_1_to_1_table(unsigned short **tp, char *tname, int min, int max, int max2) +{ + int i; + unsigned short *table = *tp; + + if (table == NULL) + return; + if (max != max2) + return; + for (i = 0; i < max; i++) { + if (i != table[i]) + return; + } + kfree(table); + *tp = NULL; + pr_info("UV: %s is 1:1, conversion table removed\n", tname); +} + +/* + * Build Socket Tables + * If the number of nodes is >1 per socket, socket to node table will + * contain lowest node number on that socket. + */ static void __init build_socket_tables(void) { struct uv_gam_range_entry *gre = uv_gre_table; - int num, nump; + int nums, numn, nump; int cpu, i, lnid; int minsock = _min_socket; int maxsock = _max_socket; int minpnode = _min_pnode; int maxpnode = _max_pnode; - size_t bytes; if (!gre) { if (is_uv2_hub() || is_uv3_hub()) { @@ -1507,39 +1583,36 @@ static void __init build_socket_tables(void) return; } pr_err("UV: Error: UVsystab address translations not available!\n"); - BUG(); + WARN_ON_ONCE(!gre); + return; } - /* Build socket id -> node id, pnode */ - num = maxsock - minsock + 1; - bytes = num * sizeof(_socket_to_node[0]); - _socket_to_node = kmalloc(bytes, GFP_KERNEL); - _socket_to_pnode = kmalloc(bytes, GFP_KERNEL); - + numn = num_possible_nodes(); nump = maxpnode - minpnode + 1; - bytes = nump * sizeof(_pnode_to_socket[0]); - _pnode_to_socket = kmalloc(bytes, GFP_KERNEL); - BUG_ON(!_socket_to_node || !_socket_to_pnode || !_pnode_to_socket); - - for (i = 0; i < num; i++) - _socket_to_node[i] = _socket_to_pnode[i] = SOCK_EMPTY; - - for (i = 0; i < nump; i++) - _pnode_to_socket[i] = SOCK_EMPTY; + nums = maxsock - minsock + 1; + + /* Allocate and clear tables */ + if ((alloc_conv_table(nump, &_pnode_to_socket) < 0) + || (alloc_conv_table(nums, &_socket_to_pnode) < 0) + || (alloc_conv_table(numn, &_node_to_socket) < 0) + || (alloc_conv_table(nums, &_socket_to_node) < 0)) { + kfree(_pnode_to_socket); + kfree(_socket_to_pnode); + kfree(_node_to_socket); + return; + } /* Fill in pnode/node/addr conversion list values: */ - pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; i = gre->sockid - minsock; - /* Duplicate: */ - if (_socket_to_pnode[i] != SOCK_EMPTY) - continue; - _socket_to_pnode[i] = gre->pnode; + if (_socket_to_pnode[i] == SOCK_EMPTY) + _socket_to_pnode[i] = gre->pnode; i = gre->pnode - minpnode; - _pnode_to_socket[i] = gre->sockid; + if (_pnode_to_socket[i] == SOCK_EMPTY) + _pnode_to_socket[i] = gre->sockid; pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, @@ -1549,66 +1622,39 @@ static void __init build_socket_tables(void) /* Set socket -> node values: */ lnid = NUMA_NO_NODE; - for_each_present_cpu(cpu) { + for_each_possible_cpu(cpu) { int nid = cpu_to_node(cpu); int apicid, sockid; if (lnid == nid) continue; lnid = nid; + apicid = per_cpu(x86_cpu_to_apicid, cpu); sockid = apicid >> uv_cpuid.socketid_shift; - _socket_to_node[sockid - minsock] = nid; - pr_info("UV: sid:%02x: apicid:%04x node:%2d\n", - sockid, apicid, nid); - } - /* Set up physical blade to pnode translation from GAM Range Table: */ - bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]); - _node_to_pnode = kmalloc(bytes, GFP_KERNEL); - BUG_ON(!_node_to_pnode); + if (_socket_to_node[sockid - minsock] == SOCK_EMPTY) + _socket_to_node[sockid - minsock] = nid; - for (lnid = 0; lnid < num_possible_nodes(); lnid++) { - unsigned short sockid; + if (_node_to_socket[nid] == SOCK_EMPTY) + _node_to_socket[nid] = sockid; - for (sockid = minsock; sockid <= maxsock; sockid++) { - if (lnid == _socket_to_node[sockid - minsock]) { - _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock]; - break; - } - } - if (sockid > maxsock) { - pr_err("UV: socket for node %d not found!\n", lnid); - BUG(); - } + pr_info("UV: sid:%02x: apicid:%04x socket:%02d node:%03x s2n:%03x\n", + sockid, + apicid, + _node_to_socket[nid], + nid, + _socket_to_node[sockid - minsock]); } /* - * If socket id == pnode or socket id == node for all nodes, + * If e.g. socket id == pnode for all pnodes, * system runs faster by removing corresponding conversion table. */ - pr_info("UV: Checking socket->node/pnode for identity maps\n"); - if (minsock == 0) { - for (i = 0; i < num; i++) - if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i]) - break; - if (i >= num) { - kfree(_socket_to_node); - _socket_to_node = NULL; - pr_info("UV: 1:1 socket_to_node table removed\n"); - } - } - if (minsock == minpnode) { - for (i = 0; i < num; i++) - if (_socket_to_pnode[i] != SOCK_EMPTY && - _socket_to_pnode[i] != i + minpnode) - break; - if (i >= num) { - kfree(_socket_to_pnode); - _socket_to_pnode = NULL; - pr_info("UV: 1:1 socket_to_pnode table removed\n"); - } - } + FREE_1_TO_1_TABLE(_socket_to_node, _min_socket, nums, numn); + FREE_1_TO_1_TABLE(_node_to_socket, _min_socket, nums, numn); + FREE_1_TO_1_TABLE(_socket_to_pnode, _min_pnode, nums, nump); + FREE_1_TO_1_TABLE(_pnode_to_socket, _min_pnode, nums, nump); } /* Check which reboot to use */ @@ -1692,12 +1738,13 @@ static __init int uv_system_init_hubless(void) static void __init uv_system_init_hub(void) { struct uv_hub_info_s hub_info = {0}; - int bytes, cpu, nodeid; - unsigned short min_pnode = 9999, max_pnode = 0; + int bytes, cpu, nodeid, bid; + unsigned short min_pnode = USHRT_MAX, max_pnode = 0; char *hub = is_uv5_hub() ? "UV500" : is_uv4_hub() ? "UV400" : is_uv3_hub() ? "UV300" : is_uv2_hub() ? "UV2000/3000" : NULL; + struct uv_hub_info_s **uv_hub_info_list_blade; if (!hub) { pr_err("UV: Unknown/unsupported UV hub\n"); @@ -1720,9 +1767,12 @@ static void __init uv_system_init_hub(void) build_uv_gr_table(); set_block_size(); uv_init_hub_info(&hub_info); - uv_possible_blades = num_possible_nodes(); - if (!_node_to_pnode) + /* If UV2 or UV3 may need to get # blades from HW */ + if (is_uv(UV2|UV3) && !uv_gre_table) boot_init_possible_blades(&hub_info); + else + /* min/max sockets set in decode_gam_rng_tbl */ + uv_possible_blades = (_max_socket - _min_socket) + 1; /* uv_num_possible_blades() is really the hub count: */ pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus()); @@ -1731,79 +1781,98 @@ static void __init uv_system_init_hub(void) hub_info.coherency_domain_number = sn_coherency_id; uv_rtc_init(); + /* + * __uv_hub_info_list[] is indexed by node, but there is only + * one hub_info structure per blade. First, allocate one + * structure per blade. Further down we create a per-node + * table (__uv_hub_info_list[]) pointing to hub_info + * structures for the correct blade. + */ + bytes = sizeof(void *) * uv_num_possible_blades(); - __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL); - BUG_ON(!__uv_hub_info_list); + uv_hub_info_list_blade = kzalloc(bytes, GFP_KERNEL); + if (WARN_ON_ONCE(!uv_hub_info_list_blade)) + return; bytes = sizeof(struct uv_hub_info_s); - for_each_node(nodeid) { + for_each_possible_blade(bid) { struct uv_hub_info_s *new_hub; - if (__uv_hub_info_list[nodeid]) { - pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid); - BUG(); + /* Allocate & fill new per hub info list */ + new_hub = (bid == 0) ? &uv_hub_info_node0 + : kzalloc_node(bytes, GFP_KERNEL, uv_blade_to_node(bid)); + if (WARN_ON_ONCE(!new_hub)) { + /* do not kfree() bid 0, which is statically allocated */ + while (--bid > 0) + kfree(uv_hub_info_list_blade[bid]); + kfree(uv_hub_info_list_blade); + return; } - /* Allocate new per hub info list */ - new_hub = (nodeid == 0) ? &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid); - BUG_ON(!new_hub); - __uv_hub_info_list[nodeid] = new_hub; - new_hub = uv_hub_info_list(nodeid); - BUG_ON(!new_hub); + uv_hub_info_list_blade[bid] = new_hub; *new_hub = hub_info; /* Use information from GAM table if available: */ - if (_node_to_pnode) - new_hub->pnode = _node_to_pnode[nodeid]; + if (uv_gre_table) + new_hub->pnode = uv_blade_to_pnode(bid); else /* Or fill in during CPU loop: */ new_hub->pnode = 0xffff; - new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); + new_hub->numa_blade_id = bid; new_hub->memory_nid = NUMA_NO_NODE; new_hub->nr_possible_cpus = 0; new_hub->nr_online_cpus = 0; } + /* + * Now populate __uv_hub_info_list[] for each node with the + * pointer to the struct for the blade it resides on. + */ + + bytes = sizeof(void *) * num_possible_nodes(); + __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL); + if (WARN_ON_ONCE(!__uv_hub_info_list)) { + for_each_possible_blade(bid) + /* bid 0 is statically allocated */ + if (bid != 0) + kfree(uv_hub_info_list_blade[bid]); + kfree(uv_hub_info_list_blade); + return; + } + + for_each_node(nodeid) + __uv_hub_info_list[nodeid] = uv_hub_info_list_blade[uv_node_to_blade_id(nodeid)]; + /* Initialize per CPU info: */ for_each_possible_cpu(cpu) { - int apicid = per_cpu(x86_cpu_to_apicid, cpu); - int numa_node_id; + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + unsigned short bid; unsigned short pnode; - nodeid = cpu_to_node(cpu); - numa_node_id = numa_cpu_node(cpu); pnode = uv_apicid_to_pnode(apicid); + bid = uv_pnode_to_socket(pnode) - _min_socket; - uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); + uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list_blade[bid]; uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE) uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); - /* Init memoryless node: */ - if (nodeid != numa_node_id && - uv_hub_info_list(numa_node_id)->pnode == 0xffff) - uv_hub_info_list(numa_node_id)->pnode = pnode; - else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) + if (uv_cpu_hub_info(cpu)->pnode == 0xffff) uv_cpu_hub_info(cpu)->pnode = pnode; } - for_each_node(nodeid) { - unsigned short pnode = uv_hub_info_list(nodeid)->pnode; + for_each_possible_blade(bid) { + unsigned short pnode = uv_hub_info_list_blade[bid]->pnode; - /* Add pnode info for pre-GAM list nodes without CPUs: */ - if (pnode == 0xffff) { - unsigned long paddr; + if (pnode == 0xffff) + continue; - paddr = node_start_pfn(nodeid) << PAGE_SHIFT; - pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); - uv_hub_info_list(nodeid)->pnode = pnode; - } min_pnode = min(pnode, min_pnode); max_pnode = max(pnode, max_pnode); - pr_info("UV: UVHUB node:%2d pn:%02x nrcpus:%d\n", - nodeid, - uv_hub_info_list(nodeid)->pnode, - uv_hub_info_list(nodeid)->nr_possible_cpus); + pr_info("UV: HUB:%2d pn:%02x nrcpus:%d\n", + bid, + uv_hub_info_list_blade[bid]->pnode, + uv_hub_info_list_blade[bid]->nr_possible_cpus); } pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode); @@ -1811,6 +1880,9 @@ static void __init uv_system_init_hub(void) map_mmr_high(max_pnode); map_mmioh_high(min_pnode, max_pnode); + kfree(uv_hub_info_list_blade); + uv_hub_info_list_blade = NULL; + uv_nmi_setup(); uv_cpu_init(); uv_setup_proc_files(0); diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 22ab13966427..c06bfc086565 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -133,8 +133,8 @@ static bool skip_addr(void *dest) /* Accounts directly */ if (dest == ret_from_fork) return true; -#ifdef CONFIG_HOTPLUG_CPU - if (dest == start_cpu0) +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT) + if (dest == soft_restart_cpu) return true; #endif #ifdef CONFIG_FUNCTION_TRACER @@ -293,7 +293,8 @@ void *callthunks_translate_call_dest(void *dest) return target ? : dest; } -bool is_callthunk(void *addr) +#ifdef CONFIG_BPF_JIT +static bool is_callthunk(void *addr) { unsigned int tmpl_size = SKL_TMPL_SIZE; void *tmpl = skl_call_thunk_template; @@ -306,7 +307,6 @@ bool is_callthunk(void *addr) return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); } -#ifdef CONFIG_BPF_JIT int x86_call_depth_emit_accounting(u8 **pprog, void *func) { unsigned int tmpl_size = SKL_TMPL_SIZE; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d7e3ceaf75c1..4350f6bfc064 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -27,7 +27,7 @@ obj-y += cpuid-deps.o obj-y += umwait.o obj-$(CONFIG_PROC_FS) += proc.o -obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o +obj-y += capflags.o powerflags.o obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o ifdef CONFIG_CPU_SUP_INTEL @@ -54,7 +54,6 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o obj-$(CONFIG_ACRN_GUEST) += acrn.o -ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ @@ -63,5 +62,4 @@ vmxfeature = $(src)/../../include/asm/vmxfeatures.h $(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) -endif targets += capflags.c diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 182af64387d0..9e2a91830f72 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -9,7 +9,6 @@ * - Andrew D. Balsa (code cleanup). */ #include <linux/init.h> -#include <linux/utsname.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/nospec.h> @@ -27,8 +26,6 @@ #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> -#include <asm/alternative.h> -#include <asm/set_memory.h> #include <asm/intel-family.h> #include <asm/e820/api.h> #include <asm/hypervisor.h> @@ -125,21 +122,8 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); EXPORT_SYMBOL_GPL(mmio_stale_data_clear); -void __init check_bugs(void) +void __init cpu_select_mitigations(void) { - identify_boot_cpu(); - - /* - * identify_boot_cpu() initialized SMT support information, let the - * core code know. - */ - cpu_smt_check_topology(); - - if (!IS_ENABLED(CONFIG_SMP)) { - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); - } - /* * Read the SPEC_CTRL MSR to account for reserved bits which may * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD @@ -176,39 +160,6 @@ void __init check_bugs(void) md_clear_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); - - arch_smt_update(); - -#ifdef CONFIG_X86_32 - /* - * Check whether we are able to run this kernel safely on SMP. - * - * - i386 is no longer supported. - * - In order to run on anything without a TSC, we need to be - * compiled for a i486. - */ - if (boot_cpu_data.x86 < 4) - panic("Kernel requires i486+ for 'invlpg' and other features"); - - init_utsname()->machine[1] = - '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); - alternative_instructions(); - - fpu__init_check_bugs(); -#else /* CONFIG_X86_64 */ - alternative_instructions(); - - /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. - */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -#endif } /* diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 4063e8991211..8f86eacf69f7 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -39,6 +39,8 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Shared L2 cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); +static cpumask_var_t cpu_cacheinfo_mask; + /* Kernel controls MTRR and/or PAT MSRs. */ unsigned int memory_caching_control __ro_after_init; @@ -1172,8 +1174,10 @@ void cache_bp_restore(void) cache_cpu_init(); } -static int cache_ap_init(unsigned int cpu) +static int cache_ap_online(unsigned int cpu) { + cpumask_set_cpu(cpu, cpu_cacheinfo_mask); + if (!memory_caching_control || get_cache_aps_delayed_init()) return 0; @@ -1191,11 +1195,17 @@ static int cache_ap_init(unsigned int cpu) * lock to prevent MTRR entry changes */ stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL, - cpu_callout_mask); + cpu_cacheinfo_mask); return 0; } +static int cache_ap_offline(unsigned int cpu) +{ + cpumask_clear_cpu(cpu, cpu_cacheinfo_mask); + return 0; +} + /* * Delayed cache initialization for all AP's */ @@ -1210,9 +1220,12 @@ void cache_aps_init(void) static int __init cache_ap_register(void) { + zalloc_cpumask_var(&cpu_cacheinfo_mask, GFP_KERNEL); + cpumask_set_cpu(smp_processor_id(), cpu_cacheinfo_mask); + cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING, "x86/cachectrl:starting", - cache_ap_init, NULL); + cache_ap_online, cache_ap_offline); return 0; } -core_initcall(cache_ap_register); +early_initcall(cache_ap_register); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80710a68ef7d..52683fddafaf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,12 +18,16 @@ #include <linux/init.h> #include <linux/kprobes.h> #include <linux/kgdb.h> +#include <linux/mem_encrypt.h> #include <linux/smp.h> +#include <linux/cpu.h> #include <linux/io.h> #include <linux/syscore_ops.h> #include <linux/pgtable.h> #include <linux/stackprotector.h> +#include <linux/utsname.h> +#include <asm/alternative.h> #include <asm/cmdline.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> @@ -59,7 +63,7 @@ #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/uv/uv.h> -#include <asm/sigframe.h> +#include <asm/set_memory.h> #include <asm/traps.h> #include <asm/sev.h> @@ -67,14 +71,6 @@ u32 elf_hwcap2 __read_mostly; -/* all of these masks are initialized in setup_cpu_local_masks() */ -cpumask_var_t cpu_initialized_mask; -cpumask_var_t cpu_callout_mask; -cpumask_var_t cpu_callin_mask; - -/* representing cpus for which sibling maps can be computed */ -cpumask_var_t cpu_sibling_setup_mask; - /* Number of siblings per CPU package */ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); @@ -169,15 +165,6 @@ clear_ppin: clear_cpu_cap(c, info->feature); } -/* correctly size the local cpu masks */ -void __init setup_cpu_local_masks(void) -{ - alloc_bootmem_cpumask_var(&cpu_initialized_mask); - alloc_bootmem_cpumask_var(&cpu_callin_mask); - alloc_bootmem_cpumask_var(&cpu_callout_mask); - alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); -} - static void default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 @@ -1502,12 +1489,10 @@ static void __init cpu_parse_early_param(void) if (!kstrtouint(opt, 10, &bit)) { if (bit < NCAPINTS * 32) { -#ifdef CONFIG_X86_FEATURE_NAMES /* empty-string, i.e., ""-defined feature flags */ if (!x86_cap_flags[bit]) pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); else -#endif pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); setup_clear_cpu_cap(bit); @@ -1520,7 +1505,6 @@ static void __init cpu_parse_early_param(void) continue; } -#ifdef CONFIG_X86_FEATURE_NAMES for (bit = 0; bit < 32 * NCAPINTS; bit++) { if (!x86_cap_flag(bit)) continue; @@ -1537,7 +1521,6 @@ static void __init cpu_parse_early_param(void) if (!found) pr_cont(" (unknown: %s)", opt); -#endif } pr_cont("\n"); @@ -1600,10 +1583,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) sld_setup(c); - fpu__init_system(c); - - init_sigframe_size(); - #ifdef CONFIG_X86_32 /* * Regardless of whether PCID is enumerated, the SDM says @@ -2123,19 +2102,6 @@ static void dbg_restore_debug_regs(void) #define dbg_restore_debug_regs() #endif /* ! CONFIG_KGDB */ -static void wait_for_master_cpu(int cpu) -{ -#ifdef CONFIG_SMP - /* - * wait for ACK from master CPU before continuing - * with AP initialization - */ - WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)); - while (!cpumask_test_cpu(cpu, cpu_callout_mask)) - cpu_relax(); -#endif -} - static inline void setup_getcpu(int cpu) { unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); @@ -2158,11 +2124,7 @@ static inline void setup_getcpu(int cpu) } #ifdef CONFIG_X86_64 -static inline void ucode_cpu_init(int cpu) -{ - if (cpu) - load_ucode_ap(); -} +static inline void ucode_cpu_init(int cpu) { } static inline void tss_setup_ist(struct tss_struct *tss) { @@ -2239,8 +2201,6 @@ void cpu_init(void) struct task_struct *cur = current; int cpu = raw_smp_processor_id(); - wait_for_master_cpu(cpu); - ucode_cpu_init(cpu); #ifdef CONFIG_NUMA @@ -2285,26 +2245,12 @@ void cpu_init(void) doublefault_init_cpu_tss(); - fpu__init_cpu(); - if (is_uv_system()) uv_cpu_init(); load_fixmap_gdt(cpu); } -#ifdef CONFIG_SMP -void cpu_init_secondary(void) -{ - /* - * Relies on the BP having set-up the IDT tables, which are loaded - * on this CPU in cpu_init_exception_handling(). - */ - cpu_init_exception_handling(); - cpu_init(); -} -#endif - #ifdef CONFIG_MICROCODE_LATE_LOADING /** * store_cpu_caps() - Store a snapshot of CPU capabilities @@ -2362,3 +2308,69 @@ void arch_smt_update(void) /* Check whether IPI broadcasting can be enabled */ apic_smt_update(); } + +void __init arch_cpu_finalize_init(void) +{ + identify_boot_cpu(); + + /* + * identify_boot_cpu() initialized SMT support information, let the + * core code know. + */ + cpu_smt_check_topology(); + + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + + cpu_select_mitigations(); + + arch_smt_update(); + + if (IS_ENABLED(CONFIG_X86_32)) { + /* + * Check whether this is a real i386 which is not longer + * supported and fixup the utsname. + */ + if (boot_cpu_data.x86 < 4) + panic("Kernel requires i486+ for 'invlpg' and other features"); + + init_utsname()->machine[1] = + '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + } + + /* + * Must be before alternatives because it might set or clear + * feature bits. + */ + fpu__init_system(); + fpu__init_cpu(); + + alternative_instructions(); + + if (IS_ENABLED(CONFIG_X86_64)) { + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); + } else { + fpu__init_check_bugs(); + } + + /* + * This needs to be called before any devices perform DMA + * operations that might use the SWIOTLB bounce buffers. It will + * mark the bounce buffers as decrypted so that their usage will + * not cause "plain-text" data to be decrypted when accessed. It + * must be called after late_time_init() so that Hyper-V x86/x64 + * hypercalls work when the SWIOTLB bounce buffers are decrypted. + */ + mem_encrypt_init(); +} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index f97b0fe13da8..1c44630d4789 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -79,6 +79,7 @@ extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); +void cpu_select_mitigations(void); extern void x86_spec_ctrl_setup_ap(void); extern void update_srbds_msr(void); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 0b971f974096..5e74610b39e7 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -715,11 +715,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) bool amd_mce_is_memory_error(struct mce *m) { + enum smca_bank_types bank_type; /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; + bank_type = smca_get_bank_type(m->extcpu, m->bank); if (mce_flags.smca) - return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0; + return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0; return m->bank == 4 && xec == 0x8; } @@ -1050,7 +1052,7 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol if (bank_type >= N_SMCA_BANK_TYPES) return NULL; - if (b && bank_type == SMCA_UMC) { + if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) return smca_umc_block_names[b->block]; return NULL; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2eec60f50057..89e2aab5d34d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1022,12 +1022,12 @@ static noinstr int mce_start(int *no_way_out) if (!timeout) return ret; - arch_atomic_add(*no_way_out, &global_nwo); + raw_atomic_add(*no_way_out, &global_nwo); /* * Rely on the implied barrier below, such that global_nwo * is updated before mce_callin. */ - order = arch_atomic_inc_return(&mce_callin); + order = raw_atomic_inc_return(&mce_callin); arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); /* Enable instrumentation around calls to external facilities */ @@ -1036,10 +1036,10 @@ static noinstr int mce_start(int *no_way_out) /* * Wait for everyone. */ - while (arch_atomic_read(&mce_callin) != num_online_cpus()) { + while (raw_atomic_read(&mce_callin) != num_online_cpus()) { if (mce_timed_out(&timeout, "Timeout: Not all CPUs entered broadcast exception handler")) { - arch_atomic_set(&global_nwo, 0); + raw_atomic_set(&global_nwo, 0); goto out; } ndelay(SPINUNIT); @@ -1054,7 +1054,7 @@ static noinstr int mce_start(int *no_way_out) /* * Monarch: Starts executing now, the others wait. */ - arch_atomic_set(&mce_executing, 1); + raw_atomic_set(&mce_executing, 1); } else { /* * Subject: Now start the scanning loop one by one in @@ -1062,10 +1062,10 @@ static noinstr int mce_start(int *no_way_out) * This way when there are any shared banks it will be * only seen by one CPU before cleared, avoiding duplicates. */ - while (arch_atomic_read(&mce_executing) < order) { + while (raw_atomic_read(&mce_executing) < order) { if (mce_timed_out(&timeout, "Timeout: Subject CPUs unable to finish machine check processing")) { - arch_atomic_set(&global_nwo, 0); + raw_atomic_set(&global_nwo, 0); goto out; } ndelay(SPINUNIT); @@ -1075,7 +1075,7 @@ static noinstr int mce_start(int *no_way_out) /* * Cache the global no_way_out state. */ - *no_way_out = arch_atomic_read(&global_nwo); + *no_way_out = raw_atomic_read(&global_nwo); ret = order; @@ -1533,7 +1533,7 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (kill_current_task) + if (!mce_usable_address(&m)) queue_task_work(&m, msg, kill_me_now); else queue_task_work(&m, msg, kill_me_maybe); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index f5fdeb1e3606..87208e46f7ed 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -78,8 +78,6 @@ static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) if (sig == e->installed_cpu) return e->equiv_cpu; - - e++; } return 0; } @@ -596,11 +594,6 @@ void reload_ucode_amd(unsigned int cpu) } } } -static u16 __find_equiv_id(unsigned int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - return find_equiv_id(&equiv_table, uci->cpu_sig.sig); -} /* * a small, trivial cache of per-family ucode patches @@ -651,9 +644,11 @@ static void free_cache(void) static struct ucode_patch *find_patch(unsigned int cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u16 equiv_id; - equiv_id = __find_equiv_id(cpu); + + equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); if (!equiv_id) return NULL; @@ -705,7 +700,7 @@ static enum ucode_state apply_microcode_amd(int cpu) rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); /* need to apply patch? */ - if (rev >= mc_amd->hdr.patch_id) { + if (rev > mc_amd->hdr.patch_id) { ret = UCODE_OK; goto out; } diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index cc4f9f1cb94c..aee4bc5ad496 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := mtrr.o if.o generic.o cleanup.o -obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o +obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o legacy.o diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index eff6ac62c0ff..ef3e8e42b782 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -110,7 +110,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) } const struct mtrr_ops amd_mtrr_ops = { - .vendor = X86_VENDOR_AMD, + .var_regs = 2, .set = amd_set_mtrr, .get = amd_get_mtrr, .get_free_region = generic_get_free_region, diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index b8a74eddde83..6f6c3ae92943 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -45,15 +45,6 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) return -ENOSPC; } -/* - * Report boot time MCR setups - */ -void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) -{ - centaur_mcr[mcr].low = lo; - centaur_mcr[mcr].high = hi; -} - static void centaur_get_mcr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type * type) @@ -112,7 +103,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t } const struct mtrr_ops centaur_mtrr_ops = { - .vendor = X86_VENDOR_CENTAUR, + .var_regs = 8, .set = centaur_set_mcr, .get = centaur_get_mcr, .get_free_region = centaur_get_free_region, diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index b5f43049fa5f..18cf79d6e2c5 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -55,9 +55,6 @@ static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; -static int __initdata debug_print; -#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0) - #define BIOS_BUG_MSG \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" @@ -79,12 +76,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, base, base + size); } - if (debug_print) { - pr_debug("After WB checking\n"); - for (i = 0; i < nr_range; i++) - pr_debug("MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end); - } + + Dprintk("After WB checking\n"); + for (i = 0; i < nr_range; i++) + Dprintk("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); /* Take out UC ranges: */ for (i = 0; i < num_var_ranges; i++) { @@ -112,24 +108,22 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, subtract_range(range, RANGE_NUM, extra_remove_base, extra_remove_base + extra_remove_size); - if (debug_print) { - pr_debug("After UC checking\n"); - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - pr_debug("MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end); - } + Dprintk("After UC checking\n"); + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + + Dprintk("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); } /* sort the ranges */ nr_range = clean_sort_range(range, RANGE_NUM); - if (debug_print) { - pr_debug("After sorting\n"); - for (i = 0; i < nr_range; i++) - pr_debug("MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end); - } + + Dprintk("After sorting\n"); + for (i = 0; i < nr_range; i++) + Dprintk("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); return nr_range; } @@ -164,16 +158,9 @@ static int __init enable_mtrr_cleanup_setup(char *str) } early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); -static int __init mtrr_cleanup_debug_setup(char *str) -{ - debug_print = 1; - return 0; -} -early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); - static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned int address_bits) + unsigned char type) { u32 base_lo, base_hi, mask_lo, mask_hi; u64 base, mask; @@ -183,7 +170,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, return; } - mask = (1ULL << address_bits) - 1; + mask = (1ULL << boot_cpu_data.x86_phys_bits) - 1; mask &= ~((((u64)sizek) << 10) - 1); base = ((u64)basek) << 10; @@ -209,7 +196,7 @@ save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, range_state[reg].type = type; } -static void __init set_var_mtrr_all(unsigned int address_bits) +static void __init set_var_mtrr_all(void) { unsigned long basek, sizek; unsigned char type; @@ -220,7 +207,7 @@ static void __init set_var_mtrr_all(unsigned int address_bits) sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); type = range_state[reg].type; - set_var_mtrr(reg, basek, sizek, type, address_bits); + set_var_mtrr(reg, basek, sizek, type); } } @@ -267,7 +254,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, align = max_align; sizek = 1UL << align; - if (debug_print) { + if (mtrr_debug) { char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; @@ -542,7 +529,7 @@ static void __init print_out_mtrr_range_state(void) start_base = to_size_factor(start_base, &start_factor); type = range_state[i].type; - pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + Dprintk("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", i, start_base, start_factor, size_base, size_factor, (type == MTRR_TYPE_UNCACHABLE) ? "UC" : @@ -680,7 +667,7 @@ static int __init mtrr_search_optimal_index(void) return index_good; } -int __init mtrr_cleanup(unsigned address_bits) +int __init mtrr_cleanup(void) { unsigned long x_remove_base, x_remove_size; unsigned long base, size, def, dummy; @@ -689,7 +676,10 @@ int __init mtrr_cleanup(unsigned address_bits) int index_good; int i; - if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + if (!mtrr_enabled()) + return 0; + + if (!cpu_feature_enabled(X86_FEATURE_MTRR) || enable_mtrr_cleanup < 1) return 0; rdmsr(MSR_MTRRdefType, def, dummy); @@ -711,7 +701,7 @@ int __init mtrr_cleanup(unsigned address_bits) return 0; /* Print original var MTRRs at first, for debugging: */ - pr_debug("original variable MTRRs\n"); + Dprintk("original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); @@ -742,8 +732,8 @@ int __init mtrr_cleanup(unsigned address_bits) mtrr_print_out_one_result(i); if (!result[i].bad) { - set_var_mtrr_all(address_bits); - pr_debug("New variable MTRRs\n"); + set_var_mtrr_all(); + Dprintk("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } @@ -763,7 +753,7 @@ int __init mtrr_cleanup(unsigned address_bits) mtrr_calc_range_state(chunk_size, gran_size, x_remove_base, x_remove_size, i); - if (debug_print) { + if (mtrr_debug) { mtrr_print_out_one_result(i); pr_info("\n"); } @@ -786,8 +776,8 @@ int __init mtrr_cleanup(unsigned address_bits) gran_size = result[i].gran_sizek; gran_size <<= 10; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - set_var_mtrr_all(address_bits); - pr_debug("New variable MTRRs\n"); + set_var_mtrr_all(); + Dprintk("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } else { @@ -802,7 +792,7 @@ int __init mtrr_cleanup(unsigned address_bits) return 0; } #else -int __init mtrr_cleanup(unsigned address_bits) +int __init mtrr_cleanup(void) { return 0; } @@ -882,15 +872,18 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; + if (!mtrr_enabled()) + return 0; + /* * Make sure we only trim uncachable memory on machines that * support the Intel MTRR architecture: */ - if (!is_cpu(INTEL) || disable_mtrr_trim) + if (!cpu_feature_enabled(X86_FEATURE_MTRR) || disable_mtrr_trim) return 0; rdmsr(MSR_MTRRdefType, def, dummy); - def &= 0xff; + def &= MTRR_DEF_TYPE_TYPE; if (def != MTRR_TYPE_UNCACHABLE) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 173b9e01e623..238dad57d4d6 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -235,7 +235,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, } const struct mtrr_ops cyrix_mtrr_ops = { - .vendor = X86_VENDOR_CYRIX, + .var_regs = 8, .set = cyrix_set_arr, .get = cyrix_get_arr, .get_free_region = cyrix_get_free_region, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index ee09d359e08f..2d6aa5d2e3d7 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -8,10 +8,12 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/mm.h> - +#include <linux/cc_platform.h> #include <asm/processor-flags.h> #include <asm/cacheinfo.h> #include <asm/cpufeature.h> +#include <asm/hypervisor.h> +#include <asm/mshyperv.h> #include <asm/tlbflush.h> #include <asm/mtrr.h> #include <asm/msr.h> @@ -31,6 +33,55 @@ static struct fixed_range_block fixed_range_blocks[] = { {} }; +struct cache_map { + u64 start; + u64 end; + u64 flags; + u64 type:8; + u64 fixed:1; +}; + +bool mtrr_debug; + +static int __init mtrr_param_setup(char *str) +{ + int rc = 0; + + if (!str) + return -EINVAL; + if (!strcmp(str, "debug")) + mtrr_debug = true; + else + rc = -EINVAL; + + return rc; +} +early_param("mtrr", mtrr_param_setup); + +/* + * CACHE_MAP_MAX is the maximum number of memory ranges in cache_map, where + * no 2 adjacent ranges have the same cache mode (those would be merged). + * The number is based on the worst case: + * - no two adjacent fixed MTRRs share the same cache mode + * - one variable MTRR is spanning a huge area with mode WB + * - 255 variable MTRRs with mode UC all overlap with the WB MTRR, creating 2 + * additional ranges each (result like "ababababa...aba" with a = WB, b = UC), + * accounting for MTRR_MAX_VAR_RANGES * 2 - 1 range entries + * - a TOP_MEM2 area (even with overlapping an UC MTRR can't add 2 range entries + * to the possible maximum, as it always starts at 4GB, thus it can't be in + * the middle of that MTRR, unless that MTRR starts at 0, which would remove + * the initial "a" from the "abababa" pattern above) + * The map won't contain ranges with no matching MTRR (those fall back to the + * default cache mode). + */ +#define CACHE_MAP_MAX (MTRR_NUM_FIXED_RANGES + MTRR_MAX_VAR_RANGES * 2) + +static struct cache_map init_cache_map[CACHE_MAP_MAX] __initdata; +static struct cache_map *cache_map __refdata = init_cache_map; +static unsigned int cache_map_size = CACHE_MAP_MAX; +static unsigned int cache_map_n; +static unsigned int cache_map_fixed; + static unsigned long smp_changes_mask; static int mtrr_state_set; u64 mtrr_tom2; @@ -38,6 +89,9 @@ u64 mtrr_tom2; struct mtrr_state_type mtrr_state; EXPORT_SYMBOL_GPL(mtrr_state); +/* Reserved bits in the high portion of the MTRRphysBaseN MSR. */ +u32 phys_hi_rsvd; + /* * BIOS is expected to clear MtrrFixDramModEn bit, see for example * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD @@ -69,175 +123,370 @@ static u64 get_mtrr_size(u64 mask) { u64 size; - mask >>= PAGE_SHIFT; - mask |= size_or_mask; + mask |= (u64)phys_hi_rsvd << 32; size = -mask; - size <<= PAGE_SHIFT; + return size; } +static u8 get_var_mtrr_state(unsigned int reg, u64 *start, u64 *size) +{ + struct mtrr_var_range *mtrr = mtrr_state.var_ranges + reg; + + if (!(mtrr->mask_lo & MTRR_PHYSMASK_V)) + return MTRR_TYPE_INVALID; + + *start = (((u64)mtrr->base_hi) << 32) + (mtrr->base_lo & PAGE_MASK); + *size = get_mtrr_size((((u64)mtrr->mask_hi) << 32) + + (mtrr->mask_lo & PAGE_MASK)); + + return mtrr->base_lo & MTRR_PHYSBASE_TYPE; +} + +static u8 get_effective_type(u8 type1, u8 type2) +{ + if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + if ((type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH) || + (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK)) + return MTRR_TYPE_WRTHROUGH; + + if (type1 != type2) + return MTRR_TYPE_UNCACHABLE; + + return type1; +} + +static void rm_map_entry_at(int idx) +{ + cache_map_n--; + if (cache_map_n > idx) { + memmove(cache_map + idx, cache_map + idx + 1, + sizeof(*cache_map) * (cache_map_n - idx)); + } +} + /* - * Check and return the effective type for MTRR-MTRR type overlap. - * Returns 1 if the effective type is UNCACHEABLE, else returns 0 + * Add an entry into cache_map at a specific index. Merges adjacent entries if + * appropriate. Return the number of merges for correcting the scan index + * (this is needed as merging will reduce the number of entries, which will + * result in skipping entries in future iterations if the scan index isn't + * corrected). + * Note that the corrected index can never go below -1 (resulting in being 0 in + * the next scan iteration), as "2" is returned only if the current index is + * larger than zero. */ -static int check_type_overlap(u8 *prev, u8 *curr) +static int add_map_entry_at(u64 start, u64 end, u8 type, int idx) { - if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) { - *prev = MTRR_TYPE_UNCACHABLE; - *curr = MTRR_TYPE_UNCACHABLE; - return 1; + bool merge_prev = false, merge_next = false; + + if (start >= end) + return 0; + + if (idx > 0) { + struct cache_map *prev = cache_map + idx - 1; + + if (!prev->fixed && start == prev->end && type == prev->type) + merge_prev = true; } - if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) || - (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) { - *prev = MTRR_TYPE_WRTHROUGH; - *curr = MTRR_TYPE_WRTHROUGH; + if (idx < cache_map_n) { + struct cache_map *next = cache_map + idx; + + if (!next->fixed && end == next->start && type == next->type) + merge_next = true; } - if (*prev != *curr) { - *prev = MTRR_TYPE_UNCACHABLE; - *curr = MTRR_TYPE_UNCACHABLE; + if (merge_prev && merge_next) { + cache_map[idx - 1].end = cache_map[idx].end; + rm_map_entry_at(idx); + return 2; + } + if (merge_prev) { + cache_map[idx - 1].end = end; return 1; } + if (merge_next) { + cache_map[idx].start = start; + return 1; + } + + /* Sanity check: the array should NEVER be too small! */ + if (cache_map_n == cache_map_size) { + WARN(1, "MTRR cache mode memory map exhausted!\n"); + cache_map_n = cache_map_fixed; + return 0; + } + + if (cache_map_n > idx) { + memmove(cache_map + idx + 1, cache_map + idx, + sizeof(*cache_map) * (cache_map_n - idx)); + } + + cache_map[idx].start = start; + cache_map[idx].end = end; + cache_map[idx].type = type; + cache_map[idx].fixed = 0; + cache_map_n++; return 0; } -/** - * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries - * - * Return the MTRR fixed memory type of 'start'. - * - * MTRR fixed entries are divided into the following ways: - * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges - * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges - * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges - * - * Return Values: - * MTRR_TYPE_(type) - Matched memory type - * MTRR_TYPE_INVALID - Unmatched +/* Clear a part of an entry. Return 1 if start of entry is still valid. */ +static int clr_map_range_at(u64 start, u64 end, int idx) +{ + int ret = start != cache_map[idx].start; + u64 tmp; + + if (start == cache_map[idx].start && end == cache_map[idx].end) { + rm_map_entry_at(idx); + } else if (start == cache_map[idx].start) { + cache_map[idx].start = end; + } else if (end == cache_map[idx].end) { + cache_map[idx].end = start; + } else { + tmp = cache_map[idx].end; + cache_map[idx].end = start; + add_map_entry_at(end, tmp, cache_map[idx].type, idx + 1); + } + + return ret; +} + +/* + * Add MTRR to the map. The current map is scanned and each part of the MTRR + * either overlapping with an existing entry or with a hole in the map is + * handled separately. */ -static u8 mtrr_type_lookup_fixed(u64 start, u64 end) +static void add_map_entry(u64 start, u64 end, u8 type) { - int idx; + u8 new_type, old_type; + u64 tmp; + int i; - if (start >= 0x100000) - return MTRR_TYPE_INVALID; + for (i = 0; i < cache_map_n && start < end; i++) { + if (start >= cache_map[i].end) + continue; + + if (start < cache_map[i].start) { + /* Region start has no overlap. */ + tmp = min(end, cache_map[i].start); + i -= add_map_entry_at(start, tmp, type, i); + start = tmp; + continue; + } - /* 0x0 - 0x7FFFF */ - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state.fixed_ranges[idx]; - /* 0x80000 - 0xBFFFF */ - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state.fixed_ranges[idx]; + new_type = get_effective_type(type, cache_map[i].type); + old_type = cache_map[i].type; + + if (cache_map[i].fixed || new_type == old_type) { + /* Cut off start of new entry. */ + start = cache_map[i].end; + continue; + } + + /* Handle only overlapping part of region. */ + tmp = min(end, cache_map[i].end); + i += clr_map_range_at(start, tmp, i); + i -= add_map_entry_at(start, tmp, new_type, i); + start = tmp; } - /* 0xC0000 - 0xFFFFF */ - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state.fixed_ranges[idx]; + /* Add rest of region after last map entry (rest might be empty). */ + add_map_entry_at(start, end, type, i); } -/** - * mtrr_type_lookup_variable - look up memory type in MTRR variable entries - * - * Return Value: - * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) - * - * Output Arguments: - * repeat - Set to 1 when [start:end] spanned across MTRR range and type - * returned corresponds only to [start:*partial_end]. Caller has - * to lookup again for [*partial_end:end]. - * - * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the - * region is fully covered by a single MTRR entry or the default - * type. +/* Add variable MTRRs to cache map. */ +static void map_add_var(void) +{ + u64 start, size; + unsigned int i; + u8 type; + + /* + * Add AMD TOP_MEM2 area. Can't be added in mtrr_build_map(), as it + * needs to be added again when rebuilding the map due to potentially + * having moved as a result of variable MTRRs for memory below 4GB. + */ + if (mtrr_tom2) { + add_map_entry(BIT_ULL(32), mtrr_tom2, MTRR_TYPE_WRBACK); + cache_map[cache_map_n - 1].fixed = 1; + } + + for (i = 0; i < num_var_ranges; i++) { + type = get_var_mtrr_state(i, &start, &size); + if (type != MTRR_TYPE_INVALID) + add_map_entry(start, start + size, type); + } +} + +/* + * Rebuild map by replacing variable entries. Needs to be called when MTRR + * registers are being changed after boot, as such changes could include + * removals of registers, which are complicated to handle without rebuild of + * the map. */ -static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, - int *repeat, u8 *uniform) +void generic_rebuild_map(void) { - int i; - u64 base, mask; - u8 prev_match, curr_match; + if (mtrr_if != &generic_mtrr_ops) + return; - *repeat = 0; - *uniform = 1; + cache_map_n = cache_map_fixed; - prev_match = MTRR_TYPE_INVALID; - for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state, inclusive; + map_add_var(); +} - if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) - continue; +static unsigned int __init get_cache_map_size(void) +{ + return cache_map_fixed + 2 * num_var_ranges + (mtrr_tom2 != 0); +} - base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) + - (mtrr_state.var_ranges[i].base_lo & PAGE_MASK); - mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) + - (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK); - - start_state = ((start & mask) == (base & mask)); - end_state = ((end & mask) == (base & mask)); - inclusive = ((start < base) && (end > base)); - - if ((start_state != end_state) || inclusive) { - /* - * We have start:end spanning across an MTRR. - * We split the region into either - * - * - start_state:1 - * (start:mtrr_end)(mtrr_end:end) - * - end_state:1 - * (start:mtrr_start)(mtrr_start:end) - * - inclusive:1 - * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end) - * - * depending on kind of overlap. - * - * Return the type of the first region and a pointer - * to the start of next region so that caller will be - * advised to lookup again after having adjusted start - * and end. - * - * Note: This way we handle overlaps with multiple - * entries and the default type properly. - */ - if (start_state) - *partial_end = base + get_mtrr_size(mask); - else - *partial_end = base; - - if (unlikely(*partial_end <= start)) { - WARN_ON(1); - *partial_end = start + PAGE_SIZE; - } +/* Build the cache_map containing the cache modes per memory range. */ +void __init mtrr_build_map(void) +{ + u64 start, end, size; + unsigned int i; + u8 type; - end = *partial_end - 1; /* end is inclusive */ - *repeat = 1; - *uniform = 0; + /* Add fixed MTRRs, optimize for adjacent entries with same type. */ + if (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED) { + /* + * Start with 64k size fixed entries, preset 1st one (hence the + * loop below is starting with index 1). + */ + start = 0; + end = size = 0x10000; + type = mtrr_state.fixed_ranges[0]; + + for (i = 1; i < MTRR_NUM_FIXED_RANGES; i++) { + /* 8 64k entries, then 16 16k ones, rest 4k. */ + if (i == 8 || i == 24) + size >>= 2; + + if (mtrr_state.fixed_ranges[i] != type) { + add_map_entry(start, end, type); + start = end; + type = mtrr_state.fixed_ranges[i]; + } + end += size; } + add_map_entry(start, end, type); + } - if ((start & mask) != (base & mask)) - continue; + /* Mark fixed, they take precedence. */ + for (i = 0; i < cache_map_n; i++) + cache_map[i].fixed = 1; + cache_map_fixed = cache_map_n; - curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; - if (prev_match == MTRR_TYPE_INVALID) { - prev_match = curr_match; - continue; + map_add_var(); + + pr_info("MTRR map: %u entries (%u fixed + %u variable; max %u), built from %u variable MTRRs\n", + cache_map_n, cache_map_fixed, cache_map_n - cache_map_fixed, + get_cache_map_size(), num_var_ranges + (mtrr_tom2 != 0)); + + if (mtrr_debug) { + for (i = 0; i < cache_map_n; i++) { + pr_info("%3u: %016llx-%016llx %s\n", i, + cache_map[i].start, cache_map[i].end - 1, + mtrr_attrib_to_str(cache_map[i].type)); } + } +} - *uniform = 0; - if (check_type_overlap(&prev_match, &curr_match)) - return curr_match; +/* Copy the cache_map from __initdata memory to dynamically allocated one. */ +void __init mtrr_copy_map(void) +{ + unsigned int new_size = get_cache_map_size(); + + if (!mtrr_state.enabled || !new_size) { + cache_map = NULL; + return; + } + + mutex_lock(&mtrr_mutex); + + cache_map = kcalloc(new_size, sizeof(*cache_map), GFP_KERNEL); + if (cache_map) { + memmove(cache_map, init_cache_map, + cache_map_n * sizeof(*cache_map)); + cache_map_size = new_size; + } else { + mtrr_state.enabled = 0; + pr_err("MTRRs disabled due to allocation failure for lookup map.\n"); + } + + mutex_unlock(&mtrr_mutex); +} + +/** + * mtrr_overwrite_state - set static MTRR state + * + * Used to set MTRR state via different means (e.g. with data obtained from + * a hypervisor). + * Is allowed only for special cases when running virtualized. Must be called + * from the x86_init.hyper.init_platform() hook. It can be called only once. + * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR + * is cleared. + */ +void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type) +{ + unsigned int i; + + /* Only allowed to be called once before mtrr_bp_init(). */ + if (WARN_ON_ONCE(mtrr_state_set)) + return; + + /* Only allowed when running virtualized. */ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return; + + /* + * Only allowed for special virtualization cases: + * - when running as Hyper-V, SEV-SNP guest using vTOM + * - when running as Xen PV guest + * - when running as SEV-SNP or TDX guest to avoid unnecessary + * VMM communication/Virtualization exceptions (#VC, #VE) + */ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && + !hv_is_isolation_supported() && + !cpu_feature_enabled(X86_FEATURE_XENPV) && + !cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return; + + /* Disable MTRR in order to disable MTRR modifications. */ + setup_clear_cpu_cap(X86_FEATURE_MTRR); + + if (var) { + if (num_var > MTRR_MAX_VAR_RANGES) { + pr_warn("Trying to overwrite MTRR state with %u variable entries\n", + num_var); + num_var = MTRR_MAX_VAR_RANGES; + } + for (i = 0; i < num_var; i++) + mtrr_state.var_ranges[i] = var[i]; + num_var_ranges = num_var; } - if (prev_match != MTRR_TYPE_INVALID) - return prev_match; + mtrr_state.def_type = def_type; + mtrr_state.enabled |= MTRR_STATE_MTRR_ENABLED; - return mtrr_state.def_type; + mtrr_state_set = 1; +} + +static u8 type_merge(u8 type, u8 new_type, u8 *uniform) +{ + u8 effective_type; + + if (type == MTRR_TYPE_INVALID) + return new_type; + + effective_type = get_effective_type(type, new_type); + if (type != effective_type) + *uniform = 0; + + return effective_type; } /** @@ -248,66 +497,49 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, * MTRR_TYPE_INVALID - MTRR is disabled * * Output Argument: - * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the - * region is fully covered by a single MTRR entry or the default - * type. + * uniform - Set to 1 when the returned MTRR type is valid for the whole + * region, set to 0 else. */ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { - u8 type, prev_type, is_uniform = 1, dummy; - int repeat; - u64 partial_end; + u8 type = MTRR_TYPE_INVALID; + unsigned int i; - /* Make end inclusive instead of exclusive */ - end--; + if (!mtrr_state_set) { + /* Uniformity is unknown. */ + *uniform = 0; + return MTRR_TYPE_UNCACHABLE; + } - if (!mtrr_state_set) - return MTRR_TYPE_INVALID; + *uniform = 1; if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) - return MTRR_TYPE_INVALID; + return MTRR_TYPE_UNCACHABLE; - /* - * Look up the fixed ranges first, which take priority over - * the variable ranges. - */ - if ((start < 0x100000) && - (mtrr_state.have_fixed) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { - is_uniform = 0; - type = mtrr_type_lookup_fixed(start, end); - goto out; - } + for (i = 0; i < cache_map_n && start < end; i++) { + /* Region after current map entry? -> continue with next one. */ + if (start >= cache_map[i].end) + continue; - /* - * Look up the variable ranges. Look of multiple ranges matching - * this address and pick type as per MTRR precedence. - */ - type = mtrr_type_lookup_variable(start, end, &partial_end, - &repeat, &is_uniform); + /* Start of region not covered by current map entry? */ + if (start < cache_map[i].start) { + /* At least some part of region has default type. */ + type = type_merge(type, mtrr_state.def_type, uniform); + /* End of region not covered, too? -> lookup done. */ + if (end <= cache_map[i].start) + return type; + } - /* - * Common path is with repeat = 0. - * However, we can have cases where [start:end] spans across some - * MTRR ranges and/or the default type. Do repeated lookups for - * that case here. - */ - while (repeat) { - prev_type = type; - start = partial_end; - is_uniform = 0; - type = mtrr_type_lookup_variable(start, end, &partial_end, - &repeat, &dummy); + /* At least part of region covered by map entry. */ + type = type_merge(type, cache_map[i].type, uniform); - if (check_type_overlap(&prev_type, &type)) - goto out; + start = cache_map[i].end; } - if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) - type = MTRR_TYPE_WRBACK; + /* End of region past last entry in map? -> use default type. */ + if (start < end) + type = type_merge(type, mtrr_state.def_type, uniform); -out: - *uniform = is_uniform; return type; } @@ -363,8 +595,8 @@ static void __init print_fixed_last(void) if (!last_fixed_end) return; - pr_debug(" %05X-%05X %s\n", last_fixed_start, - last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + pr_info(" %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); last_fixed_end = 0; } @@ -402,10 +634,10 @@ static void __init print_mtrr_state(void) unsigned int i; int high_width; - pr_debug("MTRR default type: %s\n", - mtrr_attrib_to_str(mtrr_state.def_type)); + pr_info("MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - pr_debug("MTRR fixed ranges %sabled:\n", + pr_info("MTRR fixed ranges %sabled:\n", ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? "en" : "dis"); @@ -420,26 +652,27 @@ static void __init print_mtrr_state(void) /* tail */ print_fixed_last(); } - pr_debug("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); - high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; + pr_info("MTRR variable ranges %sabled:\n", + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); + high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { - if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", - i, - high_width, - mtrr_state.var_ranges[i].base_hi, - mtrr_state.var_ranges[i].base_lo >> 12, - high_width, - mtrr_state.var_ranges[i].mask_hi, - mtrr_state.var_ranges[i].mask_lo >> 12, - mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + if (mtrr_state.var_ranges[i].mask_lo & MTRR_PHYSMASK_V) + pr_info(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & + MTRR_PHYSBASE_TYPE)); else - pr_debug(" %u disabled\n", i); + pr_info(" %u disabled\n", i); } if (mtrr_tom2) - pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); + pr_info("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } /* Grab all of the MTRR state for this CPU into *state */ @@ -452,7 +685,7 @@ bool __init get_mtrr_state(void) vrs = mtrr_state.var_ranges; rdmsr(MSR_MTRRcap, lo, dummy); - mtrr_state.have_fixed = (lo >> 8) & 1; + mtrr_state.have_fixed = lo & MTRR_CAP_FIX; for (i = 0; i < num_var_ranges; i++) get_mtrr_var_range(i, &vrs[i]); @@ -460,8 +693,8 @@ bool __init get_mtrr_state(void) get_fixed_ranges(mtrr_state.fixed_ranges); rdmsr(MSR_MTRRdefType, lo, dummy); - mtrr_state.def_type = (lo & 0xff); - mtrr_state.enabled = (lo & 0xc00) >> 10; + mtrr_state.def_type = lo & MTRR_DEF_TYPE_TYPE; + mtrr_state.enabled = (lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT; if (amd_special_default_mtrr()) { unsigned low, high; @@ -474,7 +707,8 @@ bool __init get_mtrr_state(void) mtrr_tom2 &= 0xffffff800000ULL; } - print_mtrr_state(); + if (mtrr_debug) + print_mtrr_state(); mtrr_state_set = 1; @@ -574,7 +808,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); - if ((mask_lo & 0x800) == 0) { + if (!(mask_lo & MTRR_PHYSMASK_V)) { /* Invalid (i.e. free) range */ *base = 0; *size = 0; @@ -585,8 +819,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask: */ - tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; - mask = size_or_mask | tmp; + tmp = (u64)mask_hi << 32 | (mask_lo & PAGE_MASK); + mask = (u64)phys_hi_rsvd << 32 | tmp; /* Expand tmp with high bits to all 1s: */ hi = fls64(tmp); @@ -604,9 +838,9 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, * This works correctly if size is a power of two, i.e. a * contiguous range: */ - *size = -mask; + *size = -mask >> PAGE_SHIFT; *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; - *type = base_lo & 0xff; + *type = base_lo & MTRR_PHYSBASE_TYPE; out_put_cpu: put_cpu(); @@ -644,9 +878,8 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) bool changed = false; rdmsr(MTRRphysBase_MSR(index), lo, hi); - if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) - || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != - (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + if ((vr->base_lo & ~MTRR_PHYSBASE_RSVD) != (lo & ~MTRR_PHYSBASE_RSVD) + || (vr->base_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) { mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); changed = true; @@ -654,9 +887,8 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), lo, hi); - if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL) - || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != - (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + if ((vr->mask_lo & ~MTRR_PHYSMASK_RSVD) != (lo & ~MTRR_PHYSMASK_RSVD) + || (vr->mask_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) { mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); changed = true; } @@ -691,11 +923,12 @@ static unsigned long set_mtrr_state(void) * Set_mtrr_restore restores the old value of MTRRdefType, * so to set it we fiddle with the saved value: */ - if ((deftype_lo & 0xff) != mtrr_state.def_type - || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { + if ((deftype_lo & MTRR_DEF_TYPE_TYPE) != mtrr_state.def_type || + ((deftype_lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT) != mtrr_state.enabled) { - deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | - (mtrr_state.enabled << 10); + deftype_lo = (deftype_lo & MTRR_DEF_TYPE_DISABLE) | + mtrr_state.def_type | + (mtrr_state.enabled << MTRR_STATE_SHIFT); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } @@ -708,7 +941,7 @@ void mtrr_disable(void) rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & MTRR_DEF_TYPE_DISABLE, deftype_hi); } void mtrr_enable(void) @@ -762,9 +995,9 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, memset(vr, 0, sizeof(struct mtrr_var_range)); } else { vr->base_lo = base << PAGE_SHIFT | type; - vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT); - vr->mask_lo = -size << PAGE_SHIFT | 0x800; - vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT); + vr->base_hi = (base >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd; + vr->mask_lo = -size << PAGE_SHIFT | MTRR_PHYSMASK_V; + vr->mask_hi = (-size >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd; mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi); mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); @@ -783,7 +1016,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, * For Intel PPro stepping <= 7 * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF */ - if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && + if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_stepping <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { @@ -817,7 +1050,7 @@ static int generic_have_wrcomb(void) { unsigned long config, dummy; rdmsr(MSR_MTRRcap, config, dummy); - return config & (1 << 10); + return config & MTRR_CAP_WC; } int positive_have_wrcomb(void) diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c b/arch/x86/kernel/cpu/mtrr/legacy.c new file mode 100644 index 000000000000..d25882fcf181 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/legacy.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/syscore_ops.h> +#include <asm/cpufeature.h> +#include <asm/mtrr.h> +#include <asm/processor.h> +#include "mtrr.h" + +void mtrr_set_if(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* Pre-Athlon (K6) AMD CPU MTRRs */ + if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) + mtrr_if = &amd_mtrr_ops; + break; + case X86_VENDOR_CENTAUR: + if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) + mtrr_if = ¢aur_mtrr_ops; + break; + case X86_VENDOR_CYRIX: + if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) + mtrr_if = &cyrix_mtrr_ops; + break; + default: + break; + } +} + +/* + * The suspend/resume methods are only for CPUs without MTRR. CPUs using generic + * MTRR driver don't require this. + */ +struct mtrr_value { + mtrr_type ltype; + unsigned long lbase; + unsigned long lsize; +}; + +static struct mtrr_value *mtrr_value; + +static int mtrr_save(void) +{ + int i; + + if (!mtrr_value) + return -ENOMEM; + + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); + } + return 0; +} + +static void mtrr_restore(void) +{ + int i; + + for (i = 0; i < num_var_ranges; i++) { + if (mtrr_value[i].lsize) { + mtrr_if->set(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } + } +} + +static struct syscore_ops mtrr_syscore_ops = { + .suspend = mtrr_save, + .resume = mtrr_restore, +}; + +void mtrr_register_syscore(void) +{ + mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL); + + /* + * The CPU has no MTRR and seems to not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code. + */ + register_syscore_ops(&mtrr_syscore_ops); +} diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 783f3210d582..767bf1c71aad 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -59,15 +59,9 @@ #define MTRR_TO_PHYS_WC_OFFSET 1000 u32 num_var_ranges; -static bool mtrr_enabled(void) -{ - return !!mtrr_if; -} unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; -static DEFINE_MUTEX(mtrr_mutex); - -u64 size_or_mask, size_and_mask; +DEFINE_MUTEX(mtrr_mutex); const struct mtrr_ops *mtrr_if; @@ -105,21 +99,6 @@ static int have_wrcomb(void) return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; } -/* This function returns the number of variable MTRRs */ -static void __init set_num_var_ranges(bool use_generic) -{ - unsigned long config = 0, dummy; - - if (use_generic) - rdmsr(MSR_MTRRcap, config, dummy); - else if (is_cpu(AMD) || is_cpu(HYGON)) - config = 2; - else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) - config = 8; - - num_var_ranges = config & 0xff; -} - static void __init init_table(void) { int i, max; @@ -194,20 +173,8 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. */ -static void -set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) -{ - struct set_mtrr_data data = { .smp_reg = reg, - .smp_base = base, - .smp_size = size, - .smp_type = type - }; - - stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); -} - -static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, + mtrr_type type) { struct set_mtrr_data data = { .smp_reg = reg, .smp_base = base, @@ -216,6 +183,8 @@ static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, }; stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); + + generic_rebuild_map(); } /** @@ -337,7 +306,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { - set_mtrr_cpuslocked(i, base, size, type); + set_mtrr(i, base, size, type); if (likely(replace < 0)) { mtrr_usage_table[i] = 1; } else { @@ -345,7 +314,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, if (increment) mtrr_usage_table[i]++; if (unlikely(replace != i)) { - set_mtrr_cpuslocked(replace, 0, 0, 0); + set_mtrr(replace, 0, 0, 0); mtrr_usage_table[replace] = 0; } } @@ -363,7 +332,7 @@ static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { pr_warn("size and base must be multiples of 4 kiB\n"); - pr_debug("size: 0x%lx base: 0x%lx\n", size, base); + Dprintk("size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; } @@ -454,8 +423,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg < 0) { - pr_debug("no MTRR for %lx000,%lx000 found\n", - base, size); + Dprintk("no MTRR for %lx000,%lx000 found\n", base, size); goto out; } } @@ -473,7 +441,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) goto out; } if (--mtrr_usage_table[reg] < 1) - set_mtrr_cpuslocked(reg, 0, 0, 0); + set_mtrr(reg, 0, 0, 0); error = reg; out: mutex_unlock(&mtrr_mutex); @@ -574,136 +542,54 @@ int arch_phys_wc_index(int handle) } EXPORT_SYMBOL_GPL(arch_phys_wc_index); -/* The suspend/resume methods are only for CPU without MTRR. CPU using generic - * MTRR driver doesn't require this - */ -struct mtrr_value { - mtrr_type ltype; - unsigned long lbase; - unsigned long lsize; -}; - -static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; - -static int mtrr_save(void) -{ - int i; - - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &mtrr_value[i].lbase, - &mtrr_value[i].lsize, - &mtrr_value[i].ltype); - } - return 0; -} - -static void mtrr_restore(void) -{ - int i; - - for (i = 0; i < num_var_ranges; i++) { - if (mtrr_value[i].lsize) { - set_mtrr(i, mtrr_value[i].lbase, - mtrr_value[i].lsize, - mtrr_value[i].ltype); - } - } -} - - - -static struct syscore_ops mtrr_syscore_ops = { - .suspend = mtrr_save, - .resume = mtrr_restore, -}; - int __initdata changed_by_mtrr_cleanup; -#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1)) /** - * mtrr_bp_init - initialize mtrrs on the boot CPU + * mtrr_bp_init - initialize MTRRs on the boot CPU * * This needs to be called early; before any of the other CPUs are * initialized (i.e. before smp_init()). - * */ void __init mtrr_bp_init(void) { + bool generic_mtrrs = cpu_feature_enabled(X86_FEATURE_MTRR); const char *why = "(not available)"; - u32 phys_addr; - - phys_addr = 32; + unsigned long config, dummy; - if (boot_cpu_has(X86_FEATURE_MTRR)) { - mtrr_if = &generic_mtrr_ops; - size_or_mask = SIZE_OR_MASK_BITS(36); - size_and_mask = 0x00f00000; - phys_addr = 36; + phys_hi_rsvd = GENMASK(31, boot_cpu_data.x86_phys_bits - 32); + if (!generic_mtrrs && mtrr_state.enabled) { /* - * This is an AMD specific MSR, but we assume(hope?) that - * Intel will implement it too when they extend the address - * bus of the Xeon. + * Software overwrite of MTRR state, only for generic case. + * Note that X86_FEATURE_MTRR has been reset in this case. */ - if (cpuid_eax(0x80000000) >= 0x80000008) { - phys_addr = cpuid_eax(0x80000008) & 0xff; - /* CPUID workaround for Intel 0F33/0F34 CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 0xF && - boot_cpu_data.x86_model == 0x3 && - (boot_cpu_data.x86_stepping == 0x3 || - boot_cpu_data.x86_stepping == 0x4)) - phys_addr = 36; - - size_or_mask = SIZE_OR_MASK_BITS(phys_addr); - size_and_mask = ~size_or_mask & 0xfffff00000ULL; - } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && - boot_cpu_data.x86 == 6) { - /* - * VIA C* family have Intel style MTRRs, - * but don't support PAE - */ - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - phys_addr = 32; - } - } else { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) { - /* Pre-Athlon (K6) AMD CPU MTRRs */ - mtrr_if = &amd_mtrr_ops; - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - } - break; - case X86_VENDOR_CENTAUR: - if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) { - mtrr_if = ¢aur_mtrr_ops; - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - } - break; - case X86_VENDOR_CYRIX: - if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) { - mtrr_if = &cyrix_mtrr_ops; - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - } - break; - default: - break; - } + init_table(); + mtrr_build_map(); + pr_info("MTRRs set to read-only\n"); + + return; } + if (generic_mtrrs) + mtrr_if = &generic_mtrr_ops; + else + mtrr_set_if(); + if (mtrr_enabled()) { - set_num_var_ranges(mtrr_if == &generic_mtrr_ops); + /* Get the number of variable MTRR ranges. */ + if (mtrr_if == &generic_mtrr_ops) + rdmsr(MSR_MTRRcap, config, dummy); + else + config = mtrr_if->var_regs; + num_var_ranges = config & MTRR_CAP_VCNT; + init_table(); if (mtrr_if == &generic_mtrr_ops) { /* BIOS may override */ if (get_mtrr_state()) { memory_caching_control |= CACHE_MTRR; - changed_by_mtrr_cleanup = mtrr_cleanup(phys_addr); + changed_by_mtrr_cleanup = mtrr_cleanup(); + mtrr_build_map(); } else { mtrr_if = NULL; why = "by BIOS"; @@ -730,8 +616,14 @@ void mtrr_save_state(void) smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); } -static int __init mtrr_init_finialize(void) +static int __init mtrr_init_finalize(void) { + /* + * Map might exist if mtrr_overwrite_state() has been called or if + * mtrr_enabled() returns true. + */ + mtrr_copy_map(); + if (!mtrr_enabled()) return 0; @@ -741,16 +633,8 @@ static int __init mtrr_init_finialize(void) return 0; } - /* - * The CPU has no MTRR and seems to not support SMP. They have - * specific drivers, we use a tricky method to support - * suspend/resume for them. - * - * TBD: is there any system with such CPU which supports - * suspend/resume? If no, we should remove the code. - */ - register_syscore_ops(&mtrr_syscore_ops); + mtrr_register_syscore(); return 0; } -subsys_initcall(mtrr_init_finialize); +subsys_initcall(mtrr_init_finalize); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 02eb5871492d..5655f253d929 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -10,10 +10,13 @@ #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 +extern bool mtrr_debug; +#define Dprintk(x...) do { if (mtrr_debug) pr_info(x); } while (0) + extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { - u32 vendor; + u32 var_regs; void (*set)(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); void (*get)(unsigned int reg, unsigned long *base, @@ -51,18 +54,26 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); -extern u64 size_or_mask, size_and_mask; extern const struct mtrr_ops *mtrr_if; - -#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) +extern struct mutex mtrr_mutex; extern unsigned int num_var_ranges; extern u64 mtrr_tom2; extern struct mtrr_state_type mtrr_state; +extern u32 phys_hi_rsvd; void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); +#ifdef CONFIG_X86_32 +void mtrr_set_if(void); +void mtrr_register_syscore(void); +#else +static inline void mtrr_set_if(void) { } +static inline void mtrr_register_syscore(void) { } +#endif +void mtrr_build_map(void); +void mtrr_copy_map(void); /* CPU specific mtrr_ops vectors. */ extern const struct mtrr_ops amd_mtrr_ops; @@ -70,4 +81,14 @@ extern const struct mtrr_ops cyrix_mtrr_ops; extern const struct mtrr_ops centaur_mtrr_ops; extern int changed_by_mtrr_cleanup; -extern int mtrr_cleanup(unsigned address_bits); +extern int mtrr_cleanup(void); + +/* + * Must be used by code which uses mtrr_if to call platform-specific + * MTRR manipulation functions. + */ +static inline bool mtrr_enabled(void) +{ + return !!mtrr_if; +} +void generic_rebuild_map(void); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 6ad33f355861..725344048f85 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -726,11 +726,15 @@ unlock: static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) { struct task_struct *p, *t; + pid_t pid; rcu_read_lock(); for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) - seq_printf(s, "%d\n", t->pid); + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + pid = task_pid_vnr(t); + if (pid) + seq_printf(s, "%d\n", pid); + } } rcu_read_unlock(); } @@ -2301,6 +2305,26 @@ static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) } } +static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); +} + +static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + kernfs_unbreak_active_protection(kn); + rdtgroup_remove(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) { struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); @@ -2308,8 +2332,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) if (!rdtgrp) return NULL; - atomic_inc(&rdtgrp->waitcount); - kernfs_break_active_protection(kn); + rdtgroup_kn_get(rdtgrp, kn); mutex_lock(&rdtgroup_mutex); @@ -2328,17 +2351,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); - - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - kernfs_unbreak_active_protection(kn); - rdtgroup_remove(rdtgrp); - } else { - kernfs_unbreak_active_protection(kn); - } + rdtgroup_kn_put(rdtgrp, kn); } static int mkdir_mondata_all(struct kernfs_node *parent_kn, @@ -3505,6 +3518,133 @@ out: return ret; } +/** + * mongrp_reparent() - replace parent CTRL_MON group of a MON group + * @rdtgrp: the MON group whose parent should be replaced + * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp + * @cpus: cpumask provided by the caller for use during this call + * + * Replaces the parent CTRL_MON group for a MON group, resulting in all member + * tasks' CLOSID immediately changing to that of the new parent group. + * Monitoring data for the group is unaffected by this operation. + */ +static void mongrp_reparent(struct rdtgroup *rdtgrp, + struct rdtgroup *new_prdtgrp, + cpumask_var_t cpus) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + + WARN_ON(rdtgrp->type != RDTMON_GROUP); + WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); + + /* Nothing to do when simply renaming a MON group. */ + if (prdtgrp == new_prdtgrp) + return; + + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_move_tail(&rdtgrp->mon.crdtgrp_list, + &new_prdtgrp->mon.crdtgrp_list); + + rdtgrp->mon.parent = new_prdtgrp; + rdtgrp->closid = new_prdtgrp->closid; + + /* Propagate updated closid to all tasks in this group. */ + rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); + + update_closid_rmid(cpus, NULL); +} + +static int rdtgroup_rename(struct kernfs_node *kn, + struct kernfs_node *new_parent, const char *new_name) +{ + struct rdtgroup *new_prdtgrp; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret; + + rdtgrp = kernfs_to_rdtgroup(kn); + new_prdtgrp = kernfs_to_rdtgroup(new_parent); + if (!rdtgrp || !new_prdtgrp) + return -ENOENT; + + /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ + rdtgroup_kn_get(rdtgrp, kn); + rdtgroup_kn_get(new_prdtgrp, new_parent); + + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + /* + * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if + * either kernfs_node is a file. + */ + if (kernfs_type(kn) != KERNFS_DIR || + kernfs_type(new_parent) != KERNFS_DIR) { + rdt_last_cmd_puts("Source and destination must be directories"); + ret = -EPERM; + goto out; + } + + if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { + ret = -ENOENT; + goto out; + } + + if (rdtgrp->type != RDTMON_GROUP || !kn->parent || + !is_mon_groups(kn->parent, kn->name)) { + rdt_last_cmd_puts("Source must be a MON group\n"); + ret = -EPERM; + goto out; + } + + if (!is_mon_groups(new_parent, new_name)) { + rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); + ret = -EPERM; + goto out; + } + + /* + * If the MON group is monitoring CPUs, the CPUs must be assigned to the + * current parent CTRL_MON group and therefore cannot be assigned to + * the new parent, making the move illegal. + */ + if (!cpumask_empty(&rdtgrp->cpu_mask) && + rdtgrp->mon.parent != new_prdtgrp) { + rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); + ret = -EPERM; + goto out; + } + + /* + * Allocate the cpumask for use in mongrp_reparent() to avoid the + * possibility of failing to allocate it after kernfs_rename() has + * succeeded. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out; + } + + /* + * Perform all input validation and allocations needed to ensure + * mongrp_reparent() will succeed before calling kernfs_rename(), + * otherwise it would be necessary to revert this call if + * mongrp_reparent() failed. + */ + ret = kernfs_rename(kn, new_parent, new_name); + if (!ret) + mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); + + free_cpumask_var(tmpmask); + +out: + mutex_unlock(&rdtgroup_mutex); + rdtgroup_kn_put(rdtgrp, kn); + rdtgroup_kn_put(new_prdtgrp, new_parent); + return ret; +} + static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) @@ -3522,6 +3662,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { .mkdir = rdtgroup_mkdir, .rmdir = rdtgroup_rmdir, + .rename = rdtgroup_rename, .show_options = rdtgroup_show_options, }; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 2a0e90fe2abc..91fa70e51004 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -755,6 +755,7 @@ static void sgx_mmu_notifier_release(struct mmu_notifier *mn, { struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); struct sgx_encl_mm *tmp = NULL; + bool found = false; /* * The enclave itself can remove encl_mm. Note, objects can't be moved @@ -764,12 +765,13 @@ static void sgx_mmu_notifier_release(struct mmu_notifier *mn, list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) { if (tmp == encl_mm) { list_del_rcu(&encl_mm->list); + found = true; break; } } spin_unlock(&encl_mm->encl->mm_lock); - if (tmp == encl_mm) { + if (found) { synchronize_srcu(&encl_mm->encl->srcu); mmu_notifier_put(mn); } diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 21ca0a831b70..5d390df21440 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -214,7 +214,7 @@ static int __sgx_encl_add_page(struct sgx_encl *encl, if (!(vma->vm_flags & VM_MAYEXEC)) return -EACCES; - ret = get_user_pages(src, 1, 0, &src_page, NULL); + ret = get_user_pages(src, 1, 0, &src_page); if (ret < 1) return -EFAULT; diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 3b58d8703094..6eaf9a6bc02f 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -9,6 +9,7 @@ #include <asm/processor.h> #include <asm/desc.h> #include <asm/traps.h> +#include <asm/doublefault.h> #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 851eb13edc01..998a08f17e33 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -53,7 +53,7 @@ void fpu__init_cpu(void) fpu__init_cpu_xstate(); } -static bool fpu__probe_without_cpuid(void) +static bool __init fpu__probe_without_cpuid(void) { unsigned long cr0; u16 fsw, fcw; @@ -71,7 +71,7 @@ static bool fpu__probe_without_cpuid(void) return fsw == 0 && (fcw & 0x103f) == 0x003f; } -static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +static void __init fpu__init_system_early_generic(void) { if (!boot_cpu_has(X86_FEATURE_CPUID) && !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { @@ -211,10 +211,10 @@ static void __init fpu__init_system_xstate_size_legacy(void) * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: */ -void __init fpu__init_system(struct cpuinfo_x86 *c) +void __init fpu__init_system(void) { fpstate_reset(¤t->thread.fpu); - fpu__init_system_early_generic(c); + fpu__init_system_early_generic(); /* * The FPU has to be operational for some of the diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5e7ead52cfdb..12df54ff0e81 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -282,7 +282,6 @@ static inline void tramp_free(void *tramp) { } /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); -extern void ftrace_regs_caller_ret(void); extern void ftrace_caller_end(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -525,9 +524,6 @@ static void *addr_from_call(void *ptr) return ptr + CALL_INSN_SIZE + call.disp; } -void prepare_ftrace_return(unsigned long ip, unsigned long *parent, - unsigned long frame_pointer); - /* * If the ops->trampoline was not allocated, then it probably * has a static trampoline func, or is the ftrace caller itself. diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 0d9a14528176..24c1175a47e2 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -187,12 +187,14 @@ SYM_CODE_END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl %eax + pushl $0 pushl %edx - movl $0, %eax + pushl %eax + movl %esp, %eax call ftrace_return_to_handler movl %eax, %ecx - popl %edx popl %eax + popl %edx + addl $4, %esp # skip ebp JMP_NOSPEC ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index b8c720b5dab2..945cfa5f7239 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -348,12 +348,13 @@ STACK_FRAME_NON_STANDARD_FP(__fentry__) SYM_CODE_START(return_to_handler) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR - subq $16, %rsp + subq $24, %rsp /* Save the return values */ movq %rax, (%rsp) movq %rdx, 8(%rsp) - movq %rbp, %rdi + movq %rbp, 16(%rsp) + movq %rsp, %rdi call ftrace_return_to_handler @@ -361,7 +362,7 @@ SYM_CODE_START(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax - addq $16, %rsp + addq $24, %rsp /* * Jump back to the old return address. This cannot be JMP_NOSPEC rdi * since IBT would demand that contain ENDBR, which simply isn't so for diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 10c27b4261eb..246a609f889b 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -69,6 +69,7 @@ asmlinkage __visible void __init __noreturn i386_start_kernel(void) * to the first kernel PMD. Note the upper half of each PMD or PTE are * always zero at this stage. */ +void __init mk_early_pgtbl_32(void); void __init mk_early_pgtbl_32(void) { #ifdef __pa diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 67c8ed99144b..c9318993f959 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -138,20 +138,6 @@ SYM_CODE_START(startup_32) jmp .Ldefault_entry SYM_CODE_END(startup_32) -#ifdef CONFIG_HOTPLUG_CPU -/* - * Boot CPU0 entry point. It's called from play_dead(). Everything has been set - * up already except stack. We just set up stack here. Then call - * start_secondary(). - */ -SYM_FUNC_START(start_cpu0) - movl initial_stack, %ecx - movl %ecx, %esp - call *(initial_code) -1: jmp 1b -SYM_FUNC_END(start_cpu0) -#endif - /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 113c13376e51..c5b9289837dc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -24,7 +24,9 @@ #include "../entry/calling.h" #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/apicdef.h> #include <asm/fixmap.h> +#include <asm/smp.h> /* * We are not able to switch in one step to the final KERNEL ADDRESS SPACE @@ -234,8 +236,67 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) ANNOTATE_NOENDBR // above #ifdef CONFIG_SMP + /* + * For parallel boot, the APIC ID is read from the APIC, and then + * used to look up the CPU number. For booting a single CPU, the + * CPU number is encoded in smpboot_control. + * + * Bit 31 STARTUP_READ_APICID (Read APICID from APIC) + * Bit 0-23 CPU# if STARTUP_xx flags are not set + */ movl smpboot_control(%rip), %ecx + testl $STARTUP_READ_APICID, %ecx + jnz .Lread_apicid + /* + * No control bit set, single CPU bringup. CPU number is provided + * in bit 0-23. This is also the boot CPU case (CPU number 0). + */ + andl $(~STARTUP_PARALLEL_MASK), %ecx + jmp .Lsetup_cpu +.Lread_apicid: + /* Check whether X2APIC mode is already enabled */ + mov $MSR_IA32_APICBASE, %ecx + rdmsr + testl $X2APIC_ENABLE, %eax + jnz .Lread_apicid_msr + + /* Read the APIC ID from the fix-mapped MMIO space. */ + movq apic_mmio_base(%rip), %rcx + addq $APIC_ID, %rcx + movl (%rcx), %eax + shr $24, %eax + jmp .Llookup_AP + +.Lread_apicid_msr: + mov $APIC_X2APIC_ID_MSR, %ecx + rdmsr + +.Llookup_AP: + /* EAX contains the APIC ID of the current CPU */ + xorq %rcx, %rcx + leaq cpuid_to_apicid(%rip), %rbx + +.Lfind_cpunr: + cmpl (%rbx,%rcx,4), %eax + jz .Lsetup_cpu + inc %ecx +#ifdef CONFIG_FORCE_NR_CPUS + cmpl $NR_CPUS, %ecx +#else + cmpl nr_cpu_ids(%rip), %ecx +#endif + jb .Lfind_cpunr + + /* APIC ID not found in the table. Drop the trampoline lock and bail. */ + movq trampoline_lock(%rip), %rax + movl $0, (%rax) + +1: cli + hlt + jmp 1b + +.Lsetup_cpu: /* Get the per cpu offset for the given CPU# which is in ECX */ movq __per_cpu_offset(,%rcx,8), %rdx #else @@ -252,6 +313,16 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movq TASK_threadsp(%rax), %rsp /* + * Now that this CPU is running on its own stack, drop the realmode + * protection. For the boot CPU the pointer is NULL! + */ + movq trampoline_lock(%rip), %rax + testq %rax, %rax + jz .Lsetup_gdt + movl $0, (%rax) + +.Lsetup_gdt: + /* * We must switch to a new descriptor in kernel space for the GDT * because soon the kernel won't have access anymore to the userspace * addresses where we're currently running on. We have to do that here @@ -375,13 +446,13 @@ SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" #include "sev_verify_cbit.S" -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT) /* - * Boot CPU0 entry point. It's called from play_dead(). Everything has been set - * up already except stack. We just set up stack here. Then call - * start_secondary() via .Ljump_to_C_code. + * Entry point for soft restart of a CPU. Invoked from xxx_play_dead() for + * restarting the boot CPU or for restarting SEV guest CPUs after CPU hot + * unplug. Everything is set up already except the stack. */ -SYM_CODE_START(start_cpu0) +SYM_CODE_START(soft_restart_cpu) ANNOTATE_NOENDBR UNWIND_HINT_END_OF_STACK @@ -390,7 +461,7 @@ SYM_CODE_START(start_cpu0) movq TASK_threadsp(%rcx), %rsp jmp .Ljump_to_C_code -SYM_CODE_END(start_cpu0) +SYM_CODE_END(soft_restart_cpu) #endif #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -433,6 +504,8 @@ SYM_DATA(initial_code, .quad x86_64_start_kernel) #ifdef CONFIG_AMD_MEM_ENCRYPT SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) #endif + +SYM_DATA(trampoline_lock, .quad 0); __FINITDATA __INIT diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 766ffe3ba313..9f668d2f3d11 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -211,6 +211,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; #endif +#ifdef CONFIG_X86_HV_CALLBACK_VECTOR + sum += irq_stats(cpu)->irq_hv_callback_count; +#endif +#if IS_ENABLED(CONFIG_HYPERV) + sum += irq_stats(cpu)->irq_hv_reenlightenment_count; + sum += irq_stats(cpu)->hyperv_stimer0_count; +#endif #ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 670eb08b972a..ee4fe8cdb857 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -165,32 +165,19 @@ int arch_asym_cpu_priority(int cpu) /** * sched_set_itmt_core_prio() - Set CPU priority based on ITMT - * @prio: Priority of cpu core - * @core_cpu: The cpu number associated with the core + * @prio: Priority of @cpu + * @cpu: The CPU number * * The pstate driver will find out the max boost frequency * and call this function to set a priority proportional - * to the max boost frequency. CPU with higher boost + * to the max boost frequency. CPUs with higher boost * frequency will receive higher priority. * * No need to rebuild sched domain after updating * the CPU priorities. The sched domains have no * dependency on CPU priorities. */ -void sched_set_itmt_core_prio(int prio, int core_cpu) +void sched_set_itmt_core_prio(int prio, int cpu) { - int cpu, i = 1; - - for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { - int smt_prio; - - /* - * Ensure that the siblings are moved to the end - * of the priority chain and only used when - * all other high priority cpus are out of capacity. - */ - smt_prio = prio * smp_num_siblings / (i * i); - per_cpu(sched_core_priority, cpu) = smt_prio; - i++; - } + per_cpu(sched_core_priority, cpu) = prio; } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 0f35d44c56fe..fb8f52149be9 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -71,7 +71,7 @@ static int kvm_set_wallclock(const struct timespec64 *now) return -ENODEV; } -static noinstr u64 kvm_clock_read(void) +static u64 kvm_clock_read(void) { u64 ret; @@ -88,7 +88,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs) static noinstr u64 kvm_sched_clock_read(void) { - return kvm_clock_read() - kvm_sched_clock_offset; + return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset; } static inline void kvm_sched_clock_init(bool stable) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 525876e7b9f4..adc67f98819a 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -367,8 +367,10 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) va = (unsigned long)ldt_slot_va(ldt->slot) + offset; ptep = get_locked_pte(mm, va, &ptl); - pte_clear(mm, va, ptep); - pte_unmap_unlock(ptep, ptl); + if (!WARN_ON_ONCE(!ptep)) { + pte_clear(mm, va, ptep); + pte_unmap_unlock(ptep, ptl); + } } va = (unsigned long)ldt_slot_va(ldt->slot); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b05f62ee2344..5f71a0cf4399 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -358,7 +358,7 @@ int module_finalize(const Elf_Ehdr *hdr, } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); } if (locks) { void *lseg = (void *)locks->sh_addr; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 776f4b1e395b..a0c551846b35 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -496,7 +496,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi) */ sev_es_nmi_complete(); if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) - arch_atomic_long_inc(&nsp->idt_calls); + raw_atomic_long_inc(&nsp->idt_calls); if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index b348a672f71d..b525fe6d6657 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> +#include <linux/pnp.h> #include <asm/setup.h> #include <asm/bios_ebda.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index dac41a0072ea..72015dba72ab 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ #include <linux/static_call.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> +#include <linux/entry-common.h> #include <asm/cpu.h> #include <asm/apic.h> #include <linux/uaccess.h> @@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls) return do_set_thread_area_64(p, ARCH_SET_FS, tls); } +__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, + int (*fn)(void *), void *fn_arg) +{ + schedule_tail(prev); + + /* Is this a kernel thread? */ + if (unlikely(fn)) { + fn(fn_arg); + /* + * A kernel thread is allowed to return here after successfully + * calling kernel_execve(). Exit to userspace to complete the + * execve() syscall. + */ + regs->ax = 0; + } + + syscall_exit_to_user_mode(regs); +} + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; @@ -149,7 +169,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame = &fork_frame->frame; frame->bp = encode_frame_pointer(childregs); - frame->ret_addr = (unsigned long) ret_from_fork; + frame->ret_addr = (unsigned long) ret_from_fork_asm; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; p->thread.iopl_warn = 0; @@ -759,15 +779,26 @@ bool xen_set_default_idle(void) } #endif +struct cpumask cpus_stop_mask; + void __noreturn stop_this_cpu(void *dummy) { + struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); + unsigned int cpu = smp_processor_id(); + local_irq_disable(); + /* - * Remove this CPU: + * Remove this CPU from the online mask and disable it + * unconditionally. This might be redundant in case that the reboot + * vector was handled late and stop_other_cpus() sent an NMI. + * + * According to SDM and APM NMIs can be accepted even after soft + * disabling the local APIC. */ - set_cpu_online(smp_processor_id(), false); + set_cpu_online(cpu, false); disable_local_APIC(); - mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); + mcheck_cpu_clear(c); /* * Use wbinvd on processors that support SME. This provides support @@ -781,8 +812,17 @@ void __noreturn stop_this_cpu(void *dummy) * Test the CPUID bit directly because the machine might've cleared * X86_FEATURE_SME due to cmdline options. */ - if (cpuid_eax(0x8000001f) & BIT(0)) + if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) native_wbinvd(); + + /* + * This brings a cache line back and dirties it, but + * native_stop_other_cpus() will overwrite cpus_stop_mask after it + * observed that all CPUs reported stop. This write will invalidate + * the related cache line on this CPU. + */ + cpumask_clear_cpu(cpu, &cpus_stop_mask); + for (;;) { /* * Use native_halt() so that memory contents don't change diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 56acf53a782a..b3f81379c2fc 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -101,11 +101,11 @@ u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd) * updating at the same time, and one of them could be slightly behind, * making the assumption that last_value always go forward fail to hold. */ - last = arch_atomic64_read(&last_value); + last = raw_atomic64_read(&last_value); do { if (ret <= last) return last; - } while (!arch_atomic64_try_cmpxchg(&last_value, &last, ret)); + } while (!raw_atomic64_try_cmpxchg(&last_value, &last, ret)); return ret; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 16babff771bd..fd975a4a5200 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -796,7 +796,6 @@ static void __init early_reserve_memory(void) memblock_x86_reserve_range_setup_data(); - reserve_ibft_region(); reserve_bios_regions(); trim_snb_memory(); } @@ -1032,11 +1031,14 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_init(); + reserve_ibft_region(); dmi_setup(); /* * VMware detection requires dmi to be available, so this * needs to be done after dmi_setup(), for the boot CPU. + * For some guest types (Xen PV, SEV-SNP, TDX) it is required to be + * called before cache_bp_init() for setting up MTRR state. */ init_hypervisor_platform(); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 3a5b0c9c4fcc..2eabccde94fb 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -12,6 +12,9 @@ #ifndef __BOOT_COMPRESSED #define error(v) pr_err(v) #define has_cpuflag(f) boot_cpu_has(f) +#else +#undef WARN +#define WARN(condition, format...) (!!(condition)) #endif /* I/O parameters for CPUID-related helpers */ @@ -991,3 +994,103 @@ static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) cpuid_ext_range_max = fn->eax; } } + +static void pvalidate_pages(struct snp_psc_desc *desc) +{ + struct psc_entry *e; + unsigned long vaddr; + unsigned int size; + unsigned int i; + bool validate; + int rc; + + for (i = 0; i <= desc->hdr.end_entry; i++) { + e = &desc->entries[i]; + + vaddr = (unsigned long)pfn_to_kaddr(e->gfn); + size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; + validate = e->operation == SNP_PAGE_STATE_PRIVATE; + + rc = pvalidate(vaddr, size, validate); + if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { + unsigned long vaddr_end = vaddr + PMD_SIZE; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (rc) + break; + } + } + + if (rc) { + WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + } + } +} + +static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct es_em_ctxt ctxt; + + vc_ghcb_invalidate(ghcb); + + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); + + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; + + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); + + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + + /* + * Page State Change VMGEXIT can pass error code through + * exit_info_2. + */ + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } + + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } + + /* + * Sanity check that entry processing is not going backwards. + * This will happen only if hypervisor is tricking us. + */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } + } + +out: + return ret; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index b031244d6d2d..1ee7bed453de 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -113,13 +113,23 @@ struct ghcb_state { }; static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); -DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); - static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); struct sev_config { __u64 debug : 1, - __reserved : 63; + + /* + * A flag used by __set_pages_state() that indicates when the + * per-CPU GHCB has been created and registered and thus can be + * used by the BSP instead of the early boot GHCB. + * + * For APs, the per-CPU GHCB is created before they are started + * and registered upon startup, so this flag can be used globally + * for the BSP and APs. + */ + ghcbs_initialized : 1, + + __reserved : 62; }; static struct sev_config sev_cfg __read_mostly; @@ -645,32 +655,26 @@ static u64 __init get_jump_table_addr(void) return ret; } -static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) -{ - unsigned long vaddr_end; - int rc; - - vaddr = vaddr & PAGE_MASK; - vaddr_end = vaddr + (npages << PAGE_SHIFT); - - while (vaddr < vaddr_end) { - rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); - if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); - - vaddr = vaddr + PAGE_SIZE; - } -} - -static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) +static void early_set_pages_state(unsigned long vaddr, unsigned long paddr, + unsigned long npages, enum psc_op op) { unsigned long paddr_end; u64 val; + int ret; + + vaddr = vaddr & PAGE_MASK; paddr = paddr & PAGE_MASK; paddr_end = paddr + (npages << PAGE_SHIFT); while (paddr < paddr_end) { + if (op == SNP_PAGE_STATE_SHARED) { + /* Page validation must be rescinded before changing to shared */ + ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false); + if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) + goto e_term; + } + /* * Use the MSR protocol because this function can be called before * the GHCB is established. @@ -691,7 +695,15 @@ static void __init early_set_pages_state(unsigned long paddr, unsigned int npage paddr, GHCB_MSR_PSC_RESP_VAL(val))) goto e_term; - paddr = paddr + PAGE_SIZE; + if (op == SNP_PAGE_STATE_PRIVATE) { + /* Page validation must be performed after changing to private */ + ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true); + if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) + goto e_term; + } + + vaddr += PAGE_SIZE; + paddr += PAGE_SIZE; } return; @@ -701,7 +713,7 @@ e_term: } void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, - unsigned int npages) + unsigned long npages) { /* * This can be invoked in early boot while running identity mapped, so @@ -716,14 +728,11 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd * Ask the hypervisor to mark the memory pages as private in the RMP * table. */ - early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); - - /* Validate the memory pages after they've been added in the RMP table. */ - pvalidate_pages(vaddr, npages, true); + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); } void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, - unsigned int npages) + unsigned long npages) { /* * This can be invoked in early boot while running identity mapped, so @@ -734,11 +743,8 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) return; - /* Invalidate the memory pages before they are marked shared in the RMP table. */ - pvalidate_pages(vaddr, npages, false); - /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); } void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) @@ -756,96 +762,16 @@ void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op WARN(1, "invalid memory op %d\n", op); } -static int vmgexit_psc(struct snp_psc_desc *desc) +static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, + unsigned long vaddr_end, int op) { - int cur_entry, end_entry, ret = 0; - struct snp_psc_desc *data; struct ghcb_state state; - struct es_em_ctxt ctxt; - unsigned long flags; - struct ghcb *ghcb; - - /* - * __sev_get_ghcb() needs to run with IRQs disabled because it is using - * a per-CPU GHCB. - */ - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - if (!ghcb) { - ret = 1; - goto out_unlock; - } - - /* Copy the input desc into GHCB shared buffer */ - data = (struct snp_psc_desc *)ghcb->shared_buffer; - memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); - - /* - * As per the GHCB specification, the hypervisor can resume the guest - * before processing all the entries. Check whether all the entries - * are processed. If not, then keep retrying. Note, the hypervisor - * will update the data memory directly to indicate the status, so - * reference the data->hdr everywhere. - * - * The strategy here is to wait for the hypervisor to change the page - * state in the RMP table before guest accesses the memory pages. If the - * page state change was not successful, then later memory access will - * result in a crash. - */ - cur_entry = data->hdr.cur_entry; - end_entry = data->hdr.end_entry; - - while (data->hdr.cur_entry <= data->hdr.end_entry) { - ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); - - /* This will advance the shared buffer data points to. */ - ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); - - /* - * Page State Change VMGEXIT can pass error code through - * exit_info_2. - */ - if (WARN(ret || ghcb->save.sw_exit_info_2, - "SNP: PSC failed ret=%d exit_info_2=%llx\n", - ret, ghcb->save.sw_exit_info_2)) { - ret = 1; - goto out; - } - - /* Verify that reserved bit is not set */ - if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { - ret = 1; - goto out; - } - - /* - * Sanity check that entry processing is not going backwards. - * This will happen only if hypervisor is tricking us. - */ - if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, -"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", - end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { - ret = 1; - goto out; - } - } - -out: - __sev_put_ghcb(&state); - -out_unlock: - local_irq_restore(flags); - - return ret; -} - -static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, - unsigned long vaddr_end, int op) -{ + bool use_large_entry; struct psc_hdr *hdr; struct psc_entry *e; + unsigned long flags; unsigned long pfn; + struct ghcb *ghcb; int i; hdr = &data->hdr; @@ -854,74 +780,104 @@ static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, memset(data, 0, sizeof(*data)); i = 0; - while (vaddr < vaddr_end) { - if (is_vmalloc_addr((void *)vaddr)) + while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) { + hdr->end_entry = i; + + if (is_vmalloc_addr((void *)vaddr)) { pfn = vmalloc_to_pfn((void *)vaddr); - else + use_large_entry = false; + } else { pfn = __pa(vaddr) >> PAGE_SHIFT; + use_large_entry = true; + } e->gfn = pfn; e->operation = op; - hdr->end_entry = i; - /* - * Current SNP implementation doesn't keep track of the RMP page - * size so use 4K for simplicity. - */ - e->pagesize = RMP_PG_SIZE_4K; + if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) && + (vaddr_end - vaddr) >= PMD_SIZE) { + e->pagesize = RMP_PG_SIZE_2M; + vaddr += PMD_SIZE; + } else { + e->pagesize = RMP_PG_SIZE_4K; + vaddr += PAGE_SIZE; + } - vaddr = vaddr + PAGE_SIZE; e++; i++; } - if (vmgexit_psc(data)) + /* Page validation must be rescinded before changing to shared */ + if (op == SNP_PAGE_STATE_SHARED) + pvalidate_pages(data); + + local_irq_save(flags); + + if (sev_cfg.ghcbs_initialized) + ghcb = __sev_get_ghcb(&state); + else + ghcb = boot_ghcb; + + /* Invoke the hypervisor to perform the page state changes */ + if (!ghcb || vmgexit_psc(ghcb, data)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + if (sev_cfg.ghcbs_initialized) + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + /* Page validation must be performed after changing to private */ + if (op == SNP_PAGE_STATE_PRIVATE) + pvalidate_pages(data); + + return vaddr; } -static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) +static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) { - unsigned long vaddr_end, next_vaddr; - struct snp_psc_desc *desc; + struct snp_psc_desc desc; + unsigned long vaddr_end; - desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); - if (!desc) - panic("SNP: failed to allocate memory for PSC descriptor\n"); + /* Use the MSR protocol when a GHCB is not available. */ + if (!boot_ghcb) + return early_set_pages_state(vaddr, __pa(vaddr), npages, op); vaddr = vaddr & PAGE_MASK; vaddr_end = vaddr + (npages << PAGE_SHIFT); - while (vaddr < vaddr_end) { - /* Calculate the last vaddr that fits in one struct snp_psc_desc. */ - next_vaddr = min_t(unsigned long, vaddr_end, - (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); - - __set_pages_state(desc, vaddr, next_vaddr, op); - - vaddr = next_vaddr; - } - - kfree(desc); + while (vaddr < vaddr_end) + vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op); } -void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) +void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return; - pvalidate_pages(vaddr, npages, false); - set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); } -void snp_set_memory_private(unsigned long vaddr, unsigned int npages) +void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return; set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); +} + +void snp_accept_memory(phys_addr_t start, phys_addr_t end) +{ + unsigned long vaddr; + unsigned int npages; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + vaddr = (unsigned long)__va(start); + npages = (end - start) >> PAGE_SHIFT; - pvalidate_pages(vaddr, npages, true); + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); } static int snp_set_vmsa(void *va, bool vmsa) @@ -1267,6 +1223,8 @@ void setup_ghcb(void) if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) snp_register_per_cpu_ghcb(); + sev_cfg.ghcbs_initialized = true; + return; } @@ -1328,7 +1286,7 @@ static void sev_es_play_dead(void) * If we get here, the VCPU was woken up again. Jump to CPU * startup code to get it back online. */ - start_cpu0(); + soft_restart_cpu(); } #else /* CONFIG_HOTPLUG_CPU */ #define sev_es_play_dead native_play_dead @@ -1395,9 +1353,6 @@ void __init sev_es_init_vc_handling(void) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); } - /* Enable SEV-ES special handling */ - static_branch_enable(&sev_es_enable_key); - /* Initialize per-cpu GHCB pages */ for_each_possible_cpu(cpu) { alloc_runtime_data(cpu); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 004cb30b7419..cfeec3ee877e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -182,7 +182,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, static unsigned long __ro_after_init max_frame_size; static unsigned int __ro_after_init fpu_default_state_size; -void __init init_sigframe_size(void) +static int __init init_sigframe_size(void) { fpu_default_state_size = fpu__get_fpstate_size(); @@ -194,7 +194,9 @@ void __init init_sigframe_size(void) max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); pr_info("max sigframe size: %lu\n", max_frame_size); + return 0; } +early_initcall(init_sigframe_size); unsigned long get_sigframe_size(void) { diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 375b33ecafa2..7eb18ca7bd45 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -21,12 +21,14 @@ #include <linux/interrupt.h> #include <linux/cpu.h> #include <linux/gfp.h> +#include <linux/kexec.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/apic.h> +#include <asm/cpu.h> #include <asm/idtentry.h> #include <asm/nmi.h> #include <asm/mce.h> @@ -129,7 +131,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) } /* - * this function calls the 'stop' function on all other CPUs in the system. + * Disable virtualization, APIC etc. and park the CPU in a HLT loop */ DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { @@ -146,61 +148,96 @@ static int register_stop_handler(void) static void native_stop_other_cpus(int wait) { - unsigned long flags; - unsigned long timeout; + unsigned int cpu = smp_processor_id(); + unsigned long flags, timeout; if (reboot_force) return; - /* - * Use an own vector here because smp_call_function - * does lots of things not suitable in a panic situation. - */ + /* Only proceed if this is the first CPU to reach this code */ + if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1) + return; + + /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */ + if (kexec_in_progress) + smp_kick_mwait_play_dead(); /* - * We start by using the REBOOT_VECTOR irq. - * The irq is treated as a sync point to allow critical - * regions of code on other cpus to release their spin locks - * and re-enable irqs. Jumping straight to an NMI might - * accidentally cause deadlocks with further shutdown/panic - * code. By syncing, we give the cpus up to one second to - * finish their work before we force them off with the NMI. + * 1) Send an IPI on the reboot vector to all other CPUs. + * + * The other CPUs should react on it after leaving critical + * sections and re-enabling interrupts. They might still hold + * locks, but there is nothing which can be done about that. + * + * 2) Wait for all other CPUs to report that they reached the + * HLT loop in stop_this_cpu() + * + * 3) If the system uses INIT/STARTUP for CPU bringup, then + * send all present CPUs an INIT vector, which brings them + * completely out of the way. + * + * 4) If #3 is not possible and #2 timed out send an NMI to the + * CPUs which did not yet report + * + * 5) Wait for all other CPUs to report that they reached the + * HLT loop in stop_this_cpu() + * + * #4 can obviously race against a CPU reaching the HLT loop late. + * That CPU will have reported already and the "have all CPUs + * reached HLT" condition will be true despite the fact that the + * other CPU is still handling the NMI. Again, there is no + * protection against that as "disabled" APICs still respond to + * NMIs. */ - if (num_online_cpus() > 1) { - /* did someone beat us here? */ - if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) - return; - - /* sync above data before sending IRQ */ - wmb(); + cpumask_copy(&cpus_stop_mask, cpu_online_mask); + cpumask_clear_cpu(cpu, &cpus_stop_mask); + if (!cpumask_empty(&cpus_stop_mask)) { apic_send_IPI_allbutself(REBOOT_VECTOR); /* * Don't wait longer than a second for IPI completion. The * wait request is not checked here because that would - * prevent an NMI shutdown attempt in case that not all + * prevent an NMI/INIT shutdown in case that not all * CPUs reach shutdown state. */ timeout = USEC_PER_SEC; - while (num_online_cpus() > 1 && timeout--) + while (!cpumask_empty(&cpus_stop_mask) && timeout--) udelay(1); } - /* if the REBOOT_VECTOR didn't work, try with the NMI */ - if (num_online_cpus() > 1) { + /* + * Park all other CPUs in INIT including "offline" CPUs, if + * possible. That's a safe place where they can't resume execution + * of HLT and then execute the HLT loop from overwritten text or + * page tables. + * + * The only downside is a broadcast MCE, but up to the point where + * the kexec() kernel brought all APs online again an MCE will just + * make HLT resume and handle the MCE. The machine crashes and burns + * due to overwritten text, page tables and data. So there is a + * choice between fire and frying pan. The result is pretty much + * the same. Chose frying pan until x86 provides a sane mechanism + * to park a CPU. + */ + if (smp_park_other_cpus_in_init()) + goto done; + + /* + * If park with INIT was not possible and the REBOOT_VECTOR didn't + * take all secondary CPUs offline, try with the NMI. + */ + if (!cpumask_empty(&cpus_stop_mask)) { /* * If NMI IPI is enabled, try to register the stop handler * and send the IPI. In any case try to wait for the other * CPUs to stop. */ if (!smp_no_nmi_ipi && !register_stop_handler()) { - /* Sync above data before sending IRQ */ - wmb(); - pr_emerg("Shutting down cpus with NMI\n"); - apic_send_IPI_allbutself(NMI_VECTOR); + for_each_cpu(cpu, &cpus_stop_mask) + apic->send_IPI(cpu, NMI_VECTOR); } /* * Don't wait longer than 10 ms if the caller didn't @@ -208,14 +245,21 @@ static void native_stop_other_cpus(int wait) * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; - while (num_online_cpus() > 1 && (wait || timeout--)) + while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--)) udelay(1); } +done: local_irq_save(flags); disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); local_irq_restore(flags); + + /* + * Ensure that the cpus_stop_mask cache lines are invalidated on + * the other CPUs. See comment vs. SME in stop_this_cpu(). + */ + cpumask_clear(&cpus_stop_mask); } /* @@ -268,8 +312,7 @@ struct smp_ops smp_ops = { #endif .smp_send_reschedule = native_smp_send_reschedule, - .cpu_up = native_cpu_up, - .cpu_die = native_cpu_die, + .kick_ap_alive = native_kick_ap, .cpu_disable = native_cpu_disable, .play_dead = native_play_dead, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 352f0ce1ece4..e1aa2cd7734b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -53,10 +53,13 @@ #include <linux/tboot.h> #include <linux/gfp.h> #include <linux/cpuidle.h> +#include <linux/kexec.h> #include <linux/numa.h> #include <linux/pgtable.h> #include <linux/overflow.h> #include <linux/stackprotector.h> +#include <linux/cpuhotplug.h> +#include <linux/mc146818rtc.h> #include <asm/acpi.h> #include <asm/cacheinfo.h> @@ -74,7 +77,7 @@ #include <asm/fpu/api.h> #include <asm/setup.h> #include <asm/uv/uv.h> -#include <linux/mc146818rtc.h> +#include <asm/microcode.h> #include <asm/i8259.h> #include <asm/misc.h> #include <asm/qspinlock.h> @@ -101,6 +104,26 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map); DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +/* CPUs which are the primary SMT threads */ +struct cpumask __cpu_primary_thread_mask __read_mostly; + +/* Representing CPUs for which sibling maps can be computed */ +static cpumask_var_t cpu_sibling_setup_mask; + +struct mwait_cpu_dead { + unsigned int control; + unsigned int status; +}; + +#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF +#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD + +/* + * Cache line aligned data for mwait_play_dead(). Separate on purpose so + * that it's unlikely to be touched by other CPUs. + */ +static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); + /* Logical package management. We might want to allocate that dynamically */ unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); @@ -121,7 +144,6 @@ int arch_update_cpu_topology(void) return retval; } - static unsigned int smpboot_warm_reset_vector_count; static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) @@ -154,66 +176,63 @@ static inline void smpboot_restore_warm_reset_vector(void) } -/* - * Report back to the Boot Processor during boot time or to the caller processor - * during CPU online. - */ -static void smp_callin(void) +/* Run the next set of setup steps for the upcoming CPU */ +static void ap_starting(void) { - int cpuid; + int cpuid = smp_processor_id(); - /* - * If waken up by an INIT in an 82489DX configuration - * cpu_callout_mask guarantees we don't get here before - * an INIT_deassert IPI reaches our local APIC, so it is - * now safe to touch our local APIC. - */ - cpuid = smp_processor_id(); + /* Mop up eventual mwait_play_dead() wreckage */ + this_cpu_write(mwait_cpu_dead.status, 0); + this_cpu_write(mwait_cpu_dead.control, 0); /* - * the boot CPU has finished the init stage and is spinning - * on callin_map until we finish. We are free to set up this - * CPU, first the APIC. (this is probably redundant on most - * boards) + * If woken up by an INIT in an 82489DX configuration the alive + * synchronization guarantees that the CPU does not reach this + * point before an INIT_deassert IPI reaches the local APIC, so it + * is now safe to touch the local APIC. + * + * Set up this CPU, first the APIC, which is probably redundant on + * most boards. */ apic_ap_setup(); - /* - * Save our processor parameters. Note: this information - * is needed for clock calibration. - */ + /* Save the processor parameters. */ smp_store_cpu_info(cpuid); /* * The topology information must be up to date before - * calibrate_delay() and notify_cpu_starting(). + * notify_cpu_starting(). */ - set_cpu_sibling_map(raw_smp_processor_id()); + set_cpu_sibling_map(cpuid); ap_init_aperfmperf(); - /* - * Get our bogomips. - * Update loops_per_jiffy in cpu_data. Previous call to - * smp_store_cpu_info() stored a value that is close but not as - * accurate as the value just calculated. - */ - calibrate_delay(); - cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; pr_debug("Stack at about %p\n", &cpuid); wmb(); + /* + * This runs the AP through all the cpuhp states to its target + * state CPUHP_ONLINE. + */ notify_cpu_starting(cpuid); +} +static void ap_calibrate_delay(void) +{ /* - * Allow the master to continue. + * Calibrate the delay loop and update loops_per_jiffy in cpu_data. + * smp_store_cpu_info() stored a value that is close but not as + * accurate as the value just calculated. + * + * As this is invoked after the TSC synchronization check, + * calibrate_delay_is_known() will skip the calibration routine + * when TSC is synchronized across sockets. */ - cpumask_set_cpu(cpuid, cpu_callin_mask); + calibrate_delay(); + cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy; } -static int cpu0_logical_apicid; -static int enable_start_cpu0; /* * Activate a secondary processor. */ @@ -226,24 +245,63 @@ static void notrace start_secondary(void *unused) */ cr4_init(); -#ifdef CONFIG_X86_32 - /* switch away from the initial page table */ - load_cr3(swapper_pg_dir); - __flush_tlb_all(); -#endif - cpu_init_secondary(); + /* + * 32-bit specific. 64-bit reaches this code with the correct page + * table established. Yet another historical divergence. + */ + if (IS_ENABLED(CONFIG_X86_32)) { + /* switch away from the initial page table */ + load_cr3(swapper_pg_dir); + __flush_tlb_all(); + } + + cpu_init_exception_handling(); + + /* + * 32-bit systems load the microcode from the ASM startup code for + * historical reasons. + * + * On 64-bit systems load it before reaching the AP alive + * synchronization point below so it is not part of the full per + * CPU serialized bringup part when "parallel" bringup is enabled. + * + * That's even safe when hyperthreading is enabled in the CPU as + * the core code starts the primary threads first and leaves the + * secondary threads waiting for SIPI. Loading microcode on + * physical cores concurrently is a safe operation. + * + * This covers both the Intel specific issue that concurrent + * microcode loading on SMT siblings must be prohibited and the + * vendor independent issue`that microcode loading which changes + * CPUID, MSRs etc. must be strictly serialized to maintain + * software state correctness. + */ + if (IS_ENABLED(CONFIG_X86_64)) + load_ucode_ap(); + + /* + * Synchronization point with the hotplug core. Sets this CPUs + * synchronization state to ALIVE and spin-waits for the control CPU to + * release this CPU for further bringup. + */ + cpuhp_ap_sync_alive(); + + cpu_init(); + fpu__init_cpu(); rcu_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); - smp_callin(); - enable_start_cpu0 = 0; + ap_starting(); + + /* Check TSC synchronization with the control CPU. */ + check_tsc_sync_target(); - /* otherwise gcc will move up smp_processor_id before the cpu_init */ - barrier(); /* - * Check TSC synchronization with the boot CPU: + * Calibrate the delay loop after the TSC synchronization check. + * This allows to skip the calibration when TSC is synchronized + * across sockets. */ - check_tsc_sync_target(); + ap_calibrate_delay(); speculative_store_bypass_ht_init(); @@ -257,7 +315,6 @@ static void notrace start_secondary(void *unused) set_cpu_online(smp_processor_id(), true); lapic_online(); unlock_vector_lock(); - cpu_set_state_online(smp_processor_id()); x86_platform.nmi_init(); /* enable local interrupts */ @@ -270,15 +327,6 @@ static void notrace start_secondary(void *unused) } /** - * topology_is_primary_thread - Check whether CPU is the primary SMT thread - * @cpu: CPU to check - */ -bool topology_is_primary_thread(unsigned int cpu) -{ - return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu)); -} - -/** * topology_smt_supported - Check whether SMT is supported by the CPUs */ bool topology_smt_supported(void) @@ -288,6 +336,7 @@ bool topology_smt_supported(void) /** * topology_phys_to_logical_pkg - Map a physical package id to a logical + * @phys_pkg: The physical package id to map * * Returns logical package id or -1 if not found */ @@ -304,15 +353,17 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg) return -1; } EXPORT_SYMBOL(topology_phys_to_logical_pkg); + /** * topology_phys_to_logical_die - Map a physical die id to logical + * @die_id: The physical die id to map + * @cur_cpu: The CPU for which the mapping is done * * Returns logical die id or -1 if not found */ -int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) +static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) { - int cpu; - int proc_id = cpu_data(cur_cpu).phys_proc_id; + int cpu, proc_id = cpu_data(cur_cpu).phys_proc_id; for_each_possible_cpu(cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -323,7 +374,6 @@ int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) } return -1; } -EXPORT_SYMBOL(topology_phys_to_logical_die); /** * topology_update_package_map - Update the physical to logical package map @@ -398,7 +448,7 @@ void smp_store_cpu_info(int id) c->cpu_index = id; /* * During boot time, CPU0 has this setup already. Save the info when - * bringing up AP or offlined CPU0. + * bringing up an AP. */ identify_secondary_cpu(c); c->initialized = true; @@ -552,7 +602,7 @@ static int x86_core_flags(void) #ifdef CONFIG_SCHED_SMT static int x86_smt_flags(void) { - return cpu_smt_flags() | x86_sched_itmt_flags(); + return cpu_smt_flags(); } #endif #ifdef CONFIG_SCHED_CLUSTER @@ -563,50 +613,57 @@ static int x86_cluster_flags(void) #endif #endif -static struct sched_domain_topology_level x86_numa_in_package_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, -#endif - { NULL, }, -}; +/* + * Set if a package/die has multiple NUMA nodes inside. + * AMD Magny-Cours, Intel Cluster-on-Die, and Intel + * Sub-NUMA Clustering have this. + */ +static bool x86_has_numa_in_package; -static struct sched_domain_topology_level x86_hybrid_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; +static struct sched_domain_topology_level x86_topology[6]; + +static void __init build_sched_topology(void) +{ + int i = 0; -static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) + }; #endif #ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, + /* + * For now, skip the cluster domain on Hybrid. + */ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) + }; + } #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) + }; #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; + /* + * When there is NUMA topology inside the package skip the DIE domain + * since the NUMA domains will auto-magically create the right spanning + * domains based on the SLIT. + */ + if (!x86_has_numa_in_package) { + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_cpu_mask, SD_INIT_NAME(DIE) + }; + } -/* - * Set if a package/die has multiple NUMA nodes inside. - * AMD Magny-Cours, Intel Cluster-on-Die, and Intel - * Sub-NUMA Clustering have this. - */ -static bool x86_has_numa_in_package; + /* + * There must be one trailing NULL entry left. + */ + BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); + + set_sched_topology(x86_topology); +} void set_cpu_sibling_map(int cpu) { @@ -706,9 +763,9 @@ static void impress_friends(void) * Allow the user to impress friends. */ pr_debug("Before bogomips\n"); - for_each_possible_cpu(cpu) - if (cpumask_test_cpu(cpu, cpu_callout_mask)) - bogosum += cpu_data(cpu).loops_per_jiffy; + for_each_online_cpu(cpu) + bogosum += cpu_data(cpu).loops_per_jiffy; + pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", num_online_cpus(), bogosum/(500000/HZ), @@ -795,86 +852,42 @@ static void __init smp_quirk_init_udelay(void) } /* - * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal - * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this - * won't ... remember to clear down the APIC, etc later. + * Wake up AP by INIT, INIT, STARTUP sequence. */ -int -wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) +static void send_init_sequence(int phys_apicid) { - u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; - unsigned long send_status, accept_status = 0; - int maxlvt; - - /* Target chip */ - /* Boot on the stack */ - /* Kick the second */ - apic_icr_write(APIC_DM_NMI | dm, apicid); + int maxlvt = lapic_get_maxlvt(); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); + /* Be paranoid about clearing APIC errors. */ if (APIC_INTEGRATED(boot_cpu_apic_version)) { - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + /* Due to the Pentium erratum 3AP. */ + if (maxlvt > 3) apic_write(APIC_ESR, 0); - accept_status = (apic_read(APIC_ESR) & 0xEF); + apic_read(APIC_ESR); } - pr_debug("NMI sent\n"); - if (send_status) - pr_err("APIC never delivered???\n"); - if (accept_status) - pr_err("APIC delivery error (%lx)\n", accept_status); + /* Assert INIT on the target CPU */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); + safe_apic_wait_icr_idle(); - return (send_status | accept_status); + udelay(init_udelay); + + /* Deassert INIT on the target CPU */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + safe_apic_wait_icr_idle(); } -static int -wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) +/* + * Wake up AP by INIT, INIT, STARTUP sequence. + */ +static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status = 0, accept_status = 0; - int maxlvt, num_starts, j; + int num_starts, j, maxlvt; + preempt_disable(); maxlvt = lapic_get_maxlvt(); - - /* - * Be paranoid about clearing APIC errors. - */ - if (APIC_INTEGRATED(boot_cpu_apic_version)) { - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } - - pr_debug("Asserting INIT\n"); - - /* - * Turn INIT on target chip - */ - /* - * Send IPI - */ - apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, - phys_apicid); - - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); - - udelay(init_udelay); - - pr_debug("Deasserting INIT\n"); - - /* Target chip */ - /* Send IPI */ - apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); - - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + send_init_sequence(phys_apicid); mb(); @@ -945,15 +958,16 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) if (accept_status) pr_err("APIC delivery error (%lx)\n", accept_status); + preempt_enable(); return (send_status | accept_status); } /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { + static int width, node_width, first = 1; static int current_node = NUMA_NO_NODE; int node = early_cpu_to_node(cpu); - static int width, node_width; if (!width) width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */ @@ -961,10 +975,10 @@ static void announce_cpu(int cpu, int apicid) if (!node_width) node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */ - if (cpu == 1) - printk(KERN_INFO "x86: Booting SMP configuration:\n"); - if (system_state < SYSTEM_RUNNING) { + if (first) + pr_info("x86: Booting SMP configuration:\n"); + if (node != current_node) { if (current_node > (-1)) pr_cont("\n"); @@ -975,77 +989,16 @@ static void announce_cpu(int cpu, int apicid) } /* Add padding for the BSP */ - if (cpu == 1) + if (first) pr_cont("%*s", width + 1, " "); + first = 0; pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu); - } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", node, cpu, apicid); } -static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs) -{ - int cpu; - - cpu = smp_processor_id(); - if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0) - return NMI_HANDLED; - - return NMI_DONE; -} - -/* - * Wake up AP by INIT, INIT, STARTUP sequence. - * - * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS - * boot-strap code which is not a desired behavior for waking up BSP. To - * void the boot-strap code, wake up CPU0 by NMI instead. - * - * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined - * (i.e. physically hot removed and then hot added), NMI won't wake it up. - * We'll change this code in the future to wake up hard offlined CPU0 if - * real platform and request are available. - */ -static int -wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, - int *cpu0_nmi_registered) -{ - int id; - int boot_error; - - preempt_disable(); - - /* - * Wake up AP by INIT, INIT, STARTUP sequence. - */ - if (cpu) { - boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); - goto out; - } - - /* - * Wake up BSP by nmi. - * - * Register a NMI handler to help wake up CPU0. - */ - boot_error = register_nmi_handler(NMI_LOCAL, - wakeup_cpu0_nmi, 0, "wake_cpu0"); - - if (!boot_error) { - enable_start_cpu0 = 1; - *cpu0_nmi_registered = 1; - id = apic->dest_mode_logical ? cpu0_logical_apicid : apicid; - boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); - } - -out: - preempt_enable(); - - return boot_error; -} - int common_cpu_up(unsigned int cpu, struct task_struct *idle) { int ret; @@ -1071,17 +1024,13 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. - * Returns zero if CPU booted OK, else error code from + * Returns zero if startup was successfully sent, else error code from * ->wakeup_secondary_cpu. */ -static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, - int *cpu0_nmi_registered) +static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) { - /* start_ip had better be page-aligned! */ unsigned long start_ip = real_mode_header->trampoline_start; - - unsigned long boot_error = 0; - unsigned long timeout; + int ret; #ifdef CONFIG_X86_64 /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ @@ -1094,7 +1043,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, if (IS_ENABLED(CONFIG_X86_32)) { early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_stack = idle->thread.sp; - } else { + } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) { smpboot_control = cpu; } @@ -1108,7 +1057,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, * This grunge runs the startup process for * the targeted processor. */ - if (x86_platform.legacy.warm_reset) { pr_debug("Setting warm reset code and vector.\n"); @@ -1123,13 +1071,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, } } - /* - * AP might wait on cpu_callout_mask in cpu_init() with - * cpu_initialized_mask set if previous attempt to online - * it timed-out. Clear cpu_initialized_mask so that after - * INIT/SIPI it could start with a clean state. - */ - cpumask_clear_cpu(cpu, cpu_initialized_mask); smp_mb(); /* @@ -1137,66 +1078,25 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, * - Use a method from the APIC driver if one defined, with wakeup * straight to 64-bit mode preferred over wakeup to RM. * Otherwise, - * - Use an INIT boot APIC message for APs or NMI for BSP. + * - Use an INIT boot APIC message */ if (apic->wakeup_secondary_cpu_64) - boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip); + ret = apic->wakeup_secondary_cpu_64(apicid, start_ip); else if (apic->wakeup_secondary_cpu) - boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); + ret = apic->wakeup_secondary_cpu(apicid, start_ip); else - boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, - cpu0_nmi_registered); + ret = wakeup_secondary_cpu_via_init(apicid, start_ip); - if (!boot_error) { - /* - * Wait 10s total for first sign of life from AP - */ - boot_error = -1; - timeout = jiffies + 10*HZ; - while (time_before(jiffies, timeout)) { - if (cpumask_test_cpu(cpu, cpu_initialized_mask)) { - /* - * Tell AP to proceed with initialization - */ - cpumask_set_cpu(cpu, cpu_callout_mask); - boot_error = 0; - break; - } - schedule(); - } - } - - if (!boot_error) { - /* - * Wait till AP completes initial initialization - */ - while (!cpumask_test_cpu(cpu, cpu_callin_mask)) { - /* - * Allow other tasks to run while we wait for the - * AP to come online. This also gives a chance - * for the MTRR work(triggered by the AP coming online) - * to be completed in the stop machine context. - */ - schedule(); - } - } - - if (x86_platform.legacy.warm_reset) { - /* - * Cleanup possible dangling ends... - */ - smpboot_restore_warm_reset_vector(); - } - - return boot_error; + /* If the wakeup mechanism failed, cleanup the warm reset vector */ + if (ret) + arch_cpuhp_cleanup_kick_cpu(cpu); + return ret; } -int native_cpu_up(unsigned int cpu, struct task_struct *tidle) +int native_kick_ap(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); - int cpu0_nmi_registered = 0; - unsigned long flags; - int err, ret = 0; + int err; lockdep_assert_irqs_enabled(); @@ -1210,24 +1110,11 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) } /* - * Already booted CPU? - */ - if (cpumask_test_cpu(cpu, cpu_callin_mask)) { - pr_debug("do_boot_cpu %d Already started\n", cpu); - return -ENOSYS; - } - - /* * Save current MTRR state in case it was changed since early boot * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: */ mtrr_save_state(); - /* x86 CPUs take themselves offline, so delayed offline is OK. */ - err = cpu_check_up_prepare(cpu); - if (err && err != -EBUSY) - return err; - /* the FPU context is blank, nobody can own it */ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; @@ -1235,41 +1122,44 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) if (err) return err; - err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); - if (err) { + err = do_boot_cpu(apicid, cpu, tidle); + if (err) pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); - ret = -EIO; - goto unreg_nmi; - } - /* - * Check TSC synchronization with the AP (keep irqs disabled - * while doing so): - */ - local_irq_save(flags); - check_tsc_sync_source(cpu); - local_irq_restore(flags); + return err; +} - while (!cpu_online(cpu)) { - cpu_relax(); - touch_nmi_watchdog(); - } +int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle) +{ + return smp_ops.kick_ap_alive(cpu, tidle); +} -unreg_nmi: - /* - * Clean up the nmi handler. Do this after the callin and callout sync - * to avoid impact of possible long unregister time. - */ - if (cpu0_nmi_registered) - unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); +void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) +{ + /* Cleanup possible dangling ends... */ + if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset) + smpboot_restore_warm_reset_vector(); +} - return ret; +void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) +{ + if (smp_ops.cleanup_dead_cpu) + smp_ops.cleanup_dead_cpu(cpu); + + if (system_state == SYSTEM_RUNNING) + pr_info("CPU %u is now offline\n", cpu); +} + +void arch_cpuhp_sync_state_poll(void) +{ + if (smp_ops.poll_sync_state) + smp_ops.poll_sync_state(); } /** - * arch_disable_smp_support() - disables SMP support for x86 at runtime + * arch_disable_smp_support() - Disables SMP support for x86 at boottime */ -void arch_disable_smp_support(void) +void __init arch_disable_smp_support(void) { disable_ioapic_support(); } @@ -1361,14 +1251,6 @@ static void __init smp_cpu_index_default(void) } } -static void __init smp_get_logical_apicid(void) -{ - if (x2apic_mode) - cpu0_logical_apicid = apic_read(APIC_LDR); - else - cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); -} - void __init smp_prepare_cpus_common(void) { unsigned int i; @@ -1379,7 +1261,6 @@ void __init smp_prepare_cpus_common(void) * Setup boot CPU information */ smp_store_boot_cpu_info(); /* Final full version of the data */ - cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); for_each_possible_cpu(i) { @@ -1390,18 +1271,24 @@ void __init smp_prepare_cpus_common(void) zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); } - /* - * Set 'default' x86 topology, this matches default_topology() in that - * it has NUMA nodes as a topology level. See also - * native_smp_cpus_done(). - * - * Must be done before set_cpus_sibling_map() is ran. - */ - set_sched_topology(x86_topology); - set_cpu_sibling_map(0); } +#ifdef CONFIG_X86_64 +/* Establish whether parallel bringup can be supported. */ +bool __init arch_cpuhp_init_parallel_bringup(void) +{ + if (!x86_cpuinit.parallel_bringup) { + pr_info("Parallel CPU startup disabled by the platform\n"); + return false; + } + + smpboot_control = STARTUP_READ_APICID; + pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control); + return true; +} +#endif + /* * Prepare for SMP bootup. * @max_cpus: configured maximum number of CPUs, It is a legacy parameter @@ -1431,8 +1318,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) /* Setup local timer */ x86_init.timers.setup_percpu_clockev(); - smp_get_logical_apicid(); - pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); @@ -1455,6 +1340,33 @@ void arch_thaw_secondary_cpus_end(void) cache_aps_init(); } +bool smp_park_other_cpus_in_init(void) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + unsigned int apicid; + + if (apic->wakeup_secondary_cpu_64 || apic->wakeup_secondary_cpu) + return false; + + /* + * If this is a crash stop which does not execute on the boot CPU, + * then this cannot use the INIT mechanism because INIT to the boot + * CPU will reset the machine. + */ + if (this_cpu) + return false; + + for_each_present_cpu(cpu) { + if (cpu == this_cpu) + continue; + apicid = apic->cpu_present_to_apicid(cpu); + if (apicid == BAD_APICID) + continue; + send_init_sequence(apicid); + } + return true; +} + /* * Early setup to make printk work. */ @@ -1466,9 +1378,6 @@ void __init native_smp_prepare_boot_cpu(void) if (!IS_ENABLED(CONFIG_SMP)) switch_gdt_and_percpu_base(me); - /* already set me in cpu_online_mask in boot_cpu_init() */ - cpumask_set_cpu(me, cpu_callout_mask); - cpu_set_state_online(me); native_pv_lock_init(); } @@ -1490,13 +1399,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) pr_debug("Boot done\n"); calculate_max_logical_packages(); - - /* XXX for now assume numa-in-package and hybrid don't overlap */ - if (x86_has_numa_in_package) - set_sched_topology(x86_numa_in_package_topology); - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - set_sched_topology(x86_hybrid_topology); - + build_sched_topology(); nmi_selftest(); impress_friends(); cache_aps_init(); @@ -1592,6 +1495,12 @@ __init void prefill_possible_map(void) set_cpu_possible(i, true); } +/* correctly size the local cpu masks */ +void __init setup_cpu_local_masks(void) +{ + alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +} + #ifdef CONFIG_HOTPLUG_CPU /* Recompute SMT state for all CPUs on offline */ @@ -1650,10 +1559,6 @@ static void remove_siblinginfo(int cpu) static void remove_cpu_from_maps(int cpu) { set_cpu_online(cpu, false); - cpumask_clear_cpu(cpu, cpu_callout_mask); - cpumask_clear_cpu(cpu, cpu_callin_mask); - /* was set by cpu_init() */ - cpumask_clear_cpu(cpu, cpu_initialized_mask); numa_remove_cpu(cpu); } @@ -1704,64 +1609,27 @@ int native_cpu_disable(void) return 0; } -int common_cpu_die(unsigned int cpu) -{ - int ret = 0; - - /* We don't do anything here: idle task is faking death itself. */ - - /* They ack this in play_dead() by setting CPU_DEAD */ - if (cpu_wait_death(cpu, 5)) { - if (system_state == SYSTEM_RUNNING) - pr_info("CPU %u is now offline\n", cpu); - } else { - pr_err("CPU %u didn't die...\n", cpu); - ret = -1; - } - - return ret; -} - -void native_cpu_die(unsigned int cpu) -{ - common_cpu_die(cpu); -} - void play_dead_common(void) { idle_task_exit(); - /* Ack it */ - (void)cpu_report_death(); - + cpuhp_ap_report_dead(); /* * With physical CPU hotplug, we should halt the cpu */ local_irq_disable(); } -/** - * cond_wakeup_cpu0 - Wake up CPU0 if needed. - * - * If NMI wants to wake up CPU0, start CPU0. - */ -void cond_wakeup_cpu0(void) -{ - if (smp_processor_id() == 0 && enable_start_cpu0) - start_cpu0(); -} -EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); - /* * We need to flush the caches before going to sleep, lest we have * dirty data in our caches when we come back up. */ static inline void mwait_play_dead(void) { + struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); unsigned int eax, ebx, ecx, edx; unsigned int highest_cstate = 0; unsigned int highest_subcstate = 0; - void *mwait_ptr; int i; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || @@ -1796,12 +1664,9 @@ static inline void mwait_play_dead(void) (highest_subcstate - 1); } - /* - * This should be a memory location in a cache line which is - * unlikely to be touched by other processors. The actual - * content is immaterial as it is not actually modified in any way. - */ - mwait_ptr = ¤t_thread_info()->flags; + /* Set up state for the kexec() hack below */ + md->status = CPUDEAD_MWAIT_WAIT; + md->control = CPUDEAD_MWAIT_WAIT; wbinvd(); @@ -1814,13 +1679,58 @@ static inline void mwait_play_dead(void) * case where we return around the loop. */ mb(); - clflush(mwait_ptr); + clflush(md); mb(); - __monitor(mwait_ptr, 0, 0); + __monitor(md, 0, 0); mb(); __mwait(eax, 0); - cond_wakeup_cpu0(); + if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { + /* + * Kexec is about to happen. Don't go back into mwait() as + * the kexec kernel might overwrite text and data including + * page tables and stack. So mwait() would resume when the + * monitor cache line is written to and then the CPU goes + * south due to overwritten text, page tables and stack. + * + * Note: This does _NOT_ protect against a stray MCE, NMI, + * SMI. They will resume execution at the instruction + * following the HLT instruction and run into the problem + * which this is trying to prevent. + */ + WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT); + while(1) + native_halt(); + } + } +} + +/* + * Kick all "offline" CPUs out of mwait on kexec(). See comment in + * mwait_play_dead(). + */ +void smp_kick_mwait_play_dead(void) +{ + u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT; + struct mwait_cpu_dead *md; + unsigned int cpu, i; + + for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) { + md = per_cpu_ptr(&mwait_cpu_dead, cpu); + + /* Does it sit in mwait_play_dead() ? */ + if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT) + continue; + + /* Wait up to 5ms */ + for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) { + /* Bring it out of mwait */ + WRITE_ONCE(md->control, newstate); + udelay(5); + } + + if (READ_ONCE(md->status) != newstate) + pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu); } } @@ -1829,11 +1739,8 @@ void __noreturn hlt_play_dead(void) if (__this_cpu_read(cpu_info.x86) >= 4) wbinvd(); - while (1) { + while (1) native_halt(); - - cond_wakeup_cpu0(); - } } void native_play_dead(void) @@ -1852,12 +1759,6 @@ int native_cpu_disable(void) return -ENOSYS; } -void native_cpu_die(unsigned int cpu) -{ - /* We said "no" in __cpu_disable */ - BUG(); -} - void native_play_dead(void) { BUG(); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 1b83377274b8..ca004e2e4469 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -38,102 +38,12 @@ static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); #ifdef CONFIG_HOTPLUG_CPU - -#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0 -static int cpu0_hotpluggable = 1; -#else -static int cpu0_hotpluggable; -static int __init enable_cpu0_hotplug(char *str) -{ - cpu0_hotpluggable = 1; - return 1; -} - -__setup("cpu0_hotplug", enable_cpu0_hotplug); -#endif - -#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 -/* - * This function offlines a CPU as early as possible and allows userspace to - * boot up without the CPU. The CPU can be onlined back by user after boot. - * - * This is only called for debugging CPU offline/online feature. - */ -int _debug_hotplug_cpu(int cpu, int action) +int arch_register_cpu(int cpu) { - int ret; - - if (!cpu_is_hotpluggable(cpu)) - return -EINVAL; + struct x86_cpu *xc = per_cpu_ptr(&cpu_devices, cpu); - switch (action) { - case 0: - ret = remove_cpu(cpu); - if (!ret) - pr_info("DEBUG_HOTPLUG_CPU0: CPU %u is now offline\n", cpu); - else - pr_debug("Can't offline CPU%d.\n", cpu); - break; - case 1: - ret = add_cpu(cpu); - if (ret) - pr_debug("Can't online CPU%d.\n", cpu); - - break; - default: - ret = -EINVAL; - } - - return ret; -} - -static int __init debug_hotplug_cpu(void) -{ - _debug_hotplug_cpu(0, 0); - return 0; -} - -late_initcall_sync(debug_hotplug_cpu); -#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */ - -int arch_register_cpu(int num) -{ - struct cpuinfo_x86 *c = &cpu_data(num); - - /* - * Currently CPU0 is only hotpluggable on Intel platforms. Other - * vendors can add hotplug support later. - * Xen PV guests don't support CPU0 hotplug at all. - */ - if (c->x86_vendor != X86_VENDOR_INTEL || - cpu_feature_enabled(X86_FEATURE_XENPV)) - cpu0_hotpluggable = 0; - - /* - * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate - * depends on BSP. PIC interrupts depend on BSP. - * - * If the BSP dependencies are under control, one can tell kernel to - * enable BSP hotplug. This basically adds a control file and - * one can attempt to offline BSP. - */ - if (num == 0 && cpu0_hotpluggable) { - unsigned int irq; - /* - * We won't take down the boot processor on i386 if some - * interrupts only are able to be serviced by the BSP in PIC. - */ - for_each_active_irq(irq) { - if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) { - cpu0_hotpluggable = 0; - break; - } - } - } - if (num || cpu0_hotpluggable) - per_cpu(cpu_devices, num).cpu.hotpluggable = 1; - - return register_cpu(&per_cpu(cpu_devices, num).cpu, num); + xc->cpu.hotpluggable = cpu > 0; + return register_cpu(&xc->cpu, cpu); } EXPORT_SYMBOL(arch_register_cpu); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 344698852146..3425c6a943e4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -69,12 +69,10 @@ static int __init tsc_early_khz_setup(char *buf) } early_param("tsc_early_khz", tsc_early_khz_setup); -__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) +__always_inline void __cyc2ns_read(struct cyc2ns_data *data) { int seq, idx; - preempt_disable_notrace(); - do { seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; @@ -86,6 +84,12 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } +__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) +{ + preempt_disable_notrace(); + __cyc2ns_read(data); +} + __always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); @@ -115,18 +119,25 @@ __always_inline void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) +static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; - cyc2ns_read_begin(&data); + __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - cyc2ns_read_end(); + return ns; +} +static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns; + preempt_disable_notrace(); + ns = __cycles_2_ns(cyc); + preempt_enable_notrace(); return ns; } @@ -223,7 +234,7 @@ noinstr u64 native_sched_clock(void) u64 tsc_now = rdtsc(); /* return the value in ns */ - return cycles_2_ns(tsc_now); + return __cycles_2_ns(tsc_now); } /* @@ -250,7 +261,7 @@ u64 native_sched_clock_from_tsc(u64 tsc) /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT -noinstr u64 sched_clock(void) +noinstr u64 sched_clock_noinstr(void) { return paravirt_sched_clock(); } @@ -260,11 +271,20 @@ bool using_native_sched_clock(void) return static_call_query(pv_sched_clock) == native_sched_clock; } #else -u64 sched_clock(void) __attribute__((alias("native_sched_clock"))); +u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif +notrace u64 sched_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = sched_clock_noinstr(); + preempt_enable_notrace(); + return now; +} + int check_tsc_unstable(void) { return tsc_unstable; @@ -1598,10 +1618,7 @@ void __init tsc_init(void) #ifdef CONFIG_SMP /* - * If we have a constant TSC and are using the TSC for the delay loop, - * we can skip clock calibration if another cpu in the same socket has already - * been calibrated. This assumes that CONSTANT_TSC applies to all - * cpus in the socket - this should be a safe assumption. + * Check whether existing calibration data can be reused. */ unsigned long calibrate_delay_is_known(void) { @@ -1609,6 +1626,21 @@ unsigned long calibrate_delay_is_known(void) int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); + /* + * If TSC has constant frequency and TSC is synchronized across + * sockets then reuse CPU0 calibration. + */ + if (constant_tsc && !tsc_unstable) + return cpu_data(0).loops_per_jiffy; + + /* + * If TSC has constant frequency and TSC is not synchronized across + * sockets and this is not the first CPU in the socket, then reuse + * the calibration value of an already online CPU on that socket. + * + * This assumes that CONSTANT_TSC is consistent for all CPUs in a + * socket. + */ if (!constant_tsc || !mask) return 0; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9452dc9664b5..bbc440c93e08 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -245,7 +245,6 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) */ static atomic_t start_count; static atomic_t stop_count; -static atomic_t skip_test; static atomic_t test_runs; /* @@ -344,21 +343,14 @@ static inline unsigned int loop_timeout(int cpu) } /* - * Source CPU calls into this - it waits for the freshly booted - * target CPU to arrive and then starts the measurement: + * The freshly booted CPU initiates this via an async SMP function call. */ -void check_tsc_sync_source(int cpu) +static void check_tsc_sync_source(void *__cpu) { + unsigned int cpu = (unsigned long)__cpu; int cpus = 2; /* - * No need to check if we already know that the TSC is not - * synchronized or if we have no TSC. - */ - if (unsynchronized_tsc()) - return; - - /* * Set the maximum number of test runs to * 1 if the CPU does not provide the TSC_ADJUST MSR * 3 if the MSR is available, so the target can try to adjust @@ -368,16 +360,9 @@ void check_tsc_sync_source(int cpu) else atomic_set(&test_runs, 3); retry: - /* - * Wait for the target to start or to skip the test: - */ - while (atomic_read(&start_count) != cpus - 1) { - if (atomic_read(&skip_test) > 0) { - atomic_set(&skip_test, 0); - return; - } + /* Wait for the target to start. */ + while (atomic_read(&start_count) != cpus - 1) cpu_relax(); - } /* * Trigger the target to continue into the measurement too: @@ -397,14 +382,14 @@ retry: if (!nr_warps) { atomic_set(&test_runs, 0); - pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + pr_debug("TSC synchronization [CPU#%d -> CPU#%u]: passed\n", smp_processor_id(), cpu); } else if (atomic_dec_and_test(&test_runs) || random_warps) { /* Force it to 0 if random warps brought us here */ atomic_set(&test_runs, 0); - pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n", + pr_warn("TSC synchronization [CPU#%d -> CPU#%u]:\n", smp_processor_id(), cpu); pr_warn("Measured %Ld cycles TSC warp between CPUs, " "turning off TSC clock.\n", max_warp); @@ -457,11 +442,12 @@ void check_tsc_sync_target(void) * SoCs the TSC is frequency synchronized, but still the TSC ADJUST * register might have been wreckaged by the BIOS.. */ - if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) { - atomic_inc(&skip_test); + if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) return; - } + /* Kick the control CPU into the TSC synchronization function */ + smp_call_function_single(cpumask_first(cpu_online_mask), check_tsc_sync_source, + (unsigned long *)(unsigned long)cpu, 0); retry: /* * Register this CPU's participation and wait for the diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 3ac50b7298d1..7e574cf3bf8a 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -7,14 +7,23 @@ #include <asm/unwind.h> #include <asm/orc_types.h> #include <asm/orc_lookup.h> +#include <asm/orc_header.h> + +ORC_HEADER; #define orc_warn(fmt, ...) \ printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) #define orc_warn_current(args...) \ ({ \ - if (state->task == current && !state->error) \ + static bool dumped_before; \ + if (state->task == current && !state->error) { \ orc_warn(args); \ + if (unwind_debug && !dumped_before) { \ + dumped_before = true; \ + unwind_dump(state); \ + } \ + } \ }) extern int __start_orc_unwind_ip[]; @@ -23,8 +32,49 @@ extern struct orc_entry __start_orc_unwind[]; extern struct orc_entry __stop_orc_unwind[]; static bool orc_init __ro_after_init; +static bool unwind_debug __ro_after_init; static unsigned int lookup_num_blocks __ro_after_init; +static int __init unwind_debug_cmdline(char *str) +{ + unwind_debug = true; + + return 0; +} +early_param("unwind_debug", unwind_debug_cmdline); + +static void unwind_dump(struct unwind_state *state) +{ + static bool dumped_before; + unsigned long word, *sp; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + + if (dumped_before) + return; + + dumped_before = true; + + printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", + state->stack_info.type, state->stack_info.next_sp, + state->stack_mask, state->graph_idx); + + for (sp = __builtin_frame_address(0); sp; + sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) + break; + + for (; sp < stack_info.end; sp++) { + + word = READ_ONCE_NOCHECK(*sp); + + printk_deferred("%0*lx: %0*lx (%pB)\n", BITS_PER_LONG/4, + (unsigned long)sp, BITS_PER_LONG/4, + word, (void *)word); + } + } +} + static inline unsigned long orc_ip(const int *ip) { return (unsigned long)ip + *ip; @@ -136,21 +186,6 @@ static struct orc_entry null_orc_entry = { .type = ORC_TYPE_CALL }; -#ifdef CONFIG_CALL_THUNKS -static struct orc_entry *orc_callthunk_find(unsigned long ip) -{ - if (!is_callthunk((void *)ip)) - return NULL; - - return &null_orc_entry; -} -#else -static struct orc_entry *orc_callthunk_find(unsigned long ip) -{ - return NULL; -} -#endif - /* Fake frame pointer entry -- used as a fallback for generated code */ static struct orc_entry orc_fp_entry = { .type = ORC_TYPE_CALL, @@ -203,11 +238,7 @@ static struct orc_entry *orc_find(unsigned long ip) if (orc) return orc; - orc = orc_ftrace_find(ip); - if (orc) - return orc; - - return orc_callthunk_find(ip); + return orc_ftrace_find(ip); } #ifdef CONFIG_MODULES @@ -219,7 +250,6 @@ static struct orc_entry *cur_orc_table = __start_orc_unwind; static void orc_sort_swap(void *_a, void *_b, int size) { struct orc_entry *orc_a, *orc_b; - struct orc_entry orc_tmp; int *a = _a, *b = _b, tmp; int delta = _b - _a; @@ -231,9 +261,7 @@ static void orc_sort_swap(void *_a, void *_b, int size) /* Swap the corresponding .orc_unwind entries: */ orc_a = cur_orc_table + (a - cur_orc_ip_table); orc_b = cur_orc_table + (b - cur_orc_ip_table); - orc_tmp = *orc_a; - *orc_a = *orc_b; - *orc_b = orc_tmp; + swap(*orc_a, *orc_b); } static int orc_sort_cmp(const void *_a, const void *_b) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 25f155205770..03c885d3640f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -508,4 +508,8 @@ INIT_PER_CPU(irq_stack_backing_store); "fixed_percpu_data is not at start of per-cpu area"); #endif +#ifdef CONFIG_RETHUNK +. = ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned"); +#endif + #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index d82f4fa2f1bf..a37ebd3b4773 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -126,12 +126,13 @@ struct x86_init_ops x86_init __initdata = { struct x86_cpuinit_ops x86_cpuinit = { .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, + .parallel_bringup = true, }; static void default_nmi_init(void) { }; -static void enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { } -static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; } +static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; } +static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; } static bool enc_tlb_flush_required_noop(bool enc) { return false; } static bool enc_cache_flush_required_noop(void) { return false; } static bool is_private_mmio_noop(u64 addr) {return false; } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0c9660a07b23..7f4d13383cf2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -501,20 +501,15 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { - int r; - - r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) - goto out; - r = -EFAULT; + return -E2BIG; + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - return 0; + return -EFAULT; -out: cpuid->nent = vcpu->arch.cpuid_nent; - return r; + return 0; } /* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ @@ -734,6 +729,10 @@ void kvm_set_cpu_caps(void) F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ ); + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, + F(PERFMON_V2) + ); + /* * Synthesize "LFENCE is serializing" into the AMD-defined entry in * KVM's supported CPUID if the feature is reported as supported by the @@ -948,7 +947,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) union cpuid10_eax eax; union cpuid10_edx edx; - if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { + if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } @@ -1128,7 +1127,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; break; case 0x80000000: - entry->eax = min(entry->eax, 0x80000021); + entry->eax = min(entry->eax, 0x80000022); /* * Serializing LFENCE is reported in a multitude of ways, and * NullSegClearsBase is not reported in CPUID on Zen2; help @@ -1233,6 +1232,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = entry->ecx = entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0021_EAX); break; + /* AMD Extended Performance Monitoring and Debug */ + case 0x80000022: { + union cpuid_0x80000022_ebx ebx; + + entry->ecx = entry->edx = 0; + if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { + entry->eax = entry->ebx; + break; + } + + cpuid_entry_override(entry, CPUID_8000_0022_EAX); + + if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; + else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE; + else + ebx.split.num_core_pmc = AMD64_NUM_COUNTERS; + + entry->ebx = ebx.full; + break; + } /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4756bcb5724f..8dec646e764b 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -411,7 +411,10 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) pic_clear_isr(s, ret); if (addr1 >> 7 || ret != 2) pic_update_irq(s->pics_state); + /* Bit 7 is 1, means there's an interrupt */ + ret |= 0x80; } else { + /* Bit 7 is 0, means there's no interrupt */ ret = 0x07; pic_update_irq(s->pics_state); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3c300a196bdf..113ca9661ab2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -51,11 +51,6 @@ #define mod_64(x, y) ((x) % (y)) #endif -#define PRId64 "d" -#define PRIx64 "llx" -#define PRIu64 "u" -#define PRIo64 "o" - /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6eaa3d6994ae..ec169f5c7dce 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -58,6 +58,8 @@ extern bool itlb_multihit_kvm_mitigation; +static bool nx_hugepage_mitigation_hard_disabled; + int __read_mostly nx_huge_pages = -1; static uint __read_mostly nx_huge_pages_recovery_period_ms; #ifdef CONFIG_PREEMPT_RT @@ -67,12 +69,13 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp); static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, - .get = param_get_bool, + .get = get_nx_huge_pages, }; static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { @@ -1600,6 +1603,10 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); + if (kvm_x86_ops.set_apic_access_page_addr && + range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) + kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + return flush; } @@ -5797,6 +5804,14 @@ static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu vcpu_clear_mmio_info(vcpu, addr); + /* + * Walking and synchronizing SPTEs both assume they are operating in + * the context of the current MMU, and would need to be reworked if + * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT. + */ + if (WARN_ON_ONCE(mmu != vcpu->arch.mmu)) + return; + if (!VALID_PAGE(root_hpa)) return; @@ -6844,6 +6859,14 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) +{ + if (nx_hugepage_mitigation_hard_disabled) + return sprintf(buffer, "never\n"); + + return param_get_bool(buffer, kp); +} + static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ @@ -6860,15 +6883,29 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) bool old_val = nx_huge_pages; bool new_val; + if (nx_hugepage_mitigation_hard_disabled) + return -EPERM; + /* In "auto" mode deploy workaround only if CPU has the bug. */ - if (sysfs_streq(val, "off")) + if (sysfs_streq(val, "off")) { new_val = 0; - else if (sysfs_streq(val, "force")) + } else if (sysfs_streq(val, "force")) { new_val = 1; - else if (sysfs_streq(val, "auto")) + } else if (sysfs_streq(val, "auto")) { new_val = get_nx_auto_mode(); - else if (kstrtobool(val, &new_val) < 0) + } else if (sysfs_streq(val, "never")) { + new_val = 0; + + mutex_lock(&kvm_lock); + if (!list_empty(&vm_list)) { + mutex_unlock(&kvm_lock); + return -EBUSY; + } + nx_hugepage_mitigation_hard_disabled = true; + mutex_unlock(&kvm_lock); + } else if (kstrtobool(val, &new_val) < 0) { return -EINVAL; + } __set_nx_huge_pages(new_val); @@ -7006,6 +7043,9 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel uint old_period, new_period; int err; + if (nx_hugepage_mitigation_hard_disabled) + return -EPERM; + was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); @@ -7164,6 +7204,9 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) { int err; + if (nx_hugepage_mitigation_hard_disabled) + return 0; + err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, "kvm-nx-lpage-recovery", &kvm->arch.nx_huge_page_recovery_thread); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 08340219c35a..512163d52194 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -592,7 +592,10 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, /* * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and - * does not hold the mmu_lock. + * does not hold the mmu_lock. On failure, i.e. if a different logical + * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with + * the current value, so the caller operates on fresh data, e.g. if it + * retries tdp_mmu_set_spte_atomic() */ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 9fac1ec03463..3eb6e7f47e96 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -25,10 +25,24 @@ #define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) #define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) +static bool is_mtrr_base_msr(unsigned int msr) +{ + /* MTRR base MSRs use even numbers, masks use odd numbers. */ + return !(msr & 0x1); +} + +static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu, + unsigned int msr) +{ + int index = (msr - MTRRphysBase_MSR(0)) / 2; + + return &vcpu->arch.mtrr_state.var_ranges[index]; +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { - case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1): case MSR_MTRRfix64K_00000: case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: @@ -41,7 +55,6 @@ static bool msr_mtrr_valid(unsigned msr) case MSR_MTRRfix4K_F0000: case MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: - case MSR_IA32_CR_PAT: return true; } return false; @@ -52,7 +65,7 @@ static bool valid_mtrr_type(unsigned t) return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ } -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) { int i; u64 mask; @@ -60,9 +73,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!msr_mtrr_valid(msr)) return false; - if (msr == MSR_IA32_CR_PAT) { - return kvm_pat_valid(data); - } else if (msr == MSR_MTRRdefType) { + if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; return valid_mtrr_type(data & 0xff); @@ -74,7 +85,8 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } /* variable MTRRs */ - WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + WARN_ON(!(msr >= MTRRphysBase_MSR(0) && + msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))); mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { @@ -88,7 +100,6 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) return (data & mask) == 0; } -EXPORT_SYMBOL_GPL(kvm_mtrr_valid); static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) { @@ -308,10 +319,8 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; gfn_t start, end; - int index; - if (msr == MSR_IA32_CR_PAT || !tdp_enabled || - !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm)) return; if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) @@ -326,8 +335,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) end = ~0ULL; } else { /* variable range MTRRs. */ - index = (msr - 0x200) / 2; - var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end); + var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end); } kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); @@ -342,21 +350,18 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; struct kvm_mtrr_range *tmp, *cur; - int index, is_mtrr_mask; - index = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * index; - cur = &mtrr_state->var_ranges[index]; + cur = var_mtrr_msr_to_range(vcpu, msr); /* remove the entry if it's in the list. */ if (var_mtrr_range_is_valid(cur)) - list_del(&mtrr_state->var_ranges[index].node); + list_del(&cur->node); /* * Set all illegal GPA bits in the mask, since those bits must * implicitly be 0. The bits are then cleared when reading them. */ - if (!is_mtrr_mask) + if (is_mtrr_base_msr(msr)) cur->base = data; else cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu); @@ -382,8 +387,6 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data; else if (msr == MSR_MTRRdefType) vcpu->arch.mtrr_state.deftype = data; - else if (msr == MSR_IA32_CR_PAT) - vcpu->arch.pat = data; else set_var_mtrr_msr(vcpu, msr, data); @@ -411,21 +414,16 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 1; index = fixed_msr_to_range_index(msr); - if (index >= 0) + if (index >= 0) { *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index]; - else if (msr == MSR_MTRRdefType) + } else if (msr == MSR_MTRRdefType) { *pdata = vcpu->arch.mtrr_state.deftype; - else if (msr == MSR_IA32_CR_PAT) - *pdata = vcpu->arch.pat; - else { /* Variable MTRRs */ - int is_mtrr_mask; - - index = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * index; - if (!is_mtrr_mask) - *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; + } else { + /* Variable MTRRs */ + if (is_mtrr_base_msr(msr)) + *pdata = var_mtrr_msr_to_range(vcpu, msr)->base; else - *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; + *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask; *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu); } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 1690d41c1830..bf653df86112 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -93,11 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) -{ - return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); -} - static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); @@ -562,6 +557,14 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); + default: + break; + } return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); } @@ -577,13 +580,86 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + msr_info->data = pmu->global_status; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_CORE_PERF_GLOBAL_CTRL: + msr_info->data = pmu->global_ctrl; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + msr_info->data = 0; + break; + default: + return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + } + + return 0; } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + u64 data = msr_info->data; + u64 diff; + + /* + * Note, AMD ignores writes to reserved bits and read-only PMU MSRs, + * whereas Intel generates #GP on attempts to write reserved/RO MSRs. + */ + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + if (!msr_info->host_initiated) + return 1; /* RO MSR */ + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + /* Per PPR, Read-only MSR. Writes are ignored. */ + if (!msr_info->host_initiated) + break; + + if (data & pmu->global_status_mask) + return 1; + + pmu->global_status = data; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + data &= ~pmu->global_ctrl_mask; + fallthrough; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (!kvm_valid_perf_global_ctrl(pmu, data)) + return 1; + + if (pmu->global_ctrl != data) { + diff = pmu->global_ctrl ^ data; + pmu->global_ctrl = data; + reprogram_counters(pmu, diff); + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + /* + * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in + * GLOBAL_STATUS, and so the set of reserved bits is the same. + */ + if (data & pmu->global_status_mask) + return 1; + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + break; + default: + kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); + return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + } + + return 0; } /* refresh PMU settings. This function generally is called when underlying diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 5c7bbf03b599..7d9ba301c090 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -20,7 +20,6 @@ struct kvm_pmu_ops { bool (*hw_event_available)(struct kvm_pmc *pmc); - bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); @@ -37,10 +36,25 @@ struct kvm_pmu_ops { const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; + const int MIN_NR_GP_COUNTERS; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); +static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + * + * AMD's version of PERF_GLOBAL_CTRL conveniently shows up with v2. + */ + return pmu->version > 1; +} + static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -161,6 +175,7 @@ extern struct x86_pmu_capability kvm_pmu_cap; static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; /* * Hybrid PMUs don't play nice with virtualization without careful @@ -175,11 +190,15 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) perf_get_x86_pmu_capability(&kvm_pmu_cap); /* - * For Intel, only support guest architectural pmu - * on a host with architectural pmu. + * WARN if perf did NOT disable hardware PMU if the number of + * architecturally required GP counters aren't present, i.e. if + * there are a non-zero number of counters, but fewer than what + * is architecturally required. */ - if ((is_intel && !kvm_pmu_cap.version) || - !kvm_pmu_cap.num_counters_gp) + if (!kvm_pmu_cap.num_counters_gp || + WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_pmu_cap.version) enable_pmu = false; } @@ -201,6 +220,33 @@ static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } +static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff) +{ + int bit; + + if (!diff) + return; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + set_bit(bit, pmu->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); +} + +/* + * Check if a PMC is enabled by comparing it against global_ctrl bits. + * + * If the vPMU doesn't have global_ctrl MSR, all vPMCs are enabled. + */ +static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (!kvm_pmu_has_perf_global_ctrl(pmu)) + return true; + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index a5717282bb9c..56cbdb24400a 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -15,6 +15,7 @@ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, CPUID_7_1_EDX, CPUID_8000_0007_EDX, + CPUID_8000_0022_EAX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -47,6 +48,9 @@ enum kvm_only_cpuid_leafs { /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) +/* CPUID level 0x80000022 (EAX) */ +#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0) + struct cpuid_reg { u32 function; u32 index; @@ -74,6 +78,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX}, [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX}, [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, + [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, }; /* @@ -108,6 +113,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX_EDECCSSA; else if (x86_feature == X86_FEATURE_CONSTANT_TSC) return KVM_X86_FEATURE_CONSTANT_TSC; + else if (x86_feature == X86_FEATURE_PERFMON_V2) + return KVM_X86_FEATURE_PERFMON_V2; return x86_feature; } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 5fa939e411d8..cef5a3d0abd0 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -78,14 +78,6 @@ static bool amd_hw_event_available(struct kvm_pmc *pmc) return true; } -/* check if a PMC is enabled by comparing it against global_ctrl bits. Because - * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). - */ -static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) -{ - return true; -} - static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -102,12 +94,6 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30)); } -static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) -{ - /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ - return false; -} - static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -119,6 +105,29 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + switch (msr) { + case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: + return pmu->version > 0; + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE); + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + return pmu->version > 1; + default: + if (msr > MSR_F15H_PERF_CTR5 && + msr < MSR_F15H_PERF_CTL0 + 2 * pmu->nr_arch_gp_counters) + return pmu->version > 1; + break; + } + + return amd_msr_idx_to_pmc(vcpu, msr); +} + static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -172,20 +181,39 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void amd_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + union cpuid_0x80000022_ebx ebx; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + pmu->version = 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) { + pmu->version = 2; + /* + * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest + * CPUID entry is guaranteed to be non-NULL. + */ + BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 || + x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index); + ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx; + pmu->nr_arch_gp_counters = ebx.split.num_core_pmc; + } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE; - else + } else { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + } + + pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters, + kvm_pmu_cap.num_counters_gp); + + if (pmu->version > 1) { + pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_status_mask = pmu->global_ctrl_mask; + } pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; - pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; - pmu->global_status = 0; bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } @@ -216,11 +244,12 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } + + pmu->global_ctrl = pmu->global_status = 0; } struct kvm_pmu_ops amd_pmu_ops __initdata = { .hw_event_available = amd_hw_event_available, - .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -233,4 +262,5 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .reset = amd_pmu_reset, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, + .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 69ae5e1b3120..07756b7348ae 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2216,10 +2216,7 @@ void __init sev_hardware_setup(void) } sev_asid_count = max_sev_asid - min_sev_asid + 1; - if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)) - goto out; - - pr_info("SEV supported: %u ASIDs\n", sev_asid_count); + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); sev_supported = true; /* SEV-ES support requested? */ @@ -2244,13 +2241,19 @@ void __init sev_hardware_setup(void) goto out; sev_es_asid_count = min_sev_asid - 1; - if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)) - goto out; - - pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count); + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; out: + if (boot_cpu_has(X86_FEATURE_SEV)) + pr_info("SEV %s (ASIDs %u - %u)\n", + sev_supported ? "enabled" : "disabled", + min_sev_asid, max_sev_asid); + if (boot_cpu_has(X86_FEATURE_SEV_ES)) + pr_info("SEV-ES %s (ASIDs %u - %u)\n", + sev_es_supported ? "enabled" : "disabled", + min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; #endif diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 54089f990c8f..d381ad424554 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -244,15 +244,6 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa"; static unsigned long iopm_base; -struct kvm_ldttss_desc { - u16 limit0; - u16 base0; - unsigned base1:8, type:5, dpl:2, p:1; - unsigned limit1:4, zero0:3, g:1, base2:8; - u32 base3; - u32 zero1; -} __attribute__((packed)); - DEFINE_PER_CPU(struct svm_cpu_data, svm_data); /* @@ -588,7 +579,6 @@ static int svm_hardware_enable(void) struct svm_cpu_data *sd; uint64_t efer; - struct desc_struct *gdt; int me = raw_smp_processor_id(); rdmsrl(MSR_EFER, efer); @@ -601,9 +591,6 @@ static int svm_hardware_enable(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - gdt = get_current_gdt_rw(); - sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); - wrmsrl(MSR_EFER, efer | EFER_SVME); wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); @@ -752,7 +739,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) BUG_ON(offset == MSR_INVALID); - return !!test_bit(bit_write, &tmp); + return test_bit(bit_write, &tmp); } static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, @@ -2939,9 +2926,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_CR_PAT: - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) - return 1; - vcpu->arch.pat = data; + ret = kvm_set_msr_common(vcpu, msr); + if (ret) + break; + svm->vmcb01.ptr->save.g_pat = data; if (is_guest_mode(vcpu)) nested_vmcb02_compute_g_pat(svm); @@ -3418,8 +3406,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(vcpu, KVM_ISA_SVM); - /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) @@ -3457,14 +3443,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return svm_invoke_exit_handler(vcpu, exit_code); } -static void reload_tss(struct kvm_vcpu *vcpu) -{ - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); - - sd->tss_desc->type = 9; /* available 32/64-bit TSS */ - load_TR_desc(); -} - static void pre_svm_run(struct kvm_vcpu *vcpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); @@ -4099,9 +4077,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); - if (!sev_es_guest(vcpu->kvm)) - reload_tss(vcpu); - if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); @@ -4156,6 +4131,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(vcpu); + trace_kvm_exit(vcpu, KVM_ISA_SVM); + svm_complete_interrupts(vcpu); if (is_guest_mode(vcpu)) @@ -5025,9 +5002,22 @@ static __init void svm_set_cpu_caps(void) boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - /* AMD PMU PERFCTR_CORE CPUID */ - if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + if (enable_pmu) { + /* + * Enumerate support for PERFCTR_CORE if and only if KVM has + * access to enough counters to virtualize "core" support, + * otherwise limit vPMU support to the legacy number of counters. + */ + if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE) + kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS, + kvm_pmu_cap.num_counters_gp); + else + kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); + + if (kvm_pmu_cap.version != 2 || + !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2); + } /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f44751dd8d5d..18af7e712a5a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -303,7 +303,6 @@ struct svm_cpu_data { u32 max_asid; u32 next_asid; u32 min_asid; - struct kvm_ldttss_desc *tss_desc; struct page *save_area; unsigned long save_area_pa; diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 45162c1bcd8f..d0abee35d7ba 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -152,8 +152,8 @@ static inline bool cpu_has_vmx_ept(void) static inline bool vmx_umip_emulated(void) { - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_DESC; + return !boot_cpu_has(X86_FEATURE_UMIP) && + (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_DESC); } static inline bool cpu_has_vmx_rdtscp(void) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e35cf0bd0df9..516391cc0d64 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2328,8 +2328,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() * will not have to rewrite the controls just for this bit. */ - if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && - (vmcs12->guest_cr4 & X86_CR4_UMIP)) + if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) exec_control |= SECONDARY_EXEC_DESC; if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) @@ -2649,7 +2648,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -4524,7 +4523,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 741efe2c497b..80c769c58a87 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -73,18 +73,6 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } } -static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) -{ - int bit; - - if (!diff) - return; - - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - set_bit(bit, pmu->reprogram_pmi); - kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); -} - static bool intel_hw_event_available(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -107,17 +95,6 @@ static bool intel_hw_event_available(struct kvm_pmc *pmc) return true; } -/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ -static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (!intel_pmu_has_perf_global_ctrl(pmu)) - return true; - - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -198,11 +175,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - case MSR_CORE_PERF_GLOBAL_STATUS: - case MSR_CORE_PERF_GLOBAL_CTRL: - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - return intel_pmu_has_perf_global_ctrl(pmu); - break; + return kvm_pmu_has_perf_global_ctrl(pmu); case MSR_IA32_PEBS_ENABLE: ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; break; @@ -352,15 +325,6 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_FIXED_CTR_CTRL: msr_info->data = pmu->fixed_ctr_ctrl; break; - case MSR_CORE_PERF_GLOBAL_STATUS: - msr_info->data = pmu->global_status; - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - msr_info->data = pmu->global_ctrl; - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - msr_info->data = 0; - break; case MSR_IA32_PEBS_ENABLE: msr_info->data = pmu->pebs_enable; break; @@ -410,29 +374,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); break; - case MSR_CORE_PERF_GLOBAL_STATUS: - if (!msr_info->host_initiated) - return 1; /* RO MSR */ - - pmu->global_status = data; - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - if (!kvm_valid_perf_global_ctrl(pmu, data)) - return 1; - - if (pmu->global_ctrl != data) { - diff = pmu->global_ctrl ^ data; - pmu->global_ctrl = data; - reprogram_counters(pmu, diff); - } - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (data & pmu->global_ovf_ctrl_mask) - return 1; - - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - break; case MSR_IA32_PEBS_ENABLE: if (data & pmu->pebs_enable_mask) return 1; @@ -444,8 +385,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_DS_AREA: - if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS)) - return 1; if (is_noncanonical_address(data, vcpu)) return 1; @@ -531,7 +470,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; pmu->global_ctrl_mask = ~0ull; - pmu->global_ovf_ctrl_mask = ~0ull; + pmu->global_status_mask = ~0ull; pmu->fixed_ctr_ctrl_mask = ~0ull; pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; @@ -585,11 +524,17 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); pmu->global_ctrl_mask = counter_mask; - pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask + + /* + * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET) + * share reserved bit definitions. The kernel just happens to use + * OVF_CTRL for the names. + */ + pmu->global_status_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); if (vmx_pt_mode_is_host_guest()) - pmu->global_ovf_ctrl_mask &= + pmu->global_status_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); @@ -801,7 +746,7 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) pmc = intel_pmc_idx_to_pmc(pmu, bit); if (!pmc || !pmc_speculative_in_use(pmc) || - !intel_pmc_is_enabled(pmc) || !pmc->perf_event) + !pmc_is_globally_enabled(pmc) || !pmc->perf_event) continue; /* @@ -816,7 +761,6 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) struct kvm_pmu_ops intel_pmu_ops __initdata = { .hw_event_available = intel_hw_event_available, - .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, @@ -831,4 +775,5 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .cleanup = intel_pmu_cleanup, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, + .MIN_NR_GP_COUNTERS = 1, }; diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 2261b684a7d4..3e822e582497 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -357,11 +357,12 @@ static int handle_encls_einit(struct kvm_vcpu *vcpu) static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) { - if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) - return false; - + /* + * ENCLS generates a #UD if SGX1 isn't supported, i.e. this point will + * be reached if and only if the SGX1 leafs are enabled. + */ if (leaf >= ECREATE && leaf <= ETRACK) - return guest_cpuid_has(vcpu, X86_FEATURE_SGX1); + return true; if (leaf >= EAUG && leaf <= EMODT) return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); @@ -380,9 +381,11 @@ int handle_encls(struct kvm_vcpu *vcpu) { u32 leaf = (u32)kvm_rax_read(vcpu); - if (!encls_leaf_enabled_in_guest(vcpu, leaf)) { + if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { kvm_queue_exception(vcpu, UD_VECTOR); - } else if (!sgx_enabled_in_guest_bios(vcpu)) { + } else if (!encls_leaf_enabled_in_guest(vcpu, leaf) || + !sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) { kvm_inject_gp(vcpu, 0); } else { if (leaf == ECREATE) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 631fd7da2bc3..07e927d4d099 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -187,7 +187,7 @@ SYM_FUNC_START(__vmx_vcpu_run) _ASM_EXTABLE(.Lvmresume, .Lfixup) _ASM_EXTABLE(.Lvmlaunch, .Lfixup) -SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) +SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) /* Restore unwind state from before the VMRESUME/VMLAUNCH. */ UNWIND_HINT_RESTORE diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 44fb619803b8..0ecf4be2c6af 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2287,19 +2287,16 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; goto find_uret_msr; case MSR_IA32_CR_PAT: - if (!kvm_pat_valid(data)) - return 1; + ret = kvm_set_msr_common(vcpu, msr_info); + if (ret) + break; if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) get_vmcs12(vcpu)->guest_ia32_pat = data; - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, data); - vcpu->arch.pat = data; - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && @@ -3387,15 +3384,15 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long old_cr4 = kvm_read_cr4(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long hw_cr4; + /* * Pass through host's Machine Check Enable value to hw_cr4, which * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0. */ - unsigned long hw_cr4; - hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); if (is_unrestricted_guest(vcpu)) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; @@ -3404,7 +3401,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) else hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; - if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { + if (vmx_umip_emulated()) { if (cr4 & X86_CR4_UMIP) { secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); hw_cr4 &= ~X86_CR4_UMIP; @@ -5402,7 +5399,13 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) static int handle_desc(struct kvm_vcpu *vcpu) { - WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); + /* + * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this + * and other code needs to be updated if UMIP can be guest owned. + */ + BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); + + WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); return kvm_emulate_instruction(vcpu, 0); } @@ -6708,7 +6711,12 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { - struct page *page; + const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; + struct kvm *kvm = vcpu->kvm; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *slot; + unsigned long mmu_seq; + kvm_pfn_t pfn; /* Defer reload until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { @@ -6720,18 +6728,53 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) return; - page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) + /* + * Grab the memslot so that the hva lookup for the mmu_notifier retry + * is guaranteed to use the same memslot as the pfn lookup, i.e. rely + * on the pfn lookup's validation of the memslot to ensure a valid hva + * is used for the retry check. + */ + slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return; - vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); + /* + * Ensure that the mmu_notifier sequence count is read before KVM + * retrieves the pfn from the primary MMU. Note, the memslot is + * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() + * in kvm_mmu_invalidate_end(). + */ + mmu_seq = kvm->mmu_invalidate_seq; + smp_rmb(); + + /* + * No need to retry if the memslot does not exist or is invalid. KVM + * controls the APIC-access page memslot, and only deletes the memslot + * if APICv is permanently inhibited, i.e. the memslot won't reappear. + */ + pfn = gfn_to_pfn_memslot(slot, gfn); + if (is_error_noslot_pfn(pfn)) + return; + + read_lock(&vcpu->kvm->mmu_lock); + if (mmu_invalidate_retry_hva(kvm, mmu_seq, + gfn_to_hva_memslot(slot, gfn))) { + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + read_unlock(&vcpu->kvm->mmu_lock); + goto out; + } + + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); + read_unlock(&vcpu->kvm->mmu_lock); + vmx_flush_tlb_current(vcpu); +out: /* * Do not pin apic access page in memory, the MMU notifier * will call us again if it is migrated or swapped out. */ - put_page(page); + kvm_release_pfn_clean(pfn); } static void vmx_hwapic_isr_update(int max_isr) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 9e66531861cf..32384ba38499 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -93,18 +93,6 @@ union vmx_exit_reason { u32 full; }; -static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) -{ - /* - * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is - * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is - * greater than zero. However, KVM only exposes and emulates the MSR - * to/for the guest if the guest PMU supports at least "Architectural - * Performance Monitoring Version 2". - */ - return pmu->version > 1; -} - struct lbr_desc { /* Basic info about guest LBR records. */ struct x86_pmu_lbr records; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 04b57a336b34..a6b9bea62fb8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1017,13 +1017,11 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS - if (static_cpu_has(X86_FEATURE_PKU) && + if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != vcpu->arch.host_pkru && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) write_pkru(vcpu->arch.pkru); -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); @@ -1032,15 +1030,13 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return; -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS - if (static_cpu_has(X86_FEATURE_PKU) && + if (cpu_feature_enabled(X86_FEATURE_PKU) && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.host_pkru); } -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { @@ -1427,15 +1423,14 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); /* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) - * extract the supported MSRs from the related const lists. - * msrs_to_save is selected from the msrs_to_save_all to reflect the - * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs - * may depend on host virtualization features rather than host cpu features. + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds + * MSRs that KVM emulates without strictly requiring host support. + * msr_based_features holds MSRs that enumerate features, i.e. are effectively + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with + * msrs_to_save and emulated_msrs. */ static const u32 msrs_to_save_base[] = { @@ -1483,6 +1478,10 @@ static const u32 msrs_to_save_pmu[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + @@ -1531,11 +1530,11 @@ static const u32 emulated_msrs_all[] = { MSR_IA32_UCODE_REV, /* - * The following list leaves out MSRs whose values are determined - * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. - * We always support the "true" VMX control MSRs, even if the host - * processor does not, so I am putting these registers here rather - * than in msrs_to_save_all. + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). */ MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, @@ -1631,7 +1630,7 @@ static u64 kvm_get_arch_capabilities(void) * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us - * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that + * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that * capability to the guest too, and if EPT is disabled we're not * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will * require a nested hypervisor to do a flush of its own. @@ -1809,7 +1808,7 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { - allowed = !!test_bit(index - start, bitmap); + allowed = test_bit(index - start, bitmap); break; } } @@ -2799,14 +2798,13 @@ static u64 read_tsc(void) static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, int *mode) { - long v; u64 tsc_pg_val; + long v; switch (clock->vclock_mode) { case VDSO_CLOCKMODE_HVCLOCK: - tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), - tsc_timestamp); - if (tsc_pg_val != U64_MAX) { + if (hv_read_tsc_page_tsc(hv_get_tsc_page(), + tsc_timestamp, &tsc_pg_val)) { /* TSC page valid */ *mode = VDSO_CLOCKMODE_HVCLOCK; v = (tsc_pg_val - clock->cycle_last) & @@ -3702,8 +3700,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case 0x200 ... MSR_IA32_MC0_CTL2 - 1: - case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: + case MSR_IA32_CR_PAT: + if (!kvm_pat_valid(data)) + return 1; + + vcpu->arch.pat = data; + break; + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); @@ -4110,9 +4114,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; break; } + case MSR_IA32_CR_PAT: + msr_info->data = vcpu->arch.pat; + break; case MSR_MTRRcap: - case 0x200 ... MSR_IA32_MC0_CTL2 - 1: - case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -7150,6 +7157,12 @@ static void kvm_probe_msr_to_save(u32 msr_index) kvm_pmu_cap.num_counters_fixed) return; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + return; + break; case MSR_IA32_XFD: case MSR_IA32_XFD_ERR: if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) @@ -10435,20 +10448,6 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) -{ - unsigned long apic_address; - - /* - * The physical address of apic access page is stored in the VMCS. - * Update it when it becomes invalid. - */ - apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (start <= apic_address && apic_address < end) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); -} - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm); @@ -13162,7 +13161,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) { - return arch_atomic_read(&kvm->arch.assigned_device_count); + return raw_atomic_read(&kvm->arch.assigned_device_count); } EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c544602d07a3..82e3dafc5453 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -309,7 +309,6 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 01932af64193..ea3a28e7b613 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -61,8 +61,9 @@ ifeq ($(CONFIG_X86_32),y) lib-y += strstr_32.o lib-y += string_32.o lib-y += memmove_32.o + lib-y += cmpxchg8b_emu.o ifneq ($(CONFIG_X86_CMPXCHG64),y) - lib-y += cmpxchg8b_emu.o atomic64_386_32.o + lib-y += atomic64_386_32.o endif else obj-y += iomap_copy_64.o diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index 33c70c0160ea..6962df315793 100644 --- a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -1,47 +1,54 @@ /* SPDX-License-Identifier: GPL-2.0-only */ #include <linux/linkage.h> #include <asm/percpu.h> +#include <asm/processor-flags.h> .text /* + * Emulate 'cmpxchg16b %gs:(%rsi)' + * * Inputs: * %rsi : memory location to compare * %rax : low 64 bits of old value * %rdx : high 64 bits of old value * %rbx : low 64 bits of new value * %rcx : high 64 bits of new value - * %al : Operation successful + * + * Notably this is not LOCK prefixed and is not safe against NMIs */ SYM_FUNC_START(this_cpu_cmpxchg16b_emu) -# -# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not -# via the ZF. Caller will access %al to get result. -# -# Note that this is only useful for a cpuops operation. Meaning that we -# do *not* have a fully atomic operation but just an operation that is -# *atomic* on a single cpu (as provided by the this_cpu_xx class of -# macros). -# pushfq cli - cmpq PER_CPU_VAR((%rsi)), %rax - jne .Lnot_same - cmpq PER_CPU_VAR(8(%rsi)), %rdx - jne .Lnot_same + /* if (*ptr == old) */ + cmpq PER_CPU_VAR(0(%rsi)), %rax + jne .Lnot_same + cmpq PER_CPU_VAR(8(%rsi)), %rdx + jne .Lnot_same - movq %rbx, PER_CPU_VAR((%rsi)) - movq %rcx, PER_CPU_VAR(8(%rsi)) + /* *ptr = new */ + movq %rbx, PER_CPU_VAR(0(%rsi)) + movq %rcx, PER_CPU_VAR(8(%rsi)) + + /* set ZF in EFLAGS to indicate success */ + orl $X86_EFLAGS_ZF, (%rsp) popfq - mov $1, %al RET .Lnot_same: + /* *ptr != old */ + + /* old = *ptr */ + movq PER_CPU_VAR(0(%rsi)), %rax + movq PER_CPU_VAR(8(%rsi)), %rdx + + /* clear ZF in EFLAGS to indicate failure */ + andl $(~X86_EFLAGS_ZF), (%rsp) + popfq - xor %al,%al RET SYM_FUNC_END(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index 6a912d58fecc..49805257b125 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -2,10 +2,16 @@ #include <linux/linkage.h> #include <asm/export.h> +#include <asm/percpu.h> +#include <asm/processor-flags.h> .text +#ifndef CONFIG_X86_CMPXCHG64 + /* + * Emulate 'cmpxchg8b (%esi)' on UP + * * Inputs: * %esi : memory location to compare * %eax : low 32 bits of old value @@ -15,32 +21,65 @@ */ SYM_FUNC_START(cmpxchg8b_emu) -# -# Emulate 'cmpxchg8b (%esi)' on UP except we don't -# set the whole ZF thing (caller will just compare -# eax:edx with the expected value) -# pushfl cli - cmpl (%esi), %eax - jne .Lnot_same - cmpl 4(%esi), %edx - jne .Lhalf_same + cmpl 0(%esi), %eax + jne .Lnot_same + cmpl 4(%esi), %edx + jne .Lnot_same + + movl %ebx, 0(%esi) + movl %ecx, 4(%esi) - movl %ebx, (%esi) - movl %ecx, 4(%esi) + orl $X86_EFLAGS_ZF, (%esp) popfl RET .Lnot_same: - movl (%esi), %eax -.Lhalf_same: - movl 4(%esi), %edx + movl 0(%esi), %eax + movl 4(%esi), %edx + + andl $(~X86_EFLAGS_ZF), (%esp) popfl RET SYM_FUNC_END(cmpxchg8b_emu) EXPORT_SYMBOL(cmpxchg8b_emu) + +#endif + +#ifndef CONFIG_UML + +SYM_FUNC_START(this_cpu_cmpxchg8b_emu) + + pushfl + cli + + cmpl PER_CPU_VAR(0(%esi)), %eax + jne .Lnot_same2 + cmpl PER_CPU_VAR(4(%esi)), %edx + jne .Lnot_same2 + + movl %ebx, PER_CPU_VAR(0(%esi)) + movl %ecx, PER_CPU_VAR(4(%esi)) + + orl $X86_EFLAGS_ZF, (%esp) + + popfl + RET + +.Lnot_same2: + movl PER_CPU_VAR(0(%esi)), %eax + movl PER_CPU_VAR(4(%esi)), %edx + + andl $(~X86_EFLAGS_ZF), (%esp) + + popfl + RET + +SYM_FUNC_END(this_cpu_cmpxchg8b_emu) + +#endif diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c index 50734a23034c..cea25ca8b8cf 100644 --- a/arch/x86/lib/csum-partial_64.c +++ b/arch/x86/lib/csum-partial_64.c @@ -5,22 +5,34 @@ * This file contains network checksum routines that are better done * in an architecture-specific manner due to speed. */ - + #include <linux/compiler.h> #include <linux/export.h> #include <asm/checksum.h> #include <asm/word-at-a-time.h> -static inline unsigned short from32to16(unsigned a) +static inline unsigned short from32to16(unsigned a) { - unsigned short b = a >> 16; + unsigned short b = a >> 16; asm("addw %w2,%w0\n\t" - "adcw $0,%w0\n" + "adcw $0,%w0\n" : "=r" (b) : "0" (b), "r" (a)); return b; } +static inline __wsum csum_tail(u64 temp64, int odd) +{ + unsigned int result; + + result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff); + if (unlikely(odd)) { + result = from32to16(result); + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); + } + return (__force __wsum)result; +} + /* * Do a checksum on an arbitrary memory area. * Returns a 32bit checksum. @@ -35,7 +47,7 @@ static inline unsigned short from32to16(unsigned a) __wsum csum_partial(const void *buff, int len, __wsum sum) { u64 temp64 = (__force u64)sum; - unsigned odd, result; + unsigned odd; odd = 1 & (unsigned long) buff; if (unlikely(odd)) { @@ -47,21 +59,52 @@ __wsum csum_partial(const void *buff, int len, __wsum sum) buff++; } - while (unlikely(len >= 64)) { + /* + * len == 40 is the hot case due to IPv6 headers, but annotating it likely() + * has noticeable negative affect on codegen for all other cases with + * minimal performance benefit here. + */ + if (len == 40) { asm("addq 0*8(%[src]),%[res]\n\t" "adcq 1*8(%[src]),%[res]\n\t" "adcq 2*8(%[src]),%[res]\n\t" "adcq 3*8(%[src]),%[res]\n\t" "adcq 4*8(%[src]),%[res]\n\t" - "adcq 5*8(%[src]),%[res]\n\t" - "adcq 6*8(%[src]),%[res]\n\t" - "adcq 7*8(%[src]),%[res]\n\t" "adcq $0,%[res]" - : [res] "+r" (temp64) - : [src] "r" (buff) - : "memory"); - buff += 64; - len -= 64; + : [res] "+r"(temp64) + : [src] "r"(buff), "m"(*(const char(*)[40])buff)); + return csum_tail(temp64, odd); + } + if (unlikely(len >= 64)) { + /* + * Extra accumulators for better ILP in the loop. + */ + u64 tmp_accum, tmp_carries; + + asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t" + "xorl %k[tmp_carries],%k[tmp_carries]\n\t" + "subl $64, %[len]\n\t" + "1:\n\t" + "addq 0*8(%[src]),%[res]\n\t" + "adcq 1*8(%[src]),%[res]\n\t" + "adcq 2*8(%[src]),%[res]\n\t" + "adcq 3*8(%[src]),%[res]\n\t" + "adcl $0,%k[tmp_carries]\n\t" + "addq 4*8(%[src]),%[tmp_accum]\n\t" + "adcq 5*8(%[src]),%[tmp_accum]\n\t" + "adcq 6*8(%[src]),%[tmp_accum]\n\t" + "adcq 7*8(%[src]),%[tmp_accum]\n\t" + "adcl $0,%k[tmp_carries]\n\t" + "addq $64, %[src]\n\t" + "subl $64, %[len]\n\t" + "jge 1b\n\t" + "addq %[tmp_accum],%[res]\n\t" + "adcq %[tmp_carries],%[res]\n\t" + "adcq $0,%[res]" + : [tmp_accum] "=&r"(tmp_accum), + [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64), + [len] "+r"(len), [src] "+r"(buff) + : "m"(*(const char *)buff)); } if (len & 32) { @@ -70,45 +113,37 @@ __wsum csum_partial(const void *buff, int len, __wsum sum) "adcq 2*8(%[src]),%[res]\n\t" "adcq 3*8(%[src]),%[res]\n\t" "adcq $0,%[res]" - : [res] "+r" (temp64) - : [src] "r" (buff) - : "memory"); + : [res] "+r"(temp64) + : [src] "r"(buff), "m"(*(const char(*)[32])buff)); buff += 32; } if (len & 16) { asm("addq 0*8(%[src]),%[res]\n\t" "adcq 1*8(%[src]),%[res]\n\t" "adcq $0,%[res]" - : [res] "+r" (temp64) - : [src] "r" (buff) - : "memory"); + : [res] "+r"(temp64) + : [src] "r"(buff), "m"(*(const char(*)[16])buff)); buff += 16; } if (len & 8) { asm("addq 0*8(%[src]),%[res]\n\t" "adcq $0,%[res]" - : [res] "+r" (temp64) - : [src] "r" (buff) - : "memory"); + : [res] "+r"(temp64) + : [src] "r"(buff), "m"(*(const char(*)[8])buff)); buff += 8; } if (len & 7) { - unsigned int shift = (8 - (len & 7)) * 8; + unsigned int shift = (-len << 3) & 63; unsigned long trail; trail = (load_unaligned_zeropad(buff) << shift) >> shift; asm("addq %[trail],%[res]\n\t" "adcq $0,%[res]" - : [res] "+r" (temp64) - : [trail] "r" (trail)); + : [res] "+r"(temp64) + : [trail] "r"(trail)); } - result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff); - if (unlikely(odd)) { - result = from32to16(result); - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); - } - return (__force __wsum)result; + return csum_tail(temp64, odd); } EXPORT_SYMBOL(csum_partial); @@ -118,6 +153,6 @@ EXPORT_SYMBOL(csum_partial); */ __sum16 ip_compute_csum(const void *buff, int len) { - return csum_fold(csum_partial(buff,len,0)); + return csum_fold(csum_partial(buff, len, 0)); } EXPORT_SYMBOL(ip_compute_csum); diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index b64a2bd1a1ef..9c63713477bb 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -143,43 +143,43 @@ SYM_FUNC_END(__get_user_nocheck_8) EXPORT_SYMBOL(__get_user_nocheck_8) -SYM_CODE_START_LOCAL(.Lbad_get_user_clac) +SYM_CODE_START_LOCAL(__get_user_handle_exception) ASM_CLAC .Lbad_get_user: xor %edx,%edx mov $(-EFAULT),%_ASM_AX RET -SYM_CODE_END(.Lbad_get_user_clac) +SYM_CODE_END(__get_user_handle_exception) #ifdef CONFIG_X86_32 -SYM_CODE_START_LOCAL(.Lbad_get_user_8_clac) +SYM_CODE_START_LOCAL(__get_user_8_handle_exception) ASM_CLAC bad_get_user_8: xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX RET -SYM_CODE_END(.Lbad_get_user_8_clac) +SYM_CODE_END(__get_user_8_handle_exception) #endif /* get_user */ - _ASM_EXTABLE(1b, .Lbad_get_user_clac) - _ASM_EXTABLE(2b, .Lbad_get_user_clac) - _ASM_EXTABLE(3b, .Lbad_get_user_clac) + _ASM_EXTABLE(1b, __get_user_handle_exception) + _ASM_EXTABLE(2b, __get_user_handle_exception) + _ASM_EXTABLE(3b, __get_user_handle_exception) #ifdef CONFIG_X86_64 - _ASM_EXTABLE(4b, .Lbad_get_user_clac) + _ASM_EXTABLE(4b, __get_user_handle_exception) #else - _ASM_EXTABLE(4b, .Lbad_get_user_8_clac) - _ASM_EXTABLE(5b, .Lbad_get_user_8_clac) + _ASM_EXTABLE(4b, __get_user_8_handle_exception) + _ASM_EXTABLE(5b, __get_user_8_handle_exception) #endif /* __get_user */ - _ASM_EXTABLE(6b, .Lbad_get_user_clac) - _ASM_EXTABLE(7b, .Lbad_get_user_clac) - _ASM_EXTABLE(8b, .Lbad_get_user_clac) + _ASM_EXTABLE(6b, __get_user_handle_exception) + _ASM_EXTABLE(7b, __get_user_handle_exception) + _ASM_EXTABLE(8b, __get_user_handle_exception) #ifdef CONFIG_X86_64 - _ASM_EXTABLE(9b, .Lbad_get_user_clac) + _ASM_EXTABLE(9b, __get_user_handle_exception) #else - _ASM_EXTABLE(9b, .Lbad_get_user_8_clac) - _ASM_EXTABLE(10b, .Lbad_get_user_8_clac) + _ASM_EXTABLE(9b, __get_user_8_handle_exception) + _ASM_EXTABLE(10b, __get_user_8_handle_exception) #endif diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 02661861e5dd..0559b206fb11 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -38,10 +38,12 @@ SYM_FUNC_START(__memmove) cmp %rdi, %r8 jg 2f - /* FSRM implies ERMS => no length checks, do the copy directly */ +#define CHECK_LEN cmp $0x20, %rdx; jb 1f +#define MEMMOVE_BYTES movq %rdx, %rcx; rep movsb; RET .Lmemmove_begin_forward: - ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM - ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS + ALTERNATIVE_2 __stringify(CHECK_LEN), \ + __stringify(CHECK_LEN; MEMMOVE_BYTES), X86_FEATURE_ERMS, \ + __stringify(MEMMOVE_BYTES), X86_FEATURE_FSRM /* * movsq instruction have many startup latency @@ -207,11 +209,6 @@ SYM_FUNC_START(__memmove) movb %r11b, (%rdi) 13: RET - -.Lmemmove_erms: - movq %rdx, %rcx - rep movsb - RET SYM_FUNC_END(__memmove) EXPORT_SYMBOL(__memmove) diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index b09cd2ad426c..47fd9bd6b91d 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -27,14 +27,14 @@ void msrs_free(struct msr *msrs) EXPORT_SYMBOL(msrs_free); /** - * Read an MSR with error handling - * + * msr_read - Read an MSR with error handling * @msr: MSR to read * @m: value to read into * * It returns read data only on success, otherwise it doesn't change the output * argument @m. * + * Return: %0 for success, otherwise an error code */ static int msr_read(u32 msr, struct msr *m) { @@ -49,10 +49,12 @@ static int msr_read(u32 msr, struct msr *m) } /** - * Write an MSR with error handling + * msr_write - Write an MSR with error handling * * @msr: MSR to write * @m: value to write + * + * Return: %0 for success, otherwise an error code */ static int msr_write(u32 msr, struct msr *m) { @@ -88,12 +90,14 @@ static inline int __flip_bit(u32 msr, u8 bit, bool set) } /** - * Set @bit in a MSR @msr. + * msr_set_bit - Set @bit in a MSR @msr. + * @msr: MSR to write + * @bit: bit number to set * - * Retval: - * < 0: An error was encountered. - * = 0: Bit was already set. - * > 0: Hardware accepted the MSR write. + * Return: + * * < 0: An error was encountered. + * * = 0: Bit was already set. + * * > 0: Hardware accepted the MSR write. */ int msr_set_bit(u32 msr, u8 bit) { @@ -101,12 +105,14 @@ int msr_set_bit(u32 msr, u8 bit) } /** - * Clear @bit in a MSR @msr. + * msr_clear_bit - Clear @bit in a MSR @msr. + * @msr: MSR to write + * @bit: bit number to clear * - * Retval: - * < 0: An error was encountered. - * = 0: Bit was already cleared. - * > 0: Hardware accepted the MSR write. + * Return: + * * < 0: An error was encountered. + * * = 0: Bit was already cleared. + * * > 0: Hardware accepted the MSR write. */ int msr_clear_bit(u32 msr, u8 bit) { diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 3062d09a776d..1451e0c4ae22 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -131,22 +131,22 @@ SYM_FUNC_START(__put_user_nocheck_8) SYM_FUNC_END(__put_user_nocheck_8) EXPORT_SYMBOL(__put_user_nocheck_8) -SYM_CODE_START_LOCAL(.Lbad_put_user_clac) +SYM_CODE_START_LOCAL(__put_user_handle_exception) ASM_CLAC .Lbad_put_user: movl $-EFAULT,%ecx RET -SYM_CODE_END(.Lbad_put_user_clac) +SYM_CODE_END(__put_user_handle_exception) - _ASM_EXTABLE(1b, .Lbad_put_user_clac) - _ASM_EXTABLE(2b, .Lbad_put_user_clac) - _ASM_EXTABLE(3b, .Lbad_put_user_clac) - _ASM_EXTABLE(4b, .Lbad_put_user_clac) - _ASM_EXTABLE(5b, .Lbad_put_user_clac) - _ASM_EXTABLE(6b, .Lbad_put_user_clac) - _ASM_EXTABLE(7b, .Lbad_put_user_clac) - _ASM_EXTABLE(9b, .Lbad_put_user_clac) + _ASM_EXTABLE(1b, __put_user_handle_exception) + _ASM_EXTABLE(2b, __put_user_handle_exception) + _ASM_EXTABLE(3b, __put_user_handle_exception) + _ASM_EXTABLE(4b, __put_user_handle_exception) + _ASM_EXTABLE(5b, __put_user_handle_exception) + _ASM_EXTABLE(6b, __put_user_handle_exception) + _ASM_EXTABLE(7b, __put_user_handle_exception) + _ASM_EXTABLE(9b, __put_user_handle_exception) #ifdef CONFIG_X86_32 - _ASM_EXTABLE(8b, .Lbad_put_user_clac) - _ASM_EXTABLE(10b, .Lbad_put_user_clac) + _ASM_EXTABLE(8b, __put_user_handle_exception) + _ASM_EXTABLE(10b, __put_user_handle_exception) #endif diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index b3b1e376dce8..3fd066d42ec0 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -143,7 +143,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) * from re-poisioning the BTB prediction. */ .align 64 - .skip 63, 0xcc + .skip 64 - (__x86_return_thunk - zen_untrain_ret), 0xcc SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) ANNOTATE_NOENDBR /* diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 003d90138e20..e9251b89a9e9 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -9,6 +9,7 @@ #include <linux/export.h> #include <linux/uaccess.h> #include <linux/highmem.h> +#include <linux/libnvdimm.h> /* * Zero Userspace diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 7fe56c594aa6..91c52ead1226 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -32,6 +32,7 @@ #include <asm/traps.h> #include <asm/user.h> #include <asm/fpu/api.h> +#include <asm/fpu/regset.h> #include "fpu_system.h" #include "fpu_emu.h" diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4399983c50c..e8711b2cafaf 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -880,12 +880,6 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } -static noinline void -bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) -{ - __bad_area(regs, error_code, address, 0, SEGV_MAPERR); -} - static inline bool bad_area_access_from_pkeys(unsigned long error_code, struct vm_area_struct *vma) { @@ -1366,51 +1360,10 @@ void do_user_addr_fault(struct pt_regs *regs, lock_mmap: #endif /* CONFIG_PER_VMA_LOCK */ - /* - * Kernel-mode access to the user address space should only occur - * on well-defined single instructions listed in the exception - * tables. But, an erroneous kernel fault occurring outside one of - * those areas which also holds mmap_lock might deadlock attempting - * to validate the fault against the address space. - * - * Only do the expensive exception table search when we might be at - * risk of a deadlock. This happens if we - * 1. Failed to acquire mmap_lock, and - * 2. The access did not originate in userspace. - */ - if (unlikely(!mmap_read_trylock(mm))) { - if (!user_mode(regs) && !search_exception_tables(regs->ip)) { - /* - * Fault from code in kernel from - * which we do not expect faults. - */ - bad_area_nosemaphore(regs, error_code, address); - return; - } retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case we'll have missed the might_sleep() from - * down_read(): - */ - might_sleep(); - } - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { - bad_area(regs, error_code, address); - return; - } - if (likely(vma->vm_start <= address)) - goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, error_code, address); - return; - } - if (unlikely(expand_stack(vma, address))) { - bad_area(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } @@ -1418,7 +1371,6 @@ retry: * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ -good_area: if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, vma); return; diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 2c54b76d8f84..d9efa35711ee 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -3,6 +3,7 @@ #include <linux/export.h> #include <linux/swap.h> /* for totalram_pages */ #include <linux/memblock.h> +#include <asm/numa.h> void __init set_highmem_pages_init(void) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d4e2648a1dfb..b63403d7179d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -45,7 +45,6 @@ #include <asm/olpc_ofw.h> #include <asm/pgalloc.h> #include <asm/sections.h> -#include <asm/paravirt.h> #include <asm/setup.h> #include <asm/set_memory.h> #include <asm/page_types.h> @@ -74,7 +73,6 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *)alloc_low_page(); - paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); @@ -99,7 +97,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = (pte_t *)alloc_low_page(); - paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } @@ -181,12 +178,10 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, set_pte(newpte + i, pte[i]); *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE); - paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); BUG_ON(newpte != pte_offset_kernel(pmd, 0)); __flush_tlb_all(); - paravirt_release_pte(__pa(pte) >> PAGE_SHIFT); pte = newpte; } BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1) @@ -482,7 +477,6 @@ void __init native_pagetable_init(void) pfn, pmd, __pa(pmd), pte, __pa(pte)); pte_clear(NULL, va, pte); } - paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); paging_init(); } @@ -491,15 +485,8 @@ void __init native_pagetable_init(void) * point, we've been running on some set of pagetables constructed by * the boot process. * - * If we're booting on native hardware, this will be a pagetable - * constructed in arch/x86/kernel/head_32.S. The root of the - * pagetable will be swapper_pg_dir. - * - * If we're booting paravirtualized under a hypervisor, then there are - * more options: we may already be running PAE, and the pagetable may - * or may not be based in swapper_pg_dir. In any case, - * paravirt_pagetable_init() will set up swapper_pg_dir - * appropriately for the rest of the initialization to work. + * This will be a pagetable constructed in arch/x86/kernel/head_32.S. + * The root of the pagetable will be swapper_pg_dir. * * In general, pagetable_init() assumes that the pagetable may already * be partially populated, and so it avoids stomping on any existing diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 557f0fe25dff..37db264866b6 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -172,10 +172,10 @@ void __meminit init_trampoline_kaslr(void) set_p4d(p4d_tramp, __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)); } else { - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); } } diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index e0b51c09109f..54bbd5163e8d 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -319,7 +319,7 @@ static void enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) #endif } -static void amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc) +static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc) { /* * To maintain the security guarantees of SEV-SNP guests, make sure @@ -327,6 +327,8 @@ static void amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool */ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc) snp_set_memory_shared(vaddr, npages); + + return true; } /* Return true unconditionally: return value doesn't matter for the SEV side */ @@ -501,6 +503,21 @@ void __init sme_early_init(void) x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish; x86_platform.guest.enc_tlb_flush_required = amd_enc_tlb_flush_required; x86_platform.guest.enc_cache_flush_required = amd_enc_cache_flush_required; + + /* + * AMD-SEV-ES intercepts the RDMSR to read the X2APIC ID in the + * parallel bringup low level code. That raises #VC which cannot be + * handled there. + * It does not provide a RDMSR GHCB protocol so the early startup + * code cannot directly communicate with the secure firmware. The + * alternative solution to retrieve the APIC ID via CPUID(0xb), + * which is covered by the GHCB protocol, is not viable either + * because there is no enforcement of the CPUID(0xb) provided + * "initial" APIC ID to be the same as the real APIC ID. + * Disable parallel bootup. + */ + if (sev_status & MSR_AMD64_SEV_ES_ENABLED) + x86_cpuinit.parallel_bringup = false; } void __init mem_encrypt_free_decrypted_mem(void) diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index c6efcf559d88..d73aeb16417f 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -188,7 +188,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) if (pmd_large(*pmd)) return; - pte = pte_offset_map(pmd, ppd->vaddr); + pte = pte_offset_kernel(pmd, ppd->vaddr); if (pte_none(*pte)) set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); } @@ -612,7 +612,7 @@ void __init sme_enable(struct boot_params *bp) out: if (sme_me_mask) { physical_mask &= ~sme_me_mask; - cc_set_vendor(CC_VENDOR_AMD); + cc_vendor = CC_VENDOR_AMD; cc_set_mask(sme_me_mask); } } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 7159cf787613..df4182b6449f 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/seq_file.h> +#include <linux/proc_fs.h> #include <linux/debugfs.h> #include <linux/pfn.h> #include <linux/percpu.h> @@ -231,7 +232,7 @@ within_inclusive(unsigned long addr, unsigned long start, unsigned long end) * points to #2, but almost all physical-to-virtual translations point to #1. * * This is so that we can have both a directmap of all physical memory *and* - * take full advantage of the the limited (s32) immediate addressing range (2G) + * take full advantage of the limited (s32) immediate addressing range (2G) * of x86_64. * * See Documentation/arch/x86/x86_64/mm.rst for more detail. @@ -2151,7 +2152,8 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required()); /* Notify hypervisor that we are about to set/clr encryption attribute. */ - x86_platform.guest.enc_status_change_prepare(addr, numpages, enc); + if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc)) + return -EIO; ret = __change_page_attr_set_clr(&cpa, 1); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e4f499eb0f29..15a8009a4480 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -702,14 +702,8 @@ void p4d_clear_huge(p4d_t *p4d) * pud_set_huge - setup kernel PUD mapping * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this - * function sets up a huge page only if any of the following conditions are met: - * - * - MTRRs are disabled, or - * - * - MTRRs are enabled and the range is completely covered by a single MTRR, or - * - * - MTRRs are enabled and the corresponding MTRR memory type is WB, which - * has no effect on the requested PAT memory type. + * function sets up a huge page only if the complete range has the same MTRR + * caching mode. * * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger * page mapping attempt fails. @@ -718,11 +712,10 @@ void p4d_clear_huge(p4d_t *p4d) */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { - u8 mtrr, uniform; + u8 uniform; - mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); - if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && - (mtrr != MTRR_TYPE_WRBACK)) + mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); + if (!uniform) return 0; /* Bail out if we are we on a populated non-leaf entry: */ @@ -745,11 +738,10 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { - u8 mtrr, uniform; + u8 uniform; - mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); - if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && - (mtrr != MTRR_TYPE_WRBACK)) { + mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); + if (!uniform) { pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", __func__, addr, addr + PMD_SIZE); return 0; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1056bbf55b17..438adb695daa 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2570,7 +2570,7 @@ out_image: } if (bpf_jit_enable > 1) - bpf_jit_dump(prog->len, proglen, pass + 1, image); + bpf_jit_dump(prog->len, proglen, pass + 1, rw_image); if (image) { if (!prog->is_func || extra_pass) { diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 584c25b588b4..87313701f069 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -83,7 +83,7 @@ static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value) *value |= 0x100; } -void sata_revid_init(struct sim_dev_reg *reg) +static void sata_revid_init(struct sim_dev_reg *reg) { reg->sim_reg.value = 0x01060100; reg->sim_reg.mask = 0; @@ -172,7 +172,7 @@ static inline void extract_bytes(u32 *value, int reg, int len) *value &= mask; } -int bridge_read(unsigned int devfn, int reg, int len, u32 *value) +static int bridge_read(unsigned int devfn, int reg, int len, u32 *value) { u32 av_bridge_base, av_bridge_limit; int retval = 0; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f3f2d87cce1b..e9f99c56f3ce 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -96,6 +96,9 @@ static const unsigned long * const efi_tables[] = { #ifdef CONFIG_EFI_COCO_SECRET &efi.coco_secret, #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + &efi.unaccepted, +#endif }; u64 efi_setup; /* efi setup_data physical address */ diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 232acf418cfb..77f7ac3668cb 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -853,9 +853,9 @@ efi_set_virtual_address_map(unsigned long memory_map_size, /* Disable interrupts around EFI calls: */ local_irq_save(flags); - status = efi_call(efi.runtime->set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); + status = arch_efi_call_virt(efi.runtime, set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); local_irq_restore(flags); efi_fpu_end(); diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index b0b848d6933a..f0cc00032751 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -114,6 +114,14 @@ void efi_delete_dummy_variable(void) EFI_VARIABLE_RUNTIME_ACCESS, 0, NULL); } +u64 efivar_reserved_space(void) +{ + if (efi_no_storage_paranoia) + return 0; + return EFI_MIN_RESERVE; +} +EXPORT_SYMBOL_GPL(efivar_reserved_space); + /* * In the nonblocking case we do not attempt to perform garbage * collection if we do not have enough free space. Rather, we do the diff --git a/arch/x86/platform/efi/runtime-map.c b/arch/x86/platform/efi/runtime-map.c index bbee682ef8cd..a6f02cef3ca2 100644 --- a/arch/x86/platform/efi/runtime-map.c +++ b/arch/x86/platform/efi/runtime-map.c @@ -93,7 +93,7 @@ static void map_release(struct kobject *kobj) kfree(entry); } -static struct kobj_type __refdata map_ktype = { +static const struct kobj_type __refconst map_ktype = { .sysfs_ops = &map_attr_ops, .default_groups = def_groups, .release = map_release, diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 75e3319e8bee..74ebd6882690 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -234,7 +234,7 @@ static int __init olpc_dt_compatible_match(phandle node, const char *compat) return 0; } -void __init olpc_dt_fixup(void) +static void __init olpc_dt_fixup(void) { phandle node; u32 board_rev; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 7a4d5e911415..63230ff8cf4f 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -351,43 +351,6 @@ static int bsp_pm_callback(struct notifier_block *nb, unsigned long action, case PM_HIBERNATION_PREPARE: ret = bsp_check(); break; -#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 - case PM_RESTORE_PREPARE: - /* - * When system resumes from hibernation, online CPU0 because - * 1. it's required for resume and - * 2. the CPU was online before hibernation - */ - if (!cpu_online(0)) - _debug_hotplug_cpu(0, 1); - break; - case PM_POST_RESTORE: - /* - * When a resume really happens, this code won't be called. - * - * This code is called only when user space hibernation software - * prepares for snapshot device during boot time. So we just - * call _debug_hotplug_cpu() to restore to CPU0's state prior to - * preparing the snapshot device. - * - * This works for normal boot case in our CPU0 hotplug debug - * mode, i.e. CPU0 is offline and user mode hibernation - * software initializes during boot time. - * - * If CPU0 is online and user application accesses snapshot - * device after boot time, this will offline CPU0 and user may - * see different CPU0 state before and after accessing - * the snapshot device. But hopefully this is not a case when - * user debugging CPU0 hotplug. Even if users hit this case, - * they can easily online CPU0 back. - * - * To simplify this debug code, we only consider normal boot - * case. Otherwise we need to remember CPU0's state and restore - * to that state and resolve racy conditions etc. - */ - _debug_hotplug_cpu(0, 0); - break; -#endif default: break; } diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 42abd6af1198..c2a29be35c01 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -12,7 +12,7 @@ $(obj)/string.o: $(srctree)/arch/x86/boot/compressed/string.c FORCE $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) -CFLAGS_sha256.o := -D__DISABLE_EXPORTS +CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY # When profile-guided optimization is enabled, llvm emits two different # overlapping text sections, which is not supported by kexec. Remove profile diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index af565816d2ba..788e5559549f 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -154,6 +154,9 @@ static void __init setup_real_mode(void) trampoline_header->flags = 0; + trampoline_lock = &trampoline_header->lock; + *trampoline_lock = 0; + trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); /* Map the real mode stub as virtual == physical */ diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index e38d61d6562e..c9f76fae902e 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -37,6 +37,24 @@ .text .code16 +.macro LOCK_AND_LOAD_REALMODE_ESP lock_pa=0 + /* + * Make sure only one CPU fiddles with the realmode stack + */ +.Llock_rm\@: + .if \lock_pa + lock btsl $0, pa_tr_lock + .else + lock btsl $0, tr_lock + .endif + jnc 2f + pause + jmp .Llock_rm\@ +2: + # Setup stack + movl $rm_stack_end, %esp +.endm + .balign PAGE_SIZE SYM_CODE_START(trampoline_start) cli # We should be safe anyway @@ -49,8 +67,7 @@ SYM_CODE_START(trampoline_start) mov %ax, %es mov %ax, %ss - # Setup stack - movl $rm_stack_end, %esp + LOCK_AND_LOAD_REALMODE_ESP call verify_cpu # Verify the cpu supports long mode testl %eax, %eax # Check for return code @@ -93,8 +110,7 @@ SYM_CODE_START(sev_es_trampoline_start) mov %ax, %es mov %ax, %ss - # Setup stack - movl $rm_stack_end, %esp + LOCK_AND_LOAD_REALMODE_ESP jmp .Lswitch_to_protected SYM_CODE_END(sev_es_trampoline_start) @@ -177,7 +193,7 @@ SYM_CODE_START(pa_trampoline_compat) * In compatibility mode. Prep ESP and DX for startup_32, then disable * paging and complete the switch to legacy 32-bit mode. */ - movl $rm_stack_end, %esp + LOCK_AND_LOAD_REALMODE_ESP lock_pa=1 movw $__KERNEL_DS, %dx movl $(CR0_STATE & ~X86_CR0_PG), %eax @@ -241,6 +257,7 @@ SYM_DATA_START(trampoline_header) SYM_DATA(tr_efer, .space 8) SYM_DATA(tr_cr4, .space 4) SYM_DATA(tr_flags, .space 4) + SYM_DATA(tr_lock, .space 4) SYM_DATA_END(trampoline_header) #include "trampoline_common.S" diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index f41a17ebac48..49a0452402e9 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/pci.h> #include <linux/vgaarb.h> +#include <asm/fb.h> void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off) { diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 7d7ffb9c826a..863d0d6b3edc 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -16,6 +16,8 @@ #include <asm/setup.h> #include <asm/xen/hypercall.h> +#include "xen-ops.h" + static efi_char16_t vendor[100] __initdata; static efi_system_table_t efi_systab_xen __initdata = { diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index c1cd28e915a3..a6820ca940bf 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -161,13 +161,12 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) int rc = 0; /* - * This can happen if CPU was offlined earlier and - * offlining timed out in common_cpu_die(). + * If a CPU was offlined earlier and offlining timed out then the + * lock mechanism is still initialized. Uninit it unconditionally + * as it's safe to call even if already uninited. Interrupts and + * timer have already been handled in xen_cpu_dead_hvm(). */ - if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - } + xen_uninit_lock_cpu(cpu); if (cpu_acpi_id(cpu) != U32_MAX) per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 093b78c8bbec..93b658248d01 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -68,6 +68,7 @@ #include <asm/reboot.h> #include <asm/hypervisor.h> #include <asm/mach_traps.h> +#include <asm/mtrr.h> #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/cpu.h> @@ -119,6 +120,54 @@ static int __init parse_xen_msr_safe(char *str) } early_param("xen_msr_safe", parse_xen_msr_safe); +/* Get MTRR settings from Xen and put them into mtrr_state. */ +static void __init xen_set_mtrr_data(void) +{ +#ifdef CONFIG_MTRR + struct xen_platform_op op = { + .cmd = XENPF_read_memtype, + .interface_version = XENPF_INTERFACE_VERSION, + }; + unsigned int reg; + unsigned long mask; + uint32_t eax, width; + static struct mtrr_var_range var[MTRR_MAX_VAR_RANGES] __initdata; + + /* Get physical address width (only 64-bit cpus supported). */ + width = 36; + eax = cpuid_eax(0x80000000); + if ((eax >> 16) == 0x8000 && eax >= 0x80000008) { + eax = cpuid_eax(0x80000008); + width = eax & 0xff; + } + + for (reg = 0; reg < MTRR_MAX_VAR_RANGES; reg++) { + op.u.read_memtype.reg = reg; + if (HYPERVISOR_platform_op(&op)) + break; + + /* + * Only called in dom0, which has all RAM PFNs mapped at + * RAM MFNs, and all PCI space etc. is identity mapped. + * This means we can treat MFN == PFN regarding MTRR settings. + */ + var[reg].base_lo = op.u.read_memtype.type; + var[reg].base_lo |= op.u.read_memtype.mfn << PAGE_SHIFT; + var[reg].base_hi = op.u.read_memtype.mfn >> (32 - PAGE_SHIFT); + mask = ~((op.u.read_memtype.nr_mfns << PAGE_SHIFT) - 1); + mask &= (1UL << width) - 1; + if (mask) + mask |= MTRR_PHYSMASK_V; + var[reg].mask_lo = mask; + var[reg].mask_hi = mask >> 32; + } + + /* Only overwrite MTRR state if any MTRR could be got from Xen. */ + if (reg) + mtrr_overwrite_state(var, reg, MTRR_TYPE_UNCACHABLE); +#endif +} + static void __init xen_pv_init_platform(void) { /* PV guests can't operate virtio devices without grants. */ @@ -135,6 +184,11 @@ static void __init xen_pv_init_platform(void) /* pvclock is in shared info area */ xen_init_time_ops(); + + if (xen_initial_domain()) + xen_set_mtrr_data(); + else + mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); } static void __init xen_pv_guest_late_init(void) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index b3b8d289b9ab..e0a975165de7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -86,6 +86,22 @@ #include "mmu.h" #include "debugfs.h" +/* + * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order + * to avoid warnings with "-Wmissing-prototypes". + */ +pteval_t xen_pte_val(pte_t pte); +pgdval_t xen_pgd_val(pgd_t pgd); +pmdval_t xen_pmd_val(pmd_t pmd); +pudval_t xen_pud_val(pud_t pud); +p4dval_t xen_p4d_val(p4d_t p4d); +pte_t xen_make_pte(pteval_t pte); +pgd_t xen_make_pgd(pgdval_t pgd); +pmd_t xen_make_pmd(pmdval_t pmd); +pud_t xen_make_pud(pudval_t pud); +p4d_t xen_make_p4d(p4dval_t p4d); +pte_t xen_make_pte_init(pteval_t pte); + #ifdef CONFIG_X86_VSYSCALL_EMULATION /* l3 pud for userspace vsyscall mapping */ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index c2be3efb2ba0..8b5cf7bb1f47 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -6,6 +6,7 @@ */ #include <linux/init.h> +#include <linux/iscsi_ibft.h> #include <linux/sched.h> #include <linux/kstrtox.h> #include <linux/mm.h> @@ -764,17 +765,26 @@ char * __init xen_memory_setup(void) BUG_ON(memmap.nr_entries == 0); xen_e820_table.nr_entries = memmap.nr_entries; - /* - * Xen won't allow a 1:1 mapping to be created to UNUSABLE - * regions, so if we're using the machine memory map leave the - * region as RAM as it is in the pseudo-physical map. - * - * UNUSABLE regions in domUs are not handled and will need - * a patch in the future. - */ - if (xen_initial_domain()) + if (xen_initial_domain()) { + /* + * Xen won't allow a 1:1 mapping to be created to UNUSABLE + * regions, so if we're using the machine memory map leave the + * region as RAM as it is in the pseudo-physical map. + * + * UNUSABLE regions in domUs are not handled and will need + * a patch in the future. + */ xen_ignore_unusable(); +#ifdef CONFIG_ISCSI_IBFT_FIND + /* Reserve 0.5 MiB to 1 MiB region so iBFT can be found */ + xen_e820_table.entries[xen_e820_table.nr_entries].addr = IBFT_START; + xen_e820_table.entries[xen_e820_table.nr_entries].size = IBFT_END - IBFT_START; + xen_e820_table.entries[xen_e820_table.nr_entries].type = E820_TYPE_RESERVED; + xen_e820_table.nr_entries++; +#endif + } + /* Make sure the Xen-supplied memory map is well-ordered. */ e820__update_table(&xen_e820_table); diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index 22fb982ff971..c20cbb14c82b 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -2,6 +2,10 @@ #ifndef _XEN_SMP_H #ifdef CONFIG_SMP + +void asm_cpu_bringup_and_idle(void); +asmlinkage void cpu_bringup_and_idle(void); + extern void xen_send_IPI_mask(const struct cpumask *mask, int vector); extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index b70afdff419c..ac95d1981cc0 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -55,18 +55,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) } #ifdef CONFIG_HOTPLUG_CPU -static void xen_hvm_cpu_die(unsigned int cpu) +static void xen_hvm_cleanup_dead_cpu(unsigned int cpu) { - if (common_cpu_die(cpu) == 0) { - if (xen_have_vector_callback) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); - } + if (xen_have_vector_callback) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); } } #else -static void xen_hvm_cpu_die(unsigned int cpu) +static void xen_hvm_cleanup_dead_cpu(unsigned int cpu) { BUG(); } @@ -77,7 +75,7 @@ void __init xen_hvm_smp_init(void) smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_cpus_done = xen_smp_cpus_done; - smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.cleanup_dead_cpu = xen_hvm_cleanup_dead_cpu; if (!xen_have_vector_callback) { #ifdef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index a9cf8c8fa074..cef78b8c89f4 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -55,14 +55,15 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); -void asm_cpu_bringup_and_idle(void); static void cpu_bringup(void) { int cpu; cr4_init(); + cpuhp_ap_sync_alive(); cpu_init(); + fpu__init_cpu(); touch_softlockup_watchdog(); /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ @@ -83,7 +84,7 @@ static void cpu_bringup(void) set_cpu_online(cpu, true); - cpu_set_state_online(cpu); /* Implies full memory barrier. */ + smp_mb(); /* We can take interrupts now: we're officially "up". */ local_irq_enable(); @@ -254,15 +255,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) struct desc_struct *gdt; unsigned long gdt_mfn; - /* used to tell cpu_init() that it can proceed with initialization */ - cpumask_set_cpu(cpu, cpu_callout_mask); if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (ctxt == NULL) { cpumask_clear_cpu(cpu, xen_cpu_initialized_map); - cpumask_clear_cpu(cpu, cpu_callout_mask); return -ENOMEM; } @@ -316,7 +314,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) +static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle) { int rc; @@ -326,14 +324,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) xen_setup_runstate_info(cpu); - /* - * PV VCPUs are always successfully taken down (see 'while' loop - * in xen_cpu_die()), so -EBUSY is an error. - */ - rc = cpu_check_up_prepare(cpu); - if (rc) - return rc; - /* make sure interrupts start blocked */ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; @@ -343,15 +333,20 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) xen_pmu_init(cpu); - rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); - BUG_ON(rc); - - while (cpu_report_state(cpu) != CPU_ONLINE) - HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + /* + * Why is this a BUG? If the hypercall fails then everything can be + * rolled back, no? + */ + BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)); return 0; } +static void xen_pv_poll_sync_state(void) +{ + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); +} + #ifdef CONFIG_HOTPLUG_CPU static int xen_pv_cpu_disable(void) { @@ -367,18 +362,18 @@ static int xen_pv_cpu_disable(void) static void xen_pv_cpu_die(unsigned int cpu) { - while (HYPERVISOR_vcpu_op(VCPUOP_is_up, - xen_vcpu_nr(cpu), NULL)) { + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), NULL)) { __set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(HZ/10); } +} - if (common_cpu_die(cpu) == 0) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); - xen_pmu_finish(cpu); - } +static void xen_pv_cleanup_dead_cpu(unsigned int cpu) +{ + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + xen_pmu_finish(cpu); } static void __noreturn xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ @@ -400,6 +395,11 @@ static void xen_pv_cpu_die(unsigned int cpu) BUG(); } +static void xen_pv_cleanup_dead_cpu(unsigned int cpu) +{ + BUG(); +} + static void __noreturn xen_pv_play_dead(void) { BUG(); @@ -438,8 +438,10 @@ static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_cpus = xen_pv_smp_prepare_cpus, .smp_cpus_done = xen_smp_cpus_done, - .cpu_up = xen_pv_cpu_up, + .kick_ap_alive = xen_pv_kick_ap, .cpu_die = xen_pv_cpu_die, + .cleanup_dead_cpu = xen_pv_cleanup_dead_cpu, + .poll_sync_state = xen_pv_poll_sync_state, .cpu_disable = xen_pv_cpu_disable, .play_dead = xen_pv_play_dead, diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b74ac2562cfb..52fa5609b7f6 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -66,11 +66,10 @@ static noinstr u64 xen_sched_clock(void) struct pvclock_vcpu_time_info *src; u64 ret; - preempt_disable_notrace(); src = &__this_cpu_read(xen_vcpu)->time; ret = pvclock_clocksource_read_nowd(src); ret -= xen_sched_clock_offset; - preempt_enable_notrace(); + return ret; } diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 643d02900fbb..a0ea285878db 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -90,30 +90,35 @@ SYM_CODE_END(xen_cpu_bringup_again) ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") -#ifdef CONFIG_X86_32 - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) -#else +#ifdef CONFIG_XEN_PV ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) /* Map the p2m table to a 512GB-aligned user address. */ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD)) -#endif -#ifdef CONFIG_XEN_PV ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) -#endif - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, - .ascii "!writable_page_tables|pae_pgdir_above_4gb") - ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, - .long (1 << XENFEAT_writable_page_tables) | \ - (1 << XENFEAT_dom0) | \ - (1 << XENFEAT_linux_rsdp_unrestricted)) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") - ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) - ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1) - ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) +# define FEATURES_PV (1 << XENFEAT_writable_page_tables) +#else +# define FEATURES_PV 0 +#endif +#ifdef CONFIG_XEN_PVH +# define FEATURES_PVH (1 << XENFEAT_linux_rsdp_unrestricted) +#else +# define FEATURES_PVH 0 +#endif +#ifdef CONFIG_XEN_DOM0 +# define FEATURES_DOM0 (1 << XENFEAT_dom0) +#else +# define FEATURES_DOM0 0 +#endif + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, + .long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0) + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) #endif /*CONFIG_XEN */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a10903785a33..408a2aa66c69 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -72,8 +72,6 @@ void xen_restore_time_memory_area(void); void xen_init_time_ops(void); void xen_hvm_init_time_ops(void); -irqreturn_t xen_debug_interrupt(int irq, void *dev_id); - bool xen_vcpu_stolen(int vcpu); void xen_vcpu_setup(int cpu); @@ -148,9 +146,12 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), void xen_pin_vcpu(int cpu); void xen_emergency_restart(void); +void xen_force_evtchn_callback(void); + #ifdef CONFIG_XEN_PV void xen_pv_pre_suspend(void); void xen_pv_post_suspend(int suspend_cancelled); +void xen_start_kernel(struct start_info *si); #else static inline void xen_pv_pre_suspend(void) {} static inline void xen_pv_post_suspend(int suspend_cancelled) {} |