summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig6
-rw-r--r--arch/x86/Kconfig.assembler5
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/boot/compressed/head_32.S2
-rw-r--r--arch/x86/boot/compressed/head_64.S2
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c6
-rw-r--r--arch/x86/boot/compressed/misc.c18
-rw-r--r--arch/x86/boot/compressed/misc.h2
-rw-r--r--arch/x86/boot/compressed/sev.c70
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S1
-rw-r--r--arch/x86/coco/tdx/tdcall.S96
-rw-r--r--arch/x86/coco/tdx/tdx.c87
-rw-r--r--arch/x86/crypto/Kconfig38
-rw-r--r--arch/x86/crypto/Makefile6
-rw-r--r--arch/x86/crypto/aria-aesni-avx-asm_64.S172
-rw-r--r--arch/x86/crypto/aria-aesni-avx2-asm_64.S1441
-rw-r--r--arch/x86/crypto/aria-avx.h48
-rw-r--r--arch/x86/crypto/aria-gfni-avx512-asm_64.S971
-rw-r--r--arch/x86/crypto/aria_aesni_avx2_glue.c254
-rw-r--r--arch/x86/crypto/aria_aesni_avx_glue.c49
-rw-r--r--arch/x86/crypto/aria_gfni_avx512_glue.c250
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S71
-rw-r--r--arch/x86/crypto/blowfish_glue.c200
-rw-r--r--arch/x86/crypto/ecb_cbc_helpers.h19
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S6
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c45
-rw-r--r--arch/x86/entry/entry_64.S2
-rw-r--r--arch/x86/entry/vdso/Makefile3
-rw-r--r--arch/x86/entry/vdso/vdso2c.h6
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c9
-rw-r--r--arch/x86/entry/vdso/vdso32/fake_32bit_build.h25
-rw-r--r--arch/x86/entry/vdso/vdso32/vclock_gettime.c27
-rw-r--r--arch/x86/entry/vdso/vdso32/vdso32.lds.S1
-rw-r--r--arch/x86/entry/vdso/vdso32/vgetcpu.c3
-rw-r--r--arch/x86/entry/vdso/vgetcpu.c3
-rw-r--r--arch/x86/entry/vdso/vma.c23
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c2
-rw-r--r--arch/x86/events/amd/brs.c13
-rw-r--r--arch/x86/events/amd/core.c6
-rw-r--r--arch/x86/events/amd/ibs.c9
-rw-r--r--arch/x86/events/core.c12
-rw-r--r--arch/x86/events/intel/core.c201
-rw-r--r--arch/x86/events/intel/cstate.c1
-rw-r--r--arch/x86/events/intel/ds.c137
-rw-r--r--arch/x86/events/intel/lbr.c4
-rw-r--r--arch/x86/events/intel/uncore.c41
-rw-r--r--arch/x86/events/intel/uncore.h5
-rw-r--r--arch/x86/events/intel/uncore_discovery.c60
-rw-r--r--arch/x86/events/intel/uncore_discovery.h14
-rw-r--r--arch/x86/events/intel/uncore_snb.c161
-rw-r--r--arch/x86/events/intel/uncore_snbep.c158
-rw-r--r--arch/x86/events/perf_event.h23
-rw-r--r--arch/x86/events/zhaoxin/core.c8
-rw-r--r--arch/x86/include/asm/acpi.h8
-rw-r--r--arch/x86/include/asm/agp.h6
-rw-r--r--arch/x86/include/asm/alternative.h132
-rw-r--r--arch/x86/include/asm/asm-prototypes.h1
-rw-r--r--arch/x86/include/asm/atomic64_32.h44
-rw-r--r--arch/x86/include/asm/atomic64_64.h36
-rw-r--r--arch/x86/include/asm/checksum_64.h1
-rw-r--r--arch/x86/include/asm/cpufeature.h7
-rw-r--r--arch/x86/include/asm/cpufeatures.h19
-rw-r--r--arch/x86/include/asm/debugreg.h35
-rw-r--r--arch/x86/include/asm/disabled-features.h3
-rw-r--r--arch/x86/include/asm/dma-mapping.h2
-rw-r--r--arch/x86/include/asm/efi.h14
-rw-r--r--arch/x86/include/asm/fpu/sched.h2
-rw-r--r--arch/x86/include/asm/fpu/types.h2
-rw-r--r--arch/x86/include/asm/fpu/xcr.h4
-rw-r--r--arch/x86/include/asm/gsseg.h66
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h20
-rw-r--r--arch/x86/include/asm/ibt.h4
-rw-r--r--arch/x86/include/asm/idtentry.h16
-rw-r--r--arch/x86/include/asm/intel-family.h2
-rw-r--r--arch/x86/include/asm/irqflags.h11
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h109
-rw-r--r--arch/x86/include/asm/kvmclock.h2
-rw-r--r--arch/x86/include/asm/mce.h3
-rw-r--r--arch/x86/include/asm/microcode.h4
-rw-r--r--arch/x86/include/asm/microcode_amd.h4
-rw-r--r--arch/x86/include/asm/mmu_context.h1
-rw-r--r--arch/x86/include/asm/mshyperv.h78
-rw-r--r--arch/x86/include/asm/msr-index.h31
-rw-r--r--arch/x86/include/asm/mwait.h14
-rw-r--r--arch/x86/include/asm/nospec-branch.h2
-rw-r--r--arch/x86/include/asm/page.h5
-rw-r--r--arch/x86/include/asm/page_32.h4
-rw-r--r--arch/x86/include/asm/page_64.h4
-rw-r--r--arch/x86/include/asm/paravirt.h8
-rw-r--r--arch/x86/include/asm/perf_event.h10
-rw-r--r--arch/x86/include/asm/pgtable-2level.h26
-rw-r--r--arch/x86/include/asm/pgtable-3level.h26
-rw-r--r--arch/x86/include/asm/pgtable.h27
-rw-r--r--arch/x86/include/asm/processor.h4
-rw-r--r--arch/x86/include/asm/pvclock.h3
-rw-r--r--arch/x86/include/asm/reboot.h2
-rw-r--r--arch/x86/include/asm/required-features.h3
-rw-r--r--arch/x86/include/asm/segment.h8
-rw-r--r--arch/x86/include/asm/shared/io.h4
-rw-r--r--arch/x86/include/asm/shared/tdx.h7
-rw-r--r--arch/x86/include/asm/special_insns.h29
-rw-r--r--arch/x86/include/asm/text-patching.h31
-rw-r--r--arch/x86/include/asm/thread_info.h5
-rw-r--r--arch/x86/include/asm/time.h1
-rw-r--r--arch/x86/include/asm/vdso.h2
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h2
-rw-r--r--arch/x86/include/asm/vdso/processor.h4
-rw-r--r--arch/x86/include/asm/virtext.h16
-rw-r--r--arch/x86/include/asm/xen/hypercall.h2
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h4
-rw-r--r--arch/x86/include/uapi/asm/kvm.h34
-rw-r--r--arch/x86/include/uapi/asm/svm.h6
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c35
-rw-r--r--arch/x86/kernel/alternative.c76
-rw-r--r--arch/x86/kernel/apic/io_apic.c7
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/asm-offsets.c14
-rw-r--r--arch/x86/kernel/cpu/amd.c49
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c9
-rw-r--r--arch/x86/kernel/cpu/bugs.c33
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c5
-rw-r--r--arch/x86/kernel/cpu/common.c91
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c2
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c12
-rw-r--r--arch/x86/kernel/cpu/mce/core.c32
-rw-r--r--arch/x86/kernel/cpu/mce/dev-mcelog.c3
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h44
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c76
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c45
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c44
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c74
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c54
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c13
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h28
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c30
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c307
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c2
-rw-r--r--arch/x86/kernel/cpu/tsx.c1
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/crash.c17
-rw-r--r--arch/x86/kernel/e820.c6
-rw-r--r--arch/x86/kernel/fpu/context.h2
-rw-r--r--arch/x86/kernel/fpu/core.c6
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/hw_breakpoint.c4
-rw-r--r--arch/x86/kernel/i8259.c1
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kernel/kprobes/core.c74
-rw-r--r--arch/x86/kernel/kprobes/opt.c6
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/module.c97
-rw-r--r--arch/x86/kernel/nmi.c116
-rw-r--r--arch/x86/kernel/paravirt.c15
-rw-r--r--arch/x86/kernel/process.c66
-rw-r--r--arch/x86/kernel/pvclock.c22
-rw-r--r--arch/x86/kernel/reboot.c88
-rw-r--r--arch/x86/kernel/rtc.c9
-rw-r--r--arch/x86/kernel/setup.c10
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/signal_32.c128
-rw-r--r--arch/x86/kernel/signal_64.c127
-rw-r--r--arch/x86/kernel/signal_compat.c191
-rw-r--r--arch/x86/kernel/smp.c6
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/kernel/static_call.c50
-rw-r--r--arch/x86/kernel/tls.c1
-rw-r--r--arch/x86/kernel/tsc.c75
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c84
-rw-r--r--arch/x86/kvm/debugfs.c2
-rw-r--r--arch/x86/kvm/emulate.c26
-rw-r--r--arch/x86/kvm/hyperv.c85
-rw-r--r--arch/x86/kvm/hyperv.h27
-rw-r--r--arch/x86/kvm/i8254.c4
-rw-r--r--arch/x86/kvm/i8259.c4
-rw-r--r--arch/x86/kvm/ioapic.c1
-rw-r--r--arch/x86/kvm/irq.c1
-rw-r--r--arch/x86/kvm/irq_comm.c7
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h12
-rw-r--r--arch/x86/kvm/kvm_emulate.h7
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c1
-rw-r--r--arch/x86/kvm/lapic.c404
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.h6
-rw-r--r--arch/x86/kvm/mmu/mmu.c320
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h22
-rw-r--r--arch/x86/kvm/mmu/page_track.c1
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h25
-rw-r--r--arch/x86/kvm/mmu/spte.c10
-rw-r--r--arch/x86/kvm/mmu/spte.h20
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c12
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c20
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h25
-rw-r--r--arch/x86/kvm/mtrr.c1
-rw-r--r--arch/x86/kvm/pmu.c290
-rw-r--r--arch/x86/kvm/pmu.h39
-rw-r--r--arch/x86/kvm/reverse_cpuid.h8
-rw-r--r--arch/x86/kvm/smm.c3
-rw-r--r--arch/x86/kvm/svm/avic.c374
-rw-r--r--arch/x86/kvm/svm/nested.c5
-rw-r--r--arch/x86/kvm/svm/pmu.c4
-rw-r--r--arch/x86/kvm/svm/sev.c7
-rw-r--r--arch/x86/kvm/svm/svm.c134
-rw-r--r--arch/x86/kvm/svm/svm.h58
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.c1
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h8
-rw-r--r--arch/x86/kvm/vmx/capabilities.h4
-rw-r--r--arch/x86/kvm/vmx/hyperv.c87
-rw-r--r--arch/x86/kvm/vmx/hyperv.h128
-rw-r--r--arch/x86/kvm/vmx/nested.c24
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c28
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c2
-rw-r--r--arch/x86/kvm/vmx/sgx.c5
-rw-r--r--arch/x86/kvm/vmx/vmcs.h4
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c1
-rw-r--r--arch/x86/kvm/vmx/vmenter.S80
-rw-r--r--arch/x86/kvm/vmx/vmx.c645
-rw-r--r--arch/x86/kvm/vmx/vmx.h18
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h6
-rw-r--r--arch/x86/kvm/x86.c606
-rw-r--r--arch/x86/kvm/x86.h18
-rw-r--r--arch/x86/kvm/xen.c27
-rw-r--r--arch/x86/kvm/xen.h7
-rw-r--r--arch/x86/lib/cmdline.c4
-rw-r--r--arch/x86/lib/memcpy_64.S5
-rw-r--r--arch/x86/lib/memmove_64.S4
-rw-r--r--arch/x86/lib/memset_64.S4
-rw-r--r--arch/x86/lib/misc.c2
-rw-r--r--arch/x86/lib/x86-opcode-map.txt1
-rw-r--r--arch/x86/mm/fault.c23
-rw-r--r--arch/x86/mm/pat/memtype.c21
-rw-r--r--arch/x86/mm/tlb.c2
-rw-r--r--arch/x86/net/bpf_jit_comp.c171
-rw-r--r--arch/x86/pci/xen.c2
-rw-r--r--arch/x86/platform/efi/efi.c2
-rw-r--r--arch/x86/platform/efi/efi_64.c8
-rw-r--r--arch/x86/platform/uv/uv_irq.c7
-rw-r--r--arch/x86/um/mem_32.c2
-rw-r--r--arch/x86/xen/enlighten_pv.c3
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/setup.c4
-rw-r--r--arch/x86/xen/smp.h2
-rw-r--r--arch/x86/xen/smp_pv.c17
-rw-r--r--arch/x86/xen/time.c50
-rw-r--r--arch/x86/xen/xen-head.S7
251 files changed, 8946 insertions, 3175 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3604074a878b..a825bf031f49 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1502,7 +1502,7 @@ config X86_5LEVEL
depends on X86_64
help
5-level paging enables access to larger address space:
- upto 128 PiB of virtual address space and 4 PiB of
+ up to 128 PiB of virtual address space and 4 PiB of
physical address space.
It will be supported by future Intel CPUs.
@@ -2609,8 +2609,8 @@ config CALL_THUNKS_DEBUG
a noisy dmesg about callthunks generation and call patching for
trouble shooting. The debug prints need to be enabled on the
kernel command line with 'debug-callthunks'.
- Only enable this, when you are debugging call thunks as this
- creates a noticable runtime overhead. If unsure say N.
+ Only enable this when you are debugging call thunks as this
+ creates a noticeable runtime overhead. If unsure say N.
config CPU_IBPB_ENTRY
bool "Enable IBPB on kernel entry"
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index 26b8c08e2fc4..b88f784cb02e 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -19,3 +19,8 @@ config AS_TPAUSE
def_bool $(as-instr,tpause %ecx)
help
Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7
+
+config AS_GFNI
+ def_bool $(as-instr,vgf2p8mulb %xmm0$(comma)%xmm1$(comma)%xmm2)
+ help
+ Supported by binutils >= 2.30 and LLVM integrated assembler
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 9cf07322875a..b39975977c03 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -3,10 +3,10 @@
# select defconfig based on actual architecture
ifeq ($(ARCH),x86)
- ifeq ($(shell uname -m),x86_64)
- KBUILD_DEFCONFIG := x86_64_defconfig
- else
+ ifeq ($(shell uname -m | sed -e 's/i.86/i386/'),i386)
KBUILD_DEFCONFIG := i386_defconfig
+ else
+ KBUILD_DEFCONFIG := x86_64_defconfig
endif
else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
@@ -14,13 +14,13 @@ endif
ifdef CONFIG_CC_IS_GCC
RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
-RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register)
endif
ifdef CONFIG_CC_IS_CLANG
RETPOLINE_CFLAGS := -mretpoline-external-thunk
RETPOLINE_VDSO_CFLAGS := -mretpoline
endif
+RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
ifdef CONFIG_RETHUNK
RETHUNK_CFLAGS := -mfunction-return=thunk-extern
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 6589ddd4cfaf..987ae727cf9f 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -187,7 +187,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
leal boot_heap@GOTOFF(%ebx), %eax
pushl %eax /* heap area */
pushl %esi /* real mode pointer */
- call extract_kernel /* returns kernel location in %eax */
+ call extract_kernel /* returns kernel entry point in %eax */
addl $24, %esp
/*
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index a75712991df3..03c4328a88cb 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -569,7 +569,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
movl input_len(%rip), %ecx /* input_len */
movq %rbp, %r8 /* output target address */
movl output_len(%rip), %r9d /* decompressed length, end of relocs */
- call extract_kernel /* returns kernel location in %rax */
+ call extract_kernel /* returns kernel entry point in %rax */
popq %rsi
/*
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index d4a314cc50d6..321a5011042d 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -180,6 +180,12 @@ void initialize_identity_maps(void *rmode)
/* Load the new page-table. */
write_cr3(top_level_pgt);
+
+ /*
+ * Now that the required page table mappings are established and a
+ * GHCB can be used, check for SNP guest/HV feature compatibility.
+ */
+ snp_check_features();
}
static pte_t *split_large_pmd(struct x86_mapping_info *info,
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index cf690d8712f4..014ff222bf4b 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -277,7 +277,7 @@ static inline void handle_relocations(void *output, unsigned long output_len,
{ }
#endif
-static void parse_elf(void *output)
+static size_t parse_elf(void *output)
{
#ifdef CONFIG_X86_64
Elf64_Ehdr ehdr;
@@ -293,10 +293,8 @@ static void parse_elf(void *output)
if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
- ehdr.e_ident[EI_MAG3] != ELFMAG3) {
+ ehdr.e_ident[EI_MAG3] != ELFMAG3)
error("Kernel is not a valid ELF file");
- return;
- }
debug_putstr("Parsing ELF... ");
@@ -328,6 +326,8 @@ static void parse_elf(void *output)
}
free(phdrs);
+
+ return ehdr.e_entry - LOAD_PHYSICAL_ADDR;
}
/*
@@ -356,6 +356,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
const unsigned long kernel_total_size = VO__end - VO__text;
unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
unsigned long needed_size;
+ size_t entry_offset;
/* Retain x86 boot parameters pointer passed from startup_32/64. */
boot_params = rmode;
@@ -456,14 +457,17 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
debug_putstr("\nDecompressing Linux... ");
__decompress(input_data, input_len, NULL, NULL, output, output_len,
NULL, error);
- parse_elf(output);
+ entry_offset = parse_elf(output);
handle_relocations(output, output_len, virt_addr);
- debug_putstr("done.\nBooting the kernel.\n");
+
+ debug_putstr("done.\nBooting the kernel (entry_offset: 0x");
+ debug_puthex(entry_offset);
+ debug_putstr(").\n");
/* Disable exception handling before booting the kernel */
cleanup_exception_handling();
- return output;
+ return output + entry_offset;
}
void fortify_panic(const char *name)
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 62208ec04ca4..20118fb7c53b 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -126,6 +126,7 @@ static inline void console_init(void)
#ifdef CONFIG_AMD_MEM_ENCRYPT
void sev_enable(struct boot_params *bp);
+void snp_check_features(void);
void sev_es_shutdown_ghcb(void);
extern bool sev_es_check_ghcb_fault(unsigned long address);
void snp_set_page_private(unsigned long paddr);
@@ -143,6 +144,7 @@ static inline void sev_enable(struct boot_params *bp)
if (bp)
bp->cc_blob_address = 0;
}
+static inline void snp_check_features(void) { }
static inline void sev_es_shutdown_ghcb(void) { }
static inline bool sev_es_check_ghcb_fault(unsigned long address)
{
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index c93930d5ccbd..d63ad8f99f83 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -208,6 +208,23 @@ void sev_es_shutdown_ghcb(void)
error("Can't unmap GHCB page");
}
+static void __noreturn sev_es_ghcb_terminate(struct ghcb *ghcb, unsigned int set,
+ unsigned int reason, u64 exit_info_2)
+{
+ u64 exit_info_1 = SVM_VMGEXIT_TERM_REASON(set, reason);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_TERM_REQUEST);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
bool sev_es_check_ghcb_fault(unsigned long address)
{
/* Check whether the fault was on the GHCB page */
@@ -270,6 +287,59 @@ static void enforce_vmpl0(void)
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
}
+/*
+ * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need
+ * guest side implementation for proper functioning of the guest. If any
+ * of these features are enabled in the hypervisor but are lacking guest
+ * side implementation, the behavior of the guest will be undefined. The
+ * guest could fail in non-obvious way making it difficult to debug.
+ *
+ * As the behavior of reserved feature bits is unknown to be on the
+ * safe side add them to the required features mask.
+ */
+#define SNP_FEATURES_IMPL_REQ (MSR_AMD64_SNP_VTOM | \
+ MSR_AMD64_SNP_REFLECT_VC | \
+ MSR_AMD64_SNP_RESTRICTED_INJ | \
+ MSR_AMD64_SNP_ALT_INJ | \
+ MSR_AMD64_SNP_DEBUG_SWAP | \
+ MSR_AMD64_SNP_VMPL_SSS | \
+ MSR_AMD64_SNP_SECURE_TSC | \
+ MSR_AMD64_SNP_VMGEXIT_PARAM | \
+ MSR_AMD64_SNP_VMSA_REG_PROTECTION | \
+ MSR_AMD64_SNP_RESERVED_BIT13 | \
+ MSR_AMD64_SNP_RESERVED_BIT15 | \
+ MSR_AMD64_SNP_RESERVED_MASK)
+
+/*
+ * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented
+ * by the guest kernel. As and when a new feature is implemented in the
+ * guest kernel, a corresponding bit should be added to the mask.
+ */
+#define SNP_FEATURES_PRESENT (0)
+
+void snp_check_features(void)
+{
+ u64 unsupported;
+
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ return;
+
+ /*
+ * Terminate the boot if hypervisor has enabled any feature lacking
+ * guest side implementation. Pass on the unsupported features mask through
+ * EXIT_INFO_2 of the GHCB protocol so that those features can be reported
+ * as part of the guest boot failure.
+ */
+ unsupported = sev_status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
+ if (unsupported) {
+ if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb()))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ sev_es_ghcb_terminate(boot_ghcb, SEV_TERM_SET_GEN,
+ GHCB_SNP_UNSUPPORTED, unsupported);
+ }
+}
+
void sev_enable(struct boot_params *bp)
{
unsigned int eax, ebx, ecx, edx;
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 112b2375d021..b22f34b8684a 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -34,6 +34,7 @@ SECTIONS
_text = .; /* Text */
*(.text)
*(.text.*)
+ *(.noinstr.text)
_etext = . ;
}
.rodata : {
diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S
index f9eb1134f22d..6a255e6809bc 100644
--- a/arch/x86/coco/tdx/tdcall.S
+++ b/arch/x86/coco/tdx/tdcall.S
@@ -13,6 +13,12 @@
/*
* Bitmasks of exposed registers (with VMM).
*/
+#define TDX_RDX BIT(2)
+#define TDX_RBX BIT(3)
+#define TDX_RSI BIT(6)
+#define TDX_RDI BIT(7)
+#define TDX_R8 BIT(8)
+#define TDX_R9 BIT(9)
#define TDX_R10 BIT(10)
#define TDX_R11 BIT(11)
#define TDX_R12 BIT(12)
@@ -27,9 +33,11 @@
* details can be found in TDX GHCI specification, section
* titled "TDCALL [TDG.VP.VMCALL] leaf".
*/
-#define TDVMCALL_EXPOSE_REGS_MASK ( TDX_R10 | TDX_R11 | \
- TDX_R12 | TDX_R13 | \
- TDX_R14 | TDX_R15 )
+#define TDVMCALL_EXPOSE_REGS_MASK \
+ ( TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \
+ TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15 )
+
+.section .noinstr.text, "ax"
/*
* __tdx_module_call() - Used by TDX guests to request services from
@@ -124,38 +132,38 @@ SYM_FUNC_START(__tdx_hypercall)
push %r14
push %r13
push %r12
+ push %rbx
+
+ /* Free RDI and RSI to be used as TDVMCALL arguments */
+ movq %rdi, %rax
+ push %rsi
+
+ /* Copy hypercall registers from arg struct: */
+ movq TDX_HYPERCALL_r8(%rax), %r8
+ movq TDX_HYPERCALL_r9(%rax), %r9
+ movq TDX_HYPERCALL_r10(%rax), %r10
+ movq TDX_HYPERCALL_r11(%rax), %r11
+ movq TDX_HYPERCALL_r12(%rax), %r12
+ movq TDX_HYPERCALL_r13(%rax), %r13
+ movq TDX_HYPERCALL_r14(%rax), %r14
+ movq TDX_HYPERCALL_r15(%rax), %r15
+ movq TDX_HYPERCALL_rdi(%rax), %rdi
+ movq TDX_HYPERCALL_rsi(%rax), %rsi
+ movq TDX_HYPERCALL_rbx(%rax), %rbx
+ movq TDX_HYPERCALL_rdx(%rax), %rdx
+
+ push %rax
/* Mangle function call ABI into TDCALL ABI: */
/* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */
xor %eax, %eax
- /* Copy hypercall registers from arg struct: */
- movq TDX_HYPERCALL_r10(%rdi), %r10
- movq TDX_HYPERCALL_r11(%rdi), %r11
- movq TDX_HYPERCALL_r12(%rdi), %r12
- movq TDX_HYPERCALL_r13(%rdi), %r13
- movq TDX_HYPERCALL_r14(%rdi), %r14
- movq TDX_HYPERCALL_r15(%rdi), %r15
-
movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx
- /*
- * For the idle loop STI needs to be called directly before the TDCALL
- * that enters idle (EXIT_REASON_HLT case). STI instruction enables
- * interrupts only one instruction later. If there is a window between
- * STI and the instruction that emulates the HALT state, there is a
- * chance for interrupts to happen in this window, which can delay the
- * HLT operation indefinitely. Since this is the not the desired
- * result, conditionally call STI before TDCALL.
- */
- testq $TDX_HCALL_ISSUE_STI, %rsi
- jz .Lskip_sti
- sti
-.Lskip_sti:
tdcall
/*
- * RAX==0 indicates a failure of the TDVMCALL mechanism itself and that
+ * RAX!=0 indicates a failure of the TDVMCALL mechanism itself and that
* something has gone horribly wrong with the TDX module.
*
* The return status of the hypercall operation is in a separate
@@ -165,30 +173,46 @@ SYM_FUNC_START(__tdx_hypercall)
testq %rax, %rax
jne .Lpanic
- /* TDVMCALL leaf return code is in R10 */
- movq %r10, %rax
+ pop %rax
/* Copy hypercall result registers to arg struct if needed */
- testq $TDX_HCALL_HAS_OUTPUT, %rsi
+ testq $TDX_HCALL_HAS_OUTPUT, (%rsp)
jz .Lout
- movq %r10, TDX_HYPERCALL_r10(%rdi)
- movq %r11, TDX_HYPERCALL_r11(%rdi)
- movq %r12, TDX_HYPERCALL_r12(%rdi)
- movq %r13, TDX_HYPERCALL_r13(%rdi)
- movq %r14, TDX_HYPERCALL_r14(%rdi)
- movq %r15, TDX_HYPERCALL_r15(%rdi)
+ movq %r8, TDX_HYPERCALL_r8(%rax)
+ movq %r9, TDX_HYPERCALL_r9(%rax)
+ movq %r10, TDX_HYPERCALL_r10(%rax)
+ movq %r11, TDX_HYPERCALL_r11(%rax)
+ movq %r12, TDX_HYPERCALL_r12(%rax)
+ movq %r13, TDX_HYPERCALL_r13(%rax)
+ movq %r14, TDX_HYPERCALL_r14(%rax)
+ movq %r15, TDX_HYPERCALL_r15(%rax)
+ movq %rdi, TDX_HYPERCALL_rdi(%rax)
+ movq %rsi, TDX_HYPERCALL_rsi(%rax)
+ movq %rbx, TDX_HYPERCALL_rbx(%rax)
+ movq %rdx, TDX_HYPERCALL_rdx(%rax)
.Lout:
+ /* TDVMCALL leaf return code is in R10 */
+ movq %r10, %rax
+
/*
* Zero out registers exposed to the VMM to avoid speculative execution
* with VMM-controlled values. This needs to include all registers
- * present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15
- * context will be restored.
+ * present in TDVMCALL_EXPOSE_REGS_MASK, except RBX, and R12-R15 which
+ * will be restored.
*/
+ xor %r8d, %r8d
+ xor %r9d, %r9d
xor %r10d, %r10d
xor %r11d, %r11d
+ xor %rdi, %rdi
+ xor %rdx, %rdx
+
+ /* Remove TDX_HCALL_* flags from the stack */
+ pop %rsi
/* Restore callee-saved GPRs as mandated by the x86_64 ABI */
+ pop %rbx
pop %r12
pop %r13
pop %r14
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 669d9e4f2901..055300e08fb3 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -19,9 +19,14 @@
#define TDX_GET_VEINFO 3
#define TDX_GET_REPORT 4
#define TDX_ACCEPT_PAGE 6
+#define TDX_WR 8
+
+/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
+#define TDCS_NOTIFY_ENABLES 0x9100000000000010
/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001
+#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
/* MMIO direction */
#define EPT_READ 0
@@ -37,6 +42,7 @@
#define VE_GET_PORT_NUM(e) ((e) >> 16)
#define VE_IS_IO_STRING(e) ((e) & BIT(4))
+#define ATTR_DEBUG BIT(0)
#define ATTR_SEPT_VE_DISABLE BIT(28)
/* TDX Module call error codes */
@@ -64,8 +70,9 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
}
/* Called from __tdx_hypercall() for unrecoverable failure */
-void __tdx_hypercall_failed(void)
+noinstr void __tdx_hypercall_failed(void)
{
+ instrumentation_begin();
panic("TDVMCALL failed. TDX module bug?");
}
@@ -75,7 +82,7 @@ void __tdx_hypercall_failed(void)
* Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
* guest sides of these calls.
*/
-static u64 hcall_func(u64 exit_reason)
+static __always_inline u64 hcall_func(u64 exit_reason)
{
return exit_reason;
}
@@ -140,6 +147,41 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
}
EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
+static void __noreturn tdx_panic(const char *msg)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = TDVMCALL_REPORT_FATAL_ERROR,
+ .r12 = 0, /* Error code: 0 is Panic */
+ };
+ union {
+ /* Define register order according to the GHCI */
+ struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
+
+ char str[64];
+ } message;
+
+ /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
+ strncpy(message.str, msg, 64);
+
+ args.r8 = message.r8;
+ args.r9 = message.r9;
+ args.r14 = message.r14;
+ args.r15 = message.r15;
+ args.rdi = message.rdi;
+ args.rsi = message.rsi;
+ args.rbx = message.rbx;
+ args.rdx = message.rdx;
+
+ /*
+ * This hypercall should never return and it is not safe
+ * to keep the guest running. Call it forever if it
+ * happens to return.
+ */
+ while (1)
+ __tdx_hypercall(&args, 0);
+}
+
static void tdx_parse_tdinfo(u64 *cc_mask)
{
struct tdx_module_output out;
@@ -171,8 +213,15 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
* TD-private memory. Only VMM-shared memory (MMIO) will #VE.
*/
td_attr = out.rdx;
- if (!(td_attr & ATTR_SEPT_VE_DISABLE))
- panic("TD misconfiguration: SEPT_VE_DISABLE attibute must be set.\n");
+ if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
+ const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
+
+ /* Relax SEPT_VE_DISABLE check for debug TD. */
+ if (td_attr & ATTR_DEBUG)
+ pr_warn("%s\n", msg);
+ else
+ tdx_panic(msg);
+ }
}
/*
@@ -220,7 +269,7 @@ static int ve_instr_len(struct ve_info *ve)
}
}
-static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
+static u64 __cpuidle __halt(const bool irq_disabled)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
@@ -240,20 +289,14 @@ static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
* can keep the vCPU in virtual HLT, even if an IRQ is
* pending, without hanging/breaking the guest.
*/
- return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
+ return __tdx_hypercall(&args, 0);
}
static int handle_halt(struct ve_info *ve)
{
- /*
- * Since non safe halt is mainly used in CPU offlining
- * and the guest will always stay in the halt state, don't
- * call the STI instruction (set do_sti as false).
- */
const bool irq_disabled = irqs_disabled();
- const bool do_sti = false;
- if (__halt(irq_disabled, do_sti))
+ if (__halt(irq_disabled))
return -EIO;
return ve_instr_len(ve);
@@ -261,18 +304,12 @@ static int handle_halt(struct ve_info *ve)
void __cpuidle tdx_safe_halt(void)
{
- /*
- * For do_sti=true case, __tdx_hypercall() function enables
- * interrupts using the STI instruction before the TDCALL. So
- * set irq_disabled as false.
- */
const bool irq_disabled = false;
- const bool do_sti = true;
/*
* Use WARN_ONCE() to report the failure.
*/
- if (__halt(irq_disabled, do_sti))
+ if (__halt(irq_disabled))
WARN_ONCE(1, "HLT instruction emulation failed\n");
}
@@ -628,6 +665,11 @@ static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
}
}
+static inline bool is_private_gpa(u64 gpa)
+{
+ return gpa == cc_mkenc(gpa);
+}
+
/*
* Handle the kernel #VE.
*
@@ -646,6 +688,8 @@ static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
case EXIT_REASON_CPUID:
return handle_cpuid(regs, ve);
case EXIT_REASON_EPT_VIOLATION:
+ if (is_private_gpa(ve->gpa))
+ panic("Unexpected EPT-violation on private memory.");
return handle_mmio(regs, ve);
case EXIT_REASON_IO_INSTRUCTION:
return handle_io(regs, ve);
@@ -812,6 +856,9 @@ void __init tdx_early_init(void)
tdx_parse_tdinfo(&cc_mask);
cc_set_mask(cc_mask);
+ /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
+ tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL, NULL);
+
/*
* All bits above GPA width are reserved and kernel treats shared bit
* as flag, not as part of physical address.
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 71c4c473d34b..9bbfd01cfa2f 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -304,6 +304,44 @@ config CRYPTO_ARIA_AESNI_AVX_X86_64
Processes 16 blocks in parallel.
+config CRYPTO_ARIA_AESNI_AVX2_X86_64
+ tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX2/GFNI)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ select CRYPTO_ALGAPI
+ select CRYPTO_ARIA
+ select CRYPTO_ARIA_AESNI_AVX_X86_64
+ help
+ Length-preserving cipher: ARIA cipher algorithms
+ (RFC 5794) with ECB and CTR modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX2 (Advanced Vector Extensions)
+ - GFNI (Galois Field New Instructions)
+
+ Processes 32 blocks in parallel.
+
+config CRYPTO_ARIA_GFNI_AVX512_X86_64
+ tristate "Ciphers: ARIA with modes: ECB, CTR (AVX512/GFNI)"
+ depends on X86 && 64BIT && AS_AVX512 && AS_GFNI
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ select CRYPTO_ALGAPI
+ select CRYPTO_ARIA
+ select CRYPTO_ARIA_AESNI_AVX_X86_64
+ select CRYPTO_ARIA_AESNI_AVX2_X86_64
+ help
+ Length-preserving cipher: ARIA cipher algorithms
+ (RFC 5794) with ECB and CTR modes
+
+ Architecture: x86_64 using:
+ - AVX512 (Advanced Vector Extensions)
+ - GFNI (Galois Field New Instructions)
+
+ Processes 64 blocks in parallel.
+
config CRYPTO_CHACHA20_X86_64
tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)"
depends on X86 && 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3e7a329235bd..9aa46093c91b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -103,6 +103,12 @@ sm4-aesni-avx2-x86_64-y := sm4-aesni-avx2-asm_64.o sm4_aesni_avx2_glue.o
obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) += aria-aesni-avx-x86_64.o
aria-aesni-avx-x86_64-y := aria-aesni-avx-asm_64.o aria_aesni_avx_glue.o
+obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX2_X86_64) += aria-aesni-avx2-x86_64.o
+aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o
+
+obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o
+aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o
+
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
$(obj)/%.S: $(src)/%.pl FORCE
diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S
index 03ae4cd1d976..9243f6289d34 100644
--- a/arch/x86/crypto/aria-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S
@@ -8,13 +8,9 @@
#include <linux/linkage.h>
#include <linux/cfi_types.h>
+#include <asm/asm-offsets.h>
#include <asm/frame.h>
-/* struct aria_ctx: */
-#define enc_key 0
-#define dec_key 272
-#define rounds 544
-
/* register macros */
#define CTX %rdi
@@ -271,34 +267,44 @@
#define aria_ark_8way(x0, x1, x2, x3, \
x4, x5, x6, x7, \
- t0, rk, idx, round) \
+ t0, t1, t2, rk, \
+ idx, round) \
/* AddRoundKey */ \
- vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
- vpxor t0, x0, x0; \
- vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
- vpxor t0, x1, x1; \
- vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
- vpxor t0, x2, x2; \
- vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
- vpxor t0, x3, x3; \
- vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
- vpxor t0, x4, x4; \
- vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
- vpxor t0, x5, x5; \
- vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
- vpxor t0, x6, x6; \
- vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
- vpxor t0, x7, x7;
-
+ vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
+ vpsrld $24, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x0, x0; \
+ vpsrld $16, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x1, x1; \
+ vpsrld $8, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x2, x2; \
+ vpshufb t1, t0, t2; \
+ vpxor t2, x3, x3; \
+ vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
+ vpsrld $24, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x4, x4; \
+ vpsrld $16, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x5, x5; \
+ vpsrld $8, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x6, x6; \
+ vpshufb t1, t0, t2; \
+ vpxor t2, x7, x7;
+
+#ifdef CONFIG_AS_GFNI
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
x4, x5, x6, x7, \
t0, t1, t2, t3, \
t4, t5, t6, t7) \
- vpbroadcastq .Ltf_s2_bitmatrix, t0; \
- vpbroadcastq .Ltf_inv_bitmatrix, t1; \
- vpbroadcastq .Ltf_id_bitmatrix, t2; \
- vpbroadcastq .Ltf_aff_bitmatrix, t3; \
- vpbroadcastq .Ltf_x2_bitmatrix, t4; \
+ vmovdqa .Ltf_s2_bitmatrix, t0; \
+ vmovdqa .Ltf_inv_bitmatrix, t1; \
+ vmovdqa .Ltf_id_bitmatrix, t2; \
+ vmovdqa .Ltf_aff_bitmatrix, t3; \
+ vmovdqa .Ltf_x2_bitmatrix, t4; \
vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
@@ -312,14 +318,15 @@
vgf2p8affineinvqb $0, t2, x3, x3; \
vgf2p8affineinvqb $0, t2, x7, x7
+#endif /* CONFIG_AS_GFNI */
+
#define aria_sbox_8way(x0, x1, x2, x3, \
x4, x5, x6, x7, \
t0, t1, t2, t3, \
t4, t5, t6, t7) \
- vpxor t7, t7, t7; \
vmovdqa .Linv_shift_row, t0; \
vmovdqa .Lshift_row, t1; \
- vpbroadcastd .L0f0f0f0f, t6; \
+ vbroadcastss .L0f0f0f0f, t6; \
vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
@@ -414,8 +421,9 @@
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -430,7 +438,7 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -468,8 +476,9 @@
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -484,7 +493,7 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -522,14 +531,15 @@
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round, last_round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
y0, y1, y2, y3, y4, y5, y6, y7); \
\
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, last_round); \
+ y0, y7, y2, rk, 8, last_round); \
\
aria_store_state_8way(x0, x1, x2, x3, \
x4, x5, x6, x7, \
@@ -539,25 +549,27 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
y0, y1, y2, y3, y4, y5, y6, y7); \
\
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, last_round); \
+ y0, y7, y2, rk, 0, last_round); \
\
aria_load_state_8way(y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, 8);
+#ifdef CONFIG_AS_GFNI
#define aria_fe_gfni(x0, x1, x2, x3, \
x4, x5, x6, x7, \
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way_gfni(x2, x3, x0, x1, \
x6, x7, x4, x5, \
@@ -574,7 +586,7 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way_gfni(x2, x3, x0, x1, \
x6, x7, x4, x5, \
@@ -614,8 +626,9 @@
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way_gfni(x0, x1, x2, x3, \
x4, x5, x6, x7, \
@@ -632,7 +645,7 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way_gfni(x0, x1, x2, x3, \
x4, x5, x6, x7, \
@@ -672,8 +685,9 @@
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round, last_round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way_gfni(x2, x3, x0, x1, \
x6, x7, x4, x5, \
@@ -681,7 +695,7 @@
y4, y5, y6, y7); \
\
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, last_round); \
+ y0, y7, y2, rk, 8, last_round); \
\
aria_store_state_8way(x0, x1, x2, x3, \
x4, x5, x6, x7, \
@@ -691,7 +705,7 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way_gfni(x2, x3, x0, x1, \
x6, x7, x4, x5, \
@@ -699,12 +713,14 @@
y4, y5, y6, y7); \
\
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, last_round); \
+ y0, y7, y2, rk, 0, last_round); \
\
aria_load_state_8way(y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, 8);
+#endif /* CONFIG_AS_GFNI */
+
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16
@@ -756,6 +772,7 @@
.Ltf_hi__x2__and__fwd_aff:
.octa 0x3F893781E95FE1576CDA64D2BA0CB204
+#ifdef CONFIG_AS_GFNI
.section .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
@@ -769,6 +786,14 @@
BV8(0, 1, 1, 1, 1, 1, 0, 0),
BV8(0, 0, 1, 1, 1, 1, 1, 0),
BV8(0, 0, 0, 1, 1, 1, 1, 1))
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
@@ -781,6 +806,14 @@
BV8(0, 0, 1, 0, 1, 0, 0, 1),
BV8(1, 0, 0, 1, 0, 1, 0, 0),
BV8(0, 1, 0, 0, 1, 0, 1, 0))
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
@@ -793,6 +826,14 @@
BV8(1, 1, 0, 0, 1, 1, 1, 0),
BV8(0, 1, 1, 0, 0, 0, 1, 1),
BV8(1, 1, 1, 1, 0, 1, 1, 0))
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
@@ -805,6 +846,14 @@
BV8(0, 1, 1, 0, 1, 0, 1, 1),
BV8(1, 0, 1, 1, 1, 1, 0, 1),
BV8(1, 0, 0, 1, 0, 0, 1, 1))
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
/* Identity matrix: */
.Ltf_id_bitmatrix:
@@ -816,6 +865,15 @@
BV8(0, 0, 0, 0, 0, 1, 0, 0),
BV8(0, 0, 0, 0, 0, 0, 1, 0),
BV8(0, 0, 0, 0, 0, 0, 0, 1))
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+#endif /* CONFIG_AS_GFNI */
/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
@@ -874,7 +932,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%rax, %r9, 10);
- cmpl $12, rounds(CTX);
+ cmpl $12, ARIA_CTX_rounds(CTX);
jne .Laria_192;
aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -887,7 +945,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%rax, %r9, 12);
- cmpl $14, rounds(CTX);
+ cmpl $14, ARIA_CTX_rounds(CTX);
jne .Laria_256;
aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -923,7 +981,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
FRAME_BEGIN
- leaq enc_key(CTX), %r9;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -948,7 +1006,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
FRAME_BEGIN
- leaq dec_key(CTX), %r9;
+ leaq ARIA_CTX_dec_key(CTX), %r9;
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -1056,7 +1114,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
leaq (%rdx), %r11;
leaq (%rcx), %rsi;
leaq (%rcx), %rdx;
- leaq enc_key(CTX), %r9;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
call __aria_aesni_avx_crypt_16way;
@@ -1084,6 +1142,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
+#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
/* input:
* %r9: rk
@@ -1157,7 +1216,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
%xmm0, %xmm1, %xmm2, %xmm3,
%xmm4, %xmm5, %xmm6, %xmm7,
%rax, %r9, 10);
- cmpl $12, rounds(CTX);
+ cmpl $12, ARIA_CTX_rounds(CTX);
jne .Laria_gfni_192;
aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -1174,7 +1233,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
%xmm0, %xmm1, %xmm2, %xmm3,
%xmm4, %xmm5, %xmm6, %xmm7,
%rax, %r9, 12);
- cmpl $14, rounds(CTX);
+ cmpl $14, ARIA_CTX_rounds(CTX);
jne .Laria_gfni_256;
aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
%xmm4, %xmm5, %xmm6, %xmm7,
@@ -1218,7 +1277,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
FRAME_BEGIN
- leaq enc_key(CTX), %r9;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -1243,7 +1302,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
FRAME_BEGIN
- leaq dec_key(CTX), %r9;
+ leaq ARIA_CTX_dec_key(CTX), %r9;
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -1275,7 +1334,7 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
leaq (%rdx), %r11;
leaq (%rcx), %rsi;
leaq (%rcx), %rdx;
- leaq enc_key(CTX), %r9;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
call __aria_aesni_avx_gfni_crypt_16way;
@@ -1302,3 +1361,4 @@ SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
FRAME_END
RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
+#endif /* CONFIG_AS_GFNI */
diff --git a/arch/x86/crypto/aria-aesni-avx2-asm_64.S b/arch/x86/crypto/aria-aesni-avx2-asm_64.S
new file mode 100644
index 000000000000..82a14b4ad920
--- /dev/null
+++ b/arch/x86/crypto/aria-aesni-avx2-asm_64.S
@@ -0,0 +1,1441 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ARIA Cipher 32-way parallel algorithm (AVX2)
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include <asm/asm-offsets.h>
+#include <linux/cfi_types.h>
+
+/* register macros */
+#define CTX %rdi
+
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti128 .Lshufb_16x16b, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti128 .Lshufb_16x16b, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(c0, d0, a0, b0, d2, d3); \
+ transpose_4x4(c1, d1, a1, b1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(c2, d2, a2, b2, b0, b1); \
+ transpose_4x4(c3, d3, a3, b3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ rio) \
+ vmovdqu (0 * 32)(rio), x0; \
+ vmovdqu (1 * 32)(rio), x1; \
+ vmovdqu (2 * 32)(rio), x2; \
+ vmovdqu (3 * 32)(rio), x3; \
+ vmovdqu (4 * 32)(rio), x4; \
+ vmovdqu (5 * 32)(rio), x5; \
+ vmovdqu (6 * 32)(rio), x6; \
+ vmovdqu (7 * 32)(rio), x7; \
+ vmovdqu (8 * 32)(rio), y0; \
+ vmovdqu (9 * 32)(rio), y1; \
+ vmovdqu (10 * 32)(rio), y2; \
+ vmovdqu (11 * 32)(rio), y3; \
+ vmovdqu (12 * 32)(rio), y4; \
+ vmovdqu (13 * 32)(rio), y5; \
+ vmovdqu (14 * 32)(rio), y6; \
+ vmovdqu (15 * 32)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab); \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu y0, 0 * 32(mem_cd); \
+ vmovdqu y1, 1 * 32(mem_cd); \
+ vmovdqu y2, 2 * 32(mem_cd); \
+ vmovdqu y3, 3 * 32(mem_cd); \
+ vmovdqu y4, 4 * 32(mem_cd); \
+ vmovdqu y5, 5 * 32(mem_cd); \
+ vmovdqu y6, 6 * 32(mem_cd); \
+ vmovdqu y7, 7 * 32(mem_cd);
+
+#define write_output(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem) \
+ vmovdqu x0, 0 * 32(mem); \
+ vmovdqu x1, 1 * 32(mem); \
+ vmovdqu x2, 2 * 32(mem); \
+ vmovdqu x3, 3 * 32(mem); \
+ vmovdqu x4, 4 * 32(mem); \
+ vmovdqu x5, 5 * 32(mem); \
+ vmovdqu x6, 6 * 32(mem); \
+ vmovdqu x7, 7 * 32(mem); \
+ vmovdqu y0, 8 * 32(mem); \
+ vmovdqu y1, 9 * 32(mem); \
+ vmovdqu y2, 10 * 32(mem); \
+ vmovdqu y3, 11 * 32(mem); \
+ vmovdqu y4, 12 * 32(mem); \
+ vmovdqu y5, 13 * 32(mem); \
+ vmovdqu y6, 14 * 32(mem); \
+ vmovdqu y7, 15 * 32(mem); \
+
+#define aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \
+ vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \
+ vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \
+ vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \
+ vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \
+ vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \
+ vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \
+ vmovdqu x7, ((idx + 7) * 32)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \
+ vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \
+ vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \
+ vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \
+ vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \
+ vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \
+ vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \
+ vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
+
+#define aria_ark_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, rk, idx, round) \
+ /* AddRoundKey */ \
+ vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
+ vpxor t0, x0, x0; \
+ vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
+ vpxor t0, x1, x1; \
+ vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
+ vpxor t0, x2, x2; \
+ vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
+ vpxor t0, x3, x3; \
+ vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
+ vpxor t0, x4, x4; \
+ vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
+ vpxor t0, x5, x5; \
+ vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
+ vpxor t0, x6, x6; \
+ vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
+ vpxor t0, x7, x7;
+
+#ifdef CONFIG_AS_GFNI
+#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpbroadcastq .Ltf_s2_bitmatrix, t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix, t1; \
+ vpbroadcastq .Ltf_id_bitmatrix, t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix, t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix, t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7
+
+#endif /* CONFIG_AS_GFNI */
+#define aria_sbox_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpxor t7, t7, t7; \
+ vpxor t6, t6, t6; \
+ vbroadcasti128 .Linv_shift_row, t0; \
+ vbroadcasti128 .Lshift_row, t1; \
+ vbroadcasti128 .Ltf_lo__inv_aff__and__s2, t2; \
+ vbroadcasti128 .Ltf_hi__inv_aff__and__s2, t3; \
+ vbroadcasti128 .Ltf_lo__x2__and__fwd_aff, t4; \
+ vbroadcasti128 .Ltf_hi__x2__and__fwd_aff, t5; \
+ \
+ vextracti128 $1, x0, t6##_x; \
+ vaesenclast t7##_x, x0##_x, x0##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x0, x0; \
+ \
+ vextracti128 $1, x4, t6##_x; \
+ vaesenclast t7##_x, x4##_x, x4##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x4, x4; \
+ \
+ vextracti128 $1, x1, t6##_x; \
+ vaesenclast t7##_x, x1##_x, x1##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x1, x1; \
+ \
+ vextracti128 $1, x5, t6##_x; \
+ vaesenclast t7##_x, x5##_x, x5##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x5, x5; \
+ \
+ vextracti128 $1, x2, t6##_x; \
+ vaesdeclast t7##_x, x2##_x, x2##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x2, x2; \
+ \
+ vextracti128 $1, x6, t6##_x; \
+ vaesdeclast t7##_x, x6##_x, x6##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x6, x6; \
+ \
+ vpbroadcastd .L0f0f0f0f, t6; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t0, x0, x0; \
+ vpshufb t0, x4, x4; \
+ vpshufb t0, x1, x1; \
+ vpshufb t0, x5, x5; \
+ vpshufb t1, x3, x3; \
+ vpshufb t1, x7, x7; \
+ vpshufb t1, x2, x2; \
+ vpshufb t1, x6, x6; \
+ \
+ /* affine transformation for S2 */ \
+ filter_8bit(x1, t2, t3, t6, t0); \
+ /* affine transformation for S2 */ \
+ filter_8bit(x5, t2, t3, t6, t0); \
+ \
+ /* affine transformation for X2 */ \
+ filter_8bit(x3, t4, t5, t6, t0); \
+ /* affine transformation for X2 */ \
+ filter_8bit(x7, t4, t5, t6, t0); \
+ \
+ vpxor t6, t6, t6; \
+ vextracti128 $1, x3, t6##_x; \
+ vaesdeclast t7##_x, x3##_x, x3##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x3, x3; \
+ \
+ vextracti128 $1, x7, t6##_x; \
+ vaesdeclast t7##_x, x7##_x, x7##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x7, x7; \
+
+#define aria_diff_m(x0, x1, x2, x3, \
+ t0, t1, t2, t3) \
+ /* T = rotr32(X, 8); */ \
+ /* X ^= T */ \
+ vpxor x0, x3, t0; \
+ vpxor x1, x0, t1; \
+ vpxor x2, x1, t2; \
+ vpxor x3, x2, t3; \
+ /* X = T ^ rotr(X, 16); */ \
+ vpxor t2, x0, x0; \
+ vpxor x1, t3, t3; \
+ vpxor t0, x2, x2; \
+ vpxor t1, x3, x1; \
+ vmovdqu t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7) \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7; \
+ \
+ /* t2 ^= t3; */ \
+ vpxor y4, y0, y0; \
+ vpxor y5, y1, y1; \
+ vpxor y6, y2, y2; \
+ vpxor y7, y3, y3; \
+ \
+ /* t0 ^= t1; */ \
+ vpxor x4, x0, x0; \
+ vpxor x5, x1, x1; \
+ vpxor x6, x2, x2; \
+ vpxor x7, x3, x3; \
+ \
+ /* t3 ^= t1; */ \
+ vpxor x4, y4, y4; \
+ vpxor x5, y5, y5; \
+ vpxor x6, y6, y6; \
+ vpxor x7, y7, y7; \
+ \
+ /* t2 ^= t0; */ \
+ vpxor x0, y0, y0; \
+ vpxor x1, y1, y1; \
+ vpxor x2, y2, y2; \
+ vpxor x3, y3, y3; \
+ \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7;
+
+#define aria_fe(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+#ifdef CONFIG_AS_GFNI
+#define aria_fe_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+#endif /* CONFIG_AS_GFNI */
+
+.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+.Lshift_row:
+ .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
+ .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+ .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+/* AES inverse affine and S2 combined:
+ * 1 1 0 0 0 0 0 1 x0 0
+ * 0 1 0 0 1 0 0 0 x1 0
+ * 1 1 0 0 1 1 1 1 x2 0
+ * 0 1 1 0 1 0 0 1 x3 1
+ * 0 1 0 0 1 1 0 0 * x4 + 0
+ * 0 1 0 1 1 0 0 0 x5 0
+ * 0 0 0 0 0 1 0 1 x6 0
+ * 1 1 1 0 0 1 1 1 x7 1
+ */
+.Ltf_lo__inv_aff__and__s2:
+ .octa 0x92172DA81A9FA520B2370D883ABF8500
+.Ltf_hi__inv_aff__and__s2:
+ .octa 0x2B15FFC1AF917B45E6D8320C625CB688
+
+/* X2 and AES forward affine combined:
+ * 1 0 1 1 0 0 0 1 x0 0
+ * 0 1 1 1 1 0 1 1 x1 0
+ * 0 0 0 1 1 0 1 0 x2 1
+ * 0 1 0 0 0 1 0 0 x3 0
+ * 0 0 1 1 1 0 1 1 * x4 + 0
+ * 0 1 0 0 1 0 0 0 x5 0
+ * 1 1 0 1 0 0 1 1 x6 0
+ * 0 1 0 0 1 0 1 0 x7 0
+ */
+.Ltf_lo__x2__and__fwd_aff:
+ .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
+.Ltf_hi__x2__and__fwd_aff:
+ .octa 0x3F893781E95FE1576CDA64D2BA0CB204
+
+#ifdef CONFIG_AS_GFNI
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+#endif /* CONFIG_AS_GFNI */
+
+/* 4-bit mask */
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %ymm0..%ymm15: byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 32(%rax), %r8;
+
+ inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r8);
+ aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 0);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 1);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 2);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 3);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 4);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 5);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 6);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 7);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 8);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 9);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_192;
+ aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11, 12);
+ jmp .Laria_end;
+.Laria_192:
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_256;
+ aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13, 14);
+ jmp .Laria_end;
+.Laria_256:
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 14);
+ aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 15, 16);
+.Laria_end:
+ debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+ %ymm9, %ymm13, %ymm0, %ymm5,
+ %ymm10, %ymm14, %ymm3, %ymm6,
+ %ymm11, %ymm15, %ymm2, %ymm7,
+ (%rax), (%r8));
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+
+ FRAME_BEGIN
+ movq 8(%r8), %r11;
+ bswapq %r11;
+
+ vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
+ vpcmpeqd %ymm0, %ymm0, %ymm0;
+ vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%r8), %xmm7;
+ vpshufb %xmm6, %xmm7, %xmm7;
+ vmovdqa %xmm7, %xmm3;
+ inc_le128(%xmm7, %xmm0, %xmm4);
+ vinserti128 $1, %xmm7, %ymm3, %ymm3;
+ vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 32), %r11;
+ ja .Lhandle_ctr_carry;
+
+ /* construct IVs */
+ vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
+ vpshufb %ymm6, %ymm3, %ymm9;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
+ vpshufb %ymm6, %ymm3, %ymm10;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
+ vpshufb %ymm6, %ymm3, %ymm11;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
+ vpshufb %ymm6, %ymm3, %ymm12;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
+ vpshufb %ymm6, %ymm3, %ymm13;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
+ vpshufb %ymm6, %ymm3, %ymm14;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
+ vpshufb %ymm6, %ymm3, %ymm15;
+ vmovdqu %ymm8, (0 * 32)(%rcx);
+ vmovdqu %ymm9, (1 * 32)(%rcx);
+ vmovdqu %ymm10, (2 * 32)(%rcx);
+ vmovdqu %ymm11, (3 * 32)(%rcx);
+ vmovdqu %ymm12, (4 * 32)(%rcx);
+ vmovdqu %ymm13, (5 * 32)(%rcx);
+ vmovdqu %ymm14, (6 * 32)(%rcx);
+ vmovdqu %ymm15, (7 * 32)(%rcx);
+
+ vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
+ vpshufb %ymm6, %ymm3, %ymm8;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
+ vpshufb %ymm6, %ymm3, %ymm9;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
+ vpshufb %ymm6, %ymm3, %ymm10;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
+ vpshufb %ymm6, %ymm3, %ymm11;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
+ vpshufb %ymm6, %ymm3, %ymm12;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
+ vpshufb %ymm6, %ymm3, %ymm13;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
+ vpshufb %ymm6, %ymm3, %ymm14;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
+ vpshufb %ymm6, %ymm3, %ymm15;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
+ vpshufb %xmm6, %xmm3, %xmm3;
+ vmovdqu %xmm3, (%r8);
+ vmovdqu (0 * 32)(%rcx), %ymm0;
+ vmovdqu (1 * 32)(%rcx), %ymm1;
+ vmovdqu (2 * 32)(%rcx), %ymm2;
+ vmovdqu (3 * 32)(%rcx), %ymm3;
+ vmovdqu (4 * 32)(%rcx), %ymm4;
+ vmovdqu (5 * 32)(%rcx), %ymm5;
+ vmovdqu (6 * 32)(%rcx), %ymm6;
+ vmovdqu (7 * 32)(%rcx), %ymm7;
+ jmp .Lctr_carry_done;
+
+ .Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
+ vmovdqu %ymm8, (0 * 32)(%rcx);
+ vmovdqu %ymm9, (1 * 32)(%rcx);
+ vmovdqu %ymm10, (2 * 32)(%rcx);
+ vmovdqu %ymm11, (3 * 32)(%rcx);
+ vmovdqu %ymm12, (4 * 32)(%rcx);
+ vmovdqu %ymm13, (5 * 32)(%rcx);
+ vmovdqu %ymm14, (6 * 32)(%rcx);
+ vmovdqu %ymm15, (7 * 32)(%rcx);
+
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vextracti128 $1, %ymm3, %xmm3;
+ vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
+ vmovdqu %xmm3, (%r8);
+ vmovdqu (0 * 32)(%rcx), %ymm0;
+ vmovdqu (1 * 32)(%rcx), %ymm1;
+ vmovdqu (2 * 32)(%rcx), %ymm2;
+ vmovdqu (3 * 32)(%rcx), %ymm3;
+ vmovdqu (4 * 32)(%rcx), %ymm4;
+ vmovdqu (5 * 32)(%rcx), %ymm5;
+ vmovdqu (6 * 32)(%rcx), %ymm6;
+ vmovdqu (7 * 32)(%rcx), %ymm7;
+
+ .Lctr_carry_done:
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_aesni_avx2_ctr_gen_keystream_32way;
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_aesni_avx2_crypt_32way;
+
+ vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+ vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+ vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+ vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+ vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+ vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+ vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+ vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+ vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+ vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+ vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+ vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+ vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+ vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+ vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+ vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)
+
+#ifdef CONFIG_AS_GFNI
+SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %ymm0..%ymm15: 16 byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 32(%rax), %r8;
+
+ inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r8);
+ aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 0);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 1);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 2);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 3);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 4);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 5);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 6);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 7);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 8);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 9);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_192;
+ aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11, 12);
+ jmp .Laria_gfni_end;
+.Laria_gfni_192:
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_256;
+ aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13, 14);
+ jmp .Laria_gfni_end;
+.Laria_gfni_256:
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 14);
+ aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 15, 16);
+.Laria_gfni_end:
+ debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+ %ymm9, %ymm13, %ymm0, %ymm5,
+ %ymm10, %ymm14, %ymm3, %ymm6,
+ %ymm11, %ymm15, %ymm2, %ymm7,
+ (%rax), (%r8));
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_gfni_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_gfni_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_aesni_avx2_ctr_gen_keystream_32way
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_aesni_avx2_gfni_crypt_32way;
+
+ vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+ vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+ vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+ vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+ vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+ vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+ vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+ vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+ vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+ vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+ vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+ vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+ vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+ vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+ vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+ vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
+#endif /* CONFIG_AS_GFNI */
diff --git a/arch/x86/crypto/aria-avx.h b/arch/x86/crypto/aria-avx.h
index 01e9a01dc157..6e1b2d8a31ed 100644
--- a/arch/x86/crypto/aria-avx.h
+++ b/arch/x86/crypto/aria-avx.h
@@ -5,12 +5,58 @@
#include <linux/types.h>
#define ARIA_AESNI_PARALLEL_BLOCKS 16
-#define ARIA_AESNI_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * 16)
+#define ARIA_AESNI_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_AESNI_PARALLEL_BLOCKS)
+
+#define ARIA_AESNI_AVX2_PARALLEL_BLOCKS 32
+#define ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_AESNI_AVX2_PARALLEL_BLOCKS)
+
+#define ARIA_GFNI_AVX512_PARALLEL_BLOCKS 64
+#define ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_GFNI_AVX512_PARALLEL_BLOCKS)
+
+asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+
+asmlinkage void aria_aesni_avx2_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+asmlinkage void aria_aesni_avx2_gfni_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_gfni_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_gfni_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
struct aria_avx_ops {
void (*aria_encrypt_16way)(const void *ctx, u8 *dst, const u8 *src);
void (*aria_decrypt_16way)(const void *ctx, u8 *dst, const u8 *src);
void (*aria_ctr_crypt_16way)(const void *ctx, u8 *dst, const u8 *src,
u8 *keystream, u8 *iv);
+ void (*aria_encrypt_32way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_decrypt_32way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_ctr_crypt_32way)(const void *ctx, u8 *dst, const u8 *src,
+ u8 *keystream, u8 *iv);
+ void (*aria_encrypt_64way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_decrypt_64way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_ctr_crypt_64way)(const void *ctx, u8 *dst, const u8 *src,
+ u8 *keystream, u8 *iv);
+
+
};
#endif
diff --git a/arch/x86/crypto/aria-gfni-avx512-asm_64.S b/arch/x86/crypto/aria-gfni-avx512-asm_64.S
new file mode 100644
index 000000000000..3193f0701450
--- /dev/null
+++ b/arch/x86/crypto/aria-gfni-avx512-asm_64.S
@@ -0,0 +1,971 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ARIA Cipher 64-way parallel algorithm (AVX512)
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include <asm/asm-offsets.h>
+#include <linux/cfi_types.h>
+
+/* register macros */
+#define CTX %rdi
+
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
+
+#define add_le128(out, in, lo_counter, hi_counter1) \
+ vpaddq lo_counter, in, out; \
+ vpcmpuq $1, lo_counter, out, %k1; \
+ kaddb %k1, %k1, %k1; \
+ vpaddq hi_counter1, out, out{%k1};
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandq x, mask4bit, tmp0; \
+ vpandqn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxorq tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu64 d2, st0; \
+ vmovdqu64 d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 a0, st0; \
+ vmovdqu64 a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti64x2 .Lshufb_16x16b, a0; \
+ vmovdqu64 st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu64 d3, st1; \
+ vmovdqu64 st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu64 d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 b0, st0; \
+ vmovdqu64 b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu64 st0, b0; \
+ vmovdqu64 st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu64 d2, st0; \
+ vmovdqu64 d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 a0, st0; \
+ vmovdqu64 a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti64x2 .Lshufb_16x16b, a0; \
+ vmovdqu64 st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu64 d3, st1; \
+ vmovdqu64 st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu64 d2, st0; \
+ \
+ transpose_4x4(c0, d0, a0, b0, d2, d3); \
+ transpose_4x4(c1, d1, a1, b1, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 b0, st0; \
+ vmovdqu64 b1, st1; \
+ transpose_4x4(c2, d2, a2, b2, b0, b1); \
+ transpose_4x4(c3, d3, a3, b3, b0, b1); \
+ vmovdqu64 st0, b0; \
+ vmovdqu64 st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ rio) \
+ vmovdqu64 (0 * 64)(rio), x0; \
+ vmovdqu64 (1 * 64)(rio), x1; \
+ vmovdqu64 (2 * 64)(rio), x2; \
+ vmovdqu64 (3 * 64)(rio), x3; \
+ vmovdqu64 (4 * 64)(rio), x4; \
+ vmovdqu64 (5 * 64)(rio), x5; \
+ vmovdqu64 (6 * 64)(rio), x6; \
+ vmovdqu64 (7 * 64)(rio), x7; \
+ vmovdqu64 (8 * 64)(rio), y0; \
+ vmovdqu64 (9 * 64)(rio), y1; \
+ vmovdqu64 (10 * 64)(rio), y2; \
+ vmovdqu64 (11 * 64)(rio), y3; \
+ vmovdqu64 (12 * 64)(rio), y4; \
+ vmovdqu64 (13 * 64)(rio), y5; \
+ vmovdqu64 (14 * 64)(rio), y6; \
+ vmovdqu64 (15 * 64)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ (mem_ab), (mem_cd)); \
+ \
+ vmovdqu64 x0, 0 * 64(mem_ab); \
+ vmovdqu64 x1, 1 * 64(mem_ab); \
+ vmovdqu64 x2, 2 * 64(mem_ab); \
+ vmovdqu64 x3, 3 * 64(mem_ab); \
+ vmovdqu64 x4, 4 * 64(mem_ab); \
+ vmovdqu64 x5, 5 * 64(mem_ab); \
+ vmovdqu64 x6, 6 * 64(mem_ab); \
+ vmovdqu64 x7, 7 * 64(mem_ab); \
+ vmovdqu64 y0, 0 * 64(mem_cd); \
+ vmovdqu64 y1, 1 * 64(mem_cd); \
+ vmovdqu64 y2, 2 * 64(mem_cd); \
+ vmovdqu64 y3, 3 * 64(mem_cd); \
+ vmovdqu64 y4, 4 * 64(mem_cd); \
+ vmovdqu64 y5, 5 * 64(mem_cd); \
+ vmovdqu64 y6, 6 * 64(mem_cd); \
+ vmovdqu64 y7, 7 * 64(mem_cd);
+
+#define write_output(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem) \
+ vmovdqu64 x0, 0 * 64(mem); \
+ vmovdqu64 x1, 1 * 64(mem); \
+ vmovdqu64 x2, 2 * 64(mem); \
+ vmovdqu64 x3, 3 * 64(mem); \
+ vmovdqu64 x4, 4 * 64(mem); \
+ vmovdqu64 x5, 5 * 64(mem); \
+ vmovdqu64 x6, 6 * 64(mem); \
+ vmovdqu64 x7, 7 * 64(mem); \
+ vmovdqu64 y0, 8 * 64(mem); \
+ vmovdqu64 y1, 9 * 64(mem); \
+ vmovdqu64 y2, 10 * 64(mem); \
+ vmovdqu64 y3, 11 * 64(mem); \
+ vmovdqu64 y4, 12 * 64(mem); \
+ vmovdqu64 y5, 13 * 64(mem); \
+ vmovdqu64 y6, 14 * 64(mem); \
+ vmovdqu64 y7, 15 * 64(mem); \
+
+#define aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp); \
+ vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp); \
+ vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp); \
+ vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp); \
+ vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp); \
+ vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp); \
+ vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp); \
+ vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0; \
+ vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1; \
+ vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2; \
+ vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3; \
+ vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4; \
+ vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5; \
+ vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6; \
+ vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;
+
+#define aria_ark_16way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ t0, rk, round) \
+ /* AddRoundKey */ \
+ vpbroadcastb ((round * 16) + 3)(rk), t0; \
+ vpxorq t0, x0, x0; \
+ vpbroadcastb ((round * 16) + 2)(rk), t0; \
+ vpxorq t0, x1, x1; \
+ vpbroadcastb ((round * 16) + 1)(rk), t0; \
+ vpxorq t0, x2, x2; \
+ vpbroadcastb ((round * 16) + 0)(rk), t0; \
+ vpxorq t0, x3, x3; \
+ vpbroadcastb ((round * 16) + 7)(rk), t0; \
+ vpxorq t0, x4, x4; \
+ vpbroadcastb ((round * 16) + 6)(rk), t0; \
+ vpxorq t0, x5, x5; \
+ vpbroadcastb ((round * 16) + 5)(rk), t0; \
+ vpxorq t0, x6, x6; \
+ vpbroadcastb ((round * 16) + 4)(rk), t0; \
+ vpxorq t0, x7, x7; \
+ vpbroadcastb ((round * 16) + 11)(rk), t0; \
+ vpxorq t0, y0, y0; \
+ vpbroadcastb ((round * 16) + 10)(rk), t0; \
+ vpxorq t0, y1, y1; \
+ vpbroadcastb ((round * 16) + 9)(rk), t0; \
+ vpxorq t0, y2, y2; \
+ vpbroadcastb ((round * 16) + 8)(rk), t0; \
+ vpxorq t0, y3, y3; \
+ vpbroadcastb ((round * 16) + 15)(rk), t0; \
+ vpxorq t0, y4, y4; \
+ vpbroadcastb ((round * 16) + 14)(rk), t0; \
+ vpxorq t0, y5, y5; \
+ vpbroadcastb ((round * 16) + 13)(rk), t0; \
+ vpxorq t0, y6, y6; \
+ vpbroadcastb ((round * 16) + 12)(rk), t0; \
+ vpxorq t0, y7, y7;
+
+#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpbroadcastq .Ltf_s2_bitmatrix, t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix, t1; \
+ vpbroadcastq .Ltf_id_bitmatrix, t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix, t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix, t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7;
+
+#define aria_sbox_16way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpbroadcastq .Ltf_s2_bitmatrix, t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix, t1; \
+ vpbroadcastq .Ltf_id_bitmatrix, t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix, t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix, t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5; \
+ vgf2p8affineqb $(tf_inv_const), t1, y2, y2; \
+ vgf2p8affineqb $(tf_inv_const), t1, y6, y6; \
+ vgf2p8affineinvqb $0, t2, y2, y2; \
+ vgf2p8affineinvqb $0, t2, y6, y6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4; \
+ vgf2p8affineqb $(tf_x2_const), t4, y3, y3; \
+ vgf2p8affineqb $(tf_x2_const), t4, y7, y7; \
+ vgf2p8affineinvqb $0, t2, y3, y3; \
+ vgf2p8affineinvqb $0, t2, y7, y7;
+
+
+#define aria_diff_m(x0, x1, x2, x3, \
+ t0, t1, t2, t3) \
+ /* T = rotr32(X, 8); */ \
+ /* X ^= T */ \
+ vpxorq x0, x3, t0; \
+ vpxorq x1, x0, t1; \
+ vpxorq x2, x1, t2; \
+ vpxorq x3, x2, t3; \
+ /* X = T ^ rotr(X, 16); */ \
+ vpxorq t2, x0, x0; \
+ vpxorq x1, t3, t3; \
+ vpxorq t0, x2, x2; \
+ vpxorq t1, x3, x1; \
+ vmovdqu64 t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7) \
+ /* t1 ^= t2; */ \
+ vpxorq y0, x4, x4; \
+ vpxorq y1, x5, x5; \
+ vpxorq y2, x6, x6; \
+ vpxorq y3, x7, x7; \
+ \
+ /* t2 ^= t3; */ \
+ vpxorq y4, y0, y0; \
+ vpxorq y5, y1, y1; \
+ vpxorq y6, y2, y2; \
+ vpxorq y7, y3, y3; \
+ \
+ /* t0 ^= t1; */ \
+ vpxorq x4, x0, x0; \
+ vpxorq x5, x1, x1; \
+ vpxorq x6, x2, x2; \
+ vpxorq x7, x3, x3; \
+ \
+ /* t3 ^= t1; */ \
+ vpxorq x4, y4, y4; \
+ vpxorq x5, y5, y5; \
+ vpxorq x6, y6, y6; \
+ vpxorq x7, y7, y7; \
+ \
+ /* t2 ^= t0; */ \
+ vpxorq x0, y0, y0; \
+ vpxorq x1, y1, y1; \
+ vpxorq x2, y2, y2; \
+ vpxorq x3, y3, y3; \
+ \
+ /* t1 ^= t2; */ \
+ vpxorq y0, x4, x4; \
+ vpxorq y1, x5, x5; \
+ vpxorq y2, x6, x6; \
+ vpxorq y3, x7, x7;
+
+#define aria_fe_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7, \
+ mem_tmp, rk, round) \
+ aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7, \
+ z0, rk, round); \
+ \
+ aria_sbox_16way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y2, y3, y0, y1, \
+ y6, y7, y4, y5, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
+ aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
+ aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
+ aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+
+
+#define aria_fo_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7, \
+ mem_tmp, rk, round) \
+ aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7, \
+ z0, rk, round); \
+ \
+ aria_sbox_16way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
+ aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
+ aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
+ aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4);
+
+#define aria_ff_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_16way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, rk, round); \
+ aria_sbox_16way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y2, y3, y0, y1, \
+ y6, y7, y4, y5, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7); \
+ aria_ark_16way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, rk, last_round);
+
+
+.section .rodata.cst64, "aM", @progbits, 64
+.align 64
+.Lcounter0123_lo:
+ .quad 0, 0
+ .quad 1, 0
+ .quad 2, 0
+ .quad 3, 0
+
+.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+.Lcounter4444_lo:
+ .quad 4, 0
+.Lcounter8888_lo:
+ .quad 8, 0
+.Lcounter16161616_lo:
+ .quad 16, 0
+.Lcounter1111_hi:
+ .quad 0, 1
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+ .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+.text
+SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %zmm0..%zmm15: byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 64(%rax), %r8;
+
+ inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14,
+ %zmm15, %rax, %r8);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 0);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 1);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 2);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 3);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 4);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 5);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 6);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 7);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 8);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 9);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_192;
+ aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 11, 12);
+ jmp .Laria_gfni_end;
+.Laria_gfni_192:
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 11);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_256;
+ aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 13, 14);
+ jmp .Laria_gfni_end;
+.Laria_gfni_256:
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 13);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 14);
+ aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 15, 16);
+.Laria_gfni_end:
+ debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
+ %zmm8, %zmm13, %zmm2, %zmm7,
+ %zmm11, %zmm14, %zmm1, %zmm4,
+ %zmm10, %zmm15, %zmm0, %zmm5,
+ (%rax), (%r8));
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)
+
+SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rdx);
+
+ call __aria_gfni_avx512_crypt_64way;
+
+ write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)
+
+SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rdx);
+
+ call __aria_gfni_avx512_crypt_64way;
+
+ write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)
+
+SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+
+ FRAME_BEGIN
+
+ vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
+ vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
+ vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
+ vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
+ vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
+ vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;
+
+ /* load IV and byteswap */
+ movq 8(%r8), %r11;
+ movq (%r8), %r10;
+ bswapq %r11;
+ bswapq %r10;
+ vbroadcasti64x2 (%r8), %zmm20;
+ vpshufb %zmm19, %zmm20, %zmm20;
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 64), %r11;
+ ja .Lload_ctr_carry;
+
+ /* construct IVs */
+ vpaddq %zmm21, %zmm20, %zmm0; /* +0:+1:+2:+3 */
+ vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
+ vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
+ vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
+ vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
+ vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
+ vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
+ vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
+ vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
+ vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
+ vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
+ vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
+ vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
+ vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
+ vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
+ vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
+ jmp .Lload_ctr_done;
+
+.Lload_ctr_carry:
+ /* construct IVs */
+ add_le128(%zmm0, %zmm20, %zmm21, %zmm25); /* +0:+1:+2:+3 */
+ add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
+ add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
+ add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
+ add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
+ add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
+ add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
+ add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
+ add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
+ add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
+ add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
+ add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
+ add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
+ add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
+ add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
+ add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */
+
+.Lload_ctr_done:
+ /* Byte-swap IVs and update counter. */
+ addq $64, %r11;
+ adcq $0, %r10;
+ vpshufb %zmm19, %zmm15, %zmm15;
+ vpshufb %zmm19, %zmm14, %zmm14;
+ vpshufb %zmm19, %zmm13, %zmm13;
+ vpshufb %zmm19, %zmm12, %zmm12;
+ vpshufb %zmm19, %zmm11, %zmm11;
+ vpshufb %zmm19, %zmm10, %zmm10;
+ vpshufb %zmm19, %zmm9, %zmm9;
+ vpshufb %zmm19, %zmm8, %zmm8;
+ bswapq %r11;
+ bswapq %r10;
+ vpshufb %zmm19, %zmm7, %zmm7;
+ vpshufb %zmm19, %zmm6, %zmm6;
+ vpshufb %zmm19, %zmm5, %zmm5;
+ vpshufb %zmm19, %zmm4, %zmm4;
+ vpshufb %zmm19, %zmm3, %zmm3;
+ vpshufb %zmm19, %zmm2, %zmm2;
+ vpshufb %zmm19, %zmm1, %zmm1;
+ vpshufb %zmm19, %zmm0, %zmm0;
+ movq %r11, 8(%r8);
+ movq %r10, (%r8);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)
+
+SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_gfni_avx512_ctr_gen_keystream_64way
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_gfni_avx512_crypt_64way;
+
+ vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
+ vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
+ vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
+ vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
+ vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
+ vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
+ vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
+ vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
+ vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
+ vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
+ vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
+ vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
+ vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
+ vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
+ vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
+ vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
+ write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+ %zmm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)
diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c
new file mode 100644
index 000000000000..87a11804fc77
--- /dev/null
+++ b/arch/x86/crypto/aria_aesni_avx2_glue.c
@@ -0,0 +1,254 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Glue Code for the AVX2/AES-NI/GFNI assembler implementation of the ARIA Cipher
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/aria.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+#include "aria-avx.h"
+
+asmlinkage void aria_aesni_avx2_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_encrypt_32way);
+asmlinkage void aria_aesni_avx2_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_decrypt_32way);
+asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_ctr_crypt_32way);
+#ifdef CONFIG_AS_GFNI
+asmlinkage void aria_aesni_avx2_gfni_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_encrypt_32way);
+asmlinkage void aria_aesni_avx2_gfni_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_decrypt_32way);
+asmlinkage void aria_aesni_avx2_gfni_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_ctr_crypt_32way);
+#endif /* CONFIG_AS_GFNI */
+
+static struct aria_avx_ops aria_ops;
+
+struct aria_avx2_request_ctx {
+ u8 keystream[ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE];
+};
+
+static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_encrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way);
+ ECB_BLOCK(1, aria_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_decrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way);
+ ECB_BLOCK(1, aria_decrypt);
+ ECB_WALK_END();
+}
+
+static int aria_avx2_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_encrypt(req, ctx->enc_key[0]);
+}
+
+static int aria_avx2_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_decrypt(req, ctx->dec_key[0]);
+}
+
+static int aria_avx2_set_key(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return aria_set_key(&tfm->base, key, keylen);
+}
+
+static int aria_avx2_ctr_encrypt(struct skcipher_request *req)
+{
+ struct aria_avx2_request_ctx *req_ctx = skcipher_request_ctx(req);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_32way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_16way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_BLOCK_SIZE) {
+ memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ ARIA_BLOCK_SIZE);
+ dst += ARIA_BLOCK_SIZE;
+ src += ARIA_BLOCK_SIZE;
+ nbytes -= ARIA_BLOCK_SIZE;
+ }
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int aria_avx2_init_tfm(struct crypto_skcipher *tfm)
+{
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct aria_avx2_request_ctx));
+
+ return 0;
+}
+
+static struct skcipher_alg aria_algs[] = {
+ {
+ .base.cra_name = "__ecb(aria)",
+ .base.cra_driver_name = "__ecb-aria-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = ARIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .setkey = aria_avx2_set_key,
+ .encrypt = aria_avx2_ecb_encrypt,
+ .decrypt = aria_avx2_ecb_decrypt,
+ }, {
+ .base.cra_name = "__ctr(aria)",
+ .base.cra_driver_name = "__ctr-aria-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL |
+ CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .ivsize = ARIA_BLOCK_SIZE,
+ .chunksize = ARIA_BLOCK_SIZE,
+ .setkey = aria_avx2_set_key,
+ .encrypt = aria_avx2_ctr_encrypt,
+ .decrypt = aria_avx2_ctr_encrypt,
+ .init = aria_avx2_init_tfm,
+ }
+};
+
+static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
+
+static int __init aria_avx2_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_GFNI) && IS_ENABLED(CONFIG_AS_GFNI)) {
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way;
+ aria_ops.aria_encrypt_32way = aria_aesni_avx2_gfni_encrypt_32way;
+ aria_ops.aria_decrypt_32way = aria_aesni_avx2_gfni_decrypt_32way;
+ aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_gfni_ctr_crypt_32way;
+ } else {
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way;
+ aria_ops.aria_encrypt_32way = aria_aesni_avx2_encrypt_32way;
+ aria_ops.aria_decrypt_32way = aria_aesni_avx2_decrypt_32way;
+ aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_ctr_crypt_32way;
+ }
+
+ return simd_register_skciphers_compat(aria_algs,
+ ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+static void __exit aria_avx2_exit(void)
+{
+ simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+module_init(aria_avx2_init);
+module_exit(aria_avx2_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>");
+MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX2/AES-NI/GFNI optimized");
+MODULE_ALIAS_CRYPTO("aria");
+MODULE_ALIAS_CRYPTO("aria-aesni-avx2");
diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c
index c561ea4fefa5..4e1516b76669 100644
--- a/arch/x86/crypto/aria_aesni_avx_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx_glue.c
@@ -18,21 +18,33 @@
asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst,
const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_encrypt_16way);
asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst,
const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_decrypt_16way);
asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst,
const u8 *src,
u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_ctr_crypt_16way);
+#ifdef CONFIG_AS_GFNI
asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst,
const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_encrypt_16way);
asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst,
const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_decrypt_16way);
asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst,
const u8 *src,
u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_ctr_crypt_16way);
+#endif /* CONFIG_AS_GFNI */
static struct aria_avx_ops aria_ops;
+struct aria_avx_request_ctx {
+ u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE];
+};
+
static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
{
ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
@@ -73,6 +85,7 @@ static int aria_avx_set_key(struct crypto_skcipher *tfm, const u8 *key,
static int aria_avx_ctr_encrypt(struct skcipher_request *req)
{
+ struct aria_avx_request_ctx *req_ctx = skcipher_request_ctx(req);
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
@@ -86,10 +99,9 @@ static int aria_avx_ctr_encrypt(struct skcipher_request *req)
u8 *dst = walk.dst.virt.addr;
while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
- u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE];
-
kernel_fpu_begin();
- aria_ops.aria_ctr_crypt_16way(ctx, dst, src, keystream,
+ aria_ops.aria_ctr_crypt_16way(ctx, dst, src,
+ &req_ctx->keystream[0],
walk.iv);
kernel_fpu_end();
dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
@@ -98,28 +110,29 @@ static int aria_avx_ctr_encrypt(struct skcipher_request *req)
}
while (nbytes >= ARIA_BLOCK_SIZE) {
- u8 keystream[ARIA_BLOCK_SIZE];
-
- memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE);
+ memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE);
crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
- aria_encrypt(ctx, keystream, keystream);
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
- crypto_xor_cpy(dst, src, keystream, ARIA_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ ARIA_BLOCK_SIZE);
dst += ARIA_BLOCK_SIZE;
src += ARIA_BLOCK_SIZE;
nbytes -= ARIA_BLOCK_SIZE;
}
if (walk.nbytes == walk.total && nbytes > 0) {
- u8 keystream[ARIA_BLOCK_SIZE];
-
- memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE);
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
- aria_encrypt(ctx, keystream, keystream);
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
- crypto_xor_cpy(dst, src, keystream, nbytes);
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ nbytes);
dst += nbytes;
src += nbytes;
nbytes = 0;
@@ -130,6 +143,13 @@ static int aria_avx_ctr_encrypt(struct skcipher_request *req)
return err;
}
+static int aria_avx_init_tfm(struct crypto_skcipher *tfm)
+{
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct aria_avx_request_ctx));
+
+ return 0;
+}
+
static struct skcipher_alg aria_algs[] = {
{
.base.cra_name = "__ecb(aria)",
@@ -160,6 +180,7 @@ static struct skcipher_alg aria_algs[] = {
.setkey = aria_avx_set_key,
.encrypt = aria_avx_ctr_encrypt,
.decrypt = aria_avx_ctr_encrypt,
+ .init = aria_avx_init_tfm,
}
};
@@ -182,7 +203,7 @@ static int __init aria_avx_init(void)
return -ENODEV;
}
- if (boot_cpu_has(X86_FEATURE_GFNI)) {
+ if (boot_cpu_has(X86_FEATURE_GFNI) && IS_ENABLED(CONFIG_AS_GFNI)) {
aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way;
aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way;
diff --git a/arch/x86/crypto/aria_gfni_avx512_glue.c b/arch/x86/crypto/aria_gfni_avx512_glue.c
new file mode 100644
index 000000000000..f4a2208d2638
--- /dev/null
+++ b/arch/x86/crypto/aria_gfni_avx512_glue.c
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Glue Code for the AVX512/GFNI assembler implementation of the ARIA Cipher
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/aria.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+#include "aria-avx.h"
+
+asmlinkage void aria_gfni_avx512_encrypt_64way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_gfni_avx512_decrypt_64way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_gfni_avx512_ctr_crypt_64way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+
+static struct aria_avx_ops aria_ops;
+
+struct aria_avx512_request_ctx {
+ u8 keystream[ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE];
+};
+
+static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_GFNI_AVX512_PARALLEL_BLOCKS, aria_ops.aria_encrypt_64way);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_encrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way);
+ ECB_BLOCK(1, aria_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_GFNI_AVX512_PARALLEL_BLOCKS, aria_ops.aria_decrypt_64way);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_decrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way);
+ ECB_BLOCK(1, aria_decrypt);
+ ECB_WALK_END();
+}
+
+static int aria_avx512_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_encrypt(req, ctx->enc_key[0]);
+}
+
+static int aria_avx512_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_decrypt(req, ctx->dec_key[0]);
+}
+
+static int aria_avx512_set_key(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return aria_set_key(&tfm->base, key, keylen);
+}
+
+static int aria_avx512_ctr_encrypt(struct skcipher_request *req)
+{
+ struct aria_avx512_request_ctx *req_ctx = skcipher_request_ctx(req);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_64way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE;
+ src += ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_32way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_16way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_BLOCK_SIZE) {
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ ARIA_BLOCK_SIZE);
+ dst += ARIA_BLOCK_SIZE;
+ src += ARIA_BLOCK_SIZE;
+ nbytes -= ARIA_BLOCK_SIZE;
+ }
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int aria_avx512_init_tfm(struct crypto_skcipher *tfm)
+{
+ crypto_skcipher_set_reqsize(tfm,
+ sizeof(struct aria_avx512_request_ctx));
+
+ return 0;
+}
+
+static struct skcipher_alg aria_algs[] = {
+ {
+ .base.cra_name = "__ecb(aria)",
+ .base.cra_driver_name = "__ecb-aria-avx512",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = ARIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .setkey = aria_avx512_set_key,
+ .encrypt = aria_avx512_ecb_encrypt,
+ .decrypt = aria_avx512_ecb_decrypt,
+ }, {
+ .base.cra_name = "__ctr(aria)",
+ .base.cra_driver_name = "__ctr-aria-avx512",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL |
+ CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .ivsize = ARIA_BLOCK_SIZE,
+ .chunksize = ARIA_BLOCK_SIZE,
+ .setkey = aria_avx512_set_key,
+ .encrypt = aria_avx512_ctr_encrypt,
+ .decrypt = aria_avx512_ctr_encrypt,
+ .init = aria_avx512_init_tfm,
+ }
+};
+
+static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
+
+static int __init aria_avx512_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AVX512F) ||
+ !boot_cpu_has(X86_FEATURE_AVX512VL) ||
+ !boot_cpu_has(X86_FEATURE_GFNI) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX512/GFNI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way;
+ aria_ops.aria_encrypt_32way = aria_aesni_avx2_gfni_encrypt_32way;
+ aria_ops.aria_decrypt_32way = aria_aesni_avx2_gfni_decrypt_32way;
+ aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_gfni_ctr_crypt_32way;
+ aria_ops.aria_encrypt_64way = aria_gfni_avx512_encrypt_64way;
+ aria_ops.aria_decrypt_64way = aria_gfni_avx512_decrypt_64way;
+ aria_ops.aria_ctr_crypt_64way = aria_gfni_avx512_ctr_crypt_64way;
+
+ return simd_register_skciphers_compat(aria_algs,
+ ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+static void __exit aria_avx512_exit(void)
+{
+ simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+module_init(aria_avx512_init);
+module_exit(aria_avx512_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>");
+MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX512/GFNI optimized");
+MODULE_ALIAS_CRYPTO("aria");
+MODULE_ALIAS_CRYPTO("aria-gfni-avx512");
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 4a43e072d2d1..e88c8e4f013c 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -6,7 +6,6 @@
*/
#include <linux/linkage.h>
-#include <linux/cfi_types.h>
.file "blowfish-x86_64-asm.S"
.text
@@ -100,16 +99,11 @@
bswapq RX0; \
movq RX0, (RIO);
-#define xor_block() \
- bswapq RX0; \
- xorq RX0, (RIO);
-
-SYM_FUNC_START(__blowfish_enc_blk)
+SYM_FUNC_START(blowfish_enc_blk)
/* input:
* %rdi: ctx
* %rsi: dst
* %rdx: src
- * %rcx: bool, if true: xor output
*/
movq %r12, %r11;
@@ -130,19 +124,13 @@ SYM_FUNC_START(__blowfish_enc_blk)
add_roundkey_enc(16);
movq %r11, %r12;
-
movq %r10, RIO;
- test %cl, %cl;
- jnz .L__enc_xor;
write_block();
RET;
-.L__enc_xor:
- xor_block();
- RET;
-SYM_FUNC_END(__blowfish_enc_blk)
+SYM_FUNC_END(blowfish_enc_blk)
-SYM_TYPED_FUNC_START(blowfish_dec_blk)
+SYM_FUNC_START(blowfish_dec_blk)
/* input:
* %rdi: ctx
* %rsi: dst
@@ -272,28 +260,26 @@ SYM_FUNC_END(blowfish_dec_blk)
movq RX3, 24(RIO);
#define xor_block4() \
- bswapq RX0; \
- xorq RX0, (RIO); \
+ movq (RIO), RT0; \
+ bswapq RT0; \
+ xorq RT0, RX1; \
\
- bswapq RX1; \
- xorq RX1, 8(RIO); \
+ movq 8(RIO), RT2; \
+ bswapq RT2; \
+ xorq RT2, RX2; \
\
- bswapq RX2; \
- xorq RX2, 16(RIO); \
- \
- bswapq RX3; \
- xorq RX3, 24(RIO);
+ movq 16(RIO), RT3; \
+ bswapq RT3; \
+ xorq RT3, RX3;
-SYM_FUNC_START(__blowfish_enc_blk_4way)
+SYM_FUNC_START(blowfish_enc_blk_4way)
/* input:
* %rdi: ctx
* %rsi: dst
* %rdx: src
- * %rcx: bool, if true: xor output
*/
pushq %r12;
pushq %rbx;
- pushq %rcx;
movq %rdi, CTX
movq %rsi, %r11;
@@ -313,37 +299,28 @@ SYM_FUNC_START(__blowfish_enc_blk_4way)
round_enc4(14);
add_preloaded_roundkey4();
- popq %r12;
movq %r11, RIO;
-
- test %r12b, %r12b;
- jnz .L__enc_xor4;
-
write_block4();
popq %rbx;
popq %r12;
RET;
+SYM_FUNC_END(blowfish_enc_blk_4way)
-.L__enc_xor4:
- xor_block4();
-
- popq %rbx;
- popq %r12;
- RET;
-SYM_FUNC_END(__blowfish_enc_blk_4way)
-
-SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
+SYM_FUNC_START(__blowfish_dec_blk_4way)
/* input:
* %rdi: ctx
* %rsi: dst
* %rdx: src
+ * %rcx: cbc (bool)
*/
pushq %r12;
pushq %rbx;
+ pushq %rcx;
+ pushq %rdx;
movq %rdi, CTX;
- movq %rsi, %r11
+ movq %rsi, %r11;
movq %rdx, RIO;
preload_roundkey_dec(17);
@@ -359,6 +336,14 @@ SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
round_dec4(3);
add_preloaded_roundkey4();
+ popq RIO;
+ popq %r12;
+ testq %r12, %r12;
+ jz .L_no_cbc_xor;
+
+ xor_block4();
+
+.L_no_cbc_xor:
movq %r11, RIO;
write_block4();
@@ -366,4 +351,4 @@ SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
popq %r12;
RET;
-SYM_FUNC_END(blowfish_dec_blk_4way)
+SYM_FUNC_END(__blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 019c64c1340a..552f2df0643f 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -16,26 +16,28 @@
#include <linux/module.h>
#include <linux/types.h>
+#include "ecb_cbc_helpers.h"
+
/* regular block cipher functions */
-asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
- bool xor);
+asmlinkage void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
/* 4-way parallel cipher functions */
-asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
+asmlinkage void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src);
+asmlinkage void __blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src, bool cbc);
-static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+static inline void blowfish_dec_ecb_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
{
- __blowfish_enc_blk(ctx, dst, src, false);
+ return __blowfish_dec_blk_4way(ctx, dst, src, false);
}
-static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void blowfish_dec_cbc_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
{
- __blowfish_enc_blk_4way(ctx, dst, src, false);
+ return __blowfish_dec_blk_4way(ctx, dst, src, true);
}
static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
@@ -54,183 +56,35 @@ static int blowfish_setkey_skcipher(struct crypto_skcipher *tfm,
return blowfish_setkey(&tfm->base, key, keylen);
}
-static int ecb_crypt(struct skcipher_request *req,
- void (*fn)(struct bf_ctx *, u8 *, const u8 *),
- void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *))
-{
- unsigned int bsize = BF_BLOCK_SIZE;
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- u8 *wsrc = walk.src.virt.addr;
- u8 *wdst = walk.dst.virt.addr;
-
- /* Process four block batch */
- if (nbytes >= bsize * 4) {
- do {
- fn_4way(ctx, wdst, wsrc);
-
- wsrc += bsize * 4;
- wdst += bsize * 4;
- nbytes -= bsize * 4;
- } while (nbytes >= bsize * 4);
-
- if (nbytes < bsize)
- goto done;
- }
-
- /* Handle leftovers */
- do {
- fn(ctx, wdst, wsrc);
-
- wsrc += bsize;
- wdst += bsize;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
-done:
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return ecb_crypt(req, blowfish_enc_blk, blowfish_enc_blk_4way);
+ ECB_WALK_START(req, BF_BLOCK_SIZE, -1);
+ ECB_BLOCK(4, blowfish_enc_blk_4way);
+ ECB_BLOCK(1, blowfish_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return ecb_crypt(req, blowfish_dec_blk, blowfish_dec_blk_4way);
-}
-
-static unsigned int __cbc_encrypt(struct bf_ctx *ctx,
- struct skcipher_walk *walk)
-{
- unsigned int bsize = BF_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u64 *src = (u64 *)walk->src.virt.addr;
- u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 *iv = (u64 *)walk->iv;
-
- do {
- *dst = *src ^ *iv;
- blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
- iv = dst;
-
- src += 1;
- dst += 1;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
- *(u64 *)walk->iv = *iv;
- return nbytes;
+ ECB_WALK_START(req, BF_BLOCK_SIZE, -1);
+ ECB_BLOCK(4, blowfish_dec_ecb_4way);
+ ECB_BLOCK(1, blowfish_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while (walk.nbytes) {
- nbytes = __cbc_encrypt(ctx, &walk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
-static unsigned int __cbc_decrypt(struct bf_ctx *ctx,
- struct skcipher_walk *walk)
-{
- unsigned int bsize = BF_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u64 *src = (u64 *)walk->src.virt.addr;
- u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 ivs[4 - 1];
- u64 last_iv;
-
- /* Start of the last block. */
- src += nbytes / bsize - 1;
- dst += nbytes / bsize - 1;
-
- last_iv = *src;
-
- /* Process four block batch */
- if (nbytes >= bsize * 4) {
- do {
- nbytes -= bsize * 4 - bsize;
- src -= 4 - 1;
- dst -= 4 - 1;
-
- ivs[0] = src[0];
- ivs[1] = src[1];
- ivs[2] = src[2];
-
- blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
-
- dst[1] ^= ivs[0];
- dst[2] ^= ivs[1];
- dst[3] ^= ivs[2];
-
- nbytes -= bsize;
- if (nbytes < bsize)
- goto done;
-
- *dst ^= *(src - 1);
- src -= 1;
- dst -= 1;
- } while (nbytes >= bsize * 4);
- }
-
- /* Handle leftovers */
- for (;;) {
- blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
-
- nbytes -= bsize;
- if (nbytes < bsize)
- break;
-
- *dst ^= *(src - 1);
- src -= 1;
- dst -= 1;
- }
-
-done:
- *dst ^= *(u64 *)walk->iv;
- *(u64 *)walk->iv = last_iv;
-
- return nbytes;
+ CBC_WALK_START(req, BF_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(blowfish_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while (walk.nbytes) {
- nbytes = __cbc_decrypt(ctx, &walk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
+ CBC_WALK_START(req, BF_BLOCK_SIZE, -1);
+ CBC_DEC_BLOCK(4, blowfish_dec_cbc_4way);
+ CBC_DEC_BLOCK(1, blowfish_dec_blk);
+ CBC_WALK_END();
}
static struct crypto_alg bf_cipher_alg = {
diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h
index eaa15c7b29d6..11955bd01af1 100644
--- a/arch/x86/crypto/ecb_cbc_helpers.h
+++ b/arch/x86/crypto/ecb_cbc_helpers.h
@@ -13,13 +13,14 @@
#define ECB_WALK_START(req, bsize, fpu_blocks) do { \
void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); \
+ const int __fpu_blocks = (fpu_blocks); \
const int __bsize = (bsize); \
struct skcipher_walk walk; \
int err = skcipher_walk_virt(&walk, (req), false); \
while (walk.nbytes > 0) { \
unsigned int nbytes = walk.nbytes; \
- bool do_fpu = (fpu_blocks) != -1 && \
- nbytes >= (fpu_blocks) * __bsize; \
+ bool do_fpu = __fpu_blocks != -1 && \
+ nbytes >= __fpu_blocks * __bsize; \
const u8 *src = walk.src.virt.addr; \
u8 *dst = walk.dst.virt.addr; \
u8 __maybe_unused buf[(bsize)]; \
@@ -35,7 +36,12 @@
} while (0)
#define ECB_BLOCK(blocks, func) do { \
- while (nbytes >= (blocks) * __bsize) { \
+ const int __blocks = (blocks); \
+ if (do_fpu && __blocks < __fpu_blocks) { \
+ kernel_fpu_end(); \
+ do_fpu = false; \
+ } \
+ while (nbytes >= __blocks * __bsize) { \
(func)(ctx, dst, src); \
ECB_WALK_ADVANCE(blocks); \
} \
@@ -53,7 +59,12 @@
} while (0)
#define CBC_DEC_BLOCK(blocks, func) do { \
- while (nbytes >= (blocks) * __bsize) { \
+ const int __blocks = (blocks); \
+ if (do_fpu && __blocks < __fpu_blocks) { \
+ kernel_fpu_end(); \
+ do_fpu = false; \
+ } \
+ while (nbytes >= __blocks * __bsize) { \
const u8 *__iv = src + ((blocks) - 1) * __bsize; \
if (dst == src) \
__iv = memcpy(buf, __iv, __bsize); \
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 2bf871899920..257ed9446f3e 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -4,7 +4,7 @@
* instructions. This file contains accelerated part of ghash
* implementation. More information about PCLMULQDQ can be found at:
*
- * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ * https://www.intel.com/content/dam/develop/external/us/en/documents/clmul-wp-rev-2-02-2014-04-20.pdf
*
* Copyright (c) 2009 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
@@ -88,7 +88,7 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
RET
SYM_FUNC_END(__clmul_gf128mul_ble)
-/* void clmul_ghash_mul(char *dst, const u128 *shash) */
+/* void clmul_ghash_mul(char *dst, const le128 *shash) */
SYM_FUNC_START(clmul_ghash_mul)
FRAME_BEGIN
movups (%rdi), DATA
@@ -104,7 +104,7 @@ SYM_FUNC_END(clmul_ghash_mul)
/*
* void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- * const u128 *shash);
+ * const le128 *shash);
*/
SYM_FUNC_START(clmul_ghash_update)
FRAME_BEGIN
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 1f1a95f3dd0c..700ecaee9a08 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -19,21 +19,22 @@
#include <crypto/internal/simd.h>
#include <asm/cpu_device_id.h>
#include <asm/simd.h>
+#include <asm/unaligned.h>
#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
-void clmul_ghash_mul(char *dst, const u128 *shash);
+void clmul_ghash_mul(char *dst, const le128 *shash);
void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- const u128 *shash);
+ const le128 *shash);
struct ghash_async_ctx {
struct cryptd_ahash *cryptd_tfm;
};
struct ghash_ctx {
- u128 shash;
+ le128 shash;
};
struct ghash_desc_ctx {
@@ -54,22 +55,40 @@ static int ghash_setkey(struct crypto_shash *tfm,
const u8 *key, unsigned int keylen)
{
struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
- be128 *x = (be128 *)key;
u64 a, b;
if (keylen != GHASH_BLOCK_SIZE)
return -EINVAL;
- /* perform multiplication by 'x' in GF(2^128) */
- a = be64_to_cpu(x->a);
- b = be64_to_cpu(x->b);
-
- ctx->shash.a = (b << 1) | (a >> 63);
- ctx->shash.b = (a << 1) | (b >> 63);
-
+ /*
+ * GHASH maps bits to polynomial coefficients backwards, which makes it
+ * hard to implement. But it can be shown that the GHASH multiplication
+ *
+ * D * K (mod x^128 + x^7 + x^2 + x + 1)
+ *
+ * (where D is a data block and K is the key) is equivalent to:
+ *
+ * bitreflect(D) * bitreflect(K) * x^(-127)
+ * (mod x^128 + x^127 + x^126 + x^121 + 1)
+ *
+ * So, the code below precomputes:
+ *
+ * bitreflect(K) * x^(-127) (mod x^128 + x^127 + x^126 + x^121 + 1)
+ *
+ * ... but in Montgomery form (so that Montgomery multiplication can be
+ * used), i.e. with an extra x^128 factor, which means actually:
+ *
+ * bitreflect(K) * x (mod x^128 + x^127 + x^126 + x^121 + 1)
+ *
+ * The within-a-byte part of bitreflect() cancels out GHASH's built-in
+ * reflection, and thus bitreflect() is actually a byteswap.
+ */
+ a = get_unaligned_be64(key);
+ b = get_unaligned_be64(key + 8);
+ ctx->shash.a = cpu_to_le64((a << 1) | (b >> 63));
+ ctx->shash.b = cpu_to_le64((b << 1) | (a >> 63));
if (a >> 63)
- ctx->shash.b ^= ((u64)0xc2) << 56;
-
+ ctx->shash.a ^= cpu_to_le64((u64)0xc2 << 56);
return 0;
}
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 15739a2c0983..7ecd2aeeeffc 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -782,7 +782,7 @@ _ASM_NOKPROBE(common_interrupt_return)
/*
* Reload gs selector with exception handling
- * edi: new selector
+ * di: new selector
*
* Is in entry.text as it shouldn't be instrumented.
*/
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 838613ac15b8..1506a22a4fb6 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -29,7 +29,7 @@ VDSO32-$(CONFIG_IA32_EMULATION) := y
# files to link into the vdso
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
-vobjs32-y += vdso32/vclock_gettime.o
+vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o
vobjs-$(CONFIG_X86_SGX) += vsgx.o
# files to link into kernel
@@ -104,6 +104,7 @@ $(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
CFLAGS_REMOVE_vclock_gettime.o = -pg
CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg
CFLAGS_REMOVE_vsgx.o = -pg
#
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 5264daa8859f..67b3e37576a6 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -179,6 +179,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
fprintf(outfile, "#include <linux/linkage.h>\n");
+ fprintf(outfile, "#include <linux/init.h>\n");
fprintf(outfile, "#include <asm/page_types.h>\n");
fprintf(outfile, "#include <asm/vdso.h>\n");
fprintf(outfile, "\n");
@@ -218,5 +219,10 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
required_syms[i].name, (int64_t)syms[i]);
}
+ fprintf(outfile, "};\n\n");
+ fprintf(outfile, "static __init int init_%s(void) {\n", image_name);
+ fprintf(outfile, "\treturn init_vdso_image(&%s);\n", image_name);
fprintf(outfile, "};\n");
+ fprintf(outfile, "subsys_initcall(init_%s);\n", image_name);
+
}
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 43842fade8fa..3b300a773c7e 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -51,17 +51,8 @@ __setup("vdso32=", vdso32_setup);
__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
#endif
-int __init sysenter_setup(void)
-{
- init_vdso_image(&vdso_image_32);
-
- return 0;
-}
-
#ifdef CONFIG_X86_64
-subsys_initcall(sysenter_setup);
-
#ifdef CONFIG_SYSCTL
/* Register vsyscall32 into the ABI table */
#include <linux/sysctl.h>
diff --git a/arch/x86/entry/vdso/vdso32/fake_32bit_build.h b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
new file mode 100644
index 000000000000..db1b15f686e3
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_X86_64
+
+/*
+ * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel
+ * configuration
+ */
+#undef CONFIG_64BIT
+#undef CONFIG_X86_64
+#undef CONFIG_COMPAT
+#undef CONFIG_PGTABLE_LEVELS
+#undef CONFIG_ILLEGAL_POINTER_VALUE
+#undef CONFIG_SPARSEMEM_VMEMMAP
+#undef CONFIG_NR_CPUS
+#undef CONFIG_PARAVIRT_XXL
+
+#define CONFIG_X86_32 1
+#define CONFIG_PGTABLE_LEVELS 2
+#define CONFIG_PAGE_OFFSET 0
+#define CONFIG_ILLEGAL_POINTER_VALUE 0
+#define CONFIG_NR_CPUS 1
+
+#define BUILD_VDSO32_64
+
+#endif
diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
index 283ed9d00426..86981decfea8 100644
--- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -1,29 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
#define BUILD_VDSO32
-
-#ifdef CONFIG_X86_64
-
-/*
- * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel
- * configuration
- */
-#undef CONFIG_64BIT
-#undef CONFIG_X86_64
-#undef CONFIG_COMPAT
-#undef CONFIG_PGTABLE_LEVELS
-#undef CONFIG_ILLEGAL_POINTER_VALUE
-#undef CONFIG_SPARSEMEM_VMEMMAP
-#undef CONFIG_NR_CPUS
-#undef CONFIG_PARAVIRT_XXL
-
-#define CONFIG_X86_32 1
-#define CONFIG_PGTABLE_LEVELS 2
-#define CONFIG_PAGE_OFFSET 0
-#define CONFIG_ILLEGAL_POINTER_VALUE 0
-#define CONFIG_NR_CPUS 1
-
-#define BUILD_VDSO32_64
-
-#endif
-
+#include "fake_32bit_build.h"
#include "../vclock_gettime.c"
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index c7720995ab1a..8a3be07006bb 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -28,6 +28,7 @@ VERSION
__vdso_time;
__vdso_clock_getres;
__vdso_clock_gettime64;
+ __vdso_getcpu;
};
LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdso32/vgetcpu.c b/arch/x86/entry/vdso/vdso32/vgetcpu.c
new file mode 100644
index 000000000000..3a9791f5e998
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vgetcpu.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "fake_32bit_build.h"
+#include "../vgetcpu.c"
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index b88a82bbc359..0a9007c24056 100644
--- a/arch/x86/entry/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -7,8 +7,7 @@
#include <linux/kernel.h>
#include <linux/getcpu.h>
-#include <linux/time.h>
-#include <asm/vgtod.h>
+#include <asm/segment.h>
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index b8f3f9b9e53c..11a5c68d1218 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -44,13 +44,16 @@ unsigned int vclocks_used __read_mostly;
unsigned int __read_mostly vdso64_enabled = 1;
#endif
-void __init init_vdso_image(const struct vdso_image *image)
+int __init init_vdso_image(const struct vdso_image *image)
{
+ BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32);
BUG_ON(image->size % PAGE_SIZE != 0);
apply_alternatives((struct alt_instr *)(image->data + image->alt),
(struct alt_instr *)(image->data + image->alt +
image->alt_len));
+
+ return 0;
}
static const struct vm_special_mapping vvar_mapping;
@@ -113,10 +116,8 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
- unsigned long size = vma->vm_end - vma->vm_start;
-
if (vma_is_special_mapping(vma, &vvar_mapping))
- zap_page_range(vma, vma->vm_start, size);
+ zap_vma_pages(vma);
}
mmap_read_unlock(mm);
@@ -418,18 +419,4 @@ static __init int vdso_setup(char *s)
return 1;
}
__setup("vdso=", vdso_setup);
-
-static int __init init_vdso(void)
-{
- BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32);
-
- init_vdso_image(&vdso_image_64);
-
-#ifdef CONFIG_X86_X32_ABI
- init_vdso_image(&vdso_image_x32);
-#endif
-
- return 0;
-}
-subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 4af81df133ee..d234ca797e4a 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -391,7 +391,7 @@ void __init map_vsyscall(void)
}
if (vsyscall_mode == XONLY)
- gate_vma.vm_flags = VM_EXEC;
+ vm_flags_init(&gate_vma, VM_EXEC);
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
index 58461fa18b6f..ed308719236c 100644
--- a/arch/x86/events/amd/brs.c
+++ b/arch/x86/events/amd/brs.c
@@ -41,18 +41,15 @@ static inline unsigned int brs_to(int idx)
return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1;
}
-static inline void set_debug_extn_cfg(u64 val)
+static __always_inline void set_debug_extn_cfg(u64 val)
{
/* bits[4:3] must always be set to 11b */
- wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3);
+ __wrmsr(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3, val >> 32);
}
-static inline u64 get_debug_extn_cfg(void)
+static __always_inline u64 get_debug_extn_cfg(void)
{
- u64 val;
-
- rdmsrl(MSR_AMD_DBG_EXTN_CFG, val);
- return val;
+ return __rdmsr(MSR_AMD_DBG_EXTN_CFG);
}
static bool __init amd_brs_detect(void)
@@ -405,7 +402,7 @@ void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_i
* called from ACPI processor_idle.c or acpi_pad.c
* with interrupts disabled
*/
-void perf_amd_brs_lopwr_cb(bool lopwr_in)
+void noinstr perf_amd_brs_lopwr_cb(bool lopwr_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
union amd_debug_extn_cfg cfg;
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 4386b10682ce..8c45b198b62f 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -928,10 +928,8 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
- if (has_branch_stack(event)) {
- data.br_stack = &cpuc->lbr_stack;
- data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
- }
+ if (has_branch_stack(event))
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index da3f5ebac4e1..64582954b5f6 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -1110,8 +1110,7 @@ fail:
.data = ibs_data.data,
},
};
- data.raw = &raw;
- data.sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(&data, &raw);
}
if (perf_ibs == &perf_ibs_op)
@@ -1122,10 +1121,8 @@ fail:
* recorded as part of interrupt regs. Thus we need to use rip from
* interrupt regs while unwinding call stack.
*/
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
- data.callchain = perf_callchain(event, iregs);
- data.sample_flags |= PERF_SAMPLE_CALLCHAIN;
- }
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(&data, event, iregs);
throttle = perf_event_overflow(event, &data, &regs);
out:
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 85a63a41c471..d096b04bf80e 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2974,17 +2974,19 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
- if (!x86_pmu_initialized()) {
+ /* This API doesn't currently support enumerating hybrid PMUs. */
+ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) ||
+ !x86_pmu_initialized()) {
memset(cap, 0, sizeof(*cap));
return;
}
- cap->version = x86_pmu.version;
/*
- * KVM doesn't support the hybrid PMU yet.
- * Return the common value in global x86_pmu,
- * which available for all cores.
+ * Note, hybrid CPU models get tracked as having hybrid PMUs even when
+ * all E-cores are disabled via BIOS. When E-cores are disabled, the
+ * base PMU holds the correct number of counters for P-cores.
*/
+ cap->version = x86_pmu.version;
cap->num_counters_gp = x86_pmu.num_counters;
cap->num_counters_fixed = x86_pmu.num_counters_fixed;
cap->bit_width_gp = x86_pmu.cntval_bits;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index dfd2c124cdf8..a3fb996a86a1 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2119,6 +2119,16 @@ static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_cmt_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff3ffffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff3ffffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0),
+ INTEL_UEVENT_EXTRA_REG(0x0127, MSR_SNOOP_RSP_0, 0xffffffffffffffffull, SNOOP_0),
+ INTEL_UEVENT_EXTRA_REG(0x0227, MSR_SNOOP_RSP_1, 0xffffffffffffffffull, SNOOP_1),
+ EVENT_EXTRA_END
+};
+
#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
#define KNL_MCDRAM_LOCAL BIT_ULL(21)
@@ -3026,10 +3036,8 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
perf_sample_data_init(&data, 0, event->hw.last_period);
- if (has_branch_stack(event)) {
- data.br_stack = &cpuc->lbr_stack;
- data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
- }
+ if (has_branch_stack(event))
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -4182,6 +4190,12 @@ static int hsw_hw_config(struct perf_event *event)
static struct event_constraint counter0_constraint =
INTEL_ALL_EVENT_CONSTRAINT(0, 0x1);
+static struct event_constraint counter1_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x2);
+
+static struct event_constraint counter0_1_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x3);
+
static struct event_constraint counter2_constraint =
EVENT_CONSTRAINT(0, 0x4, 0);
@@ -4191,6 +4205,12 @@ static struct event_constraint fixed0_constraint =
static struct event_constraint fixed0_counter0_constraint =
INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000001ULL);
+static struct event_constraint fixed0_counter0_1_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000003ULL);
+
+static struct event_constraint counters_1_7_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xfeULL);
+
static struct event_constraint *
hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
@@ -4322,6 +4342,78 @@ adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
return &emptyconstraint;
}
+static struct event_constraint *
+cmt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * The :ppp indicates the Precise Distribution (PDist) facility, which
+ * is only supported on the GP counter 0 & 1 and Fixed counter 0.
+ * If a :ppp event which is not available on the above eligible counters,
+ * error out.
+ */
+ if (event->attr.precise_ip == 3) {
+ /* Force instruction:ppp on PMC0, 1 and Fixed counter 0 */
+ if (constraint_match(&fixed0_constraint, event->hw.config))
+ return &fixed0_counter0_1_constraint;
+
+ switch (c->idxmsk64 & 0x3ull) {
+ case 0x1:
+ return &counter0_constraint;
+ case 0x2:
+ return &counter1_constraint;
+ case 0x3:
+ return &counter0_1_constraint;
+ }
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+rwc_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = spr_get_event_constraints(cpuc, idx, event);
+
+ /* The Retire Latency is not supported by the fixed counter 0. */
+ if (event->attr.precise_ip &&
+ (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
+ constraint_match(&fixed0_constraint, event->hw.config)) {
+ /*
+ * The Instruction PDIR is only available
+ * on the fixed counter 0. Error out for this case.
+ */
+ if (event->attr.precise_ip == 3)
+ return &emptyconstraint;
+ return &counters_1_7_constraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+mtl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->cpu_type == hybrid_big)
+ return rwc_get_event_constraints(cpuc, idx, event);
+ if (pmu->cpu_type == hybrid_small)
+ return cmt_get_event_constraints(cpuc, idx, event);
+
+ WARN_ON(1);
+ return &emptyconstraint;
+}
+
static int adl_hw_config(struct perf_event *event)
{
struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
@@ -4494,6 +4586,25 @@ static void flip_smm_bit(void *data)
}
}
+static void intel_pmu_check_num_counters(int *num_counters,
+ int *num_counters_fixed,
+ u64 *intel_ctrl, u64 fixed_mask);
+
+static void update_pmu_cap(struct x86_hybrid_pmu *pmu)
+{
+ unsigned int sub_bitmaps = cpuid_eax(ARCH_PERFMON_EXT_LEAF);
+ unsigned int eax, ebx, ecx, edx;
+
+ if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF_BIT) {
+ cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF,
+ &eax, &ebx, &ecx, &edx);
+ pmu->num_counters = fls(eax);
+ pmu->num_counters_fixed = fls(ebx);
+ intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed,
+ &pmu->intel_ctrl, ebx);
+ }
+}
+
static bool init_hybrid_pmu(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -4519,6 +4630,9 @@ static bool init_hybrid_pmu(int cpu)
if (!cpumask_empty(&pmu->supported_cpus))
goto end;
+ if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT))
+ update_pmu_cap(pmu);
+
if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed))
return false;
@@ -5463,6 +5577,12 @@ static struct attribute *adl_hybrid_mem_attrs[] = {
NULL,
};
+static struct attribute *mtl_hybrid_mem_attrs[] = {
+ EVENT_PTR(mem_ld_adl),
+ EVENT_PTR(mem_st_adl),
+ NULL
+};
+
EVENT_ATTR_STR_HYBRID(tx-start, tx_start_adl, "event=0xc9,umask=0x1", hybrid_big);
EVENT_ATTR_STR_HYBRID(tx-commit, tx_commit_adl, "event=0xc9,umask=0x2", hybrid_big);
EVENT_ATTR_STR_HYBRID(tx-abort, tx_abort_adl, "event=0xc9,umask=0x4", hybrid_big);
@@ -5490,20 +5610,40 @@ FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small);
FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small);
FORMAT_ATTR_HYBRID(frontend, hybrid_big);
+#define ADL_HYBRID_RTM_FORMAT_ATTR \
+ FORMAT_HYBRID_PTR(in_tx), \
+ FORMAT_HYBRID_PTR(in_tx_cp)
+
+#define ADL_HYBRID_FORMAT_ATTR \
+ FORMAT_HYBRID_PTR(offcore_rsp), \
+ FORMAT_HYBRID_PTR(ldlat), \
+ FORMAT_HYBRID_PTR(frontend)
+
static struct attribute *adl_hybrid_extra_attr_rtm[] = {
- FORMAT_HYBRID_PTR(in_tx),
- FORMAT_HYBRID_PTR(in_tx_cp),
- FORMAT_HYBRID_PTR(offcore_rsp),
- FORMAT_HYBRID_PTR(ldlat),
- FORMAT_HYBRID_PTR(frontend),
- NULL,
+ ADL_HYBRID_RTM_FORMAT_ATTR,
+ ADL_HYBRID_FORMAT_ATTR,
+ NULL
};
static struct attribute *adl_hybrid_extra_attr[] = {
- FORMAT_HYBRID_PTR(offcore_rsp),
- FORMAT_HYBRID_PTR(ldlat),
- FORMAT_HYBRID_PTR(frontend),
- NULL,
+ ADL_HYBRID_FORMAT_ATTR,
+ NULL
+};
+
+PMU_FORMAT_ATTR_SHOW(snoop_rsp, "config1:0-63");
+FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small);
+
+static struct attribute *mtl_hybrid_extra_attr_rtm[] = {
+ ADL_HYBRID_RTM_FORMAT_ATTR,
+ ADL_HYBRID_FORMAT_ATTR,
+ FORMAT_HYBRID_PTR(snoop_rsp),
+ NULL
+};
+
+static struct attribute *mtl_hybrid_extra_attr[] = {
+ ADL_HYBRID_FORMAT_ATTR,
+ FORMAT_HYBRID_PTR(snoop_rsp),
+ NULL
};
static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr)
@@ -5725,6 +5865,12 @@ static void intel_pmu_check_hybrid_pmus(u64 fixed_mask)
}
}
+static __always_inline bool is_mtl(u8 x86_model)
+{
+ return (x86_model == INTEL_FAM6_METEORLAKE) ||
+ (x86_model == INTEL_FAM6_METEORLAKE_L);
+}
+
__init int intel_pmu_init(void)
{
struct attribute **extra_skl_attr = &empty_attrs;
@@ -6339,6 +6485,7 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ case INTEL_FAM6_EMERALDRAPIDS_X:
pmem = true;
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids));
@@ -6348,6 +6495,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints;
x86_pmu.extra_regs = intel_spr_extra_regs;
x86_pmu.limit_period = spr_limit_period;
+ x86_pmu.pebs_ept = 1;
x86_pmu.pebs_aliases = NULL;
x86_pmu.pebs_prec_dist = true;
x86_pmu.pebs_block = true;
@@ -6381,6 +6529,8 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_RAPTORLAKE:
case INTEL_FAM6_RAPTORLAKE_P:
case INTEL_FAM6_RAPTORLAKE_S:
+ case INTEL_FAM6_METEORLAKE:
+ case INTEL_FAM6_METEORLAKE_L:
/*
* Alder Lake has 2 types of CPU, core and atom.
*
@@ -6400,9 +6550,7 @@ __init int intel_pmu_init(void)
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
- x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
x86_pmu.lbr_pt_coexist = true;
- intel_pmu_pebs_data_source_adl();
x86_pmu.pebs_latency_data = adl_latency_data_small;
x86_pmu.num_topdown_events = 8;
static_call_update(intel_pmu_update_topdown_event,
@@ -6489,8 +6637,22 @@ __init int intel_pmu_init(void)
pmu->event_constraints = intel_slm_event_constraints;
pmu->pebs_constraints = intel_grt_pebs_event_constraints;
pmu->extra_regs = intel_grt_extra_regs;
- pr_cont("Alderlake Hybrid events, ");
- name = "alderlake_hybrid";
+ if (is_mtl(boot_cpu_data.x86_model)) {
+ x86_pmu.pebs_latency_data = mtl_latency_data_small;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
+ mem_attr = mtl_hybrid_mem_attrs;
+ intel_pmu_pebs_data_source_mtl();
+ x86_pmu.get_event_constraints = mtl_get_event_constraints;
+ pmu->extra_regs = intel_cmt_extra_regs;
+ pr_cont("Meteorlake Hybrid events, ");
+ name = "meteorlake_hybrid";
+ } else {
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+ intel_pmu_pebs_data_source_adl();
+ pr_cont("Alderlake Hybrid events, ");
+ name = "alderlake_hybrid";
+ }
break;
default:
@@ -6605,6 +6767,9 @@ __init int intel_pmu_init(void)
if (is_hybrid())
intel_pmu_check_hybrid_pmus((u64)fixed_mask);
+ if (x86_pmu.intel_cap.pebs_timing_info)
+ x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
+
intel_aux_output_init();
return 0;
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 3019fb1926e3..551741e79e03 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -677,6 +677,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 88e58b6ee73c..a2e566e53076 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2,12 +2,14 @@
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
+#include <linux/sched/clock.h>
#include <asm/cpu_entry_area.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include <asm/io.h>
+#include <asm/timer.h>
#include "../perf_event.h"
@@ -53,6 +55,13 @@ union intel_x86_pebs_dse {
unsigned int st_lat_locked:1;
unsigned int ld_reserved3:26;
};
+ struct {
+ unsigned int mtl_dse:5;
+ unsigned int mtl_locked:1;
+ unsigned int mtl_stlb_miss:1;
+ unsigned int mtl_fwd_blk:1;
+ unsigned int ld_reserved4:24;
+ };
};
@@ -135,6 +144,29 @@ void __init intel_pmu_pebs_data_source_adl(void)
__intel_pmu_pebs_data_source_grt(data_source);
}
+static void __init intel_pmu_pebs_data_source_cmt(u64 *data_source)
+{
+ data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD);
+ data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ data_source[0x0a] = OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, NONE);
+ data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+ data_source[0x0c] = OP_LH | LEVEL(RAM) | REM | P(SNOOPX, FWD);
+ data_source[0x0d] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_mtl(void)
+{
+ u64 *data_source;
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_skl(false, data_source);
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ intel_pmu_pebs_data_source_cmt(data_source);
+}
+
static u64 precise_store_data(u64 status)
{
union intel_x86_pebs_dse dse;
@@ -219,24 +251,19 @@ static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock)
}
/* Retrieve the latency data for e-core of ADL */
-u64 adl_latency_data_small(struct perf_event *event, u64 status)
+static u64 __adl_latency_data_small(struct perf_event *event, u64 status,
+ u8 dse, bool tlb, bool lock, bool blk)
{
- union intel_x86_pebs_dse dse;
u64 val;
WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big);
- dse.val = status;
-
- val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
+ dse &= PERF_PEBS_DATA_SOURCE_MASK;
+ val = hybrid_var(event->pmu, pebs_data_source)[dse];
- /*
- * For the atom core on ADL,
- * bit 4: lock, bit 5: TLB access.
- */
- pebs_set_tlb_lock(&val, dse.ld_locked, dse.ld_stlb_miss);
+ pebs_set_tlb_lock(&val, tlb, lock);
- if (dse.ld_data_blk)
+ if (blk)
val |= P(BLK, DATA);
else
val |= P(BLK, NA);
@@ -244,6 +271,29 @@ u64 adl_latency_data_small(struct perf_event *event, u64 status)
return val;
}
+u64 adl_latency_data_small(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+
+ dse.val = status;
+
+ return __adl_latency_data_small(event, status, dse.ld_dse,
+ dse.ld_locked, dse.ld_stlb_miss,
+ dse.ld_data_blk);
+}
+
+/* Retrieve the latency data for e-core of MTL */
+u64 mtl_latency_data_small(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+
+ dse.val = status;
+
+ return __adl_latency_data_small(event, status, dse.mtl_dse,
+ dse.mtl_stlb_miss, dse.mtl_locked,
+ dse.mtl_fwd_blk);
+}
+
static u64 load_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
@@ -759,7 +809,8 @@ int intel_pmu_drain_bts_buffer(void)
* the sample.
*/
rcu_read_lock();
- perf_prepare_sample(&header, &data, event, &regs);
+ perf_prepare_sample(&data, event, &regs);
+ perf_prepare_header(&header, &data, event, &regs);
if (perf_output_begin(&handle, &data, event,
header.size * (top - base - skip)))
@@ -1519,6 +1570,27 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
return val;
}
+static void setup_pebs_time(struct perf_event *event,
+ struct perf_sample_data *data,
+ u64 tsc)
+{
+ /* Converting to a user-defined clock is not supported yet. */
+ if (event->attr.use_clockid != 0)
+ return;
+
+ /*
+ * Doesn't support the conversion when the TSC is unstable.
+ * The TSC unstable case is a corner case and very unlikely to
+ * happen. If it happens, the TSC in a PEBS record will be
+ * dropped and fall back to perf_event_clock().
+ */
+ if (!using_native_sched_clock() || !sched_clock_stable())
+ return;
+
+ data->time = native_sched_clock_from_tsc(tsc) + __sched_clock_offset;
+ data->sample_flags |= PERF_SAMPLE_TIME;
+}
+
#define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \
PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_DATA_PAGE_SIZE)
@@ -1569,10 +1641,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- data->callchain = perf_callchain(event, iregs);
- data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
- }
+ if (sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, iregs);
/*
* We use the interrupt regs as a base because the PEBS record does not
@@ -1668,16 +1738,11 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
*
* We can only do this for the default trace clock.
*/
- if (x86_pmu.intel_cap.pebs_format >= 3 &&
- event->attr.use_clockid == 0) {
- data->time = native_sched_clock_from_tsc(pebs->tsc);
- data->sample_flags |= PERF_SAMPLE_TIME;
- }
+ if (x86_pmu.intel_cap.pebs_format >= 3)
+ setup_pebs_time(event, data, pebs->tsc);
- if (has_branch_stack(event)) {
- data->br_stack = &cpuc->lbr_stack;
- data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
- }
+ if (has_branch_stack(event))
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
}
static void adaptive_pebs_save_regs(struct pt_regs *regs,
@@ -1705,6 +1770,7 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
#define PEBS_LATENCY_MASK 0xffff
#define PEBS_CACHE_LATENCY_OFFSET 32
+#define PEBS_RETIRE_LATENCY_OFFSET 32
/*
* With adaptive PEBS the layout depends on what fields are configured.
@@ -1735,10 +1801,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
perf_sample_data_init(data, 0, event->hw.last_period);
data->period = event->hw.last_period;
- if (event->attr.use_clockid == 0) {
- data->time = native_sched_clock_from_tsc(basic->tsc);
- data->sample_flags |= PERF_SAMPLE_TIME;
- }
+ setup_pebs_time(event, data, basic->tsc);
/*
* We must however always use iregs for the unwinder to stay sane; the
@@ -1746,16 +1809,17 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- data->callchain = perf_callchain(event, iregs);
- data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
- }
+ if (sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, iregs);
*regs = *iregs;
/* The ip in basic is EventingIP */
set_linear_ip(regs, basic->ip);
regs->flags = PERF_EFLAGS_EXACT;
+ if ((sample_type & PERF_SAMPLE_WEIGHT_STRUCT) && (x86_pmu.flags & PMU_FL_RETIRE_LATENCY))
+ data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
+
/*
* The record for MEMINFO is in front of GP
* But PERF_SAMPLE_TRANSACTION needs gprs->ax.
@@ -1835,8 +1899,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
if (has_branch_stack(event)) {
intel_pmu_store_pebs_lbrs(lbr);
- data->br_stack = &cpuc->lbr_stack;
- data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
}
}
@@ -2303,8 +2366,10 @@ void __init intel_ds_init(void)
x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
break;
- case 4:
case 5:
+ x86_pmu.pebs_ept = 1;
+ fallthrough;
+ case 4:
x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl;
x86_pmu.pebs_record_size = sizeof(struct pebs_basic);
if (x86_pmu.intel_cap.pebs_baseline) {
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 1f21f576ca77..c3b0d15a9841 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1606,12 +1606,10 @@ clear_arch_lbr:
*/
void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
{
- int lbr_fmt = x86_pmu.intel_cap.lbr_format;
-
lbr->nr = x86_pmu.lbr_nr;
lbr->from = x86_pmu.lbr_from;
lbr->to = x86_pmu.lbr_to;
- lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0;
+ lbr->info = x86_pmu.lbr_info;
}
EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 459b1aafd4d4..bc226603ef3e 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -65,6 +65,21 @@ int uncore_die_to_segment(int die)
return bus ? pci_domain_nr(bus) : -EINVAL;
}
+int uncore_device_to_die(struct pci_dev *dev)
+{
+ int node = pcibus_to_node(dev->bus);
+ int cpu;
+
+ for_each_cpu(cpu, cpumask_of_pcibus(dev->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node)
+ return c->logical_die_id;
+ }
+
+ return -1;
+}
+
static void uncore_free_pcibus_map(void)
{
struct pci2phy_map *map, *tmp;
@@ -842,6 +857,12 @@ static const struct attribute_group uncore_pmu_attr_group = {
.attrs = uncore_pmu_attrs,
};
+static inline int uncore_get_box_id(struct intel_uncore_type *type,
+ struct intel_uncore_pmu *pmu)
+{
+ return type->box_ids ? type->box_ids[pmu->pmu_idx] : pmu->pmu_idx;
+}
+
void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu)
{
struct intel_uncore_type *type = pmu->type;
@@ -850,7 +871,7 @@ void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu)
sprintf(pmu_name, "uncore_type_%u", type->type_id);
else {
sprintf(pmu_name, "uncore_type_%u_%d",
- type->type_id, type->box_ids[pmu->pmu_idx]);
+ type->type_id, uncore_get_box_id(type, pmu));
}
}
@@ -877,7 +898,7 @@ static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu)
* Use the box ID from the discovery table if applicable.
*/
sprintf(pmu->name, "uncore_%s_%d", type->name,
- type->box_ids ? type->box_ids[pmu->pmu_idx] : pmu->pmu_idx);
+ uncore_get_box_id(type, pmu));
}
}
@@ -1674,7 +1695,10 @@ struct intel_uncore_init_fun {
void (*cpu_init)(void);
int (*pci_init)(void);
void (*mmio_init)(void);
+ /* Discovery table is required */
bool use_discovery;
+ /* The units in the discovery table should be ignored. */
+ int *uncore_units_ignore;
};
static const struct intel_uncore_init_fun nhm_uncore_init __initconst = {
@@ -1765,6 +1789,11 @@ static const struct intel_uncore_init_fun adl_uncore_init __initconst = {
.mmio_init = adl_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun mtl_uncore_init __initconst = {
+ .cpu_init = mtl_uncore_cpu_init,
+ .mmio_init = adl_uncore_mmio_init,
+};
+
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1782,6 +1811,7 @@ static const struct intel_uncore_init_fun spr_uncore_init __initconst = {
.pci_init = spr_uncore_pci_init,
.mmio_init = spr_uncore_mmio_init,
.use_discovery = true,
+ .uncore_units_ignore = spr_uncore_units_ignore,
};
static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
@@ -1832,6 +1862,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &mtl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
@@ -1853,7 +1885,7 @@ static int __init intel_uncore_init(void)
id = x86_match_cpu(intel_uncore_match);
if (!id) {
- if (!uncore_no_discover && intel_uncore_has_discovery_tables())
+ if (!uncore_no_discover && intel_uncore_has_discovery_tables(NULL))
uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init;
else
return -ENODEV;
@@ -1861,7 +1893,8 @@ static int __init intel_uncore_init(void)
uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
if (uncore_no_discover && uncore_init->use_discovery)
return -ENODEV;
- if (uncore_init->use_discovery && !intel_uncore_has_discovery_tables())
+ if (uncore_init->use_discovery &&
+ !intel_uncore_has_discovery_tables(uncore_init->uncore_units_ignore))
return -ENODEV;
}
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index e278e2e7c051..c30fb5bb1222 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -34,6 +34,8 @@
#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
+#define UNCORE_IGNORE_END -1
+
struct pci_extra_dev {
struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX];
};
@@ -208,6 +210,7 @@ struct pci2phy_map {
struct pci2phy_map *__find_pci2phy_map(int segment);
int uncore_pcibus_to_dieid(struct pci_bus *bus);
int uncore_die_to_segment(int die);
+int uncore_device_to_die(struct pci_dev *dev);
ssize_t uncore_event_show(struct device *dev,
struct device_attribute *attr, char *buf);
@@ -589,6 +592,7 @@ extern raw_spinlock_t pci2phy_map_lock;
extern struct list_head pci2phy_map_head;
extern struct pci_extra_dev *uncore_extra_pci_dev;
extern struct event_constraint uncore_constraint_empty;
+extern int spr_uncore_units_ignore[];
/* uncore_snb.c */
int snb_uncore_pci_init(void);
@@ -602,6 +606,7 @@ void skl_uncore_cpu_init(void);
void icl_uncore_cpu_init(void);
void tgl_uncore_cpu_init(void);
void adl_uncore_cpu_init(void);
+void mtl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
void adl_uncore_mmio_init(void);
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index 5fd72d4b8bbb..cb488e41807c 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -33,7 +33,7 @@ static int logical_die_id;
static int get_device_die_id(struct pci_dev *dev)
{
- int cpu, node = pcibus_to_node(dev->bus);
+ int node = pcibus_to_node(dev->bus);
/*
* If the NUMA info is not available, assume that the logical die id is
@@ -43,19 +43,7 @@ static int get_device_die_id(struct pci_dev *dev)
if (node < 0)
return logical_die_id++;
- for_each_cpu(cpu, cpumask_of_node(node)) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (c->initialized && cpu_to_node(cpu) == node)
- return c->logical_die_id;
- }
-
- /*
- * All CPUs of a node may be offlined. For this case,
- * the PCI and MMIO type of uncore blocks which are
- * enumerated by the device will be unavailable.
- */
- return -1;
+ return uncore_device_to_die(dev);
}
#define __node_2_type(cur) \
@@ -140,13 +128,21 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
unsigned int *box_offset, *ids;
int i;
- if (WARN_ON_ONCE(!unit->ctl || !unit->ctl_offset || !unit->ctr_offset))
+ if (!unit->ctl || !unit->ctl_offset || !unit->ctr_offset) {
+ pr_info("Invalid address is detected for uncore type %d box %d, "
+ "Disable the uncore unit.\n",
+ unit->box_type, unit->box_id);
return;
+ }
if (parsed) {
type = search_uncore_discovery_type(unit->box_type);
- if (WARN_ON_ONCE(!type))
+ if (!type) {
+ pr_info("A spurious uncore type %d is detected, "
+ "Disable the uncore type.\n",
+ unit->box_type);
return;
+ }
/* Store the first box of each die */
if (!type->box_ctrl_die[die])
type->box_ctrl_die[die] = unit->ctl;
@@ -181,8 +177,12 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
ids[i] = type->ids[i];
box_offset[i] = type->box_offset[i];
- if (WARN_ON_ONCE(unit->box_id == ids[i]))
+ if (unit->box_id == ids[i]) {
+ pr_info("Duplicate uncore type %d box ID %d is detected, "
+ "Drop the duplicate uncore unit.\n",
+ unit->box_type, unit->box_id);
goto free_ids;
+ }
}
ids[i] = unit->box_id;
box_offset[i] = unit->ctl - type->box_ctrl;
@@ -202,8 +202,25 @@ free_box_offset:
}
+static bool
+uncore_ignore_unit(struct uncore_unit_discovery *unit, int *ignore)
+{
+ int i;
+
+ if (!ignore)
+ return false;
+
+ for (i = 0; ignore[i] != UNCORE_IGNORE_END ; i++) {
+ if (unit->box_type == ignore[i])
+ return true;
+ }
+
+ return false;
+}
+
static int parse_discovery_table(struct pci_dev *dev, int die,
- u32 bar_offset, bool *parsed)
+ u32 bar_offset, bool *parsed,
+ int *ignore)
{
struct uncore_global_discovery global;
struct uncore_unit_discovery unit;
@@ -258,6 +275,9 @@ static int parse_discovery_table(struct pci_dev *dev, int die,
if (unit.access_type >= UNCORE_ACCESS_MAX)
continue;
+ if (uncore_ignore_unit(&unit, ignore))
+ continue;
+
uncore_insert_box_info(&unit, die, *parsed);
}
@@ -266,7 +286,7 @@ static int parse_discovery_table(struct pci_dev *dev, int die,
return 0;
}
-bool intel_uncore_has_discovery_tables(void)
+bool intel_uncore_has_discovery_tables(int *ignore)
{
u32 device, val, entry_id, bar_offset;
int die, dvsec = 0, ret = true;
@@ -302,7 +322,7 @@ bool intel_uncore_has_discovery_tables(void)
if (die < 0)
continue;
- parse_discovery_table(dev, die, bar_offset, &parsed);
+ parse_discovery_table(dev, die, bar_offset, &parsed, ignore);
}
}
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index f4439357779a..6ee80ad3423e 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -21,9 +21,15 @@
/* Global discovery table size */
#define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20
-#define UNCORE_DISCOVERY_PCI_DOMAIN(data) ((data >> 28) & 0x7)
-#define UNCORE_DISCOVERY_PCI_BUS(data) ((data >> 20) & 0xff)
-#define UNCORE_DISCOVERY_PCI_DEVFN(data) ((data >> 12) & 0xff)
+#define UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET 28
+#define UNCORE_DISCOVERY_PCI_DOMAIN(data) \
+ ((data >> UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET) & 0x7)
+#define UNCORE_DISCOVERY_PCI_BUS_OFFSET 20
+#define UNCORE_DISCOVERY_PCI_BUS(data) \
+ ((data >> UNCORE_DISCOVERY_PCI_BUS_OFFSET) & 0xff)
+#define UNCORE_DISCOVERY_PCI_DEVFN_OFFSET 12
+#define UNCORE_DISCOVERY_PCI_DEVFN(data) \
+ ((data >> UNCORE_DISCOVERY_PCI_DEVFN_OFFSET) & 0xff)
#define UNCORE_DISCOVERY_PCI_BOX_CTRL(data) (data & 0xfff)
@@ -122,7 +128,7 @@ struct intel_uncore_discovery_type {
unsigned int *box_offset; /* Box offset */
};
-bool intel_uncore_has_discovery_tables(void);
+bool intel_uncore_has_discovery_tables(int *ignore);
void intel_uncore_clear_discovery_tables(void);
void intel_uncore_generic_uncore_cpu_init(void);
int intel_uncore_generic_uncore_pci_init(void);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 1f4869227efb..7fd4334e12a1 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -109,6 +109,19 @@
#define PCI_DEVICE_ID_INTEL_RPL_23_IMC 0xA728
#define PCI_DEVICE_ID_INTEL_RPL_24_IMC 0xA729
#define PCI_DEVICE_ID_INTEL_RPL_25_IMC 0xA72A
+#define PCI_DEVICE_ID_INTEL_MTL_1_IMC 0x7d00
+#define PCI_DEVICE_ID_INTEL_MTL_2_IMC 0x7d01
+#define PCI_DEVICE_ID_INTEL_MTL_3_IMC 0x7d02
+#define PCI_DEVICE_ID_INTEL_MTL_4_IMC 0x7d05
+#define PCI_DEVICE_ID_INTEL_MTL_5_IMC 0x7d10
+#define PCI_DEVICE_ID_INTEL_MTL_6_IMC 0x7d14
+#define PCI_DEVICE_ID_INTEL_MTL_7_IMC 0x7d15
+#define PCI_DEVICE_ID_INTEL_MTL_8_IMC 0x7d16
+#define PCI_DEVICE_ID_INTEL_MTL_9_IMC 0x7d21
+#define PCI_DEVICE_ID_INTEL_MTL_10_IMC 0x7d22
+#define PCI_DEVICE_ID_INTEL_MTL_11_IMC 0x7d23
+#define PCI_DEVICE_ID_INTEL_MTL_12_IMC 0x7d24
+#define PCI_DEVICE_ID_INTEL_MTL_13_IMC 0x7d28
#define IMC_UNCORE_DEV(a) \
@@ -205,6 +218,32 @@
#define ADL_UNC_ARB_PERFEVTSEL0 0x2FD0
#define ADL_UNC_ARB_MSR_OFFSET 0x8
+/* MTL Cbo register */
+#define MTL_UNC_CBO_0_PER_CTR0 0x2448
+#define MTL_UNC_CBO_0_PERFEVTSEL0 0x2442
+
+/* MTL HAC_ARB register */
+#define MTL_UNC_HAC_ARB_CTR 0x2018
+#define MTL_UNC_HAC_ARB_CTRL 0x2012
+
+/* MTL ARB register */
+#define MTL_UNC_ARB_CTR 0x2418
+#define MTL_UNC_ARB_CTRL 0x2412
+
+/* MTL cNCU register */
+#define MTL_UNC_CNCU_FIXED_CTR 0x2408
+#define MTL_UNC_CNCU_FIXED_CTRL 0x2402
+#define MTL_UNC_CNCU_BOX_CTL 0x240e
+
+/* MTL sNCU register */
+#define MTL_UNC_SNCU_FIXED_CTR 0x2008
+#define MTL_UNC_SNCU_FIXED_CTRL 0x2002
+#define MTL_UNC_SNCU_BOX_CTL 0x200e
+
+/* MTL HAC_CBO register */
+#define MTL_UNC_HBO_CTR 0x2048
+#define MTL_UNC_HBO_CTRL 0x2042
+
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11");
@@ -598,6 +637,115 @@ void adl_uncore_cpu_init(void)
uncore_msr_uncores = adl_msr_uncores;
}
+static struct intel_uncore_type mtl_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_CBO_0_PER_CTR0,
+ .event_ctl = MTL_UNC_CBO_0_PERFEVTSEL0,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type mtl_uncore_hac_arb = {
+ .name = "hac_arb",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_HAC_ARB_CTR,
+ .event_ctl = MTL_UNC_HAC_ARB_CTRL,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type mtl_uncore_arb = {
+ .name = "arb",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_ARB_CTR,
+ .event_ctl = MTL_UNC_ARB_CTRL,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type mtl_uncore_hac_cbox = {
+ .name = "hac_cbox",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_HBO_CTR,
+ .event_ctl = MTL_UNC_HBO_CTRL,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static void mtl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ wrmsrl(uncore_msr_box_ctl(box), SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static struct intel_uncore_ops mtl_uncore_msr_ops = {
+ .init_box = mtl_uncore_msr_init_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type mtl_uncore_cncu = {
+ .name = "cncu",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .box_ctl = MTL_UNC_CNCU_BOX_CTL,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = MTL_UNC_CNCU_FIXED_CTR,
+ .fixed_ctl = MTL_UNC_CNCU_FIXED_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+ .format_group = &icl_uncore_clock_format_group,
+ .ops = &mtl_uncore_msr_ops,
+ .event_descs = icl_uncore_events,
+};
+
+static struct intel_uncore_type mtl_uncore_sncu = {
+ .name = "sncu",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .box_ctl = MTL_UNC_SNCU_BOX_CTL,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = MTL_UNC_SNCU_FIXED_CTR,
+ .fixed_ctl = MTL_UNC_SNCU_FIXED_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+ .format_group = &icl_uncore_clock_format_group,
+ .ops = &mtl_uncore_msr_ops,
+ .event_descs = icl_uncore_events,
+};
+
+static struct intel_uncore_type *mtl_msr_uncores[] = {
+ &mtl_uncore_cbox,
+ &mtl_uncore_hac_arb,
+ &mtl_uncore_arb,
+ &mtl_uncore_hac_cbox,
+ &mtl_uncore_cncu,
+ &mtl_uncore_sncu,
+ NULL
+};
+
+void mtl_uncore_cpu_init(void)
+{
+ mtl_uncore_cbox.num_boxes = icl_get_cbox_num();
+ uncore_msr_uncores = mtl_msr_uncores;
+}
+
enum {
SNB_PCI_UNCORE_IMC,
};
@@ -1264,6 +1412,19 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = {
IMC_UNCORE_DEV(RPL_23),
IMC_UNCORE_DEV(RPL_24),
IMC_UNCORE_DEV(RPL_25),
+ IMC_UNCORE_DEV(MTL_1),
+ IMC_UNCORE_DEV(MTL_2),
+ IMC_UNCORE_DEV(MTL_3),
+ IMC_UNCORE_DEV(MTL_4),
+ IMC_UNCORE_DEV(MTL_5),
+ IMC_UNCORE_DEV(MTL_6),
+ IMC_UNCORE_DEV(MTL_7),
+ IMC_UNCORE_DEV(MTL_8),
+ IMC_UNCORE_DEV(MTL_9),
+ IMC_UNCORE_DEV(MTL_10),
+ IMC_UNCORE_DEV(MTL_11),
+ IMC_UNCORE_DEV(MTL_12),
+ IMC_UNCORE_DEV(MTL_13),
{ /* end: all zeroes */ }
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 44c2f879f708..7d1199554fe3 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1453,9 +1453,6 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
}
raw_spin_unlock(&pci2phy_map_lock);
} else {
- int node = pcibus_to_node(ubox_dev->bus);
- int cpu;
-
segment = pci_domain_nr(ubox_dev->bus);
raw_spin_lock(&pci2phy_map_lock);
map = __find_pci2phy_map(segment);
@@ -1465,15 +1462,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
break;
}
- die_id = -1;
- for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
+ map->pbus_to_dieid[bus] = die_id = uncore_device_to_die(ubox_dev);
- if (c->initialized && cpu_to_node(cpu) == node) {
- map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
- break;
- }
- }
raw_spin_unlock(&pci2phy_map_lock);
if (WARN_ON_ONCE(die_id == -1)) {
@@ -6142,24 +6132,6 @@ static int spr_upi_get_topology(struct intel_uncore_type *type)
return discover_upi_topology(type, SPR_UBOX_DID, SPR_UPI_REGS_ADDR_DEVICE_LINK0);
}
-static struct intel_uncore_type spr_uncore_upi = {
- .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
- .event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
- .format_group = &spr_uncore_raw_format_group,
- .ops = &spr_uncore_pci_ops,
- .name = "upi",
- .attr_update = spr_upi_attr_update,
- .get_topology = spr_upi_get_topology,
- .set_mapping = spr_upi_set_mapping,
- .cleanup_mapping = spr_upi_cleanup_mapping,
-};
-
-static struct intel_uncore_type spr_uncore_m3upi = {
- SPR_UNCORE_PCI_COMMON_FORMAT(),
- .name = "m3upi",
- .constraints = icx_uncore_m3upi_constraints,
-};
-
static struct intel_uncore_type spr_uncore_mdf = {
SPR_UNCORE_COMMON_FORMAT(),
.name = "mdf",
@@ -6168,7 +6140,13 @@ static struct intel_uncore_type spr_uncore_mdf = {
#define UNCORE_SPR_NUM_UNCORE_TYPES 12
#define UNCORE_SPR_IIO 1
#define UNCORE_SPR_IMC 6
+#define UNCORE_SPR_UPI 8
+#define UNCORE_SPR_M3UPI 9
+/*
+ * The uncore units, which are supported by the discovery table,
+ * are defined here.
+ */
static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = {
&spr_uncore_chabox,
&spr_uncore_iio,
@@ -6178,12 +6156,56 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = {
NULL,
&spr_uncore_imc,
&spr_uncore_m2m,
- &spr_uncore_upi,
- &spr_uncore_m3upi,
+ NULL,
+ NULL,
NULL,
&spr_uncore_mdf,
};
+/*
+ * The uncore units, which are not supported by the discovery table,
+ * are implemented from here.
+ */
+#define SPR_UNCORE_UPI_NUM_BOXES 4
+
+static unsigned int spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = {
+ 0, 0x8000, 0x10000, 0x18000
+};
+
+static struct intel_uncore_type spr_uncore_upi = {
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
+ .format_group = &spr_uncore_raw_format_group,
+ .ops = &spr_uncore_pci_ops,
+ .name = "upi",
+ .attr_update = spr_upi_attr_update,
+ .get_topology = spr_upi_get_topology,
+ .set_mapping = spr_upi_set_mapping,
+ .cleanup_mapping = spr_upi_cleanup_mapping,
+ .type_id = UNCORE_SPR_UPI,
+ .num_counters = 4,
+ .num_boxes = SPR_UNCORE_UPI_NUM_BOXES,
+ .perf_ctr_bits = 48,
+ .perf_ctr = ICX_UPI_PCI_PMON_CTR0,
+ .event_ctl = ICX_UPI_PCI_PMON_CTL0,
+ .box_ctl = ICX_UPI_PCI_PMON_BOX_CTL,
+ .pci_offsets = spr_upi_pci_offsets,
+};
+
+static struct intel_uncore_type spr_uncore_m3upi = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "m3upi",
+ .type_id = UNCORE_SPR_M3UPI,
+ .num_counters = 4,
+ .num_boxes = SPR_UNCORE_UPI_NUM_BOXES,
+ .perf_ctr_bits = 48,
+ .perf_ctr = ICX_M3UPI_PCI_PMON_CTR0,
+ .event_ctl = ICX_M3UPI_PCI_PMON_CTL0,
+ .box_ctl = ICX_M3UPI_PCI_PMON_BOX_CTL,
+ .pci_offsets = spr_upi_pci_offsets,
+ .constraints = icx_uncore_m3upi_constraints,
+};
+
enum perf_uncore_spr_iio_freerunning_type_id {
SPR_IIO_MSR_IOCLK,
SPR_IIO_MSR_BW_IN,
@@ -6314,6 +6336,7 @@ static struct intel_uncore_type spr_uncore_imc_free_running = {
#define UNCORE_SPR_MSR_EXTRA_UNCORES 1
#define UNCORE_SPR_MMIO_EXTRA_UNCORES 1
+#define UNCORE_SPR_PCI_EXTRA_UNCORES 2
static struct intel_uncore_type *spr_msr_uncores[UNCORE_SPR_MSR_EXTRA_UNCORES] = {
&spr_uncore_iio_free_running,
@@ -6323,6 +6346,17 @@ static struct intel_uncore_type *spr_mmio_uncores[UNCORE_SPR_MMIO_EXTRA_UNCORES]
&spr_uncore_imc_free_running,
};
+static struct intel_uncore_type *spr_pci_uncores[UNCORE_SPR_PCI_EXTRA_UNCORES] = {
+ &spr_uncore_upi,
+ &spr_uncore_m3upi
+};
+
+int spr_uncore_units_ignore[] = {
+ UNCORE_SPR_UPI,
+ UNCORE_SPR_M3UPI,
+ UNCORE_IGNORE_END
+};
+
static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
struct intel_uncore_type *from_type)
{
@@ -6423,9 +6457,69 @@ void spr_uncore_cpu_init(void)
spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO);
}
+#define SPR_UNCORE_UPI_PCIID 0x3241
+#define SPR_UNCORE_UPI0_DEVFN 0x9
+#define SPR_UNCORE_M3UPI_PCIID 0x3246
+#define SPR_UNCORE_M3UPI0_DEVFN 0x29
+
+static void spr_update_device_location(int type_id)
+{
+ struct intel_uncore_type *type;
+ struct pci_dev *dev = NULL;
+ u32 device, devfn;
+ u64 *ctls;
+ int die;
+
+ if (type_id == UNCORE_SPR_UPI) {
+ type = &spr_uncore_upi;
+ device = SPR_UNCORE_UPI_PCIID;
+ devfn = SPR_UNCORE_UPI0_DEVFN;
+ } else if (type_id == UNCORE_SPR_M3UPI) {
+ type = &spr_uncore_m3upi;
+ device = SPR_UNCORE_M3UPI_PCIID;
+ devfn = SPR_UNCORE_M3UPI0_DEVFN;
+ } else
+ return;
+
+ ctls = kcalloc(__uncore_max_dies, sizeof(u64), GFP_KERNEL);
+ if (!ctls) {
+ type->num_boxes = 0;
+ return;
+ }
+
+ while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) {
+ if (devfn != dev->devfn)
+ continue;
+
+ die = uncore_device_to_die(dev);
+ if (die < 0)
+ continue;
+
+ ctls[die] = pci_domain_nr(dev->bus) << UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET |
+ dev->bus->number << UNCORE_DISCOVERY_PCI_BUS_OFFSET |
+ devfn << UNCORE_DISCOVERY_PCI_DEVFN_OFFSET |
+ type->box_ctl;
+ }
+
+ type->box_ctls = ctls;
+}
+
int spr_uncore_pci_init(void)
{
- uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL);
+ /*
+ * The discovery table of UPI on some SPR variant is broken,
+ * which impacts the detection of both UPI and M3UPI uncore PMON.
+ * Use the pre-defined UPI and M3UPI table to replace.
+ *
+ * The accurate location, e.g., domain and BUS number,
+ * can only be retrieved at load time.
+ * Update the location of UPI and M3UPI.
+ */
+ spr_update_device_location(UNCORE_SPR_UPI);
+ spr_update_device_location(UNCORE_SPR_M3UPI);
+ uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI,
+ UNCORE_SPR_PCI_EXTRA_UNCORES,
+ spr_pci_uncores);
return 0;
}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 0e849f28a5c1..d6de4487348c 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -35,15 +35,17 @@
* per-core reg tables.
*/
enum extra_reg_type {
- EXTRA_REG_NONE = -1, /* not used */
+ EXTRA_REG_NONE = -1, /* not used */
- EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
- EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
- EXTRA_REG_LBR = 2, /* lbr_select */
- EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
- EXTRA_REG_FE = 4, /* fe_* */
+ EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
+ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
+ EXTRA_REG_LBR = 2, /* lbr_select */
+ EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
+ EXTRA_REG_FE = 4, /* fe_* */
+ EXTRA_REG_SNOOP_0 = 5, /* snoop response 0 */
+ EXTRA_REG_SNOOP_1 = 6, /* snoop response 1 */
- EXTRA_REG_MAX /* number of entries needed */
+ EXTRA_REG_MAX /* number of entries needed */
};
struct event_constraint {
@@ -606,6 +608,7 @@ union perf_capabilities {
u64 pebs_baseline:1;
u64 perf_metrics:1;
u64 pebs_output_pt_available:1;
+ u64 pebs_timing_info:1;
u64 anythread_deprecated:1;
};
u64 capabilities;
@@ -647,6 +650,7 @@ enum {
};
#define PERF_PEBS_DATA_SOURCE_MAX 0x10
+#define PERF_PEBS_DATA_SOURCE_MASK (PERF_PEBS_DATA_SOURCE_MAX - 1)
struct x86_hybrid_pmu {
struct pmu pmu;
@@ -1000,6 +1004,7 @@ do { \
#define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */
#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */
#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */
+#define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -1486,6 +1491,8 @@ int intel_pmu_drain_bts_buffer(void);
u64 adl_latency_data_small(struct perf_event *event, u64 status);
+u64 mtl_latency_data_small(struct perf_event *event, u64 status);
+
extern struct event_constraint intel_core2_pebs_event_constraints[];
extern struct event_constraint intel_atom_pebs_event_constraints[];
@@ -1597,6 +1604,8 @@ void intel_pmu_pebs_data_source_adl(void);
void intel_pmu_pebs_data_source_grt(void);
+void intel_pmu_pebs_data_source_mtl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
index 949d845c922b..3e9acdaeed1e 100644
--- a/arch/x86/events/zhaoxin/core.c
+++ b/arch/x86/events/zhaoxin/core.c
@@ -541,7 +541,13 @@ __init int zhaoxin_pmu_init(void)
switch (boot_cpu_data.x86) {
case 0x06:
- if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
+ /*
+ * Support Zhaoxin CPU from ZXC series, exclude Nano series through FMS.
+ * Nano FMS: Family=6, Model=F, Stepping=[0-A][C-D]
+ * ZXC FMS: Family=6, Model=F, Stepping=E-F OR Family=6, Model=0x19, Stepping=0-3
+ */
+ if ((boot_cpu_data.x86_model == 0x0f && boot_cpu_data.x86_stepping >= 0x0e) ||
+ boot_cpu_data.x86_model == 0x19) {
x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 65064d9f7fa6..8eb74cf386db 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -14,6 +14,7 @@
#include <asm/mmu.h>
#include <asm/mpspec.h>
#include <asm/x86_init.h>
+#include <asm/cpufeature.h>
#ifdef CONFIG_ACPI_APEI
# include <asm/pgtable_types.h>
@@ -63,6 +64,13 @@ extern int (*acpi_suspend_lowlevel)(void);
/* Physical address to resume after wakeup */
unsigned long acpi_get_wakeup_address(void);
+static inline bool acpi_skip_set_wakeup_address(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+
+#define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address
+
/*
* Check if the CPU can handle C2 and deeper
*/
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index cd7b14322035..c8c111d8fbd7 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -23,10 +23,4 @@
*/
#define flush_agp_cache() wbinvd()
-/* GATT allocation. Returns/accepts GATT kernel virtual address. */
-#define alloc_gatt_pages(order) \
- ((char *)__get_free_pages(GFP_KERNEL, (order)))
-#define free_gatt_pages(table, order) \
- free_pages((unsigned long)(table), (order))
-
#endif /* _ASM_X86_AGP_H */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 7659217f4d49..e2975a32d443 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -6,8 +6,10 @@
#include <linux/stringify.h>
#include <asm/asm.h>
-#define ALTINSTR_FLAG_INV (1 << 15)
-#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV)
+#define ALT_FLAGS_SHIFT 16
+
+#define ALT_FLAG_NOT BIT(0)
+#define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature))
#ifndef __ASSEMBLY__
@@ -59,10 +61,27 @@
".long 999b - .\n\t" \
".popsection\n\t"
+/*
+ * The patching flags are part of the upper bits of the @ft_flags parameter when
+ * specifying them. The split is currently like this:
+ *
+ * [31... flags ...16][15... CPUID feature bit ...0]
+ *
+ * but since this is all hidden in the macros argument being split, those fields can be
+ * extended in the future to fit in a u64 or however the need arises.
+ */
struct alt_instr {
s32 instr_offset; /* original instruction */
s32 repl_offset; /* offset to replacement instruction */
- u16 cpuid; /* cpuid bit set for replacement */
+
+ union {
+ struct {
+ u32 cpuid: 16; /* CPUID bit set for replacement */
+ u32 flags: 16; /* patching control flags */
+ };
+ u32 ft_flags;
+ };
+
u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction */
} __packed;
@@ -182,10 +201,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
" - (" alt_slen ")), 0x90\n" \
alt_end_marker ":\n"
-#define ALTINSTR_ENTRY(feature, num) \
+#define ALTINSTR_ENTRY(ft_flags, num) \
" .long 661b - .\n" /* label */ \
" .long " b_replacement(num)"f - .\n" /* new instruction */ \
- " .word " __stringify(feature) "\n" /* feature bit */ \
+ " .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \
" .byte " alt_total_slen "\n" /* source len */ \
" .byte " alt_rlen(num) "\n" /* replacement len */
@@ -194,20 +213,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n"
/* alternative assembly primitive: */
-#define ALTERNATIVE(oldinstr, newinstr, feature) \
+#define ALTERNATIVE(oldinstr, newinstr, ft_flags) \
OLDINSTR(oldinstr, 1) \
".pushsection .altinstructions,\"a\"\n" \
- ALTINSTR_ENTRY(feature, 1) \
+ ALTINSTR_ENTRY(ft_flags, 1) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr, 1) \
".popsection\n"
-#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
+#define ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
OLDINSTR_2(oldinstr, 1, 2) \
".pushsection .altinstructions,\"a\"\n" \
- ALTINSTR_ENTRY(feature1, 1) \
- ALTINSTR_ENTRY(feature2, 2) \
+ ALTINSTR_ENTRY(ft_flags1, 1) \
+ ALTINSTR_ENTRY(ft_flags2, 2) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr1, 1) \
@@ -215,21 +234,22 @@ static inline int alternatives_text_reserved(void *start, void *end)
".popsection\n"
/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
-#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
- newinstr_yes, feature)
-
-#define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \
- OLDINSTR_3(oldinsn, 1, 2, 3) \
- ".pushsection .altinstructions,\"a\"\n" \
- ALTINSTR_ENTRY(feat1, 1) \
- ALTINSTR_ENTRY(feat2, 2) \
- ALTINSTR_ENTRY(feat3, 3) \
- ".popsection\n" \
- ".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinsn1, 1) \
- ALTINSTR_REPLACEMENT(newinsn2, 2) \
- ALTINSTR_REPLACEMENT(newinsn3, 3) \
+ newinstr_yes, ft_flags)
+
+#define ALTERNATIVE_3(oldinsn, newinsn1, ft_flags1, newinsn2, ft_flags2, \
+ newinsn3, ft_flags3) \
+ OLDINSTR_3(oldinsn, 1, 2, 3) \
+ ".pushsection .altinstructions,\"a\"\n" \
+ ALTINSTR_ENTRY(ft_flags1, 1) \
+ ALTINSTR_ENTRY(ft_flags2, 2) \
+ ALTINSTR_ENTRY(ft_flags3, 3) \
+ ".popsection\n" \
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ ALTINSTR_REPLACEMENT(newinsn1, 1) \
+ ALTINSTR_REPLACEMENT(newinsn2, 2) \
+ ALTINSTR_REPLACEMENT(newinsn3, 3) \
".popsection\n"
/*
@@ -244,14 +264,14 @@ static inline int alternatives_text_reserved(void *start, void *end)
* For non barrier like inlines please define new variants
* without volatile and memory clobber.
*/
-#define alternative(oldinstr, newinstr, feature) \
- asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+#define alternative(oldinstr, newinstr, ft_flags) \
+ asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) : : : "memory")
-#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
- asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+#define alternative_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
+ asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) ::: "memory")
-#define alternative_ternary(oldinstr, feature, newinstr_yes, newinstr_no) \
- asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) ::: "memory")
+#define alternative_ternary(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
+ asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) ::: "memory")
/*
* Alternative inline assembly with input.
@@ -261,8 +281,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Argument numbers start with 1.
* Leaving an unused argument 0 to keep API compatibility.
*/
-#define alternative_input(oldinstr, newinstr, feature, input...) \
- asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+#define alternative_input(oldinstr, newinstr, ft_flags, input...) \
+ asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \
: : "i" (0), ## input)
/*
@@ -273,20 +293,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Otherwise, if CPU has feature1, newinstr1 is used.
* Otherwise, oldinstr is used.
*/
-#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2, \
- feature2, input...) \
- asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \
- newinstr2, feature2) \
+#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2, \
+ ft_flags2, input...) \
+ asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, \
+ newinstr2, ft_flags2) \
: : "i" (0), ## input)
/* Like alternative_input, but with a single output argument */
-#define alternative_io(oldinstr, newinstr, feature, output, input...) \
- asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+#define alternative_io(oldinstr, newinstr, ft_flags, output, input...) \
+ asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \
: output : "i" (0), ## input)
/* Like alternative_io, but for replacing a direct call with another one. */
-#define alternative_call(oldfunc, newfunc, feature, output, input...) \
- asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
+#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \
+ asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", ft_flags) \
: output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
/*
@@ -295,10 +315,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Otherwise, if CPU has feature1, function1 is used.
* Otherwise, old function is used.
*/
-#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \
+#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \
output, input...) \
- asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
- "call %P[new2]", feature2) \
+ asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", ft_flags1,\
+ "call %P[new2]", ft_flags2) \
: output, ASM_CALL_CONSTRAINT \
: [old] "i" (oldfunc), [new1] "i" (newfunc1), \
[new2] "i" (newfunc2), ## input)
@@ -347,10 +367,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
* enough information for the alternatives patching code to patch an
* instruction. See apply_alternatives().
*/
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstr_entry orig alt ft_flags orig_len alt_len
.long \orig - .
.long \alt - .
- .word \feature
+ .4byte \ft_flags
.byte \orig_len
.byte \alt_len
.endm
@@ -361,7 +381,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
* @newinstr. ".skip" directive takes care of proper instruction padding
* in case @newinstr is longer than @oldinstr.
*/
-.macro ALTERNATIVE oldinstr, newinstr, feature
+.macro ALTERNATIVE oldinstr, newinstr, ft_flags
140:
\oldinstr
141:
@@ -369,7 +389,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
142:
.pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f
+ altinstr_entry 140b,143f,\ft_flags,142b-140b,144f-143f
.popsection
.pushsection .altinstr_replacement,"ax"
@@ -399,7 +419,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
* has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
* @feature2, it replaces @oldinstr with @feature2.
*/
-.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+.macro ALTERNATIVE_2 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2
140:
\oldinstr
141:
@@ -408,8 +428,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
142:
.pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f
- altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f
+ altinstr_entry 140b,143f,\ft_flags1,142b-140b,144f-143f
+ altinstr_entry 140b,144f,\ft_flags2,142b-140b,145f-144f
.popsection
.pushsection .altinstr_replacement,"ax"
@@ -421,7 +441,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
.popsection
.endm
-.macro ALTERNATIVE_3 oldinstr, newinstr1, feature1, newinstr2, feature2, newinstr3, feature3
+.macro ALTERNATIVE_3 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, newinstr3, ft_flags3
140:
\oldinstr
141:
@@ -430,9 +450,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
142:
.pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f
- altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f
- altinstruction_entry 140b,145f,\feature3,142b-140b,146f-145f
+ altinstr_entry 140b,143f,\ft_flags1,142b-140b,144f-143f
+ altinstr_entry 140b,144f,\ft_flags2,142b-140b,145f-144f
+ altinstr_entry 140b,145f,\ft_flags3,142b-140b,146f-145f
.popsection
.pushsection .altinstr_replacement,"ax"
@@ -447,9 +467,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
.endm
/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
-#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
- newinstr_yes, feature
+ newinstr_yes, ft_flags
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 8f80de627c60..b1a98fa38828 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -12,6 +12,7 @@
#include <asm/special_insns.h>
#include <asm/preempt.h>
#include <asm/asm.h>
+#include <asm/gsseg.h>
#ifndef CONFIG_X86_CMPXCHG64
extern void cmpxchg8b_emu(void);
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 5efd01b548d1..808b4eece251 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -71,7 +71,7 @@ ATOMIC64_DECL(add_unless);
* the old value.
*/
-static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
{
return arch_cmpxchg64(&v->counter, o, n);
}
@@ -85,7 +85,7 @@ static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
* Atomically xchgs the value of @v to @n and returns
* the old value.
*/
-static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
+static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
{
s64 o;
unsigned high = (unsigned)(n >> 32);
@@ -104,7 +104,7 @@ static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
*
* Atomically sets the value of @v to @n.
*/
-static inline void arch_atomic64_set(atomic64_t *v, s64 i)
+static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
unsigned high = (unsigned)(i >> 32);
unsigned low = (unsigned)i;
@@ -119,7 +119,7 @@ static inline void arch_atomic64_set(atomic64_t *v, s64 i)
*
* Atomically reads the value of @v and returns it.
*/
-static inline s64 arch_atomic64_read(const atomic64_t *v)
+static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
{
s64 r;
alternative_atomic64(read, "=&A" (r), "c" (v) : "memory");
@@ -133,7 +133,7 @@ static inline s64 arch_atomic64_read(const atomic64_t *v)
*
* Atomically adds @i to @v and returns @i + *@v
*/
-static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
{
alternative_atomic64(add_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -145,7 +145,7 @@ static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
/*
* Other variants with different arithmetic operators:
*/
-static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
{
alternative_atomic64(sub_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -154,7 +154,7 @@ static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
}
#define arch_atomic64_sub_return arch_atomic64_sub_return
-static inline s64 arch_atomic64_inc_return(atomic64_t *v)
+static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v)
{
s64 a;
alternative_atomic64(inc_return, "=&A" (a),
@@ -163,7 +163,7 @@ static inline s64 arch_atomic64_inc_return(atomic64_t *v)
}
#define arch_atomic64_inc_return arch_atomic64_inc_return
-static inline s64 arch_atomic64_dec_return(atomic64_t *v)
+static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v)
{
s64 a;
alternative_atomic64(dec_return, "=&A" (a),
@@ -179,7 +179,7 @@ static inline s64 arch_atomic64_dec_return(atomic64_t *v)
*
* Atomically adds @i to @v.
*/
-static inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
{
__alternative_atomic64(add, add_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -194,7 +194,7 @@ static inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
*
* Atomically subtracts @i from @v.
*/
-static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
{
__alternative_atomic64(sub, sub_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -208,7 +208,7 @@ static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
*
* Atomically increments @v by 1.
*/
-static inline void arch_atomic64_inc(atomic64_t *v)
+static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
__alternative_atomic64(inc, inc_return, /* no output */,
"S" (v) : "memory", "eax", "ecx", "edx");
@@ -221,7 +221,7 @@ static inline void arch_atomic64_inc(atomic64_t *v)
*
* Atomically decrements @v by 1.
*/
-static inline void arch_atomic64_dec(atomic64_t *v)
+static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
__alternative_atomic64(dec, dec_return, /* no output */,
"S" (v) : "memory", "eax", "ecx", "edx");
@@ -237,7 +237,7 @@ static inline void arch_atomic64_dec(atomic64_t *v)
* Atomically adds @a to @v, so long as it was not @u.
* Returns non-zero if the add was done, zero otherwise.
*/
-static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
+static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
unsigned low = (unsigned)u;
unsigned high = (unsigned)(u >> 32);
@@ -248,7 +248,7 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
}
#define arch_atomic64_add_unless arch_atomic64_add_unless
-static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
+static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v)
{
int r;
alternative_atomic64(inc_not_zero, "=&a" (r),
@@ -257,7 +257,7 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
}
#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
-static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
+static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
{
s64 r;
alternative_atomic64(dec_if_positive, "=&A" (r),
@@ -269,7 +269,7 @@ static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
#undef alternative_atomic64
#undef __alternative_atomic64
-static inline void arch_atomic64_and(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -277,7 +277,7 @@ static inline void arch_atomic64_and(s64 i, atomic64_t *v)
c = old;
}
-static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -288,7 +288,7 @@ static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
-static inline void arch_atomic64_or(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -296,7 +296,7 @@ static inline void arch_atomic64_or(s64 i, atomic64_t *v)
c = old;
}
-static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -307,7 +307,7 @@ static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
-static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -315,7 +315,7 @@ static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
c = old;
}
-static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -326,7 +326,7 @@ static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
-static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
s64 old, c = 0;
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 7886d0578fc9..c496595bf601 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -17,7 +17,7 @@
* Atomically reads the value of @v.
* Doesn't imply a read memory barrier.
*/
-static inline s64 arch_atomic64_read(const atomic64_t *v)
+static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
{
return __READ_ONCE((v)->counter);
}
@@ -29,7 +29,7 @@ static inline s64 arch_atomic64_read(const atomic64_t *v)
*
* Atomically sets the value of @v to @i.
*/
-static inline void arch_atomic64_set(atomic64_t *v, s64 i)
+static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
__WRITE_ONCE(v->counter, i);
}
@@ -55,7 +55,7 @@ static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
*
* Atomically subtracts @i from @v.
*/
-static inline void arch_atomic64_sub(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "subq %1,%0"
: "=m" (v->counter)
@@ -71,7 +71,7 @@ static inline void arch_atomic64_sub(s64 i, atomic64_t *v)
* true if the result is zero, or false for all
* other cases.
*/
-static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
+static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
}
@@ -113,7 +113,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v)
* returns true if the result is 0, or false for all other
* cases.
*/
-static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
+static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v)
{
return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
}
@@ -127,7 +127,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
* and returns true if the result is zero, or false for all
* other cases.
*/
-static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
+static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v)
{
return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
}
@@ -142,7 +142,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
-static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
+static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
}
@@ -161,25 +161,25 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
}
#define arch_atomic64_add_return arch_atomic64_add_return
-static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
{
return arch_atomic64_add_return(-i, v);
}
#define arch_atomic64_sub_return arch_atomic64_sub_return
-static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
return xadd(&v->counter, i);
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
-static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
{
return xadd(&v->counter, -i);
}
#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
-static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
return arch_cmpxchg(&v->counter, old, new);
}
@@ -191,13 +191,13 @@ static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
-static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
+static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
{
return arch_xchg(&v->counter, new);
}
#define arch_atomic64_xchg arch_atomic64_xchg
-static inline void arch_atomic64_and(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "andq %1,%0"
: "+m" (v->counter)
@@ -205,7 +205,7 @@ static inline void arch_atomic64_and(s64 i, atomic64_t *v)
: "memory");
}
-static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
s64 val = arch_atomic64_read(v);
@@ -215,7 +215,7 @@ static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
-static inline void arch_atomic64_or(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "orq %1,%0"
: "+m" (v->counter)
@@ -223,7 +223,7 @@ static inline void arch_atomic64_or(s64 i, atomic64_t *v)
: "memory");
}
-static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
s64 val = arch_atomic64_read(v);
@@ -233,7 +233,7 @@ static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
-static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "xorq %1,%0"
: "+m" (v->counter)
@@ -241,7 +241,7 @@ static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
: "memory");
}
-static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
s64 val = arch_atomic64_read(v);
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 407beebadaf4..4d4a47a3a8ab 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -9,7 +9,6 @@
*/
#include <linux/compiler.h>
-#include <linux/uaccess.h>
#include <asm/byteorder.h>
/**
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 1a85e1fb0922..ce0c8f7d3218 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -32,6 +32,7 @@ enum cpuid_leafs
CPUID_8000_0007_EBX,
CPUID_7_EDX,
CPUID_8000_001F_EAX,
+ CPUID_8000_0021_EAX,
};
#define X86_CAP_FMT_NUM "%d:%d"
@@ -94,8 +95,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \
REQUIRED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 20))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 21))
#define DISABLED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
@@ -118,8 +120,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \
DISABLED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 20))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 21))
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 61012476d66e..73c9672c123b 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 20 /* N 32-bit words worth of info */
+#define NCAPINTS 21 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -97,7 +97,7 @@
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
+/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
@@ -307,11 +307,18 @@
#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */
#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */
#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
+#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
+#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */
+#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */
+#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */
+#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */
+#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
+#define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */
#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */
#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
@@ -426,6 +433,13 @@
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
+/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
+#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */
+#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */
+#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */
+#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */
+
/*
* BUG word(s)
*/
@@ -466,5 +480,6 @@
#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */
#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */
#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
+#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index b049d950612f..66eb5e1ac4fb 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -39,7 +39,20 @@ static __always_inline unsigned long native_get_debugreg(int regno)
asm("mov %%db6, %0" :"=r" (val));
break;
case 7:
- asm("mov %%db7, %0" :"=r" (val));
+ /*
+ * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them
+ * with other code.
+ *
+ * This is needed because a DR7 access can cause a #VC exception
+ * when running under SEV-ES. Taking a #VC exception is not a
+ * safe thing to do just anywhere in the entry code and
+ * re-ordering might place the access into an unsafe location.
+ *
+ * This happened in the NMI handler, where the DR7 read was
+ * re-ordered to happen before the call to sev_es_ist_enter(),
+ * causing stack recursion.
+ */
+ asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER);
break;
default:
BUG();
@@ -66,7 +79,16 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value)
asm("mov %0, %%db6" ::"r" (value));
break;
case 7:
- asm("mov %0, %%db7" ::"r" (value));
+ /*
+ * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them
+ * with other code.
+ *
+ * While is didn't happen with a DR7 write (see the DR7 read
+ * comment above which explains where it happened), add the
+ * __FORCE_ORDER here too to avoid similar problems in the
+ * future.
+ */
+ asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER);
break;
default:
BUG();
@@ -126,9 +148,14 @@ static __always_inline void local_db_restore(unsigned long dr7)
}
#ifdef CONFIG_CPU_SUP_AMD
-extern void set_dr_addr_mask(unsigned long mask, int dr);
+extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr);
+extern unsigned long amd_get_dr_addr_mask(unsigned int dr);
#else
-static inline void set_dr_addr_mask(unsigned long mask, int dr) { }
+static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { }
+static inline unsigned long amd_get_dr_addr_mask(unsigned int dr)
+{
+ return 0;
+}
#endif
#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index c44b56f7ffba..5dfa4fb76f4b 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -124,6 +124,7 @@
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
#define DISABLED_MASK19 0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
+#define DISABLED_MASK20 0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1c66708e3062..d1dac96ee30b 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -4,7 +4,7 @@
extern const struct dma_map_ops *dma_ops;
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
+static inline const struct dma_map_ops *get_arch_dma_ops(void)
{
return dma_ops;
}
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index a63154e049d7..419280d263d2 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -106,6 +106,8 @@ static inline void efi_fpu_end(void)
extern asmlinkage u64 __efi_call(void *fp, ...);
+extern bool efi_disable_ibt_for_runtime;
+
#define efi_call(...) ({ \
__efi_nargs_check(efi_call, 7, __VA_ARGS__); \
__efi_call(__VA_ARGS__); \
@@ -121,7 +123,7 @@ extern asmlinkage u64 __efi_call(void *fp, ...);
#undef arch_efi_call_virt
#define arch_efi_call_virt(p, f, args...) ({ \
- u64 ret, ibt = ibt_save(); \
+ u64 ret, ibt = ibt_save(efi_disable_ibt_for_runtime); \
ret = efi_call((void *)p->f, args); \
ibt_restore(ibt); \
ret; \
@@ -335,6 +337,16 @@ static inline u32 efi64_convert_status(efi_status_t status)
#define __efi64_argmap_open_volume(prot, file) \
((prot), efi64_zero_upper(file))
+/* Memory Attribute Protocol */
+#define __efi64_argmap_get_memory_attributes(protocol, phys, size, flags) \
+ ((protocol), __efi64_split(phys), __efi64_split(size), (flags))
+
+#define __efi64_argmap_set_memory_attributes(protocol, phys, size, flags) \
+ ((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+
+#define __efi64_argmap_clear_memory_attributes(protocol, phys, size, flags) \
+ ((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+
/*
* The macros below handle the plumbing for the argument mapping. To add a
* mapping for a specific EFI method, simply define a macro
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
index b2486b2cbc6e..c2d6cd78ed0c 100644
--- a/arch/x86/include/asm/fpu/sched.h
+++ b/arch/x86/include/asm/fpu/sched.h
@@ -39,7 +39,7 @@ extern void fpu_flush_thread(void);
static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
if (cpu_feature_enabled(X86_FEATURE_FPU) &&
- !(current->flags & PF_KTHREAD)) {
+ !(current->flags & (PF_KTHREAD | PF_IO_WORKER))) {
save_fpregs_to_fpstate(old_fpu);
/*
* The save operation preserved register state, so the
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index eb7cd1139d97..7f6d858ff47a 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -321,7 +321,7 @@ struct xstate_header {
struct xregs_state {
struct fxregs_state i387;
struct xstate_header header;
- u8 extended_state_area[0];
+ u8 extended_state_area[];
} __attribute__ ((packed, aligned (64)));
/*
diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h
index 9656a5bc6fea..9a710c060445 100644
--- a/arch/x86/include/asm/fpu/xcr.h
+++ b/arch/x86/include/asm/fpu/xcr.h
@@ -5,7 +5,7 @@
#define XCR_XFEATURE_ENABLED_MASK 0x00000000
#define XCR_XFEATURE_IN_USE_MASK 0x00000001
-static inline u64 xgetbv(u32 index)
+static __always_inline u64 xgetbv(u32 index)
{
u32 eax, edx;
@@ -27,7 +27,7 @@ static inline void xsetbv(u32 index, u64 value)
*
* Callers should check X86_FEATURE_XGETBV1.
*/
-static inline u64 xfeatures_in_use(void)
+static __always_inline u64 xfeatures_in_use(void)
{
return xgetbv(XCR_XFEATURE_IN_USE_MASK);
}
diff --git a/arch/x86/include/asm/gsseg.h b/arch/x86/include/asm/gsseg.h
new file mode 100644
index 000000000000..ab6a595cea70
--- /dev/null
+++ b/arch/x86/include/asm/gsseg.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_GSSEG_H
+#define _ASM_X86_GSSEG_H
+
+#include <linux/types.h>
+
+#include <asm/asm.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative.h>
+#include <asm/processor.h>
+#include <asm/nops.h>
+
+#ifdef CONFIG_X86_64
+
+extern asmlinkage void asm_load_gs_index(u16 selector);
+
+/* Replace with "lkgs %di" once binutils support LKGS instruction */
+#define LKGS_DI _ASM_BYTES(0xf2,0x0f,0x00,0xf7)
+
+static inline void native_lkgs(unsigned int selector)
+{
+ u16 sel = selector;
+ asm_inline volatile("1: " LKGS_DI
+ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k[sel])
+ : [sel] "+D" (sel));
+}
+
+static inline void native_load_gs_index(unsigned int selector)
+{
+ if (cpu_feature_enabled(X86_FEATURE_LKGS)) {
+ native_lkgs(selector);
+ } else {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ asm_load_gs_index(selector);
+ local_irq_restore(flags);
+ }
+}
+
+#endif /* CONFIG_X86_64 */
+
+static inline void __init lkgs_init(void)
+{
+#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_X86_64
+ if (cpu_feature_enabled(X86_FEATURE_LKGS))
+ pv_ops.cpu.load_gs_index = native_lkgs;
+#endif
+#endif
+}
+
+#ifndef CONFIG_PARAVIRT_XXL
+
+static inline void load_gs_index(unsigned int selector)
+{
+#ifdef CONFIG_X86_64
+ native_load_gs_index(selector);
+#else
+ loadsegment(gs, selector);
+#endif
+}
+
+#endif /* CONFIG_PARAVIRT_XXL */
+
+#endif /* _ASM_X86_GSSEG_H */
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 08e822bd7aa6..0b73a809e9e1 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -116,6 +116,9 @@
/* Recommend using the newer ExProcessorMasks interface */
#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED BIT(11)
+/* Indicates that the hypervisor is nested within a Hyper-V partition. */
+#define HV_X64_HYPERV_NESTED BIT(12)
+
/* Recommend using enlightened VMCS */
#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14)
@@ -225,6 +228,17 @@ enum hv_isolation_type {
#define HV_REGISTER_SINT15 0x4000009F
/*
+ * Define synthetic interrupt controller model specific registers for
+ * nested hypervisor.
+ */
+#define HV_REGISTER_NESTED_SCONTROL 0x40001080
+#define HV_REGISTER_NESTED_SVERSION 0x40001081
+#define HV_REGISTER_NESTED_SIEFP 0x40001082
+#define HV_REGISTER_NESTED_SIMP 0x40001083
+#define HV_REGISTER_NESTED_EOM 0x40001084
+#define HV_REGISTER_NESTED_SINT0 0x40001090
+
+/*
* Synthetic Timer MSRs. Four timers per vcpu.
*/
#define HV_REGISTER_STIMER0_CONFIG 0x400000B0
@@ -255,6 +269,9 @@ enum hv_isolation_type {
/* TSC invariant control */
#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118
+/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */
+#define HV_EXPOSE_INVARIANT_TSC BIT_ULL(0)
+
/* Register name aliases for temporary compatibility */
#define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT
#define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG
@@ -368,7 +385,8 @@ struct hv_nested_enlightenments_control {
__u32 reserved:31;
} features;
struct {
- __u32 reserved;
+ __u32 inter_partition_comm:1;
+ __u32 reserved:31;
} hypercallControls;
} __packed;
diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h
index 9b08082a5d9f..baae6b4fea23 100644
--- a/arch/x86/include/asm/ibt.h
+++ b/arch/x86/include/asm/ibt.h
@@ -74,7 +74,7 @@ static inline bool is_endbr(u32 val)
return val == gen_endbr();
}
-extern __noendbr u64 ibt_save(void);
+extern __noendbr u64 ibt_save(bool disable);
extern __noendbr void ibt_restore(u64 save);
#else /* __ASSEMBLY__ */
@@ -100,7 +100,7 @@ extern __noendbr void ibt_restore(u64 save);
static inline bool is_endbr(u32 val) { return false; }
-static inline u64 ibt_save(void) { return 0; }
+static inline u64 ibt_save(bool disable) { return 0; }
static inline void ibt_restore(u64 save) { }
#else /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 72184b0b2219..b241af4ce9b4 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -582,18 +582,14 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
/* NMI */
-#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+#if IS_ENABLED(CONFIG_KVM_INTEL)
/*
- * Special NOIST entry point for VMX which invokes this on the kernel
- * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
- * 'executing' marker.
- *
- * On 32bit this just uses the regular NMI entry point because 32-bit does
- * not have ISTs.
+ * Special entry point for VMX which invokes this on the kernel stack, even for
+ * 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work
+ * correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well
+ * to avoid more ifdeffery.
*/
-DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
-#else
-#define asm_exc_nmi_noist asm_exc_nmi
+DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx);
#endif
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 347707d459c6..cbaf174d8efd 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -123,6 +123,8 @@
#define INTEL_FAM6_METEORLAKE 0xAC
#define INTEL_FAM6_METEORLAKE_L 0xAA
+#define INTEL_FAM6_LUNARLAKE_M 0xBD
+
/* "Small Core" Processors (Atom/E-Core) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 7793e52d6237..8c5ae649d2df 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -8,9 +8,6 @@
#include <asm/nospec-branch.h>
-/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
-#define __cpuidle __section(".cpuidle.text")
-
/*
* Interrupt control:
*/
@@ -45,13 +42,13 @@ static __always_inline void native_irq_enable(void)
asm volatile("sti": : :"memory");
}
-static inline __cpuidle void native_safe_halt(void)
+static __always_inline void native_safe_halt(void)
{
mds_idle_clear_cpu_buffers();
asm volatile("sti; hlt": : :"memory");
}
-static inline __cpuidle void native_halt(void)
+static __always_inline void native_halt(void)
{
mds_idle_clear_cpu_buffers();
asm volatile("hlt": : :"memory");
@@ -84,7 +81,7 @@ static __always_inline void arch_local_irq_enable(void)
* Used in the idle loop; sti takes one instruction cycle
* to complete:
*/
-static inline __cpuidle void arch_safe_halt(void)
+static __always_inline void arch_safe_halt(void)
{
native_safe_halt();
}
@@ -93,7 +90,7 @@ static inline __cpuidle void arch_safe_halt(void)
* Used when interrupts are already enabled or to
* shutdown the processor:
*/
-static inline __cpuidle void halt(void)
+static __always_inline void halt(void)
{
native_halt();
}
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index abccd51dcfca..8dc345cc6318 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -14,6 +14,7 @@ BUILD_BUG_ON(1)
* to make a definition optional, but in this case the default will
* be __static_call_return0.
*/
+KVM_X86_OP(check_processor_compatibility)
KVM_X86_OP(hardware_enable)
KVM_X86_OP(hardware_disable)
KVM_X86_OP(hardware_unsetup)
@@ -76,7 +77,6 @@ KVM_X86_OP(set_nmi_mask)
KVM_X86_OP(enable_nmi_window)
KVM_X86_OP(enable_irq_window)
KVM_X86_OP_OPTIONAL(update_cr8_intercept)
-KVM_X86_OP(check_apicv_inhibit_reasons)
KVM_X86_OP(refresh_apicv_exec_ctrl)
KVM_X86_OP_OPTIONAL(hwapic_irr_update)
KVM_X86_OP_OPTIONAL(hwapic_isr_update)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6aaae18f1854..808c292ad3f4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -134,8 +134,6 @@
#define INVALID_PAGE (~(hpa_t)0)
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-#define INVALID_GPA (~(gpa_t)0)
-
/* KVM Hugepage definitions for x86 */
#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
#define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1)
@@ -514,6 +512,7 @@ struct kvm_pmc {
#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
#define KVM_PMC_MAX_FIXED 3
+#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
#define KVM_AMD_PMC_MAX_GENERIC 6
struct kvm_pmu {
unsigned nr_arch_gp_counters;
@@ -678,6 +677,11 @@ struct kvm_vcpu_hv {
} nested;
};
+struct kvm_hypervisor_cpuid {
+ u32 base;
+ u32 limit;
+};
+
/* Xen HVM per vcpu emulation context */
struct kvm_vcpu_xen {
u64 hypercall_rip;
@@ -698,6 +702,7 @@ struct kvm_vcpu_xen {
struct hrtimer timer;
int poll_evtchn;
struct timer_list poll_timer;
+ struct kvm_hypervisor_cpuid cpuid;
};
struct kvm_queued_exception {
@@ -826,7 +831,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
- u32 kvm_cpuid_base;
+ struct kvm_hypervisor_cpuid kvm_cpuid;
u64 reserved_gpa_bits;
int maxphyaddr;
@@ -1022,19 +1027,30 @@ struct kvm_arch_memory_slot {
};
/*
- * We use as the mode the number of bits allocated in the LDR for the
- * logical processor ID. It happens that these are all powers of two.
- * This makes it is very easy to detect cases where the APICs are
- * configured for multiple modes; in that case, we cannot use the map and
- * hence cannot use kvm_irq_delivery_to_apic_fast either.
+ * Track the mode of the optimized logical map, as the rules for decoding the
+ * destination vary per mode. Enabling the optimized logical map requires all
+ * software-enabled local APIs to be in the same mode, each addressable APIC to
+ * be mapped to only one MDA, and each MDA to map to at most one APIC.
*/
-#define KVM_APIC_MODE_XAPIC_CLUSTER 4
-#define KVM_APIC_MODE_XAPIC_FLAT 8
-#define KVM_APIC_MODE_X2APIC 16
+enum kvm_apic_logical_mode {
+ /* All local APICs are software disabled. */
+ KVM_APIC_MODE_SW_DISABLED,
+ /* All software enabled local APICs in xAPIC cluster addressing mode. */
+ KVM_APIC_MODE_XAPIC_CLUSTER,
+ /* All software enabled local APICs in xAPIC flat addressing mode. */
+ KVM_APIC_MODE_XAPIC_FLAT,
+ /* All software enabled local APICs in x2APIC mode. */
+ KVM_APIC_MODE_X2APIC,
+ /*
+ * Optimized map disabled, e.g. not all local APICs in the same logical
+ * mode, same logical ID assigned to multiple APICs, etc.
+ */
+ KVM_APIC_MODE_MAP_DISABLED,
+};
struct kvm_apic_map {
struct rcu_head rcu;
- u8 mode;
+ enum kvm_apic_logical_mode logical_mode;
u32 max_apic_id;
union {
struct kvm_lapic *xapic_flat_map[8];
@@ -1088,6 +1104,7 @@ struct kvm_hv {
u64 hv_reenlightenment_control;
u64 hv_tsc_emulation_control;
u64 hv_tsc_emulation_status;
+ u64 hv_invtsc_control;
/* How many vCPUs have VP index != vCPU index */
atomic_t num_mismatched_vp_indexes;
@@ -1133,6 +1150,18 @@ struct kvm_x86_msr_filter {
struct msr_bitmap_range ranges[16];
};
+struct kvm_x86_pmu_event_filter {
+ __u32 action;
+ __u32 nevents;
+ __u32 fixed_counter_bitmap;
+ __u32 flags;
+ __u32 nr_includes;
+ __u32 nr_excludes;
+ __u64 *includes;
+ __u64 *excludes;
+ __u64 events[];
+};
+
enum kvm_apicv_inhibit {
/********************************************************************/
@@ -1164,6 +1193,12 @@ enum kvm_apicv_inhibit {
APICV_INHIBIT_REASON_BLOCKIRQ,
/*
+ * APICv is disabled because not all vCPUs have a 1:1 mapping between
+ * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED,
+
+ /*
* For simplicity, the APIC acceleration is inhibited
* first time either APIC ID or APIC base are changed by the guest
* from their reset values.
@@ -1201,6 +1236,12 @@ enum kvm_apicv_inhibit {
* AVIC is disabled because SEV doesn't support it.
*/
APICV_INHIBIT_REASON_SEV,
+
+ /*
+ * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1
+ * mapping between logical ID and vCPU.
+ */
+ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
};
struct kvm_arch {
@@ -1249,10 +1290,11 @@ struct kvm_arch {
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
- /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
- struct rw_semaphore apicv_update_lock;
-
bool apic_access_memslot_enabled;
+ bool apic_access_memslot_inhibited;
+
+ /* Protects apicv_inhibit_reasons */
+ struct rw_semaphore apicv_update_lock;
unsigned long apicv_inhibit_reasons;
gpa_t wall_clock;
@@ -1302,7 +1344,6 @@ struct kvm_arch {
u32 bsp_vcpu_id;
u64 disabled_quirks;
- int cpu_dirty_logging_count;
enum kvm_irqchip_mode irqchip_mode;
u8 nr_reserved_ioapic_pins;
@@ -1338,25 +1379,16 @@ struct kvm_arch {
/* Guest can access the SGX PROVISIONKEY. */
bool sgx_provisioning_allowed;
- struct kvm_pmu_event_filter __rcu *pmu_event_filter;
+ struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
struct task_struct *nx_huge_page_recovery_thread;
#ifdef CONFIG_X86_64
- /*
- * Whether the TDP MMU is enabled for this VM. This contains a
- * snapshot of the TDP MMU module parameter from when the VM was
- * created and remains unchanged for the life of the VM. If this is
- * true, TDP MMU handler functions will run for various MMU
- * operations.
- */
- bool tdp_mmu_enabled;
-
/* The number of TDP MMU pages across all roots. */
atomic64_t tdp_mmu_pages;
/*
- * List of kvm_mmu_page structs being used as roots.
- * All kvm_mmu_page structs in the list should have
+ * List of struct kvm_mmu_pages being used as roots.
+ * All struct kvm_mmu_pages in the list should have
* tdp_mmu_page set.
*
* For reads, this list is protected by:
@@ -1520,6 +1552,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
struct kvm_x86_ops {
const char *name;
+ int (*check_processor_compatibility)(void);
+
int (*hardware_enable)(void);
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
@@ -1608,6 +1642,8 @@ struct kvm_x86_ops {
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
+ const unsigned long required_apicv_inhibits;
+ bool allow_apicv_in_x2apic_without_x2apic_virtualization;
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(int isr);
@@ -1731,9 +1767,6 @@ struct kvm_x86_nested_ops {
};
struct kvm_x86_init_ops {
- int (*cpu_has_kvm_support)(void);
- int (*disabled_by_bios)(void);
- int (*check_processor_compatibility)(void);
int (*hardware_setup)(void);
unsigned int (*handle_intel_pt_intr)(void);
@@ -1760,6 +1793,9 @@ extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops);
+void kvm_x86_vendor_exit(void);
+
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
@@ -1982,7 +2018,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
bool kvm_apicv_activated(struct kvm *kvm);
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
-void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
+void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set);
void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
@@ -2054,14 +2090,11 @@ enum {
TASK_SWITCH_GATE = 3,
};
-#define HF_GIF_MASK (1 << 0)
-#define HF_NMI_MASK (1 << 3)
-#define HF_IRET_MASK (1 << 4)
-#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+#define HF_GUEST_MASK (1 << 0) /* VCPU is in guest-mode */
#ifdef CONFIG_KVM_SMM
-#define HF_SMM_MASK (1 << 6)
-#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
+#define HF_SMM_MASK (1 << 1)
+#define HF_SMM_INSIDE_NMI_MASK (1 << 2)
# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
# define KVM_ADDRESS_SPACE_NUM 2
diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h
index 6c5765192102..511b35069187 100644
--- a/arch/x86/include/asm/kvmclock.h
+++ b/arch/x86/include/asm/kvmclock.h
@@ -8,7 +8,7 @@ extern struct clocksource kvm_clock;
DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
-static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
+static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
{
return &this_cpu_read(hv_clock_per_cpu)->pvti;
}
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6e986088817d..9646ed6e8c0b 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -88,6 +88,9 @@
#define MCI_MISC_ADDR_MEM 3 /* memory address */
#define MCI_MISC_ADDR_GENERIC 7 /* generic */
+/* MCi_ADDR register defines */
+#define MCI_ADDR_PHYSADDR GENMASK_ULL(boot_cpu_data.x86_phys_bits - 1, 0)
+
/* CTL2 register defines */
#define MCI_CTL2_CMCI_EN BIT_ULL(30)
#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index d5a58bde091c..320566a0443d 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -125,13 +125,13 @@ static inline unsigned int x86_cpuid_family(void)
#ifdef CONFIG_MICROCODE
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
-void reload_early_microcode(void);
+void reload_early_microcode(unsigned int cpu);
extern bool initrd_gone;
void microcode_bsp_resume(void);
#else
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
-static inline void reload_early_microcode(void) { }
+static inline void reload_early_microcode(unsigned int cpu) { }
static inline void microcode_bsp_resume(void) { }
#endif
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index ac31f9140d07..e6662adf3af4 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -47,12 +47,12 @@ struct microcode_amd {
extern void __init load_ucode_amd_bsp(unsigned int family);
extern void load_ucode_amd_ap(unsigned int family);
extern int __init save_microcode_in_initrd_amd(unsigned int family);
-void reload_ucode_amd(void);
+void reload_ucode_amd(unsigned int cpu);
#else
static inline void __init load_ucode_amd_bsp(unsigned int family) {}
static inline void load_ucode_amd_ap(unsigned int family) {}
static inline int __init
save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
-static inline void reload_ucode_amd(void) {}
+static inline void reload_ucode_amd(unsigned int cpu) {}
#endif
#endif /* _ASM_X86_MICROCODE_AMD_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index b8d40ddeab00..e01aa74a6de7 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -12,6 +12,7 @@
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
+#include <asm/gsseg.h>
extern atomic64_t last_mm_ctx_id;
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 6d502f3efb0f..4c4c0ec3b62e 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -72,10 +72,16 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
return hv_status;
}
+/* Hypercall to the L0 hypervisor */
+static inline u64 hv_do_nested_hypercall(u64 control, void *input, void *output)
+{
+ return hv_do_hypercall(control | HV_HYPERCALL_NESTED, input, output);
+}
+
/* Fast hypercall with 8 bytes of input and no output */
-static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
+static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
{
- u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
+ u64 hv_status;
#ifdef CONFIG_X86_64
{
@@ -103,10 +109,24 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
return hv_status;
}
+static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
+{
+ u64 control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+ return _hv_do_fast_hypercall8(control, input1);
+}
+
+static inline u64 hv_do_fast_nested_hypercall8(u16 code, u64 input1)
+{
+ u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED;
+
+ return _hv_do_fast_hypercall8(control, input1);
+}
+
/* Fast hypercall with 16 bytes of input */
-static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
+static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
{
- u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
+ u64 hv_status;
#ifdef CONFIG_X86_64
{
@@ -137,6 +157,20 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
return hv_status;
}
+static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
+{
+ u64 control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+ return _hv_do_fast_hypercall16(control, input1, input2);
+}
+
+static inline u64 hv_do_fast_nested_hypercall16(u16 code, u64 input1, u64 input2)
+{
+ u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED;
+
+ return _hv_do_fast_hypercall16(control, input1, input2);
+}
+
extern struct hv_vp_assist_page **hv_vp_assist_page;
static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
@@ -190,36 +224,20 @@ extern bool hv_isolation_type_snp(void);
static inline bool hv_is_synic_reg(unsigned int reg)
{
- if ((reg >= HV_REGISTER_SCONTROL) &&
- (reg <= HV_REGISTER_SINT15))
- return true;
- return false;
+ return (reg >= HV_REGISTER_SCONTROL) &&
+ (reg <= HV_REGISTER_SINT15);
}
-static inline u64 hv_get_register(unsigned int reg)
+static inline bool hv_is_sint_reg(unsigned int reg)
{
- u64 value;
-
- if (hv_is_synic_reg(reg) && hv_isolation_type_snp())
- hv_ghcb_msr_read(reg, &value);
- else
- rdmsrl(reg, value);
- return value;
+ return (reg >= HV_REGISTER_SINT0) &&
+ (reg <= HV_REGISTER_SINT15);
}
-static inline void hv_set_register(unsigned int reg, u64 value)
-{
- if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) {
- hv_ghcb_msr_write(reg, value);
-
- /* Write proxy bit via wrmsl instruction */
- if (reg >= HV_REGISTER_SINT0 &&
- reg <= HV_REGISTER_SINT15)
- wrmsrl(reg, value | 1 << 20);
- } else {
- wrmsrl(reg, value);
- }
-}
+u64 hv_get_register(unsigned int reg);
+void hv_set_register(unsigned int reg, u64 value);
+u64 hv_get_non_nested_register(unsigned int reg);
+void hv_set_non_nested_register(unsigned int reg, u64 value);
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
@@ -239,6 +257,8 @@ static inline int hyperv_flush_guest_mapping_range(u64 as,
}
static inline void hv_set_register(unsigned int reg, u64 value) { }
static inline u64 hv_get_register(unsigned int reg) { return 0; }
+static inline void hv_set_non_nested_register(unsigned int reg, u64 value) { }
+static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; }
static inline int hv_set_mem_host_visibility(unsigned long addr, int numpages,
bool visible)
{
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 37ff47552bcb..ad35355ee43e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
#define EFER_SCE (1<<_EFER_SCE)
#define EFER_LME (1<<_EFER_LME)
@@ -33,6 +34,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/* Intel MSRs. Some also available on other CPUs */
@@ -49,6 +51,10 @@
#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */
#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT)
+/* A mask for bits which the kernel toggles when controlling mitigations */
+#define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \
+ | SPEC_CTRL_RRSBA_DIS_S)
+
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
@@ -189,6 +195,9 @@
#define MSR_TURBO_RATIO_LIMIT1 0x000001ae
#define MSR_TURBO_RATIO_LIMIT2 0x000001af
+#define MSR_SNOOP_RSP_0 0x00001328
+#define MSR_SNOOP_RSP_1 0x00001329
+
#define MSR_LBR_SELECT 0x000001c8
#define MSR_LBR_TOS 0x000001c9
@@ -566,6 +575,26 @@
#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
#define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
+/* SNP feature bits enabled by the hypervisor */
+#define MSR_AMD64_SNP_VTOM BIT_ULL(3)
+#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4)
+#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5)
+#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6)
+#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7)
+#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8)
+#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9)
+#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10)
+#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11)
+#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12)
+#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14)
+#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16)
+#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17)
+
+/* SNP feature bits reserved for future use. */
+#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13)
+#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15)
+#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18)
+
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
/* AMD Collaborative Processor Performance Control MSRs */
@@ -1061,6 +1090,8 @@
/* - AMD: */
#define MSR_IA32_MBA_BW_BASE 0xc0000200
+#define MSR_IA32_SMBA_BW_BASE 0xc0000280
+#define MSR_IA32_EVT_CFG_BASE 0xc0000400
/* MSR_IA32_VMX_MISC bits */
#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 3a8fdf881313..778df05f8539 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -26,7 +26,7 @@
#define TPAUSE_C01_STATE 1
#define TPAUSE_C02_STATE 0
-static inline void __monitor(const void *eax, unsigned long ecx,
+static __always_inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitor %eax, %ecx, %edx;" */
@@ -34,7 +34,7 @@ static inline void __monitor(const void *eax, unsigned long ecx,
:: "a" (eax), "c" (ecx), "d"(edx));
}
-static inline void __monitorx(const void *eax, unsigned long ecx,
+static __always_inline void __monitorx(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitorx %eax, %ecx, %edx;" */
@@ -42,7 +42,7 @@ static inline void __monitorx(const void *eax, unsigned long ecx,
:: "a" (eax), "c" (ecx), "d"(edx));
}
-static inline void __mwait(unsigned long eax, unsigned long ecx)
+static __always_inline void __mwait(unsigned long eax, unsigned long ecx)
{
mds_idle_clear_cpu_buffers();
@@ -77,8 +77,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
* EAX (logical) address to monitor
* ECX #GP if not zero
*/
-static inline void __mwaitx(unsigned long eax, unsigned long ebx,
- unsigned long ecx)
+static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx,
+ unsigned long ecx)
{
/* No MDS buffer clear as this is AMD/HYGON only */
@@ -87,7 +87,7 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
:: "a" (eax), "b" (ebx), "c" (ecx));
}
-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
mds_idle_clear_cpu_buffers();
/* "mwait %eax, %ecx;" */
@@ -105,7 +105,7 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*/
-static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 771b0a2b7a34..e04313e89f4f 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -564,7 +564,7 @@ static __always_inline void mds_user_clear_cpu_buffers(void)
*
* Clear CPU buffers if the corresponding static key is enabled
*/
-static inline void mds_idle_clear_cpu_buffers(void)
+static __always_inline void mds_idle_clear_cpu_buffers(void)
{
if (static_branch_likely(&mds_idle_clear))
mds_clear_cpu_buffers();
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 9cc82f305f4b..d18e5c332cb9 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -34,9 +34,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
copy_page(to, from);
}
-#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
- alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
+#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
+ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
#ifndef __pa
#define __pa(x) __phys_addr((unsigned long)(x))
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index df42f8aa99e4..580d71aca65a 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -15,10 +15,6 @@ extern unsigned long __phys_addr(unsigned long);
#define __phys_addr_symbol(x) __phys_addr(x)
#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
-#ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn) ((pfn) < max_mapnr)
-#endif /* CONFIG_FLATMEM */
-
#include <linux/string.h>
static inline void clear_page(void *page)
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 198e03e59ca1..cc6b8e087192 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -39,10 +39,6 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#define __phys_reloc_hide(x) (x)
-#ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn) ((pfn) < max_pfn)
-#endif
-
void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 73e9522db7c1..cf40e813b3d7 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -26,7 +26,7 @@ DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);
void paravirt_set_sched_clock(u64 (*func)(void));
-static inline u64 paravirt_sched_clock(void)
+static __always_inline u64 paravirt_sched_clock(void)
{
return static_call(pv_sched_clock)();
}
@@ -168,7 +168,7 @@ static inline void __write_cr4(unsigned long x)
PVOP_VCALL1(cpu.write_cr4, x);
}
-static inline void arch_safe_halt(void)
+static __always_inline void arch_safe_halt(void)
{
PVOP_VCALL0(irq.safe_halt);
}
@@ -178,7 +178,9 @@ static inline void halt(void)
PVOP_VCALL0(irq.halt);
}
-static inline void wbinvd(void)
+extern noinstr void pv_native_wbinvd(void);
+
+static __always_inline void wbinvd(void)
{
PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
}
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 5d0f6891ae61..8fc15ed5e60b 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -160,6 +160,14 @@ union cpuid10_edx {
};
/*
+ * Intel "Architectural Performance Monitoring extension" CPUID
+ * detection/enumeration details:
+ */
+#define ARCH_PERFMON_EXT_LEAF 0x00000023
+#define ARCH_PERFMON_NUM_COUNTER_LEAF_BIT 0x1
+#define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1
+
+/*
* Intel Architectural LBR CPUID detection/enumeration details:
*/
union cpuid28_eax {
@@ -578,7 +586,7 @@ extern void perf_amd_brs_lopwr_cb(bool lopwr_in);
DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
-static inline void perf_lopwr_cb(bool lopwr_in)
+static __always_inline void perf_lopwr_cb(bool lopwr_in)
{
static_call_mod(perf_lopwr_cb)(lopwr_in);
}
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 60d0f9015317..e9482a11ac52 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -80,21 +80,37 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
return ((value >> rightshift) & mask) << leftshift;
}
-/* Encode and de-code a swap entry */
+/*
+ * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
+ * are !pte_none() && !pte_present().
+ *
+ * Format of swap PTEs:
+ *
+ * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * <----------------- offset ------------------> 0 E <- type --> 0
+ *
+ * E is the exclusive marker that is not stored in swap entries.
+ */
#define SWP_TYPE_BITS 5
+#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1)
+#define _SWP_TYPE_SHIFT (_PAGE_BIT_PRESENT + 1)
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
-#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
- & ((1U << SWP_TYPE_BITS) - 1))
+#define __swp_type(x) (((x).val >> _SWP_TYPE_SHIFT) \
+ & _SWP_TYPE_MASK)
#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
#define __swp_entry(type, offset) ((swp_entry_t) { \
- ((type) << (_PAGE_BIT_PRESENT + 1)) \
+ (((type) & _SWP_TYPE_MASK) << _SWP_TYPE_SHIFT) \
| ((offset) << SWP_OFFSET_SHIFT) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+/* We borrow bit 7 to store the exclusive marker in swap PTEs. */
+#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE
+
/* No inverted PFNs on 2 level page tables */
static inline u64 protnone_mask(u64 val)
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 967b135fa2c0..9e7c0b719c3c 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -145,8 +145,24 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
}
#endif
-/* Encode and de-code a swap entry */
+/*
+ * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
+ * are !pte_none() && !pte_present().
+ *
+ * Format of swap PTEs:
+ *
+ * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3
+ * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2
+ * < type -> <---------------------- offset ----------------------
+ *
+ * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * --------------------------------------------> 0 E 0 0 0 0 0 0 0
+ *
+ * E is the exclusive marker that is not stored in swap entries.
+ */
#define SWP_TYPE_BITS 5
+#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1)
#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
@@ -154,9 +170,10 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
-#define __swp_type(x) (((x).val) & ((1UL << SWP_TYPE_BITS) - 1))
+#define __swp_type(x) (((x).val) & _SWP_TYPE_MASK)
#define __swp_offset(x) ((x).val >> SWP_TYPE_BITS)
-#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << SWP_TYPE_BITS})
+#define __swp_entry(type, offset) ((swp_entry_t){((type) & _SWP_TYPE_MASK) \
+ | (offset) << SWP_TYPE_BITS})
/*
* Normally, __swp_entry() converts from arch-independent swp_entry_t to
@@ -184,6 +201,9 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \
__pteval_swp_offset(pte)))
+/* We borrow bit 7 to store the exclusive marker in swap PTEs. */
+#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE
+
#include <asm/pgtable-invert.h>
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0564edd24ffb..7425f32e5293 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -289,6 +289,11 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_RW);
+}
+
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pte_uffd_wp(pte_t pte)
{
@@ -313,7 +318,7 @@ static inline int pte_uffd_wp(pte_t pte)
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_UFFD_WP);
+ return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP));
}
static inline pte_t pte_clear_uffd_wp(pte_t pte)
@@ -332,11 +337,6 @@ static inline pte_t pte_mkold(pte_t pte)
return pte_clear_flags(pte, _PAGE_ACCESSED);
}
-static inline pte_t pte_wrprotect(pte_t pte)
-{
- return pte_clear_flags(pte, _PAGE_RW);
-}
-
static inline pte_t pte_mkexec(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_NX);
@@ -401,6 +401,11 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pmd_uffd_wp(pmd_t pmd)
{
@@ -409,7 +414,7 @@ static inline int pmd_uffd_wp(pmd_t pmd)
static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_UFFD_WP);
+ return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP));
}
static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
@@ -428,11 +433,6 @@ static inline pmd_t pmd_mkclean(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_DIRTY);
}
-static inline pmd_t pmd_wrprotect(pmd_t pmd)
-{
- return pmd_clear_flags(pmd, _PAGE_RW);
-}
-
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
@@ -1299,8 +1299,6 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
}
-#ifdef _PAGE_SWP_EXCLUSIVE
-#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
static inline pte_t pte_swp_mkexclusive(pte_t pte)
{
return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);
@@ -1315,7 +1313,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE);
}
-#endif /* _PAGE_SWP_EXCLUSIVE */
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4e35c66edeb7..8d73004e4cac 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -542,7 +542,6 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
IDLE_POLL};
extern void enable_sep_cpu(void);
-extern int sysenter_setup(void);
/* Defined in head.S */
@@ -697,7 +696,8 @@ bool xen_set_default_idle(void);
#endif
void __noreturn stop_this_cpu(void *dummy);
-void microcode_check(void);
+void microcode_check(struct cpuinfo_x86 *prev_info);
+void store_cpu_caps(struct cpuinfo_x86 *info);
enum l1tf_mitigations {
L1TF_MITIGATION_OFF,
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 19b695ff2c68..0c92db84469d 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -7,6 +7,7 @@
/* some helper functions for xen and kvm pv clock sources */
u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src);
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
void pvclock_set_flags(u8 flags);
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
@@ -39,7 +40,7 @@ bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
* yielding a 64-bit result.
*/
-static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+static __always_inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
{
u64 product;
#ifdef __i386__
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 04c17be9b5fd..bc5b4d788c08 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,6 +25,8 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
+void cpu_emergency_disable_virtualization(void);
+
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_panic_self_stop(struct pt_regs *regs);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index aff774775c67..7ba1726b71c7 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -98,6 +98,7 @@
#define REQUIRED_MASK17 0
#define REQUIRED_MASK18 0
#define REQUIRED_MASK19 0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
+#define REQUIRED_MASK20 0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index c390a672d560..794f69625780 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -96,7 +96,7 @@
*
* 26 - ESPFIX small SS
* 27 - per-cpu [ offset to per-cpu data area ]
- * 28 - unused
+ * 28 - VDSO getcpu
* 29 - unused
* 30 - unused
* 31 - TSS for double fault handler
@@ -119,6 +119,7 @@
#define GDT_ENTRY_ESPFIX_SS 26
#define GDT_ENTRY_PERCPU 27
+#define GDT_ENTRY_CPUNODE 28
#define GDT_ENTRY_DOUBLEFAULT_TSS 31
@@ -159,6 +160,8 @@
# define __KERNEL_PERCPU 0
#endif
+#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3)
+
#else /* 64-bit: */
#include <asm/cache.h>
@@ -226,8 +229,6 @@
#define GDT_ENTRY_TLS_ENTRIES 3
#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
-#ifdef CONFIG_X86_64
-
/* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */
#define VDSO_CPUNODE_BITS 12
#define VDSO_CPUNODE_MASK 0xfff
@@ -265,7 +266,6 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
}
#endif /* !__ASSEMBLY__ */
-#endif /* CONFIG_X86_64 */
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/shared/io.h b/arch/x86/include/asm/shared/io.h
index c0ef921c0586..8009d781c2f9 100644
--- a/arch/x86/include/asm/shared/io.h
+++ b/arch/x86/include/asm/shared/io.h
@@ -5,13 +5,13 @@
#include <linux/types.h>
#define BUILDIO(bwl, bw, type) \
-static inline void __out##bwl(type value, u16 port) \
+static __always_inline void __out##bwl(type value, u16 port) \
{ \
asm volatile("out" #bwl " %" #bw "0, %w1" \
: : "a"(value), "Nd"(port)); \
} \
\
-static inline type __in##bwl(u16 port) \
+static __always_inline type __in##bwl(u16 port) \
{ \
type value; \
asm volatile("in" #bwl " %w1, %" #bw "0" \
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index e53f26228fbb..4a03993e0785 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -8,7 +8,6 @@
#define TDX_HYPERCALL_STANDARD 0
#define TDX_HCALL_HAS_OUTPUT BIT(0)
-#define TDX_HCALL_ISSUE_STI BIT(1)
#define TDX_CPUID_LEAF_ID 0x21
#define TDX_IDENT "IntelTDX "
@@ -22,12 +21,18 @@
* This is a software only structure and not part of the TDX module/VMM ABI.
*/
struct tdx_hypercall_args {
+ u64 r8;
+ u64 r9;
u64 r10;
u64 r11;
u64 r12;
u64 r13;
u64 r14;
u64 r15;
+ u64 rdi;
+ u64 rsi;
+ u64 rbx;
+ u64 rdx;
};
/* Used to request services from the VMM */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 35f709f619fb..de48d1389936 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -115,22 +115,11 @@ static inline void wrpkru(u32 pkru)
}
#endif
-static inline void native_wbinvd(void)
+static __always_inline void native_wbinvd(void)
{
asm volatile("wbinvd": : :"memory");
}
-extern asmlinkage void asm_load_gs_index(unsigned int selector);
-
-static inline void native_load_gs_index(unsigned int selector)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- asm_load_gs_index(selector);
- local_irq_restore(flags);
-}
-
static inline unsigned long __read_cr4(void)
{
return native_read_cr4();
@@ -179,24 +168,14 @@ static inline void __write_cr4(unsigned long x)
native_write_cr4(x);
}
-static inline void wbinvd(void)
+static __always_inline void wbinvd(void)
{
native_wbinvd();
}
-
-static inline void load_gs_index(unsigned int selector)
-{
-#ifdef CONFIG_X86_64
- native_load_gs_index(selector);
-#else
- loadsegment(gs, selector);
-#endif
-}
-
#endif /* CONFIG_PARAVIRT_XXL */
-static inline void clflush(volatile void *__p)
+static __always_inline void clflush(volatile void *__p)
{
asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
}
@@ -295,7 +274,7 @@ static inline int enqcmds(void __iomem *dst, const void *src)
return 0;
}
-static inline void tile_release(void)
+static __always_inline void tile_release(void)
{
/*
* Instruction opcode for TILERELEASE; supported in binutils
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index f4b87f08f5c5..29832c338cdc 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -184,6 +184,37 @@ void int3_emulate_ret(struct pt_regs *regs)
unsigned long ip = int3_emulate_pop(regs);
int3_emulate_jmp(regs, ip);
}
+
+static __always_inline
+void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp)
+{
+ static const unsigned long jcc_mask[6] = {
+ [0] = X86_EFLAGS_OF,
+ [1] = X86_EFLAGS_CF,
+ [2] = X86_EFLAGS_ZF,
+ [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
+ [4] = X86_EFLAGS_SF,
+ [5] = X86_EFLAGS_PF,
+ };
+
+ bool invert = cc & 1;
+ bool match;
+
+ if (cc < 0xc) {
+ match = regs->flags & jcc_mask[cc >> 1];
+ } else {
+ match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
+ ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
+ if (cc >= 0xe)
+ match = match || (regs->flags & X86_EFLAGS_ZF);
+ }
+
+ if ((match && !invert) || (!match && invert))
+ ip += disp;
+
+ int3_emulate_jmp(regs, ip);
+}
+
#endif /* !CONFIG_UML_X86 */
#endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f0cb881c1d69..f1cccba52eb9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -163,7 +163,12 @@ struct thread_info {
* GOOD_FRAME if within a frame
* BAD_STACK if placed across a frame boundary (or outside stack)
* NOT_STACK unable to determine (no frame pointers, etc)
+ *
+ * This function reads pointers from the stack and dereferences them. The
+ * pointers may not have their KMSAN shadow set up properly, which may result
+ * in false positive reports. Disable instrumentation to avoid those.
*/
+__no_kmsan_checks
static inline int arch_within_stack_frames(const void * const stack,
const void * const stackend,
const void *obj, unsigned long len)
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 8ac563abb567..a53961c64a56 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -8,6 +8,7 @@
extern void hpet_time_init(void);
extern void time_init(void);
extern bool pit_timer_init(void);
+extern bool tsc_clocksource_watchdog_disabled(void);
extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 2963a2f5dbc4..d7f6592b74a9 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -45,7 +45,7 @@ extern const struct vdso_image vdso_image_x32;
extern const struct vdso_image vdso_image_32;
#endif
-extern void __init init_vdso_image(const struct vdso_image *image);
+extern int __init init_vdso_image(const struct vdso_image *image);
extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index 1936f21ed8cd..4cf6794f9d68 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -318,6 +318,8 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
}
#define vdso_calc_delta vdso_calc_delta
+int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts);
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h
index 57b1a7034c64..2cbce97d29ea 100644
--- a/arch/x86/include/asm/vdso/processor.h
+++ b/arch/x86/include/asm/vdso/processor.h
@@ -18,6 +18,10 @@ static __always_inline void cpu_relax(void)
rep_nop();
}
+struct getcpu_cache;
+
+notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
+
#endif /* __ASSEMBLY__ */
#endif /* __ASM_VDSO_PROCESSOR_H */
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 8757078d4442..3b12e6b99412 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -126,7 +126,21 @@ static inline void cpu_svm_disable(void)
wrmsrl(MSR_VM_HSAVE_PA, 0);
rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ if (efer & EFER_SVME) {
+ /*
+ * Force GIF=1 prior to disabling SVM to ensure INIT and NMI
+ * aren't blocked, e.g. if a fatal error occurred between CLGI
+ * and STGI. Note, STGI may #UD if SVM is disabled from NMI
+ * context between reading EFER and executing STGI. In that
+ * case, GIF must already be set, otherwise the NMI would have
+ * been blocked, so just eat the fault.
+ */
+ asm_volatile_goto("1: stgi\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "memory" : fault);
+fault:
+ wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ }
}
/** Makes sure SVM is disabled, if it is supported on the CPU
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index e5e0fe10c692..a2dd24947eb8 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -382,7 +382,7 @@ MULTI_stack_switch(struct multicall_entry *mcl,
}
#endif
-static inline int
+static __always_inline int
HYPERVISOR_sched_op(int cmd, void *arg)
{
return _hypercall2(int, sched_op, cmd, arg);
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 16f548a661cf..5fc35f889cd1 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -38,9 +38,11 @@ extern struct start_info *xen_start_info;
#include <asm/processor.h>
+#define XEN_SIGNATURE "XenVMMXenVMM"
+
static inline uint32_t xen_cpuid_base(void)
{
- return hypervisor_cpuid_base("XenVMMXenVMM", 2);
+ return hypervisor_cpuid_base(XEN_SIGNATURE, 2);
}
struct pci_dev;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index e48deab8901d..7f467fe05d42 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/ioctl.h>
+#include <linux/stddef.h>
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
@@ -507,8 +508,8 @@ struct kvm_nested_state {
* KVM_{GET,PUT}_NESTED_STATE ioctl values.
*/
union {
- struct kvm_vmx_nested_state_data vmx[0];
- struct kvm_svm_nested_state_data svm[0];
+ __DECLARE_FLEX_ARRAY(struct kvm_vmx_nested_state_data, vmx);
+ __DECLARE_FLEX_ARRAY(struct kvm_svm_nested_state_data, svm);
} data;
};
@@ -525,6 +526,35 @@ struct kvm_pmu_event_filter {
#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1
+#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0)
+#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
+
+/*
+ * Masked event layout.
+ * Bits Description
+ * ---- -----------
+ * 7:0 event select (low bits)
+ * 15:8 umask match
+ * 31:16 unused
+ * 35:32 event select (high bits)
+ * 36:54 unused
+ * 55 exclude bit
+ * 63:56 umask mask
+ */
+
+#define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \
+ (((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \
+ (((mask) & 0xFFULL) << 56) | \
+ (((match) & 0xFFULL) << 8) | \
+ ((__u64)(!!(exclude)) << 55))
+
+#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \
+ (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8))
+#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56)
+
/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index f69c168391aa..80e1df482337 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -116,6 +116,12 @@
#define SVM_VMGEXIT_AP_CREATE 1
#define SVM_VMGEXIT_AP_DESTROY 2
#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
+#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe
+#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \
+ /* SW_EXITINFO1[3:0] */ \
+ (((((u64)reason_set) & 0xf)) | \
+ /* SW_EXITINFO1[11:4] */ \
+ ((((u64)reason_code) & 0xff) << 4))
#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
/* Exit code reserved for hypervisor/software use */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 96d51bbc2bd4..dd61752f4c96 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -45,7 +45,6 @@ obj-y += head$(BITS).o
obj-y += ebda.o
obj-y += platform-quirks.o
obj-y += process_$(BITS).o signal.o signal_$(BITS).o
-obj-$(CONFIG_COMPAT) += signal_compat.o
obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o dumpstack.o nmi.o
obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 907cc98b1938..1c38174b5f01 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -188,6 +188,17 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
return cpu;
}
+static bool __init acpi_is_processor_usable(u32 lapic_flags)
+{
+ if (lapic_flags & ACPI_MADT_ENABLED)
+ return true;
+
+ if (acpi_support_online_capable && (lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
+ return true;
+
+ return false;
+}
+
static int __init
acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
{
@@ -212,6 +223,10 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
if (apic_id == 0xffffffff)
return 0;
+ /* don't register processors that cannot be onlined */
+ if (!acpi_is_processor_usable(processor->lapic_flags))
+ return 0;
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
@@ -250,9 +265,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
return 0;
/* don't register processors that can not be onlined */
- if (acpi_support_online_capable &&
- !(processor->lapic_flags & ACPI_MADT_ENABLED) &&
- !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
+ if (!acpi_is_processor_usable(processor->lapic_flags))
return 0;
/*
@@ -1840,23 +1853,23 @@ early_param("acpi_sci", setup_acpi_sci);
int __acpi_acquire_global_lock(unsigned int *lock)
{
- unsigned int old, new, val;
+ unsigned int old, new;
+
+ old = READ_ONCE(*lock);
do {
- old = *lock;
new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
- val = cmpxchg(lock, old, new);
- } while (unlikely (val != old));
+ } while (!try_cmpxchg(lock, &old, new));
return ((new & 0x3) < 3) ? -1 : 0;
}
int __acpi_release_global_lock(unsigned int *lock)
{
- unsigned int old, new, val;
+ unsigned int old, new;
+
+ old = READ_ONCE(*lock);
do {
- old = *lock;
new = old & ~0x3;
- val = cmpxchg(lock, old, new);
- } while (unlikely (val != old));
+ } while (!try_cmpxchg(lock, &old, new));
return old & 0x1;
}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7d8c3cbde368..f615e0cb6d93 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -282,27 +282,25 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
*/
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
- /* Mask away "NOT" flag bit for feature to test. */
- u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->instrlen > sizeof(insn_buff));
- BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
+ BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
/*
* Patch if either:
* - feature is present
- * - feature not present but ALTINSTR_FLAG_INV is set to mean,
+ * - feature not present but ALT_FLAG_NOT is set to mean,
* patch if feature is *NOT* present.
*/
- if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
+ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT))
goto next;
DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
- (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
- feature >> 5,
- feature & 0x1f,
+ (a->flags & ALT_FLAG_NOT) ? "!" : "",
+ a->cpuid >> 5,
+ a->cpuid & 0x1f,
instr, instr, a->instrlen,
replacement, a->replacementlen);
@@ -340,6 +338,12 @@ next:
}
}
+static inline bool is_jcc32(struct insn *insn)
+{
+ /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+ return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
+}
+
#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
/*
@@ -378,12 +382,6 @@ static int emit_indirect(int op, int reg, u8 *bytes)
return i;
}
-static inline bool is_jcc32(struct insn *insn)
-{
- /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
- return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
-}
-
static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
{
u8 op = insn->opcode.bytes[0];
@@ -1772,6 +1770,11 @@ void text_poke_sync(void)
on_each_cpu(do_sync_core, NULL, 1);
}
+/*
+ * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
+ * this thing. When len == 6 everything is prefixed with 0x0f and we map
+ * opcode to Jcc.d8, using len to distinguish.
+ */
struct text_poke_loc {
/* addr := _stext + rel_addr */
s32 rel_addr;
@@ -1893,6 +1896,10 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
int3_emulate_jmp(regs, (long)ip + tp->disp);
break;
+ case 0x70 ... 0x7f: /* Jcc */
+ int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
+ break;
+
default:
BUG();
}
@@ -1966,16 +1973,26 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* Second step: update all but the first byte of the patched range.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
- u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
+ u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
+ u8 _new[POKE_MAX_OPCODE_SIZE+1];
+ const u8 *new = tp[i].text;
int len = tp[i].len;
if (len - INT3_INSN_SIZE > 0) {
memcpy(old + INT3_INSN_SIZE,
text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
+
+ if (len == 6) {
+ _new[0] = 0x0f;
+ memcpy(_new + 1, new, 5);
+ new = _new;
+ }
+
text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
- (const char *)tp[i].text + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
+
do_sync++;
}
@@ -2003,8 +2020,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* The old instruction is recorded so that the event can be
* processed forwards or backwards.
*/
- perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
- tp[i].text, len);
+ perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
}
if (do_sync) {
@@ -2021,10 +2037,15 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* replacing opcode.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
- if (tp[i].text[0] == INT3_INSN_OPCODE)
+ u8 byte = tp[i].text[0];
+
+ if (tp[i].len == 6)
+ byte = 0x0f;
+
+ if (byte == INT3_INSN_OPCODE)
continue;
- text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
+ text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
do_sync++;
}
@@ -2042,9 +2063,11 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
const void *opcode, size_t len, const void *emulate)
{
struct insn insn;
- int ret, i;
+ int ret, i = 0;
- memcpy((void *)tp->text, opcode, len);
+ if (len == 6)
+ i = 1;
+ memcpy((void *)tp->text, opcode+i, len-i);
if (!emulate)
emulate = opcode;
@@ -2055,6 +2078,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
tp->len = len;
tp->opcode = insn.opcode.bytes[0];
+ if (is_jcc32(&insn)) {
+ /*
+ * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
+ */
+ tp->opcode = insn.opcode.bytes[1] - 0x10;
+ }
+
switch (tp->opcode) {
case RET_INSN_OPCODE:
case JMP32_INSN_OPCODE:
@@ -2071,7 +2101,6 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
BUG_ON(len != insn.length);
}
-
switch (tp->opcode) {
case INT3_INSN_OPCODE:
case RET_INSN_OPCODE:
@@ -2080,6 +2109,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case CALL_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
+ case 0x70 ... 0x7f: /* Jcc */
tp->disp = insn.immediate.value;
break;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a868b76cd3d4..1f83b052bb74 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2364,9 +2364,8 @@ static int mp_irqdomain_create(int ioapic)
return -ENODEV;
}
- ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
- (void *)(long)ioapic);
-
+ ip->irqdomain = irq_domain_create_hierarchy(parent, 0, hwirqs, fn, cfg->ops,
+ (void *)(long)ioapic);
if (!ip->irqdomain) {
/* Release fw handle if it was allocated above */
if (!cfg->dev)
@@ -2374,8 +2373,6 @@ static int mp_irqdomain_create(int ioapic)
return -ENOMEM;
}
- ip->irqdomain->parent = parent;
-
if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
cfg->type == IOAPIC_DOMAIN_STRICT)
ioapic_dynirq_base = max(ioapic_dynirq_base,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 60e330cdbd17..c6c15ce1952f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call)
apm_irq_save(flags);
firmware_restrict_branch_speculation_start();
- ibt = ibt_save();
+ ibt = ibt_save(true);
APM_DO_SAVE_SEGS;
apm_bios_call_asm(call->func, call->ebx, call->ecx,
&call->eax, &call->ebx, &call->ecx, &call->edx,
@@ -690,7 +690,7 @@ static long __apm_bios_call_simple(void *_call)
apm_irq_save(flags);
firmware_restrict_branch_speculation_start();
- ibt = ibt_save();
+ ibt = ibt_save(true);
APM_DO_SAVE_SEGS;
error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
&call->eax);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 82c783da16a8..283dcd2f62c8 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -7,6 +7,7 @@
#define COMPILE_OFFSETS
#include <linux/crypto.h>
+#include <crypto/aria.h>
#include <linux/sched.h>
#include <linux/stddef.h>
#include <linux/hardirq.h>
@@ -75,12 +76,18 @@ static void __used common(void)
OFFSET(TDX_MODULE_r11, tdx_module_output, r11);
BLANK();
+ OFFSET(TDX_HYPERCALL_r8, tdx_hypercall_args, r8);
+ OFFSET(TDX_HYPERCALL_r9, tdx_hypercall_args, r9);
OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10);
OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11);
OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12);
OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13);
OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14);
OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15);
+ OFFSET(TDX_HYPERCALL_rdi, tdx_hypercall_args, rdi);
+ OFFSET(TDX_HYPERCALL_rsi, tdx_hypercall_args, rsi);
+ OFFSET(TDX_HYPERCALL_rbx, tdx_hypercall_args, rbx);
+ OFFSET(TDX_HYPERCALL_rdx, tdx_hypercall_args, rdx);
BLANK();
OFFSET(BP_scratch, boot_params, scratch);
@@ -111,5 +118,12 @@ static void __used common(void)
#ifdef CONFIG_CALL_DEPTH_TRACKING
OFFSET(X86_call_depth, pcpu_hot, call_depth);
#endif
+#if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64)
+ /* Offset for fields in aria_ctx */
+ BLANK();
+ OFFSET(ARIA_CTX_enc_key, aria_ctx, enc_key);
+ OFFSET(ARIA_CTX_dec_key, aria_ctx, dec_key);
+ OFFSET(ARIA_CTX_rounds, aria_ctx, rounds);
+#endif
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f769d6d08b43..380753b14cab 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -956,7 +956,7 @@ static void init_amd(struct cpuinfo_x86 *c)
init_amd_cacheinfo(c);
- if (cpu_has(c, X86_FEATURE_XMM2)) {
+ if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) {
/*
* Use LFENCE for execution serialization. On families which
* don't have that MSR, LFENCE is already serializing.
@@ -1158,24 +1158,43 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
return false;
}
-void set_dr_addr_mask(unsigned long mask, int dr)
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask);
+
+static unsigned int amd_msr_dr_addr_masks[] = {
+ MSR_F16H_DR0_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK + 1,
+ MSR_F16H_DR1_ADDR_MASK + 2
+};
+
+void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr)
{
- if (!boot_cpu_has(X86_FEATURE_BPEXT))
+ int cpu = smp_processor_id();
+
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
return;
- switch (dr) {
- case 0:
- wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0);
- break;
- case 1:
- case 2:
- case 3:
- wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0);
- break;
- default:
- break;
- }
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return;
+
+ if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask)
+ return;
+
+ wrmsr(amd_msr_dr_addr_masks[dr], mask, 0);
+ per_cpu(amd_dr_addr_mask, cpu)[dr] = mask;
+}
+
+unsigned long amd_get_dr_addr_mask(unsigned int dr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
+ return 0;
+
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return 0;
+
+ return per_cpu(amd_dr_addr_mask[dr], smp_processor_id());
}
+EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
u32 amd_get_highest_perf(void)
{
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 1f60a2b27936..fdbb5f07448f 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -330,7 +330,16 @@ static void __init bp_init_freq_invariance(void)
static void disable_freq_invariance_workfn(struct work_struct *work)
{
+ int cpu;
+
static_branch_disable(&arch_scale_freq_key);
+
+ /*
+ * Set arch_freq_scale to a default value on all cpus
+ * This negates the effect of scaling
+ */
+ for_each_possible_cpu(cpu)
+ per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}
static DECLARE_WORK(disable_freq_invariance_work,
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bca0bd8f4846..cf81848b72f4 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -33,6 +33,7 @@
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
+#include <asm/cpu.h>
#include "cpu.h"
@@ -86,7 +87,7 @@ void update_spec_ctrl_cond(u64 val)
wrmsrl(MSR_IA32_SPEC_CTRL, val);
}
-u64 spec_ctrl_current(void)
+noinstr u64 spec_ctrl_current(void)
{
return this_cpu_read(x86_spec_ctrl_current);
}
@@ -144,9 +145,17 @@ void __init check_bugs(void)
* have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
* init code as it is not enumerated and depends on the family.
*/
- if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ /*
+ * Previously running kernel (kexec), may have some controls
+ * turned ON. Clear them and let the mitigations setup below
+ * rediscover them based on configuration.
+ */
+ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
+ }
+
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
@@ -1229,9 +1238,9 @@ static const char * const spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
[SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
[SPECTRE_V2_LFENCE] = "Mitigation: LFENCE",
- [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS",
- [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE",
- [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
[SPECTRE_V2_IBRS] = "Mitigation: IBRS",
};
@@ -1300,7 +1309,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
!boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n",
+ pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
@@ -1486,8 +1495,12 @@ static void __init spectre_v2_select_mitigation(void)
pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
if (spectre_v2_in_ibrs_mode(mode)) {
- x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
- update_spec_ctrl(x86_spec_ctrl_base);
+ if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) {
+ msr_set_bit(MSR_EFER, _EFER_AUTOIBRS);
+ } else {
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
}
switch (mode) {
@@ -1571,8 +1584,8 @@ static void __init spectre_v2_select_mitigation(void)
/*
* Retpoline protects the kernel, but doesn't protect firmware. IBRS
* and Enhanced IBRS protect firmware too, so enable IBRS around
- * firmware calls only when IBRS / Enhanced IBRS aren't otherwise
- * enabled.
+ * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't
+ * otherwise enabled.
*
* Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
* the user might select retpoline on the kernel command line and if
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index f4e5aa27eec6..4063e8991211 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -734,7 +734,7 @@ void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
void init_intel_cacheinfo(struct cpuinfo_x86 *c)
{
/* Cache sizes */
- unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
+ unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0;
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
@@ -835,9 +835,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
case LVL_3:
l3 += cache_table[k].size;
break;
- case LVL_TRACE:
- trace += cache_table[k].size;
- break;
}
break;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9cfca3d7d0e2..8cd4126d8253 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -567,17 +567,18 @@ static __init int setup_disable_pku(char *arg)
return 1;
}
__setup("nopku", setup_disable_pku);
-#endif /* CONFIG_X86_64 */
+#endif
#ifdef CONFIG_X86_KERNEL_IBT
-__noendbr u64 ibt_save(void)
+__noendbr u64 ibt_save(bool disable)
{
u64 msr = 0;
if (cpu_feature_enabled(X86_FEATURE_IBT)) {
rdmsrl(MSR_IA32_S_CET, msr);
- wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
+ if (disable)
+ wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
}
return msr;
@@ -1093,6 +1094,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x8000001f)
c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
+ if (c->extended_cpuid_level >= 0x80000021)
+ c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021);
+
init_scattered_cpuid_features(c);
init_speculation_control(c);
@@ -1226,8 +1230,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
/* Zhaoxin Family 7 */
VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO),
@@ -1256,6 +1260,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define MMIO_SBDS BIT(2)
/* CPU is affected by RETbleed, speculating where you would not expect it */
#define RETBLEED BIT(3)
+/* CPU is affected by SMT (cross-thread) return predictions */
+#define SMT_RSB BIT(4)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
@@ -1287,8 +1293,8 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
- VULNBL_AMD(0x17, RETBLEED),
- VULNBL_HYGON(0x18, RETBLEED),
+ VULNBL_AMD(0x17, RETBLEED | SMT_RSB),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB),
{}
};
@@ -1338,8 +1344,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
- if (ia32_cap & ARCH_CAP_IBRS_ALL)
+ /*
+ * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature
+ * flag and protect from vendor-specific bugs via the whitelist.
+ */
+ if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) {
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
+ if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
+ !(ia32_cap & ARCH_CAP_PBRSB_NO))
+ setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
+ }
if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
!(ia32_cap & ARCH_CAP_MDS_NO)) {
@@ -1401,10 +1415,8 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_RETBLEED);
}
- if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) &&
- !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
- !(ia32_cap & ARCH_CAP_PBRSB_NO))
- setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
+ if (cpu_matches(cpu_vuln_blacklist, SMT_RSB))
+ setup_force_cpu_bug(X86_BUG_SMT_RSB);
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -1682,9 +1694,7 @@ void check_null_seg_clears_base(struct cpuinfo_x86 *c)
if (!IS_ENABLED(CONFIG_X86_64))
return;
- /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
- if (c->extended_cpuid_level >= 0x80000021 &&
- cpuid_eax(0x80000021) & BIT(6))
+ if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE))
return;
/*
@@ -1953,13 +1963,13 @@ void __init identify_boot_cpu(void)
if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
pr_info("CET detected: Indirect Branch Tracking enabled\n");
#ifdef CONFIG_X86_32
- sysenter_setup();
enable_sep_cpu();
#endif
cpu_detect_tlb(&boot_cpu_data);
setup_cr_pinning();
tsx_init();
+ lkgs_init();
}
void identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -2125,7 +2135,6 @@ static void wait_for_master_cpu(int cpu)
#endif
}
-#ifdef CONFIG_X86_64
static inline void setup_getcpu(int cpu)
{
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
@@ -2147,6 +2156,7 @@ static inline void setup_getcpu(int cpu)
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
}
+#ifdef CONFIG_X86_64
static inline void ucode_cpu_init(int cpu)
{
if (cpu)
@@ -2166,8 +2176,6 @@ static inline void tss_setup_ist(struct tss_struct *tss)
#else /* CONFIG_X86_64 */
-static inline void setup_getcpu(int cpu) { }
-
static inline void ucode_cpu_init(int cpu)
{
show_ucode_info_early();
@@ -2297,30 +2305,45 @@ void cpu_init_secondary(void)
#endif
#ifdef CONFIG_MICROCODE_LATE_LOADING
-/*
+/**
+ * store_cpu_caps() - Store a snapshot of CPU capabilities
+ * @curr_info: Pointer where to store it
+ *
+ * Returns: None
+ */
+void store_cpu_caps(struct cpuinfo_x86 *curr_info)
+{
+ /* Reload CPUID max function as it might've changed. */
+ curr_info->cpuid_level = cpuid_eax(0);
+
+ /* Copy all capability leafs and pick up the synthetic ones. */
+ memcpy(&curr_info->x86_capability, &boot_cpu_data.x86_capability,
+ sizeof(curr_info->x86_capability));
+
+ /* Get the hardware CPUID leafs */
+ get_cpu_cap(curr_info);
+}
+
+/**
+ * microcode_check() - Check if any CPU capabilities changed after an update.
+ * @prev_info: CPU capabilities stored before an update.
+ *
* The microcode loader calls this upon late microcode load to recheck features,
* only when microcode has been updated. Caller holds microcode_mutex and CPU
* hotplug lock.
+ *
+ * Return: None
*/
-void microcode_check(void)
+void microcode_check(struct cpuinfo_x86 *prev_info)
{
- struct cpuinfo_x86 info;
+ struct cpuinfo_x86 curr_info;
perf_check_microcode();
- /* Reload CPUID max function as it might've changed. */
- info.cpuid_level = cpuid_eax(0);
-
- /*
- * Copy all capability leafs to pick up the synthetic ones so that
- * memcmp() below doesn't fail on that. The ones coming from CPUID will
- * get overwritten in get_cpu_cap().
- */
- memcpy(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability));
-
- get_cpu_cap(&info);
+ store_cpu_caps(&curr_info);
- if (!memcmp(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability)))
+ if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability,
+ sizeof(prev_info->x86_capability)))
return;
pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 7c9b5893c30a..57a5349e6954 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -83,6 +83,4 @@ unsigned int aperfmperf_get_khz(int cpu);
extern void x86_spec_ctrl_setup_ap(void);
extern void update_srbds_msr(void);
-extern u64 x86_read_arch_cap_msr(void);
-
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index d95221117129..f6748c8bd647 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -68,6 +68,8 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 10fb5b5c9efa..23c5072fbbb7 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -306,6 +306,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
if ((low & BIT(5)) && !((high >> 5) & 0x3))
high |= BIT(5);
+ this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8));
+
wrmsr(smca_config, low, high);
}
@@ -736,15 +738,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
if (m.status & MCI_STATUS_ADDRV) {
m.addr = addr;
- /*
- * Extract [55:<lsb>] where lsb is the least significant
- * *valid* bit of the address bits.
- */
- if (mce_flags.smca) {
- u8 lsb = (m.addr >> 56) & 0x3f;
-
- m.addr &= GENMASK_ULL(55, lsb);
- }
+ smca_extract_err_addr(&m);
}
if (mce_flags.smca) {
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2c8ec5c71712..7832a69d170e 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -67,13 +67,7 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
-struct mce_bank {
- u64 ctl; /* subevents to enable */
-
- __u64 init : 1, /* initialise bank? */
- __reserved_1 : 63;
-};
-static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
#define ATTR_LEN 16
/* One object for each MCE bank, shared by all CPUs */
@@ -579,7 +573,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
mce->severity != MCE_DEFERRED_SEVERITY)
return NOTIFY_DONE;
- pfn = mce->addr >> PAGE_SHIFT;
+ pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
set_mce_nospec(pfn);
mce->kflags |= MCE_HANDLED_UC;
@@ -633,15 +627,7 @@ static noinstr void mce_read_aux(struct mce *m, int i)
m->addr <<= shift;
}
- /*
- * Extract [55:<lsb>] where lsb is the least significant
- * *valid* bit of the address bits.
- */
- if (mce_flags.smca) {
- u8 lsb = (m->addr >> 56) & 0x3f;
-
- m->addr &= GENMASK_ULL(55, lsb);
- }
+ smca_extract_err_addr(m);
}
if (mce_flags.smca) {
@@ -1308,6 +1294,7 @@ static void kill_me_maybe(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
int flags = MF_ACTION_REQUIRED;
+ unsigned long pfn;
int ret;
p->mce_count = 0;
@@ -1316,9 +1303,10 @@ static void kill_me_maybe(struct callback_head *cb)
if (!p->mce_ripv)
flags |= MF_MUST_KILL;
- ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ ret = memory_failure(pfn, flags);
if (!ret) {
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ set_mce_nospec(pfn);
sync_core();
return;
}
@@ -1340,11 +1328,13 @@ static void kill_me_maybe(struct callback_head *cb)
static void kill_me_never(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ unsigned long pfn;
p->mce_count = 0;
pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
- if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
}
static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index 100fbeebdc72..a05ac0716ecf 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -105,8 +105,7 @@ static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
{
char *p;
- strncpy(mce_helper, buf, sizeof(mce_helper));
- mce_helper[sizeof(mce_helper)-1] = 0;
+ strscpy(mce_helper, buf, sizeof(mce_helper));
p = strchr(mce_helper, '\n');
if (p)
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 7e03f5b7f6bd..91a415553c27 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -177,6 +177,24 @@ struct mce_vendor_flags {
extern struct mce_vendor_flags mce_flags;
+struct mce_bank {
+ /* subevents to enable */
+ u64 ctl;
+
+ /* initialise bank? */
+ __u64 init : 1,
+
+ /*
+ * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: When set, this bit indicates
+ * the LSB field is found in MCA_STATUS and not in MCA_ADDR.
+ */
+ lsb_in_status : 1,
+
+ __reserved_1 : 62;
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
enum mca_msr {
MCA_CTL,
MCA_STATUS,
@@ -189,8 +207,34 @@ extern bool filter_mce(struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
+
+/*
+ * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
+ * [56:0] of MCA_STATUS, else in bits [55:0] of MCA_ADDR.
+ */
+static __always_inline void smca_extract_err_addr(struct mce *m)
+{
+ u8 lsb;
+
+ if (!mce_flags.smca)
+ return;
+
+ if (this_cpu_ptr(mce_banks_array)[m->bank].lsb_in_status) {
+ lsb = (m->status >> 24) & 0x3f;
+
+ m->addr &= GENMASK_ULL(56, lsb);
+
+ return;
+ }
+
+ lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+}
+
#else
static inline bool amd_filter_mce(struct mce *m) { return false; }
+static inline void smca_extract_err_addr(struct mce *m) { }
#endif
#ifdef CONFIG_X86_ANCIENT_MCE
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 56471f750762..9eb457b10341 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -55,7 +55,9 @@ struct cont_desc {
};
static u32 ucode_new_rev;
-static u8 amd_ucode_patch[PATCH_MAX_SIZE];
+
+/* One blob per node. */
+static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE];
/*
* Microcode patch container file is prepended to the initrd in cpio
@@ -330,8 +332,9 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size, true);
if (ret < 0) {
/*
- * Patch verification failed, skip to the next
- * container, if there's one:
+ * Patch verification failed, skip to the next container, if
+ * there is one. Before exit, check whether that container has
+ * found a patch already. If so, use it.
*/
goto out;
} else if (ret > 0) {
@@ -350,6 +353,7 @@ skip:
size -= patch_size + SECTION_HDR_SIZE;
}
+out:
/*
* If we have found a patch (desc->mc), it means we're looking at the
* container which has a patch for this CPU so return 0 to mean, @ucode
@@ -364,7 +368,6 @@ skip:
return 0;
}
-out:
return orig_size - size;
}
@@ -414,8 +417,7 @@ static int __apply_microcode_amd(struct microcode_amd *mc)
*
* Returns true if container found (sets @desc), false otherwise.
*/
-static bool
-apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch)
+static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch)
{
struct cont_desc desc = { 0 };
u8 (*patch)[PATCH_MAX_SIZE];
@@ -428,7 +430,7 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p
patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
#else
new_rev = &ucode_new_rev;
- patch = &amd_ucode_patch;
+ patch = &amd_ucode_patch[0];
#endif
desc.cpuid_1_eax = cpuid_1_eax;
@@ -481,7 +483,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
return false;
}
-static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret)
{
struct ucode_cpu_info *uci;
struct cpio_data cp;
@@ -511,11 +513,11 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
{
struct cpio_data cp = { };
- __load_ucode_amd(cpuid_1_eax, &cp);
+ find_blobs_in_containers(cpuid_1_eax, &cp);
if (!(cp.data && cp.size))
return;
- apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true);
+ early_apply_microcode(cpuid_1_eax, cp.data, cp.size, true);
}
void load_ucode_amd_ap(unsigned int cpuid_1_eax)
@@ -546,15 +548,14 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax)
}
}
- __load_ucode_amd(cpuid_1_eax, &cp);
+ find_blobs_in_containers(cpuid_1_eax, &cp);
if (!(cp.data && cp.size))
return;
- apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false);
+ early_apply_microcode(cpuid_1_eax, cp.data, cp.size, false);
}
-static enum ucode_state
-load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
{
@@ -572,19 +573,19 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
if (!desc.mc)
return -EINVAL;
- ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
+ ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
if (ret > UCODE_UPDATED)
return -EINVAL;
return 0;
}
-void reload_ucode_amd(void)
+void reload_ucode_amd(unsigned int cpu)
{
- struct microcode_amd *mc;
u32 rev, dummy __always_unused;
+ struct microcode_amd *mc;
- mc = (struct microcode_amd *)amd_ucode_patch;
+ mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)];
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
@@ -816,6 +817,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
return 0;
}
+/* Scan the blob in @data and add microcode patches to the cache. */
static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
size_t size)
{
@@ -850,9 +852,10 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
return UCODE_OK;
}
-static enum ucode_state
-load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
{
+ struct cpuinfo_x86 *c;
+ unsigned int nid, cpu;
struct ucode_patch *p;
enum ucode_state ret;
@@ -865,22 +868,22 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
return ret;
}
- p = find_patch(0);
- if (!p) {
- return ret;
- } else {
- if (boot_cpu_data.microcode >= p->patch_id)
- return ret;
+ for_each_node(nid) {
+ cpu = cpumask_first(cpumask_of_node(nid));
+ c = &cpu_data(cpu);
- ret = UCODE_NEW;
- }
+ p = find_patch(cpu);
+ if (!p)
+ continue;
- /* save BSP's matching patch for early load */
- if (!save)
- return ret;
+ if (c->microcode >= p->patch_id)
+ continue;
- memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
- memcpy(amd_ucode_patch, p->data, min_t(u32, p->size, PATCH_MAX_SIZE));
+ ret = UCODE_NEW;
+
+ memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE);
+ memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, p->size, PATCH_MAX_SIZE));
+ }
return ret;
}
@@ -905,14 +908,9 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
- bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
- /* reload ucode container only on the boot cpu */
- if (!bsp)
- return UCODE_OK;
-
if (c->x86 >= 0x15)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
@@ -925,7 +923,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
if (!verify_container(fw->data, fw->size, false))
goto fw_release;
- ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
+ ret = load_microcode_amd(c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 712aafff96e0..7a329e561354 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -298,7 +298,7 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
#endif
}
-void reload_early_microcode(void)
+void reload_early_microcode(unsigned int cpu)
{
int vendor, family;
@@ -312,7 +312,7 @@ void reload_early_microcode(void)
break;
case X86_VENDOR_AMD:
if (family >= 0x10)
- reload_ucode_amd();
+ reload_ucode_amd(cpu);
break;
default:
break;
@@ -409,10 +409,10 @@ static int __reload_late(void *info)
goto wait_for_siblings;
if (err >= UCODE_NFOUND) {
- if (err == UCODE_ERROR)
+ if (err == UCODE_ERROR) {
pr_warn("Error reloading microcode on CPU %d\n", cpu);
-
- ret = -1;
+ ret = -1;
+ }
}
wait_for_siblings:
@@ -438,6 +438,7 @@ wait_for_siblings:
static int microcode_reload_late(void)
{
int old = boot_cpu_data.microcode, ret;
+ struct cpuinfo_x86 prev_info;
pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n");
pr_err("You should switch to early loading, if possible.\n");
@@ -445,12 +446,21 @@ static int microcode_reload_late(void)
atomic_set(&late_cpus_in, 0);
atomic_set(&late_cpus_out, 0);
- ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
- if (ret == 0)
- microcode_check();
+ /*
+ * Take a snapshot before the microcode update in order to compare and
+ * check whether any bits changed after an update.
+ */
+ store_cpu_caps(&prev_info);
- pr_info("Reload completed, microcode revision: 0x%x -> 0x%x\n",
- old, boot_cpu_data.microcode);
+ ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
+ if (!ret) {
+ pr_info("Reload succeeded, microcode revision: 0x%x -> 0x%x\n",
+ old, boot_cpu_data.microcode);
+ microcode_check(&prev_info);
+ } else {
+ pr_info("Reload failed, current microcode revision: 0x%x\n",
+ boot_cpu_data.microcode);
+ }
return ret;
}
@@ -465,11 +475,8 @@ static ssize_t reload_store(struct device *dev,
ssize_t ret = 0;
ret = kstrtoul(buf, 0, &val);
- if (ret)
- return ret;
-
- if (val != 1)
- return size;
+ if (ret || val != 1)
+ return -EINVAL;
cpus_read_lock();
@@ -507,7 +514,7 @@ static ssize_t version_show(struct device *dev,
return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
}
-static ssize_t pf_show(struct device *dev,
+static ssize_t processor_flags_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
@@ -515,8 +522,8 @@ static ssize_t pf_show(struct device *dev,
return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
}
-static DEVICE_ATTR(version, 0444, version_show, NULL);
-static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL);
+static DEVICE_ATTR_RO(version);
+static DEVICE_ATTR_RO(processor_flags);
static struct attribute *mc_default_attrs[] = {
&dev_attr_version.attr,
@@ -557,7 +564,7 @@ void microcode_bsp_resume(void)
if (uci->mc)
microcode_ops->apply_microcode(cpu);
else
- reload_early_microcode();
+ reload_early_microcode(cpu);
}
static struct syscore_ops mc_syscore_ops = {
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index cac2bdb57f0b..467cf37ea90a 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -305,14 +305,11 @@ static bool load_builtin_intel_microcode(struct cpio_data *cp)
return false;
}
-/*
- * Print ucode update info.
- */
-static void
-print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
+static void print_ucode_info(int old_rev, int new_rev, unsigned int date)
{
- pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
- uci->cpu_sig.rev,
+ pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n",
+ old_rev,
+ new_rev,
date & 0xffff,
date >> 24,
(date >> 16) & 0xff);
@@ -322,6 +319,7 @@ print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
static int delay_ucode_info;
static int current_mc_date;
+static int early_old_rev;
/*
* Print early updated ucode info after printk works. This is delayed info dump.
@@ -332,7 +330,7 @@ void show_ucode_info_early(void)
if (delay_ucode_info) {
intel_cpu_collect_info(&uci);
- print_ucode_info(&uci, current_mc_date);
+ print_ucode_info(early_old_rev, uci.cpu_sig.rev, current_mc_date);
delay_ucode_info = 0;
}
}
@@ -341,40 +339,32 @@ void show_ucode_info_early(void)
* At this point, we can not call printk() yet. Delay printing microcode info in
* show_ucode_info_early() until printk() works.
*/
-static void print_ucode(struct ucode_cpu_info *uci)
+static void print_ucode(int old_rev, int new_rev, int date)
{
- struct microcode_intel *mc;
int *delay_ucode_info_p;
int *current_mc_date_p;
-
- mc = uci->mc;
- if (!mc)
- return;
+ int *early_old_rev_p;
delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
+ early_old_rev_p = (int *)__pa_nodebug(&early_old_rev);
*delay_ucode_info_p = 1;
- *current_mc_date_p = mc->hdr.date;
+ *current_mc_date_p = date;
+ *early_old_rev_p = old_rev;
}
#else
-static inline void print_ucode(struct ucode_cpu_info *uci)
+static inline void print_ucode(int old_rev, int new_rev, int date)
{
- struct microcode_intel *mc;
-
- mc = uci->mc;
- if (!mc)
- return;
-
- print_ucode_info(uci, mc->hdr.date);
+ print_ucode_info(old_rev, new_rev, date);
}
#endif
static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
{
struct microcode_intel *mc;
- u32 rev;
+ u32 rev, old_rev;
mc = uci->mc;
if (!mc)
@@ -391,6 +381,8 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
return UCODE_OK;
}
+ old_rev = rev;
+
/*
* Writeback and invalidate caches before updating microcode to avoid
* internal issues depending on what the microcode is updating.
@@ -407,9 +399,9 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
uci->cpu_sig.rev = rev;
if (early)
- print_ucode(uci);
+ print_ucode(old_rev, uci->cpu_sig.rev, mc->hdr.date);
else
- print_ucode_info(uci, mc->hdr.date);
+ print_ucode_info(old_rev, uci->cpu_sig.rev, mc->hdr.date);
return 0;
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 46668e255421..f36dc2f796c5 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -37,9 +37,76 @@
/* Is Linux running as the root partition? */
bool hv_root_partition;
+/* Is Linux running on nested Microsoft Hypervisor */
+bool hv_nested;
struct ms_hyperv_info ms_hyperv;
#if IS_ENABLED(CONFIG_HYPERV)
+static inline unsigned int hv_get_nested_reg(unsigned int reg)
+{
+ if (hv_is_sint_reg(reg))
+ return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0;
+
+ switch (reg) {
+ case HV_REGISTER_SIMP:
+ return HV_REGISTER_NESTED_SIMP;
+ case HV_REGISTER_SIEFP:
+ return HV_REGISTER_NESTED_SIEFP;
+ case HV_REGISTER_SVERSION:
+ return HV_REGISTER_NESTED_SVERSION;
+ case HV_REGISTER_SCONTROL:
+ return HV_REGISTER_NESTED_SCONTROL;
+ case HV_REGISTER_EOM:
+ return HV_REGISTER_NESTED_EOM;
+ default:
+ return reg;
+ }
+}
+
+u64 hv_get_non_nested_register(unsigned int reg)
+{
+ u64 value;
+
+ if (hv_is_synic_reg(reg) && hv_isolation_type_snp())
+ hv_ghcb_msr_read(reg, &value);
+ else
+ rdmsrl(reg, value);
+ return value;
+}
+EXPORT_SYMBOL_GPL(hv_get_non_nested_register);
+
+void hv_set_non_nested_register(unsigned int reg, u64 value)
+{
+ if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) {
+ hv_ghcb_msr_write(reg, value);
+
+ /* Write proxy bit via wrmsl instruction */
+ if (hv_is_sint_reg(reg))
+ wrmsrl(reg, value | 1 << 20);
+ } else {
+ wrmsrl(reg, value);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_set_non_nested_register);
+
+u64 hv_get_register(unsigned int reg)
+{
+ if (hv_nested)
+ reg = hv_get_nested_reg(reg);
+
+ return hv_get_non_nested_register(reg);
+}
+EXPORT_SYMBOL_GPL(hv_get_register);
+
+void hv_set_register(unsigned int reg, u64 value)
+{
+ if (hv_nested)
+ reg = hv_get_nested_reg(reg);
+
+ hv_set_non_nested_register(reg, value);
+}
+EXPORT_SYMBOL_GPL(hv_set_register);
+
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
@@ -301,6 +368,11 @@ static void __init ms_hyperv_init_platform(void)
pr_info("Hyper-V: running as root partition\n");
}
+ if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
+ hv_nested = true;
+ pr_info("Hyper-V: running on a nested hypervisor\n");
+ }
+
/*
* Extract host information.
*/
@@ -388,7 +460,7 @@ static void __init ms_hyperv_init_platform(void)
* setting of this MSR bit should happen before init_intel()
* is called.
*/
- wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
+ wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
}
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index c98e52ff5f20..030d3b409768 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -100,6 +100,18 @@ struct rdt_hw_resource rdt_resources_all[] = {
.fflags = RFTYPE_RES_MB,
},
},
+ [RDT_RESOURCE_SMBA] =
+ {
+ .r_resctrl = {
+ .rid = RDT_RESOURCE_SMBA,
+ .name = "SMBA",
+ .cache_level = 3,
+ .domains = domain_init(RDT_RESOURCE_SMBA),
+ .parse_ctrlval = parse_bw,
+ .format_str = "%d=%*u",
+ .fflags = RFTYPE_RES_MB,
+ },
+ },
};
/*
@@ -150,6 +162,13 @@ bool is_mba_sc(struct rdt_resource *r)
if (!r)
return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc;
+ /*
+ * The software controller support is only applicable to MBA resource.
+ * Make sure to check for resource type.
+ */
+ if (r->rid != RDT_RESOURCE_MBA)
+ return false;
+
return r->membw.mba_sc;
}
@@ -213,9 +232,15 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_3_eax eax;
union cpuid_0x10_x_edx edx;
- u32 ebx, ecx;
+ u32 ebx, ecx, subleaf;
+
+ /*
+ * Query CPUID_Fn80000020_EDX_x01 for MBA and
+ * CPUID_Fn80000020_EDX_x02 for SMBA
+ */
+ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
- cpuid_count(0x80000020, 1, &eax.full, &ebx, &ecx, &edx.full);
+ cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
r->default_ctrl = MAX_MBA_BW_AMD;
@@ -647,6 +672,8 @@ enum {
RDT_FLAG_L2_CAT,
RDT_FLAG_L2_CDP,
RDT_FLAG_MBA,
+ RDT_FLAG_SMBA,
+ RDT_FLAG_BMEC,
};
#define RDT_OPT(idx, n, f) \
@@ -670,6 +697,8 @@ static struct rdt_options rdt_options[] __initdata = {
RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+ RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA),
+ RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC),
};
#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
@@ -699,7 +728,7 @@ static int __init set_rdt_options(char *str)
}
__setup("rdt", set_rdt_options);
-static bool __init rdt_cpu_has(int flag)
+bool __init rdt_cpu_has(int flag)
{
bool ret = boot_cpu_has(flag);
struct rdt_options *o;
@@ -734,6 +763,19 @@ static __init bool get_mem_config(void)
return false;
}
+static __init bool get_slow_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_SMBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
static __init bool get_rdt_alloc_resources(void)
{
struct rdt_resource *r;
@@ -764,6 +806,9 @@ static __init bool get_rdt_alloc_resources(void)
if (get_mem_config())
ret = true;
+ if (get_slow_mem_config())
+ ret = true;
+
return ret;
}
@@ -853,6 +898,9 @@ static __init void rdt_init_res_defs_amd(void)
} else if (r->rid == RDT_RESOURCE_MBA) {
hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
hw_res->msr_update = mba_wrmsr_amd;
+ } else if (r->rid == RDT_RESOURCE_SMBA) {
+ hw_res->msr_base = MSR_IA32_SMBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
}
}
}
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 1df0e3262bca..eb07d4435391 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -209,7 +209,7 @@ static int parse_line(char *line, struct resctrl_schema *s,
unsigned long dom_id;
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
- r->rid == RDT_RESOURCE_MBA) {
+ (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
return -EINVAL;
}
@@ -310,7 +310,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
enum resctrl_conf_type t;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
- int cpu;
u32 idx;
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
@@ -341,13 +340,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
if (cpumask_empty(cpu_mask))
goto done;
- cpu = get_cpu();
- /* Update resource control msr on this CPU if it's in cpu_mask. */
- if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_ctrl_update(&msr_param);
- /* Update resource control msr on other CPUs. */
- smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
- put_cpu();
+
+ /* Update resource control msr on all the CPUs. */
+ on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
done:
free_cpumask_var(cpu_mask);
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 5ebd28e6aa0c..8edecc5763d8 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -30,6 +30,29 @@
*/
#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
+/* Reads to Local DRAM Memory */
+#define READS_TO_LOCAL_MEM BIT(0)
+
+/* Reads to Remote DRAM Memory */
+#define READS_TO_REMOTE_MEM BIT(1)
+
+/* Non-Temporal Writes to Local Memory */
+#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2)
+
+/* Non-Temporal Writes to Remote Memory */
+#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3)
+
+/* Reads to Local Memory the system identifies as "Slow Memory" */
+#define READS_TO_LOCAL_S_MEM BIT(4)
+
+/* Reads to Remote Memory the system identifies as "Slow Memory" */
+#define READS_TO_REMOTE_S_MEM BIT(5)
+
+/* Dirty Victims to All Types of Memory */
+#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6)
+
+/* Max event bits supported */
+#define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
struct rdt_fs_context {
struct kernfs_fs_context kfc;
@@ -52,11 +75,13 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
* struct mon_evt - Entry in the event list of a resource
* @evtid: event id
* @name: name of the event
+ * @configurable: true if the event is configurable
* @list: entry in &rdt_resource->evt_list
*/
struct mon_evt {
enum resctrl_event_id evtid;
char *name;
+ bool configurable;
struct list_head list;
};
@@ -409,6 +434,7 @@ enum resctrl_res_level {
RDT_RESOURCE_L3,
RDT_RESOURCE_L2,
RDT_RESOURCE_MBA,
+ RDT_RESOURCE_SMBA,
/* Must be the last */
RDT_NUM_RESOURCES,
@@ -511,6 +537,7 @@ void closid_free(int closid);
int alloc_rmid(void);
void free_rmid(u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r);
+bool __init rdt_cpu_has(int flag);
void mon_event_count(void *info);
int rdtgroup_mondata_show(struct seq_file *m, void *arg);
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
@@ -527,5 +554,6 @@ bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
void __check_limbo(struct rdt_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
void __init thread_throttle_mode_init(void);
+void __init mbm_config_rftype_init(const char *config);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 77538abeb72a..7fe51488e136 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -204,6 +204,23 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
}
}
+/*
+ * Assumes that hardware counters are also reset and thus that there is
+ * no need to record initial non-zero counts.
+ */
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d)
+{
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+
+ if (is_mbm_total_enabled())
+ memset(hw_dom->arch_mbm_total, 0,
+ sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);
+
+ if (is_mbm_local_enabled())
+ memset(hw_dom->arch_mbm_local, 0,
+ sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
+}
+
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
u64 shift = 64 - width, chunks;
@@ -763,7 +780,7 @@ static void l3_mon_evt_init(struct rdt_resource *r)
list_add_tail(&mbm_local_event.list, &r->evt_list);
}
-int rdt_get_mon_l3_config(struct rdt_resource *r)
+int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
@@ -800,6 +817,17 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
if (ret)
return ret;
+ if (rdt_cpu_has(X86_FEATURE_BMEC)) {
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
+ mbm_total_event.configurable = true;
+ mbm_config_rftype_init("mbm_total_bytes_config");
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
+ mbm_local_event.configurable = true;
+ mbm_config_rftype_init("mbm_local_bytes_config");
+ }
+ }
+
l3_mon_evt_init(r);
r->mon_capable = true;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 5993da21d822..e2c1599d1b37 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -325,12 +325,7 @@ static void update_cpu_closid_rmid(void *info)
static void
update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
{
- int cpu = get_cpu();
-
- if (cpumask_test_cpu(cpu, cpu_mask))
- update_cpu_closid_rmid(r);
- smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
- put_cpu();
+ on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1);
}
static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
@@ -1003,8 +998,11 @@ static int rdt_mon_features_show(struct kernfs_open_file *of,
struct rdt_resource *r = of->kn->parent->priv;
struct mon_evt *mevt;
- list_for_each_entry(mevt, &r->evt_list, list)
+ list_for_each_entry(mevt, &r->evt_list, list) {
seq_printf(seq, "%s\n", mevt->name);
+ if (mevt->configurable)
+ seq_printf(seq, "%s_config\n", mevt->name);
+ }
return 0;
}
@@ -1215,7 +1213,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
- if (r->rid == RDT_RESOURCE_MBA)
+ if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
continue;
has_cache = true;
list_for_each_entry(d, &r->domains, list) {
@@ -1404,7 +1402,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
ctrl = resctrl_arch_get_config(r, d,
closid,
type);
- if (r->rid == RDT_RESOURCE_MBA)
+ if (r->rid == RDT_RESOURCE_MBA ||
+ r->rid == RDT_RESOURCE_SMBA)
size = ctrl;
else
size = rdtgroup_cbm_to_size(r, d, ctrl);
@@ -1421,6 +1420,248 @@ out:
return ret;
}
+struct mon_config_info {
+ u32 evtid;
+ u32 mon_config;
+};
+
+#define INVALID_CONFIG_INDEX UINT_MAX
+
+/**
+ * mon_event_config_index_get - get the hardware index for the
+ * configurable event
+ * @evtid: event id.
+ *
+ * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
+ * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
+ * INVALID_CONFIG_INDEX for invalid evtid
+ */
+static inline unsigned int mon_event_config_index_get(u32 evtid)
+{
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return 0;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return 1;
+ default:
+ /* Should never reach here */
+ return INVALID_CONFIG_INDEX;
+ }
+}
+
+static void mon_event_config_read(void *info)
+{
+ struct mon_config_info *mon_info = info;
+ unsigned int index;
+ u64 msrval;
+
+ index = mon_event_config_index_get(mon_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ return;
+ }
+ rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval);
+
+ /* Report only the valid event configuration bits */
+ mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
+}
+
+static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info)
+{
+ smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1);
+}
+
+static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
+{
+ struct mon_config_info mon_info = {0};
+ struct rdt_domain *dom;
+ bool sep = false;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ list_for_each_entry(dom, &r->domains, list) {
+ if (sep)
+ seq_puts(s, ";");
+
+ memset(&mon_info, 0, sizeof(struct mon_config_info));
+ mon_info.evtid = evtid;
+ mondata_config_read(dom, &mon_info);
+
+ seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config);
+ sep = true;
+ }
+ seq_puts(s, "\n");
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+ return 0;
+}
+
+static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
+
+ return 0;
+}
+
+static void mon_event_config_write(void *info)
+{
+ struct mon_config_info *mon_info = info;
+ unsigned int index;
+
+ index = mon_event_config_index_get(mon_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ return;
+ }
+ wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
+}
+
+static int mbm_config_write_domain(struct rdt_resource *r,
+ struct rdt_domain *d, u32 evtid, u32 val)
+{
+ struct mon_config_info mon_info = {0};
+ int ret = 0;
+
+ /* mon_config cannot be more than the supported set of events */
+ if (val > MAX_EVT_CONFIG_BITS) {
+ rdt_last_cmd_puts("Invalid event configuration\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Read the current config value first. If both are the same then
+ * no need to write it again.
+ */
+ mon_info.evtid = evtid;
+ mondata_config_read(d, &mon_info);
+ if (mon_info.mon_config == val)
+ goto out;
+
+ mon_info.mon_config = val;
+
+ /*
+ * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
+ * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
+ * are scoped at the domain level. Writing any of these MSRs
+ * on one CPU is observed by all the CPUs in the domain.
+ */
+ smp_call_function_any(&d->cpu_mask, mon_event_config_write,
+ &mon_info, 1);
+
+ /*
+ * When an Event Configuration is changed, the bandwidth counters
+ * for all RMIDs and Events will be cleared by the hardware. The
+ * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
+ * every RMID on the next read to any event for every RMID.
+ * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
+ * cleared while it is tracked by the hardware. Clear the
+ * mbm_local and mbm_total counts for all the RMIDs.
+ */
+ resctrl_arch_reset_rmid_all(r, d);
+
+out:
+ return ret;
+}
+
+static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
+{
+ char *dom_str = NULL, *id_str;
+ unsigned long dom_id, val;
+ struct rdt_domain *d;
+ int ret = 0;
+
+next:
+ if (!tok || tok[0] == '\0')
+ return 0;
+
+ /* Start processing the strings for each domain */
+ dom_str = strim(strsep(&tok, ";"));
+ id_str = strsep(&dom_str, "=");
+
+ if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
+ rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
+ return -EINVAL;
+ }
+
+ if (!dom_str || kstrtoul(dom_str, 16, &val)) {
+ rdt_last_cmd_puts("Non-numeric event configuration value\n");
+ return -EINVAL;
+ }
+
+ list_for_each_entry(d, &r->domains, list) {
+ if (d->id == dom_id) {
+ ret = mbm_config_write_domain(r, d, evtid, val);
+ if (ret)
+ return -EINVAL;
+ goto next;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ int ret;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ buf[nbytes - 1] = '\0';
+
+ ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret ?: nbytes;
+}
+
+static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ int ret;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ buf[nbytes - 1] = '\0';
+
+ ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret ?: nbytes;
+}
+
/* rdtgroup information files for one cache resource. */
static struct rftype res_common_files[] = {
{
@@ -1520,6 +1761,20 @@ static struct rftype res_common_files[] = {
.fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
},
{
+ .name = "mbm_total_bytes_config",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = mbm_total_bytes_config_show,
+ .write = mbm_total_bytes_config_write,
+ },
+ {
+ .name = "mbm_local_bytes_config",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = mbm_local_bytes_config_show,
+ .write = mbm_local_bytes_config_write,
+ },
+ {
.name = "cpus",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -1625,6 +1880,15 @@ void __init thread_throttle_mode_init(void)
rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB;
}
+void __init mbm_config_rftype_init(const char *config)
+{
+ struct rftype *rft;
+
+ rft = rdtgroup_get_rftype_by_name(config);
+ if (rft)
+ rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE;
+}
+
/**
* rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
* @r: The resource group with which the file is associated.
@@ -1866,13 +2130,9 @@ static int set_cache_qos_cfg(int level, bool enable)
/* Pick one CPU from each domain instance to update MSR */
cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
}
- cpu = get_cpu();
- /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
- if (cpumask_test_cpu(cpu, cpu_mask))
- update(&enable);
- /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
- smp_call_function_many(cpu_mask, update, &enable, 1);
- put_cpu();
+
+ /* Update QOS_CFG MSR on all the CPUs in cpu_mask */
+ on_each_cpu_mask(cpu_mask, update, &enable, 1);
free_cpumask_var(cpu_mask);
@@ -2349,7 +2609,7 @@ static int reset_all_ctrls(struct rdt_resource *r)
struct msr_param msr_param;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
- int i, cpu;
+ int i;
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
@@ -2370,13 +2630,9 @@ static int reset_all_ctrls(struct rdt_resource *r)
for (i = 0; i < hw_res->num_closid; i++)
hw_dom->ctrl_val[i] = r->default_ctrl;
}
- cpu = get_cpu();
- /* Update CBM on this cpu if it's in cpu_mask. */
- if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_ctrl_update(&msr_param);
- /* Update CBM on all other cpus in cpu_mask. */
- smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
- put_cpu();
+
+ /* Update CBM on all the CPUs in cpu_mask */
+ on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
free_cpumask_var(cpu_mask);
@@ -2855,7 +3111,8 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
- if (r->rid == RDT_RESOURCE_MBA) {
+ if (r->rid == RDT_RESOURCE_MBA ||
+ r->rid == RDT_RESOURCE_SMBA) {
rdtgroup_init_mba(r, rdtgrp->closid);
if (is_mba_sc(r))
continue;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index f53944fb8f7f..0dad49a09b7a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -45,6 +45,8 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
+ { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
{ X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
{ 0, 0, 0, 0, 0 }
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index aa9b8b868867..262f5fb18d74 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -95,7 +95,7 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
return ret;
vma->vm_ops = &sgx_vm_ops;
- vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
+ vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO);
vma->vm_private_data = encl;
return 0;
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 6a77a14eee38..c3e37eaec8ec 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -105,7 +105,7 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_ops = &sgx_vepc_vm_ops;
/* Don't copy VMA in fork() */
- vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+ vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
vma->vm_private_data = vepc;
return 0;
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 8009c8346d8f..b31ee4f1657a 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -11,6 +11,7 @@
#include <linux/cpufeature.h>
#include <asm/cmdline.h>
+#include <asm/cpu.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 02039ec3597d..11f83d07925e 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -143,7 +143,7 @@ static __init int parse_no_stealacc(char *arg)
}
early_param("no-steal-acc", parse_no_stealacc);
-static unsigned long long notrace vmware_sched_clock(void)
+static noinstr u64 vmware_sched_clock(void)
{
unsigned long long ns;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 305514431f26..cdd92ab43cda 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,7 +37,6 @@
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
-#include <asm/virtext.h>
#include <asm/intel_pt.h>
#include <asm/crash.h>
#include <asm/cmdline.h>
@@ -81,15 +80,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Disable VMX or SVM if needed.
- *
- * We need to disable virtualization on all CPUs.
- * Having VMX or SVM enabled on any CPU may break rebooting
- * after the kdump kernel has finished its task.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-
/*
* Disable Intel PT to stop its logging
*/
@@ -148,12 +138,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Booting kdump kernel with VMX or SVM enabled won't work,
- * because (among other limitations) we can't disable paging
- * with the virt flags.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_disable_virtualization();
/*
* Disable Intel PT to stop its logging
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9dac24680ff8..fb8cf953380d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -53,7 +53,7 @@
*
* Once the E820 map has been converted to the standard Linux memory layout
* information its role stops - modifying it has no effect and does not get
- * re-propagated. So itsmain role is a temporary bootstrap storage of firmware
+ * re-propagated. So its main role is a temporary bootstrap storage of firmware
* specific memory layout data during early bootup.
*/
static struct e820_table e820_table_init __initdata;
@@ -395,7 +395,7 @@ int __init e820__update_table(struct e820_table *table)
/* Continue building up new map based on this information: */
if (current_type != last_type || e820_nomerge(current_type)) {
- if (last_type != 0) {
+ if (last_type) {
new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
/* Move forward only if the new size was non-zero: */
if (new_entries[new_nr_entries].size != 0)
@@ -403,7 +403,7 @@ int __init e820__update_table(struct e820_table *table)
if (++new_nr_entries >= max_nr_entries)
break;
}
- if (current_type != 0) {
+ if (current_type) {
new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
new_entries[new_nr_entries].type = current_type;
last_addr = change_point[chg_idx]->addr;
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
index 958accf2ccf0..9fcfa5c4dad7 100644
--- a/arch/x86/kernel/fpu/context.h
+++ b/arch/x86/kernel/fpu/context.h
@@ -57,7 +57,7 @@ static inline void fpregs_restore_userregs(void)
struct fpu *fpu = &current->thread.fpu;
int cpu = smp_processor_id();
- if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
+ if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_IO_WORKER)))
return;
if (!fpregs_state_valid(fpu, cpu)) {
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 9baa89a8877d..caf33486dc5e 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -426,7 +426,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
this_cpu_write(in_kernel_fpu, true);
- if (!(current->flags & PF_KTHREAD) &&
+ if (!(current->flags & (PF_KTHREAD | PF_IO_WORKER)) &&
!test_thread_flag(TIF_NEED_FPU_LOAD)) {
set_thread_flag(TIF_NEED_FPU_LOAD);
save_fpregs_to_fpstate(&current->thread.fpu);
@@ -853,12 +853,12 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
* Initialize register state that may prevent from entering low-power idle.
* This function will be invoked from the cpuidle driver only when needed.
*/
-void fpu_idle_fpregs(void)
+noinstr void fpu_idle_fpregs(void)
{
/* Note: AMX_TILE being enabled implies XGETBV1 support */
if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) &&
(xfeatures_in_use() & XFEATURE_MASK_XTILE)) {
tile_release();
- fpregs_deactivate(&current->thread.fpu);
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}
}
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 71f336425e58..c8eb1ac5125a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1091,6 +1091,8 @@ int __init hpet_enable(void)
if (!hpet_counting())
goto out_nohpet;
+ if (tsc_clocksource_watchdog_disabled())
+ clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;
clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
if (id & HPET_ID_LEGSUP) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index bbb0f737aab1..b01644c949b2 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -127,7 +127,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
set_debugreg(*dr7, 7);
if (info->mask)
- set_dr_addr_mask(info->mask, i);
+ amd_set_dr_addr_mask(info->mask, i);
return 0;
}
@@ -166,7 +166,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
set_debugreg(dr7, 7);
if (info->mask)
- set_dr_addr_mask(0, i);
+ amd_set_dr_addr_mask(0, i);
/*
* Ensure the write to cpu_dr7 is after we've set the DR7 register.
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 3aa5304200c5..4d8aff05a509 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int irq)
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
+ irq_set_status_flags(irq, IRQ_LEVEL);
enable_irq(irq);
lapic_assign_legacy_vector(irq, true);
}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index beb1bada1b0a..c683666876f1 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -65,8 +65,10 @@ void __init init_ISA_irqs(void)
legacy_pic->init(0);
- for (i = 0; i < nr_legacy_irqs(); i++)
+ for (i = 0; i < nr_legacy_irqs(); i++) {
irq_set_chip_and_handler(i, chip, handle_level_irq);
+ irq_set_status_flags(i, IRQ_LEVEL);
+ }
}
void __init init_IRQ(void)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index b36f3c367cb2..f7f6042eb7e6 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -464,50 +464,26 @@ static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
}
NOKPROBE_SYMBOL(kprobe_emulate_call);
-static nokprobe_inline
-void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond)
+static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
{
unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
- if (cond)
- ip += p->ainsn.rel32;
+ ip += p->ainsn.rel32;
int3_emulate_jmp(regs, ip);
}
-
-static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
-{
- __kprobe_emulate_jmp(p, regs, true);
-}
NOKPROBE_SYMBOL(kprobe_emulate_jmp);
-static const unsigned long jcc_mask[6] = {
- [0] = X86_EFLAGS_OF,
- [1] = X86_EFLAGS_CF,
- [2] = X86_EFLAGS_ZF,
- [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
- [4] = X86_EFLAGS_SF,
- [5] = X86_EFLAGS_PF,
-};
-
static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
{
- bool invert = p->ainsn.jcc.type & 1;
- bool match;
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
- if (p->ainsn.jcc.type < 0xc) {
- match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1];
- } else {
- match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
- ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
- if (p->ainsn.jcc.type >= 0xe)
- match = match || (regs->flags & X86_EFLAGS_ZF);
- }
- __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert));
+ int3_emulate_jcc(regs, p->ainsn.jcc.type, ip, p->ainsn.rel32);
}
NOKPROBE_SYMBOL(kprobe_emulate_jcc);
static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
{
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
bool match;
if (p->ainsn.loop.type != 3) { /* LOOP* */
@@ -535,7 +511,9 @@ static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
else if (p->ainsn.loop.type == 1) /* LOOPE */
match = match && (regs->flags & X86_EFLAGS_ZF);
- __kprobe_emulate_jmp(p, regs, match);
+ if (match)
+ ip += p->ainsn.rel32;
+ int3_emulate_jmp(regs, ip);
}
NOKPROBE_SYMBOL(kprobe_emulate_loop);
@@ -625,7 +603,7 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn)
/* 1 byte conditional jump */
p->ainsn.emulate_op = kprobe_emulate_jcc;
p->ainsn.jcc.type = opcode & 0xf;
- p->ainsn.rel32 = *(char *)insn->immediate.bytes;
+ p->ainsn.rel32 = insn->immediate.value;
break;
case 0x0f:
opcode = insn->opcode.bytes[1];
@@ -659,17 +637,19 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn)
* is determined by the MOD/RM byte.
*/
opcode = insn->modrm.bytes[0];
- if ((opcode & 0x30) == 0x10) {
- if ((opcode & 0x8) == 0x8)
- return -EOPNOTSUPP; /* far call */
- /* call absolute, indirect */
+ switch (X86_MODRM_REG(opcode)) {
+ case 0b010: /* FF /2, call near, absolute indirect */
p->ainsn.emulate_op = kprobe_emulate_call_indirect;
- } else if ((opcode & 0x30) == 0x20) {
- if ((opcode & 0x8) == 0x8)
- return -EOPNOTSUPP; /* far jmp */
- /* jmp near absolute indirect */
+ break;
+ case 0b100: /* FF /4, jmp near, absolute indirect */
p->ainsn.emulate_op = kprobe_emulate_jmp_indirect;
- } else
+ break;
+ case 0b011: /* FF /3, call far, absolute indirect */
+ case 0b101: /* FF /5, jmp far, absolute indirect */
+ return -EOPNOTSUPP;
+ }
+
+ if (!p->ainsn.emulate_op)
break;
if (insn->addr_bytes != sizeof(unsigned long))
@@ -990,20 +970,6 @@ int kprobe_int3_handler(struct pt_regs *regs)
kprobe_post_process(p, regs, kcb);
return 1;
}
- }
-
- if (*addr != INT3_INSN_OPCODE) {
- /*
- * The breakpoint instruction was removed right
- * after we hit it. Another cpu has removed
- * either a probepoint or a debugger breakpoint
- * at this address. In either case, no further
- * handling of this interrupt is appropriate.
- * Back up over the (now missing) int3 and run
- * the original instruction.
- */
- regs->ip = (unsigned long)addr;
- return 1;
} /* else: not a kprobe fault; let the kernel handle it */
return 0;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index e57e07b0edb6..57b0037d0a99 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -46,8 +46,8 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
/* This function only handles jump-optimized kprobe */
if (kp && kprobe_optimized(kp)) {
op = container_of(kp, struct optimized_kprobe, kp);
- /* If op->list is not empty, op is under optimizing */
- if (list_empty(&op->list))
+ /* If op is optimized or under unoptimizing */
+ if (list_empty(&op->list) || optprobe_queued_unopt(op))
goto found;
}
}
@@ -353,7 +353,7 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op)
for (i = 1; i < op->optinsn.size; i++) {
p = get_kprobe(op->kp.addr + i);
- if (p && !kprobe_disabled(p))
+ if (p && !kprobe_disarmed(p))
return -EEXIST;
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 16333ba1904b..0f35d44c56fe 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -71,12 +71,12 @@ static int kvm_set_wallclock(const struct timespec64 *now)
return -ENODEV;
}
-static u64 kvm_clock_read(void)
+static noinstr u64 kvm_clock_read(void)
{
u64 ret;
preempt_disable_notrace();
- ret = pvclock_clocksource_read(this_cpu_pvti());
+ ret = pvclock_clocksource_read_nowd(this_cpu_pvti());
preempt_enable_notrace();
return ret;
}
@@ -86,7 +86,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs)
return kvm_clock_read();
}
-static u64 kvm_sched_clock_read(void)
+static noinstr u64 kvm_sched_clock_read(void)
{
return kvm_clock_read() - kvm_sched_clock_offset;
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 705fb2a41d7d..84ad0e61ba6e 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -129,22 +129,27 @@ int apply_relocate(Elf32_Shdr *sechdrs,
return 0;
}
#else /*X86_64*/
-static int __apply_relocate_add(Elf64_Shdr *sechdrs,
+static int __write_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me,
- void *(*write)(void *dest, const void *src, size_t len))
+ void *(*write)(void *dest, const void *src, size_t len),
+ bool apply)
{
unsigned int i;
Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
Elf64_Sym *sym;
void *loc;
u64 val;
+ u64 zero = 0ULL;
- DEBUGP("Applying relocate section %u to %u\n",
+ DEBUGP("%s relocate section %u to %u\n",
+ apply ? "Applying" : "Clearing",
relsec, sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ size_t size;
+
/* This is where to make the change */
loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ rel[i].r_offset;
@@ -162,56 +167,53 @@ static int __apply_relocate_add(Elf64_Shdr *sechdrs,
switch (ELF64_R_TYPE(rel[i].r_info)) {
case R_X86_64_NONE:
- break;
+ continue; /* nothing to write */
case R_X86_64_64:
- if (*(u64 *)loc != 0)
- goto invalid_relocation;
- write(loc, &val, 8);
+ size = 8;
break;
case R_X86_64_32:
- if (*(u32 *)loc != 0)
- goto invalid_relocation;
- write(loc, &val, 4);
- if (val != *(u32 *)loc)
+ if (val != *(u32 *)&val)
goto overflow;
+ size = 4;
break;
case R_X86_64_32S:
- if (*(s32 *)loc != 0)
- goto invalid_relocation;
- write(loc, &val, 4);
- if ((s64)val != *(s32 *)loc)
+ if ((s64)val != *(s32 *)&val)
goto overflow;
+ size = 4;
break;
case R_X86_64_PC32:
case R_X86_64_PLT32:
- if (*(u32 *)loc != 0)
- goto invalid_relocation;
val -= (u64)loc;
- write(loc, &val, 4);
-#if 0
- if ((s64)val != *(s32 *)loc)
- goto overflow;
-#endif
+ size = 4;
break;
case R_X86_64_PC64:
- if (*(u64 *)loc != 0)
- goto invalid_relocation;
val -= (u64)loc;
- write(loc, &val, 8);
+ size = 8;
break;
default:
pr_err("%s: Unknown rela relocation: %llu\n",
me->name, ELF64_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
+
+ if (apply) {
+ if (memcmp(loc, &zero, size)) {
+ pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ return -ENOEXEC;
+ }
+ write(loc, &val, size);
+ } else {
+ if (memcmp(loc, &val, size)) {
+ pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ return -ENOEXEC;
+ }
+ write(loc, &zero, size);
+ }
}
return 0;
-invalid_relocation:
- pr_err("x86/modules: Skipping invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
- return -ENOEXEC;
-
overflow:
pr_err("overflow in relocation type %d val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), val);
@@ -220,11 +222,12 @@ overflow:
return -ENOEXEC;
}
-int apply_relocate_add(Elf64_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
+static int write_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me,
+ bool apply)
{
int ret;
bool early = me->state == MODULE_STATE_UNFORMED;
@@ -235,8 +238,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
mutex_lock(&text_mutex);
}
- ret = __apply_relocate_add(sechdrs, strtab, symindex, relsec, me,
- write);
+ ret = __write_relocate_add(sechdrs, strtab, symindex, relsec, me,
+ write, apply);
if (!early) {
text_poke_sync();
@@ -246,6 +249,26 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
return ret;
}
+int apply_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ return write_relocate_add(sechdrs, strtab, symindex, relsec, me, true);
+}
+
+#ifdef CONFIG_LIVEPATCH
+void clear_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ write_relocate_add(sechdrs, strtab, symindex, relsec, me, false);
+}
+#endif
+
#endif
int module_finalize(const Elf_Ehdr *hdr,
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index cec0bfa3bc04..776f4b1e395b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -69,6 +69,15 @@ struct nmi_stats {
unsigned int unknown;
unsigned int external;
unsigned int swallow;
+ unsigned long recv_jiffies;
+ unsigned long idt_seq;
+ unsigned long idt_nmi_seq;
+ unsigned long idt_ignored;
+ atomic_long_t idt_calls;
+ unsigned long idt_seq_snap;
+ unsigned long idt_nmi_seq_snap;
+ unsigned long idt_ignored_snap;
+ long idt_calls_snap;
};
static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
@@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
DEFINE_IDTENTRY_RAW(exc_nmi)
{
irqentry_state_t irq_state;
+ struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
/*
* Re-enable NMIs right here when running as an SEV-ES guest. This might
* cause nested NMIs, but those can be handled safely.
*/
sev_es_nmi_complete();
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
+ arch_atomic_long_inc(&nsp->idt_calls);
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
return;
@@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
}
this_cpu_write(nmi_state, NMI_EXECUTING);
this_cpu_write(nmi_cr2, read_cr2());
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+ WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
+ WRITE_ONCE(nsp->recv_jiffies, jiffies);
+ }
nmi_restart:
/*
@@ -509,8 +526,19 @@ nmi_restart:
inc_irq_stat(__nmi_count);
- if (!ignore_nmis)
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
+ WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
+ } else if (!ignore_nmis) {
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+ WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
+ }
default_do_nmi(regs);
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+ WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
+ }
+ }
irqentry_nmi_exit(regs, irq_state);
@@ -525,16 +553,94 @@ nmi_restart:
if (user_mode(regs))
mds_user_clear_cpu_buffers();
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+ WARN_ON_ONCE(nsp->idt_seq & 0x1);
+ WRITE_ONCE(nsp->recv_jiffies, jiffies);
+ }
}
-#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
-DEFINE_IDTENTRY_RAW(exc_nmi_noist)
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
{
exc_nmi(regs);
}
-#endif
#if IS_MODULE(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
+EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
+#endif
+#endif
+
+#ifdef CONFIG_NMI_CHECK_CPU
+
+static char *nmi_check_stall_msg[] = {
+/* */
+/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */
+/* | +------ cpu_is_offline(cpu) */
+/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */
+/* | | | NMI handler has been invoked. */
+/* | | | */
+/* V V V */
+/* 0 0 0 */ "NMIs are not reaching exc_nmi() handler",
+/* 0 0 1 */ "exc_nmi() handler is ignoring NMIs",
+/* 0 1 0 */ "CPU is offline and NMIs are not reaching exc_nmi() handler",
+/* 0 1 1 */ "CPU is offline and exc_nmi() handler is legitimately ignoring NMIs",
+/* 1 0 0 */ "CPU is in exc_nmi() handler and no further NMIs are reaching handler",
+/* 1 0 1 */ "CPU is in exc_nmi() handler which is legitimately ignoring NMIs",
+/* 1 1 0 */ "CPU is offline in exc_nmi() handler and no more NMIs are reaching exc_nmi() handler",
+/* 1 1 1 */ "CPU is offline in exc_nmi() handler which is legitimately ignoring NMIs",
+};
+
+void nmi_backtrace_stall_snap(const struct cpumask *btp)
+{
+ int cpu;
+ struct nmi_stats *nsp;
+
+ for_each_cpu(cpu, btp) {
+ nsp = per_cpu_ptr(&nmi_stats, cpu);
+ nsp->idt_seq_snap = READ_ONCE(nsp->idt_seq);
+ nsp->idt_nmi_seq_snap = READ_ONCE(nsp->idt_nmi_seq);
+ nsp->idt_ignored_snap = READ_ONCE(nsp->idt_ignored);
+ nsp->idt_calls_snap = atomic_long_read(&nsp->idt_calls);
+ }
+}
+
+void nmi_backtrace_stall_check(const struct cpumask *btp)
+{
+ int cpu;
+ int idx;
+ unsigned long nmi_seq;
+ unsigned long j = jiffies;
+ char *modp;
+ char *msgp;
+ char *msghp;
+ struct nmi_stats *nsp;
+
+ for_each_cpu(cpu, btp) {
+ nsp = per_cpu_ptr(&nmi_stats, cpu);
+ modp = "";
+ msghp = "";
+ nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
+ if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
+ msgp = "CPU entered NMI handler function, but has not exited";
+ } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) {
+ msgp = "CPU is handling NMIs";
+ } else {
+ idx = ((nsp->idt_seq_snap & 0x1) << 2) |
+ (cpu_is_offline(cpu) << 1) |
+ (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
+ msgp = nmi_check_stall_msg[idx];
+ if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
+ modp = ", but OK because ignore_nmis was set";
+ if (nmi_seq & ~0x1)
+ msghp = " (CPU currently in NMI handler function)";
+ else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
+ msghp = " (CPU exited one NMI handler function)";
+ }
+ pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n",
+ __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies));
+ }
+}
+
#endif
void stop_nmi(void)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 327757afb027..42e182868873 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -32,6 +32,7 @@
#include <asm/special_insns.h>
#include <asm/tlb.h>
#include <asm/io_bitmap.h>
+#include <asm/gsseg.h>
/*
* nop stub, which must not clobber anything *including the stack* to
@@ -216,6 +217,11 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
native_set_debugreg(regno, val);
}
+noinstr void pv_native_wbinvd(void)
+{
+ native_wbinvd();
+}
+
static noinstr void pv_native_irq_enable(void)
{
native_irq_enable();
@@ -225,6 +231,11 @@ static noinstr void pv_native_irq_disable(void)
{
native_irq_disable();
}
+
+static noinstr void pv_native_safe_halt(void)
+{
+ native_safe_halt();
+}
#endif
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
@@ -256,7 +267,7 @@ struct paravirt_patch_template pv_ops = {
.cpu.read_cr0 = native_read_cr0,
.cpu.write_cr0 = native_write_cr0,
.cpu.write_cr4 = native_write_cr4,
- .cpu.wbinvd = native_wbinvd,
+ .cpu.wbinvd = pv_native_wbinvd,
.cpu.read_msr = native_read_msr,
.cpu.write_msr = native_write_msr,
.cpu.read_msr_safe = native_read_msr_safe,
@@ -290,7 +301,7 @@ struct paravirt_patch_template pv_ops = {
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
- .irq.safe_halt = native_safe_halt,
+ .irq.safe_halt = pv_native_safe_halt,
.irq.halt = native_halt,
#endif /* CONFIG_PARAVIRT_XXL */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 40d156a31676..b650cde3f64d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -24,6 +24,7 @@
#include <linux/cpuidle.h>
#include <linux/acpi.h>
#include <linux/elf-randomize.h>
+#include <linux/static_call.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
@@ -694,7 +695,24 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);
-static void (*x86_idle)(void);
+/*
+ * We use this if we don't have any better idle routine..
+ */
+void __cpuidle default_idle(void)
+{
+ raw_safe_halt();
+ raw_local_irq_disable();
+}
+#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
+EXPORT_SYMBOL(default_idle);
+#endif
+
+DEFINE_STATIC_CALL_NULL(x86_idle, default_idle);
+
+static bool x86_idle_set(void)
+{
+ return !!static_call_query(x86_idle);
+}
#ifndef CONFIG_SMP
static inline void play_dead(void)
@@ -717,28 +735,18 @@ void arch_cpu_idle_dead(void)
/*
* Called from the generic idle code.
*/
-void arch_cpu_idle(void)
-{
- x86_idle();
-}
-
-/*
- * We use this if we don't have any better idle routine..
- */
-void __cpuidle default_idle(void)
+void __cpuidle arch_cpu_idle(void)
{
- raw_safe_halt();
+ static_call(x86_idle)();
}
-#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
-EXPORT_SYMBOL(default_idle);
-#endif
+EXPORT_SYMBOL_GPL(arch_cpu_idle);
#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
- bool ret = !!x86_idle;
+ bool ret = x86_idle_set();
- x86_idle = default_idle;
+ static_call_update(x86_idle, default_idle);
return ret;
}
@@ -800,13 +808,7 @@ static void amd_e400_idle(void)
default_idle();
- /*
- * The switch back from broadcast mode needs to be called with
- * interrupts disabled.
- */
- raw_local_irq_disable();
tick_broadcast_exit();
- raw_local_irq_enable();
}
/*
@@ -864,12 +866,10 @@ static __cpuidle void mwait_idle(void)
}
__monitor((void *)&current_thread_info()->flags, 0, 0);
- if (!need_resched())
+ if (!need_resched()) {
__sti_mwait(0, 0);
- else
- raw_local_irq_enable();
- } else {
- raw_local_irq_enable();
+ raw_local_irq_disable();
+ }
}
__current_clr_polling();
}
@@ -880,20 +880,20 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
- if (x86_idle || boot_option_idle_override == IDLE_POLL)
+ if (x86_idle_set() || boot_option_idle_override == IDLE_POLL)
return;
if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
pr_info("using AMD E400 aware idle routine\n");
- x86_idle = amd_e400_idle;
+ static_call_update(x86_idle, amd_e400_idle);
} else if (prefer_mwait_c1_over_halt(c)) {
pr_info("using mwait in idle threads\n");
- x86_idle = mwait_idle;
+ static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
- x86_idle = tdx_safe_halt;
+ static_call_update(x86_idle, tdx_safe_halt);
} else
- x86_idle = default_idle;
+ static_call_update(x86_idle, default_idle);
}
void amd_e400_c1e_apic_setup(void)
@@ -946,7 +946,7 @@ static int __init idle_setup(char *str)
* To continue to load the CPU idle driver, don't touch
* the boot_option_idle_override.
*/
- x86_idle = default_idle;
+ static_call_update(x86_idle, default_idle);
boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
/*
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index eda37df016f0..56acf53a782a 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -64,7 +64,8 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
return flags & valid_flags;
}
-u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+static __always_inline
+u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd)
{
unsigned version;
u64 ret;
@@ -77,7 +78,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
flags = src->flags;
} while (pvclock_read_retry(src, version));
- if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+ if (dowd && unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
src->flags &= ~PVCLOCK_GUEST_STOPPED;
pvclock_touch_watchdogs();
}
@@ -100,16 +101,25 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
* updating at the same time, and one of them could be slightly behind,
* making the assumption that last_value always go forward fail to hold.
*/
- last = atomic64_read(&last_value);
+ last = arch_atomic64_read(&last_value);
do {
- if (ret < last)
+ if (ret <= last)
return last;
- last = atomic64_cmpxchg(&last_value, last, ret);
- } while (unlikely(last != ret));
+ } while (!arch_atomic64_try_cmpxchg(&last_value, &last, ret));
return ret;
}
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ return __pvclock_clocksource_read(src, true);
+}
+
+noinstr u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src)
+{
+ return __pvclock_clocksource_read(src, false);
+}
+
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
struct pvclock_vcpu_time_info *vcpu_time,
struct timespec64 *ts)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c3636ea4aa71..d03c551defcc 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -528,33 +528,29 @@ static inline void kb_wait(void)
}
}
-static void vmxoff_nmi(int cpu, struct pt_regs *regs)
-{
- cpu_emergency_vmxoff();
-}
+static inline void nmi_shootdown_cpus_on_restart(void);
-/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
-static void emergency_vmx_disable_all(void)
+static void emergency_reboot_disable_virtualization(void)
{
/* Just make sure we won't change CPUs while doing this */
local_irq_disable();
/*
- * Disable VMX on all CPUs before rebooting, otherwise we risk hanging
- * the machine, because the CPU blocks INIT when it's in VMX root.
+ * Disable virtualization on all CPUs before rebooting to avoid hanging
+ * the system, as VMX and SVM block INIT when running in the host.
*
* We can't take any locks and we may be on an inconsistent state, so
- * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
+ * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt.
*
- * Do the NMI shootdown even if VMX if off on _this_ CPU, as that
- * doesn't prevent a different CPU from being in VMX root operation.
+ * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
+ * other CPUs may have virtualization enabled.
*/
- if (cpu_has_vmx()) {
- /* Safely force _this_ CPU out of VMX root operation. */
- __cpu_emergency_vmxoff();
+ if (cpu_has_vmx() || cpu_has_svm(NULL)) {
+ /* Safely force _this_ CPU out of VMX/SVM operation. */
+ cpu_emergency_disable_virtualization();
- /* Halt and exit VMX root operation on the other CPUs. */
- nmi_shootdown_cpus(vmxoff_nmi);
+ /* Disable VMX/SVM and halt on other CPUs. */
+ nmi_shootdown_cpus_on_restart();
}
}
@@ -590,7 +586,7 @@ static void native_machine_emergency_restart(void)
unsigned short mode;
if (reboot_emergency)
- emergency_vmx_disable_all();
+ emergency_reboot_disable_virtualization();
tboot_shutdown(TB_SHUTDOWN_REBOOT);
@@ -795,6 +791,17 @@ void machine_crash_shutdown(struct pt_regs *regs)
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+}
+
#if defined(CONFIG_SMP)
static nmi_shootdown_cb shootdown_callback;
@@ -817,7 +824,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
local_irq_disable();
- shootdown_callback(cpu, regs);
+ if (shootdown_callback)
+ shootdown_callback(cpu, regs);
+
+ /*
+ * Prepare the CPU for reboot _after_ invoking the callback so that the
+ * callback can safely use virtualization instructions, e.g. VMCLEAR.
+ */
+ cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
@@ -828,18 +842,32 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
}
-/*
- * Halt all other CPUs, calling the specified function on each of them
+/**
+ * nmi_shootdown_cpus - Stop other CPUs via NMI
+ * @callback: Optional callback to be invoked from the NMI handler
+ *
+ * The NMI handler on the remote CPUs invokes @callback, if not
+ * NULL, first and then disables virtualization to ensure that
+ * INIT is recognized during reboot.
*
- * This function can be used to halt all other CPUs on crash
- * or emergency reboot time. The function passed as parameter
- * will be called inside a NMI handler on all CPUs.
+ * nmi_shootdown_cpus() can only be invoked once. After the first
+ * invocation all other CPUs are stuck in crash_nmi_callback() and
+ * cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
+
local_irq_disable();
+ /*
+ * Avoid certain doom if a shootdown already occurred; re-registering
+ * the NMI handler will cause list corruption, modifying the callback
+ * will do who knows what, etc...
+ */
+ if (WARN_ON_ONCE(crash_ipi_issued))
+ return;
+
/* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
@@ -867,7 +895,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
msecs--;
}
- /* Leave the nmi callback set */
+ /*
+ * Leave the nmi callback set, shootdown is a one-time thing. Clearing
+ * the callback could result in a NULL pointer dereference if a CPU
+ * (finally) responds after the timeout expires.
+ */
+}
+
+static inline void nmi_shootdown_cpus_on_restart(void)
+{
+ if (!crash_ipi_issued)
+ nmi_shootdown_cpus(NULL);
}
/*
@@ -897,6 +935,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* No other CPUs to shoot down */
}
+static inline void nmi_shootdown_cpus_on_restart(void) { }
+
void run_crash_ipi_callback(struct pt_regs *regs)
{
}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 349046434513..1309b9b05338 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -138,15 +138,12 @@ static __init int add_rtc_cmos(void)
static const char * const ids[] __initconst =
{ "PNP0b00", "PNP0b01", "PNP0b02", };
struct pnp_dev *dev;
- struct pnp_id *id;
int i;
pnp_for_each_dev(dev) {
- for (id = dev->id; id; id = id->next) {
- for (i = 0; i < ARRAY_SIZE(ids); i++) {
- if (compare_pnp_id(id, ids[i]) != 0)
- return 0;
- }
+ for (i = 0; i < ARRAY_SIZE(ids); i++) {
+ if (compare_pnp_id(dev->id, ids[i]) != 0)
+ return 0;
}
}
#endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 88188549647c..16babff771bd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -114,11 +114,6 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
/* CPU data as detected by the assembly code in head_32.S */
struct cpuinfo_x86 new_cpu_data;
-
-/* Common CPU data for all CPUs */
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
-EXPORT_SYMBOL(boot_cpu_data);
-
unsigned int def_to_bigsmp;
struct apm_info apm_info;
@@ -132,11 +127,10 @@ EXPORT_SYMBOL(ist_info);
struct ist_info ist_info;
#endif
-#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
-EXPORT_SYMBOL(boot_cpu_data);
#endif
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
__visible unsigned long mmu_cr4_features __ro_after_init;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 1504eb8d25aa..004cb30b7419 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -360,7 +360,7 @@ static bool strict_sigaltstack_size __ro_after_init = false;
static int __init strict_sas_size(char *arg)
{
- return kstrtobool(arg, &strict_sigaltstack_size);
+ return kstrtobool(arg, &strict_sigaltstack_size) == 0;
}
__setup("strict_sas_size", strict_sas_size);
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 2553136cf39b..9027fc088f97 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -31,6 +31,7 @@
#include <asm/sigframe.h>
#include <asm/sighandling.h>
#include <asm/smap.h>
+#include <asm/gsseg.h>
#ifdef CONFIG_IA32_EMULATION
#include <asm/ia32_unistd.h>
@@ -54,12 +55,14 @@ static inline void reload_segments(struct sigcontext_32 *sc)
}
#define sigset32_t compat_sigset_t
+#define siginfo32_t compat_siginfo_t
#define restore_altstack32 compat_restore_altstack
#define unsafe_save_altstack32 unsafe_compat_save_altstack
#else
#define sigset32_t sigset_t
+#define siginfo32_t siginfo_t
#define __NR_ia32_sigreturn __NR_sigreturn
#define __NR_ia32_rt_sigreturn __NR_rt_sigreturn
#define restore_altstack32 restore_altstack
@@ -377,3 +380,128 @@ Efault:
user_access_end();
return -EFAULT;
}
+
+/*
+ * The siginfo_t structure and handing code is very easy
+ * to break in several ways. It must always be updated when new
+ * updates are made to the main siginfo_t, and
+ * copy_siginfo_to_user32() must be updated when the
+ * (arch-independent) copy_siginfo_to_user() is updated.
+ *
+ * It is also easy to put a new member in the siginfo_t
+ * which has implicit alignment which can move internal structure
+ * alignment around breaking the ABI. This can happen if you,
+ * for instance, put a plain 64-bit value in there.
+ */
+
+/*
+* If adding a new si_code, there is probably new data in
+* the siginfo. Make sure folks bumping the si_code
+* limits also have to look at this code. Make sure any
+* new fields are handled in copy_siginfo_to_user32()!
+*/
+static_assert(NSIGILL == 11);
+static_assert(NSIGFPE == 15);
+static_assert(NSIGSEGV == 9);
+static_assert(NSIGBUS == 5);
+static_assert(NSIGTRAP == 6);
+static_assert(NSIGCHLD == 6);
+static_assert(NSIGSYS == 2);
+
+/* This is part of the ABI and can never change in size: */
+static_assert(sizeof(siginfo32_t) == 128);
+
+/* This is a part of the ABI and can never change in alignment */
+static_assert(__alignof__(siginfo32_t) == 4);
+
+/*
+* The offsets of all the (unioned) si_fields are fixed
+* in the ABI, of course. Make sure none of them ever
+* move and are always at the beginning:
+*/
+static_assert(offsetof(siginfo32_t, _sifields) == 3 * sizeof(int));
+
+static_assert(offsetof(siginfo32_t, si_signo) == 0);
+static_assert(offsetof(siginfo32_t, si_errno) == 4);
+static_assert(offsetof(siginfo32_t, si_code) == 8);
+
+/*
+* Ensure that the size of each si_field never changes.
+* If it does, it is a sign that the
+* copy_siginfo_to_user32() code below needs to updated
+* along with the size in the CHECK_SI_SIZE().
+*
+* We repeat this check for both the generic and compat
+* siginfos.
+*
+* Note: it is OK for these to grow as long as the whole
+* structure stays within the padding size (checked
+* above).
+*/
+
+#define CHECK_SI_OFFSET(name) \
+ static_assert(offsetof(siginfo32_t, _sifields) == \
+ offsetof(siginfo32_t, _sifields.name))
+
+#define CHECK_SI_SIZE(name, size) \
+ static_assert(sizeof_field(siginfo32_t, _sifields.name) == size)
+
+CHECK_SI_OFFSET(_kill);
+CHECK_SI_SIZE (_kill, 2*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0xC);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+
+CHECK_SI_OFFSET(_timer);
+#ifdef CONFIG_COMPAT
+/* compat_siginfo_t doesn't have si_sys_private */
+CHECK_SI_SIZE (_timer, 3*sizeof(int));
+#else
+CHECK_SI_SIZE (_timer, 4*sizeof(int));
+#endif
+static_assert(offsetof(siginfo32_t, si_tid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_overrun) == 0x10);
+static_assert(offsetof(siginfo32_t, si_value) == 0x14);
+
+CHECK_SI_OFFSET(_rt);
+CHECK_SI_SIZE (_rt, 3*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+static_assert(offsetof(siginfo32_t, si_value) == 0x14);
+
+CHECK_SI_OFFSET(_sigchld);
+CHECK_SI_SIZE (_sigchld, 5*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+static_assert(offsetof(siginfo32_t, si_status) == 0x14);
+static_assert(offsetof(siginfo32_t, si_utime) == 0x18);
+static_assert(offsetof(siginfo32_t, si_stime) == 0x1C);
+
+CHECK_SI_OFFSET(_sigfault);
+CHECK_SI_SIZE (_sigfault, 4*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_addr) == 0x0C);
+
+static_assert(offsetof(siginfo32_t, si_trapno) == 0x10);
+
+static_assert(offsetof(siginfo32_t, si_addr_lsb) == 0x10);
+
+static_assert(offsetof(siginfo32_t, si_lower) == 0x14);
+static_assert(offsetof(siginfo32_t, si_upper) == 0x18);
+
+static_assert(offsetof(siginfo32_t, si_pkey) == 0x14);
+
+static_assert(offsetof(siginfo32_t, si_perf_data) == 0x10);
+static_assert(offsetof(siginfo32_t, si_perf_type) == 0x14);
+static_assert(offsetof(siginfo32_t, si_perf_flags) == 0x18);
+
+CHECK_SI_OFFSET(_sigpoll);
+CHECK_SI_SIZE (_sigpoll, 2*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_band) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_fd) == 0x10);
+
+CHECK_SI_OFFSET(_sigsys);
+CHECK_SI_SIZE (_sigsys, 3*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_call_addr) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_syscall) == 0x10);
+static_assert(offsetof(siginfo32_t, si_arch) == 0x14);
+
+/* any new si_fields should be added here */
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index ff9c55064223..13a1e6083837 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -381,3 +381,130 @@ badframe:
return 0;
}
#endif /* CONFIG_X86_X32_ABI */
+
+#ifdef CONFIG_COMPAT
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+ if (!act)
+ return;
+
+ if (in_ia32_syscall())
+ act->sa.sa_flags |= SA_IA32_ABI;
+ if (in_x32_syscall())
+ act->sa.sa_flags |= SA_X32_ABI;
+}
+#endif /* CONFIG_COMPAT */
+
+/*
+* If adding a new si_code, there is probably new data in
+* the siginfo. Make sure folks bumping the si_code
+* limits also have to look at this code. Make sure any
+* new fields are handled in copy_siginfo_to_user32()!
+*/
+static_assert(NSIGILL == 11);
+static_assert(NSIGFPE == 15);
+static_assert(NSIGSEGV == 9);
+static_assert(NSIGBUS == 5);
+static_assert(NSIGTRAP == 6);
+static_assert(NSIGCHLD == 6);
+static_assert(NSIGSYS == 2);
+
+/* This is part of the ABI and can never change in size: */
+static_assert(sizeof(siginfo_t) == 128);
+
+/* This is a part of the ABI and can never change in alignment */
+static_assert(__alignof__(siginfo_t) == 8);
+
+/*
+* The offsets of all the (unioned) si_fields are fixed
+* in the ABI, of course. Make sure none of them ever
+* move and are always at the beginning:
+*/
+static_assert(offsetof(siginfo_t, si_signo) == 0);
+static_assert(offsetof(siginfo_t, si_errno) == 4);
+static_assert(offsetof(siginfo_t, si_code) == 8);
+
+/*
+* Ensure that the size of each si_field never changes.
+* If it does, it is a sign that the
+* copy_siginfo_to_user32() code below needs to updated
+* along with the size in the CHECK_SI_SIZE().
+*
+* We repeat this check for both the generic and compat
+* siginfos.
+*
+* Note: it is OK for these to grow as long as the whole
+* structure stays within the padding size (checked
+* above).
+*/
+
+#define CHECK_SI_OFFSET(name) \
+ static_assert(offsetof(siginfo_t, _sifields) == \
+ offsetof(siginfo_t, _sifields.name))
+#define CHECK_SI_SIZE(name, size) \
+ static_assert(sizeof_field(siginfo_t, _sifields.name) == size)
+
+CHECK_SI_OFFSET(_kill);
+CHECK_SI_SIZE (_kill, 2*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+
+CHECK_SI_OFFSET(_timer);
+CHECK_SI_SIZE (_timer, 6*sizeof(int));
+static_assert(offsetof(siginfo_t, si_tid) == 0x10);
+static_assert(offsetof(siginfo_t, si_overrun) == 0x14);
+static_assert(offsetof(siginfo_t, si_value) == 0x18);
+
+CHECK_SI_OFFSET(_rt);
+CHECK_SI_SIZE (_rt, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+static_assert(offsetof(siginfo_t, si_value) == 0x18);
+
+CHECK_SI_OFFSET(_sigchld);
+CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+static_assert(offsetof(siginfo_t, si_status) == 0x18);
+static_assert(offsetof(siginfo_t, si_utime) == 0x20);
+static_assert(offsetof(siginfo_t, si_stime) == 0x28);
+
+#ifdef CONFIG_X86_X32_ABI
+/* no _sigchld_x32 in the generic siginfo_t */
+static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) ==
+ 7*sizeof(int));
+static_assert(offsetof(compat_siginfo_t, _sifields) ==
+ offsetof(compat_siginfo_t, _sifields._sigchld_x32));
+static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) == 0x18);
+static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) == 0x20);
+#endif
+
+CHECK_SI_OFFSET(_sigfault);
+CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
+static_assert(offsetof(siginfo_t, si_addr) == 0x10);
+
+static_assert(offsetof(siginfo_t, si_trapno) == 0x18);
+
+static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18);
+
+static_assert(offsetof(siginfo_t, si_lower) == 0x20);
+static_assert(offsetof(siginfo_t, si_upper) == 0x28);
+
+static_assert(offsetof(siginfo_t, si_pkey) == 0x20);
+
+static_assert(offsetof(siginfo_t, si_perf_data) == 0x18);
+static_assert(offsetof(siginfo_t, si_perf_type) == 0x20);
+static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24);
+
+CHECK_SI_OFFSET(_sigpoll);
+CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_band) == 0x10);
+static_assert(offsetof(siginfo_t, si_fd) == 0x18);
+
+CHECK_SI_OFFSET(_sigsys);
+CHECK_SI_SIZE (_sigsys, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_call_addr) == 0x10);
+static_assert(offsetof(siginfo_t, si_syscall) == 0x18);
+static_assert(offsetof(siginfo_t, si_arch) == 0x1C);
+
+/* any new si_fields should be added here */
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
deleted file mode 100644
index 879ef8c72f5c..000000000000
--- a/arch/x86/kernel/signal_compat.c
+++ /dev/null
@@ -1,191 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/compat.h>
-#include <linux/uaccess.h>
-#include <linux/ptrace.h>
-
-/*
- * The compat_siginfo_t structure and handing code is very easy
- * to break in several ways. It must always be updated when new
- * updates are made to the main siginfo_t, and
- * copy_siginfo_to_user32() must be updated when the
- * (arch-independent) copy_siginfo_to_user() is updated.
- *
- * It is also easy to put a new member in the compat_siginfo_t
- * which has implicit alignment which can move internal structure
- * alignment around breaking the ABI. This can happen if you,
- * for instance, put a plain 64-bit value in there.
- */
-static inline void signal_compat_build_tests(void)
-{
- int _sifields_offset = offsetof(compat_siginfo_t, _sifields);
-
- /*
- * If adding a new si_code, there is probably new data in
- * the siginfo. Make sure folks bumping the si_code
- * limits also have to look at this code. Make sure any
- * new fields are handled in copy_siginfo_to_user32()!
- */
- BUILD_BUG_ON(NSIGILL != 11);
- BUILD_BUG_ON(NSIGFPE != 15);
- BUILD_BUG_ON(NSIGSEGV != 9);
- BUILD_BUG_ON(NSIGBUS != 5);
- BUILD_BUG_ON(NSIGTRAP != 6);
- BUILD_BUG_ON(NSIGCHLD != 6);
- BUILD_BUG_ON(NSIGSYS != 2);
-
- /* This is part of the ABI and can never change in size: */
- BUILD_BUG_ON(sizeof(siginfo_t) != 128);
- BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128);
-
- /* This is a part of the ABI and can never change in alignment */
- BUILD_BUG_ON(__alignof__(siginfo_t) != 8);
- BUILD_BUG_ON(__alignof__(compat_siginfo_t) != 4);
-
- /*
- * The offsets of all the (unioned) si_fields are fixed
- * in the ABI, of course. Make sure none of them ever
- * move and are always at the beginning:
- */
- BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int));
-#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name))
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_signo) != 0);
- BUILD_BUG_ON(offsetof(siginfo_t, si_errno) != 4);
- BUILD_BUG_ON(offsetof(siginfo_t, si_code) != 8);
-
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_signo) != 0);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_errno) != 4);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_code) != 8);
- /*
- * Ensure that the size of each si_field never changes.
- * If it does, it is a sign that the
- * copy_siginfo_to_user32() code below needs to updated
- * along with the size in the CHECK_SI_SIZE().
- *
- * We repeat this check for both the generic and compat
- * siginfos.
- *
- * Note: it is OK for these to grow as long as the whole
- * structure stays within the padding size (checked
- * above).
- */
-#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name))
-#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name))
-
- CHECK_CSI_OFFSET(_kill);
- CHECK_CSI_SIZE (_kill, 2*sizeof(int));
- CHECK_SI_SIZE (_kill, 2*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0xC);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
-
- CHECK_CSI_OFFSET(_timer);
- CHECK_CSI_SIZE (_timer, 3*sizeof(int));
- CHECK_SI_SIZE (_timer, 6*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_tid) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_overrun) != 0x14);
- BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_tid) != 0x0C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_overrun) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14);
-
- CHECK_CSI_OFFSET(_rt);
- CHECK_CSI_SIZE (_rt, 3*sizeof(int));
- CHECK_SI_SIZE (_rt, 4*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
- BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14);
-
- CHECK_CSI_OFFSET(_sigchld);
- CHECK_CSI_SIZE (_sigchld, 5*sizeof(int));
- CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
- BUILD_BUG_ON(offsetof(siginfo_t, si_status) != 0x18);
- BUILD_BUG_ON(offsetof(siginfo_t, si_utime) != 0x20);
- BUILD_BUG_ON(offsetof(siginfo_t, si_stime) != 0x28);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_status) != 0x14);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_utime) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_stime) != 0x1C);
-
-#ifdef CONFIG_X86_X32_ABI
- CHECK_CSI_OFFSET(_sigchld_x32);
- CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int));
- /* no _sigchld_x32 in the generic siginfo_t */
- BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) != 0x20);
-#endif
-
- CHECK_CSI_OFFSET(_sigfault);
- CHECK_CSI_SIZE (_sigfault, 4*sizeof(int));
- CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C);
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_trapno) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_trapno) != 0x10);
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10);
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_lower) != 0x20);
- BUILD_BUG_ON(offsetof(siginfo_t, si_upper) != 0x28);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_lower) != 0x14);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_upper) != 0x18);
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18);
- BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20);
- BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18);
-
- CHECK_CSI_OFFSET(_sigpoll);
- CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
- CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_band) != 0x0C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_fd) != 0x10);
-
- CHECK_CSI_OFFSET(_sigsys);
- CHECK_CSI_SIZE (_sigsys, 3*sizeof(int));
- CHECK_SI_SIZE (_sigsys, 4*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_syscall) != 0x18);
- BUILD_BUG_ON(offsetof(siginfo_t, si_arch) != 0x1C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_call_addr) != 0x0C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_syscall) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_arch) != 0x14);
-
- /* any new si_fields should be added here */
-}
-
-void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
-{
- signal_compat_build_tests();
-
- if (!act)
- return;
-
- if (in_ia32_syscall())
- act->sa.sa_flags |= SA_IA32_ABI;
- if (in_x32_syscall())
- act->sa.sa_flags |= SA_X32_ABI;
-}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 06db901fabe8..375b33ecafa2 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -32,7 +32,7 @@
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
#include <asm/kexec.h>
-#include <asm/virtext.h>
+#include <asm/reboot.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -122,7 +122,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
return NMI_HANDLED;
- cpu_emergency_vmxoff();
+ cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
return NMI_HANDLED;
@@ -134,7 +134,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
ack_APIC_irq();
- cpu_emergency_vmxoff();
+ cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 55cad72715d9..9013bb28255a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1833,7 +1833,7 @@ void native_play_dead(void)
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
- mwait_play_dead(); /* Only returns on failure */
+ mwait_play_dead();
if (cpuidle_play_dead())
hlt_play_dead();
}
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 2ebc338980bc..b70670a98597 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -9,6 +9,7 @@ enum insn_type {
NOP = 1, /* site cond-call */
JMP = 2, /* tramp / site tail-call */
RET = 3, /* tramp / site cond-tail-call */
+ JCC = 4,
};
/*
@@ -25,12 +26,40 @@ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
+{
+ u8 ret = 0;
+
+ if (insn[0] == 0x0f) {
+ u8 tmp = insn[1];
+ if ((tmp & 0xf0) == 0x80)
+ ret = tmp;
+ }
+
+ return ret;
+}
+
+extern void __static_call_return(void);
+
+asm (".global __static_call_return\n\t"
+ ".type __static_call_return, @function\n\t"
+ ASM_FUNC_ALIGN "\n\t"
+ "__static_call_return:\n\t"
+ ANNOTATE_NOENDBR
+ ANNOTATE_RETPOLINE_SAFE
+ "ret; int3\n\t"
+ ".size __static_call_return, . - __static_call_return \n\t");
+
static void __ref __static_call_transform(void *insn, enum insn_type type,
void *func, bool modinit)
{
const void *emulate = NULL;
int size = CALL_INSN_SIZE;
const void *code;
+ u8 op, buf[6];
+
+ if ((type == JMP || type == RET) && (op = __is_Jcc(insn)))
+ type = JCC;
switch (type) {
case CALL:
@@ -57,6 +86,20 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
else
code = &retinsn;
break;
+
+ case JCC:
+ if (!func) {
+ func = __static_call_return;
+ if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ func = x86_return_thunk;
+ }
+
+ buf[0] = 0x0f;
+ __text_gen_insn(buf+1, op, insn+1, func, 5);
+ code = buf;
+ size = 6;
+
+ break;
}
if (memcmp(insn, code, size) == 0)
@@ -68,9 +111,9 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
text_poke_bp(insn, code, size, emulate);
}
-static void __static_call_validate(void *insn, bool tail, bool tramp)
+static void __static_call_validate(u8 *insn, bool tail, bool tramp)
{
- u8 opcode = *(u8 *)insn;
+ u8 opcode = insn[0];
if (tramp && memcmp(insn+5, tramp_ud, 3)) {
pr_err("trampoline signature fail");
@@ -79,7 +122,8 @@ static void __static_call_validate(void *insn, bool tail, bool tramp)
if (tail) {
if (opcode == JMP32_INSN_OPCODE ||
- opcode == RET_INSN_OPCODE)
+ opcode == RET_INSN_OPCODE ||
+ __is_Jcc(insn))
return;
} else {
if (opcode == CALL_INSN_OPCODE ||
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 3c883e064242..3ffbab0081f4 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -12,6 +12,7 @@
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/proto.h>
+#include <asm/gsseg.h>
#include "tls.h"
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a78e73da4a74..344698852146 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -48,6 +48,8 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc);
int tsc_clocksource_reliable;
+static int __read_mostly tsc_force_recalibrate;
+
static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
@@ -215,7 +217,7 @@ static void __init cyc2ns_init_secondary_cpus(void)
/*
* Scheduler clock - returns current time in nanosec units.
*/
-u64 native_sched_clock(void)
+noinstr u64 native_sched_clock(void)
{
if (static_branch_likely(&__use_tsc)) {
u64 tsc_now = rdtsc();
@@ -248,7 +250,7 @@ u64 native_sched_clock_from_tsc(u64 tsc)
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
+noinstr u64 sched_clock(void)
{
return paravirt_sched_clock();
}
@@ -258,8 +260,7 @@ bool using_native_sched_clock(void)
return static_call_query(pv_sched_clock) == native_sched_clock;
}
#else
-unsigned long long
-sched_clock(void) __attribute__((alias("native_sched_clock")));
+u64 sched_clock(void) __attribute__((alias("native_sched_clock")));
bool using_native_sched_clock(void) { return true; }
#endif
@@ -292,6 +293,7 @@ __setup("notsc", notsc_setup);
static int no_sched_irq_time;
static int no_tsc_watchdog;
+static int tsc_as_watchdog;
static int __init tsc_setup(char *str)
{
@@ -301,8 +303,22 @@ static int __init tsc_setup(char *str)
no_sched_irq_time = 1;
if (!strcmp(str, "unstable"))
mark_tsc_unstable("boot parameter");
- if (!strcmp(str, "nowatchdog"))
+ if (!strcmp(str, "nowatchdog")) {
no_tsc_watchdog = 1;
+ if (tsc_as_watchdog)
+ pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
+ __func__);
+ tsc_as_watchdog = 0;
+ }
+ if (!strcmp(str, "recalibrate"))
+ tsc_force_recalibrate = 1;
+ if (!strcmp(str, "watchdog")) {
+ if (no_tsc_watchdog)
+ pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
+ __func__);
+ else
+ tsc_as_watchdog = 1;
+ }
return 1;
}
@@ -912,8 +928,7 @@ void recalibrate_cpu_khz(void)
cpu_khz_old, cpu_khz);
#endif
}
-
-EXPORT_SYMBOL(recalibrate_cpu_khz);
+EXPORT_SYMBOL_GPL(recalibrate_cpu_khz);
static unsigned long long cyc2ns_suspend;
@@ -1186,6 +1201,12 @@ static void __init tsc_disable_clocksource_watchdog(void)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}
+bool tsc_clocksource_watchdog_disabled(void)
+{
+ return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
+ tsc_as_watchdog && !no_tsc_watchdog;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1374,6 +1395,25 @@ restart:
else
freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+ /* Will hit this only if tsc_force_recalibrate has been set */
+ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+
+ /* Warn if the deviation exceeds 500 ppm */
+ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
+ pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
+ pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
+ (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+ }
+
+ pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
+ hpet ? "HPET" : "PM_TIMER",
+ (unsigned long)freq / 1000,
+ (unsigned long)freq % 1000);
+
+ return;
+ }
+
/* Make sure we're within 1% */
if (abs(tsc_khz - freq) > tsc_khz/100)
goto out;
@@ -1407,8 +1447,10 @@ static int __init init_tsc_clocksource(void)
if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
return 0;
- if (tsc_unstable)
- goto unreg;
+ if (tsc_unstable) {
+ clocksource_unregister(&clocksource_tsc_early);
+ return 0;
+ }
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1421,9 +1463,10 @@ static int __init init_tsc_clocksource(void)
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
-unreg:
clocksource_unregister(&clocksource_tsc_early);
- return 0;
+
+ if (!tsc_force_recalibrate)
+ return 0;
}
schedule_delayed_work(&tsc_irqwork, 0);
@@ -1510,6 +1553,11 @@ void __init tsc_early_init(void)
void __init tsc_init(void)
{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC)) {
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
+
/*
* native_calibrate_cpu_early can only calibrate using methods that are
* available early in boot.
@@ -1517,11 +1565,6 @@ void __init tsc_init(void)
if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
x86_platform.calibrate_cpu = native_calibrate_cpu;
- if (!boot_cpu_has(X86_FEATURE_TSC)) {
- setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
- return;
- }
-
if (!tsc_khz) {
/* We failed to determine frequencies earlier, try again */
if (!determine_cpu_tsc_frequencies(false)) {
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 2e0ee14229bf..25f155205770 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -129,7 +129,6 @@ SECTIONS
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
- CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
SOFTIRQENTRY_TEXT
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index fbeaa9ddef59..8e578311ca9d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -49,6 +49,7 @@ config KVM
select SRCU
select INTERVAL_TREE
select HAVE_KVM_PM_NOTIFIER if PM
+ select KVM_GENERIC_HARDWARE_ENABLING
help
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 596061c1610e..599aebec2d52 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -8,6 +8,7 @@
* Copyright 2011 Red Hat, Inc. and/or its affiliates.
* Copyright IBM Corporation, 2008
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/export.h>
@@ -25,6 +26,7 @@
#include "mmu.h"
#include "trace.h"
#include "pmu.h"
+#include "xen.h"
/*
* Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
@@ -180,15 +182,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2
return 0;
}
-static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
+static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
+ const char *sig)
{
- u32 function;
+ struct kvm_hypervisor_cpuid cpuid = {};
struct kvm_cpuid_entry2 *entry;
+ u32 base;
- vcpu->arch.kvm_cpuid_base = 0;
-
- for_each_possible_hypervisor_cpuid_base(function) {
- entry = kvm_find_cpuid_entry(vcpu, function);
+ for_each_possible_hypervisor_cpuid_base(base) {
+ entry = kvm_find_cpuid_entry(vcpu, base);
if (entry) {
u32 signature[3];
@@ -197,19 +199,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
signature[1] = entry->ecx;
signature[2] = entry->edx;
- BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE));
- if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) {
- vcpu->arch.kvm_cpuid_base = function;
+ if (!memcmp(signature, sig, sizeof(signature))) {
+ cpuid.base = base;
+ cpuid.limit = entry->eax;
break;
}
}
}
+
+ return cpuid;
}
static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
struct kvm_cpuid_entry2 *entries, int nent)
{
- u32 base = vcpu->arch.kvm_cpuid_base;
+ u32 base = vcpu->arch.kvm_cpuid.base;
if (!base)
return NULL;
@@ -439,7 +443,8 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
vcpu->arch.cpuid_entries = e2;
vcpu->arch.cpuid_nent = nent;
- kvm_update_kvm_cpuid_base(vcpu);
+ vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
+ vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
kvm_vcpu_after_set_cpuid(vcpu);
return 0;
@@ -663,8 +668,9 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
kvm_cpu_cap_mask(CPUID_7_1_EAX,
- F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) |
- F(AVX_IFMA)
+ F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) |
+ F(FZRM) | F(FSRS) | F(FSRC) |
+ F(AMX_FP16) | F(AVX_IFMA)
);
kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
@@ -701,6 +707,10 @@ void kvm_set_cpu_caps(void)
if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
+ kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX,
+ SF(CONSTANT_TSC)
+ );
+
kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
@@ -741,6 +751,27 @@ void kvm_set_cpu_caps(void)
0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
F(SME_COHERENT));
+ kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
+ F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
+ F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
+ );
+
+ /*
+ * Synthesize "LFENCE is serializing" into the AMD-defined entry in
+ * KVM's supported CPUID if the feature is reported as supported by the
+ * kernel. LFENCE_RDTSC was a Linux-defined synthetic feature long
+ * before AMD joined the bandwagon, e.g. LFENCE is serializing on most
+ * CPUs that support SSE2. On CPUs that don't support AMD's leaf,
+ * kvm_cpu_cap_mask() will unfortunately drop the flag due to ANDing
+ * the mask with the raw host CPUID, and reporting support in AMD's
+ * leaf can make it easier for userspace to detect the feature.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ kvm_cpu_cap_set(X86_FEATURE_LFENCE_RDTSC);
+ if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
+ kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE);
+ kvm_cpu_cap_set(X86_FEATURE_NO_SMM_CTL_MSR);
+
kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
@@ -1148,8 +1179,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx &= ~GENMASK(17, 16);
break;
case 0x80000007: /* Advanced power management */
- /* invariant TSC is CPUID.80000007H:EDX[8] */
- entry->edx &= (1 << 8);
+ cpuid_entry_override(entry, CPUID_8000_0007_EDX);
+
/* mask against host */
entry->edx &= boot_cpu_data.x86_power;
entry->eax = entry->ebx = entry->ecx = 0;
@@ -1222,25 +1253,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
case 0x80000021:
entry->ebx = entry->ecx = entry->edx = 0;
- /*
- * Pass down these bits:
- * EAX 0 NNDBP, Processor ignores nested data breakpoints
- * EAX 2 LAS, LFENCE always serializing
- * EAX 6 NSCB, Null selector clear base
- *
- * Other defined bits are for MSRs that KVM does not expose:
- * EAX 3 SPCL, SMM page configuration lock
- * EAX 13 PCMSR, Prefetch control MSR
- *
- * KVM doesn't support SMM_CTL.
- * EAX 9 SMM_CTL MSR is not supported
- */
- entry->eax &= BIT(0) | BIT(2) | BIT(6);
- entry->eax |= BIT(9);
- if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC))
- entry->eax |= BIT(2);
- if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
- entry->eax |= BIT(6);
+ cpuid_entry_override(entry, CPUID_8000_0021_EAX);
break;
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
@@ -1482,6 +1495,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
(data & TSX_CTRL_CPUID_CLEAR))
*ebx &= ~(F(RTM) | F(HLE));
+ } else if (function == 0x80000007) {
+ if (kvm_hv_invtsc_suppressed(vcpu))
+ *edx &= ~SF(CONSTANT_TSC);
}
} else {
*eax = *ebx = *ecx = *edx = 0;
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index c1390357126a..ee8c4c3496ed 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -4,6 +4,8 @@
*
* Copyright 2016 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kvm_host.h>
#include <linux/debugfs.h>
#include "lapic.h"
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5cc3efa0e21c..a20bec931764 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -17,6 +17,7 @@
*
* From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
@@ -1633,7 +1634,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_SS:
/*
* segment is not a writable data segment or segment
- * selector's RPL != CPL or segment selector's RPL != CPL
+ * selector's RPL != CPL or DPL != CPL
*/
if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
goto exception;
@@ -1695,11 +1696,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
/*
* segment is not a data or readable code segment or
* ((segment is a data or nonconforming code segment)
- * and (both RPL and CPL > DPL))
+ * and ((RPL > DPL) or (CPL > DPL)))
*/
if ((seg_desc.type & 0xa) == 0x8 ||
(((seg_desc.type & 0xc) != 0xc) &&
- (rpl > dpl && cpl > dpl)))
+ (rpl > dpl || cpl > dpl)))
goto exception;
break;
}
@@ -2309,7 +2310,7 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
- if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
+ if (!ctxt->ops->is_smm(ctxt))
return emulate_ud(ctxt);
if (ctxt->ops->leave_smm(ctxt))
@@ -2615,8 +2616,8 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
return true;
}
-static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
- u16 port, u16 len)
+static bool emulator_io_permitted(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
{
if (ctxt->perm_ok)
return true;
@@ -3961,7 +3962,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
static int check_perm_in(struct x86_emulate_ctxt *ctxt)
{
ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
+ if (!emulator_io_permitted(ctxt, ctxt->src.val, ctxt->dst.bytes))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
@@ -3970,7 +3971,7 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt)
static int check_perm_out(struct x86_emulate_ctxt *ctxt)
{
ctxt->src.bytes = min(ctxt->src.bytes, 4u);
- if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
+ if (!emulator_io_permitted(ctxt, ctxt->dst.val, ctxt->src.bytes))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
@@ -5132,7 +5133,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
const struct x86_emulate_ops *ops = ctxt->ops;
int rc = X86EMUL_CONTINUE;
int saved_dst_type = ctxt->dst.type;
- unsigned emul_flags;
+ bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt);
ctxt->mem_read.pos = 0;
@@ -5147,7 +5148,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- emul_flags = ctxt->ops->get_hflags(ctxt);
if (unlikely(ctxt->d &
(No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
@@ -5181,7 +5181,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
fetch_possible_mmx_operand(&ctxt->dst);
}
- if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
+ if (unlikely(is_guest_mode) && ctxt->intercept) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_PRE_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -5210,7 +5210,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -5264,7 +5264,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index e8296942a868..b28fd020066f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -17,6 +17,7 @@
* Ben-Ami Yassour <benami@il.ibm.com>
* Andrey Smetanin <asmetanin@virtuozzo.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "x86.h"
#include "lapic.h"
@@ -43,6 +44,24 @@
#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK)
+/*
+ * As per Hyper-V TLFS, extended hypercalls start from 0x8001
+ * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value
+ * where each bit tells which extended hypercall is available besides
+ * HvExtCallQueryCapabilities.
+ *
+ * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit
+ * assigned.
+ *
+ * 0x8002 - Bit 0
+ * 0x8003 - Bit 1
+ * ..
+ * 0x8041 - Bit 63
+ *
+ * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64
+ */
+#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64)
+
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
bool vcpu_kick);
@@ -999,6 +1018,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
r = true;
@@ -1283,6 +1303,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
case HV_X64_MSR_TSC_EMULATION_STATUS:
return hv_vcpu->cpuid_cache.features_eax &
HV_ACCESS_REENLIGHTENMENT;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_TSC_INVARIANT;
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
return hv_vcpu->cpuid_cache.features_edx &
@@ -1410,12 +1433,22 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
if (!host)
return 1;
break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ /* Only bit 0 is supported */
+ if (data & ~HV_EXPOSE_INVARIANT_TSC)
+ return 1;
+
+ /* The feature can't be disabled from the guest */
+ if (!host && hv->hv_invtsc_control && !data)
+ return 1;
+
+ hv->hv_invtsc_control = data;
+ break;
case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
return syndbg_set_msr(vcpu, msr, data, host);
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
return 0;
@@ -1536,8 +1569,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
@@ -1585,11 +1617,14 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_TSC_EMULATION_STATUS:
data = hv->hv_tsc_emulation_status;
break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ data = hv->hv_invtsc_control;
+ break;
case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
return syndbg_get_msr(vcpu, msr, pdata, host);
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
return 1;
}
@@ -1654,7 +1689,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = APIC_BUS_FREQUENCY;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
return 1;
}
*pdata = data;
@@ -2420,6 +2455,9 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
case HVCALL_SEND_IPI:
return hv_vcpu->cpuid_cache.enlightenments_eax &
HV_X64_CLUSTER_IPI_RECOMMENDED;
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ return hv_vcpu->cpuid_cache.features_ebx &
+ HV_ENABLE_EXTENDED_HYPERCALLS;
default:
break;
}
@@ -2512,14 +2550,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = hc.param;
- vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
- vcpu->arch.complete_userspace_io =
- kvm_hv_hypercall_complete_userspace;
- return 0;
+ goto hypercall_userspace_exit;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
@@ -2578,15 +2609,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
ret = HV_STATUS_OPERATION_DENIED;
break;
}
- vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = hc.param;
- vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
- vcpu->arch.complete_userspace_io =
- kvm_hv_hypercall_complete_userspace;
- return 0;
+ goto hypercall_userspace_exit;
}
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ if (unlikely(hc.fast)) {
+ ret = HV_STATUS_INVALID_PARAMETER;
+ break;
+ }
+ goto hypercall_userspace_exit;
default:
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
@@ -2594,6 +2624,15 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
hypercall_complete:
return kvm_hv_hypercall_complete(vcpu, ret);
+
+hypercall_userspace_exit:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
+ vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace;
+ return 0;
}
void kvm_hv_init_vm(struct kvm *kvm)
@@ -2733,9 +2772,11 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
ent->eax |= HV_ACCESS_FREQUENCY_MSRS;
ent->eax |= HV_ACCESS_REENLIGHTENMENT;
+ ent->eax |= HV_ACCESS_TSC_INVARIANT;
ent->ebx |= HV_POST_MESSAGES;
ent->ebx |= HV_SIGNAL_EVENTS;
+ ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS;
ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE;
ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 9f96414a31c5..f83b8db72b11 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -136,6 +136,33 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
HV_SYNIC_STIMER_COUNT);
}
+/*
+ * With HV_ACCESS_TSC_INVARIANT feature, invariant TSC (CPUID.80000007H:EDX[8])
+ * is only observed after HV_X64_MSR_TSC_INVARIANT_CONTROL was written to.
+ */
+static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ /*
+ * If Hyper-V's invariant TSC control is not exposed to the guest,
+ * the invariant TSC CPUID flag is not suppressed, Windows guests were
+ * observed to be able to handle it correctly. Going forward, VMMs are
+ * encouraged to enable Hyper-V's invariant TSC control when invariant
+ * TSC CPUID flag is set to make KVM's behavior match genuine Hyper-V.
+ */
+ if (!hv_vcpu ||
+ !(hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT))
+ return false;
+
+ /*
+ * If Hyper-V's invariant TSC control is exposed to the guest, KVM is
+ * responsible for suppressing the invariant TSC CPUID flag if the
+ * Hyper-V control is not enabled.
+ */
+ return !(to_kvm_hv(vcpu->kvm)->hv_invtsc_control & HV_EXPOSE_INVARIANT_TSC);
+}
+
void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
void kvm_hv_setup_tsc_page(struct kvm *kvm,
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index e0a7a0e7a73c..cd57a517d04a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -30,7 +30,7 @@
* Based on QEMU and Xen.
*/
-#define pr_fmt(fmt) "pit: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/slab.h>
@@ -351,7 +351,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
if (ps->period < min_period) {
pr_info_ratelimited(
- "kvm: requested %lld ns "
+ "requested %lld ns "
"i8254 timer period limited to %lld ns\n",
ps->period, min_period);
ps->period = min_period;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index e1bb6218bb96..4756bcb5724f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,6 +26,8 @@
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
* Port from Qemu.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
@@ -35,7 +37,7 @@
#include "trace.h"
#define pr_pic_unimpl(fmt, ...) \
- pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
+ pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__)
static void pic_irq_request(struct kvm *kvm, int level);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 765943d7cfa5..042dee556125 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -26,6 +26,7 @@
* Yaozu (Eddie) Dong <eddie.dong@intel.com>
* Based on Xen 3.1 code.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index a70952eca905..b2c397dd2bc6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -7,6 +7,7 @@
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/export.h>
#include <linux/kvm_host.h>
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 3742d9adacfc..16d076a1b91a 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -8,6 +8,7 @@
*
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/slab.h>
@@ -56,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
if (irq->dest_mode == APIC_DEST_PHYSICAL &&
irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
- printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ pr_info("apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
@@ -199,7 +200,7 @@ int kvm_request_irq_source_id(struct kvm *kvm)
irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
if (irq_source_id >= BITS_PER_LONG) {
- printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+ pr_warn("exhausted allocatable IRQ sources!\n");
irq_source_id = -EFAULT;
goto unlock;
}
@@ -221,7 +222,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
irq_source_id >= BITS_PER_LONG) {
- printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+ pr_err("IRQ source ID out of range!\n");
goto unlock;
}
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index c09174f73a34..4c91f626c058 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -76,6 +76,18 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
}
/*
+ * kvm_register_test_and_mark_available() is a special snowflake that uses an
+ * arch bitop directly to avoid the explicit instrumentation that comes with
+ * the generic bitops. This allows code that cannot be instrumented (noinstr
+ * functions), e.g. the low level VM-Enter/VM-Exit paths, to cache registers.
+ */
+static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+/*
* The "raw" register helpers are only for cases where the full 64 bits of a
* register are read/written irrespective of current vCPU mode. In other words,
* odds are good you shouldn't be using the raw variants.
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 2d9662be8333..ab65f3a47dfd 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -220,7 +220,8 @@ struct x86_emulate_ops {
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
- unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
+ bool (*is_smm)(struct x86_emulate_ctxt *ctxt);
+ bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt);
int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
@@ -275,10 +276,6 @@ enum x86emul_mode {
X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
};
-/* These match some of the HF_* flags defined in kvm_host.h */
-#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
-#define X86EMUL_SMM_MASK (1 << 6)
-
/*
* fastop functions are declared as taking a never-defined fastop parameter,
* so they can't be called from C directly.
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index ee4f696a0782..482d6639ef88 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -2,6 +2,7 @@
/*
* KVM L1 hypervisor optimizations on Hyper-V.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <asm/mshyperv.h>
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4efdb4a4d72c..e542cf285b51 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -15,6 +15,7 @@
*
* Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
@@ -166,9 +167,19 @@ static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
}
+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+{
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+}
+
static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
- switch (map->mode) {
+ switch (map->logical_mode) {
+ case KVM_APIC_MODE_SW_DISABLED:
+ /* Arbitrarily use the flat map so that @cluster isn't NULL. */
+ *cluster = map->xapic_flat_map;
+ *mask = 0;
+ return true;
case KVM_APIC_MODE_X2APIC: {
u32 offset = (dest_id >> 16) * 16;
u32 max_apic_id = map->max_apic_id;
@@ -193,8 +204,10 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
*cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
*mask = dest_id & 0xf;
return true;
+ case KVM_APIC_MODE_MAP_DISABLED:
+ return false;
default:
- /* Not optimized. */
+ WARN_ON_ONCE(1);
return false;
}
}
@@ -206,6 +219,134 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
kvfree(map);
}
+static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu,
+ bool *xapic_id_mismatch)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 x2apic_id = kvm_x2apic_id(apic);
+ u32 xapic_id = kvm_xapic_id(apic);
+ u32 physical_id;
+
+ /*
+ * Deliberately truncate the vCPU ID when detecting a mismatched APIC
+ * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
+ * 32-bit value. Any unwanted aliasing due to truncation results will
+ * be detected below.
+ */
+ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
+ *xapic_id_mismatch = true;
+
+ /*
+ * Apply KVM's hotplug hack if userspace has enable 32-bit APIC IDs.
+ * Allow sending events to vCPUs by their x2APIC ID even if the target
+ * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
+ * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
+ * and collide).
+ *
+ * Honor the architectural (and KVM's non-optimized) behavior if
+ * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed
+ * to process messages independently. If multiple vCPUs have the same
+ * effective APIC ID, e.g. due to the x2APIC wrap or because the guest
+ * manually modified its xAPIC IDs, events targeting that ID are
+ * supposed to be recognized by all vCPUs with said ID.
+ */
+ if (vcpu->kvm->arch.x2apic_format) {
+ /* See also kvm_apic_match_physical_addr(). */
+ if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
+ x2apic_id <= new->max_apic_id)
+ new->phys_map[x2apic_id] = apic;
+
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+ } else {
+ /*
+ * Disable the optimized map if the physical APIC ID is already
+ * mapped, i.e. is aliased to multiple vCPUs. The optimized
+ * map requires a strict 1:1 mapping between IDs and vCPUs.
+ */
+ if (apic_x2apic_mode(apic))
+ physical_id = x2apic_id;
+ else
+ physical_id = xapic_id;
+
+ if (new->phys_map[physical_id])
+ return -EINVAL;
+
+ new->phys_map[physical_id] = apic;
+ }
+
+ return 0;
+}
+
+static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ enum kvm_apic_logical_mode logical_mode;
+ struct kvm_lapic **cluster;
+ u16 mask;
+ u32 ldr;
+
+ if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ return;
+
+ if (!kvm_apic_sw_enabled(apic))
+ return;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+ if (!ldr)
+ return;
+
+ if (apic_x2apic_mode(apic)) {
+ logical_mode = KVM_APIC_MODE_X2APIC;
+ } else {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+ if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+ logical_mode = KVM_APIC_MODE_XAPIC_FLAT;
+ else
+ logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER;
+ }
+
+ /*
+ * To optimize logical mode delivery, all software-enabled APICs must
+ * be configured for the same mode.
+ */
+ if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
+ new->logical_mode = logical_mode;
+ } else if (new->logical_mode != logical_mode) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ /*
+ * In x2APIC mode, the LDR is read-only and derived directly from the
+ * x2APIC ID, thus is guaranteed to be addressable. KVM reuses
+ * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by
+ * reversing the LDR calculation to get cluster of APICs, i.e. no
+ * additional work is required.
+ */
+ if (apic_x2apic_mode(apic)) {
+ WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic)));
+ return;
+ }
+
+ if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
+ &cluster, &mask))) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ if (!mask)
+ return;
+
+ ldr = ffs(mask) - 1;
+ if (!is_power_of_2(mask) || cluster[ldr])
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ else
+ cluster[ldr] = apic;
+}
+
/*
* CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock.
*
@@ -224,6 +365,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
struct kvm_vcpu *vcpu;
unsigned long i;
u32 max_id = 255; /* enough space for any xAPIC ID */
+ bool xapic_id_mismatch = false;
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
@@ -256,54 +398,41 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
goto out;
new->max_apic_id = max_id;
+ new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
kvm_for_each_vcpu(i, vcpu, kvm) {
- struct kvm_lapic *apic = vcpu->arch.apic;
- struct kvm_lapic **cluster;
- u16 mask;
- u32 ldr;
- u8 xapic_id;
- u32 x2apic_id;
-
if (!kvm_apic_present(vcpu))
continue;
- xapic_id = kvm_xapic_id(apic);
- x2apic_id = kvm_x2apic_id(apic);
-
- /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
- if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
- x2apic_id <= new->max_apic_id)
- new->phys_map[x2apic_id] = apic;
- /*
- * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
- * prevent them from masking VCPUs with APIC ID <= 0xff.
- */
- if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
- new->phys_map[xapic_id] = apic;
-
- if (!kvm_apic_sw_enabled(apic))
- continue;
-
- ldr = kvm_lapic_get_reg(apic, APIC_LDR);
-
- if (apic_x2apic_mode(apic)) {
- new->mode |= KVM_APIC_MODE_X2APIC;
- } else if (ldr) {
- ldr = GET_APIC_LOGICAL_ID(ldr);
- if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
- new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
- else
- new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+ if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) {
+ kvfree(new);
+ new = NULL;
+ goto out;
}
- if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
- continue;
-
- if (mask)
- cluster[ffs(mask) - 1] = apic;
+ kvm_recalculate_logical_map(new, vcpu);
}
out:
+ /*
+ * The optimized map is effectively KVM's internal version of APICv,
+ * and all unwanted aliasing that results in disabling the optimized
+ * map also applies to APICv.
+ */
+ if (!new)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+
+ if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+
+ if (xapic_id_mismatch)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+
old = rcu_dereference_protected(kvm->arch.apic_map,
lockdep_is_held(&kvm->arch.apic_map_lock));
rcu_assign_pointer(kvm->arch.apic_map, new);
@@ -360,11 +489,6 @@ static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
-static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
-{
- return ((id >> 4) << 16) | (1 << (id & 0xf));
-}
-
static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
u32 ldr = kvm_apic_calc_x2apic_ldr(id);
@@ -941,8 +1065,7 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
{
if (!kvm->arch.disabled_lapic_found) {
kvm->arch.disabled_lapic_found = true;
- printk(KERN_INFO
- "Disabled LAPIC found during irq injection\n");
+ pr_info("Disabled LAPIC found during irq injection\n");
}
}
@@ -951,7 +1074,7 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
{
if (kvm->arch.x2apic_broadcast_quirk_disabled) {
if ((irq->dest_id == APIC_BROADCAST &&
- map->mode != KVM_APIC_MODE_X2APIC))
+ map->logical_mode != KVM_APIC_MODE_X2APIC))
return true;
if (irq->dest_id == X2APIC_BROADCAST)
return true;
@@ -1364,7 +1487,6 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
ktime_t remaining, now;
s64 ns;
- u32 tmcct;
ASSERT(apic != NULL);
@@ -1379,10 +1501,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
remaining = 0;
ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
- tmcct = div64_u64(ns,
- (APIC_BUS_CYCLE_NS * apic->divide_count));
-
- return tmcct;
+ return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count));
}
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
@@ -1442,19 +1561,15 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
#define APIC_REGS_MASK(first, count) \
(APIC_REG_MASK(first) * ((1ull << (count)) - 1))
-static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data)
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
{
- unsigned char alignment = offset & 0xf;
- u32 result;
- /* this bitmask has a bit cleared for each reserved register */
+ /* Leave bits '0' for reserved and write-only registers. */
u64 valid_reg_mask =
APIC_REG_MASK(APIC_ID) |
APIC_REG_MASK(APIC_LVR) |
APIC_REG_MASK(APIC_TASKPRI) |
APIC_REG_MASK(APIC_PROCPRI) |
APIC_REG_MASK(APIC_LDR) |
- APIC_REG_MASK(APIC_DFR) |
APIC_REG_MASK(APIC_SPIV) |
APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
@@ -1474,21 +1589,33 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
- /*
- * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR
- * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be
- * manually handled by the caller.
- */
+ /* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */
if (!apic_x2apic_mode(apic))
valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
+ APIC_REG_MASK(APIC_DFR) |
APIC_REG_MASK(APIC_ICR2);
- else
- WARN_ON_ONCE(offset == APIC_ICR);
+
+ return valid_reg_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask);
+
+static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
+{
+ unsigned char alignment = offset & 0xf;
+ u32 result;
+
+ /*
+ * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in
+ * x2APIC and needs to be manually handled by the caller.
+ */
+ WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR);
if (alignment + len > 4)
return 1;
- if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset)))
+ if (offset > 0x3f0 ||
+ !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset)))
return 1;
result = __apic_read(apic, offset & ~0xf);
@@ -1560,7 +1687,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
if (apic->lapic_timer.period < min_period) {
pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
+ "vcpu %i: requested %lld ns "
"lapic timer period limited to %lld ns\n",
apic->vcpu->vcpu_id,
apic->lapic_timer.period, min_period);
@@ -1841,11 +1968,15 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
if (unlikely(count_reg != APIC_TMICT)) {
deadline = tmict_to_ns(apic,
kvm_lapic_get_reg(apic, count_reg));
- if (unlikely(deadline <= 0))
- deadline = apic->lapic_timer.period;
+ if (unlikely(deadline <= 0)) {
+ if (apic_lvtt_period(apic))
+ deadline = apic->lapic_timer.period;
+ else
+ deadline = 0;
+ }
else if (unlikely(deadline > apic->lapic_timer.period)) {
pr_info_ratelimited(
- "kvm: vcpu %i: requested lapic timer restore with "
+ "vcpu %i: requested lapic timer restore with "
"starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
"Using initial count to start timer.\n",
apic->vcpu->vcpu_id,
@@ -2068,19 +2199,6 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
}
}
-static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
-{
- struct kvm *kvm = apic->vcpu->kvm;
-
- if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
- return;
-
- if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
- return;
-
- kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
-}
-
static int get_lvt_index(u32 reg)
{
if (reg == APIC_LVTCMCI)
@@ -2101,7 +2219,6 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_ID: /* Local APIC ID */
if (!apic_x2apic_mode(apic)) {
kvm_apic_set_xapic_id(apic, val >> 24);
- kvm_lapic_xapic_id_updated(apic);
} else {
ret = 1;
}
@@ -2219,10 +2336,14 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
case APIC_SELF_IPI:
- if (apic_x2apic_mode(apic))
- kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
- else
+ /*
+ * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold
+ * the vector, everything else is reserved.
+ */
+ if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK))
ret = 1;
+ else
+ kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0);
break;
default:
ret = 1;
@@ -2284,23 +2405,18 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
struct kvm_lapic *apic = vcpu->arch.apic;
u64 val;
- if (apic_x2apic_mode(apic)) {
- if (KVM_BUG_ON(kvm_lapic_msr_read(apic, offset, &val), vcpu->kvm))
- return;
- } else {
- val = kvm_lapic_get_reg(apic, offset);
- }
-
/*
* ICR is a single 64-bit register when x2APIC is enabled. For legacy
* xAPIC, ICR writes need to go down the common (slightly slower) path
* to get the upper half from ICR2.
*/
if (apic_x2apic_mode(apic) && offset == APIC_ICR) {
+ val = kvm_lapic_get_reg64(apic, APIC_ICR);
kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32));
trace_kvm_apic_write(APIC_ICR, val);
} else {
/* TODO: optimize to just emulate side effect w/o one more write */
+ val = kvm_lapic_get_reg(apic, offset);
kvm_lapic_reg_write(apic, offset, (u32)val);
}
}
@@ -2394,11 +2510,15 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
}
- if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
- kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+ if ((old_value ^ value) & X2APIC_ENABLE) {
+ if (value & X2APIC_ENABLE)
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+ else if (value & MSR_IA32_APICBASE_ENABLE)
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ }
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
- kvm_vcpu_update_apicv(vcpu);
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
}
@@ -2429,6 +2549,78 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
*/
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
}
+ apic->highest_isr_cache = -1;
+}
+
+int kvm_alloc_apic_access_page(struct kvm *kvm)
+{
+ struct page *page;
+ void __user *hva;
+ int ret = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.apic_access_memslot_enabled ||
+ kvm->arch.apic_access_memslot_inhibited)
+ goto out;
+
+ hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (IS_ERR(hva)) {
+ ret = PTR_ERR(hva);
+ goto out;
+ }
+
+ page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * Do not pin the page in memory, so that memory hot-unplug
+ * is able to migrate it.
+ */
+ put_page(page);
+ kvm->arch.apic_access_memslot_enabled = true;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page);
+
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.apic_access_memslot_enabled)
+ return;
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ mutex_lock(&kvm->slots_lock);
+
+ if (kvm->arch.apic_access_memslot_enabled) {
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
+ /*
+ * Clear "enabled" after the memslot is deleted so that a
+ * different vCPU doesn't get a false negative when checking
+ * the flag out of slots_lock. No additional memory barrier is
+ * needed as modifying memslots requires waiting other vCPUs to
+ * drop SRCU (see above), and false positives are ok as the
+ * flag is rechecked after acquiring slots_lock.
+ */
+ kvm->arch.apic_access_memslot_enabled = false;
+
+ /*
+ * Mark the memslot as inhibited to prevent reallocating the
+ * memslot during vCPU creation, e.g. if a vCPU is hotplugged.
+ */
+ kvm->arch.apic_access_memslot_inhibited = true;
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm_vcpu_srcu_read_lock(vcpu);
}
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -2484,7 +2676,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
kvm_apic_update_apicv(vcpu);
- apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -2756,9 +2947,6 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
}
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
- if (!apic_x2apic_mode(apic))
- kvm_lapic_xapic_id_updated(apic);
-
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
kvm_recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);
@@ -2772,7 +2960,6 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
__start_apic_timer(apic, APIC_TMCCT);
kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
kvm_apic_update_apicv(vcpu);
- apic->highest_isr_cache = -1;
if (apic->apicv_active) {
static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
@@ -2943,13 +3130,17 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
{
/*
- * ICR is a 64-bit register in x2APIC mode (and Hyper'v PV vAPIC) and
+ * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and
* can be written as such, all other registers remain accessible only
* through 32-bit reads/writes.
*/
if (reg == APIC_ICR)
return kvm_x2apic_icr_write(apic, data);
+ /* Bits 63:32 are reserved in all other registers. */
+ if (data >> 32)
+ return 1;
+
return kvm_lapic_reg_write(apic, reg, (u32)data);
}
@@ -2972,9 +3163,6 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- if (reg == APIC_DFR)
- return 1;
-
return kvm_lapic_msr_read(apic, reg, data);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 58c3242fcc7a..0a0ea4b5dd8c 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -112,6 +112,8 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
+int kvm_alloc_apic_access_page(struct kvm *kvm);
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu);
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
@@ -144,6 +146,8 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
void kvm_lapic_exit(void);
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
+
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 6bdaacb6faa0..168c46fd8dd1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -230,14 +230,14 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
}
#ifdef CONFIG_X86_64
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
+extern bool tdp_mmu_enabled;
#else
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+#define tdp_mmu_enabled false
#endif
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
- return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm);
+ return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm);
}
static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 835426254e76..c8ebe542c565 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -14,6 +14,7 @@
* Yaniv Kamay <yaniv@qumranet.com>
* Avi Kivity <avi@qumranet.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "irq.h"
#include "ioapic.h"
@@ -43,6 +44,7 @@
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
+#include <linux/kstrtox.h>
#include <linux/kthread.h>
#include <asm/page.h>
@@ -99,6 +101,13 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
*/
bool tdp_enabled = false;
+static bool __ro_after_init tdp_mmu_allowed;
+
+#ifdef CONFIG_X86_64
+bool __read_mostly tdp_mmu_enabled = true;
+module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
+#endif
+
static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
@@ -261,6 +270,17 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
kvm_flush_remote_tlbs_with_range(kvm, &range);
}
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
+
+/* Flush the range of guest memory mapped by the given SPTE. */
+static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
+
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
+}
+
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned int access)
{
@@ -609,9 +629,14 @@ static bool mmu_spte_age(u64 *sptep)
return true;
}
+static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
+{
+ return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
+}
+
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
- if (is_tdp_mmu(vcpu->arch.mmu)) {
+ if (is_tdp_mmu_active(vcpu)) {
kvm_tdp_mmu_walk_lockless_begin();
} else {
/*
@@ -630,7 +655,7 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
- if (is_tdp_mmu(vcpu->arch.mmu)) {
+ if (is_tdp_mmu_active(vcpu)) {
kvm_tdp_mmu_walk_lockless_end();
} else {
/*
@@ -800,7 +825,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
- kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
}
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1174,8 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
drop_spte(kvm, sptep);
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
}
/*
@@ -1279,7 +1303,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, true);
@@ -1312,7 +1336,7 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, false);
@@ -1395,7 +1419,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
}
}
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
write_protected |=
kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
@@ -1456,7 +1480,7 @@ restart:
}
if (need_flush && kvm_available_flush_tlb_with_range()) {
- kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
return false;
}
@@ -1558,7 +1582,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
return flush;
@@ -1571,7 +1595,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
return flush;
@@ -1626,8 +1650,7 @@ static void __rmap_add(struct kvm *kvm,
kvm->stat.max_mmu_rmap_size = rmap_count;
if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
kvm_zap_all_rmap_sptes(kvm, rmap_head);
- kvm_flush_remote_tlbs_with_address(
- kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
}
}
@@ -1646,7 +1669,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
return young;
@@ -1659,7 +1682,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
return young;
@@ -1921,7 +1944,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
return true;
/* TDP MMU pages do not use the MMU generation. */
- return !sp->tdp_mmu_page &&
+ return !is_tdp_mmu_page(sp) &&
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
@@ -2355,7 +2378,16 @@ static void __link_shadow_page(struct kvm *kvm,
mmu_page_add_parent_pte(cache, sp, sptep);
- if (sp->unsync_children || sp->unsync)
+ /*
+ * The non-direct sub-pagetable must be updated before linking. For
+ * L1 sp, the pagetable is updated via kvm_sync_page() in
+ * kvm_mmu_find_shadow_page() without write-protecting the gfn,
+ * so sp->unsync can be true or false. For higher level non-direct
+ * sp, the pagetable is updated/synced via mmu_sync_children() in
+ * FNAME(fetch)(), so sp->unsync_children can only be false.
+ * WARN_ON_ONCE() if anything happens unexpectedly.
+ */
+ if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
mark_unsync(sptep);
}
@@ -2383,7 +2415,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
return;
drop_parent_pte(child, sptep);
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
}
}
@@ -2867,8 +2899,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
if (flush)
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
- KVM_PAGES_PER_HPAGE(level));
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -3116,11 +3147,11 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
!is_large_pte(spte) &&
spte_to_child_sp(spte)->nx_huge_page_disallowed) {
/*
- * A small SPTE exists for this pfn, but FNAME(fetch)
- * and __direct_map would like to create a large PTE
- * instead: just force them to go down another level,
- * patching back for them into pfn the next 9 bits of
- * the address.
+ * A small SPTE exists for this pfn, but FNAME(fetch),
+ * direct_map(), or kvm_tdp_mmu_map() would like to create a
+ * large PTE instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of the
+ * address.
*/
u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
KVM_PAGES_PER_HPAGE(cur_level - 1);
@@ -3129,7 +3160,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
}
}
-static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
@@ -3147,7 +3178,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
if (it.level == fault->goal_level)
break;
@@ -3173,14 +3204,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return ret;
}
-static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
+static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
{
- send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
+ unsigned long hva = gfn_to_hva_memslot(slot, gfn);
+
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
}
-static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- if (is_sigpending_pfn(pfn)) {
+ if (is_sigpending_pfn(fault->pfn)) {
kvm_handle_signal_exit(vcpu);
return -EINTR;
}
@@ -3190,43 +3223,43 @@ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
* into the spte otherwise read access on readonly gfn also can
* caused mmio page fault and treat it as mmio access.
*/
- if (pfn == KVM_PFN_ERR_RO_FAULT)
+ if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
return RET_PF_EMULATE;
- if (pfn == KVM_PFN_ERR_HWPOISON) {
- kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
+ if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(fault->slot, fault->gfn);
return RET_PF_RETRY;
}
return -EFAULT;
}
-static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
- unsigned int access)
+static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ unsigned int access)
{
- /* The pfn is invalid, report the error! */
- if (unlikely(is_error_pfn(fault->pfn)))
- return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn);
+ gva_t gva = fault->is_tdp ? 0 : fault->addr;
- if (unlikely(!fault->slot)) {
- gva_t gva = fault->is_tdp ? 0 : fault->addr;
+ vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
+ access & shadow_mmio_access_mask);
- vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
- access & shadow_mmio_access_mask);
- /*
- * If MMIO caching is disabled, emulate immediately without
- * touching the shadow page tables as attempting to install an
- * MMIO SPTE will just be an expensive nop. Do not cache MMIO
- * whose gfn is greater than host.MAXPHYADDR, any guest that
- * generates such gfns is running nested and is being tricked
- * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
- * and only if L1's MAXPHYADDR is inaccurate with respect to
- * the hardware's).
- */
- if (unlikely(!enable_mmio_caching) ||
- unlikely(fault->gfn > kvm_mmu_max_gfn()))
- return RET_PF_EMULATE;
- }
+ /*
+ * If MMIO caching is disabled, emulate immediately without
+ * touching the shadow page tables as attempting to install an
+ * MMIO SPTE will just be an expensive nop.
+ */
+ if (unlikely(!enable_mmio_caching))
+ return RET_PF_EMULATE;
+
+ /*
+ * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
+ * any guest that generates such gfns is running nested and is being
+ * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
+ * only if L1's MAXPHYADDR is inaccurate with respect to the
+ * hardware's).
+ */
+ if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
+ return RET_PF_EMULATE;
return RET_PF_CONTINUE;
}
@@ -3350,7 +3383,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
do {
u64 new_spte;
- if (is_tdp_mmu(vcpu->arch.mmu))
+ if (tdp_mmu_enabled)
sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
else
sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
@@ -3433,8 +3466,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
if (++retry_count > 4) {
- printk_once(KERN_WARNING
- "kvm: Fast #PF retrying more than 4 times.\n");
+ pr_warn_once("Fast #PF retrying more than 4 times.\n");
break;
}
@@ -3596,7 +3628,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
if (r < 0)
goto out_unlock;
- if (is_tdp_mmu_enabled(vcpu->kvm)) {
+ if (tdp_mmu_enabled) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
mmu->root.hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
@@ -4026,7 +4058,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
walk_shadow_page_lockless_begin(vcpu);
- if (is_tdp_mmu(vcpu->arch.mmu))
+ if (is_tdp_mmu_active(vcpu))
leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
else
leaf = get_walk(vcpu, addr, sptes, &root);
@@ -4174,7 +4206,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
}
-static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
bool async;
@@ -4235,12 +4267,33 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return RET_PF_CONTINUE;
}
+static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ unsigned int access)
+{
+ int ret;
+
+ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ ret = __kvm_faultin_pfn(vcpu, fault);
+ if (ret != RET_PF_CONTINUE)
+ return ret;
+
+ if (unlikely(is_error_pfn(fault->pfn)))
+ return kvm_handle_error_pfn(vcpu, fault);
+
+ if (unlikely(!fault->slot))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ return RET_PF_CONTINUE;
+}
+
/*
* Returns true if the page fault is stale and needs to be retried, i.e. if the
* root was invalidated by a memslot update or a relevant mmu_notifier fired.
*/
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault, int mmu_seq)
+ struct kvm_page_fault *fault)
{
struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
@@ -4260,19 +4313,13 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
return true;
return fault->slot &&
- mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+ mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
}
static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
-
- unsigned long mmu_seq;
int r;
- fault->gfn = fault->addr >> PAGE_SHIFT;
- fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
-
if (page_fault_handle_page_track(vcpu, fault))
return RET_PF_EMULATE;
@@ -4284,41 +4331,24 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r)
return r;
- mmu_seq = vcpu->kvm->mmu_invalidate_seq;
- smp_rmb();
-
- r = kvm_faultin_pfn(vcpu, fault);
- if (r != RET_PF_CONTINUE)
- return r;
-
- r = handle_abnormal_pfn(vcpu, fault, ACC_ALL);
+ r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
if (r != RET_PF_CONTINUE)
return r;
r = RET_PF_RETRY;
+ write_lock(&vcpu->kvm->mmu_lock);
- if (is_tdp_mmu_fault)
- read_lock(&vcpu->kvm->mmu_lock);
- else
- write_lock(&vcpu->kvm->mmu_lock);
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
- if (is_page_fault_stale(vcpu, fault, mmu_seq))
+ r = make_mmu_pages_available(vcpu);
+ if (r)
goto out_unlock;
- if (is_tdp_mmu_fault) {
- r = kvm_tdp_mmu_map(vcpu, fault);
- } else {
- r = make_mmu_pages_available(vcpu);
- if (r)
- goto out_unlock;
- r = __direct_map(vcpu, fault);
- }
+ r = direct_map(vcpu, fault);
out_unlock:
- if (is_tdp_mmu_fault)
- read_unlock(&vcpu->kvm->mmu_lock);
- else
- write_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(fault->pfn);
return r;
}
@@ -4366,6 +4396,42 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
+#ifdef CONFIG_X86_64
+static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int r;
+
+ if (page_fault_handle_page_track(vcpu, fault))
+ return RET_PF_EMULATE;
+
+ r = fast_page_fault(vcpu, fault);
+ if (r != RET_PF_INVALID)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+ r = RET_PF_RETRY;
+ read_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = kvm_tdp_mmu_map(vcpu, fault);
+
+out_unlock:
+ read_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(fault->pfn);
+ return r;
+}
+#endif
+
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
/*
@@ -4383,13 +4449,19 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
- gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
+ gfn_t base = gfn_round_for_level(fault->gfn,
+ fault->max_level);
if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
break;
}
}
+#ifdef CONFIG_X86_64
+ if (tdp_mmu_enabled)
+ return kvm_tdp_mmu_page_fault(vcpu, fault);
+#endif
+
return direct_page_fault(vcpu, fault);
}
@@ -4494,10 +4566,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
struct kvm_mmu *mmu = vcpu->arch.mmu;
union kvm_mmu_page_role new_role = mmu->root_role;
- if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
- /* kvm_mmu_ensure_valid_pgd will set up a new root. */
+ /*
+ * Return immediately if no usable root was found, kvm_mmu_reload()
+ * will establish a valid root prior to the next VM-Enter.
+ */
+ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
return;
- }
/*
* It's possible that the cached previous root page is obsolete because
@@ -5719,6 +5793,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
tdp_root_level = tdp_forced_root_level;
max_tdp_level = tdp_max_root_level;
+#ifdef CONFIG_X86_64
+ tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
+#endif
/*
* max_huge_page_level reflects KVM's MMU capabilities irrespective
* of kernel support, e.g. KVM may be capable of using 1GB pages when
@@ -5966,7 +6043,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* write and in the same critical section as making the reload request,
* e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
*/
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_invalidate_all_roots(kvm);
/*
@@ -5991,7 +6068,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* Deferring the zap until the final reference to the root is put would
* lead to use-after-free.
*/
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_zap_invalidated_roots(kvm);
}
@@ -6017,9 +6094,11 @@ int kvm_mmu_init_vm(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- r = kvm_mmu_init_tdp_mmu(kvm);
- if (r < 0)
- return r;
+ if (tdp_mmu_enabled) {
+ r = kvm_mmu_init_tdp_mmu(kvm);
+ if (r < 0)
+ return r;
+ }
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
@@ -6049,7 +6128,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
kvm_page_track_unregister_notifier(kvm, node);
- kvm_mmu_uninit_tdp_mmu(kvm);
+ if (tdp_mmu_enabled)
+ kvm_mmu_uninit_tdp_mmu(kvm);
mmu_free_vm_memory_caches(kvm);
}
@@ -6103,7 +6183,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
gfn_end, true, flush);
@@ -6136,7 +6216,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
read_unlock(&kvm->mmu_lock);
@@ -6379,7 +6459,7 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
u64 start, u64 end,
int target_level)
{
- if (!is_tdp_mmu_enabled(kvm))
+ if (!tdp_mmu_enabled)
return;
if (kvm_memslots_have_rmaps(kvm))
@@ -6400,7 +6480,7 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
u64 start = memslot->base_gfn;
u64 end = start + memslot->npages;
- if (!is_tdp_mmu_enabled(kvm))
+ if (!tdp_mmu_enabled)
return;
if (kvm_memslots_have_rmaps(kvm)) {
@@ -6450,8 +6530,7 @@ restart:
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
- kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
else
need_tlb_flush = 1;
@@ -6483,7 +6562,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
read_unlock(&kvm->mmu_lock);
@@ -6518,7 +6597,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
read_unlock(&kvm->mmu_lock);
@@ -6553,7 +6632,7 @@ restart:
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_zap_all(kvm);
write_unlock(&kvm->mmu_lock);
@@ -6579,7 +6658,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
* zap all shadow pages.
*/
if (unlikely(gen == 0)) {
- kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
+ kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_zap_all_fast(kvm);
}
}
@@ -6684,7 +6763,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
new_val = 1;
else if (sysfs_streq(val, "auto"))
new_val = get_nx_auto_mode();
- else if (strtobool(val, &new_val) < 0)
+ else if (kstrtobool(val, &new_val) < 0)
return -EINVAL;
__set_nx_huge_pages(new_val);
@@ -6718,6 +6797,13 @@ void __init kvm_mmu_x86_module_init(void)
if (nx_huge_pages == -1)
__set_nx_huge_pages(get_nx_auto_mode());
+ /*
+ * Snapshot userspace's desire to enable the TDP MMU. Whether or not the
+ * TDP MMU is actually enabled is determined in kvm_configure_mmu()
+ * when the vendor module is loaded.
+ */
+ tdp_mmu_allowed = tdp_mmu_enabled;
+
kvm_mmu_spte_module_init();
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index dbaf6755c5a7..cc58631e2336 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -156,6 +156,11 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
}
+static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)
+{
+ return gfn & -KVM_PAGES_PER_HPAGE(level);
+}
+
int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
gfn_t gfn, bool can_unsync, bool prefetch);
@@ -164,8 +169,17 @@ void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn,
int min_level);
+
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
+
+/* Flush the given page (huge or not) of guest memory. */
+static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
+{
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_round_for_level(gfn, level),
+ KVM_PAGES_PER_HPAGE(level));
+}
+
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
extern int nx_huge_pages;
@@ -199,7 +213,7 @@ struct kvm_page_fault {
/*
* Maximum page size that can be created for this fault; input to
- * FNAME(fetch), __direct_map and kvm_tdp_mmu_map.
+ * FNAME(fetch), direct_map() and kvm_tdp_mmu_map().
*/
u8 max_level;
@@ -222,6 +236,7 @@ struct kvm_page_fault {
struct kvm_memory_slot *slot;
/* Outputs of kvm_faultin_pfn. */
+ unsigned long mmu_seq;
kvm_pfn_t pfn;
hva_t hva;
bool map_writable;
@@ -279,6 +294,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
};
int r;
+ if (vcpu->arch.mmu->root_role.direct) {
+ fault.gfn = fault.addr >> PAGE_SHIFT;
+ fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
+ }
+
/*
* Async #PF "faults", a.k.a. prefetch faults, are not faults from the
* guest perspective and have already been counted at the time of the
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 2e09d1b6249f..0a2ac438d647 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -10,6 +10,7 @@
* Author:
* Xiao Guangrong <guangrong.xiao@linux.intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/rculist.h>
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 0f6455072055..57f0b75c80f9 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -642,12 +642,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
goto out_gpte_changed;
- for (shadow_walk_init(&it, vcpu, fault->addr);
- shadow_walk_okay(&it) && it.level > gw->level;
- shadow_walk_next(&it)) {
+ for_each_shadow_entry(vcpu, fault->addr, it) {
gfn_t table_gfn;
clear_sp_write_flooding_count(it.sptep);
+ if (it.level == gw->level)
+ break;
table_gfn = gw->table_gfn[it.level - 2];
access = gw->pt_access[it.level - 2];
@@ -692,8 +692,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
trace_kvm_mmu_spte_requested(fault);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
- clear_sp_write_flooding_count(it.sptep);
-
/*
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
@@ -701,7 +699,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
if (it.level == fault->goal_level)
break;
@@ -791,7 +789,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
{
struct guest_walker walker;
int r;
- unsigned long mmu_seq;
bool is_self_change_mapping;
pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
@@ -838,14 +835,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
else
fault->max_level = walker.level;
- mmu_seq = vcpu->kvm->mmu_invalidate_seq;
- smp_rmb();
-
- r = kvm_faultin_pfn(vcpu, fault);
- if (r != RET_PF_CONTINUE)
- return r;
-
- r = handle_abnormal_pfn(vcpu, fault, walker.pte_access);
+ r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
if (r != RET_PF_CONTINUE)
return r;
@@ -871,7 +861,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
- if (is_page_fault_stale(vcpu, fault, mmu_seq))
+ if (is_page_fault_stale(vcpu, fault))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
@@ -937,8 +927,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
if (is_shadow_present_pte(old_spte))
- kvm_flush_remote_tlbs_with_address(vcpu->kvm,
- sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
if (!rmap_can_add(vcpu))
break;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index c0fd7e049b4e..c15bfca3ed15 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -7,7 +7,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2020 Red Hat, Inc. and/or its affiliates.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "mmu.h"
@@ -147,9 +147,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
WARN_ON_ONCE(!pte_access && !shadow_present_mask);
if (sp->role.ad_disabled)
- spte |= SPTE_TDP_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED;
else if (kvm_mmu_page_ad_need_write_protect(sp))
- spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
+ spte |= SPTE_TDP_AD_WRPROT_ONLY;
/*
* For the EPT case, shadow_present_mask is 0 if hardware
@@ -317,7 +317,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
shadow_user_mask | shadow_x_mask | shadow_me_value;
if (ad_disabled)
- spte |= SPTE_TDP_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED;
else
spte |= shadow_accessed_mask;
@@ -352,7 +352,7 @@ u64 mark_spte_for_access_track(u64 spte)
WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
- "kvm: Access Tracking saved bit locations are not zero\n");
+ "Access Tracking saved bit locations are not zero\n");
spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 6f54dc9409c9..1279db2eab44 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -28,10 +28,10 @@
*/
#define SPTE_TDP_AD_SHIFT 52
#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
-#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT)
-#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT)
-#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT)
-static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
+#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED == 0);
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
@@ -164,7 +164,7 @@ extern u64 __read_mostly shadow_me_value;
extern u64 __read_mostly shadow_me_mask;
/*
- * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED;
* shadow_acc_track_mask is the set of bits to be cleared in non-accessed
* pages.
*/
@@ -266,18 +266,18 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
MMU_WARN_ON(!is_shadow_present_pte(spte));
- return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
MMU_WARN_ON(!is_shadow_present_pte(spte));
/*
- * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+ * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
* and non-TDP SPTEs will never set these bits. Optimize for 64-bit
* TDP and do the A/D type check unconditionally.
*/
- return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
@@ -435,11 +435,11 @@ static inline void check_spte_writable_invariants(u64 spte)
{
if (spte & shadow_mmu_writable_mask)
WARN_ONCE(!(spte & shadow_host_writable_mask),
- "kvm: MMU-writable SPTE is not Host-writable: %llx",
+ KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx",
spte);
else
WARN_ONCE(is_writable_pte(spte),
- "kvm: Writable SPTE is not MMU-writable: %llx", spte);
+ KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte);
}
static inline bool is_mmu_writable_spte(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 39b48e7d7d1a..d2eb0d4f8710 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu_internal.h"
#include "tdp_iter.h"
@@ -15,11 +16,6 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
-static gfn_t round_gfn_for_level(gfn_t gfn, int level)
-{
- return gfn & -KVM_PAGES_PER_HPAGE(level);
-}
-
/*
* Return the TDP iterator to the root PT and allow it to continue its
* traversal over the paging structure from there.
@@ -30,7 +26,7 @@ void tdp_iter_restart(struct tdp_iter *iter)
iter->yielded_gfn = iter->next_last_level_gfn;
iter->level = iter->root_level;
- iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
iter->valid = true;
@@ -97,7 +93,7 @@ static bool try_step_down(struct tdp_iter *iter)
iter->level--;
iter->pt_path[iter->level - 1] = child_pt;
- iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
@@ -139,7 +135,7 @@ static bool try_step_up(struct tdp_iter *iter)
return false;
iter->level++;
- iter->gfn = round_gfn_for_level(iter->gfn, iter->level);
+ iter->gfn = gfn_round_for_level(iter->gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d6df38d371a0..7c25dbf32ecc 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu.h"
#include "mmu_internal.h"
@@ -10,23 +11,15 @@
#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
-static bool __read_mostly tdp_mmu_enabled = true;
-module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
-
/* Initializes the TDP MMU for the VM, if enabled. */
int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
struct workqueue_struct *wq;
- if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
- return 0;
-
wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
if (!wq)
return -ENOMEM;
- /* This should not be changed for the lifetime of the VM. */
- kvm->arch.tdp_mmu_enabled = true;
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
kvm->arch.tdp_mmu_zap_wq = wq;
@@ -47,9 +40,6 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
- if (!kvm->arch.tdp_mmu_enabled)
- return;
-
/* Also waits for any queued work items. */
destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
@@ -144,7 +134,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
- WARN_ON(!root->tdp_mmu_page);
+ WARN_ON(!is_tdp_mmu_page(root));
/*
* The root now has refcount=0. It is valid, but readers already
@@ -690,8 +680,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
if (ret)
return ret;
- kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
- KVM_PAGES_PER_HPAGE(iter->level));
+ kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
/*
* No other thread can overwrite the removed SPTE as they must either
@@ -1090,8 +1079,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
return RET_PF_RETRY;
else if (is_shadow_present_pte(iter->old_spte) &&
!is_last_spte(iter->old_spte, iter->level))
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(iter->level + 1));
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
/*
* If the page fault was caused by a write but the page is write
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index d3714200b932..0a63b1afabd3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,6 +7,9 @@
#include "spte.h"
+int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
@@ -68,31 +71,9 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
u64 *spte);
#ifdef CONFIG_X86_64
-int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
-void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
-
-static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
-{
- struct kvm_mmu_page *sp;
- hpa_t hpa = mmu->root.hpa;
-
- if (WARN_ON(!VALID_PAGE(hpa)))
- return false;
-
- /*
- * A NULL shadow page is legal when shadowing a non-paging guest with
- * PAE paging, as the MMU will be direct with root_hpa pointing at the
- * pae_root page, not a shadow page.
- */
- sp = to_shadow_page(hpa);
- return sp && is_tdp_mmu_page(sp) && sp->root_count;
-}
#else
-static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; }
-static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
-static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; }
#endif
#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index a8502e02f479..9fac1ec03463 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -13,6 +13,7 @@
* Paolo Bonzini <pbonzini@redhat.com>
* Xiao Guangrong <guangrong.xiao@linux.intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <asm/mtrr.h>
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index eb594620dd75..612e6c70ce2e 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -9,6 +9,7 @@
* Gleb Natapov <gleb@redhat.com>
* Wei Huang <wei@redhat.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/kvm_host.h>
@@ -28,9 +29,18 @@
struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);
-static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
+/* Precise Distribution of Instructions Retired (PDIR) */
+static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
+ /* Instruction-Accurate PDIR (PDIR++) */
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ {}
+};
+
+/* Precise Distribution (PDist) */
+static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
{}
};
@@ -155,6 +165,28 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
+static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
+{
+ /*
+ * For some model specific pebs counters with special capabilities
+ * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise
+ * level to the maximum value (currently 3, backwards compatible)
+ * so that the perf subsystem would assign specific hardware counter
+ * with that capability for vPMC.
+ */
+ if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
+ (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
+ return 3;
+
+ /*
+ * The non-zero precision level of guest event makes the ordinary
+ * guest event becomes a guest PEBS event and triggers the host
+ * PEBS PMI handler to determine whether the PEBS overflow PMI
+ * comes from the host counters or the guest.
+ */
+ return 1;
+}
+
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
bool exclude_user, bool exclude_kernel,
bool intr)
@@ -186,22 +218,12 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
}
if (pebs) {
/*
- * The non-zero precision level of guest event makes the ordinary
- * guest event becomes a guest PEBS event and triggers the host
- * PEBS PMI handler to determine whether the PEBS overflow PMI
- * comes from the host counters or the guest.
- *
* For most PEBS hardware events, the difference in the software
* precision levels of guest and host PEBS events will not affect
* the accuracy of the PEBS profiling result, because the "event IP"
* in the PEBS record is calibrated on the guest side.
- *
- * On Icelake everything is fine. Other hardware (GLC+, TNT+) that
- * could possibly care here is unsupported and needs changes.
*/
- attr.precise_ip = 1;
- if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32)
- attr.precise_ip = 3;
+ attr.precise_ip = pmc_get_pebs_precise_level(pmc);
}
event = perf_event_create_kernel_counter(&attr, -1, current,
@@ -254,48 +276,128 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return true;
}
-static int cmp_u64(const void *pa, const void *pb)
+static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
- u64 a = *(u64 *)pa;
- u64 b = *(u64 *)pb;
+ u64 a = *(u64 *)pa & mask;
+ u64 b = *(u64 *)pb & mask;
return (a > b) - (a < b);
}
+
+static int filter_sort_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE));
+}
+
+/*
+ * For the event filter, searching is done on the 'includes' list and
+ * 'excludes' list separately rather than on the 'events' list (which
+ * has both). As a result the exclude bit can be ignored.
+ */
+static int filter_event_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
+}
+
+static int find_filter_index(u64 *events, u64 nevents, u64 key)
+{
+ u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
+ filter_event_cmp);
+
+ if (!fe)
+ return -1;
+
+ return fe - events;
+}
+
+static bool is_filter_entry_match(u64 filter_event, u64 umask)
+{
+ u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
+ u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
+
+ BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
+ (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
+ ARCH_PERFMON_EVENTSEL_UMASK);
+
+ return (umask & mask) == match;
+}
+
+static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
+{
+ u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
+ u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
+ int i, index;
+
+ index = find_filter_index(events, nevents, event_select);
+ if (index < 0)
+ return false;
+
+ /*
+ * Entries are sorted by the event select. Walk the list in both
+ * directions to process all entries with the targeted event select.
+ */
+ for (i = index; i < nevents; i++) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ for (i = index - 1; i >= 0; i--) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
+ u64 eventsel)
+{
+ if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
+ !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
+ return f->action == KVM_PMU_EVENT_ALLOW;
+
+ return f->action == KVM_PMU_EVENT_DENY;
+}
+
+static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
+ int idx)
+{
+ int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
+
+ if (filter->action == KVM_PMU_EVENT_DENY &&
+ test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+ if (filter->action == KVM_PMU_EVENT_ALLOW &&
+ !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+
+ return true;
+}
+
static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
- struct kvm_pmu_event_filter *filter;
+ struct kvm_x86_pmu_event_filter *filter;
struct kvm *kvm = pmc->vcpu->kvm;
- bool allow_event = true;
- __u64 key;
- int idx;
if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
return false;
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (!filter)
- goto out;
+ return true;
- if (pmc_is_gp(pmc)) {
- key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB;
- if (bsearch(&key, filter->events, filter->nevents,
- sizeof(__u64), cmp_u64))
- allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
- else
- allow_event = filter->action == KVM_PMU_EVENT_DENY;
- } else {
- idx = pmc->idx - INTEL_PMC_IDX_FIXED;
- if (filter->action == KVM_PMU_EVENT_DENY &&
- test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
- allow_event = false;
- if (filter->action == KVM_PMU_EVENT_ALLOW &&
- !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
- allow_event = false;
- }
+ if (pmc_is_gp(pmc))
+ return is_gp_event_allowed(filter, pmc->eventsel);
-out:
- return allow_event;
+ return is_fixed_event_allowed(filter, pmc->idx);
}
static void reprogram_counter(struct kvm_pmc *pmc)
@@ -592,47 +694,133 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
+static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
+{
+ u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
+ KVM_PMU_MASKED_ENTRY_UMASK_MASK |
+ KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE;
+ int i;
+
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & ~mask)
+ return false;
+ }
+
+ return true;
+}
+
+static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < filter->nevents; i++) {
+ /*
+ * Skip events that are impossible to match against a guest
+ * event. When filtering, only the event select + unit mask
+ * of the guest event is used. To maintain backwards
+ * compatibility, impossible filters can't be rejected :-(
+ */
+ if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
+ ARCH_PERFMON_EVENTSEL_UMASK))
+ continue;
+ /*
+ * Convert userspace events to a common in-kernel event so
+ * only one code path is needed to support both events. For
+ * the in-kernel events use masked events because they are
+ * flexible enough to handle both cases. To convert to masked
+ * events all that's needed is to add an "all ones" umask_mask,
+ * (unmasked filter events don't support EXCLUDE).
+ */
+ filter->events[j++] = filter->events[i] |
+ (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
+ }
+
+ filter->nevents = j;
+}
+
+static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i;
+
+ if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
+ convert_to_masked_filter(filter);
+ else if (!is_masked_filter_valid(filter))
+ return -EINVAL;
+
+ /*
+ * Sort entries by event select and includes vs. excludes so that all
+ * entries for a given event select can be processed efficiently during
+ * filtering. The EXCLUDE flag uses a more significant bit than the
+ * event select, and so the sorted list is also effectively split into
+ * includes and excludes sub-lists.
+ */
+ sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
+ filter_sort_cmp, NULL);
+
+ i = filter->nevents;
+ /* Find the first EXCLUDE event (only supported for masked events). */
+ if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
+ break;
+ }
+ }
+
+ filter->nr_includes = i;
+ filter->nr_excludes = filter->nevents - filter->nr_includes;
+ filter->includes = filter->events;
+ filter->excludes = filter->events + filter->nr_includes;
+
+ return 0;
+}
+
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
- struct kvm_pmu_event_filter tmp, *filter;
+ struct kvm_pmu_event_filter __user *user_filter = argp;
+ struct kvm_x86_pmu_event_filter *filter;
+ struct kvm_pmu_event_filter tmp;
struct kvm_vcpu *vcpu;
unsigned long i;
size_t size;
int r;
- if (copy_from_user(&tmp, argp, sizeof(tmp)))
+ if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
return -EFAULT;
if (tmp.action != KVM_PMU_EVENT_ALLOW &&
tmp.action != KVM_PMU_EVENT_DENY)
return -EINVAL;
- if (tmp.flags != 0)
+ if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
return -EINVAL;
if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
return -E2BIG;
size = struct_size(filter, events, tmp.nevents);
- filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
+ filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
if (!filter)
return -ENOMEM;
+ filter->action = tmp.action;
+ filter->nevents = tmp.nevents;
+ filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
+ filter->flags = tmp.flags;
+
r = -EFAULT;
- if (copy_from_user(filter, argp, size))
+ if (copy_from_user(filter->events, user_filter->events,
+ sizeof(filter->events[0]) * filter->nevents))
goto cleanup;
- /* Ensure nevents can't be changed between the user copies. */
- *filter = tmp;
-
- /*
- * Sort the in-kernel list so that we can search it with bsearch.
- */
- sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+ r = prepare_filter_lists(filter);
+ if (r)
+ goto cleanup;
mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
synchronize_srcu_expedited(&kvm->srcu);
BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
@@ -643,8 +831,6 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
- mutex_unlock(&kvm->lock);
-
r = 0;
cleanup:
kfree(filter);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index cdb91009701d..be62c16f2265 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -18,12 +18,6 @@
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
-struct kvm_event_hw_type_mapping {
- u8 eventsel;
- u8 unit_mask;
- unsigned event_type;
-};
-
struct kvm_pmu_ops {
bool (*hw_event_available)(struct kvm_pmc *pmc);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
@@ -40,6 +34,9 @@ struct kvm_pmu_ops {
void (*reset)(struct kvm_vcpu *vcpu);
void (*deliver_pmi)(struct kvm_vcpu *vcpu);
void (*cleanup)(struct kvm_vcpu *vcpu);
+
+ const u64 EVENTSEL_EVENT;
+ const int MAX_NR_GP_COUNTERS;
};
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
@@ -161,25 +158,39 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
extern struct x86_pmu_capability kvm_pmu_cap;
-static inline void kvm_init_pmu_capability(void)
+static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
{
bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
- perf_get_x86_pmu_capability(&kvm_pmu_cap);
-
- /*
- * For Intel, only support guest architectural pmu
- * on a host with architectural pmu.
- */
- if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp)
+ /*
+ * Hybrid PMUs don't play nice with virtualization without careful
+ * configuration by userspace, and KVM's APIs for reporting supported
+ * vPMU features do not account for hybrid PMUs. Disable vPMU support
+ * for hybrid PMUs until KVM gains a way to let userspace opt-in.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
enable_pmu = false;
+ if (enable_pmu) {
+ perf_get_x86_pmu_capability(&kvm_pmu_cap);
+
+ /*
+ * For Intel, only support guest architectural pmu
+ * on a host with architectural pmu.
+ */
+ if ((is_intel && !kvm_pmu_cap.version) ||
+ !kvm_pmu_cap.num_counters_gp)
+ enable_pmu = false;
+ }
+
if (!enable_pmu) {
memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
return;
}
kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2);
+ kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
+ pmu_ops->MAX_NR_GP_COUNTERS);
kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
KVM_PMC_MAX_FIXED);
}
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 042d0aca3c92..a5717282bb9c 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -14,6 +14,7 @@
enum kvm_only_cpuid_leafs {
CPUID_12_EAX = NCAPINTS,
CPUID_7_1_EDX,
+ CPUID_8000_0007_EDX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -43,6 +44,9 @@ enum kvm_only_cpuid_leafs {
#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+/* CPUID level 0x80000007 (EDX). */
+#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
+
struct cpuid_reg {
u32 function;
u32 index;
@@ -68,6 +72,8 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
[CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
[CPUID_7_1_EDX] = { 7, 1, CPUID_EDX},
+ [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX},
+ [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
};
/*
@@ -100,6 +106,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
return KVM_X86_FEATURE_SGX2;
else if (x86_feature == X86_FEATURE_SGX_EDECCSSA)
return KVM_X86_FEATURE_SGX_EDECCSSA;
+ else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
+ return KVM_X86_FEATURE_CONSTANT_TSC;
return x86_feature;
}
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index a9c1c2af8d94..b42111a24cc2 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "x86.h"
@@ -110,8 +111,6 @@ static void check_smram_offsets(void)
void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
{
- BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
-
trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
if (entering_smm) {
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 6919dee69f18..ca684979e90d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -12,7 +12,7 @@
* Avi Kivity <avi@qumranet.com>
*/
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/hashtable.h>
@@ -53,7 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
-enum avic_modes avic_mode;
+bool x2avic_enabled;
/*
* This is a wrapper of struct amd_iommu_ir_data.
@@ -72,20 +72,25 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- /* Note:
- * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
- * MSR accesses, while interrupt injection to a running vCPU
- * can be achieved using AVIC doorbell. The AVIC hardware still
- * accelerate MMIO accesses, but this does not cause any harm
- * as the guest is not supposed to access xAPIC mmio when uses x2APIC.
+ /*
+ * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
+ * accesses, while interrupt injection to a running vCPU can be
+ * achieved using AVIC doorbell. KVM disables the APIC access page
+ * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
+ * AVIC in hybrid mode activates only the doorbell mechanism.
*/
- if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
- avic_mode == AVIC_MODE_X2) {
+ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
/* Disabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, false);
} else {
+ /*
+ * Flush the TLB, the guest may have inserted a non-APIC
+ * mapping into the TLB while AVIC was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
/* For xAVIC and hybrid-xAVIC modes */
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
/* Enabling MSR intercept for x2APIC registers */
@@ -241,8 +246,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
u64 *avic_physical_id_table;
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) ||
- (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID))
+ if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
+ (index > X2AVIC_MAX_PHYSICAL_ID))
return NULL;
avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
@@ -250,47 +255,14 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
return &avic_physical_id_table[index];
}
-/*
- * Note:
- * AVIC hardware walks the nested page table to check permissions,
- * but does not use the SPA address specified in the leaf page
- * table entry since it uses address in the AVIC_BACKING_PAGE pointer
- * field of the VMCB. Therefore, we set up the
- * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
- */
-static int avic_alloc_access_page(struct kvm *kvm)
-{
- void __user *ret;
- int r = 0;
-
- mutex_lock(&kvm->slots_lock);
-
- if (kvm->arch.apic_access_memslot_enabled)
- goto out;
-
- ret = __x86_set_memory_region(kvm,
- APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE,
- PAGE_SIZE);
- if (IS_ERR(ret)) {
- r = PTR_ERR(ret);
- goto out;
- }
-
- kvm->arch.apic_access_memslot_enabled = true;
-out:
- mutex_unlock(&kvm->slots_lock);
- return r;
-}
-
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
u64 *entry, new_entry;
int id = vcpu->vcpu_id;
struct vcpu_svm *svm = to_svm(vcpu);
- if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) ||
- (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID))
+ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
+ (id > X2AVIC_MAX_PHYSICAL_ID))
return -EINVAL;
if (!vcpu->arch.apic->regs)
@@ -299,7 +271,13 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
if (kvm_apicv_activated(vcpu->kvm)) {
int ret;
- ret = avic_alloc_access_page(vcpu->kvm);
+ /*
+ * Note, AVIC hardware walks the nested page table to check
+ * permissions, but does not use the SPA address specified in
+ * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
+ * pointer field of the VMCB.
+ */
+ ret = kvm_alloc_apic_access_page(vcpu->kvm);
if (ret)
return ret;
}
@@ -339,6 +317,60 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
put_cpu();
}
+
+static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
+{
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+}
+
+static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
+ u32 icrl)
+{
+ /*
+ * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID,
+ * i.e. APIC ID == vCPU ID.
+ */
+ struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
+
+ /* Once again, nothing to do if the target vCPU doesn't exist. */
+ if (unlikely(!target_vcpu))
+ return;
+
+ avic_kick_vcpu(target_vcpu, icrl);
+}
+
+static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
+ u32 logid_index, u32 icrl)
+{
+ u32 physical_id;
+
+ if (avic_logical_id_table) {
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ /* Nothing to do if the logical destination is invalid. */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return;
+
+ physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ } else {
+ /*
+ * For x2APIC, the logical APIC ID is a read-only value that is
+ * derived from the x2APIC ID, thus the x2APIC ID can be found
+ * by reversing the calculation (stored in logid_index). Note,
+ * bits 31:20 of the x2APIC ID aren't propagated to the logical
+ * ID, but KVM limits the x2APIC ID limited to KVM_MAX_VCPU_IDS.
+ */
+ physical_id = logid_index;
+ }
+
+ avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
+}
+
/*
* A fast-path version of avic_kick_target_vcpus(), which attempts to match
* destination APIC ID to vCPU without looping through all vCPUs.
@@ -346,11 +378,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
- u32 l1_physical_id, dest;
- struct kvm_vcpu *target_vcpu;
int dest_mode = icrl & APIC_DEST_MASK;
int shorthand = icrl & APIC_SHORT_MASK;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ u32 dest;
if (shorthand != APIC_DEST_NOSHORT)
return -EINVAL;
@@ -367,18 +398,18 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
return -EINVAL;
- l1_physical_id = dest;
-
- if (WARN_ON_ONCE(l1_physical_id != index))
+ if (WARN_ON_ONCE(dest != index))
return -EINVAL;
+ avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
} else {
- u32 bitmap, cluster;
- int logid_index;
+ u32 *avic_logical_id_table;
+ unsigned long bitmap, i;
+ u32 cluster;
if (apic_x2apic_mode(source)) {
/* 16 bit dest mask, 16 bit cluster id */
- bitmap = dest & 0xFFFF0000;
+ bitmap = dest & 0xFFFF;
cluster = (dest >> 16) << 4;
} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
/* 8 bit dest mask*/
@@ -390,67 +421,32 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
cluster = (dest >> 4) << 2;
}
+ /* Nothing to do if there are no destinations in the cluster. */
if (unlikely(!bitmap))
- /* guest bug: nobody to send the logical interrupt to */
return 0;
- if (!is_power_of_2(bitmap))
- /* multiple logical destinations, use slow path */
- return -EINVAL;
-
- logid_index = cluster + __ffs(bitmap);
-
- if (!apic_x2apic_mode(source)) {
- u32 *avic_logical_id_table =
- page_address(kvm_svm->avic_logical_id_table_page);
-
- u32 logid_entry = avic_logical_id_table[logid_index];
-
- if (WARN_ON_ONCE(index != logid_index))
- return -EINVAL;
-
- /* guest bug: non existing/reserved logical destination */
- if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
- return 0;
-
- l1_physical_id = logid_entry &
- AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
- } else {
- /*
- * For x2APIC logical mode, cannot leverage the index.
- * Instead, calculate physical ID from logical ID in ICRH.
- */
- int cluster = (icrh & 0xffff0000) >> 16;
- int apic = ffs(icrh & 0xffff) - 1;
-
- /*
- * If the x2APIC logical ID sub-field (i.e. icrh[15:0])
- * contains anything but a single bit, we cannot use the
- * fast path, because it is limited to a single vCPU.
- */
- if (apic < 0 || icrh != (1 << apic))
- return -EINVAL;
+ if (apic_x2apic_mode(source))
+ avic_logical_id_table = NULL;
+ else
+ avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
- l1_physical_id = (cluster << 4) + apic;
- }
+ /*
+ * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
+ * IDs, thus each bit in the destination is guaranteed to map
+ * to at most one vCPU.
+ */
+ for_each_set_bit(i, &bitmap, 16)
+ avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
+ cluster + i, icrl);
}
- target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
- if (unlikely(!target_vcpu))
- /* guest bug: non existing vCPU is a target of this IPI*/
- return 0;
-
- target_vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(target_vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
return 0;
}
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
+ u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
unsigned long i;
struct kvm_vcpu *vcpu;
@@ -466,21 +462,9 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
* since entered the guest will have processed pending IRQs at VMRUN.
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
- u32 dest;
-
- if (apic_x2apic_mode(vcpu->arch.apic))
- dest = icrh;
- else
- dest = GET_XAPIC_DEST_FIELD(icrh);
-
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
- dest, icrl & APIC_DEST_MASK)) {
- vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
- }
+ dest, icrl & APIC_DEST_MASK))
+ avic_kick_vcpu(vcpu, icrl);
}
}
@@ -496,14 +480,18 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
/*
* Emulate IPIs that are not handled by AVIC hardware, which
- * only virtualizes Fixed, Edge-Triggered INTRs. The exit is
- * a trap, e.g. ICR holds the correct value and RIP has been
- * advanced, KVM is responsible only for emulating the IPI.
- * Sadly, hardware may sometimes leave the BUSY flag set, in
- * which case KVM needs to emulate the ICR write as well in
+ * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
+ * if _any_ targets are invalid, e.g. if the logical mode mask
+ * is a superset of running vCPUs.
+ *
+ * The exit is a trap, e.g. ICR holds the correct value and RIP
+ * has been advanced, KVM is responsible only for emulating the
+ * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
+ * in which case KVM needs to emulate the ICR write as well in
* order to clear the BUSY flag.
*/
if (icrl & APIC_ICR_BUSY)
@@ -519,8 +507,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
*/
avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
break;
- case AVIC_IPI_FAILURE_INVALID_TARGET:
- break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
break;
@@ -541,33 +527,33 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- int index;
u32 *logical_apic_id_table;
- int dlid = GET_APIC_LOGICAL_ID(ldr);
-
- if (!dlid)
- return NULL;
+ u32 cluster, index;
- if (flat) { /* flat */
- index = ffs(dlid) - 1;
- if (index > 7)
- return NULL;
- } else { /* cluster */
- int cluster = (dlid & 0xf0) >> 4;
- int apic = ffs(dlid & 0x0f) - 1;
+ ldr = GET_APIC_LOGICAL_ID(ldr);
- if ((apic < 0) || (apic > 7) ||
- (cluster >= 0xf))
+ if (flat) {
+ cluster = 0;
+ } else {
+ cluster = (ldr >> 4);
+ if (cluster >= 0xf)
return NULL;
- index = (cluster << 2) + apic;
+ ldr &= 0xf;
}
+ if (!ldr || !is_power_of_2(ldr))
+ return NULL;
+
+ index = __ffs(ldr);
+ if (WARN_ON_ONCE(index > 7))
+ return NULL;
+ index += (cluster << 2);
logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
return &logical_apic_id_table[index];
}
-static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
bool flat;
u32 *entry, new_entry;
@@ -575,15 +561,13 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
entry = avic_get_logical_id_entry(vcpu, ldr, flat);
if (!entry)
- return -EINVAL;
+ return;
new_entry = READ_ONCE(*entry);
new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
WRITE_ONCE(*entry, new_entry);
-
- return 0;
}
static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
@@ -601,29 +585,23 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}
-static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
- int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
u32 id = kvm_xapic_id(vcpu->arch.apic);
/* AVIC does not support LDR update for x2APIC */
if (apic_x2apic_mode(vcpu->arch.apic))
- return 0;
+ return;
if (ldr == svm->ldr_reg)
- return 0;
+ return;
avic_invalidate_logical_id_entry(vcpu);
- if (ldr)
- ret = avic_ldr_write(vcpu, id, ldr);
-
- if (!ret)
- svm->ldr_reg = ldr;
-
- return ret;
+ svm->ldr_reg = ldr;
+ avic_ldr_write(vcpu, id, ldr);
}
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
@@ -645,12 +623,14 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
switch (offset) {
case APIC_LDR:
- if (avic_handle_ldr_update(vcpu))
- return 0;
+ avic_handle_ldr_update(vcpu);
break;
case APIC_DFR:
avic_handle_dfr_update(vcpu);
break;
+ case APIC_RRR:
+ /* Ignore writes to Read Remote Data, it's read-only. */
+ return 1;
default:
break;
}
@@ -739,18 +719,6 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
-{
- if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE)
- return;
-
- if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) {
- WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id);
- return;
- }
- avic_refresh_apicv_exec_ctrl(vcpu);
-}
-
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
int ret = 0;
@@ -995,23 +963,6 @@ out:
return ret;
}
-bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
-{
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_ABSENT) |
- BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_NESTED) |
- BIT(APICV_INHIBIT_REASON_IRQWIN) |
- BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_SEV) |
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
-
- return supported & BIT(reason);
-}
-
-
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
@@ -1064,6 +1015,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
@@ -1092,17 +1044,15 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
-
-void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb01.ptr;
- bool activated = kvm_vcpu_apicv_active(vcpu);
- if (!enable_apicv)
+ if (!lapic_in_kernel(vcpu) || !enable_apicv)
return;
- if (activated) {
+ if (kvm_vcpu_apicv_active(vcpu)) {
/**
* During AVIC temporary deactivation, guest could update
* APIC ID, DFR and LDR registers, which would not be trapped
@@ -1116,6 +1066,16 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
avic_deactivate_vmcb(svm);
}
vmcb_mark_dirty(vmcb, VMCB_AVIC);
+}
+
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ bool activated = kvm_vcpu_apicv_active(vcpu);
+
+ if (!enable_apicv)
+ return;
+
+ avic_refresh_virtual_apic_mode(vcpu);
if (activated)
avic_vcpu_load(vcpu, vcpu->cpu);
@@ -1160,37 +1120,37 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
* - Hypervisor can support both xAVIC and x2AVIC in the same guest.
* - The mode can be switched at run-time.
*/
-bool avic_hardware_setup(struct kvm_x86_ops *x86_ops)
+bool avic_hardware_setup(void)
{
if (!npt_enabled)
return false;
+ /* AVIC is a prerequisite for x2AVIC. */
+ if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
+ if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
+ pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
+ pr_warn(FW_BUG "Try enable AVIC using force_avic option");
+ }
+ return false;
+ }
+
if (boot_cpu_has(X86_FEATURE_AVIC)) {
- avic_mode = AVIC_MODE_X1;
pr_info("AVIC enabled\n");
} else if (force_avic) {
/*
* Some older systems does not advertise AVIC support.
* See Revision Guide for specific AMD processor for more detail.
*/
- avic_mode = AVIC_MODE_X1;
pr_warn("AVIC is not supported in CPUID but force enabled");
pr_warn("Your system might crash and burn");
}
/* AVIC is a prerequisite for x2AVIC. */
- if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
- if (avic_mode == AVIC_MODE_X1) {
- avic_mode = AVIC_MODE_X2;
- pr_info("x2AVIC enabled\n");
- } else {
- pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
- pr_warn(FW_BUG "Try enable AVIC using force_avic option");
- }
- }
+ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
+ if (x2avic_enabled)
+ pr_info("x2AVIC enabled\n");
- if (avic_mode != AVIC_MODE_NONE)
- amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
- return !!avic_mode;
+ return true;
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index add65dd59756..05d38944a6c0 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -12,7 +12,7 @@
* Avi Kivity <avi@qumranet.com>
*/
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
@@ -1008,7 +1008,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.next_rip = vmcb02->control.next_rip;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
- vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
@@ -1104,7 +1103,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
* to benefit from it right away.
*/
if (kvm_apicv_activated(vcpu->kvm))
- kvm_vcpu_update_apicv(vcpu);
+ __kvm_vcpu_update_apicv(vcpu);
return 0;
}
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 0e313fbae055..cc77a0681800 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -9,6 +9,8 @@
*
* Implementation is based on pmu_intel.c file
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
@@ -229,4 +231,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
.refresh = amd_pmu_refresh,
.init = amd_pmu_init,
.reset = amd_pmu_reset,
+ .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC,
};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 86d6897f4806..c25aeb550cd9 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -6,6 +6,7 @@
*
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
@@ -812,7 +813,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
if (!IS_ALIGNED(dst_paddr, 16) ||
!IS_ALIGNED(paddr, 16) ||
!IS_ALIGNED(size, 16)) {
- tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO);
+ tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!tpage)
return -ENOMEM;
@@ -1293,7 +1294,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* Check if we are crossing the page boundary */
offset = params.guest_uaddr & (PAGE_SIZE - 1);
- if ((params.guest_len + offset > PAGE_SIZE))
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
return -EINVAL;
/* Pin guest memory */
@@ -1473,7 +1474,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* Check if we are crossing the page boundary */
offset = params.guest_uaddr & (PAGE_SIZE - 1);
- if ((params.guest_len + offset > PAGE_SIZE))
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
return -EINVAL;
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9a194aa1a75a..252e7f37e4e2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1,4 +1,4 @@
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
@@ -519,21 +519,37 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu)
vcpu->arch.osvw.status |= 1;
}
-static int has_svm(void)
+static bool kvm_is_svm_supported(void)
{
+ int cpu = raw_smp_processor_id();
const char *msg;
+ u64 vm_cr;
if (!cpu_has_svm(&msg)) {
- printk(KERN_INFO "has_svm: %s\n", msg);
- return 0;
+ pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
+ return false;
}
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
pr_info("KVM is unsupported when running as an SEV guest\n");
- return 0;
+ return false;
}
- return 1;
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
+ pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static int svm_check_processor_compat(void)
+{
+ if (!kvm_is_svm_supported())
+ return -EIO;
+
+ return 0;
}
void __svm_write_tsc_multiplier(u64 multiplier)
@@ -572,10 +588,6 @@ static int svm_hardware_enable(void)
if (efer & EFER_SVME)
return -EBUSY;
- if (!has_svm()) {
- pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
- return -EINVAL;
- }
sd = per_cpu_ptr(&svm_data, me);
sd->asid_generation = 1;
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
@@ -813,7 +825,7 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
if (intercept == svm->x2avic_msrs_intercepted)
return;
- if (avic_mode != AVIC_MODE_X2 ||
+ if (!x2avic_enabled ||
!apic_x2apic_mode(svm->vcpu.arch.apic))
return;
@@ -1326,6 +1338,9 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.microcode_version = 0x01000065;
svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
+ svm->nmi_masked = false;
+ svm->awaiting_iret_completion = false;
+
if (sev_es_guest(vcpu->kvm))
sev_es_vcpu_reset(svm);
}
@@ -2076,7 +2091,7 @@ static void svm_handle_mce(struct kvm_vcpu *vcpu)
* Erratum 383 triggered. Guest state is corrupt so kill the
* guest.
*/
- pr_err("KVM: Guest triggered AMD Erratum 383\n");
+ pr_err("Guest triggered AMD Erratum 383\n");
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
@@ -2470,7 +2485,7 @@ static int iret_interception(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
++vcpu->stat.nmi_window_exits;
- vcpu->arch.hflags |= HF_IRET_MASK;
+ svm->awaiting_iret_completion = true;
if (!sev_es_guest(vcpu->kvm)) {
svm_clr_intercept(svm, INTERCEPT_IRET);
svm->nmi_iret_rip = kvm_rip_read(vcpu);
@@ -3003,8 +3018,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_DEBUGCTLMSR:
if (!lbrv) {
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
- __func__, data);
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
}
if (data & DEBUGCTL_RESERVED_BITS)
@@ -3033,7 +3047,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_CR:
return svm_set_vm_cr(vcpu, data);
case MSR_VM_IGNNE:
- vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
case MSR_AMD64_DE_CFG: {
struct kvm_msr_entry msr_entry;
@@ -3466,7 +3480,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
if (svm->nmi_l1_to_l2)
return;
- vcpu->arch.hflags |= HF_NMI_MASK;
+ svm->nmi_masked = true;
if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
@@ -3571,7 +3585,6 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- bool ret;
if (!gif_set(svm))
return true;
@@ -3579,10 +3592,8 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
return false;
- ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- (vcpu->arch.hflags & HF_NMI_MASK);
-
- return ret;
+ return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
+ svm->nmi_masked;
}
static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
@@ -3602,7 +3613,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
{
- return !!(vcpu->arch.hflags & HF_NMI_MASK);
+ return to_svm(vcpu)->nmi_masked;
}
static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3610,11 +3621,11 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
struct vcpu_svm *svm = to_svm(vcpu);
if (masked) {
- vcpu->arch.hflags |= HF_NMI_MASK;
+ svm->nmi_masked = true;
if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
} else {
- vcpu->arch.hflags &= ~HF_NMI_MASK;
+ svm->nmi_masked = false;
if (!sev_es_guest(vcpu->kvm))
svm_clr_intercept(svm, INTERCEPT_IRET);
}
@@ -3700,7 +3711,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
+ if (svm->nmi_masked && !svm->awaiting_iret_completion)
return; /* IRET will cause a vm exit */
if (!gif_set(svm)) {
@@ -3824,10 +3835,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
* If we've made progress since setting HF_IRET_MASK, we've
* executed an IRET and can allow NMI injection.
*/
- if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+ if (svm->awaiting_iret_completion &&
(sev_es_guest(vcpu->kvm) ||
kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
- vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ svm->awaiting_iret_completion = false;
+ svm->nmi_masked = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -4076,17 +4088,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
-static int is_disabled(void)
-{
- u64 vm_cr;
-
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
- return 1;
-
- return 0;
-}
-
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
@@ -4098,11 +4099,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xd9;
}
-static int __init svm_check_processor_compat(void)
-{
- return 0;
-}
-
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
@@ -4629,7 +4625,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
smap = cr4 & X86_CR4_SMAP;
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
- pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
+ pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
/*
* If the fault occurred in userspace, arbitrarily inject #GP
@@ -4701,7 +4697,9 @@ static int svm_vm_init(struct kvm *kvm)
}
static struct kvm_x86_ops svm_x86_ops __initdata = {
- .name = "kvm_amd",
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = svm_check_processor_compat,
.hardware_unsetup = svm_hardware_unsetup,
.hardware_enable = svm_hardware_enable,
@@ -4771,10 +4769,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
- .set_virtual_apic_mode = avic_set_virtual_apic_mode,
+ .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
- .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
.apicv_post_state_restore = avic_apicv_post_state_restore,
+ .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
.get_exit_info = svm_get_exit_info,
@@ -4969,6 +4967,9 @@ static __init int svm_hardware_setup(void)
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+ if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
+ kvm_enable_efer_bits(EFER_AUTOIBRS);
+
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
pause_filter_count = 0;
@@ -4978,7 +4979,7 @@ static __init int svm_hardware_setup(void)
}
if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ pr_info("Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
@@ -4996,7 +4997,7 @@ static __init int svm_hardware_setup(void)
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
- pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+ pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Setup shadow_me_value and shadow_me_mask */
kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
@@ -5022,12 +5023,14 @@ static __init int svm_hardware_setup(void)
nrips = false;
}
- enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
+ enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
+ } else if (!x2avic_enabled) {
+ svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
}
if (vls) {
@@ -5086,10 +5089,7 @@ err:
static struct kvm_x86_init_ops svm_init_ops __initdata = {
- .cpu_has_kvm_support = has_svm,
- .disabled_by_bios = is_disabled,
.hardware_setup = svm_hardware_setup,
- .check_processor_compatibility = svm_check_processor_compat,
.runtime_ops = &svm_x86_ops,
.pmu_ops = &amd_pmu_ops,
@@ -5097,15 +5097,37 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static int __init svm_init(void)
{
+ int r;
+
__unused_size_checks();
- return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
- __alignof__(struct vcpu_svm), THIS_MODULE);
+ if (!kvm_is_svm_supported())
+ return -EOPNOTSUPP;
+
+ r = kvm_x86_vendor_init(&svm_init_ops);
+ if (r)
+ return r;
+
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ kvm_x86_vendor_exit();
+ return r;
}
static void __exit svm_exit(void)
{
kvm_exit();
+ kvm_x86_vendor_exit();
}
module_init(svm_init)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 4826e6cc611b..839809972da1 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -35,14 +35,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern int vgif;
extern bool intercept_smi;
-
-enum avic_modes {
- AVIC_MODE_NONE = 0,
- AVIC_MODE_X1,
- AVIC_MODE_X2,
-};
-
-extern enum avic_modes avic_mode;
+extern bool x2avic_enabled;
/*
* Clean bits in VMCB.
@@ -237,8 +230,26 @@ struct vcpu_svm {
struct svm_nested_state nested;
+ /* NMI mask value, used when vNMI is not enabled */
+ bool nmi_masked;
+
+ /*
+ * True when NMIs are still masked but guest IRET was just intercepted
+ * and KVM is waiting for RIP to change, which will signal that the
+ * intercepted IRET was retired and thus NMI can be unmasked.
+ */
+ bool awaiting_iret_completion;
+
+ /*
+ * Set when KVM is awaiting IRET completion and needs to inject NMIs as
+ * soon as the IRET completes (e.g. NMI is pending injection). KVM
+ * temporarily steals RFLAGS.TF to single-step the guest in this case
+ * in order to regain control as soon as the NMI-blocking condition
+ * goes away.
+ */
bool nmi_singlestep;
u64 nmi_singlestep_guest_rflags;
+
bool nmi_l1_to_l2;
unsigned long soft_int_csbase;
@@ -280,6 +291,9 @@ struct vcpu_svm {
bool guest_state_loaded;
bool x2avic_msrs_intercepted;
+
+ /* Guest GIF value, used when vGIF is not enabled */
+ bool guest_gif;
};
struct svm_cpu_data {
@@ -497,7 +511,7 @@ static inline void enable_gif(struct vcpu_svm *svm)
if (vmcb)
vmcb->control.int_ctl |= V_GIF_MASK;
else
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
+ svm->guest_gif = true;
}
static inline void disable_gif(struct vcpu_svm *svm)
@@ -507,7 +521,7 @@ static inline void disable_gif(struct vcpu_svm *svm)
if (vmcb)
vmcb->control.int_ctl &= ~V_GIF_MASK;
else
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+ svm->guest_gif = false;
}
static inline bool gif_set(struct vcpu_svm *svm)
@@ -517,7 +531,7 @@ static inline bool gif_set(struct vcpu_svm *svm)
if (vmcb)
return !!(vmcb->control.int_ctl & V_GIF_MASK);
else
- return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+ return svm->guest_gif;
}
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
@@ -628,8 +642,23 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
-
-bool avic_hardware_setup(struct kvm_x86_ops *ops);
+#define AVIC_REQUIRED_APICV_INHIBITS \
+( \
+ BIT(APICV_INHIBIT_REASON_DISABLE) | \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_NESTED) | \
+ BIT(APICV_INHIBIT_REASON_IRQWIN) | \
+ BIT(APICV_INHIBIT_REASON_PIT_REINJ) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_SEV) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \
+)
+
+bool avic_hardware_setup(void);
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@@ -641,14 +670,13 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
-void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
/* sev.c */
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
index 26a89d0da93e..7af8422d3382 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.c
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -2,6 +2,7 @@
/*
* KVM L1 hypervisor optimizations on Hyper-V for SVM.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index 45faf84476ce..cff838f15db5 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -30,11 +30,11 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
hve->hv_enlightenments_control.msr_bitmap = 1;
}
-static inline void svm_hv_hardware_setup(void)
+static inline __init void svm_hv_hardware_setup(void)
{
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
- pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n");
+ pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
svm_x86_ops.tlb_remote_flush_with_range =
hv_remote_flush_tlb_with_range;
@@ -43,7 +43,7 @@ static inline void svm_hv_hardware_setup(void)
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
int cpu;
- pr_info("kvm: Hyper-V Direct TLB Flush enabled\n");
+ pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n");
for_each_online_cpu(cpu) {
struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(cpu);
@@ -84,7 +84,7 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
}
-static inline void svm_hv_hardware_setup(void)
+static inline __init void svm_hv_hardware_setup(void)
{
}
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index cd2ac9536c99..45162c1bcd8f 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -66,13 +66,13 @@ struct vmcs_config {
u64 misc;
struct nested_vmx_msrs nested;
};
-extern struct vmcs_config vmcs_config;
+extern struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability {
u32 ept;
u32 vpid;
};
-extern struct vmx_capability vmx_capability;
+extern struct vmx_capability vmx_capability __ro_after_init;
static inline bool cpu_has_vmx_basic_inout(void)
{
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index ae03d1fe0355..22daca752797 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/errno.h>
#include <linux/smp.h>
@@ -361,35 +362,43 @@ enum evmcs_revision {
enum evmcs_ctrl_type {
EVMCS_EXIT_CTRLS,
EVMCS_ENTRY_CTRLS,
+ EVMCS_EXEC_CTRL,
EVMCS_2NDEXEC,
+ EVMCS_3RDEXEC,
EVMCS_PINCTRL,
EVMCS_VMFUNC,
NR_EVMCS_CTRLS,
};
-static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = {
+static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = {
[EVMCS_EXIT_CTRLS] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMEXIT_CTRL,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMEXIT_CTRL,
},
[EVMCS_ENTRY_CTRLS] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMENTRY_CTRL,
+ },
+ [EVMCS_EXEC_CTRL] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_EXEC_CTRL,
},
[EVMCS_2NDEXEC] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING,
+ },
+ [EVMCS_3RDEXEC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_3RDEXEC,
},
[EVMCS_PINCTRL] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_PINCTRL,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL,
},
[EVMCS_VMFUNC] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMFUNC,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMFUNC,
},
};
-static u32 evmcs_get_unsupported_ctls(enum evmcs_ctrl_type ctrl_type)
+static u32 evmcs_get_supported_ctls(enum evmcs_ctrl_type ctrl_type)
{
enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY;
- return evmcs_unsupported_ctrls[ctrl_type][evmcs_rev];
+ return evmcs_supported_ctrls[ctrl_type][evmcs_rev];
}
static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu)
@@ -413,7 +422,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
{
u32 ctl_low = (u32)*pdata;
u32 ctl_high = (u32)(*pdata >> 32);
- u32 unsupported_ctrls;
+ u32 supported_ctrls;
/*
* Hyper-V 2016 and 2019 try using these features even when eVMCS
@@ -422,27 +431,31 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
switch (msr_index) {
case MSR_IA32_VMX_EXIT_CTLS:
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_EXIT_CTRLS);
+ supported_ctrls = evmcs_get_supported_ctls(EVMCS_EXIT_CTRLS);
if (!evmcs_has_perf_global_ctrl(vcpu))
- unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
- ctl_high &= ~unsupported_ctrls;
+ supported_ctrls &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= supported_ctrls;
break;
case MSR_IA32_VMX_ENTRY_CTLS:
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_ENTRY_CTRLS);
+ supported_ctrls = evmcs_get_supported_ctls(EVMCS_ENTRY_CTRLS);
if (!evmcs_has_perf_global_ctrl(vcpu))
- unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
- ctl_high &= ~unsupported_ctrls;
+ supported_ctrls &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= supported_ctrls;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_EXEC_CTRL);
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC);
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_2NDEXEC);
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
- ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_PINCTRL);
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_PINCTRL);
break;
case MSR_IA32_VMX_VMFUNC:
- ctl_low &= ~evmcs_get_unsupported_ctls(EVMCS_VMFUNC);
+ ctl_low &= evmcs_get_supported_ctls(EVMCS_VMFUNC);
break;
}
@@ -452,7 +465,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type,
u32 val)
{
- return !(val & evmcs_get_unsupported_ctls(ctrl_type));
+ return !(val & ~evmcs_get_supported_ctls(ctrl_type));
}
int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
@@ -461,6 +474,10 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
vmcs12->pin_based_vm_exec_control)))
return -EINVAL;
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXEC_CTRL,
+ vmcs12->cpu_based_vm_exec_control)))
+ return -EINVAL;
+
if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC,
vmcs12->secondary_vm_exec_control)))
return -EINVAL;
@@ -488,6 +505,38 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
return 0;
}
+#if IS_ENABLED(CONFIG_HYPERV)
+/*
+ * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption
+ * is: in case a feature has corresponding fields in eVMCS described and it was
+ * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a
+ * feature which has no corresponding eVMCS field, this likely means that KVM
+ * needs to be updated.
+ */
+#define evmcs_check_vmcs_conf(field, ctrl) \
+ do { \
+ typeof(vmcs_conf->field) unsupported; \
+ \
+ unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \
+ if (unsupported) { \
+ pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\
+ (u64)unsupported); \
+ vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \
+ } \
+ } \
+ while (0)
+
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+{
+ evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL);
+ evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL);
+ evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC);
+ evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC);
+ evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL);
+ evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL);
+}
+#endif
+
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index 571e7929d14e..78d17667e7ec 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -48,22 +48,84 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
* Currently unsupported in KVM:
* GUEST_IA32_RTIT_CTL = 0x00002814,
*/
-#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
- PIN_BASED_VMX_PREEMPTION_TIMER)
-#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
-#define EVMCS1_UNSUPPORTED_2NDEXEC \
- (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
- SECONDARY_EXEC_APIC_REGISTER_VIRT | \
- SECONDARY_EXEC_ENABLE_PML | \
- SECONDARY_EXEC_ENABLE_VMFUNC | \
- SECONDARY_EXEC_SHADOW_VMCS | \
+#define EVMCS1_SUPPORTED_PINCTRL \
+ (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING | \
+ PIN_BASED_VIRTUAL_NMIS)
+
+#define EVMCS1_SUPPORTED_EXEC_CTRL \
+ (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ CPU_BASED_HLT_EXITING | \
+ CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING | \
+ CPU_BASED_UNCOND_IO_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_USE_TSC_OFFSETTING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MONITOR_EXITING | \
+ CPU_BASED_INVLPG_EXITING | \
+ CPU_BASED_RDPMC_EXITING | \
+ CPU_BASED_INTR_WINDOW_EXITING | \
+ CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING | \
+ CPU_BASED_RDTSC_EXITING | \
+ CPU_BASED_TPR_SHADOW | \
+ CPU_BASED_USE_IO_BITMAPS | \
+ CPU_BASED_MONITOR_TRAP_FLAG | \
+ CPU_BASED_USE_MSR_BITMAPS | \
+ CPU_BASED_NMI_WINDOW_EXITING | \
+ CPU_BASED_PAUSE_EXITING | \
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+
+#define EVMCS1_SUPPORTED_2NDEXEC \
+ (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
+ SECONDARY_EXEC_WBINVD_EXITING | \
+ SECONDARY_EXEC_ENABLE_VPID | \
+ SECONDARY_EXEC_ENABLE_EPT | \
+ SECONDARY_EXEC_UNRESTRICTED_GUEST | \
+ SECONDARY_EXEC_DESC | \
+ SECONDARY_EXEC_ENABLE_RDTSCP | \
+ SECONDARY_EXEC_ENABLE_INVPCID | \
+ SECONDARY_EXEC_XSAVES | \
+ SECONDARY_EXEC_RDSEED_EXITING | \
+ SECONDARY_EXEC_RDRAND_EXITING | \
SECONDARY_EXEC_TSC_SCALING | \
- SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
- (VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
-#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0)
-#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
+ SECONDARY_EXEC_PT_USE_GPA | \
+ SECONDARY_EXEC_PT_CONCEAL_VMX | \
+ SECONDARY_EXEC_BUS_LOCK_DETECTION | \
+ SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_ENCLS_EXITING)
+
+#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
+
+#define EVMCS1_SUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_EXIT_SAVE_DEBUG_CONTROLS | \
+ VM_EXIT_ACK_INTR_ON_EXIT | \
+ VM_EXIT_HOST_ADDR_SPACE_SIZE | \
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_IA32_PAT | \
+ VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_EFER | \
+ VM_EXIT_LOAD_IA32_EFER | \
+ VM_EXIT_CLEAR_BNDCFGS | \
+ VM_EXIT_PT_CONCEAL_PIP | \
+ VM_EXIT_CLEAR_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMENTRY_CTRL \
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_IA32E_MODE | \
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_LOAD_IA32_EFER | \
+ VM_ENTRY_LOAD_BNDCFGS | \
+ VM_ENTRY_PT_CONCEAL_PIP | \
+ VM_ENTRY_LOAD_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMFUNC (0)
struct evmcs_field {
u16 offset;
@@ -117,9 +179,7 @@ static __always_inline int get_evmcs_offset(unsigned long field,
{
int offset = evmcs_field_offset(field, clean_field);
- WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
- field);
-
+ WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field);
return offset;
}
@@ -136,7 +196,7 @@ static __always_inline void evmcs_write64(unsigned long field, u64 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}
-static inline void evmcs_write32(unsigned long field, u32 value)
+static __always_inline void evmcs_write32(unsigned long field, u32 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -148,7 +208,7 @@ static inline void evmcs_write32(unsigned long field, u32 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}
-static inline void evmcs_write16(unsigned long field, u16 value)
+static __always_inline void evmcs_write16(unsigned long field, u16 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -160,7 +220,7 @@ static inline void evmcs_write16(unsigned long field, u16 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}
-static inline u64 evmcs_read64(unsigned long field)
+static __always_inline u64 evmcs_read64(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);
@@ -170,7 +230,7 @@ static inline u64 evmcs_read64(unsigned long field)
return *(u64 *)((char *)current_evmcs + offset);
}
-static inline u32 evmcs_read32(unsigned long field)
+static __always_inline u32 evmcs_read32(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);
@@ -180,7 +240,7 @@ static inline u32 evmcs_read32(unsigned long field)
return *(u32 *)((char *)current_evmcs + offset);
}
-static inline u16 evmcs_read16(unsigned long field)
+static __always_inline u16 evmcs_read16(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);
@@ -190,16 +250,6 @@ static inline u16 evmcs_read16(unsigned long field)
return *(u16 *)((char *)current_evmcs + offset);
}
-static inline void evmcs_touch_msr_bitmap(void)
-{
- if (unlikely(!current_evmcs))
- return;
-
- if (current_evmcs->hv_enlightenments_control.msr_bitmap)
- current_evmcs->hv_clean_fields &=
- ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
-}
-
static inline void evmcs_load(u64 phys_addr)
{
struct hv_vp_assist_page *vp_ap =
@@ -211,15 +261,15 @@ static inline void evmcs_load(u64 phys_addr)
vp_ap->enlighten_vmentry = 1;
}
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
-static inline void evmcs_write32(unsigned long field, u32 value) {}
-static inline void evmcs_write16(unsigned long field, u16 value) {}
-static inline u64 evmcs_read64(unsigned long field) { return 0; }
-static inline u32 evmcs_read32(unsigned long field) { return 0; }
-static inline u16 evmcs_read16(unsigned long field) { return 0; }
+static __always_inline void evmcs_write32(unsigned long field, u32 value) {}
+static __always_inline void evmcs_write16(unsigned long field, u16 value) {}
+static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
+static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
+static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
#define EVMPTR_INVALID (-1ULL)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d93c715cda6a..7c4f5ca405c7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/objtool.h>
#include <linux/percpu.h>
@@ -203,7 +204,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
/* TODO: not to reset guest simply here. */
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
+ pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -5863,11 +5864,10 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
u32 function = kvm_rax_read(vcpu);
/*
- * VMFUNC is only supported for nested guests, but we always enable the
- * secondary control for simplicity; for non-nested mode, fake that we
- * didn't by injecting #UD.
+ * VMFUNC should never execute cleanly while L1 is active; KVM supports
+ * VMFUNC for nested VMs, but not for L1.
*/
- if (!is_guest_mode(vcpu)) {
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6880,6 +6880,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_ENABLE_VMFUNC |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_TSC_SCALING |
@@ -6912,18 +6913,13 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_ENABLE_PML;
msrs->ept_caps |= VMX_EPT_AD_BIT;
}
- }
- if (cpu_has_vmx_vmfunc()) {
- msrs->secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_VMFUNC;
/*
- * Advertise EPTP switching unconditionally
- * since we emulate it
+ * Advertise EPTP switching irrespective of hardware support,
+ * KVM emulates it in software so long as VMFUNC is supported.
*/
- if (enable_ept)
- msrs->vmfunc_controls =
- VMX_VMFUNC_EPTP_SWITCHING;
+ if (cpu_has_vmx_vmfunc())
+ msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
}
/*
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index e5cec07ca8d9..e8a3be0b9df9 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -8,6 +8,8 @@
* Avi Kivity <avi@redhat.com>
* Gleb Natapov <gleb@redhat.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
@@ -20,16 +22,19 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
-static struct kvm_event_hw_type_mapping intel_arch_events[] = {
- [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
- [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
- [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
- [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
- [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+static struct {
+ u8 eventsel;
+ u8 unit_mask;
+} const intel_arch_events[] = {
+ [0] = { 0x3c, 0x00 },
+ [1] = { 0xc0, 0x00 },
+ [2] = { 0x3c, 0x01 },
+ [3] = { 0x2e, 0x4f },
+ [4] = { 0x2e, 0x41 },
+ [5] = { 0xc4, 0x00 },
+ [6] = { 0xc5, 0x00 },
/* The above index must match CPUID 0x0A.EBX bit vector */
- [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
+ [7] = { 0x00, 0x03 },
};
/* mapping between fixed pmc index and intel_arch_events array */
@@ -762,8 +767,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
return;
warn:
- pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n",
- vcpu->vcpu_id);
+ pr_warn_ratelimited("vcpu-%d: fail to passthrough LBR.\n", vcpu->vcpu_id);
}
static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
@@ -810,4 +814,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
.reset = intel_pmu_reset,
.deliver_pmi = intel_pmu_deliver_pmi,
.cleanup = intel_pmu_cleanup,
+ .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
};
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 1b56c5e5c9fb..94c38bea60e7 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kvm_host.h>
#include <asm/irq_remapping.h>
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index b12da2a6dec9..aa53c98034bf 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2021 Intel Corporation. */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <asm/sgx.h>
@@ -164,7 +165,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
(attributes & SGX_ATTR_PROVISIONKEY)) {
if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
- pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+ pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n");
kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -381,7 +382,7 @@ int handle_encls(struct kvm_vcpu *vcpu)
return handle_encls_ecreate(vcpu);
if (leaf == EINIT)
return handle_encls_einit(vcpu);
- WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+ WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf);
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
return 0;
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index ac290a44a693..7c1996b433e2 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -75,7 +75,7 @@ struct loaded_vmcs {
struct vmcs_controls_shadow controls_shadow;
};
-static inline bool is_intr_type(u32 intr_info, u32 type)
+static __always_inline bool is_intr_type(u32 intr_info, u32 type)
{
const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK;
@@ -146,7 +146,7 @@ static inline bool is_icebp(u32 intr_info)
return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION);
}
-static inline bool is_nmi(u32 intr_info)
+static __always_inline bool is_nmi(u32 intr_info)
{
return is_intr_type(intr_info, INTR_TYPE_NMI_INTR);
}
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 2251b60920f8..106a72c923ca 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "vmcs12.h"
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 766c6b3ef5ed..f550540ed54e 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -31,6 +31,39 @@
#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
#endif
+.macro VMX_DO_EVENT_IRQOFF call_insn call_target
+ /*
+ * Unconditionally create a stack frame, getting the correct RSP on the
+ * stack (for x86-64) would take two instructions anyways, and RBP can
+ * be used to restore RSP to make objtool happy (see below).
+ */
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+ /*
+ * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+ * creating the synthetic interrupt stack frame for the IRQ/NMI.
+ */
+ and $-16, %rsp
+ push $__KERNEL_DS
+ push %rbp
+#endif
+ pushf
+ push $__KERNEL_CS
+ \call_insn \call_target
+
+ /*
+ * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+ * the correct value. objtool doesn't know the callee will IRET and,
+ * without the explicit restore, thinks the stack is getting walloped.
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+ */
+ mov %_ASM_BP, %_ASM_SP
+ pop %_ASM_BP
+ RET
+.endm
+
.section .noinstr.text, "ax"
/**
@@ -69,8 +102,8 @@ SYM_FUNC_START(__vmx_vcpu_run)
*/
push %_ASM_ARG2
- /* Copy @flags to BL, _ASM_ARG3 is volatile. */
- mov %_ASM_ARG3B, %bl
+ /* Copy @flags to EBX, _ASM_ARG3 is volatile. */
+ mov %_ASM_ARG3L, %ebx
lea (%_ASM_SP), %_ASM_ARG2
call vmx_update_host_rsp
@@ -106,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
mov (%_ASM_SP), %_ASM_AX
/* Check if vmlaunch or vmresume is needed */
- testb $VMX_RUN_VMRESUME, %bl
+ test $VMX_RUN_VMRESUME, %ebx
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
@@ -128,7 +161,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
- /* Check EFLAGS.ZF from 'testb' above */
+ /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */
jz .Lvmlaunch
/*
@@ -266,6 +299,10 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
SYM_FUNC_END(__vmx_vcpu_run)
+SYM_FUNC_START(vmx_do_nmi_irqoff)
+ VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
+SYM_FUNC_END(vmx_do_nmi_irqoff)
+
.section .text, "ax"
@@ -320,35 +357,6 @@ SYM_FUNC_START(vmread_error_trampoline)
SYM_FUNC_END(vmread_error_trampoline)
#endif
-SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
- /*
- * Unconditionally create a stack frame, getting the correct RSP on the
- * stack (for x86-64) would take two instructions anyways, and RBP can
- * be used to restore RSP to make objtool happy (see below).
- */
- push %_ASM_BP
- mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
- /*
- * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
- * creating the synthetic interrupt stack frame for the IRQ/NMI.
- */
- and $-16, %rsp
- push $__KERNEL_DS
- push %rbp
-#endif
- pushf
- push $__KERNEL_CS
- CALL_NOSPEC _ASM_ARG1
-
- /*
- * "Restore" RSP from RBP, even though IRET has already unwound RSP to
- * the correct value. objtool doesn't know the callee will IRET and,
- * without the explicit restore, thinks the stack is getting walloped.
- * Using an unwind hint is problematic due to x86-64's dynamic alignment.
- */
- mov %_ASM_BP, %_ASM_SP
- pop %_ASM_BP
- RET
-SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
+SYM_FUNC_START(vmx_do_interrupt_irqoff)
+ VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(vmx_do_interrupt_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fc9008dbed33..bcac3efcde41 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -12,6 +12,7 @@
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/highmem.h>
#include <linux/hrtimer.h>
@@ -444,36 +445,36 @@ void vmread_error(unsigned long field, bool fault)
if (fault)
kvm_spurious_fault();
else
- vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
}
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
- vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n",
+ vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n",
+ vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n",
+ vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
- vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
ext, vpid, gva);
}
noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
- vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
ext, eptp, gpa);
}
@@ -488,8 +489,8 @@ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
-struct vmcs_config vmcs_config;
-struct vmx_capability vmx_capability;
+struct vmcs_config vmcs_config __ro_after_init;
+struct vmx_capability vmx_capability __ro_after_init;
#define VMX_SEGMENT_FIELD(seg) \
[VCPU_SREG_##seg] = { \
@@ -523,6 +524,8 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
+static struct kvm_x86_ops vmx_x86_ops __initdata;
+
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -551,6 +554,71 @@ static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
return 0;
}
+static __init void hv_init_evmcs(void)
+{
+ int cpu;
+
+ if (!enlightened_vmcs)
+ return;
+
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above.
+ */
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("Using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&enable_evmcs);
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vmx_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
+
+ } else {
+ enlightened_vmcs = false;
+ }
+}
+
+static void hv_reset_evmcs(void)
+{
+ struct hv_vp_assist_page *vp_ap;
+
+ if (!static_branch_unlikely(&enable_evmcs))
+ return;
+
+ /*
+ * KVM should enable eVMCS if and only if all CPUs have a VP assist
+ * page, and should reject CPU onlining if eVMCS is enabled the CPU
+ * doesn't have a VP assist page allocated.
+ */
+ vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ if (WARN_ON_ONCE(!vp_ap))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+}
+
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_init_evmcs(void) {}
+static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
/*
@@ -1613,8 +1681,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (!instr_len)
goto rip_updated;
- WARN(exit_reason.enclave_mode,
- "KVM: skipping instruction after SGX enclave VM-Exit");
+ WARN_ONCE(exit_reason.enclave_mode,
+ "skipping instruction after SGX enclave VM-Exit");
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + instr_len;
@@ -2138,9 +2206,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
}
@@ -2448,88 +2514,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
}
}
-static __init int cpu_has_kvm_support(void)
-{
- return cpu_has_vmx();
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
- return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !boot_cpu_has(X86_FEATURE_VMX);
-}
-
-static int kvm_cpu_vmxon(u64 vmxon_pointer)
-{
- u64 msr;
-
- cr4_set_bits(X86_CR4_VMXE);
-
- asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- : : [vmxon_pointer] "m"(vmxon_pointer)
- : : fault);
- return 0;
-
-fault:
- WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
- rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- cr4_clear_bits(X86_CR4_VMXE);
-
- return -EFAULT;
-}
-
-static int vmx_hardware_enable(void)
-{
- int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- int r;
-
- if (cr4_read_shadow() & X86_CR4_VMXE)
- return -EBUSY;
-
- /*
- * This can happen if we hot-added a CPU but failed to allocate
- * VP assist page for it.
- */
- if (static_branch_unlikely(&enable_evmcs) &&
- !hv_get_vp_assist_page(cpu))
- return -EFAULT;
-
- intel_pt_handle_vmx(1);
-
- r = kvm_cpu_vmxon(phys_addr);
- if (r) {
- intel_pt_handle_vmx(0);
- return r;
- }
-
- if (enable_ept)
- ept_sync_global();
-
- return 0;
-}
-
-static void vmclear_local_loaded_vmcss(void)
-{
- int cpu = raw_smp_processor_id();
- struct loaded_vmcs *v, *n;
-
- list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
- __loaded_vmcs_clear(v);
-}
-
-static void vmx_hardware_disable(void)
-{
- vmclear_local_loaded_vmcss();
-
- if (cpu_vmxoff())
- kvm_spurious_fault();
-
- intel_pt_handle_vmx(0);
-}
-
/*
* There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
* directly instead of going through cpu_has(), to ensure KVM is trapping
@@ -2565,8 +2549,7 @@ static bool cpu_has_perf_global_ctrl_bug(void)
return false;
}
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32 *result)
+static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
@@ -2584,7 +2567,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
-static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
{
u64 allowed;
@@ -2593,8 +2576,8 @@ static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
return ctl_opt & allowed;
}
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
- struct vmx_capability *vmx_cap)
+static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
+ struct vmx_capability *vmx_cap)
{
u32 vmx_msr_low, vmx_msr_high;
u32 _pin_based_exec_control = 0;
@@ -2752,9 +2735,127 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmcs_conf->vmentry_ctrl = _vmentry_control;
vmcs_conf->misc = misc_msr;
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (enlightened_vmcs)
+ evmcs_sanitize_exec_ctrls(vmcs_conf);
+#endif
+
+ return 0;
+}
+
+static bool kvm_is_vmx_supported(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!cpu_has_vmx()) {
+ pr_err("VMX not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static int vmx_check_processor_compat(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct vmcs_config vmcs_conf;
+ struct vmx_capability vmx_cap;
+
+ if (!kvm_is_vmx_supported())
+ return -EIO;
+
+ if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
+ pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+ pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
+{
+ u64 msr;
+
+ cr4_set_bits(X86_CR4_VMXE);
+
+ asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
+}
+
+static int vmx_hardware_enable(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ int r;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (static_branch_unlikely(&enable_evmcs) &&
+ !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ intel_pt_handle_vmx(1);
+
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r) {
+ intel_pt_handle_vmx(0);
+ return r;
+ }
+
+ if (enable_ept)
+ ept_sync_global();
+
return 0;
}
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+static void vmx_hardware_disable(void)
+{
+ vmclear_local_loaded_vmcss();
+
+ if (cpu_vmxoff())
+ kvm_spurious_fault();
+
+ hv_reset_evmcs();
+
+ intel_pt_handle_vmx(0);
+}
+
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
@@ -2950,9 +3051,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
var.type = 0x3;
var.avl = 0;
if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not "
- "paragraph aligned when entering "
- "protected mode (seg=%d)", seg);
+ pr_warn_once("segment base is not paragraph aligned "
+ "when entering protected mode (seg=%d)", seg);
}
vmcs_write16(sf->selector, var.selector);
@@ -2982,8 +3082,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
* vcpu. Warn the user that an update is overdue.
*/
if (!kvm_vmx->tss_addr)
- printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
- "called before entering vcpu\n");
+ pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
vmx_segment_cache_clear(vmx);
@@ -3440,18 +3539,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable || !var->present)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
return ar;
}
@@ -3803,39 +3899,6 @@ static void seg_setup(int seg)
vmcs_write32(sf->ar_bytes, ar);
}
-static int alloc_apic_access_page(struct kvm *kvm)
-{
- struct page *page;
- void __user *hva;
- int ret = 0;
-
- mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_memslot_enabled)
- goto out;
- hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
- if (IS_ERR(hva)) {
- ret = PTR_ERR(hva);
- goto out;
- }
-
- page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page)) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * Do not pin the page in memory, so that memory hot-unplug
- * is able to migrate it.
- */
- put_page(page);
- kvm->arch.apic_access_memslot_enabled = true;
-out:
- mutex_unlock(&kvm->slots_lock);
- return ret;
-}
-
int allocate_vpid(void)
{
int vpid;
@@ -3868,8 +3931,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
* 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
* bitmap has changed.
*/
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ }
vmx->nested.force_msr_bitmap_recalc = true;
}
@@ -3950,29 +4018,20 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
-{
- unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
- unsigned long read_intercept;
- int msr;
-
- read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
-
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned int read_idx = msr / BITS_PER_LONG;
- unsigned int write_idx = read_idx + (0x800 / sizeof(long));
-
- msr_bitmap[read_idx] = read_intercept;
- msr_bitmap[write_idx] = ~0ul;
- }
-}
-
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
+ /*
+ * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
+ * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
+ * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
+ */
+ const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
+ const int write_idx = read_idx + (0x800 / sizeof(u64));
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
u8 mode;
- if (!cpu_has_vmx_msr_bitmap())
+ if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
return;
if (cpu_has_secondary_exec_ctrls() &&
@@ -3990,7 +4049,18 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
vmx->x2apic_msr_bitmap_mode = mode;
- vmx_reset_x2apic_msrs(vcpu, mode);
+ /*
+ * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
+ * registers (0x840 and above) intercepted, KVM doesn't support them.
+ * Intercept all writes by default and poke holes as needed. Pass
+ * through reads for all valid registers by default in x2APIC+APICv
+ * mode, only the current timer count needs on-demand emulation by KVM.
+ */
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
+ msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
+ else
+ msr_bitmap[read_idx] = ~0ull;
+ msr_bitmap[write_idx] = ~0ull;
/*
* TPR reads and writes can be virtualized even if virtual interrupt
@@ -4522,6 +4592,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ /*
+ * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
+ * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
+ */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
* in vmx_set_cr4. */
exec_control &= ~SECONDARY_EXEC_DESC;
@@ -4538,7 +4614,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
* it needs to be set here when dirty logging is already active, e.g.
* if this vCPU was created after dirty logging was enabled.
*/
- if (!vcpu->kvm->arch.cpu_dirty_logging_count)
+ if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
if (cpu_has_vmx_xsaves()) {
@@ -5102,8 +5178,13 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
vect_info = vmx->idt_vectoring_info;
intr_info = vmx_get_intr_info(vcpu);
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
+ * vmx_vcpu_enter_exit().
+ */
if (is_machine_check(intr_info) || is_nmi(intr_info))
- return 1; /* handled by handle_exception_nmi_irqoff() */
+ return 1;
/*
* Queue the exception here instead of in handle_nm_fault_irqoff().
@@ -6793,17 +6874,8 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}
-void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
- unsigned long entry)
-{
- bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
-
- kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
- vmx_do_interrupt_nmi_irqoff(entry);
- kvm_after_interrupt(vcpu);
-}
+void vmx_do_interrupt_irqoff(unsigned long entry);
+void vmx_do_nmi_irqoff(void);
static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
@@ -6825,9 +6897,8 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
-static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct vcpu_vmx *vmx)
{
- const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
@@ -6839,9 +6910,6 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
- /* We need to handle NMIs before interrupts are enabled */
- else if (is_nmi(intr_info))
- handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
@@ -6851,10 +6919,13 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
gate_desc *desc = (gate_desc *)host_idt_base + vector;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
- "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+ "unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+ vmx_do_interrupt_irqoff(gate_offset(desc));
+ kvm_after_interrupt(vcpu);
+
vcpu->arch.at_instruction_boundary = true;
}
@@ -6868,7 +6939,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
- handle_exception_nmi_irqoff(vmx);
+ handle_exception_irqoff(vmx);
}
/*
@@ -7103,9 +7174,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
}
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
- struct vcpu_vmx *vmx,
- unsigned long flags)
+ unsigned int flags)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
guest_state_enter_irqoff();
/* L1D Flush includes CPU buffer clear to mitigate MDS */
@@ -7129,6 +7201,18 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_enable_fb_clear(vmx);
+ if (unlikely(vmx->fail))
+ vmx->exit_reason.full = 0xdead;
+ else
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+
+ if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
+ is_nmi(vmx_get_intr_info(vcpu))) {
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ vmx_do_nmi_irqoff();
+ kvm_after_interrupt(vcpu);
+ }
+
guest_state_exit_irqoff();
}
@@ -7223,7 +7307,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
kvm_wait_lapic_expire(vcpu);
/* The actual VMENTER/EXIT is in the .noinstr.text section. */
- vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx));
+ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
/* All fields are clean at this point */
if (static_branch_unlikely(&enable_evmcs)) {
@@ -7270,12 +7354,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->idt_vectoring_info = 0;
- if (unlikely(vmx->fail)) {
- vmx->exit_reason.full = 0xdead;
+ if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
- }
- vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
@@ -7389,7 +7470,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs = &vmx->vmcs01;
if (cpu_need_virtualize_apic_accesses(vcpu)) {
- err = alloc_apic_access_page(vcpu->kvm);
+ err = kvm_alloc_apic_access_page(vcpu->kvm);
if (err)
goto free_vmcs;
}
@@ -7449,29 +7530,6 @@ static int vmx_vm_init(struct kvm *kvm)
return 0;
}
-static int __init vmx_check_processor_compat(void)
-{
- struct vmcs_config vmcs_conf;
- struct vmx_capability vmx_cap;
-
- if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !this_cpu_has(X86_FEATURE_VMX)) {
- pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
- return -EIO;
- }
-
- if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
- return -EIO;
- if (nested)
- nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- smp_processor_id());
- return -EIO;
- }
- return 0;
-}
-
static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
@@ -7943,17 +8001,20 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (WARN_ON_ONCE(!enable_pml))
+ return;
+
if (is_guest_mode(vcpu)) {
vmx->nested.update_vmcs01_cpu_dirty_logging = true;
return;
}
/*
- * Note, cpu_dirty_logging_count can be changed concurrent with this
+ * Note, nr_memslots_dirty_logging can be changed concurrent with this
* code, but in that case another update request will be made and so
* the guest will never run with a stale PML value.
*/
- if (vcpu->kvm->arch.cpu_dirty_logging_count)
+ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
else
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
@@ -8051,17 +8112,16 @@ static void vmx_hardware_unsetup(void)
free_kvm_area();
}
-static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
-{
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_ABSENT) |
- BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
-
- return supported & BIT(reason);
-}
+#define VMX_REQUIRED_APICV_INHIBITS \
+( \
+ BIT(APICV_INHIBIT_REASON_DISABLE)| \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
+)
static void vmx_vm_destroy(struct kvm *kvm)
{
@@ -8071,7 +8131,9 @@ static void vmx_vm_destroy(struct kvm *kvm)
}
static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = "kvm_intel",
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
.hardware_unsetup = vmx_hardware_unsetup,
@@ -8145,7 +8207,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.apicv_post_state_restore = vmx_apicv_post_state_restore,
- .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
@@ -8291,7 +8353,7 @@ static __init int hardware_setup(void)
return -EIO;
if (cpu_has_perf_global_ctrl_bug())
- pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
"does not work properly. Using workaround\n");
if (boot_cpu_has(X86_FEATURE_NX))
@@ -8299,7 +8361,7 @@ static __init int hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_MPX)) {
rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
- WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
}
if (!cpu_has_vmx_mpx())
@@ -8318,7 +8380,7 @@ static __init int hardware_setup(void)
/* NX support is required for shadow paging. */
if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
- pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
@@ -8470,9 +8532,6 @@ static __init int hardware_setup(void)
}
static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .check_processor_compatibility = vmx_check_processor_compat,
.hardware_setup = hardware_setup,
.handle_intel_pt_intr = NULL,
@@ -8490,41 +8549,23 @@ static void vmx_cleanup_l1d_flush(void)
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void vmx_exit(void)
+static void __vmx_exit(void)
{
+ allow_smaller_maxphyaddr = false;
+
#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
+ vmx_cleanup_l1d_flush();
+}
+static void vmx_exit(void)
+{
kvm_exit();
+ kvm_x86_vendor_exit();
-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->nested_control.features.directhypercall = 0;
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
- static_branch_disable(&enable_evmcs);
- }
-#endif
- vmx_cleanup_l1d_flush();
-
- allow_smaller_maxphyaddr = false;
+ __vmx_exit();
}
module_exit(vmx_exit);
@@ -8532,56 +8573,29 @@ static int __init vmx_init(void)
{
int r, cpu;
-#if IS_ENABLED(CONFIG_HYPERV)
+ if (!kvm_is_vmx_supported())
+ return -EOPNOTSUPP;
+
/*
- * Enlightened VMCS usage should be recommended and the host needs
- * to support eVMCS v1 or above. We can also disable eVMCS support
- * with module parameter.
+ * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
+ * to unwind if a later step fails.
*/
- if (enlightened_vmcs &&
- ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
- (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
- KVM_EVMCS_VERSION) {
-
- /* Check that we have assist pages on all online CPUs */
- for_each_online_cpu(cpu) {
- if (!hv_get_vp_assist_page(cpu)) {
- enlightened_vmcs = false;
- break;
- }
- }
+ hv_init_evmcs();
- if (enlightened_vmcs) {
- pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
- static_branch_enable(&enable_evmcs);
- }
-
- if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_l2_tlb_flush
- = hv_enable_l2_tlb_flush;
-
- } else {
- enlightened_vmcs = false;
- }
-#endif
-
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_x86_vendor_init(&vmx_init_ops);
if (r)
return r;
/*
- * Must be called after kvm_init() so enable_ept is properly set
+ * Must be called after common x86 init so enable_ept is properly set
* up. Hand the parameter mitigation value in which was stored in
* the pre module init parser. If no parameter was given, it will
* contain 'auto' which will be turned into the default 'cond'
* mitigation mode.
*/
r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ if (r)
+ goto err_l1d_flush;
vmx_setup_fb_clear_ctrl();
@@ -8605,6 +8619,21 @@ static int __init vmx_init(void)
if (!enable_ept)
allow_smaller_maxphyaddr = true;
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
return 0;
+
+err_kvm_init:
+ __vmx_exit();
+err_l1d_flush:
+ kvm_x86_vendor_exit();
+ return r;
}
module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a3da84f4ea45..2acdc54bc34b 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -640,12 +640,12 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
(1 << VCPU_EXREG_EXIT_INFO_1) | \
(1 << VCPU_EXREG_EXIT_INFO_2))
-static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
+static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
return container_of(kvm, struct kvm_vmx, kvm);
}
-static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
@@ -669,25 +669,23 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu);
int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
-static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
+static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) {
- kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1))
vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- }
+
return vmx->exit_qualification;
}
-static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) {
- kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2))
vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- }
+
return vmx->exit_intr_info;
}
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 842dc898c972..db95bde52998 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -100,8 +100,10 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
return value;
do_fail:
- WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field);
- pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field);
+ instrumentation_begin();
+ WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
+ pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
+ instrumentation_end();
return 0;
do_exception:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index da4bbd043a7b..7713420abab0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -15,6 +15,7 @@
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "irq.h"
@@ -128,6 +129,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static DEFINE_MUTEX(vendor_module_lock);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
#define KVM_X86_OP(func) \
@@ -191,6 +193,10 @@ module_param(enable_pmu, bool, 0444);
bool __read_mostly eager_page_split = true;
module_param(eager_page_split, bool, 0644);
+/* Enable/disable SMT_RSB bug mitigation */
+bool __read_mostly mitigate_smt_rsb;
+module_param(mitigate_smt_rsb, bool, 0444);
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -1417,7 +1423,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
* may depend on host virtualization features rather than host cpu features.
*/
-static const u32 msrs_to_save_all[] = {
+static const u32 msrs_to_save_base[] = {
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -1434,6 +1440,10 @@ static const u32 msrs_to_save_all[] = {
MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
MSR_IA32_UMWAIT_CONTROL,
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR,
+};
+
+static const u32 msrs_to_save_pmu[] = {
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
@@ -1458,11 +1468,10 @@ static const u32 msrs_to_save_all[] = {
MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
-
- MSR_IA32_XFD, MSR_IA32_XFD_ERR,
};
-static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
+ ARRAY_SIZE(msrs_to_save_pmu)];
static unsigned num_msrs_to_save;
static const u32 emulated_msrs_all[] = {
@@ -1480,7 +1489,7 @@ static const u32 emulated_msrs_all[] = {
HV_X64_MSR_STIMER0_CONFIG,
HV_X64_MSR_VP_ASSIST_PAGE,
HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
- HV_X64_MSR_TSC_EMULATION_STATUS,
+ HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
HV_X64_MSR_SYNDBG_OPTIONS,
HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
@@ -1685,6 +1694,9 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
+ if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS))
+ return false;
+
if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
return false;
@@ -2086,7 +2098,7 @@ static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
!guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
return kvm_handle_invalid_op(vcpu);
- pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn);
+ pr_warn_once("%s instruction emulated as NOP!\n", insn);
return kvm_emulate_as_nop(vcpu);
}
int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
@@ -2433,7 +2445,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
- pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
+ pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
+ user_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
@@ -3158,6 +3171,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
vcpu->hw_tsc_khz = tgt_tsc_khz;
+ kvm_xen_update_tsc_info(v);
}
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
@@ -3555,9 +3569,20 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
+static bool kvm_is_msr_to_save(u32 msr_index)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_msrs_to_save; i++) {
+ if (msrs_to_save[i] == msr_index)
+ return true;
+ }
+
+ return false;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- bool pr = false;
u32 msr = msr_info->index;
u64 data = msr_info->data;
@@ -3603,15 +3628,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data == BIT_ULL(18)) {
vcpu->arch.msr_hwcr = data;
} else if (data != 0) {
- vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
- data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
break;
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
- vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
- "0x%llx\n", data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
break;
@@ -3791,16 +3814,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
- pr = true;
- fallthrough;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- if (pr || data != 0)
- vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
+ if (data)
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
break;
case MSR_K7_CLK_CTL:
/*
@@ -3821,15 +3841,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
return kvm_hv_set_msr_common(vcpu, msr, data,
msr_info->host_initiated);
case MSR_IA32_BBL_CR_CTL3:
/* Drop writes to this legacy MSR -- see rdmsr
* counterpart for further detail.
*/
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
@@ -3877,20 +3896,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.guest_fpu.xfd_err = data;
break;
#endif
- case MSR_IA32_PEBS_ENABLE:
- case MSR_IA32_DS_AREA:
- case MSR_PEBS_DATA_CFG:
- case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ default:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
+
/*
* Userspace is allowed to write '0' to MSRs that KVM reports
* as to-be-saved, even if an MSRs isn't fully supported.
*/
- return !msr_info->host_initiated || data;
- default:
- if (kvm_pmu_is_valid_msr(vcpu, msr))
- return kvm_pmu_set_msr(vcpu, msr_info);
+ if (msr_info->host_initiated && !data &&
+ kvm_is_msr_to_save(msr))
+ break;
+
return KVM_MSR_RET_INVALID;
}
return 0;
@@ -3980,20 +3997,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
msr_info->data = 0;
break;
- case MSR_IA32_PEBS_ENABLE:
- case MSR_IA32_DS_AREA:
- case MSR_PEBS_DATA_CFG:
- case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
- if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info);
- /*
- * Userspace is allowed to read MSRs that KVM reports as
- * to-be-saved, even if an MSR isn't fully supported.
- */
- if (!msr_info->host_initiated)
- return 1;
- msr_info->data = 0;
- break;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -4191,6 +4194,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data,
msr_info->host_initiated);
@@ -4248,6 +4252,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
+
+ /*
+ * Userspace is allowed to read MSRs that KVM reports as
+ * to-be-saved, even if an MSR isn't fully supported.
+ */
+ if (msr_info->host_initiated &&
+ kvm_is_msr_to_save(msr_info->index)) {
+ msr_info->data = 0;
+ break;
+ }
+
return KVM_MSR_RET_INVALID;
}
return 0;
@@ -4285,8 +4300,8 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
{
struct kvm_msrs msrs;
struct kvm_msr_entry *entries;
- int r, n;
unsigned size;
+ int r;
r = -EFAULT;
if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
@@ -4303,17 +4318,11 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
goto out;
}
- r = n = __msr_io(vcpu, &msrs, entries, do_msr);
- if (r < 0)
- goto out_free;
+ r = __msr_io(vcpu, &msrs, entries, do_msr);
- r = -EFAULT;
if (writeback && copy_to_user(user_msrs->entries, entries, size))
- goto out_free;
-
- r = n;
+ r = -EFAULT;
-out_free:
kfree(entries);
out:
return r;
@@ -4401,6 +4410,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SPLIT_IRQCHIP:
case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_PMU_EVENT_FILTER:
+ case KVM_CAP_PMU_EVENT_MASKED_EVENTS:
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
@@ -4448,10 +4458,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_CLOCK_VALID_FLAGS;
break;
case KVM_CAP_X86_DISABLE_EXITS:
- r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
- KVM_X86_DISABLE_EXITS_CSTATE;
- if(kvm_can_mwait_in_guest())
- r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ r = KVM_X86_DISABLE_EXITS_PAUSE;
+
+ if (!mitigate_smt_rsb) {
+ r |= KVM_X86_DISABLE_EXITS_HLT |
+ KVM_X86_DISABLE_EXITS_CSTATE;
+
+ if (kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ }
break;
case KVM_CAP_X86_SMM:
if (!IS_ENABLED(CONFIG_KVM_SMM))
@@ -5254,12 +5269,11 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
{
unsigned long val;
+ memset(dbgregs, 0, sizeof(*dbgregs));
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
kvm_get_dr(vcpu, 6, &val);
dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
- dbgregs->flags = 0;
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -6227,15 +6241,26 @@ split_irqchip_unlock:
if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
break;
- if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
- kvm_can_mwait_in_guest())
- kvm->arch.mwait_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
- kvm->arch.hlt_in_guest = true;
if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
kvm->arch.pause_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
- kvm->arch.cstate_in_guest = true;
+
+#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
+ "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests."
+
+ if (!mitigate_smt_rsb) {
+ if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() &&
+ (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+ pr_warn_once(SMT_RSB_MSG);
+
+ if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
+ kvm_can_mwait_in_guest())
+ kvm->arch.mwait_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
+ kvm->arch.hlt_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
+ kvm->arch.cstate_in_guest = true;
+ }
+
r = 0;
break;
case KVM_CAP_MSR_PLATFORM_INFO:
@@ -6460,7 +6485,7 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
struct kvm_x86_msr_filter *new_filter, *old_filter;
bool default_allow;
bool empty = true;
- int r = 0;
+ int r;
u32 i;
if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
@@ -6486,17 +6511,14 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
}
mutex_lock(&kvm->lock);
-
- /* The per-VM filter is protected by kvm->lock... */
- old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
-
- rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
+ old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
+ mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
synchronize_srcu(&kvm->srcu);
kvm_free_msr_filter(old_filter);
kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
- mutex_unlock(&kvm->lock);
return 0;
}
@@ -6996,83 +7018,98 @@ out:
return r;
}
-static void kvm_init_msr_list(void)
+static void kvm_probe_msr_to_save(u32 msr_index)
{
u32 dummy[2];
+
+ if (rdmsr_safe(msr_index, &dummy[0], &dummy[1]))
+ return;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed to guests in
+ * some cases.
+ */
+ switch (msr_index) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported())
+ return;
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ !kvm_cpu_cap_has(X86_FEATURE_RDPID))
+ return;
+ break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
+ return;
+ break;
+ case MSR_IA32_RTIT_CTL:
+ case MSR_IA32_RTIT_STATUS:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
+ return;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
+ return;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
+ return;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (msr_index - MSR_IA32_RTIT_ADDR0_A >=
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
+ return;
+ break;
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
+ kvm_pmu_cap.num_counters_fixed)
+ return;
+ break;
+ case MSR_IA32_XFD:
+ case MSR_IA32_XFD_ERR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ return;
+ break;
+ default:
+ break;
+ }
+
+ msrs_to_save[num_msrs_to_save++] = msr_index;
+}
+
+static void kvm_init_msr_list(void)
+{
unsigned i;
BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
- "Please update the fixed PMCs in msrs_to_saved_all[]");
+ "Please update the fixed PMCs in msrs_to_save_pmu[]");
num_msrs_to_save = 0;
num_emulated_msrs = 0;
num_msr_based_features = 0;
- for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
- if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
- continue;
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++)
+ kvm_probe_msr_to_save(msrs_to_save_base[i]);
- /*
- * Even MSRs that are valid in the host may not be exposed
- * to the guests in some cases.
- */
- switch (msrs_to_save_all[i]) {
- case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
- continue;
- break;
- case MSR_TSC_AUX:
- if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
- !kvm_cpu_cap_has(X86_FEATURE_RDPID))
- continue;
- break;
- case MSR_IA32_UMWAIT_CONTROL:
- if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
- continue;
- break;
- case MSR_IA32_RTIT_CTL:
- case MSR_IA32_RTIT_STATUS:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
- continue;
- break;
- case MSR_IA32_RTIT_CR3_MATCH:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
- continue;
- break;
- case MSR_IA32_RTIT_OUTPUT_BASE:
- case MSR_IA32_RTIT_OUTPUT_MASK:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
- !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
- continue;
- break;
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
- intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
- continue;
- break;
- case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
- if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
- min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
- continue;
- break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
- if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
- min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
- continue;
- break;
- case MSR_IA32_XFD:
- case MSR_IA32_XFD_ERR:
- if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
- continue;
- break;
- default:
- break;
- }
-
- msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
+ if (enable_pmu) {
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++)
+ kvm_probe_msr_to_save(msrs_to_save_pmu[i]);
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
@@ -7699,7 +7736,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
emul_write:
- printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+ pr_warn_once("emulating exchange as write\n");
return emulator_write_emulated(ctxt, addr, new, bytes, exception);
}
@@ -8145,9 +8182,14 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
}
-static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
+static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
+{
+ return is_smm(emul_to_vcpu(ctxt));
+}
+
+static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt)
{
- return emul_to_vcpu(ctxt)->arch.hflags;
+ return is_guest_mode(emul_to_vcpu(ctxt));
}
#ifndef CONFIG_KVM_SMM
@@ -8216,7 +8258,8 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_fxsr = emulator_guest_has_fxsr,
.guest_has_rdpid = emulator_guest_has_rdpid,
.set_nmi_mask = emulator_set_nmi_mask,
- .get_hflags = emulator_get_hflags,
+ .is_smm = emulator_is_smm,
+ .is_guest_mode = emulator_is_guest_mode,
.leave_smm = emulator_leave_smm,
.triple_fault = emulator_triple_fault,
.set_xcr = emulator_set_xcr,
@@ -8260,7 +8303,7 @@ static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
if (!ctxt) {
- pr_err("kvm: failed to allocate vcpu's emulator\n");
+ pr_err("failed to allocate vcpu's emulator\n");
return NULL;
}
@@ -8288,8 +8331,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
(cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
- BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
-
ctxt->interruptibility = 0;
ctxt->have_exception = false;
ctxt->exception.vector = -1;
@@ -9271,35 +9312,66 @@ static struct notifier_block pvclock_gtod_notifier = {
};
#endif
-int kvm_arch_init(void *opaque)
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+ kvm_pmu_ops_update(ops->pmu_ops);
+}
+
+static int kvm_x86_check_processor_compatibility(void)
+{
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Compatibility checks are done when loading KVM and when enabling
+ * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
+ * compatible, i.e. KVM should never perform a compatibility check on
+ * an offline CPU.
+ */
+ WARN_ON(!cpu_online(cpu));
+
+ if (__cr4_reserved_bits(cpu_has, c) !=
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+ return -EIO;
+
+ return static_call(kvm_x86_check_processor_compatibility)();
+}
+
+static void kvm_x86_check_cpu_compat(void *ret)
+{
+ *(int *)ret = kvm_x86_check_processor_compatibility();
+}
+
+static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
- struct kvm_x86_init_ops *ops = opaque;
u64 host_pat;
- int r;
+ int r, cpu;
if (kvm_x86_ops.hardware_enable) {
- pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
+ pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
}
- if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support for '%s'\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
- if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
-
/*
* KVM explicitly assumes that the guest has an FPU and
* FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
* vCPU's FPU state as a fxregs_state struct.
*/
if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
- printk(KERN_ERR "kvm: inadequate fpu\n");
+ pr_err("inadequate fpu\n");
return -EOPNOTSUPP;
}
@@ -9317,19 +9389,19 @@ int kvm_arch_init(void *opaque)
*/
if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
(host_pat & GENMASK(2, 0)) != 6) {
- pr_err("kvm: host PAT[0] is not WB\n");
+ pr_err("host PAT[0] is not WB\n");
return -EIO;
}
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
- pr_err("kvm: failed to allocate cache for x86 emulator\n");
+ pr_err("failed to allocate cache for x86 emulator\n");
return -ENOMEM;
}
user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
if (!user_return_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+ pr_err("failed to allocate percpu kvm_user_return_msrs\n");
r = -ENOMEM;
goto out_free_x86_emulator_cache;
}
@@ -9339,13 +9411,37 @@ int kvm_arch_init(void *opaque)
if (r)
goto out_free_percpu;
- kvm_timer_init();
-
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
}
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
+ kvm_init_pmu_capability(ops->pmu_ops);
+
+ r = ops->hardware_setup();
+ if (r != 0)
+ goto out_mmu_exit;
+
+ kvm_ops_update(ops);
+
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
+ if (r < 0)
+ goto out_unwind_ops;
+ }
+
+ /*
+ * Point of no return! DO NOT add error paths below this point unless
+ * absolutely necessary, as most operations from this point forward
+ * require unwinding.
+ */
+ kvm_timer_init();
+
if (pi_inject_timer == -1)
pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
#ifdef CONFIG_X86_64
@@ -9355,8 +9451,35 @@ int kvm_arch_init(void *opaque)
set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
#endif
+ kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+ kvm_caps.supported_xss = 0;
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+#undef __kvm_cpu_cap_has
+
+ if (kvm_caps.has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+ kvm_caps.max_guest_tsc_khz = max;
+ }
+ kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
+ kvm_init_msr_list();
return 0;
+out_unwind_ops:
+ kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
+out_mmu_exit:
+ kvm_mmu_vendor_module_exit();
out_free_percpu:
free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
@@ -9364,8 +9487,22 @@ out_free_x86_emulator_cache:
return r;
}
-void kvm_arch_exit(void)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
+ int r;
+
+ mutex_lock(&vendor_module_lock);
+ r = __kvm_x86_vendor_init(ops);
+ mutex_unlock(&vendor_module_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
+
+void kvm_x86_vendor_exit(void)
+{
+ kvm_unregister_perf_callbacks();
+
#ifdef CONFIG_X86_64
if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
clear_hv_tscchange_cb();
@@ -9382,7 +9519,7 @@ void kvm_arch_exit(void)
irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif
- kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
@@ -9390,7 +9527,11 @@ void kvm_arch_exit(void)
static_key_deferred_flush(&kvm_xen_enabled);
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
+ mutex_lock(&vendor_module_lock);
+ kvm_x86_ops.hardware_enable = NULL;
+ mutex_unlock(&vendor_module_lock);
}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
{
@@ -10043,7 +10184,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
-void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
bool activate;
@@ -10078,7 +10219,30 @@ out:
preempt_enable();
up_read(&vcpu->kvm->arch.apicv_update_lock);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv);
+
+static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ /*
+ * Due to sharing page tables across vCPUs, the xAPIC memslot must be
+ * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but
+ * and hardware doesn't support x2APIC virtualization. E.g. some AMD
+ * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in
+ * this case so that KVM can the AVIC doorbell to inject interrupts to
+ * running vCPUs, but KVM must not create SPTEs for the APIC base as
+ * the vCPU would incorrectly be able to access the vAPIC page via MMIO
+ * despite being in x2APIC mode. For simplicity, inhibiting the APIC
+ * access page is sticky.
+ */
+ if (apic_x2apic_mode(vcpu->arch.apic) &&
+ kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization)
+ kvm_inhibit_apic_access_page(vcpu);
+
+ __kvm_vcpu_update_apicv(vcpu);
+}
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set)
@@ -10087,7 +10251,7 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
- if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason))
+ if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason)))
return;
old = new = kvm->arch.apicv_inhibit_reasons;
@@ -11531,7 +11695,7 @@ static int sync_regs(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && kvm->created_vcpus)
- pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+ pr_warn_once("SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");
if (!kvm->arch.max_vcpu_ids)
@@ -11608,7 +11772,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto free_wbinvd_dirty_mask;
if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
- pr_err("kvm: failed to allocate vcpu's fpu\n");
+ pr_err("failed to allocate vcpu's fpu\n");
goto free_emulate_ctxt;
}
@@ -11882,6 +12046,11 @@ int kvm_arch_hardware_enable(void)
bool stable, backwards_tsc = false;
kvm_user_return_msr_cpu_online();
+
+ ret = kvm_x86_check_processor_compatibility();
+ if (ret)
+ return ret;
+
ret = static_call(kvm_x86_hardware_enable)();
if (ret != 0)
return ret;
@@ -11968,88 +12137,6 @@ void kvm_arch_hardware_disable(void)
drop_user_return_notifiers();
}
-static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
-{
- memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
-
-#define __KVM_X86_OP(func) \
- static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP(func) \
- WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
-#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
-#define KVM_X86_OP_OPTIONAL_RET0(func) \
- static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
- (void *)__static_call_return0);
-#include <asm/kvm-x86-ops.h>
-#undef __KVM_X86_OP
-
- kvm_pmu_ops_update(ops->pmu_ops);
-}
-
-int kvm_arch_hardware_setup(void *opaque)
-{
- struct kvm_x86_init_ops *ops = opaque;
- int r;
-
- rdmsrl_safe(MSR_EFER, &host_efer);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
-
- kvm_init_pmu_capability();
-
- r = ops->hardware_setup();
- if (r != 0)
- return r;
-
- kvm_ops_update(ops);
-
- kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
-
- if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
- kvm_caps.supported_xss = 0;
-
-#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
- cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
-#undef __kvm_cpu_cap_has
-
- if (kvm_caps.has_tsc_control) {
- /*
- * Make sure the user can only configure tsc_khz values that
- * fit into a signed integer.
- * A min value is not calculated because it will always
- * be 1 on all machines.
- */
- u64 max = min(0x7fffffffULL,
- __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
- kvm_caps.max_guest_tsc_khz = max;
- }
- kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
- kvm_init_msr_list();
- return 0;
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
- kvm_unregister_perf_callbacks();
-
- static_call(kvm_x86_hardware_unsetup)();
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
- struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- struct kvm_x86_init_ops *ops = opaque;
-
- WARN_ON(!irqs_disabled());
-
- if (__cr4_reserved_bits(cpu_has, c) !=
- __cr4_reserved_bits(cpu_has, &boot_cpu_data))
- return -EIO;
-
- return ops->check_processor_compatibility();
-}
-
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
@@ -12222,7 +12309,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
*/
hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, 0);
- if (IS_ERR((void *)hva))
+ if (IS_ERR_VALUE(hva))
return (void __user *)hva;
} else {
if (!slot || !slot->npages)
@@ -12437,16 +12524,14 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
{
- struct kvm_arch *ka = &kvm->arch;
+ int nr_slots;
if (!kvm_x86_ops.cpu_dirty_log_size)
return;
- if ((enable && ++ka->cpu_dirty_logging_count == 1) ||
- (!enable && --ka->cpu_dirty_logging_count == 0))
+ nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
+ if ((enable && nr_slots == 1) || !nr_slots)
kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
-
- WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0);
}
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
@@ -13456,6 +13541,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
static int __init kvm_x86_init(void)
{
kvm_mmu_x86_module_init();
+ mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
return 0;
}
module_init(kvm_x86_init);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9de72586f406..a8167b47b8c8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -331,6 +331,18 @@ extern bool report_ignored_msrs;
extern bool eager_page_split;
+static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
+}
+
+static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
+}
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
@@ -382,13 +394,13 @@ enum kvm_intr_type {
KVM_HANDLING_NMI,
};
-static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
- enum kvm_intr_type intr)
+static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+ enum kvm_intr_type intr)
{
WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
}
-static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
{
WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 8fd41f5deae3..40edf4d1974c 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -5,6 +5,7 @@
*
* KVM Xen emulation
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "x86.h"
#include "xen.h"
@@ -22,6 +23,9 @@
#include <xen/interface/event_channel.h>
#include <xen/interface/sched.h>
+#include <asm/xen/cpuid.h>
+
+#include "cpuid.h"
#include "trace.h"
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
@@ -2076,6 +2080,29 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
del_timer_sync(&vcpu->arch.xen.poll_timer);
}
+void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+ u32 function;
+
+ if (!vcpu->arch.xen.cpuid.base)
+ return;
+
+ function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3);
+ if (function > vcpu->arch.xen.cpuid.limit)
+ return;
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
+ if (entry) {
+ entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
+ entry->edx = vcpu->arch.hv_clock.tsc_shift;
+ }
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 2);
+ if (entry)
+ entry->eax = vcpu->arch.hw_tsc_khz;
+}
+
void kvm_xen_init_vm(struct kvm *kvm)
{
mutex_init(&kvm->arch.xen.xen_lock);
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index ea33d80a0c51..f8f1fe22d090 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -9,6 +9,8 @@
#ifndef __ARCH_X86_KVM_XEN_H__
#define __ARCH_X86_KVM_XEN_H__
+#include <asm/xen/hypervisor.h>
+
#ifdef CONFIG_KVM_XEN
#include <linux/jump_label_ratelimit.h>
@@ -32,6 +34,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
int kvm_xen_setup_evtchn(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue);
+void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
@@ -135,6 +138,10 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
{
return false;
}
+
+static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
+{
+}
#endif
int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index b6da09339308..80570eb3c89b 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -7,16 +7,18 @@
#include <linux/string.h>
#include <linux/ctype.h>
#include <asm/setup.h>
+#include <asm/cmdline.h>
static inline int myisspace(u8 c)
{
return c <= ' '; /* Close enough approximation */
}
-/**
+/*
* Find a boolean option (like quiet,noapic,nosmp....)
*
* @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
* @option: option string to look for
*
* Returns the position of that @option (starts counting with 1)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index dd8cd8831251..a64017602010 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -8,7 +8,7 @@
#include <asm/alternative.h>
#include <asm/export.h>
-.pushsection .noinstr.text, "ax"
+.section .noinstr.text, "ax"
/*
* We build a jump to memcpy_orig by default which gets NOPped out on
@@ -43,7 +43,7 @@ SYM_TYPED_FUNC_START(__memcpy)
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)
-SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)
/*
@@ -184,4 +184,3 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
RET
SYM_FUNC_END(memcpy_orig)
-.popsection
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 724bbf83eb5b..02661861e5dd 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -13,6 +13,8 @@
#undef memmove
+.section .noinstr.text, "ax"
+
/*
* Implement memmove(). This can handle overlap between src and dst.
*
@@ -213,5 +215,5 @@ SYM_FUNC_START(__memmove)
SYM_FUNC_END(__memmove)
EXPORT_SYMBOL(__memmove)
-SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
+SYM_FUNC_ALIAS(memmove, __memmove)
EXPORT_SYMBOL(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index fc9ffd3ff3b2..6143b1a6fa2c 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -6,6 +6,8 @@
#include <asm/alternative.h>
#include <asm/export.h>
+.section .noinstr.text, "ax"
+
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
@@ -43,7 +45,7 @@ SYM_FUNC_START(__memset)
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)
-SYM_FUNC_ALIAS_WEAK(memset, __memset)
+SYM_FUNC_ALIAS(memset, __memset)
EXPORT_SYMBOL(memset)
/*
diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c
index a018ec4fba53..92cd8ecc3a2c 100644
--- a/arch/x86/lib/misc.c
+++ b/arch/x86/lib/misc.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <asm/misc.h>
+
/*
* Count the digits of @val including a possible sign.
*
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index d12d1358f96d..5168ee0360b2 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1047,6 +1047,7 @@ GrpTable: Grp6
3: LTR Ew
4: VERR Ew
5: VERW Ew
+6: LKGS Ew (F2)
EndTable
GrpTable: Grp7
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7b0d4ab894c8..a498ae1fbe66 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -260,7 +260,7 @@ static noinline int vmalloc_fault(unsigned long address)
}
NOKPROBE_SYMBOL(vmalloc_fault);
-static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
unsigned long addr;
@@ -284,27 +284,6 @@ static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end)
}
}
-void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
-{
- __arch_sync_kernel_mappings(start, end);
-#ifdef CONFIG_KMSAN
- /*
- * KMSAN maintains two additional metadata page mappings for the
- * [VMALLOC_START, VMALLOC_END) range. These mappings start at
- * KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START and
- * have to be synced together with the vmalloc memory mapping.
- */
- if (start >= VMALLOC_START && end < VMALLOC_END) {
- __arch_sync_kernel_mappings(
- start - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START,
- end - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START);
- __arch_sync_kernel_mappings(
- start - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START,
- end - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START);
- }
-#endif
-}
-
static bool low_pfn(unsigned long pfn)
{
return pfn < max_low_pfn;
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index fb4b1b5e0dea..46a00aa858b6 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -159,10 +159,10 @@ static inline void set_page_memtype(struct page *pg,
break;
}
+ old_flags = READ_ONCE(pg->flags);
do {
- old_flags = pg->flags;
new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
- } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
+ } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags));
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
@@ -387,8 +387,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end,
u8 mtrr_type, uniform;
mtrr_type = mtrr_type_lookup(start, end, &uniform);
- if (mtrr_type != MTRR_TYPE_WRBACK &&
- mtrr_type != MTRR_TYPE_INVALID)
+ if (mtrr_type != MTRR_TYPE_WRBACK)
return _PAGE_CACHE_MODE_UC_MINUS;
return _PAGE_CACHE_MODE_WB;
@@ -1000,7 +999,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
ret = reserve_pfn_range(paddr, size, prot, 0);
if (ret == 0 && vma)
- vma->vm_flags |= VM_PAT;
+ vm_flags_set(vma, VM_PAT);
return ret;
}
@@ -1046,7 +1045,7 @@ void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
* can be for the entire vma (in which case pfn, size are zero).
*/
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
- unsigned long size)
+ unsigned long size, bool mm_wr_locked)
{
resource_size_t paddr;
unsigned long prot;
@@ -1065,8 +1064,12 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
- if (vma)
- vma->vm_flags &= ~VM_PAT;
+ if (vma) {
+ if (mm_wr_locked)
+ vm_flags_clear(vma, VM_PAT);
+ else
+ __vm_flags_mod(vma, 0, VM_PAT);
+ }
}
/*
@@ -1076,7 +1079,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
*/
void untrack_pfn_moved(struct vm_area_struct *vma)
{
- vma->vm_flags &= ~VM_PAT;
+ vm_flags_clear(vma, VM_PAT);
}
pgprot_t pgprot_writecombine(pgprot_t prot)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c1e31e9a85d7..92d73ccede70 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1205,7 +1205,7 @@ void __flush_tlb_all(void)
*/
VM_WARN_ON_ONCE(preemptible());
- if (boot_cpu_has(X86_FEATURE_PGE)) {
+ if (cpu_feature_enabled(X86_FEATURE_PGE)) {
__flush_tlb_global();
} else {
/*
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b808be77635e..1056bbf55b17 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1003,6 +1003,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
u8 b2 = 0, b3 = 0;
u8 *start_of_ldx;
s64 jmp_offset;
+ s16 insn_off;
u8 jmp_cond;
u8 *func;
int nops;
@@ -1369,57 +1370,52 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
+ insn_off = insn->off;
+
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
- /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM
- * add abs(insn->off) to the limit to make sure that negative
- * offset won't be an issue.
- * insn->off is s16, so it won't affect valid pointers.
+ /* Conservatively check that src_reg + insn->off is a kernel address:
+ * src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
+ * src_reg is used as scratch for src_reg += insn->off and restored
+ * after emit_ldx if necessary
*/
- u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off);
- u8 *end_of_jmp1, *end_of_jmp2;
- /* Conservatively check that src_reg + insn->off is a kernel address:
- * 1. src_reg + insn->off >= limit
- * 2. src_reg + insn->off doesn't become small positive.
- * Cannot do src_reg + insn->off >= limit in one branch,
- * since it needs two spare registers, but JIT has only one.
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE;
+ u8 *end_of_jmp;
+
+ /* At end of these emitted checks, insn->off will have been added
+ * to src_reg, so no need to do relative load with insn->off offset
*/
+ insn_off = 0;
/* movabsq r11, limit */
EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
EMIT((u32)limit, 4);
EMIT(limit >> 32, 4);
+
+ if (insn->off) {
+ /* add src_reg, insn->off */
+ maybe_emit_1mod(&prog, src_reg, true);
+ EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off);
+ }
+
/* cmp src_reg, r11 */
maybe_emit_mod(&prog, src_reg, AUX_REG, true);
EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
- /* if unsigned '<' goto end_of_jmp2 */
- EMIT2(X86_JB, 0);
- end_of_jmp1 = prog;
-
- /* mov r11, src_reg */
- emit_mov_reg(&prog, true, AUX_REG, src_reg);
- /* add r11, insn->off */
- maybe_emit_1mod(&prog, AUX_REG, true);
- EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
- /* jmp if not carry to start_of_ldx
- * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr
- * that has to be rejected.
- */
- EMIT2(0x73 /* JNC */, 0);
- end_of_jmp2 = prog;
+
+ /* if unsigned '>=', goto load */
+ EMIT2(X86_JAE, 0);
+ end_of_jmp = prog;
/* xor dst_reg, dst_reg */
emit_mov_imm32(&prog, false, dst_reg, 0);
/* jmp byte_after_ldx */
EMIT2(0xEB, 0);
- /* populate jmp_offset for JB above to jump to xor dst_reg */
- end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1;
- /* populate jmp_offset for JNC above to jump to start_of_ldx */
+ /* populate jmp_offset for JAE above to jump to start_of_ldx */
start_of_ldx = prog;
- end_of_jmp2[-1] = start_of_ldx - end_of_jmp2;
+ end_of_jmp[-1] = start_of_ldx - end_of_jmp;
}
- emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
+ emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
struct exception_table_entry *ex;
u8 *_insn = image + proglen + (start_of_ldx - temp);
@@ -1428,6 +1424,18 @@ st: if (is_imm8(insn->off))
/* populate jmp_offset for JMP above */
start_of_ldx[-1] = prog - start_of_ldx;
+ if (insn->off && src_reg != dst_reg) {
+ /* sub src_reg, insn->off
+ * Restore src_reg after "add src_reg, insn->off" in prev
+ * if statement. But if src_reg == dst_reg, emit_ldx
+ * above already clobbered src_reg, so no need to restore.
+ * If add src_reg, insn->off was unnecessary, no need to
+ * restore either.
+ */
+ maybe_emit_1mod(&prog, src_reg, true);
+ EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off);
+ }
+
if (!bpf_prog->aux->extable)
break;
@@ -1849,62 +1857,59 @@ emit_jmp:
return proglen;
}
-static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
+static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
int stack_size)
{
- int i, j, arg_size, nr_regs;
+ int i, j, arg_size;
+ bool next_same_struct = false;
+
/* Store function arguments to stack.
* For a function that accepts two pointers the sequence will be:
* mov QWORD PTR [rbp-0x10],rdi
* mov QWORD PTR [rbp-0x8],rsi
*/
- for (i = 0, j = 0; i < min(nr_args, 6); i++) {
- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
- nr_regs = (m->arg_size[i] + 7) / 8;
+ for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
+ /* The arg_size is at most 16 bytes, enforced by the verifier. */
+ arg_size = m->arg_size[j];
+ if (arg_size > 8) {
arg_size = 8;
- } else {
- nr_regs = 1;
- arg_size = m->arg_size[i];
+ next_same_struct = !next_same_struct;
}
- while (nr_regs) {
- emit_stx(prog, bytes_to_bpf_size(arg_size),
- BPF_REG_FP,
- j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
- -(stack_size - j * 8));
- nr_regs--;
- j++;
- }
+ emit_stx(prog, bytes_to_bpf_size(arg_size),
+ BPF_REG_FP,
+ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
+ -(stack_size - i * 8));
+
+ j = next_same_struct ? j : j + 1;
}
}
-static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
+static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
int stack_size)
{
- int i, j, arg_size, nr_regs;
+ int i, j, arg_size;
+ bool next_same_struct = false;
/* Restore function arguments from stack.
* For a function that accepts two pointers the sequence will be:
* EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
* EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
*/
- for (i = 0, j = 0; i < min(nr_args, 6); i++) {
- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
- nr_regs = (m->arg_size[i] + 7) / 8;
+ for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
+ /* The arg_size is at most 16 bytes, enforced by the verifier. */
+ arg_size = m->arg_size[j];
+ if (arg_size > 8) {
arg_size = 8;
- } else {
- nr_regs = 1;
- arg_size = m->arg_size[i];
+ next_same_struct = !next_same_struct;
}
- while (nr_regs) {
- emit_ldx(prog, bytes_to_bpf_size(arg_size),
- j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
- BPF_REG_FP,
- -(stack_size - j * 8));
- nr_regs--;
- j++;
- }
+ emit_ldx(prog, bytes_to_bpf_size(arg_size),
+ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
+ BPF_REG_FP,
+ -(stack_size - i * 8));
+
+ j = next_same_struct ? j : j + 1;
}
}
@@ -2130,8 +2135,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
struct bpf_tramp_links *tlinks,
void *func_addr)
{
- int ret, i, nr_args = m->nr_args, extra_nregs = 0;
- int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off;
+ int i, ret, nr_regs = m->nr_args, stack_size = 0;
+ int regs_off, nregs_off, ip_off, run_ctx_off;
struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
@@ -2140,17 +2145,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
u8 *prog;
bool save_ret;
- /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
- if (nr_args > 6)
- return -ENOTSUPP;
-
- for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
+ /* extra registers for struct arguments */
+ for (i = 0; i < m->nr_args; i++)
if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
- extra_nregs += (m->arg_size[i] + 7) / 8 - 1;
- }
- if (nr_args + extra_nregs > 6)
+ nr_regs += (m->arg_size[i] + 7) / 8 - 1;
+
+ /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
+ if (nr_regs > 6)
return -ENOTSUPP;
- stack_size += extra_nregs * 8;
/* Generated trampoline stack layout:
*
@@ -2164,7 +2166,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
* [ ... ]
* RBP - regs_off [ reg_arg1 ] program's ctx pointer
*
- * RBP - args_off [ arg regs count ] always
+ * RBP - nregs_off [ regs count ] always
*
* RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
*
@@ -2176,11 +2178,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (save_ret)
stack_size += 8;
+ stack_size += nr_regs * 8;
regs_off = stack_size;
- /* args count */
+ /* regs count */
stack_size += 8;
- args_off = stack_size;
+ nregs_off = stack_size;
if (flags & BPF_TRAMP_F_IP_ARG)
stack_size += 8; /* room for IP address argument */
@@ -2213,11 +2216,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
EMIT1(0x53); /* push rbx */
/* Store number of argument registers of the traced function:
- * mov rax, nr_args + extra_nregs
- * mov QWORD PTR [rbp - args_off], rax
+ * mov rax, nr_regs
+ * mov QWORD PTR [rbp - nregs_off], rax
*/
- emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args + extra_nregs);
- emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);
+ emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_regs);
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -nregs_off);
if (flags & BPF_TRAMP_F_IP_ARG) {
/* Store IP address of the traced function:
@@ -2228,7 +2231,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
}
- save_regs(m, &prog, nr_args, regs_off);
+ save_regs(m, &prog, nr_regs, regs_off);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
@@ -2258,7 +2261,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- restore_regs(m, &prog, nr_args, regs_off);
+ restore_regs(m, &prog, nr_regs, regs_off);
if (flags & BPF_TRAMP_F_ORIG_STACK) {
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
@@ -2299,7 +2302,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (flags & BPF_TRAMP_F_RESTORE_REGS)
- restore_regs(m, &prog, nr_args, regs_off);
+ restore_regs(m, &prog, nr_regs, regs_off);
/* This needs to be done regardless. If there were fmod_ret programs,
* the return value is only updated on the stack and still needs to be
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index b94f727251b6..8babce71915f 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -392,6 +392,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) {
for (i = 0; i < msidesc->nvec_used; i++)
xen_destroy_irq(msidesc->irq + i);
+ msidesc->irq = 0;
}
}
@@ -433,6 +434,7 @@ static struct msi_domain_ops xen_pci_msi_domain_ops = {
};
static struct msi_domain_info xen_pci_msi_domain_info = {
+ .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
.ops = &xen_pci_msi_domain_ops,
};
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 55d9caf66401..f3f2d87cce1b 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -380,7 +380,7 @@ static int __init efi_systab_init(unsigned long phys)
return -ENOMEM;
}
- ret = efi_systab_check_header(hdr, 1);
+ ret = efi_systab_check_header(hdr);
if (ret) {
early_memunmap(p, size);
return ret;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index b36596bf0fc3..232acf418cfb 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -389,10 +389,15 @@ static int __init efi_update_mappings(efi_memory_desc_t *md, unsigned long pf)
return err1 || err2;
}
-static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *md)
+bool efi_disable_ibt_for_runtime __ro_after_init = true;
+
+static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *md,
+ bool has_ibt)
{
unsigned long pf = 0;
+ efi_disable_ibt_for_runtime |= !has_ibt;
+
if (md->attribute & EFI_MEMORY_XP)
pf |= _PAGE_NX;
@@ -414,6 +419,7 @@ void __init efi_runtime_update_mappings(void)
* exists, since it is intended to supersede EFI_PROPERTIES_TABLE.
*/
if (efi_enabled(EFI_MEM_ATTR)) {
+ efi_disable_ibt_for_runtime = false;
efi_memattr_apply_permissions(NULL, efi_update_mem_attr);
return;
}
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 1a536a187d74..ee21d6a36a80 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -166,10 +166,9 @@ static struct irq_domain *uv_get_irq_domain(void)
if (!fn)
goto out;
- uv_domain = irq_domain_create_tree(fn, &uv_domain_ops, NULL);
- if (uv_domain)
- uv_domain->parent = x86_vector_domain;
- else
+ uv_domain = irq_domain_create_hierarchy(x86_vector_domain, 0, 0, fn,
+ &uv_domain_ops, NULL);
+ if (!uv_domain)
irq_domain_free_fwnode(fn);
out:
mutex_unlock(&uv_lock);
diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c
index cafd01f730da..29b2203bc82c 100644
--- a/arch/x86/um/mem_32.c
+++ b/arch/x86/um/mem_32.c
@@ -16,7 +16,7 @@ static int __init gate_vma_init(void)
vma_init(&gate_vma, NULL);
gate_vma.vm_start = FIXADDR_USER_START;
gate_vma.vm_end = FIXADDR_USER_END;
- gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+ vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC);
gate_vma.vm_page_prot = PAGE_READONLY;
return 0;
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 5b1379662877..bb59cc6ddb2d 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -276,6 +276,7 @@ static void __init xen_init_capabilities(void)
setup_clear_cpu_cap(X86_FEATURE_ACC);
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
setup_clear_cpu_cap(X86_FEATURE_SME);
+ setup_clear_cpu_cap(X86_FEATURE_LKGS);
/*
* Xen PV would need some work to support PCID: CR3 handling as well
@@ -1068,7 +1069,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
.write_cr4 = xen_write_cr4,
- .wbinvd = native_wbinvd,
+ .wbinvd = pv_native_wbinvd,
.read_msr = xen_read_msr,
.write_msr = xen_write_msr,
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 06c3c2fb4b06..6092fea7d651 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -24,7 +24,7 @@ noinstr void xen_force_evtchn_callback(void)
(void)HYPERVISOR_xen_version(0, NULL);
}
-static void xen_safe_halt(void)
+static noinstr void xen_safe_halt(void)
{
/* Blocking includes an implicit local_irq_enable(). */
if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8db26f10fb1d..c2be3efb2ba0 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -934,12 +934,8 @@ void xen_enable_syscall(void)
static void __init xen_pvmmu_arch_setup(void)
{
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
- HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_pae_extended_cr3);
-
if (register_callback(CALLBACKTYPE_event,
xen_asm_exc_xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index bd02f9d50107..22fb982ff971 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -21,6 +21,8 @@ void xen_smp_send_reschedule(int cpu);
void xen_smp_send_call_function_ipi(const struct cpumask *mask);
void xen_smp_send_call_function_single_ipi(int cpu);
+void __noreturn xen_cpu_bringup_again(unsigned long stack);
+
struct xen_common_irq {
int irq;
char *name;
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 6175f2c5c822..a9cf8c8fa074 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -381,21 +381,12 @@ static void xen_pv_cpu_die(unsigned int cpu)
}
}
-static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
+static void __noreturn xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
- cpu_bringup();
- /*
- * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
- * clears certain data that the cpu_idle loop (which called us
- * and that we return from) expects. The only way to get that
- * data back is to call:
- */
- tick_nohz_idle_enter();
- tick_nohz_idle_stop_tick_protected();
-
- cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
+ xen_cpu_bringup_again((unsigned long)task_pt_regs(current));
+ BUG();
}
#else /* !CONFIG_HOTPLUG_CPU */
@@ -409,7 +400,7 @@ static void xen_pv_cpu_die(unsigned int cpu)
BUG();
}
-static void xen_pv_play_dead(void)
+static void __noreturn xen_pv_play_dead(void)
{
BUG();
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 9ef0a5cca96e..1d597364b49d 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -60,9 +60,17 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs)
return xen_clocksource_read();
}
-static u64 xen_sched_clock(void)
+static noinstr u64 xen_sched_clock(void)
{
- return xen_clocksource_read() - xen_sched_clock_offset;
+ struct pvclock_vcpu_time_info *src;
+ u64 ret;
+
+ preempt_disable_notrace();
+ src = &__this_cpu_read(xen_vcpu)->time;
+ ret = pvclock_clocksource_read_nowd(src);
+ ret -= xen_sched_clock_offset;
+ preempt_enable_notrace();
+ return ret;
}
static void xen_read_wallclock(struct timespec64 *ts)
@@ -474,15 +482,51 @@ static void xen_setup_vsyscall_time_info(void)
xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}
+/*
+ * Check if it is possible to safely use the tsc as a clocksource. This is
+ * only true if the hypervisor notifies the guest that its tsc is invariant,
+ * the tsc is stable, and the tsc instruction will never be emulated.
+ */
+static int __init xen_tsc_safe_clocksource(void)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)))
+ return 0;
+
+ if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC)))
+ return 0;
+
+ if (check_tsc_unstable())
+ return 0;
+
+ /* Leaf 4, sub-leaf 0 (0x40000x03) */
+ cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx);
+
+ /* tsc_mode = no_emulate (2) */
+ if (ebx != 2)
+ return 0;
+
+ return 1;
+}
+
static void __init xen_time_init(void)
{
struct pvclock_vcpu_time_info *pvti;
int cpu = smp_processor_id();
struct timespec64 tp;
- /* As Dom0 is never moved, no penalty on using TSC there */
+ /*
+ * As Dom0 is never moved, no penalty on using TSC there.
+ *
+ * If it is possible for the guest to determine that the tsc is a safe
+ * clocksource, then set xen_clocksource rating below that of the tsc
+ * so that the system prefers tsc instead.
+ */
if (xen_initial_domain())
xen_clocksource.rating = 275;
+ else if (xen_tsc_safe_clocksource())
+ xen_clocksource.rating = 299;
clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index ffaa62167f6e..e36ea4268bd2 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -76,6 +76,13 @@ SYM_CODE_START(asm_cpu_bringup_and_idle)
call cpu_bringup_and_idle
SYM_CODE_END(asm_cpu_bringup_and_idle)
+
+SYM_CODE_START(xen_cpu_bringup_again)
+ UNWIND_HINT_FUNC
+ mov %rdi, %rsp
+ UNWIND_HINT_REGS
+ call cpu_bringup_and_idle
+SYM_CODE_END(xen_cpu_bringup_again)
.popsection
#endif
#endif