summaryrefslogtreecommitdiff
path: root/arch/arm64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64')
-rw-r--r--arch/arm64/Kconfig90
-rw-r--r--arch/arm64/Makefile23
-rw-r--r--arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi2
-rw-r--r--arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi4
-rw-r--r--arch/arm64/boot/dts/freescale/Makefile1
-rw-r--r--arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts1
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp.dtsi2
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mq.dtsi2
-rw-r--r--arch/arm64/boot/dts/marvell/armada-37xx.dtsi2
-rw-r--r--arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts50
-rw-r--r--arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts63
-rw-r--r--arch/arm64/boot/dts/xilinx/zynqmp.dtsi12
-rw-r--r--arch/arm64/configs/defconfig12
-rw-r--r--arch/arm64/crypto/aes-neonbs-core.S4
-rw-r--r--arch/arm64/crypto/ghash-ce-glue.c4
-rw-r--r--arch/arm64/crypto/sha1-ce-glue.c3
-rw-r--r--arch/arm64/crypto/sha2-ce-glue.c3
-rw-r--r--arch/arm64/include/asm/arch_gicv3.h8
-rw-r--r--arch/arm64/include/asm/archrandom.h5
-rw-r--r--arch/arm64/include/asm/assembler.h29
-rw-r--r--arch/arm64/include/asm/barrier.h1
-rw-r--r--arch/arm64/include/asm/boot.h3
-rw-r--r--arch/arm64/include/asm/compat.h2
-rw-r--r--arch/arm64/include/asm/cpu_ops.h2
-rw-r--r--arch/arm64/include/asm/cpucaps.h7
-rw-r--r--arch/arm64/include/asm/cpufeature.h32
-rw-r--r--arch/arm64/include/asm/efi.h5
-rw-r--r--arch/arm64/include/asm/esr.h4
-rw-r--r--arch/arm64/include/asm/exception.h1
-rw-r--r--arch/arm64/include/asm/extable.h9
-rw-r--r--arch/arm64/include/asm/fpsimd.h3
-rw-r--r--arch/arm64/include/asm/fpsimdmacros.h48
-rw-r--r--arch/arm64/include/asm/hardirq.h9
-rw-r--r--arch/arm64/include/asm/hwcap.h11
-rw-r--r--arch/arm64/include/asm/hyp_image.h36
-rw-r--r--arch/arm64/include/asm/insn.h4
-rw-r--r--arch/arm64/include/asm/io.h1
-rw-r--r--arch/arm64/include/asm/irq_work.h4
-rw-r--r--arch/arm64/include/asm/kernel-pgtable.h2
-rw-r--r--arch/arm64/include/asm/kvm_arm.h5
-rw-r--r--arch/arm64/include/asm/kvm_asm.h192
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h28
-rw-r--r--arch/arm64/include/asm/kvm_host.h77
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h9
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h341
-rw-r--r--arch/arm64/include/asm/kvm_pgtable.h309
-rw-r--r--arch/arm64/include/asm/kvm_ptrauth.h6
-rw-r--r--arch/arm64/include/asm/memory.h24
-rw-r--r--arch/arm64/include/asm/mman.h56
-rw-r--r--arch/arm64/include/asm/mmu.h14
-rw-r--r--arch/arm64/include/asm/mmu_context.h11
-rw-r--r--arch/arm64/include/asm/module.lds.h (renamed from arch/arm64/kernel/module.lds)2
-rw-r--r--arch/arm64/include/asm/mte.h86
-rw-r--r--arch/arm64/include/asm/numa.h3
-rw-r--r--arch/arm64/include/asm/page-def.h5
-rw-r--r--arch/arm64/include/asm/page.h19
-rw-r--r--arch/arm64/include/asm/pci.h1
-rw-r--r--arch/arm64/include/asm/percpu.h28
-rw-r--r--arch/arm64/include/asm/perf_event.h3
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h40
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h28
-rw-r--r--arch/arm64/include/asm/pgtable.h114
-rw-r--r--arch/arm64/include/asm/processor.h56
-rw-r--r--arch/arm64/include/asm/ptrace.h14
-rw-r--r--arch/arm64/include/asm/smp.h16
-rw-r--r--arch/arm64/include/asm/spectre.h32
-rw-r--r--arch/arm64/include/asm/stacktrace.h2
-rw-r--r--arch/arm64/include/asm/stage2_pgtable.h215
-rw-r--r--arch/arm64/include/asm/sysreg.h87
-rw-r--r--arch/arm64/include/asm/thread_info.h4
-rw-r--r--arch/arm64/include/asm/topology.h2
-rw-r--r--arch/arm64/include/asm/traps.h2
-rw-r--r--arch/arm64/include/asm/unistd.h2
-rw-r--r--arch/arm64/include/asm/unistd32.h14
-rw-r--r--arch/arm64/include/asm/xen/page.h6
-rw-r--r--arch/arm64/include/uapi/asm/hwcap.h2
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h25
-rw-r--r--arch/arm64/include/uapi/asm/mman.h1
-rw-r--r--arch/arm64/include/uapi/asm/ptrace.h4
-rw-r--r--arch/arm64/kernel/Makefile6
-rw-r--r--arch/arm64/kernel/acpi.c22
-rw-r--r--arch/arm64/kernel/cpu-reset.S4
-rw-r--r--arch/arm64/kernel/cpu_errata.c495
-rw-r--r--arch/arm64/kernel/cpufeature.c132
-rw-r--r--arch/arm64/kernel/cpuinfo.c178
-rw-r--r--arch/arm64/kernel/debug-monitors.c2
-rw-r--r--arch/arm64/kernel/entry-common.c21
-rw-r--r--arch/arm64/kernel/entry-fpsimd.S25
-rw-r--r--arch/arm64/kernel/entry.S45
-rw-r--r--arch/arm64/kernel/fpsimd.c12
-rw-r--r--arch/arm64/kernel/head.S16
-rw-r--r--arch/arm64/kernel/hibernate.c125
-rw-r--r--arch/arm64/kernel/image-vars.h7
-rw-r--r--arch/arm64/kernel/image.h1
-rw-r--r--arch/arm64/kernel/insn.c11
-rw-r--r--arch/arm64/kernel/irq.c11
-rw-r--r--arch/arm64/kernel/machine_kexec_file.c6
-rw-r--r--arch/arm64/kernel/mte.c336
-rw-r--r--arch/arm64/kernel/paravirt.c26
-rw-r--r--arch/arm64/kernel/perf_callchain.c6
-rw-r--r--arch/arm64/kernel/perf_event.c272
-rw-r--r--arch/arm64/kernel/perf_regs.c2
-rw-r--r--arch/arm64/kernel/pointer_auth.c4
-rw-r--r--arch/arm64/kernel/probes/decode-insn.c9
-rw-r--r--arch/arm64/kernel/probes/kprobes.c78
-rw-r--r--arch/arm64/kernel/process.c71
-rw-r--r--arch/arm64/kernel/proton-pack.c790
-rw-r--r--arch/arm64/kernel/ptrace.c51
-rw-r--r--arch/arm64/kernel/relocate_kernel.S12
-rw-r--r--arch/arm64/kernel/return_address.c8
-rw-r--r--arch/arm64/kernel/setup.c4
-rw-r--r--arch/arm64/kernel/signal.c13
-rw-r--r--arch/arm64/kernel/smccc-call.S2
-rw-r--r--arch/arm64/kernel/smp.c129
-rw-r--r--arch/arm64/kernel/smp_spin_table.c4
-rw-r--r--arch/arm64/kernel/ssbd.c129
-rw-r--r--arch/arm64/kernel/stacktrace.c117
-rw-r--r--arch/arm64/kernel/suspend.c7
-rw-r--r--arch/arm64/kernel/syscall.c10
-rw-r--r--arch/arm64/kernel/topology.c41
-rw-r--r--arch/arm64/kernel/traps.c132
-rw-r--r--arch/arm64/kernel/vdso.c51
-rw-r--r--arch/arm64/kernel/vdso/Makefile12
-rwxr-xr-xarch/arm64/kernel/vdso/gen_vdso_offsets.sh2
-rw-r--r--arch/arm64/kernel/vdso32/Makefile8
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S45
-rw-r--r--arch/arm64/kvm/Kconfig3
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/arm.c113
-rw-r--r--arch/arm64/kvm/hyp.S34
-rw-r--r--arch/arm64/kvm/hyp/Makefile3
-rw-r--r--arch/arm64/kvm/hyp/entry.S95
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S107
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/debug-sr.h4
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h50
-rw-r--r--arch/arm64/kvm/hyp/nvhe/.gitignore2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile62
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S187
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S67
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c117
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp.lds.S19
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c52
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c9
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c892
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c35
-rw-r--r--arch/arm64/kvm/hyp/vhe/sysreg-sr.c4
-rw-r--r--arch/arm64/kvm/hypercalls.c33
-rw-r--r--arch/arm64/kvm/inject_fault.c1
-rw-r--r--arch/arm64/kvm/mmu.c1611
-rw-r--r--arch/arm64/kvm/pmu-emul.c221
-rw-r--r--arch/arm64/kvm/pmu.c13
-rw-r--r--arch/arm64/kvm/psci.c74
-rw-r--r--arch/arm64/kvm/pvtime.c29
-rw-r--r--arch/arm64/kvm/reset.c44
-rw-r--r--arch/arm64/kvm/sys_regs.c24
-rw-r--r--arch/arm64/kvm/trace_arm.h16
-rw-r--r--arch/arm64/kvm/trace_handle_exit.h6
-rw-r--r--arch/arm64/kvm/vgic/vgic-debug.c24
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v3.c4
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c4
-rw-r--r--arch/arm64/lib/Makefile2
-rw-r--r--arch/arm64/lib/mte.S151
-rw-r--r--arch/arm64/mm/Makefile3
-rw-r--r--arch/arm64/mm/context.c105
-rw-r--r--arch/arm64/mm/copypage.c25
-rw-r--r--arch/arm64/mm/dma-mapping.c2
-rw-r--r--arch/arm64/mm/extable.c4
-rw-r--r--arch/arm64/mm/fault.c13
-rw-r--r--arch/arm64/mm/init.c46
-rw-r--r--arch/arm64/mm/kasan_init.c10
-rw-r--r--arch/arm64/mm/mmu.c33
-rw-r--r--arch/arm64/mm/mteswap.c83
-rw-r--r--arch/arm64/mm/numa.c23
-rw-r--r--arch/arm64/mm/pageattr.c1
-rw-r--r--arch/arm64/mm/proc.S32
-rw-r--r--arch/arm64/mm/ptdump.c (renamed from arch/arm64/mm/dump.c)6
-rw-r--r--arch/arm64/net/bpf_jit_comp.c43
177 files changed, 6067 insertions, 4355 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6d232837cbee..f858c352f72a 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -29,6 +29,7 @@ config ARM64
select ARCH_HAS_SETUP_DMA_OPS
select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_SET_MEMORY
+ select ARCH_STACKWALK
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
@@ -106,6 +107,7 @@ config ARM64
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
select GENERIC_IDLE_POLL_SETUP
+ select GENERIC_IRQ_IPI
select GENERIC_IRQ_MULTI_HANDLER
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
@@ -121,6 +123,7 @@ config ARM64
select GENERIC_VDSO_TIME_NS
select HANDLE_DOMAIN_IRQ
select HARDIRQS_SW_RESEND
+ select HAVE_MOVE_PMD
select HAVE_PCI
select HAVE_ACPI_APEI if (ACPI && EFI)
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
@@ -192,6 +195,7 @@ config ARM64
select PCI_SYSCALL if PCI
select POWER_RESET
select POWER_SUPPLY
+ select SET_FS
select SPARSE_IRQ
select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
@@ -211,12 +215,18 @@ config ARM64_PAGE_SHIFT
default 14 if ARM64_16K_PAGES
default 12
-config ARM64_CONT_SHIFT
+config ARM64_CONT_PTE_SHIFT
int
default 5 if ARM64_64K_PAGES
default 7 if ARM64_16K_PAGES
default 4
+config ARM64_CONT_PMD_SHIFT
+ int
+ default 5 if ARM64_64K_PAGES
+ default 5 if ARM64_16K_PAGES
+ default 4
+
config ARCH_MMAP_RND_BITS_MIN
default 14 if ARM64_64K_PAGES
default 16 if ARM64_16K_PAGES
@@ -1033,19 +1043,6 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
config CC_HAVE_SHADOW_CALL_STACK
def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18)
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
config PARAVIRT
bool "Enable paravirtualization code"
help
@@ -1165,32 +1162,6 @@ config UNMAP_KERNEL_AT_EL0
If unsure, say Y.
-config HARDEN_BRANCH_PREDICTOR
- bool "Harden the branch predictor against aliasing attacks" if EXPERT
- default y
- help
- Speculation attacks against some high-performance processors rely on
- being able to manipulate the branch predictor for a victim context by
- executing aliasing branches in the attacker context. Such attacks
- can be partially mitigated against by clearing internal branch
- predictor state and limiting the prediction logic in some situations.
-
- This config option will take CPU-specific actions to harden the
- branch predictor against aliasing attacks and may rely on specific
- instruction sequences or control bits being set by the system
- firmware.
-
- If unsure, say Y.
-
-config ARM64_SSBD
- bool "Speculative Store Bypass Disable" if EXPERT
- default y
- help
- This enables mitigation of the bypassing of previous stores
- by speculative loads.
-
- If unsure, say Y.
-
config RODATA_FULL_DEFAULT_ENABLED
bool "Apply r/o permissions of VM areas also to their linear aliases"
default y
@@ -1630,8 +1601,6 @@ config ARM64_BTI_KERNEL
depends on CC_HAS_BRANCH_PROT_PAC_RET_BTI
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94697
depends on !CC_IS_GCC || GCC_VERSION >= 100100
- # https://reviews.llvm.org/rGb8ae3fdfa579dbf366b1bb1cbfdbf8c51db7fa55
- depends on !CC_IS_CLANG || CLANG_VERSION >= 100001
depends on !(CC_IS_CLANG && GCOV_KERNEL)
depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
help
@@ -1664,6 +1633,39 @@ config ARCH_RANDOM
provides a high bandwidth, cryptographically secure
hardware random number generator.
+config ARM64_AS_HAS_MTE
+ # Initial support for MTE went in binutils 2.32.0, checked with
+ # ".arch armv8.5-a+memtag" below. However, this was incomplete
+ # as a late addition to the final architecture spec (LDGM/STGM)
+ # is only supported in the newer 2.32.x and 2.33 binutils
+ # versions, hence the extra "stgm" instruction check below.
+ def_bool $(as-instr,.arch armv8.5-a+memtag\nstgm xzr$(comma)[x0])
+
+config ARM64_MTE
+ bool "Memory Tagging Extension support"
+ default y
+ depends on ARM64_AS_HAS_MTE && ARM64_TAGGED_ADDR_ABI
+ select ARCH_USES_HIGH_VMA_FLAGS
+ help
+ Memory Tagging (part of the ARMv8.5 Extensions) provides
+ architectural support for run-time, always-on detection of
+ various classes of memory error to aid with software debugging
+ to eliminate vulnerabilities arising from memory-unsafe
+ languages.
+
+ This option enables the support for the Memory Tagging
+ Extension at EL0 (i.e. for userspace).
+
+ Selecting this option allows the feature to be detected at
+ runtime. Any secondary CPU not implementing this feature will
+ not be allowed a late bring-up.
+
+ Userspace binaries that want to use this feature must
+ explicitly opt in. The mechanism for the userspace is
+ described in:
+
+ Documentation/arm64/memory-tagging-extension.rst.
+
endmenu
config ARM64_SVE
@@ -1876,6 +1878,10 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
def_bool y
depends on HUGETLB_PAGE && MIGRATION
+config ARCH_ENABLE_THP_MIGRATION
+ def_bool y
+ depends on TRANSPARENT_HUGEPAGE
+
menu "Power management options"
source "kernel/power/Kconfig"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 130569f90c54..5789c2d18d43 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -10,14 +10,13 @@
#
# Copyright (C) 1995-2001 by Russell King
-LDFLAGS_vmlinux :=--no-undefined -X
-CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
+LDFLAGS_vmlinux :=--no-undefined -X -z norelro
ifeq ($(CONFIG_RELOCATABLE), y)
# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour
# for relative relocs, since this leads to better Image compression
# with the relocation offsets always being zero.
-LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro \
+LDFLAGS_vmlinux += -shared -Bsymbolic -z notext \
$(call ld-option, --no-apply-dynamic-relocs)
endif
@@ -29,6 +28,10 @@ LDFLAGS_vmlinux += --fix-cortex-a53-843419
endif
endif
+# We never want expected sections to be placed heuristically by the
+# linker. All sections should be explicitly named in the linker script.
+LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn)
+
ifeq ($(CONFIG_ARM64_USE_LSE_ATOMICS), y)
ifneq ($(CONFIG_ARM64_LSE_ATOMICS), y)
$(warning LSE atomics not supported by binutils)
@@ -47,13 +50,16 @@ endif
KBUILD_CFLAGS += -mgeneral-regs-only \
$(compat_vdso) $(cc_has_k_constraint)
-KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
KBUILD_CFLAGS += $(call cc-disable-warning, psabi)
KBUILD_AFLAGS += $(compat_vdso)
KBUILD_CFLAGS += $(call cc-option,-mabi=lp64)
KBUILD_AFLAGS += $(call cc-option,-mabi=lp64)
+# Avoid generating .eh_frame* sections.
+KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables
+KBUILD_AFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables
+
ifeq ($(CONFIG_STACKPROTECTOR_PER_TASK),y)
prepare: stack_protector_prepare
stack_protector_prepare: prepare0
@@ -120,10 +126,6 @@ endif
CHECKFLAGS += -D__aarch64__
-ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)
-KBUILD_LDS_MODULE += $(srctree)/arch/arm64/kernel/module.lds
-endif
-
ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y)
KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
CC_FLAGS_FTRACE := -fpatchable-function-entry=2
@@ -132,9 +134,6 @@ endif
# Default value
head-y := arch/arm64/kernel/head.o
-# The byte offset of the kernel image in RAM from the start of RAM.
-TEXT_OFFSET := 0x0
-
ifeq ($(CONFIG_KASAN_SW_TAGS), y)
KASAN_SHADOW_SCALE_SHIFT := 4
else
@@ -145,8 +144,6 @@ KBUILD_CFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT)
KBUILD_CPPFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT)
KBUILD_AFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT)
-export TEXT_OFFSET
-
core-y += arch/arm64/
libs-y := arch/arm64/lib/ $(libs-y)
libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
diff --git a/arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi b/arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi
index 15f7b0ed3836..39802066232e 100644
--- a/arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi
+++ b/arch/arm64/boot/dts/broadcom/northstar2/ns2.dtsi
@@ -745,7 +745,7 @@
};
qspi: spi@66470200 {
- compatible = "brcm,spi-bcm-qspi", "brcm,spi-ns2-qspi";
+ compatible = "brcm,spi-ns2-qspi", "brcm,spi-bcm-qspi";
reg = <0x66470200 0x184>,
<0x66470000 0x124>,
<0x67017408 0x004>,
diff --git a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi
index 250fc01de78d..24aab3ea3f52 100644
--- a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi
+++ b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi
@@ -795,8 +795,8 @@
reg = <0x27>;
interrupt-parent = <&gpa1>;
interrupts = <3 IRQ_TYPE_LEVEL_HIGH>;
- s3fwrn5,en-gpios = <&gpf1 4 GPIO_ACTIVE_HIGH>;
- s3fwrn5,fw-gpios = <&gpj0 2 GPIO_ACTIVE_HIGH>;
+ en-gpios = <&gpf1 4 GPIO_ACTIVE_HIGH>;
+ wake-gpios = <&gpj0 2 GPIO_ACTIVE_HIGH>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/Makefile b/arch/arm64/boot/dts/freescale/Makefile
index a39f0a1723e0..903c0eb61290 100644
--- a/arch/arm64/boot/dts/freescale/Makefile
+++ b/arch/arm64/boot/dts/freescale/Makefile
@@ -28,6 +28,7 @@ dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2160a-honeycomb.dtb
dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2160a-qds.dtb
dtb-$(CONFIG_ARCH_LAYERSCAPE) += fsl-lx2160a-rdb.dtb
+dtb-$(CONFIG_ARCH_MXC) += imx8mm-beacon-kit.dtb
dtb-$(CONFIG_ARCH_MXC) += imx8mm-evk.dtb
dtb-$(CONFIG_ARCH_MXC) += imx8mn-evk.dtb
dtb-$(CONFIG_ARCH_MXC) += imx8mn-ddr4-evk.dtb
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts
index c2dc1232f93f..1efb61cff454 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts
@@ -199,6 +199,7 @@
&enetc_port0 {
phy-handle = <&sgmii_phy0>;
phy-connection-type = "sgmii";
+ managed = "in-band-status";
status = "okay";
mdio {
diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi
index 9de2aa1c573c..a5154f13a18e 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi
@@ -702,7 +702,7 @@
reg = <0x30bd0000 0x10000>;
interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX8MP_CLK_SDMA1_ROOT>,
- <&clk IMX8MP_CLK_SDMA1_ROOT>;
+ <&clk IMX8MP_CLK_AHB>;
clock-names = "ipg", "ahb";
#dma-cells = <3>;
fsl,sdma-ram-script-name = "imx/sdma/sdma-imx7d.bin";
diff --git a/arch/arm64/boot/dts/freescale/imx8mq.dtsi b/arch/arm64/boot/dts/freescale/imx8mq.dtsi
index f70435cf9ad5..561fa792fe5a 100644
--- a/arch/arm64/boot/dts/freescale/imx8mq.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mq.dtsi
@@ -423,7 +423,7 @@
tmu: tmu@30260000 {
compatible = "fsl,imx8mq-tmu";
reg = <0x30260000 0x10000>;
- interrupt = <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX8MQ_CLK_TMU_ROOT>;
little-endian;
fsl,tmu-range = <0xb0000 0xa0026 0x80048 0x70061>;
diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
index 2bbc69b4dc99..d5b6c0a1c54a 100644
--- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
@@ -316,7 +316,7 @@
};
pcie_reset_pins: pcie-reset-pins {
- groups = "pcie1";
+ groups = "pcie1"; /* this actually controls "pcie1_reset" */
function = "gpio";
};
diff --git a/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts b/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts
index d174ad214857..9a11e5c60c26 100644
--- a/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts
+++ b/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts
@@ -143,6 +143,56 @@
mdio: mdio-bus {
#address-cells = <1>;
#size-cells = <0>;
+
+ switch@0 {
+ compatible = "mediatek,mt7531";
+ reg = <0>;
+ reset-gpios = <&pio 54 0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ label = "wan";
+ };
+
+ port@1 {
+ reg = <1>;
+ label = "lan0";
+ };
+
+ port@2 {
+ reg = <2>;
+ label = "lan1";
+ };
+
+ port@3 {
+ reg = <3>;
+ label = "lan2";
+ };
+
+ port@4 {
+ reg = <4>;
+ label = "lan3";
+ };
+
+ port@6 {
+ reg = <6>;
+ label = "cpu";
+ ethernet = <&gmac0>;
+ phy-mode = "2500base-x";
+
+ fixed-link {
+ speed = <2500>;
+ full-duplex;
+ pause;
+ };
+ };
+ };
+ };
+
};
};
diff --git a/arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts b/arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts
index 0b4de627f96e..08ad0ffb24df 100644
--- a/arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts
+++ b/arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts
@@ -105,20 +105,71 @@
pinctrl-0 = <&eth_pins>;
status = "okay";
- gmac1: mac@1 {
+ gmac0: mac@0 {
compatible = "mediatek,eth-mac";
- reg = <1>;
- phy-handle = <&phy5>;
+ reg = <0>;
+ phy-mode = "2500base-x";
+
+ fixed-link {
+ speed = <2500>;
+ full-duplex;
+ pause;
+ };
};
mdio-bus {
#address-cells = <1>;
#size-cells = <0>;
- phy5: ethernet-phy@5 {
- reg = <5>;
- phy-mode = "sgmii";
+ switch@0 {
+ compatible = "mediatek,mt7531";
+ reg = <0>;
+ reset-gpios = <&pio 54 0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ label = "lan0";
+ };
+
+ port@1 {
+ reg = <1>;
+ label = "lan1";
+ };
+
+ port@2 {
+ reg = <2>;
+ label = "lan2";
+ };
+
+ port@3 {
+ reg = <3>;
+ label = "lan3";
+ };
+
+ port@4 {
+ reg = <4>;
+ label = "wan";
+ };
+
+ port@6 {
+ reg = <6>;
+ label = "cpu";
+ ethernet = <&gmac0>;
+ phy-mode = "2500base-x";
+
+ fixed-link {
+ speed = <2500>;
+ full-duplex;
+ pause;
+ };
+ };
+ };
};
+
};
};
diff --git a/arch/arm64/boot/dts/xilinx/zynqmp.dtsi b/arch/arm64/boot/dts/xilinx/zynqmp.dtsi
index 9174ddc76bdc..3ec99f13c259 100644
--- a/arch/arm64/boot/dts/xilinx/zynqmp.dtsi
+++ b/arch/arm64/boot/dts/xilinx/zynqmp.dtsi
@@ -13,6 +13,7 @@
*/
#include <dt-bindings/power/xlnx-zynqmp-power.h>
+#include <dt-bindings/reset/xlnx-zynqmp-resets.h>
/ {
compatible = "xlnx,zynqmp";
@@ -558,6 +559,15 @@
};
};
+ psgtr: phy@fd400000 {
+ compatible = "xlnx,zynqmp-psgtr-v1.1";
+ status = "disabled";
+ reg = <0x0 0xfd400000 0x0 0x40000>,
+ <0x0 0xfd3d0000 0x0 0x1000>;
+ reg-names = "serdes", "siou";
+ #phy-cells = <4>;
+ };
+
rtc: rtc@ffa60000 {
compatible = "xlnx,zynqmp-rtc";
status = "disabled";
@@ -601,7 +611,7 @@
power-domains = <&zynqmp_firmware PD_SD_1>;
};
- smmu: smmu@fd800000 {
+ smmu: iommu@fd800000 {
compatible = "arm,mmu-500";
reg = <0x0 0xfd800000 0x0 0x20000>;
status = "disabled";
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index e0f33826819f..6d04b9577b0b 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -724,6 +724,17 @@ CONFIG_USB_GADGET=y
CONFIG_USB_RENESAS_USBHS_UDC=m
CONFIG_USB_RENESAS_USB3=m
CONFIG_USB_TEGRA_XUDC=m
+CONFIG_USB_CONFIGFS=m
+CONFIG_USB_CONFIGFS_SERIAL=y
+CONFIG_USB_CONFIGFS_ACM=y
+CONFIG_USB_CONFIGFS_OBEX=y
+CONFIG_USB_CONFIGFS_NCM=y
+CONFIG_USB_CONFIGFS_ECM=y
+CONFIG_USB_CONFIGFS_ECM_SUBSET=y
+CONFIG_USB_CONFIGFS_RNDIS=y
+CONFIG_USB_CONFIGFS_EEM=y
+CONFIG_USB_CONFIGFS_MASS_STORAGE=y
+CONFIG_USB_CONFIGFS_F_FS=y
CONFIG_TYPEC=m
CONFIG_TYPEC_TCPM=m
CONFIG_TYPEC_FUSB302=m
@@ -914,6 +925,7 @@ CONFIG_ARCH_TEGRA_194_SOC=y
CONFIG_ARCH_K3_AM6_SOC=y
CONFIG_ARCH_K3_J721E_SOC=y
CONFIG_TI_SCI_PM_DOMAINS=y
+CONFIG_EXTCON_PTN5150=m
CONFIG_EXTCON_USB_GPIO=y
CONFIG_EXTCON_USBC_CROS_EC=y
CONFIG_IIO=y
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
index b357164379f6..63a52ad9a75c 100644
--- a/arch/arm64/crypto/aes-neonbs-core.S
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -788,7 +788,7 @@ SYM_FUNC_START_LOCAL(__xts_crypt8)
0: mov bskey, x21
mov rounds, x22
- br x7
+ br x16
SYM_FUNC_END(__xts_crypt8)
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
@@ -806,7 +806,7 @@ SYM_FUNC_END(__xts_crypt8)
uzp1 v30.4s, v30.4s, v25.4s
ld1 {v25.16b}, [x24]
-99: adr x7, \do8
+99: adr x16, \do8
bl __xts_crypt8
ldp q16, q17, [sp, #.Lframe_local_offset]
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index da1034867aaa..8536008e3e35 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -347,7 +347,7 @@ static int gcm_encrypt(struct aead_request *req)
u8 buf[AES_BLOCK_SIZE];
u8 iv[AES_BLOCK_SIZE];
u64 dg[2] = {};
- u128 lengths;
+ be128 lengths;
u8 *tag;
int err;
@@ -461,7 +461,7 @@ static int gcm_decrypt(struct aead_request *req)
u8 buf[AES_BLOCK_SIZE];
u8 iv[AES_BLOCK_SIZE];
u64 dg[2] = {};
- u128 lengths;
+ be128 lengths;
u8 *tag;
int err;
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index 565ef604ca04..c63b99211db3 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -25,6 +25,9 @@ struct sha1_ce_state {
u32 finalize;
};
+extern const u32 sha1_ce_offsetof_count;
+extern const u32 sha1_ce_offsetof_finalize;
+
asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
int blocks);
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index 9450d19b9e6e..5e956d7582a5 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -25,6 +25,9 @@ struct sha256_ce_state {
u32 finalize;
};
+extern const u32 sha256_ce_offsetof_count;
+extern const u32 sha256_ce_offsetof_finalize;
+
asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
int blocks);
diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h
index 6647ae4f0231..880b9054d75c 100644
--- a/arch/arm64/include/asm/arch_gicv3.h
+++ b/arch/arm64/include/asm/arch_gicv3.h
@@ -153,7 +153,7 @@ static inline bool gic_prio_masking_enabled(void)
static inline void gic_pmr_mask_irqs(void)
{
- BUILD_BUG_ON(GICD_INT_DEF_PRI < (GIC_PRIO_IRQOFF |
+ BUILD_BUG_ON(GICD_INT_DEF_PRI < (__GIC_PRIO_IRQOFF |
GIC_PRIO_PSR_I_SET));
BUILD_BUG_ON(GICD_INT_DEF_PRI >= GIC_PRIO_IRQON);
/*
@@ -162,6 +162,12 @@ static inline void gic_pmr_mask_irqs(void)
* are applied to IRQ priorities
*/
BUILD_BUG_ON((0x80 | (GICD_INT_DEF_PRI >> 1)) >= GIC_PRIO_IRQON);
+ /*
+ * Same situation as above, but now we make sure that we can mask
+ * regular interrupts.
+ */
+ BUILD_BUG_ON((0x80 | (GICD_INT_DEF_PRI >> 1)) < (__GIC_PRIO_IRQOFF_NS |
+ GIC_PRIO_PSR_I_SET));
gic_write_pmr(GIC_PRIO_IRQOFF);
}
diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h
index 44209f6146aa..ffb1a40d5475 100644
--- a/arch/arm64/include/asm/archrandom.h
+++ b/arch/arm64/include/asm/archrandom.h
@@ -79,10 +79,5 @@ arch_get_random_seed_long_early(unsigned long *v)
}
#define arch_get_random_seed_long_early arch_get_random_seed_long_early
-#else
-
-static inline bool __arm64_rndr(unsigned long *v) { return false; }
-static inline bool __init __early_cpu_has_rndr(void) { return false; }
-
#endif /* CONFIG_ARCH_RANDOM */
#endif /* _ASM_ARCHRANDOM_H */
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 54d181177656..ddbe6bf00e33 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -219,6 +219,23 @@ lr .req x30 // link register
.endm
/*
+ * @dst: destination register
+ */
+#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
+ .macro this_cpu_offset, dst
+ mrs \dst, tpidr_el2
+ .endm
+#else
+ .macro this_cpu_offset, dst
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+ mrs \dst, tpidr_el1
+alternative_else
+ mrs \dst, tpidr_el2
+alternative_endif
+ .endm
+#endif
+
+ /*
* @dst: Result of per_cpu(sym, smp_processor_id()) (can be SP)
* @sym: The name of the per-cpu variable
* @tmp: scratch register
@@ -226,11 +243,7 @@ lr .req x30 // link register
.macro adr_this_cpu, dst, sym, tmp
adrp \tmp, \sym
add \dst, \tmp, #:lo12:\sym
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
- mrs \tmp, tpidr_el1
-alternative_else
- mrs \tmp, tpidr_el2
-alternative_endif
+ this_cpu_offset \tmp
add \dst, \dst, \tmp
.endm
@@ -241,11 +254,7 @@ alternative_endif
*/
.macro ldr_this_cpu dst, sym, tmp
adr_l \dst, \sym
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
- mrs \tmp, tpidr_el1
-alternative_else
- mrs \tmp, tpidr_el2
-alternative_endif
+ this_cpu_offset \tmp
ldr \dst, [\dst, \tmp]
.endm
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index fb4c27506ef4..c3009b0e5239 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -45,6 +45,7 @@
#define rmb() dsb(ld)
#define wmb() dsb(st)
+#define dma_mb() dmb(osh)
#define dma_rmb() dmb(oshld)
#define dma_wmb() dmb(oshst)
diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h
index c7f67da13cd9..3e7943fd17a4 100644
--- a/arch/arm64/include/asm/boot.h
+++ b/arch/arm64/include/asm/boot.h
@@ -13,8 +13,7 @@
#define MAX_FDT_SIZE SZ_2M
/*
- * arm64 requires the kernel image to placed
- * TEXT_OFFSET bytes beyond a 2 MB aligned base
+ * arm64 requires the kernel image to placed at a 2 MB aligned base address
*/
#define MIN_KIMG_ALIGN SZ_2M
diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index 935d2aa231bf..23a9fb73c04f 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -35,8 +35,6 @@ typedef s32 compat_nlink_t;
typedef u16 compat_ipc_pid_t;
typedef u32 compat_caddr_t;
typedef __kernel_fsid_t compat_fsid_t;
-typedef s64 compat_s64;
-typedef u64 compat_u64;
struct compat_stat {
#ifdef __AARCH64EB__
diff --git a/arch/arm64/include/asm/cpu_ops.h b/arch/arm64/include/asm/cpu_ops.h
index d28e8f37d3b4..e95c4df83911 100644
--- a/arch/arm64/include/asm/cpu_ops.h
+++ b/arch/arm64/include/asm/cpu_ops.h
@@ -21,7 +21,7 @@
* mechanism for doing so, tests whether it is possible to boot
* the given CPU.
* @cpu_boot: Boots a cpu into the kernel.
- * @cpu_postboot: Optionally, perform any post-boot cleanup or necesary
+ * @cpu_postboot: Optionally, perform any post-boot cleanup or necessary
* synchronisation. Called from the cpu being booted.
* @cpu_can_disable: Determines whether a CPU can be disabled based on
* mechanism-specific information.
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 07b643a70710..42868dbd29fd 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -31,13 +31,13 @@
#define ARM64_HAS_DCPOP 21
#define ARM64_SVE 22
#define ARM64_UNMAP_KERNEL_AT_EL0 23
-#define ARM64_HARDEN_BRANCH_PREDICTOR 24
+#define ARM64_SPECTRE_V2 24
#define ARM64_HAS_RAS_EXTN 25
#define ARM64_WORKAROUND_843419 26
#define ARM64_HAS_CACHE_IDC 27
#define ARM64_HAS_CACHE_DIC 28
#define ARM64_HW_DBM 29
-#define ARM64_SSBD 30
+#define ARM64_SPECTRE_V4 30
#define ARM64_MISMATCHED_CACHE_TYPE 31
#define ARM64_HAS_STAGE2_FWB 32
#define ARM64_HAS_CRC32 33
@@ -64,7 +64,8 @@
#define ARM64_BTI 54
#define ARM64_HAS_ARMv8_4_TTL 55
#define ARM64_HAS_TLB_RANGE 56
+#define ARM64_MTE 57
-#define ARM64_NCAPS 57
+#define ARM64_NCAPS 58
#endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 89b4f0142c28..f7e7144af174 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -358,7 +358,7 @@ static inline int cpucap_default_scope(const struct arm64_cpu_capabilities *cap)
}
/*
- * Generic helper for handling capabilties with multiple (match,enable) pairs
+ * Generic helper for handling capabilities with multiple (match,enable) pairs
* of call backs, sharing the same capability bit.
* Iterate over each entry to see if at least one matches.
*/
@@ -681,6 +681,12 @@ static __always_inline bool system_uses_irq_prio_masking(void)
cpus_have_const_cap(ARM64_HAS_IRQ_PRIO_MASKING);
}
+static inline bool system_supports_mte(void)
+{
+ return IS_ENABLED(CONFIG_ARM64_MTE) &&
+ cpus_have_const_cap(ARM64_MTE);
+}
+
static inline bool system_has_prio_mask_debugging(void)
{
return IS_ENABLED(CONFIG_ARM64_DEBUG_PRIORITY_MASKING) &&
@@ -698,30 +704,6 @@ static inline bool system_supports_tlb_range(void)
cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
}
-#define ARM64_BP_HARDEN_UNKNOWN -1
-#define ARM64_BP_HARDEN_WA_NEEDED 0
-#define ARM64_BP_HARDEN_NOT_REQUIRED 1
-
-int get_spectre_v2_workaround_state(void);
-
-#define ARM64_SSBD_UNKNOWN -1
-#define ARM64_SSBD_FORCE_DISABLE 0
-#define ARM64_SSBD_KERNEL 1
-#define ARM64_SSBD_FORCE_ENABLE 2
-#define ARM64_SSBD_MITIGATED 3
-
-static inline int arm64_get_ssbd_state(void)
-{
-#ifdef CONFIG_ARM64_SSBD
- extern int ssbd_state;
- return ssbd_state;
-#else
- return ARM64_SSBD_UNKNOWN;
-#endif
-}
-
-void arm64_set_ssbd_mitigation(bool state);
-
extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index d4ab3f73e7a3..973b14415271 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -65,7 +65,7 @@ efi_status_t __efi_rt_asm_wrapper(void *, const char *, ...);
(SEGMENT_ALIGN > THREAD_ALIGN ? SEGMENT_ALIGN : THREAD_ALIGN)
/* on arm64, the FDT may be located anywhere in system RAM */
-static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base)
+static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr)
{
return ULONG_MAX;
}
@@ -80,8 +80,7 @@ static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base)
* apply to other bootloaders, and are required for some kernel
* configurations.
*/
-static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base,
- unsigned long image_addr)
+static inline unsigned long efi_get_max_initrd_addr(unsigned long image_addr)
{
return (image_addr & ~(SZ_1G - 1UL)) + (1UL << (VA_BITS_MIN - 1));
}
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 035003acfa87..22c81f1edda2 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -35,7 +35,9 @@
#define ESR_ELx_EC_SYS64 (0x18)
#define ESR_ELx_EC_SVE (0x19)
#define ESR_ELx_EC_ERET (0x1a) /* EL2 only */
-/* Unallocated EC: 0x1b - 0x1E */
+/* Unallocated EC: 0x1B */
+#define ESR_ELx_EC_FPAC (0x1C) /* EL1 and above */
+/* Unallocated EC: 0x1D - 0x1E */
#define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */
#define ESR_ELx_EC_IABT_LOW (0x20)
#define ESR_ELx_EC_IABT_CUR (0x21)
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7577a754d443..99b9383cd036 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -47,4 +47,5 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr);
void do_cp15instr(unsigned int esr, struct pt_regs *regs);
void do_el0_svc(struct pt_regs *regs);
void do_el0_svc_compat(struct pt_regs *regs);
+void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr);
#endif /* __ASM_EXCEPTION_H */
diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h
index 840a35ed92ec..b15eb4a3e6b2 100644
--- a/arch/arm64/include/asm/extable.h
+++ b/arch/arm64/include/asm/extable.h
@@ -22,6 +22,15 @@ struct exception_table_entry
#define ARCH_HAS_RELATIVE_EXTABLE
+static inline bool in_bpf_jit(struct pt_regs *regs)
+{
+ if (!IS_ENABLED(CONFIG_BPF_JIT))
+ return false;
+
+ return regs->pc >= BPF_JIT_REGION_START &&
+ regs->pc < BPF_JIT_REGION_END;
+}
+
#ifdef CONFIG_BPF_JIT
int arm64_bpf_fixup_exception(const struct exception_table_entry *ex,
struct pt_regs *regs);
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 59f10dd13f12..bec5f14b622a 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -69,6 +69,9 @@ static inline void *sve_pffr(struct thread_struct *thread)
extern void sve_save_state(void *state, u32 *pfpsr);
extern void sve_load_state(void const *state, u32 const *pfpsr,
unsigned long vq_minus_1);
+extern void sve_flush_live(void);
+extern void sve_load_from_fpsimd_state(struct user_fpsimd_state const *state,
+ unsigned long vq_minus_1);
extern unsigned int sve_get_vl(void);
struct arm64_cpu_capabilities;
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index 636e9d9c7929..af43367534c7 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -164,25 +164,59 @@
| ((\np) << 5)
.endm
+/* PFALSE P\np.B */
+.macro _sve_pfalse np
+ _sve_check_preg \np
+ .inst 0x2518e400 \
+ | (\np)
+.endm
+
.macro __for from:req, to:req
.if (\from) == (\to)
- _for__body \from
+ _for__body %\from
.else
- __for \from, (\from) + ((\to) - (\from)) / 2
- __for (\from) + ((\to) - (\from)) / 2 + 1, \to
+ __for %\from, %((\from) + ((\to) - (\from)) / 2)
+ __for %((\from) + ((\to) - (\from)) / 2 + 1), %\to
.endif
.endm
.macro _for var:req, from:req, to:req, insn:vararg
.macro _for__body \var:req
+ .noaltmacro
\insn
+ .altmacro
.endm
+ .altmacro
__for \from, \to
+ .noaltmacro
.purgem _for__body
.endm
+/* Update ZCR_EL1.LEN with the new VQ */
+.macro sve_load_vq xvqminus1, xtmp, xtmp2
+ mrs_s \xtmp, SYS_ZCR_EL1
+ bic \xtmp2, \xtmp, ZCR_ELx_LEN_MASK
+ orr \xtmp2, \xtmp2, \xvqminus1
+ cmp \xtmp2, \xtmp
+ b.eq 921f
+ msr_s SYS_ZCR_EL1, \xtmp2 //self-synchronising
+921:
+.endm
+
+/* Preserve the first 128-bits of Znz and zero the rest. */
+.macro _sve_flush_z nz
+ _sve_check_zreg \nz
+ mov v\nz\().16b, v\nz\().16b
+.endm
+
+.macro sve_flush
+ _for n, 0, 31, _sve_flush_z \n
+ _for n, 0, 15, _sve_pfalse \n
+ _sve_wrffr 0
+.endm
+
.macro sve_save nxbase, xpfpsr, nxtmp
_for n, 0, 31, _sve_str_v \n, \nxbase, \n - 34
_for n, 0, 15, _sve_str_p \n, \nxbase, \n - 16
@@ -197,13 +231,7 @@
.endm
.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
- mrs_s x\nxtmp, SYS_ZCR_EL1
- bic \xtmp2, x\nxtmp, ZCR_ELx_LEN_MASK
- orr \xtmp2, \xtmp2, \xvqminus1
- cmp \xtmp2, x\nxtmp
- b.eq 921f
- msr_s SYS_ZCR_EL1, \xtmp2 // self-synchronising
-921:
+ sve_load_vq \xvqminus1, x\nxtmp, \xtmp2
_for n, 0, 31, _sve_ldr_v \n, \nxbase, \n - 34
_sve_ldr_p 0, \nxbase
_sve_wrffr 0
diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
index 985493af704b..5ffa4bacdad3 100644
--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -13,21 +13,12 @@
#include <asm/kvm_arm.h>
#include <asm/sysreg.h>
-#define NR_IPI 7
-
typedef struct {
unsigned int __softirq_pending;
- unsigned int ipi_irqs[NR_IPI];
} ____cacheline_aligned irq_cpustat_t;
#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */
-#define __inc_irq_stat(cpu, member) __IRQ_STAT(cpu, member)++
-#define __get_irq_stat(cpu, member) __IRQ_STAT(cpu, member)
-
-u64 smp_irq_stat_cpu(unsigned int cpu);
-#define arch_irq_stat_cpu smp_irq_stat_cpu
-
#define __ARCH_IRQ_EXIT_IRQS_DISABLED 1
struct nmi_ctx {
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 22f73fe09030..9a5498c2c8ee 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -8,18 +8,27 @@
#include <uapi/asm/hwcap.h>
#include <asm/cpufeature.h>
+#define COMPAT_HWCAP_SWP (1 << 0)
#define COMPAT_HWCAP_HALF (1 << 1)
#define COMPAT_HWCAP_THUMB (1 << 2)
+#define COMPAT_HWCAP_26BIT (1 << 3)
#define COMPAT_HWCAP_FAST_MULT (1 << 4)
+#define COMPAT_HWCAP_FPA (1 << 5)
#define COMPAT_HWCAP_VFP (1 << 6)
#define COMPAT_HWCAP_EDSP (1 << 7)
+#define COMPAT_HWCAP_JAVA (1 << 8)
+#define COMPAT_HWCAP_IWMMXT (1 << 9)
+#define COMPAT_HWCAP_CRUNCH (1 << 10)
+#define COMPAT_HWCAP_THUMBEE (1 << 11)
#define COMPAT_HWCAP_NEON (1 << 12)
#define COMPAT_HWCAP_VFPv3 (1 << 13)
+#define COMPAT_HWCAP_VFPV3D16 (1 << 14)
#define COMPAT_HWCAP_TLS (1 << 15)
#define COMPAT_HWCAP_VFPv4 (1 << 16)
#define COMPAT_HWCAP_IDIVA (1 << 17)
#define COMPAT_HWCAP_IDIVT (1 << 18)
#define COMPAT_HWCAP_IDIV (COMPAT_HWCAP_IDIVA|COMPAT_HWCAP_IDIVT)
+#define COMPAT_HWCAP_VFPD32 (1 << 19)
#define COMPAT_HWCAP_LPAE (1 << 20)
#define COMPAT_HWCAP_EVTSTRM (1 << 21)
@@ -95,7 +104,7 @@
#define KERNEL_HWCAP_DGH __khwcap2_feature(DGH)
#define KERNEL_HWCAP_RNG __khwcap2_feature(RNG)
#define KERNEL_HWCAP_BTI __khwcap2_feature(BTI)
-/* reserved for KERNEL_HWCAP_MTE __khwcap2_feature(MTE) */
+#define KERNEL_HWCAP_MTE __khwcap2_feature(MTE)
/*
* This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h
new file mode 100644
index 000000000000..daa1a1da539e
--- /dev/null
+++ b/arch/arm64/include/asm/hyp_image.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 Google LLC.
+ * Written by David Brazdil <dbrazdil@google.com>
+ */
+
+#ifndef __ARM64_HYP_IMAGE_H__
+#define __ARM64_HYP_IMAGE_H__
+
+/*
+ * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
+ * to separate it from the kernel proper.
+ */
+#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym
+
+#ifdef LINKER_SCRIPT
+
+/*
+ * KVM nVHE ELF section names are prefixed with .hyp, to separate them
+ * from the kernel proper.
+ */
+#define HYP_SECTION_NAME(NAME) .hyp##NAME
+
+/* Defines an ELF hyp section from input section @NAME and its subsections. */
+#define HYP_SECTION(NAME) \
+ HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) }
+
+/*
+ * Defines a linker script alias of a kernel-proper symbol referenced by
+ * KVM nVHE hyp code.
+ */
+#define KVM_NVHE_ALIAS(sym) kvm_nvhe_sym(sym) = sym;
+
+#endif /* LINKER_SCRIPT */
+
+#endif /* __ARM64_HYP_IMAGE_H__ */
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 0bc46149e491..4b39293d0f72 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -359,9 +359,13 @@ __AARCH64_INSN_FUNCS(brk, 0xFFE0001F, 0xD4200000)
__AARCH64_INSN_FUNCS(exception, 0xFF000000, 0xD4000000)
__AARCH64_INSN_FUNCS(hint, 0xFFFFF01F, 0xD503201F)
__AARCH64_INSN_FUNCS(br, 0xFFFFFC1F, 0xD61F0000)
+__AARCH64_INSN_FUNCS(br_auth, 0xFEFFF800, 0xD61F0800)
__AARCH64_INSN_FUNCS(blr, 0xFFFFFC1F, 0xD63F0000)
+__AARCH64_INSN_FUNCS(blr_auth, 0xFEFFF800, 0xD63F0800)
__AARCH64_INSN_FUNCS(ret, 0xFFFFFC1F, 0xD65F0000)
+__AARCH64_INSN_FUNCS(ret_auth, 0xFFFFFBFF, 0xD65F0BFF)
__AARCH64_INSN_FUNCS(eret, 0xFFFFFFFF, 0xD69F03E0)
+__AARCH64_INSN_FUNCS(eret_auth, 0xFFFFFBFF, 0xD69F0BFF)
__AARCH64_INSN_FUNCS(mrs, 0xFFF00000, 0xD5300000)
__AARCH64_INSN_FUNCS(msr_imm, 0xFFF8F01F, 0xD500401F)
__AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000)
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index ff50dd731852..fd172c41df90 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -110,6 +110,7 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
#define __io_par(v) __iormb(v)
#define __iowmb() dma_wmb()
+#define __iomb() dma_mb()
/*
* Relaxed I/O memory access primitives. These follow the Device memory
diff --git a/arch/arm64/include/asm/irq_work.h b/arch/arm64/include/asm/irq_work.h
index 8a1ef1907760..a1020285ea75 100644
--- a/arch/arm64/include/asm/irq_work.h
+++ b/arch/arm64/include/asm/irq_work.h
@@ -2,11 +2,9 @@
#ifndef __ASM_IRQ_WORK_H
#define __ASM_IRQ_WORK_H
-#include <asm/smp.h>
-
static inline bool arch_irq_work_has_interrupt(void)
{
- return !!__smp_cross_call;
+ return true;
}
#endif /* __ASM_IRQ_WORK_H */
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 329fb15f6bac..19ca76ea60d9 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -86,7 +86,7 @@
+ EARLY_PGDS((vstart), (vend)) /* each PGDIR needs a next level page table */ \
+ EARLY_PUDS((vstart), (vend)) /* each PUD needs a next level page table */ \
+ EARLY_PMDS((vstart), (vend))) /* each PMD needs a next level page table */
-#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR + TEXT_OFFSET, _end))
+#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end))
#define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 1da8e3dc4455..64ce29378467 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -12,6 +12,7 @@
#include <asm/types.h>
/* Hyp Configuration Register (HCR) bits */
+#define HCR_ATA (UL(1) << 56)
#define HCR_FWB (UL(1) << 46)
#define HCR_API (UL(1) << 41)
#define HCR_APK (UL(1) << 40)
@@ -66,7 +67,7 @@
* TWI: Trap WFI
* TIDCP: Trap L2CTLR/L2ECTLR
* BSU_IS: Upgrade barriers to the inner shareable domain
- * FB: Force broadcast of all maintainance operations
+ * FB: Force broadcast of all maintenance operations
* AMO: Override CPSR.A and enable signaling with VA
* IMO: Override CPSR.I and enable signaling with VI
* FMO: Override CPSR.F and enable signaling with VF
@@ -78,7 +79,7 @@
HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
HCR_FMO | HCR_IMO | HCR_PTW )
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
-#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK)
+#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
/* TCR_EL2 Registers bits */
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 6f98fbd0ac81..54387ccd1ab2 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -7,11 +7,9 @@
#ifndef __ARM_KVM_ASM_H__
#define __ARM_KVM_ASM_H__
+#include <asm/hyp_image.h>
#include <asm/virt.h>
-#define VCPU_WORKAROUND_2_FLAG_SHIFT 0
-#define VCPU_WORKAROUND_2_FLAG (_AC(1, UL) << VCPU_WORKAROUND_2_FLAG_SHIFT)
-
#define ARM_EXIT_WITH_SERROR_BIT 31
#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
#define ARM_EXCEPTION_IS_TRAP(x) (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_TRAP)
@@ -38,17 +36,34 @@
#define __SMCCC_WORKAROUND_1_SMC_SZ 36
+#define KVM_HOST_SMCCC_ID(id) \
+ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_64, \
+ ARM_SMCCC_OWNER_VENDOR_HYP, \
+ (id))
+
+#define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name)
+
+#define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0
+#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1
+#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5
+#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6
+#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11
+#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14
+
#ifndef __ASSEMBLY__
#include <linux/mm.h>
-/*
- * Translate name of a symbol defined in nVHE hyp to the name seen
- * by kernel proper. All nVHE symbols are prefixed by the build system
- * to avoid clashes with the VHE variants.
- */
-#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym
-
#define DECLARE_KVM_VHE_SYM(sym) extern char sym[]
#define DECLARE_KVM_NVHE_SYM(sym) extern char kvm_nvhe_sym(sym)[]
@@ -60,10 +75,53 @@
DECLARE_KVM_VHE_SYM(sym); \
DECLARE_KVM_NVHE_SYM(sym)
+#define DECLARE_KVM_VHE_PER_CPU(type, sym) \
+ DECLARE_PER_CPU(type, sym)
+#define DECLARE_KVM_NVHE_PER_CPU(type, sym) \
+ DECLARE_PER_CPU(type, kvm_nvhe_sym(sym))
+
+#define DECLARE_KVM_HYP_PER_CPU(type, sym) \
+ DECLARE_KVM_VHE_PER_CPU(type, sym); \
+ DECLARE_KVM_NVHE_PER_CPU(type, sym)
+
+/*
+ * Compute pointer to a symbol defined in nVHE percpu region.
+ * Returns NULL if percpu memory has not been allocated yet.
+ */
+#define this_cpu_ptr_nvhe_sym(sym) per_cpu_ptr_nvhe_sym(sym, smp_processor_id())
+#define per_cpu_ptr_nvhe_sym(sym, cpu) \
+ ({ \
+ unsigned long base, off; \
+ base = kvm_arm_hyp_percpu_base[cpu]; \
+ off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \
+ (unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \
+ base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \
+ })
+
+#if defined(__KVM_NVHE_HYPERVISOR__)
+
+#define CHOOSE_NVHE_SYM(sym) sym
+#define CHOOSE_HYP_SYM(sym) CHOOSE_NVHE_SYM(sym)
+
+/* The nVHE hypervisor shouldn't even try to access VHE symbols */
+extern void *__nvhe_undefined_symbol;
+#define CHOOSE_VHE_SYM(sym) __nvhe_undefined_symbol
+#define this_cpu_ptr_hyp_sym(sym) (&__nvhe_undefined_symbol)
+#define per_cpu_ptr_hyp_sym(sym, cpu) (&__nvhe_undefined_symbol)
+
+#elif defined(__KVM_VHE_HYPERVISOR__)
+
#define CHOOSE_VHE_SYM(sym) sym
-#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym)
+#define CHOOSE_HYP_SYM(sym) CHOOSE_VHE_SYM(sym)
+
+/* The VHE hypervisor shouldn't even try to access nVHE symbols */
+extern void *__vhe_undefined_symbol;
+#define CHOOSE_NVHE_SYM(sym) __vhe_undefined_symbol
+#define this_cpu_ptr_hyp_sym(sym) (&__vhe_undefined_symbol)
+#define per_cpu_ptr_hyp_sym(sym, cpu) (&__vhe_undefined_symbol)
+
+#else
-#ifndef __KVM_NVHE_HYPERVISOR__
/*
* BIG FAT WARNINGS:
*
@@ -75,12 +133,21 @@
* - Don't let the nVHE hypervisor have access to this, as it will
* pick the *wrong* symbol (yes, it runs at EL2...).
*/
-#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() ? CHOOSE_VHE_SYM(sym) \
+#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() \
+ ? CHOOSE_VHE_SYM(sym) \
: CHOOSE_NVHE_SYM(sym))
-#else
-/* The nVHE hypervisor shouldn't even try to access anything */
-extern void *__nvhe_undefined_symbol;
-#define CHOOSE_HYP_SYM(sym) __nvhe_undefined_symbol
+
+#define this_cpu_ptr_hyp_sym(sym) (is_kernel_in_hyp_mode() \
+ ? this_cpu_ptr(&sym) \
+ : this_cpu_ptr_nvhe_sym(sym))
+
+#define per_cpu_ptr_hyp_sym(sym, cpu) (is_kernel_in_hyp_mode() \
+ ? per_cpu_ptr(&sym, cpu) \
+ : per_cpu_ptr_nvhe_sym(sym, cpu))
+
+#define CHOOSE_VHE_SYM(sym) sym
+#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym)
+
#endif
/* Translate a kernel address @ptr into its equivalent linear mapping */
@@ -98,15 +165,19 @@ struct kvm_vcpu;
struct kvm_s2_mmu;
DECLARE_KVM_NVHE_SYM(__kvm_hyp_init);
+DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector);
DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init)
+#define __kvm_hyp_host_vector CHOOSE_NVHE_SYM(__kvm_hyp_host_vector)
#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
+extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
+DECLARE_KVM_NVHE_SYM(__per_cpu_start);
+DECLARE_KVM_NVHE_SYM(__per_cpu_end);
+
extern atomic_t arm64_el2_vector_last_slot;
DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
#define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
-#endif
extern void __kvm_flush_vm_context(void);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
@@ -149,26 +220,6 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
addr; \
})
-/*
- * Home-grown __this_cpu_{ptr,read} variants that always work at HYP,
- * provided that sym is really a *symbol* and not a pointer obtained from
- * a data structure. As for SHIFT_PERCPU_PTR(), the creative casting keeps
- * sparse quiet.
- */
-#define __hyp_this_cpu_ptr(sym) \
- ({ \
- void *__ptr; \
- __verify_pcpu_ptr(&sym); \
- __ptr = hyp_symbol_addr(sym); \
- __ptr += read_sysreg(tpidr_el2); \
- (typeof(sym) __kernel __force *)__ptr; \
- })
-
-#define __hyp_this_cpu_read(sym) \
- ({ \
- *__hyp_this_cpu_ptr(sym); \
- })
-
#define __KVM_EXTABLE(from, to) \
" .pushsection __kvm_ex_table, \"a\"\n" \
" .align 3\n" \
@@ -199,20 +250,8 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
#else /* __ASSEMBLY__ */
-.macro hyp_adr_this_cpu reg, sym, tmp
- adr_l \reg, \sym
- mrs \tmp, tpidr_el2
- add \reg, \reg, \tmp
-.endm
-
-.macro hyp_ldr_this_cpu reg, sym, tmp
- adr_l \reg, \sym
- mrs \tmp, tpidr_el2
- ldr \reg, [\reg, \tmp]
-.endm
-
.macro get_host_ctxt reg, tmp
- hyp_adr_this_cpu \reg, kvm_host_data, \tmp
+ adr_this_cpu \reg, kvm_host_data, \tmp
add \reg, \reg, #HOST_DATA_CONTEXT
.endm
@@ -221,6 +260,16 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
.endm
+.macro get_loaded_vcpu vcpu, ctxt
+ adr_this_cpu \ctxt, kvm_hyp_ctxt, \vcpu
+ ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
+.endm
+
+.macro set_loaded_vcpu vcpu, ctxt, tmp
+ adr_this_cpu \ctxt, kvm_hyp_ctxt, \tmp
+ str \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
+.endm
+
/*
* KVM extable for unexpected exceptions.
* In the same format _asm_extable, but output to a different section so that
@@ -236,6 +285,45 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
.popsection
.endm
+#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
+#define CPU_LR_OFFSET CPU_XREG_OFFSET(30)
+#define CPU_SP_EL0_OFFSET (CPU_LR_OFFSET + 8)
+
+/*
+ * We treat x18 as callee-saved as the host may use it as a platform
+ * register (e.g. for shadow call stack).
+ */
+.macro save_callee_saved_regs ctxt
+ str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
+ stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
+ stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
+ stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
+ stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+ stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+ stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro restore_callee_saved_regs ctxt
+ // We require \ctxt is not x18-x28
+ ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
+ ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
+ ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
+ ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
+ ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+ ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+ ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro save_sp_el0 ctxt, tmp
+ mrs \tmp, sp_el0
+ str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
+.endm
+
+.macro restore_sp_el0 ctxt, tmp
+ ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
+ msr sp_el0, \tmp
+.endm
+
#endif
#endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 49a55be2b9a2..5ef2669ccd6c 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -298,15 +298,15 @@ static __always_inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu)
return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
}
-static __always_inline bool kvm_vcpu_dabt_iss1tw(const struct kvm_vcpu *vcpu)
+static __always_inline bool kvm_vcpu_abt_iss1tw(const struct kvm_vcpu *vcpu)
{
return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_S1PTW);
}
+/* Always check for S1PTW *before* using this. */
static __always_inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu)
{
- return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR) ||
- kvm_vcpu_dabt_iss1tw(vcpu); /* AF/DBM update */
+ return kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR;
}
static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu)
@@ -335,6 +335,11 @@ static inline bool kvm_vcpu_trap_is_iabt(const struct kvm_vcpu *vcpu)
return kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_IABT_LOW;
}
+static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu);
+}
+
static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu)
{
return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC;
@@ -372,6 +377,9 @@ static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
{
+ if (kvm_vcpu_abt_iss1tw(vcpu))
+ return true;
+
if (kvm_vcpu_trap_is_iabt(vcpu))
return false;
@@ -383,20 +391,6 @@ static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
}
-static inline bool kvm_arm_get_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG;
-}
-
-static inline void kvm_arm_set_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu,
- bool flag)
-{
- if (flag)
- vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
- else
- vcpu->arch.workaround_flags &= ~VCPU_WORKAROUND_2_FLAG;
-}
-
static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
{
if (vcpu_mode_is_32bit(vcpu)) {
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e52c927aade5..0aecbab6a7fb 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -11,6 +11,7 @@
#ifndef __ARM64_KVM_HOST_H__
#define __ARM64_KVM_HOST_H__
+#include <linux/arm-smccc.h>
#include <linux/bitmap.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -79,8 +80,8 @@ struct kvm_s2_mmu {
* for vEL1/EL0 with vHCR_EL2.VM == 0. In that case, we use the
* canonical stage-2 page tables.
*/
- pgd_t *pgd;
phys_addr_t pgd_phys;
+ struct kvm_pgtable *pgt;
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
@@ -110,6 +111,13 @@ struct kvm_arch {
* supported.
*/
bool return_nisv_io_abort_to_user;
+
+ /*
+ * VM-wide PMU filter, implemented as a bitmap and big enough for
+ * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
+ */
+ unsigned long *pmu_filter;
+ unsigned int pmuver;
};
struct kvm_vcpu_fault_info {
@@ -262,8 +270,6 @@ struct kvm_host_data {
struct kvm_pmu_events pmu_events;
};
-typedef struct kvm_host_data kvm_host_data_t;
-
struct vcpu_reset_state {
unsigned long pc;
unsigned long r0;
@@ -368,7 +374,6 @@ struct kvm_vcpu_arch {
/* Guest PV state */
struct {
- u64 steal;
u64 last_steal;
gpa_t base;
} steal;
@@ -481,18 +486,15 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);
-u64 __kvm_call_hyp(void *hypfn, ...);
-
-#define kvm_call_hyp_nvhe(f, ...) \
- do { \
- DECLARE_KVM_NVHE_SYM(f); \
- __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \
- } while(0)
-
-#define kvm_call_hyp_nvhe_ret(f, ...) \
+#define kvm_call_hyp_nvhe(f, ...) \
({ \
- DECLARE_KVM_NVHE_SYM(f); \
- __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \
+ struct arm_smccc_res res; \
+ \
+ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \
+ ##__VA_ARGS__, &res); \
+ WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \
+ \
+ res.a1; \
})
/*
@@ -518,7 +520,7 @@ u64 __kvm_call_hyp(void *hypfn, ...);
ret = f(__VA_ARGS__); \
isb(); \
} else { \
- ret = kvm_call_hyp_nvhe_ret(f, ##__VA_ARGS__); \
+ ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \
} \
\
ret; \
@@ -544,6 +546,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu);
gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu);
void kvm_update_stolen_time(struct kvm_vcpu *vcpu);
+bool kvm_arm_pvtime_supported(void);
int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
@@ -565,7 +568,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
-DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data);
+DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data);
static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
{
@@ -631,46 +634,6 @@ static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {}
static inline void kvm_clr_pmu_events(u32 clr) {}
#endif
-#define KVM_BP_HARDEN_UNKNOWN -1
-#define KVM_BP_HARDEN_WA_NEEDED 0
-#define KVM_BP_HARDEN_NOT_REQUIRED 1
-
-static inline int kvm_arm_harden_branch_predictor(void)
-{
- switch (get_spectre_v2_workaround_state()) {
- case ARM64_BP_HARDEN_WA_NEEDED:
- return KVM_BP_HARDEN_WA_NEEDED;
- case ARM64_BP_HARDEN_NOT_REQUIRED:
- return KVM_BP_HARDEN_NOT_REQUIRED;
- case ARM64_BP_HARDEN_UNKNOWN:
- default:
- return KVM_BP_HARDEN_UNKNOWN;
- }
-}
-
-#define KVM_SSBD_UNKNOWN -1
-#define KVM_SSBD_FORCE_DISABLE 0
-#define KVM_SSBD_KERNEL 1
-#define KVM_SSBD_FORCE_ENABLE 2
-#define KVM_SSBD_MITIGATED 3
-
-static inline int kvm_arm_have_ssbd(void)
-{
- switch (arm64_get_ssbd_state()) {
- case ARM64_SSBD_FORCE_DISABLE:
- return KVM_SSBD_FORCE_DISABLE;
- case ARM64_SSBD_KERNEL:
- return KVM_SSBD_KERNEL;
- case ARM64_SSBD_FORCE_ENABLE:
- return KVM_SSBD_FORCE_ENABLE;
- case ARM64_SSBD_MITIGATED:
- return KVM_SSBD_MITIGATED;
- case ARM64_SSBD_UNKNOWN:
- default:
- return KVM_SSBD_UNKNOWN;
- }
-}
-
void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 46689e7db46c..6b664de5ec1f 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -12,6 +12,9 @@
#include <asm/alternative.h>
#include <asm/sysreg.h>
+DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
+
#define read_sysreg_elx(r,nvh,vh) \
({ \
u64 reg; \
@@ -87,11 +90,11 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
void deactivate_traps_vhe_put(void);
#endif
-u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
+u64 __guest_enter(struct kvm_vcpu *vcpu);
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt);
+void __noreturn hyp_panic(void);
#ifdef __KVM_NVHE_HYPERVISOR__
-void __noreturn __hyp_do_panic(unsigned long, ...);
+void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
#endif
#endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 189839c3706a..331394306cce 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -9,6 +9,7 @@
#include <asm/page.h>
#include <asm/memory.h>
+#include <asm/mmu.h>
#include <asm/cpufeature.h>
/*
@@ -43,16 +44,6 @@
* HYP_VA_MIN = 1 << (VA_BITS - 1)
* HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
*
- * This of course assumes that the trampoline page exists within the
- * VA_BITS range. If it doesn't, then it means we're in the odd case
- * where the kernel idmap (as well as HYP) uses more levels than the
- * kernel runtime page tables (as seen when the kernel is configured
- * for 4k pages, 39bits VA, and yet memory lives just above that
- * limit, forcing the idmap to use 4 levels of page tables while the
- * kernel itself only uses 3). In this particular case, it doesn't
- * matter which side of VA_BITS we use, as we're guaranteed not to
- * conflict with anything.
- *
* When using VHE, there are no separate hyp mappings and all KVM
* functionality is already mapped as part of the main kernel
* mappings, and none of this applies in that case.
@@ -117,15 +108,10 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm))
#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL))
-static inline bool kvm_page_empty(void *ptr)
-{
- struct page *ptr_page = virt_to_page(ptr);
- return page_count(ptr_page) == 1;
-}
-
+#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>
-int create_hyp_mappings(void *from, void *to, pgprot_t prot);
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **kaddr,
void __iomem **haddr);
@@ -141,149 +127,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
-
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
int kvm_mmu_init(void);
-void kvm_clear_hyp_idmap(void);
-
-#define kvm_mk_pmd(ptep) \
- __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE)
-#define kvm_mk_pud(pmdp) \
- __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE)
-#define kvm_mk_p4d(pmdp) \
- __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE)
-
-#define kvm_set_pud(pudp, pud) set_pud(pudp, pud)
-
-#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot)
-#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot)
-#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot)
-
-#define kvm_pud_pfn(pud) pud_pfn(pud)
-
-#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd)
-#define kvm_pud_mkhuge(pud) pud_mkhuge(pud)
-
-static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
-{
- pte_val(pte) |= PTE_S2_RDWR;
- return pte;
-}
-
-static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
-{
- pmd_val(pmd) |= PMD_S2_RDWR;
- return pmd;
-}
-
-static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
-{
- pud_val(pud) |= PUD_S2_RDWR;
- return pud;
-}
-
-static inline pte_t kvm_s2pte_mkexec(pte_t pte)
-{
- pte_val(pte) &= ~PTE_S2_XN;
- return pte;
-}
-
-static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
-{
- pmd_val(pmd) &= ~PMD_S2_XN;
- return pmd;
-}
-
-static inline pud_t kvm_s2pud_mkexec(pud_t pud)
-{
- pud_val(pud) &= ~PUD_S2_XN;
- return pud;
-}
-
-static inline void kvm_set_s2pte_readonly(pte_t *ptep)
-{
- pteval_t old_pteval, pteval;
-
- pteval = READ_ONCE(pte_val(*ptep));
- do {
- old_pteval = pteval;
- pteval &= ~PTE_S2_RDWR;
- pteval |= PTE_S2_RDONLY;
- pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
- } while (pteval != old_pteval);
-}
-
-static inline bool kvm_s2pte_readonly(pte_t *ptep)
-{
- return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY;
-}
-
-static inline bool kvm_s2pte_exec(pte_t *ptep)
-{
- return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN);
-}
-
-static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp)
-{
- kvm_set_s2pte_readonly((pte_t *)pmdp);
-}
-
-static inline bool kvm_s2pmd_readonly(pmd_t *pmdp)
-{
- return kvm_s2pte_readonly((pte_t *)pmdp);
-}
-
-static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
-{
- return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
-}
-
-static inline void kvm_set_s2pud_readonly(pud_t *pudp)
-{
- kvm_set_s2pte_readonly((pte_t *)pudp);
-}
-
-static inline bool kvm_s2pud_readonly(pud_t *pudp)
-{
- return kvm_s2pte_readonly((pte_t *)pudp);
-}
-
-static inline bool kvm_s2pud_exec(pud_t *pudp)
-{
- return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN);
-}
-
-static inline pud_t kvm_s2pud_mkyoung(pud_t pud)
-{
- return pud_mkyoung(pud);
-}
-
-static inline bool kvm_s2pud_young(pud_t pud)
-{
- return pud_young(pud);
-}
-
-#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
-
-#ifdef __PAGETABLE_PMD_FOLDED
-#define hyp_pmd_table_empty(pmdp) (0)
-#else
-#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
-#endif
-
-#ifdef __PAGETABLE_PUD_FOLDED
-#define hyp_pud_table_empty(pudp) (0)
-#else
-#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
-#endif
-
-#ifdef __PAGETABLE_P4D_FOLDED
-#define hyp_p4d_table_empty(p4dp) (0)
-#else
-#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp)
-#endif
struct kvm;
@@ -325,77 +171,9 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
}
}
-static inline void __kvm_flush_dcache_pte(pte_t pte)
-{
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
- struct page *page = pte_page(pte);
- kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE);
- }
-}
-
-static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
-{
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
- struct page *page = pmd_page(pmd);
- kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE);
- }
-}
-
-static inline void __kvm_flush_dcache_pud(pud_t pud)
-{
- if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
- struct page *page = pud_page(pud);
- kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE);
- }
-}
-
void kvm_set_way_flush(struct kvm_vcpu *vcpu);
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
-static inline bool __kvm_cpu_uses_extended_idmap(void)
-{
- return __cpu_uses_extended_idmap_level();
-}
-
-static inline unsigned long __kvm_idmap_ptrs_per_pgd(void)
-{
- return idmap_ptrs_per_pgd;
-}
-
-/*
- * Can't use pgd_populate here, because the extended idmap adds an extra level
- * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended
- * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4.
- */
-static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
- pgd_t *hyp_pgd,
- pgd_t *merged_hyp_pgd,
- unsigned long hyp_idmap_start)
-{
- int idmap_idx;
- u64 pgd_addr;
-
- /*
- * Use the first entry to access the HYP mappings. It is
- * guaranteed to be free, otherwise we wouldn't use an
- * extended idmap.
- */
- VM_BUG_ON(pgd_val(merged_hyp_pgd[0]));
- pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd));
- merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE);
-
- /*
- * Create another extended level entry that points to the boot HYP map,
- * which contains an ID mapping of the HYP init code. We essentially
- * merge the boot and runtime HYP maps by doing so, but they don't
- * overlap anyway, so this is fine.
- */
- idmap_idx = hyp_idmap_start >> VA_BITS;
- VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx]));
- pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd));
- merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE);
-}
-
static inline unsigned int kvm_get_vmid_bits(void)
{
int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
@@ -430,19 +208,17 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
return ret;
}
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
/*
* EL2 vectors can be mapped and rerouted in a number of ways,
* depending on the kernel configuration and CPU present:
*
- * - If the CPU has the ARM64_HARDEN_BRANCH_PREDICTOR cap, the
- * hardening sequence is placed in one of the vector slots, which is
- * executed before jumping to the real vectors.
+ * - If the CPU is affected by Spectre-v2, the hardening sequence is
+ * placed in one of the vector slots, which is executed before jumping
+ * to the real vectors.
*
- * - If the CPU has both the ARM64_HARDEN_EL2_VECTORS cap and the
- * ARM64_HARDEN_BRANCH_PREDICTOR cap, the slot containing the
- * hardening sequence is mapped next to the idmap page, and executed
- * before jumping to the real vectors.
+ * - If the CPU also has the ARM64_HARDEN_EL2_VECTORS cap, the slot
+ * containing the hardening sequence is mapped next to the idmap page,
+ * and executed before jumping to the real vectors.
*
* - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an
* empty slot is selected, mapped next to the idmap page, and
@@ -452,19 +228,16 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
* VHE, as we don't have hypervisor-specific mappings. If the system
* is VHE and yet selects this capability, it will be ignored.
*/
-#include <asm/mmu.h>
-
extern void *__kvm_bp_vect_base;
extern int __kvm_harden_el2_vector_slot;
-/* This is called on both VHE and !VHE systems */
static inline void *kvm_get_hyp_vector(void)
{
struct bp_hardening_data *data = arm64_get_bp_hardening_data();
void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
int slot = -1;
- if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) && data->fn) {
+ if (cpus_have_const_cap(ARM64_SPECTRE_V2) && data->fn) {
vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
slot = data->hyp_vectors_slot;
}
@@ -481,102 +254,8 @@ static inline void *kvm_get_hyp_vector(void)
return vect;
}
-/* This is only called on a !VHE system */
-static inline int kvm_map_vectors(void)
-{
- /*
- * HBP = ARM64_HARDEN_BRANCH_PREDICTOR
- * HEL2 = ARM64_HARDEN_EL2_VECTORS
- *
- * !HBP + !HEL2 -> use direct vectors
- * HBP + !HEL2 -> use hardened vectors in place
- * !HBP + HEL2 -> allocate one vector slot and use exec mapping
- * HBP + HEL2 -> use hardened vertors and use exec mapping
- */
- if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) {
- __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
- __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
- }
-
- if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
- phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs);
- unsigned long size = __BP_HARDEN_HYP_VECS_SZ;
-
- /*
- * Always allocate a spare vector slot, as we don't
- * know yet which CPUs have a BP hardening slot that
- * we can reuse.
- */
- __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
- BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
- return create_hyp_exec_mappings(vect_pa, size,
- &__kvm_bp_vect_base);
- }
-
- return 0;
-}
-#else
-static inline void *kvm_get_hyp_vector(void)
-{
- return kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
-}
-
-static inline int kvm_map_vectors(void)
-{
- return 0;
-}
-#endif
-
-#ifdef CONFIG_ARM64_SSBD
-DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
-
-static inline int hyp_map_aux_data(void)
-{
- int cpu, err;
-
- for_each_possible_cpu(cpu) {
- u64 *ptr;
-
- ptr = per_cpu_ptr(&arm64_ssbd_callback_required, cpu);
- err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP);
- if (err)
- return err;
- }
- return 0;
-}
-#else
-static inline int hyp_map_aux_data(void)
-{
- return 0;
-}
-#endif
-
#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr)
-/*
- * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
- * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
- * 52bit IPS.
- */
-static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
-{
- int x = ARM64_VTTBR_X(ipa_shift, levels);
-
- return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
-}
-
-static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
-{
- unsigned int x = arm64_vttbr_x(ipa_shift, levels);
-
- return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
-}
-
-static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
-{
- return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
-}
-
static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
{
struct kvm_vmid *vmid = &mmu->vmid;
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
new file mode 100644
index 000000000000..52ab38db04c7
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#ifndef __ARM64_KVM_PGTABLE_H__
+#define __ARM64_KVM_PGTABLE_H__
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+
+typedef u64 kvm_pte_t;
+
+/**
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits: Maximum input address size, in bits.
+ * @start_level: Level at which the page-table walk starts.
+ * @pgd: Pointer to the first top-level entry of the page-table.
+ * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ */
+struct kvm_pgtable {
+ u32 ia_bits;
+ u32 start_level;
+ kvm_pte_t *pgd;
+
+ /* Stage-2 only */
+ struct kvm_s2_mmu *mmu;
+};
+
+/**
+ * enum kvm_pgtable_prot - Page-table permissions and attributes.
+ * @KVM_PGTABLE_PROT_X: Execute permission.
+ * @KVM_PGTABLE_PROT_W: Write permission.
+ * @KVM_PGTABLE_PROT_R: Read permission.
+ * @KVM_PGTABLE_PROT_DEVICE: Device attributes.
+ */
+enum kvm_pgtable_prot {
+ KVM_PGTABLE_PROT_X = BIT(0),
+ KVM_PGTABLE_PROT_W = BIT(1),
+ KVM_PGTABLE_PROT_R = BIT(2),
+
+ KVM_PGTABLE_PROT_DEVICE = BIT(3),
+};
+
+#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
+#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R)
+#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
+
+/**
+ * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
+ * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid
+ * entries.
+ * @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their
+ * children.
+ * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their
+ * children.
+ */
+enum kvm_pgtable_walk_flags {
+ KVM_PGTABLE_WALK_LEAF = BIT(0),
+ KVM_PGTABLE_WALK_TABLE_PRE = BIT(1),
+ KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
+};
+
+typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg);
+
+/**
+ * struct kvm_pgtable_walker - Hook into a page-table walk.
+ * @cb: Callback function to invoke during the walk.
+ * @arg: Argument passed to the callback function.
+ * @flags: Bitwise-OR of flags to identify the entry types on which to
+ * invoke the callback function.
+ */
+struct kvm_pgtable_walker {
+ const kvm_pgtable_visitor_fn_t cb;
+ void * const arg;
+ const enum kvm_pgtable_walk_flags flags;
+};
+
+/**
+ * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
+ * @pgt: Uninitialised page-table structure to initialise.
+ * @va_bits: Maximum virtual address bits.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
+
+/**
+ * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt);
+
+/**
+ * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
+ * @addr: Virtual address at which to place the mapping.
+ * @size: Size of the mapping.
+ * @phys: Physical address of the memory to map.
+ * @prot: Permissions and attributes for the mapping.
+ *
+ * The offset of @addr within a page is ignored, @size is rounded-up to
+ * the next page boundary and @phys is rounded-down to the previous page
+ * boundary.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable. Attempts to install a new mapping
+ * for a virtual address that is already mapped will be rejected with an
+ * error and a WARN().
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+ enum kvm_pgtable_prot prot);
+
+/**
+ * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
+ * @pgt: Uninitialised page-table structure to initialise.
+ * @kvm: KVM structure representing the guest virtual machine.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
+
+/**
+ * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
+
+/**
+ * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address at which to place the mapping.
+ * @size: Size of the mapping.
+ * @phys: Physical address of the memory to map.
+ * @prot: Permissions and attributes for the mapping.
+ * @mc: Cache of pre-allocated GFP_PGTABLE_USER memory from which to
+ * allocate page-table pages.
+ *
+ * The offset of @addr within a page is ignored, @size is rounded-up to
+ * the next page boundary and @phys is rounded-down to the previous page
+ * boundary.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable.
+ *
+ * Note that this function will both coalesce existing table entries and split
+ * existing block mappings, relying on page-faults to fault back areas outside
+ * of the new mapping lazily.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ struct kvm_mmu_memory_cache *mc);
+
+/**
+ * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address from which to remove the mapping.
+ * @size: Size of the mapping.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * TLB invalidation is performed for each page-table entry cleared during the
+ * unmapping operation and the reference count for the page-table page
+ * containing the cleared entry is decremented, with unreferenced pages being
+ * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if
+ * FWB is not supported by the CPU.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
+ * without TLB invalidation.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address from which to write-protect,
+ * @size: Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the updated permissions are visible
+ * to the CPUs.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * set the access flag in that entry.
+ *
+ * Return: The old page-table entry prior to setting the flag, 0 on failure.
+ */
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * clear the access flag in that entry.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the updated permissions are visible
+ * to the CPUs.
+ *
+ * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ */
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
+ * page-table entry.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ * @prot: Additional permissions to grant for the mapping.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * relax the permissions in that entry according to the read, write and
+ * execute permissions specified by @prot. No permissions are removed, and
+ * TLB invalidation is performed after updating the entry.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_prot prot);
+
+/**
+ * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
+ * access flag set.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * Return: True if the page-table entry has the access flag set, false otherwise.
+ */
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
+ * of Coherency for guest stage-2 address
+ * range.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address from which to flush.
+ * @size: Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_walk() - Walk a page-table.
+ * @pgt: Page-table structure initialised by kvm_pgtable_*_init().
+ * @addr: Input address for the start of the walk.
+ * @size: Size of the range to walk.
+ * @walker: Walker callback description.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * The walker will walk the page-table entries corresponding to the input
+ * address range specified, visiting entries according to the walker flags.
+ * Invalid entries are treated as leaf entries. Leaf entries are reloaded
+ * after invoking the walker callback, allowing the walker to descend into
+ * a newly installed table.
+ *
+ * Returning a negative error code from the walker callback function will
+ * terminate the walk immediately with the same error code.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_pgtable_walker *walker);
+
+#endif /* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h
index 0ddf98c3ba9f..0cd0965255d2 100644
--- a/arch/arm64/include/asm/kvm_ptrauth.h
+++ b/arch/arm64/include/asm/kvm_ptrauth.h
@@ -60,7 +60,7 @@
.endm
/*
- * Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will
+ * Both ptrauth_switch_to_guest and ptrauth_switch_to_hyp macros will
* check for the presence ARM64_HAS_ADDRESS_AUTH, which is defined as
* (ARM64_HAS_ADDRESS_AUTH_ARCH || ARM64_HAS_ADDRESS_AUTH_IMP_DEF) and
* then proceed ahead with the save/restore of Pointer Authentication
@@ -78,7 +78,7 @@ alternative_else_nop_endif
.L__skip_switch\@:
.endm
-.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
+.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
alternative_if_not ARM64_HAS_ADDRESS_AUTH
b .L__skip_switch\@
alternative_else_nop_endif
@@ -96,7 +96,7 @@ alternative_else_nop_endif
#else /* !CONFIG_ARM64_PTR_AUTH */
.macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3
.endm
-.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
+.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
.endm
#endif /* CONFIG_ARM64_PTR_AUTH */
#endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index afa722504bfd..cd61239bae8c 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -126,13 +126,18 @@
/*
* Memory types available.
+ *
+ * IMPORTANT: MT_NORMAL must be index 0 since vm_get_page_prot() may 'or' in
+ * the MT_NORMAL_TAGGED memory type for PROT_MTE mappings. Note
+ * that protection_map[] only contains MT_NORMAL attributes.
*/
-#define MT_DEVICE_nGnRnE 0
-#define MT_DEVICE_nGnRE 1
-#define MT_DEVICE_GRE 2
-#define MT_NORMAL_NC 3
-#define MT_NORMAL 4
-#define MT_NORMAL_WT 5
+#define MT_NORMAL 0
+#define MT_NORMAL_TAGGED 1
+#define MT_NORMAL_NC 2
+#define MT_NORMAL_WT 3
+#define MT_DEVICE_nGnRnE 4
+#define MT_DEVICE_nGnRE 5
+#define MT_DEVICE_GRE 6
/*
* Memory types for Stage-2 translation
@@ -164,12 +169,11 @@
extern u64 vabits_actual;
#define PAGE_END (_PAGE_END(vabits_actual))
-extern s64 physvirt_offset;
extern s64 memstart_addr;
/* PHYS_OFFSET - the physical address of the start of memory. */
#define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; })
-/* the virtual base of the kernel image (minus TEXT_OFFSET) */
+/* the virtual base of the kernel image */
extern u64 kimage_vaddr;
/* the offset between the kernel virtual and physical mappings */
@@ -240,7 +244,7 @@ static inline const void *__tag_set(const void *addr, u8 tag)
*/
#define __is_lm_address(addr) (!(((u64)addr) & BIT(vabits_actual - 1)))
-#define __lm_to_phys(addr) (((addr) + physvirt_offset))
+#define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
#define __kimg_to_phys(addr) ((addr) - kimage_voffset)
#define __virt_to_phys_nodebug(x) ({ \
@@ -258,7 +262,7 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x);
#define __phys_addr_symbol(x) __pa_symbol_nodebug(x)
#endif /* CONFIG_DEBUG_VIRTUAL */
-#define __phys_to_virt(x) ((unsigned long)((x) - physvirt_offset))
+#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET)
#define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset))
/*
diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index 081ec8de9ea6..e3e28f7daf62 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -9,16 +9,53 @@
static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
unsigned long pkey __always_unused)
{
+ unsigned long ret = 0;
+
if (system_supports_bti() && (prot & PROT_BTI))
- return VM_ARM64_BTI;
+ ret |= VM_ARM64_BTI;
- return 0;
+ if (system_supports_mte() && (prot & PROT_MTE))
+ ret |= VM_MTE;
+
+ return ret;
}
#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
+static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+{
+ /*
+ * Only allow MTE on anonymous mappings as these are guaranteed to be
+ * backed by tags-capable memory. The vm_flags may be overridden by a
+ * filesystem supporting MTE (RAM-based).
+ */
+ if (system_supports_mte() && (flags & MAP_ANONYMOUS))
+ return VM_MTE_ALLOWED;
+
+ return 0;
+}
+#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+
static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
{
- return (vm_flags & VM_ARM64_BTI) ? __pgprot(PTE_GP) : __pgprot(0);
+ pteval_t prot = 0;
+
+ if (vm_flags & VM_ARM64_BTI)
+ prot |= PTE_GP;
+
+ /*
+ * There are two conditions required for returning a Normal Tagged
+ * memory type: (1) the user requested it via PROT_MTE passed to
+ * mmap() or mprotect() and (2) the corresponding vma supports MTE. We
+ * register (1) as VM_MTE in the vma->vm_flags and (2) as
+ * VM_MTE_ALLOWED. Note that the latter can only be set during the
+ * mmap() call since mprotect() does not accept MAP_* flags.
+ * Checking for VM_MTE only is sufficient since arch_validate_flags()
+ * does not permit (VM_MTE & !VM_MTE_ALLOWED).
+ */
+ if (vm_flags & VM_MTE)
+ prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED);
+
+ return __pgprot(prot);
}
#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
@@ -30,8 +67,21 @@ static inline bool arch_validate_prot(unsigned long prot,
if (system_supports_bti())
supported |= PROT_BTI;
+ if (system_supports_mte())
+ supported |= PROT_MTE;
+
return (prot & ~supported) == 0;
}
#define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr)
+static inline bool arch_validate_flags(unsigned long vm_flags)
+{
+ if (!system_supports_mte())
+ return true;
+
+ /* only allow VM_MTE if VM_MTE_ALLOWED has been set previously */
+ return !(vm_flags & VM_MTE) || (vm_flags & VM_MTE_ALLOWED);
+}
+#define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags)
+
#endif /* ! __ASM_MMAN_H__ */
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index a7a5ecaa2e83..b2e91c187e2a 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -17,11 +17,14 @@
#ifndef __ASSEMBLY__
+#include <linux/refcount.h>
+
typedef struct {
atomic64_t id;
#ifdef CONFIG_COMPAT
void *sigpage;
#endif
+ refcount_t pinned;
void *vdso;
unsigned long flags;
} mm_context_t;
@@ -45,7 +48,6 @@ struct bp_hardening_data {
bp_hardening_cb_t fn;
};
-#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
@@ -57,21 +59,13 @@ static inline void arm64_apply_bp_hardening(void)
{
struct bp_hardening_data *d;
- if (!cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR))
+ if (!cpus_have_const_cap(ARM64_SPECTRE_V2))
return;
d = arm64_get_bp_hardening_data();
if (d->fn)
d->fn();
}
-#else
-static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
-{
- return NULL;
-}
-
-static inline void arm64_apply_bp_hardening(void) { }
-#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */
extern void arm64_memblock_init(void);
extern void paging_init(void);
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index f2d7537d6f83..0672236e1aea 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -177,7 +177,13 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
#define destroy_context(mm) do { } while(0)
void check_and_switch_context(struct mm_struct *mm);
-#define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; })
+static inline int
+init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ atomic64_set(&mm->context.id, 0);
+ refcount_set(&mm->context.pinned, 0);
+ return 0;
+}
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
static inline void update_saved_ttbr0(struct task_struct *tsk,
@@ -248,6 +254,9 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
void verify_cpu_asid_bits(void);
void post_ttbr_update_workaround(void);
+unsigned long arm64_mm_context_get(struct mm_struct *mm);
+void arm64_mm_context_put(struct mm_struct *mm);
+
#endif /* !__ASSEMBLY__ */
#endif /* !__ASM_MMU_CONTEXT_H */
diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/include/asm/module.lds.h
index 22e36a21c113..691f15af788e 100644
--- a/arch/arm64/kernel/module.lds
+++ b/arch/arm64/include/asm/module.lds.h
@@ -1,5 +1,7 @@
+#ifdef CONFIG_ARM64_MODULE_PLTS
SECTIONS {
.plt (NOLOAD) : { BYTE(0) }
.init.plt (NOLOAD) : { BYTE(0) }
.text.ftrace_trampoline (NOLOAD) : { BYTE(0) }
}
+#endif
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
new file mode 100644
index 000000000000..1c99fcadb58c
--- /dev/null
+++ b/arch/arm64/include/asm/mte.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 ARM Ltd.
+ */
+#ifndef __ASM_MTE_H
+#define __ASM_MTE_H
+
+#define MTE_GRANULE_SIZE UL(16)
+#define MTE_GRANULE_MASK (~(MTE_GRANULE_SIZE - 1))
+#define MTE_TAG_SHIFT 56
+#define MTE_TAG_SIZE 4
+
+#ifndef __ASSEMBLY__
+
+#include <linux/page-flags.h>
+
+#include <asm/pgtable-types.h>
+
+void mte_clear_page_tags(void *addr);
+unsigned long mte_copy_tags_from_user(void *to, const void __user *from,
+ unsigned long n);
+unsigned long mte_copy_tags_to_user(void __user *to, void *from,
+ unsigned long n);
+int mte_save_tags(struct page *page);
+void mte_save_page_tags(const void *page_addr, void *tag_storage);
+bool mte_restore_tags(swp_entry_t entry, struct page *page);
+void mte_restore_page_tags(void *page_addr, const void *tag_storage);
+void mte_invalidate_tags(int type, pgoff_t offset);
+void mte_invalidate_tags_area(int type);
+void *mte_allocate_tag_storage(void);
+void mte_free_tag_storage(char *storage);
+
+#ifdef CONFIG_ARM64_MTE
+
+/* track which pages have valid allocation tags */
+#define PG_mte_tagged PG_arch_2
+
+void mte_sync_tags(pte_t *ptep, pte_t pte);
+void mte_copy_page_tags(void *kto, const void *kfrom);
+void flush_mte_state(void);
+void mte_thread_switch(struct task_struct *next);
+void mte_suspend_exit(void);
+long set_mte_ctrl(struct task_struct *task, unsigned long arg);
+long get_mte_ctrl(struct task_struct *task);
+int mte_ptrace_copy_tags(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data);
+
+#else
+
+/* unused if !CONFIG_ARM64_MTE, silence the compiler */
+#define PG_mte_tagged 0
+
+static inline void mte_sync_tags(pte_t *ptep, pte_t pte)
+{
+}
+static inline void mte_copy_page_tags(void *kto, const void *kfrom)
+{
+}
+static inline void flush_mte_state(void)
+{
+}
+static inline void mte_thread_switch(struct task_struct *next)
+{
+}
+static inline void mte_suspend_exit(void)
+{
+}
+static inline long set_mte_ctrl(struct task_struct *task, unsigned long arg)
+{
+ return 0;
+}
+static inline long get_mte_ctrl(struct task_struct *task)
+{
+ return 0;
+}
+static inline int mte_ptrace_copy_tags(struct task_struct *child,
+ long request, unsigned long addr,
+ unsigned long data)
+{
+ return -EIO;
+}
+
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* __ASM_MTE_H */
diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h
index 626ad01e83bf..dd870390d639 100644
--- a/arch/arm64/include/asm/numa.h
+++ b/arch/arm64/include/asm/numa.h
@@ -25,6 +25,9 @@ const struct cpumask *cpumask_of_node(int node);
/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
static inline const struct cpumask *cpumask_of_node(int node)
{
+ if (node == NUMA_NO_NODE)
+ return cpu_all_mask;
+
return node_to_cpumask_map[node];
}
#endif
diff --git a/arch/arm64/include/asm/page-def.h b/arch/arm64/include/asm/page-def.h
index f99d48ecbeef..2403f7b4cdbf 100644
--- a/arch/arm64/include/asm/page-def.h
+++ b/arch/arm64/include/asm/page-def.h
@@ -11,13 +11,8 @@
#include <linux/const.h>
/* PAGE_SHIFT determines the page size */
-/* CONT_SHIFT determines the number of pages which can be tracked together */
#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT
-#define CONT_SHIFT CONFIG_ARM64_CONT_SHIFT
#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
-#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT))
-#define CONT_MASK (~(CONT_SIZE-1))
-
#endif /* __ASM_PAGE_DEF_H */
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index c01b52add377..012cffc574e8 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -15,18 +15,25 @@
#include <linux/personality.h> /* for READ_IMPLIES_EXEC */
#include <asm/pgtable-types.h>
-extern void __cpu_clear_user_page(void *p, unsigned long user);
-extern void __cpu_copy_user_page(void *to, const void *from,
- unsigned long user);
+struct page;
+struct vm_area_struct;
+
extern void copy_page(void *to, const void *from);
extern void clear_page(void *to);
+void copy_user_highpage(struct page *to, struct page *from,
+ unsigned long vaddr, struct vm_area_struct *vma);
+#define __HAVE_ARCH_COPY_USER_HIGHPAGE
+
+void copy_highpage(struct page *to, struct page *from);
+#define __HAVE_ARCH_COPY_HIGHPAGE
+
#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
-#define clear_user_page(addr,vaddr,pg) __cpu_clear_user_page(addr, vaddr)
-#define copy_user_page(to,from,vaddr,pg) __cpu_copy_user_page(to, from, vaddr)
+#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
typedef struct page *pgtable_t;
@@ -36,7 +43,7 @@ extern int pfn_valid(unsigned long);
#endif /* !__ASSEMBLY__ */
-#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
+#define VM_DATA_DEFAULT_FLAGS (VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED)
#include <asm-generic/getorder.h>
diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h
index 70b323cf8300..b33ca260e3c9 100644
--- a/arch/arm64/include/asm/pci.h
+++ b/arch/arm64/include/asm/pci.h
@@ -17,6 +17,7 @@
#define pcibios_assign_all_busses() \
(pci_has_flag(PCI_REASSIGN_ALL_BUS))
+#define arch_can_pci_mmap_wc() 1
#define ARCH_GENERIC_PCI_MMAP_RESOURCE 1
extern int isa_dma_bridge_buggy;
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 0b6409b89e5e..1599e17379d8 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -19,7 +19,16 @@ static inline void set_my_cpu_offset(unsigned long off)
:: "r" (off) : "memory");
}
-static inline unsigned long __my_cpu_offset(void)
+static inline unsigned long __hyp_my_cpu_offset(void)
+{
+ /*
+ * Non-VHE hyp code runs with preemption disabled. No need to hazard
+ * the register access against barrier() as in __kern_my_cpu_offset.
+ */
+ return read_sysreg(tpidr_el2);
+}
+
+static inline unsigned long __kern_my_cpu_offset(void)
{
unsigned long off;
@@ -35,7 +44,12 @@ static inline unsigned long __my_cpu_offset(void)
return off;
}
-#define __my_cpu_offset __my_cpu_offset()
+
+#ifdef __KVM_NVHE_HYPERVISOR__
+#define __my_cpu_offset __hyp_my_cpu_offset()
+#else
+#define __my_cpu_offset __kern_my_cpu_offset()
+#endif
#define PERCPU_RW_OPS(sz) \
static inline unsigned long __percpu_read_##sz(void *ptr) \
@@ -227,4 +241,14 @@ PERCPU_RET_OP(add, add, ldadd)
#include <asm-generic/percpu.h>
+/* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. */
+#if defined(__KVM_NVHE_HYPERVISOR__) && defined(CONFIG_DEBUG_PREEMPT)
+#undef this_cpu_ptr
+#define this_cpu_ptr raw_cpu_ptr
+#undef __this_cpu_read
+#define __this_cpu_read raw_cpu_read
+#undef __this_cpu_write
+#define __this_cpu_write raw_cpu_write
+#endif
+
#endif /* __ASM_PERCPU_H */
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index 2c2d7dbe8a02..60731f602d3e 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -236,6 +236,9 @@
#define ARMV8_PMU_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */
#define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */
+/* PMMIR_EL1.SLOTS mask */
+#define ARMV8_PMU_SLOTS_MASK 0xff
+
#ifdef CONFIG_PERF_EVENTS
struct pt_regs;
extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index d400a4d9aee2..01a96d07ae74 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -81,25 +81,15 @@
/*
* Contiguous page definitions.
*/
-#ifdef CONFIG_ARM64_64K_PAGES
-#define CONT_PTE_SHIFT (5 + PAGE_SHIFT)
-#define CONT_PMD_SHIFT (5 + PMD_SHIFT)
-#elif defined(CONFIG_ARM64_16K_PAGES)
-#define CONT_PTE_SHIFT (7 + PAGE_SHIFT)
-#define CONT_PMD_SHIFT (5 + PMD_SHIFT)
-#else
-#define CONT_PTE_SHIFT (4 + PAGE_SHIFT)
-#define CONT_PMD_SHIFT (4 + PMD_SHIFT)
-#endif
-
+#define CONT_PTE_SHIFT (CONFIG_ARM64_CONT_PTE_SHIFT + PAGE_SHIFT)
#define CONT_PTES (1 << (CONT_PTE_SHIFT - PAGE_SHIFT))
#define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE)
#define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1))
+
+#define CONT_PMD_SHIFT (CONFIG_ARM64_CONT_PMD_SHIFT + PMD_SHIFT)
#define CONT_PMDS (1 << (CONT_PMD_SHIFT - PMD_SHIFT))
#define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE)
#define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1))
-/* the numerical offset of the PTE within a range of CONT_PTES */
-#define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1))
/*
* Hardware page table definitions.
@@ -156,7 +146,6 @@
#define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */
#define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */
#define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */
-#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */
#define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
#ifdef CONFIG_ARM64_PA_BITS_52
@@ -173,34 +162,11 @@
#define PTE_ATTRINDX_MASK (_AT(pteval_t, 7) << 2)
/*
- * 2nd stage PTE definitions
- */
-#define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */
-#define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */
-#define PTE_S2_XN (_AT(pteval_t, 2) << 53) /* XN[1:0] */
-#define PTE_S2_SW_RESVD (_AT(pteval_t, 15) << 55) /* Reserved for SW */
-
-#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */
-#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
-#define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */
-#define PMD_S2_SW_RESVD (_AT(pmdval_t, 15) << 55) /* Reserved for SW */
-
-#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */
-#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */
-#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */
-
-/*
* Memory Attribute override for Stage-2 (MemAttr[3:0])
*/
#define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2)
/*
- * EL2/HYP PTE/PMD definitions
- */
-#define PMD_HYP PMD_SECT_USER
-#define PTE_HYP PTE_USER
-
-/*
* Highest possible physical address supported.
*/
#define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS)
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 4d867c6446c4..046be789fbb4 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -19,6 +19,13 @@
#define PTE_DEVMAP (_AT(pteval_t, 1) << 57)
#define PTE_PROT_NONE (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */
+/*
+ * This bit indicates that the entry is present i.e. pmd_page()
+ * still points to a valid huge page in memory even if the pmd
+ * has been invalidated.
+ */
+#define PMD_PRESENT_INVALID (_AT(pteval_t, 1) << 59) /* only when !PMD_SECT_VALID */
+
#ifndef __ASSEMBLY__
#include <asm/cpufeature.h>
@@ -50,25 +57,21 @@ extern bool arm64_use_ng_mappings;
#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC))
#define PROT_NORMAL_WT (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_WT))
#define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL))
+#define PROT_NORMAL_TAGGED (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED))
#define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE))
#define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
#define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
#define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
-#define _HYP_PAGE_DEFAULT _PAGE_DEFAULT
#define PAGE_KERNEL __pgprot(PROT_NORMAL)
+#define PAGE_KERNEL_TAGGED __pgprot(PROT_NORMAL_TAGGED)
#define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
#define PAGE_KERNEL_ROX __pgprot((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY)
#define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN)
#define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
-#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
-#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
-#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
-#define PAGE_HYP_DEVICE __pgprot(_PROT_DEFAULT | PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_HYP | PTE_HYP_XN)
-
#define PAGE_S2_MEMATTR(attr) \
({ \
u64 __val; \
@@ -79,19 +82,6 @@ extern bool arm64_use_ng_mappings;
__val; \
})
-#define PAGE_S2_XN \
- ({ \
- u64 __val; \
- if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) \
- __val = 0; \
- else \
- __val = PTE_S2_XN; \
- __val; \
- })
-
-#define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
-#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
-
#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
/* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */
#define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index d5d3fbe73953..4ff12a7adcfd 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -9,6 +9,7 @@
#include <asm/proc-fns.h>
#include <asm/memory.h>
+#include <asm/mte.h>
#include <asm/pgtable-hwdef.h>
#include <asm/pgtable-prot.h>
#include <asm/tlbflush.h>
@@ -23,6 +24,8 @@
#define VMALLOC_START (MODULES_END)
#define VMALLOC_END (- PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
+#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))
+
#define FIRST_USER_ADDRESS 0UL
#ifndef __ASSEMBLY__
@@ -33,13 +36,6 @@
#include <linux/mm_types.h>
#include <linux/sched.h>
-extern struct page *vmemmap;
-
-extern void __pte_error(const char *file, int line, unsigned long val);
-extern void __pmd_error(const char *file, int line, unsigned long val);
-extern void __pud_error(const char *file, int line, unsigned long val);
-extern void __pgd_error(const char *file, int line, unsigned long val);
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
@@ -51,13 +47,22 @@ extern void __pgd_error(const char *file, int line, unsigned long val);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
+ * Outside of a few very special situations (e.g. hibernation), we always
+ * use broadcast TLB invalidation instructions, therefore a spurious page
+ * fault on one CPU which has been handled concurrently by another CPU
+ * does not need to perform additional invalidation.
+ */
+#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
+
+/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
*/
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
#define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page))
-#define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte))
+#define pte_ERROR(e) \
+ pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e))
/*
* Macros to convert between a physical address and its placement in a
@@ -90,6 +95,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
#define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN))
#define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT))
#define pte_devmap(pte) (!!(pte_val(pte) & PTE_DEVMAP))
+#define pte_tagged(pte) ((pte_val(pte) & PTE_ATTRINDX_MASK) == \
+ PTE_ATTRINDX(MT_NORMAL_TAGGED))
#define pte_cont_addr_end(addr, end) \
({ unsigned long __boundary = ((addr) + CONT_PTE_SIZE) & CONT_PTE_MASK; \
@@ -145,6 +152,18 @@ static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot)
return pte;
}
+static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot)
+{
+ pmd_val(pmd) &= ~pgprot_val(prot);
+ return pmd;
+}
+
+static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot)
+{
+ pmd_val(pmd) |= pgprot_val(prot);
+ return pmd;
+}
+
static inline pte_t pte_wrprotect(pte_t pte)
{
pte = clear_pte_bit(pte, __pgprot(PTE_WRITE));
@@ -284,6 +303,10 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte))
__sync_icache_dcache(pte);
+ if (system_supports_mte() &&
+ pte_present(pte) && pte_tagged(pte) && !pte_special(pte))
+ mte_sync_tags(ptep, pte);
+
__check_racy_pte_update(mm, ptep, pte);
set_pte(ptep, pte);
@@ -363,15 +386,24 @@ static inline int pmd_protnone(pmd_t pmd)
}
#endif
+#define pmd_present_invalid(pmd) (!!(pmd_val(pmd) & PMD_PRESENT_INVALID))
+
+static inline int pmd_present(pmd_t pmd)
+{
+ return pte_present(pmd_pte(pmd)) || pmd_present_invalid(pmd);
+}
+
/*
* THP definitions.
*/
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#define pmd_present(pmd) pte_present(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_valid(pmd) pte_valid(pmd_pte(pmd))
@@ -381,7 +413,14 @@ static inline int pmd_protnone(pmd_t pmd)
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
-#define pmd_mkinvalid(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
+
+static inline pmd_t pmd_mkinvalid(pmd_t pmd)
+{
+ pmd = set_pmd_bit(pmd, __pgprot(PMD_PRESENT_INVALID));
+ pmd = clear_pmd_bit(pmd, __pgprot(PMD_SECT_VALID));
+
+ return pmd;
+}
#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
@@ -541,7 +580,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
#if CONFIG_PGTABLE_LEVELS > 2
-#define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd))
+#define pmd_ERROR(e) \
+ pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e))
#define pud_none(pud) (!pud_val(pud))
#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT))
@@ -608,7 +648,8 @@ static inline unsigned long pud_page_vaddr(pud_t pud)
#if CONFIG_PGTABLE_LEVELS > 3
-#define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud))
+#define pud_ERROR(e) \
+ pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e))
#define p4d_none(p4d) (!p4d_val(p4d))
#define p4d_bad(p4d) (!(p4d_val(p4d) & 2))
@@ -667,15 +708,21 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d)
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
-#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd))
+#define pgd_ERROR(e) \
+ pr_err("%s:%d: bad pgd %016llx.\n", __FILE__, __LINE__, pgd_val(e))
#define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr))
#define pgd_clear_fixmap() clear_fixmap(FIX_PGD)
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
+ /*
+ * Normal and Normal-Tagged are two different memory types and indices
+ * in MAIR_EL1. The mask below has to include PTE_ATTRINDX_MASK.
+ */
const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY |
- PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP;
+ PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP |
+ PTE_ATTRINDX_MASK;
/* preserve the hardware dirty information */
if (pte_hw_dirty(pte))
pte = pte_mkdirty(pte);
@@ -847,6 +894,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(swp) ((pte_t) { (swp).val })
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) })
+#define __swp_entry_to_pmd(swp) __pmd((swp).val)
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
/*
* Ensure that there are not more swap files than can be encoded in the kernel
* PTEs.
@@ -855,6 +907,38 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
extern int kern_addr_valid(unsigned long addr);
+#ifdef CONFIG_ARM64_MTE
+
+#define __HAVE_ARCH_PREPARE_TO_SWAP
+static inline int arch_prepare_to_swap(struct page *page)
+{
+ if (system_supports_mte())
+ return mte_save_tags(page);
+ return 0;
+}
+
+#define __HAVE_ARCH_SWAP_INVALIDATE
+static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
+{
+ if (system_supports_mte())
+ mte_invalidate_tags(type, offset);
+}
+
+static inline void arch_swap_invalidate_area(int type)
+{
+ if (system_supports_mte())
+ mte_invalidate_tags_area(type);
+}
+
+#define __HAVE_ARCH_SWAP_RESTORE
+static inline void arch_swap_restore(swp_entry_t entry, struct page *page)
+{
+ if (system_supports_mte() && mte_restore_tags(entry, page))
+ set_bit(PG_mte_tagged, &page->flags);
+}
+
+#endif /* CONFIG_ARM64_MTE */
+
/*
* On AArch64, the cache coherency is handled via the set_pte_at() function.
*/
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 240fe5e5b720..fce8cbecd6bc 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -38,6 +38,7 @@
#include <asm/pgtable-hwdef.h>
#include <asm/pointer_auth.h>
#include <asm/ptrace.h>
+#include <asm/spectre.h>
#include <asm/types.h>
/*
@@ -151,6 +152,10 @@ struct thread_struct {
struct ptrauth_keys_user keys_user;
struct ptrauth_keys_kernel keys_kernel;
#endif
+#ifdef CONFIG_ARM64_MTE
+ u64 sctlr_tcf0;
+ u64 gcr_user_incl;
+#endif
};
static inline void arch_thread_struct_whitelist(unsigned long *offset,
@@ -197,40 +202,15 @@ static inline void start_thread_common(struct pt_regs *regs, unsigned long pc)
regs->pmr_save = GIC_PRIO_IRQON;
}
-static inline void set_ssbs_bit(struct pt_regs *regs)
-{
- regs->pstate |= PSR_SSBS_BIT;
-}
-
-static inline void set_compat_ssbs_bit(struct pt_regs *regs)
-{
- regs->pstate |= PSR_AA32_SSBS_BIT;
-}
-
static inline void start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
{
start_thread_common(regs, pc);
regs->pstate = PSR_MODE_EL0t;
-
- if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
- set_ssbs_bit(regs);
-
+ spectre_v4_enable_task_mitigation(current);
regs->sp = sp;
}
-static inline bool is_ttbr0_addr(unsigned long addr)
-{
- /* entry assembly clears tags for TTBR0 addrs */
- return addr < TASK_SIZE;
-}
-
-static inline bool is_ttbr1_addr(unsigned long addr)
-{
- /* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
- return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
-}
-
#ifdef CONFIG_COMPAT
static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
@@ -244,13 +224,23 @@ static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
regs->pstate |= PSR_AA32_E_BIT;
#endif
- if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
- set_compat_ssbs_bit(regs);
-
+ spectre_v4_enable_task_mitigation(current);
regs->compat_sp = sp;
}
#endif
+static inline bool is_ttbr0_addr(unsigned long addr)
+{
+ /* entry assembly clears tags for TTBR0 addrs */
+ return addr < TASK_SIZE;
+}
+
+static inline bool is_ttbr1_addr(unsigned long addr)
+{
+ /* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
+ return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
+}
+
/* Forward declaration, a strange C thing */
struct task_struct;
@@ -315,10 +305,10 @@ extern void __init minsigstksz_setup(void);
#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
/* PR_{SET,GET}_TAGGED_ADDR_CTRL prctl */
-long set_tagged_addr_ctrl(unsigned long arg);
-long get_tagged_addr_ctrl(void);
-#define SET_TAGGED_ADDR_CTRL(arg) set_tagged_addr_ctrl(arg)
-#define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl()
+long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg);
+long get_tagged_addr_ctrl(struct task_struct *task);
+#define SET_TAGGED_ADDR_CTRL(arg) set_tagged_addr_ctrl(current, arg)
+#define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl(current)
#endif
/*
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 966ed30ed5f7..997cf8c8cd52 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -31,9 +31,21 @@
* interrupt disabling temporarily does not rely on IRQ priorities.
*/
#define GIC_PRIO_IRQON 0xe0
-#define GIC_PRIO_IRQOFF (GIC_PRIO_IRQON & ~0x80)
+#define __GIC_PRIO_IRQOFF (GIC_PRIO_IRQON & ~0x80)
+#define __GIC_PRIO_IRQOFF_NS 0xa0
#define GIC_PRIO_PSR_I_SET (1 << 4)
+#define GIC_PRIO_IRQOFF \
+ ({ \
+ extern struct static_key_false gic_nonsecure_priorities;\
+ u8 __prio = __GIC_PRIO_IRQOFF; \
+ \
+ if (static_branch_unlikely(&gic_nonsecure_priorities)) \
+ __prio = __GIC_PRIO_IRQOFF_NS; \
+ \
+ __prio; \
+ })
+
/* Additional SPSR bits not exposed in the UABI */
#define PSR_MODE_THREAD_BIT (1 << 0)
#define PSR_IL_BIT (1 << 20)
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 0eadbf933e35..2e7f529ec5a6 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -56,27 +56,15 @@ static inline void set_cpu_logical_map(int cpu, u64 hwid)
struct seq_file;
/*
- * generate IPI list text
- */
-extern void show_ipi_list(struct seq_file *p, int prec);
-
-/*
- * Called from C code, this handles an IPI.
- */
-extern void handle_IPI(int ipinr, struct pt_regs *regs);
-
-/*
* Discover the set of possible CPUs and determine their
* SMP operations.
*/
extern void smp_init_cpus(void);
/*
- * Provide a function to raise an IPI cross call on CPUs in callmap.
+ * Register IPI interrupts with the arch SMP code
*/
-extern void set_smp_cross_call(void (*)(const struct cpumask *, unsigned int));
-
-extern void (*__smp_cross_call)(const struct cpumask *, unsigned int);
+extern void set_smp_ipi_range(int ipi_base, int nr_ipi);
/*
* Called from the secondary holding pen, this is the secondary CPU entry point.
diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h
new file mode 100644
index 000000000000..fcdfbce302bd
--- /dev/null
+++ b/arch/arm64/include/asm/spectre.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Interface for managing mitigations for Spectre vulnerabilities.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#ifndef __ASM_SPECTRE_H
+#define __ASM_SPECTRE_H
+
+#include <asm/cpufeature.h>
+
+/* Watch out, ordering is important here. */
+enum mitigation_state {
+ SPECTRE_UNAFFECTED,
+ SPECTRE_MITIGATED,
+ SPECTRE_VULNERABLE,
+};
+
+struct task_struct;
+
+enum mitigation_state arm64_get_spectre_v2_state(void);
+bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope);
+void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
+
+enum mitigation_state arm64_get_spectre_v4_state(void);
+bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope);
+void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
+void spectre_v4_enable_task_mitigation(struct task_struct *tsk);
+
+#endif /* __ASM_SPECTRE_H */
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index fc7613023c19..eb29b1fe8255 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -63,7 +63,7 @@ struct stackframe {
extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
- int (*fn)(struct stackframe *, void *), void *data);
+ bool (*fn)(void *, unsigned long), void *data);
extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
const char *loglvl);
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index 996bf98f0cab..fe341a6578c3 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
@@ -8,7 +8,6 @@
#ifndef __ARM64_S2_PGTABLE_H_
#define __ARM64_S2_PGTABLE_H_
-#include <linux/hugetlb.h>
#include <linux/pgtable.h>
/*
@@ -37,217 +36,12 @@
#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1)
/*
- * The number of PTRS across all concatenated stage2 tables given by the
- * number of bits resolved at the initial level.
- * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
- * in which case, stage2_pgd_ptrs will have one entry.
- */
-#define pgd_ptrs_shift(ipa, pgdir_shift) \
- ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
-#define __s2_pgd_ptrs(ipa, lvls) \
- (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
-#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
-
-#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
-#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
-
-/*
* kvm_mmmu_cache_min_pages() is the number of pages required to install
* a stage-2 translation. We pre-allocate the entry level page table at
* the VM creation.
*/
#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1)
-/* Stage2 PUD definitions when the level is present */
-static inline bool kvm_stage2_has_pud(struct kvm *kvm)
-{
- return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
-}
-
-#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
-#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT)
-#define S2_PUD_MASK (~(S2_PUD_SIZE - 1))
-
-#define stage2_pgd_none(kvm, pgd) pgd_none(pgd)
-#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd)
-#define stage2_pgd_present(kvm, pgd) pgd_present(pgd)
-#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d)
-
-static inline p4d_t *stage2_p4d_offset(struct kvm *kvm,
- pgd_t *pgd, unsigned long address)
-{
- return p4d_offset(pgd, address);
-}
-
-static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d)
-{
-}
-
-static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp)
-{
- return false;
-}
-
-static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm,
- phys_addr_t addr, phys_addr_t end)
-{
- return end;
-}
-
-static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d)
-{
- if (kvm_stage2_has_pud(kvm))
- return p4d_none(p4d);
- else
- return 0;
-}
-
-static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp)
-{
- if (kvm_stage2_has_pud(kvm))
- p4d_clear(p4dp);
-}
-
-static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d)
-{
- if (kvm_stage2_has_pud(kvm))
- return p4d_present(p4d);
- else
- return 1;
-}
-
-static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud)
-{
- if (kvm_stage2_has_pud(kvm))
- p4d_populate(NULL, p4d, pud);
-}
-
-static inline pud_t *stage2_pud_offset(struct kvm *kvm,
- p4d_t *p4d, unsigned long address)
-{
- if (kvm_stage2_has_pud(kvm))
- return pud_offset(p4d, address);
- else
- return (pud_t *)p4d;
-}
-
-static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
-{
- if (kvm_stage2_has_pud(kvm))
- free_page((unsigned long)pud);
-}
-
-static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
-{
- if (kvm_stage2_has_pud(kvm))
- return kvm_page_empty(pudp);
- else
- return false;
-}
-
-static inline phys_addr_t
-stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
- if (kvm_stage2_has_pud(kvm)) {
- phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
-
- return (boundary - 1 < end - 1) ? boundary : end;
- } else {
- return end;
- }
-}
-
-/* Stage2 PMD definitions when the level is present */
-static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
-{
- return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
-}
-
-#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
-#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT)
-#define S2_PMD_MASK (~(S2_PMD_SIZE - 1))
-
-static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pud_none(pud);
- else
- return 0;
-}
-
-static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- pud_clear(pud);
-}
-
-static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pud_present(pud);
- else
- return 1;
-}
-
-static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
-{
- if (kvm_stage2_has_pmd(kvm))
- pud_populate(NULL, pud, pmd);
-}
-
-static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
- pud_t *pud, unsigned long address)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pmd_offset(pud, address);
- else
- return (pmd_t *)pud;
-}
-
-static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
-{
- if (kvm_stage2_has_pmd(kvm))
- free_page((unsigned long)pmd);
-}
-
-static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
-{
- if (kvm_stage2_has_pmd(kvm))
- return pud_huge(pud);
- else
- return 0;
-}
-
-static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
-{
- if (kvm_stage2_has_pmd(kvm))
- return kvm_page_empty(pmdp);
- else
- return 0;
-}
-
-static inline phys_addr_t
-stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
- if (kvm_stage2_has_pmd(kvm)) {
- phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
-
- return (boundary - 1 < end - 1) ? boundary : end;
- } else {
- return end;
- }
-}
-
-static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
-{
- return kvm_page_empty(ptep);
-}
-
-static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
-{
- return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
-}
-
static inline phys_addr_t
stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
@@ -256,13 +50,4 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
return (boundary - 1 < end - 1) ? boundary : end;
}
-/*
- * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and
- * the architectural page-table level.
- */
-#define S2_NO_LEVEL_HINT 0
-#define S2_PUD_LEVEL 1
-#define S2_PMD_LEVEL 2
-#define S2_PTE_LEVEL 3
-
#endif /* __ARM64_S2_PGTABLE_H_ */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 554a7e8ecb07..d52c1b3ce589 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -91,10 +91,12 @@
#define PSTATE_PAN pstate_field(0, 4)
#define PSTATE_UAO pstate_field(0, 3)
#define PSTATE_SSBS pstate_field(3, 1)
+#define PSTATE_TCO pstate_field(3, 4)
#define SET_PSTATE_PAN(x) __emit_inst(0xd500401f | PSTATE_PAN | ((!!x) << PSTATE_Imm_shift))
#define SET_PSTATE_UAO(x) __emit_inst(0xd500401f | PSTATE_UAO | ((!!x) << PSTATE_Imm_shift))
#define SET_PSTATE_SSBS(x) __emit_inst(0xd500401f | PSTATE_SSBS | ((!!x) << PSTATE_Imm_shift))
+#define SET_PSTATE_TCO(x) __emit_inst(0xd500401f | PSTATE_TCO | ((!!x) << PSTATE_Imm_shift))
#define __SYS_BARRIER_INSN(CRm, op2, Rt) \
__emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f))
@@ -181,6 +183,8 @@
#define SYS_SCTLR_EL1 sys_reg(3, 0, 1, 0, 0)
#define SYS_ACTLR_EL1 sys_reg(3, 0, 1, 0, 1)
#define SYS_CPACR_EL1 sys_reg(3, 0, 1, 0, 2)
+#define SYS_RGSR_EL1 sys_reg(3, 0, 1, 0, 5)
+#define SYS_GCR_EL1 sys_reg(3, 0, 1, 0, 6)
#define SYS_ZCR_EL1 sys_reg(3, 0, 1, 2, 0)
@@ -218,6 +222,8 @@
#define SYS_ERXADDR_EL1 sys_reg(3, 0, 5, 4, 3)
#define SYS_ERXMISC0_EL1 sys_reg(3, 0, 5, 5, 0)
#define SYS_ERXMISC1_EL1 sys_reg(3, 0, 5, 5, 1)
+#define SYS_TFSR_EL1 sys_reg(3, 0, 5, 6, 0)
+#define SYS_TFSRE0_EL1 sys_reg(3, 0, 5, 6, 1)
#define SYS_FAR_EL1 sys_reg(3, 0, 6, 0, 0)
#define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0)
@@ -321,6 +327,8 @@
#define SYS_PMINTENSET_EL1 sys_reg(3, 0, 9, 14, 1)
#define SYS_PMINTENCLR_EL1 sys_reg(3, 0, 9, 14, 2)
+#define SYS_PMMIR_EL1 sys_reg(3, 0, 9, 14, 6)
+
#define SYS_MAIR_EL1 sys_reg(3, 0, 10, 2, 0)
#define SYS_AMAIR_EL1 sys_reg(3, 0, 10, 3, 0)
@@ -368,6 +376,7 @@
#define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0)
#define SYS_CLIDR_EL1 sys_reg(3, 1, 0, 0, 1)
+#define SYS_GMID_EL1 sys_reg(3, 1, 0, 0, 4)
#define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7)
#define SYS_CSSELR_EL1 sys_reg(3, 2, 0, 0, 0)
@@ -460,6 +469,7 @@
#define SYS_ESR_EL2 sys_reg(3, 4, 5, 2, 0)
#define SYS_VSESR_EL2 sys_reg(3, 4, 5, 2, 3)
#define SYS_FPEXC32_EL2 sys_reg(3, 4, 5, 3, 0)
+#define SYS_TFSR_EL2 sys_reg(3, 4, 5, 6, 0)
#define SYS_FAR_EL2 sys_reg(3, 4, 6, 0, 0)
#define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1)
@@ -516,6 +526,7 @@
#define SYS_AFSR0_EL12 sys_reg(3, 5, 5, 1, 0)
#define SYS_AFSR1_EL12 sys_reg(3, 5, 5, 1, 1)
#define SYS_ESR_EL12 sys_reg(3, 5, 5, 2, 0)
+#define SYS_TFSR_EL12 sys_reg(3, 5, 5, 6, 0)
#define SYS_FAR_EL12 sys_reg(3, 5, 6, 0, 0)
#define SYS_MAIR_EL12 sys_reg(3, 5, 10, 2, 0)
#define SYS_AMAIR_EL12 sys_reg(3, 5, 10, 3, 0)
@@ -531,6 +542,15 @@
/* Common SCTLR_ELx flags. */
#define SCTLR_ELx_DSSBS (BIT(44))
+#define SCTLR_ELx_ATA (BIT(43))
+
+#define SCTLR_ELx_TCF_SHIFT 40
+#define SCTLR_ELx_TCF_NONE (UL(0x0) << SCTLR_ELx_TCF_SHIFT)
+#define SCTLR_ELx_TCF_SYNC (UL(0x1) << SCTLR_ELx_TCF_SHIFT)
+#define SCTLR_ELx_TCF_ASYNC (UL(0x2) << SCTLR_ELx_TCF_SHIFT)
+#define SCTLR_ELx_TCF_MASK (UL(0x3) << SCTLR_ELx_TCF_SHIFT)
+
+#define SCTLR_ELx_ITFSB (BIT(37))
#define SCTLR_ELx_ENIA (BIT(31))
#define SCTLR_ELx_ENIB (BIT(30))
#define SCTLR_ELx_ENDA (BIT(27))
@@ -559,6 +579,14 @@
#endif
/* SCTLR_EL1 specific flags. */
+#define SCTLR_EL1_ATA0 (BIT(42))
+
+#define SCTLR_EL1_TCF0_SHIFT 38
+#define SCTLR_EL1_TCF0_NONE (UL(0x0) << SCTLR_EL1_TCF0_SHIFT)
+#define SCTLR_EL1_TCF0_SYNC (UL(0x1) << SCTLR_EL1_TCF0_SHIFT)
+#define SCTLR_EL1_TCF0_ASYNC (UL(0x2) << SCTLR_EL1_TCF0_SHIFT)
+#define SCTLR_EL1_TCF0_MASK (UL(0x3) << SCTLR_EL1_TCF0_SHIFT)
+
#define SCTLR_EL1_BT1 (BIT(36))
#define SCTLR_EL1_BT0 (BIT(35))
#define SCTLR_EL1_UCI (BIT(26))
@@ -587,6 +615,7 @@
SCTLR_EL1_SA0 | SCTLR_EL1_SED | SCTLR_ELx_I |\
SCTLR_EL1_DZE | SCTLR_EL1_UCT |\
SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN |\
+ SCTLR_ELx_ITFSB| SCTLR_ELx_ATA | SCTLR_EL1_ATA0 |\
ENDIAN_SET_EL1 | SCTLR_EL1_UCI | SCTLR_EL1_RES1)
/* MAIR_ELx memory attributes (used by Linux) */
@@ -595,6 +624,7 @@
#define MAIR_ATTR_DEVICE_GRE UL(0x0c)
#define MAIR_ATTR_NORMAL_NC UL(0x44)
#define MAIR_ATTR_NORMAL_WT UL(0xbb)
+#define MAIR_ATTR_NORMAL_TAGGED UL(0xf0)
#define MAIR_ATTR_NORMAL UL(0xff)
#define MAIR_ATTR_MASK UL(0xff)
@@ -636,14 +666,22 @@
#define ID_AA64ISAR1_APA_SHIFT 4
#define ID_AA64ISAR1_DPB_SHIFT 0
-#define ID_AA64ISAR1_APA_NI 0x0
-#define ID_AA64ISAR1_APA_ARCHITECTED 0x1
-#define ID_AA64ISAR1_API_NI 0x0
-#define ID_AA64ISAR1_API_IMP_DEF 0x1
-#define ID_AA64ISAR1_GPA_NI 0x0
-#define ID_AA64ISAR1_GPA_ARCHITECTED 0x1
-#define ID_AA64ISAR1_GPI_NI 0x0
-#define ID_AA64ISAR1_GPI_IMP_DEF 0x1
+#define ID_AA64ISAR1_APA_NI 0x0
+#define ID_AA64ISAR1_APA_ARCHITECTED 0x1
+#define ID_AA64ISAR1_APA_ARCH_EPAC 0x2
+#define ID_AA64ISAR1_APA_ARCH_EPAC2 0x3
+#define ID_AA64ISAR1_APA_ARCH_EPAC2_FPAC 0x4
+#define ID_AA64ISAR1_APA_ARCH_EPAC2_FPAC_CMB 0x5
+#define ID_AA64ISAR1_API_NI 0x0
+#define ID_AA64ISAR1_API_IMP_DEF 0x1
+#define ID_AA64ISAR1_API_IMP_DEF_EPAC 0x2
+#define ID_AA64ISAR1_API_IMP_DEF_EPAC2 0x3
+#define ID_AA64ISAR1_API_IMP_DEF_EPAC2_FPAC 0x4
+#define ID_AA64ISAR1_API_IMP_DEF_EPAC2_FPAC_CMB 0x5
+#define ID_AA64ISAR1_GPA_NI 0x0
+#define ID_AA64ISAR1_GPA_ARCHITECTED 0x1
+#define ID_AA64ISAR1_GPI_NI 0x0
+#define ID_AA64ISAR1_GPI_IMP_DEF 0x1
/* id_aa64pfr0 */
#define ID_AA64PFR0_CSV3_SHIFT 60
@@ -686,6 +724,10 @@
#define ID_AA64PFR1_SSBS_PSTATE_INSNS 2
#define ID_AA64PFR1_BT_BTI 0x1
+#define ID_AA64PFR1_MTE_NI 0x0
+#define ID_AA64PFR1_MTE_EL0 0x1
+#define ID_AA64PFR1_MTE 0x2
+
/* id_aa64zfr0 */
#define ID_AA64ZFR0_F64MM_SHIFT 56
#define ID_AA64ZFR0_F32MM_SHIFT 52
@@ -920,6 +962,28 @@
#define CPACR_EL1_ZEN_EL0EN (BIT(17)) /* enable EL0 access, if EL1EN set */
#define CPACR_EL1_ZEN (CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN)
+/* TCR EL1 Bit Definitions */
+#define SYS_TCR_EL1_TCMA1 (BIT(58))
+#define SYS_TCR_EL1_TCMA0 (BIT(57))
+
+/* GCR_EL1 Definitions */
+#define SYS_GCR_EL1_RRND (BIT(16))
+#define SYS_GCR_EL1_EXCL_MASK 0xffffUL
+
+/* RGSR_EL1 Definitions */
+#define SYS_RGSR_EL1_TAG_MASK 0xfUL
+#define SYS_RGSR_EL1_SEED_SHIFT 8
+#define SYS_RGSR_EL1_SEED_MASK 0xffffUL
+
+/* GMID_EL1 field definitions */
+#define SYS_GMID_EL1_BS_SHIFT 0
+#define SYS_GMID_EL1_BS_SIZE 4
+
+/* TFSR{,E0}_EL1 bit definitions */
+#define SYS_TFSR_EL1_TF0_SHIFT 0
+#define SYS_TFSR_EL1_TF1_SHIFT 1
+#define SYS_TFSR_EL1_TF0 (UL(1) << SYS_TFSR_EL1_TF0_SHIFT)
+#define SYS_TFSR_EL1_TF1 (UK(2) << SYS_TFSR_EL1_TF1_SHIFT)
/* Safe value for MPIDR_EL1: Bit31:RES1, Bit30:U:0, Bit24:MT:0 */
#define SYS_MPIDR_SAFE_VAL (BIT(31))
@@ -1024,6 +1088,13 @@
write_sysreg(__scs_new, sysreg); \
} while (0)
+#define sysreg_clear_set_s(sysreg, clear, set) do { \
+ u64 __scs_val = read_sysreg_s(sysreg); \
+ u64 __scs_new = (__scs_val & ~(u64)(clear)) | (set); \
+ if (__scs_new != __scs_val) \
+ write_sysreg_s(__scs_new, sysreg); \
+} while (0)
+
#endif
#endif /* __ASM_SYSREG_H */
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 5e784e16ee89..1fbab854a51b 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -67,6 +67,7 @@ void arch_release_task_struct(struct task_struct *tsk);
#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
#define TIF_FSCHECK 5 /* Check FS is USER_DS on return */
+#define TIF_MTE_ASYNC_FAULT 6 /* MTE Asynchronous Tag Check Fault */
#define TIF_SYSCALL_TRACE 8 /* syscall trace active */
#define TIF_SYSCALL_AUDIT 9 /* syscall auditing */
#define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */
@@ -96,10 +97,11 @@ void arch_release_task_struct(struct task_struct *tsk);
#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
#define _TIF_32BIT (1 << TIF_32BIT)
#define _TIF_SVE (1 << TIF_SVE)
+#define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT)
#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
- _TIF_UPROBE | _TIF_FSCHECK)
+ _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT)
#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index e042f6527981..11a465243f66 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -26,7 +26,9 @@ void topology_scale_freq_tick(void);
#endif /* CONFIG_ARM64_AMU_EXTN */
/* Replace task scheduler's default frequency-invariant accounting */
+#define arch_set_freq_scale topology_set_freq_scale
#define arch_scale_freq_capacity topology_get_freq_scale
+#define arch_scale_freq_invariant topology_scale_freq_invariant
/* Replace task scheduler's default cpu-invariant accounting */
#define arch_scale_cpu_capacity topology_get_cpu_scale
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index cee5928e1b7d..d96dc2c7c09d 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -24,7 +24,7 @@ struct undef_hook {
void register_undef_hook(struct undef_hook *hook);
void unregister_undef_hook(struct undef_hook *hook);
-void force_signal_inject(int signal, int code, unsigned long address);
+void force_signal_inject(int signal, int code, unsigned long address, unsigned int err);
void arm64_notify_segfault(unsigned long addr);
void arm64_force_sig_fault(int signo, int code, void __user *addr, const char *str);
void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, const char *str);
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 3b859596840d..b3b2019f8d16 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 440
+#define __NR_compat_syscalls 441
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 734860ac7cf9..107f08e03b9f 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -53,7 +53,7 @@ __SYSCALL(__NR_lseek, compat_sys_lseek)
#define __NR_getpid 20
__SYSCALL(__NR_getpid, sys_getpid)
#define __NR_mount 21
-__SYSCALL(__NR_mount, compat_sys_mount)
+__SYSCALL(__NR_mount, sys_mount)
/* 22 was sys_umount */
__SYSCALL(22, sys_ni_syscall)
#define __NR_setuid 23
@@ -301,9 +301,9 @@ __SYSCALL(__NR_flock, sys_flock)
#define __NR_msync 144
__SYSCALL(__NR_msync, sys_msync)
#define __NR_readv 145
-__SYSCALL(__NR_readv, compat_sys_readv)
+__SYSCALL(__NR_readv, sys_readv)
#define __NR_writev 146
-__SYSCALL(__NR_writev, compat_sys_writev)
+__SYSCALL(__NR_writev, sys_writev)
#define __NR_getsid 147
__SYSCALL(__NR_getsid, sys_getsid)
#define __NR_fdatasync 148
@@ -697,7 +697,7 @@ __SYSCALL(__NR_sync_file_range2, compat_sys_aarch32_sync_file_range2)
#define __NR_tee 342
__SYSCALL(__NR_tee, sys_tee)
#define __NR_vmsplice 343
-__SYSCALL(__NR_vmsplice, compat_sys_vmsplice)
+__SYSCALL(__NR_vmsplice, sys_vmsplice)
#define __NR_move_pages 344
__SYSCALL(__NR_move_pages, compat_sys_move_pages)
#define __NR_getcpu 345
@@ -763,9 +763,9 @@ __SYSCALL(__NR_sendmmsg, compat_sys_sendmmsg)
#define __NR_setns 375
__SYSCALL(__NR_setns, sys_setns)
#define __NR_process_vm_readv 376
-__SYSCALL(__NR_process_vm_readv, compat_sys_process_vm_readv)
+__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
#define __NR_process_vm_writev 377
-__SYSCALL(__NR_process_vm_writev, compat_sys_process_vm_writev)
+__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
#define __NR_kcmp 378
__SYSCALL(__NR_kcmp, sys_kcmp)
#define __NR_finit_module 379
@@ -887,6 +887,8 @@ __SYSCALL(__NR_openat2, sys_openat2)
__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
#define __NR_faccessat2 439
__SYSCALL(__NR_faccessat2, sys_faccessat2)
+#define __NR_process_madvise 440
+__SYSCALL(__NR_process_madvise, sys_process_madvise)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/include/asm/xen/page.h b/arch/arm64/include/asm/xen/page.h
index 31bbc803cecb..dffdc773221b 100644
--- a/arch/arm64/include/asm/xen/page.h
+++ b/arch/arm64/include/asm/xen/page.h
@@ -1 +1,7 @@
#include <xen/arm/page.h>
+#include <asm/mmu.h>
+
+static inline bool xen_kernel_unmapped_at_usr(void)
+{
+ return arm64_kernel_unmapped_at_el0();
+}
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 912162f73529..b8f41aa234ee 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -74,6 +74,6 @@
#define HWCAP2_DGH (1 << 15)
#define HWCAP2_RNG (1 << 16)
#define HWCAP2_BTI (1 << 17)
-/* reserved for HWCAP2_MTE (1 << 18) */
+#define HWCAP2_MTE (1 << 18)
#endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index ba85bb23f060..1c17c3a24411 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -159,6 +159,21 @@ struct kvm_sync_regs {
struct kvm_arch_memory_slot {
};
+/*
+ * PMU filter structure. Describe a range of events with a particular
+ * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER.
+ */
+struct kvm_pmu_event_filter {
+ __u16 base_event;
+ __u16 nevents;
+
+#define KVM_PMU_EVENT_ALLOW 0
+#define KVM_PMU_EVENT_DENY 1
+
+ __u8 action;
+ __u8 pad[3];
+};
+
/* for KVM_GET/SET_VCPU_EVENTS */
struct kvm_vcpu_events {
struct {
@@ -242,6 +257,15 @@ struct kvm_vcpu_events {
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL 0
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL 1
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED 2
+
+/*
+ * Only two states can be presented by the host kernel:
+ * - NOT_REQUIRED: the guest doesn't need to do anything
+ * - NOT_AVAIL: the guest isn't mitigated (it can still use SSBS if available)
+ *
+ * All the other values are deprecated. The host still accepts all
+ * values (they are ABI), but will narrow them to the above two.
+ */
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 KVM_REG_ARM_FW_REG(2)
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL 0
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN 1
@@ -329,6 +353,7 @@ struct kvm_vcpu_events {
#define KVM_ARM_VCPU_PMU_V3_CTRL 0
#define KVM_ARM_VCPU_PMU_V3_IRQ 0
#define KVM_ARM_VCPU_PMU_V3_INIT 1
+#define KVM_ARM_VCPU_PMU_V3_FILTER 2
#define KVM_ARM_VCPU_TIMER_CTRL 1
#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
diff --git a/arch/arm64/include/uapi/asm/mman.h b/arch/arm64/include/uapi/asm/mman.h
index 6fdd71eb644f..1e6482a838e1 100644
--- a/arch/arm64/include/uapi/asm/mman.h
+++ b/arch/arm64/include/uapi/asm/mman.h
@@ -5,5 +5,6 @@
#include <asm-generic/mman.h>
#define PROT_BTI 0x10 /* BTI guarded page */
+#define PROT_MTE 0x20 /* Normal Tagged mapping */
#endif /* ! _UAPI__ASM_MMAN_H */
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index 42cbe34d95ce..758ae984ff97 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -51,6 +51,7 @@
#define PSR_PAN_BIT 0x00400000
#define PSR_UAO_BIT 0x00800000
#define PSR_DIT_BIT 0x01000000
+#define PSR_TCO_BIT 0x02000000
#define PSR_V_BIT 0x10000000
#define PSR_C_BIT 0x20000000
#define PSR_Z_BIT 0x40000000
@@ -75,6 +76,9 @@
/* syscall emulation path in ptrace */
#define PTRACE_SYSEMU 31
#define PTRACE_SYSEMU_SINGLESTEP 32
+/* MTE allocation tag access */
+#define PTRACE_PEEKMTETAGS 33
+#define PTRACE_POKEMTETAGS 34
#ifndef __ASSEMBLY__
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index a561cbb91d4d..bbaf0bc4ad60 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -3,8 +3,6 @@
# Makefile for the linux kernel.
#
-CPPFLAGS_vmlinux.lds := -DTEXT_OFFSET=$(TEXT_OFFSET)
-AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET)
CFLAGS_armv8_deprecated.o := -I$(src)
CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
@@ -19,7 +17,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
return_address.o cpuinfo.o cpu_errata.o \
cpufeature.o alternative.o cacheinfo.o \
smp.o smp_spin_table.o topology.o smccc-call.o \
- syscall.o
+ syscall.o proton-pack.o
targets += efi-entry.o
@@ -59,9 +57,9 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o
-obj-$(CONFIG_ARM64_SSBD) += ssbd.o
obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
+obj-$(CONFIG_ARM64_MTE) += mte.o
obj-y += vdso/ probes/
obj-$(CONFIG_COMPAT_VDSO) += vdso32/
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index a85174d05473..cada0b816c8a 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -298,8 +298,21 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
case EFI_BOOT_SERVICES_DATA:
case EFI_CONVENTIONAL_MEMORY:
case EFI_PERSISTENT_MEMORY:
- pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys);
- return NULL;
+ if (memblock_is_map_memory(phys) ||
+ !memblock_is_region_memory(phys, size)) {
+ pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys);
+ return NULL;
+ }
+ /*
+ * Mapping kernel memory is permitted if the region in
+ * question is covered by a single memblock with the
+ * NOMAP attribute set: this enables the use of ACPI
+ * table overrides passed via initramfs, which are
+ * reserved in memory using arch_reserve_mem_area()
+ * below. As this particular use case only requires
+ * read access, fall through to the R/O mapping case.
+ */
+ fallthrough;
case EFI_RUNTIME_SERVICES_CODE:
/*
@@ -388,3 +401,8 @@ int apei_claim_sea(struct pt_regs *regs)
return err;
}
+
+void arch_reserve_mem_area(acpi_physical_address addr, size_t size)
+{
+ memblock_mark_nomap(addr, size);
+}
diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S
index 4a18055b2ff9..37721eb6f9a1 100644
--- a/arch/arm64/kernel/cpu-reset.S
+++ b/arch/arm64/kernel/cpu-reset.S
@@ -35,6 +35,10 @@ SYM_CODE_START(__cpu_soft_restart)
mov_q x13, SCTLR_ELx_FLAGS
bic x12, x12, x13
pre_disable_mmu_workaround
+ /*
+ * either disable EL1&0 translation regime or disable EL2&0 translation
+ * regime if HCR_EL2.E2H == 1
+ */
msr sctlr_el1, x12
isb
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index c332d49780dc..24d75af344b1 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -106,365 +106,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap)
sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
}
-atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
-
-#include <asm/mmu_context.h>
-#include <asm/cacheflush.h>
-
-DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
-
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
-static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
- const char *hyp_vecs_end)
-{
- void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K);
- int i;
-
- for (i = 0; i < SZ_2K; i += 0x80)
- memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
-
- __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
-}
-
-static void install_bp_hardening_cb(bp_hardening_cb_t fn,
- const char *hyp_vecs_start,
- const char *hyp_vecs_end)
-{
- static DEFINE_RAW_SPINLOCK(bp_lock);
- int cpu, slot = -1;
-
- /*
- * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if
- * we're a guest. Skip the hyp-vectors work.
- */
- if (!hyp_vecs_start) {
- __this_cpu_write(bp_hardening_data.fn, fn);
- return;
- }
-
- raw_spin_lock(&bp_lock);
- for_each_possible_cpu(cpu) {
- if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
- slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
- break;
- }
- }
-
- if (slot == -1) {
- slot = atomic_inc_return(&arm64_el2_vector_last_slot);
- BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
- __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
- }
-
- __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
- __this_cpu_write(bp_hardening_data.fn, fn);
- raw_spin_unlock(&bp_lock);
-}
-#else
-static void install_bp_hardening_cb(bp_hardening_cb_t fn,
- const char *hyp_vecs_start,
- const char *hyp_vecs_end)
-{
- __this_cpu_write(bp_hardening_data.fn, fn);
-}
-#endif /* CONFIG_KVM_INDIRECT_VECTORS */
-
-#include <linux/arm-smccc.h>
-
-static void __maybe_unused call_smc_arch_workaround_1(void)
-{
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
-}
-
-static void call_hvc_arch_workaround_1(void)
-{
- arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
-}
-
-static void qcom_link_stack_sanitization(void)
-{
- u64 tmp;
-
- asm volatile("mov %0, x30 \n"
- ".rept 16 \n"
- "bl . + 4 \n"
- ".endr \n"
- "mov x30, %0 \n"
- : "=&r" (tmp));
-}
-
-static bool __nospectre_v2;
-static int __init parse_nospectre_v2(char *str)
-{
- __nospectre_v2 = true;
- return 0;
-}
-early_param("nospectre_v2", parse_nospectre_v2);
-
-/*
- * -1: No workaround
- * 0: No workaround required
- * 1: Workaround installed
- */
-static int detect_harden_bp_fw(void)
-{
- bp_hardening_cb_t cb;
- void *smccc_start, *smccc_end;
- struct arm_smccc_res res;
- u32 midr = read_cpuid_id();
-
- arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_1, &res);
-
- switch ((int)res.a0) {
- case 1:
- /* Firmware says we're just fine */
- return 0;
- case 0:
- break;
- default:
- return -1;
- }
-
- switch (arm_smccc_1_1_get_conduit()) {
- case SMCCC_CONDUIT_HVC:
- cb = call_hvc_arch_workaround_1;
- /* This is a guest, no need to patch KVM vectors */
- smccc_start = NULL;
- smccc_end = NULL;
- break;
-
-#if IS_ENABLED(CONFIG_KVM)
- case SMCCC_CONDUIT_SMC:
- cb = call_smc_arch_workaround_1;
- smccc_start = __smccc_workaround_1_smc;
- smccc_end = __smccc_workaround_1_smc +
- __SMCCC_WORKAROUND_1_SMC_SZ;
- break;
-#endif
-
- default:
- return -1;
- }
-
- if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) ||
- ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1))
- cb = qcom_link_stack_sanitization;
-
- if (IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR))
- install_bp_hardening_cb(cb, smccc_start, smccc_end);
-
- return 1;
-}
-
-DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
-
-int ssbd_state __read_mostly = ARM64_SSBD_KERNEL;
-static bool __ssb_safe = true;
-
-static const struct ssbd_options {
- const char *str;
- int state;
-} ssbd_options[] = {
- { "force-on", ARM64_SSBD_FORCE_ENABLE, },
- { "force-off", ARM64_SSBD_FORCE_DISABLE, },
- { "kernel", ARM64_SSBD_KERNEL, },
-};
-
-static int __init ssbd_cfg(char *buf)
-{
- int i;
-
- if (!buf || !buf[0])
- return -EINVAL;
-
- for (i = 0; i < ARRAY_SIZE(ssbd_options); i++) {
- int len = strlen(ssbd_options[i].str);
-
- if (strncmp(buf, ssbd_options[i].str, len))
- continue;
-
- ssbd_state = ssbd_options[i].state;
- return 0;
- }
-
- return -EINVAL;
-}
-early_param("ssbd", ssbd_cfg);
-
-void __init arm64_update_smccc_conduit(struct alt_instr *alt,
- __le32 *origptr, __le32 *updptr,
- int nr_inst)
-{
- u32 insn;
-
- BUG_ON(nr_inst != 1);
-
- switch (arm_smccc_1_1_get_conduit()) {
- case SMCCC_CONDUIT_HVC:
- insn = aarch64_insn_get_hvc_value();
- break;
- case SMCCC_CONDUIT_SMC:
- insn = aarch64_insn_get_smc_value();
- break;
- default:
- return;
- }
-
- *updptr = cpu_to_le32(insn);
-}
-
-void __init arm64_enable_wa2_handling(struct alt_instr *alt,
- __le32 *origptr, __le32 *updptr,
- int nr_inst)
-{
- BUG_ON(nr_inst != 1);
- /*
- * Only allow mitigation on EL1 entry/exit and guest
- * ARCH_WORKAROUND_2 handling if the SSBD state allows it to
- * be flipped.
- */
- if (arm64_get_ssbd_state() == ARM64_SSBD_KERNEL)
- *updptr = cpu_to_le32(aarch64_insn_gen_nop());
-}
-
-void arm64_set_ssbd_mitigation(bool state)
-{
- int conduit;
-
- if (!IS_ENABLED(CONFIG_ARM64_SSBD)) {
- pr_info_once("SSBD disabled by kernel configuration\n");
- return;
- }
-
- if (this_cpu_has_cap(ARM64_SSBS)) {
- if (state)
- asm volatile(SET_PSTATE_SSBS(0));
- else
- asm volatile(SET_PSTATE_SSBS(1));
- return;
- }
-
- conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, state,
- NULL);
-
- WARN_ON_ONCE(conduit == SMCCC_CONDUIT_NONE);
-}
-
-static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
- int scope)
-{
- struct arm_smccc_res res;
- bool required = true;
- s32 val;
- bool this_cpu_safe = false;
- int conduit;
-
- WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
-
- if (cpu_mitigations_off())
- ssbd_state = ARM64_SSBD_FORCE_DISABLE;
-
- /* delay setting __ssb_safe until we get a firmware response */
- if (is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list))
- this_cpu_safe = true;
-
- if (this_cpu_has_cap(ARM64_SSBS)) {
- if (!this_cpu_safe)
- __ssb_safe = false;
- required = false;
- goto out_printmsg;
- }
-
- conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_2, &res);
-
- if (conduit == SMCCC_CONDUIT_NONE) {
- ssbd_state = ARM64_SSBD_UNKNOWN;
- if (!this_cpu_safe)
- __ssb_safe = false;
- return false;
- }
-
- val = (s32)res.a0;
-
- switch (val) {
- case SMCCC_RET_NOT_SUPPORTED:
- ssbd_state = ARM64_SSBD_UNKNOWN;
- if (!this_cpu_safe)
- __ssb_safe = false;
- return false;
-
- /* machines with mixed mitigation requirements must not return this */
- case SMCCC_RET_NOT_REQUIRED:
- pr_info_once("%s mitigation not required\n", entry->desc);
- ssbd_state = ARM64_SSBD_MITIGATED;
- return false;
-
- case SMCCC_RET_SUCCESS:
- __ssb_safe = false;
- required = true;
- break;
-
- case 1: /* Mitigation not required on this CPU */
- required = false;
- break;
-
- default:
- WARN_ON(1);
- if (!this_cpu_safe)
- __ssb_safe = false;
- return false;
- }
-
- switch (ssbd_state) {
- case ARM64_SSBD_FORCE_DISABLE:
- arm64_set_ssbd_mitigation(false);
- required = false;
- break;
-
- case ARM64_SSBD_KERNEL:
- if (required) {
- __this_cpu_write(arm64_ssbd_callback_required, 1);
- arm64_set_ssbd_mitigation(true);
- }
- break;
-
- case ARM64_SSBD_FORCE_ENABLE:
- arm64_set_ssbd_mitigation(true);
- required = true;
- break;
-
- default:
- WARN_ON(1);
- break;
- }
-
-out_printmsg:
- switch (ssbd_state) {
- case ARM64_SSBD_FORCE_DISABLE:
- pr_info_once("%s disabled from command-line\n", entry->desc);
- break;
-
- case ARM64_SSBD_FORCE_ENABLE:
- pr_info_once("%s forced from command-line\n", entry->desc);
- break;
- }
-
- return required;
-}
-
-/* known invulnerable cores */
-static const struct midr_range arm64_ssb_cpus[] = {
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
- MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
- MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
- MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
- {},
-};
-
#ifdef CONFIG_ARM64_ERRATUM_1463225
DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
@@ -519,83 +160,6 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused)
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \
CAP_MIDR_RANGE_LIST(midr_list)
-/* Track overall mitigation state. We are only mitigated if all cores are ok */
-static bool __hardenbp_enab = true;
-static bool __spectrev2_safe = true;
-
-int get_spectre_v2_workaround_state(void)
-{
- if (__spectrev2_safe)
- return ARM64_BP_HARDEN_NOT_REQUIRED;
-
- if (!__hardenbp_enab)
- return ARM64_BP_HARDEN_UNKNOWN;
-
- return ARM64_BP_HARDEN_WA_NEEDED;
-}
-
-/*
- * List of CPUs that do not need any Spectre-v2 mitigation at all.
- */
-static const struct midr_range spectre_v2_safe_list[] = {
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
- MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
- MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
- MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
- MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
- MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
- { /* sentinel */ }
-};
-
-/*
- * Track overall bp hardening for all heterogeneous cores in the machine.
- * We are only considered "safe" if all booted cores are known safe.
- */
-static bool __maybe_unused
-check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
-{
- int need_wa;
-
- WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
-
- /* If the CPU has CSV2 set, we're safe */
- if (cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64PFR0_EL1),
- ID_AA64PFR0_CSV2_SHIFT))
- return false;
-
- /* Alternatively, we have a list of unaffected CPUs */
- if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list))
- return false;
-
- /* Fallback to firmware detection */
- need_wa = detect_harden_bp_fw();
- if (!need_wa)
- return false;
-
- __spectrev2_safe = false;
-
- if (!IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) {
- pr_warn_once("spectrev2 mitigation disabled by kernel configuration\n");
- __hardenbp_enab = false;
- return false;
- }
-
- /* forced off */
- if (__nospectre_v2 || cpu_mitigations_off()) {
- pr_info_once("spectrev2 mitigation disabled by command line option\n");
- __hardenbp_enab = false;
- return false;
- }
-
- if (need_wa < 0) {
- pr_warn_once("ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware\n");
- __hardenbp_enab = false;
- }
-
- return (need_wa > 0);
-}
-
static const __maybe_unused struct midr_range tx2_family_cpus[] = {
MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
@@ -887,9 +451,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
},
#endif
{
- .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+ .desc = "Spectre-v2",
+ .capability = ARM64_SPECTRE_V2,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
- .matches = check_branch_predictor,
+ .matches = has_spectre_v2,
+ .cpu_enable = spectre_v2_enable_mitigation,
},
#ifdef CONFIG_RANDOMIZE_BASE
{
@@ -899,19 +465,23 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
},
#endif
{
- .desc = "Speculative Store Bypass Disable",
- .capability = ARM64_SSBD,
+ .desc = "Spectre-v4",
+ .capability = ARM64_SPECTRE_V4,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
- .matches = has_ssbd_mitigation,
- .midr_range_list = arm64_ssb_cpus,
+ .matches = has_spectre_v4,
+ .cpu_enable = spectre_v4_enable_mitigation,
},
#ifdef CONFIG_ARM64_ERRATUM_1418040
{
.desc = "ARM erratum 1418040",
.capability = ARM64_WORKAROUND_1418040,
ERRATA_MIDR_RANGE_LIST(erratum_1418040_list),
- .type = (ARM64_CPUCAP_SCOPE_LOCAL_CPU |
- ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU),
+ /*
+ * We need to allow affected CPUs to come in late, but
+ * also need the non-affected CPUs to be able to come
+ * in at any point in time. Wonderful.
+ */
+ .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
},
#endif
#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT
@@ -956,40 +526,3 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
{
}
};
-
-ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "Mitigation: __user pointer sanitization\n");
-}
-
-ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- switch (get_spectre_v2_workaround_state()) {
- case ARM64_BP_HARDEN_NOT_REQUIRED:
- return sprintf(buf, "Not affected\n");
- case ARM64_BP_HARDEN_WA_NEEDED:
- return sprintf(buf, "Mitigation: Branch predictor hardening\n");
- case ARM64_BP_HARDEN_UNKNOWN:
- default:
- return sprintf(buf, "Vulnerable\n");
- }
-}
-
-ssize_t cpu_show_spec_store_bypass(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- if (__ssb_safe)
- return sprintf(buf, "Not affected\n");
-
- switch (ssbd_state) {
- case ARM64_SSBD_KERNEL:
- case ARM64_SSBD_FORCE_ENABLE:
- if (IS_ENABLED(CONFIG_ARM64_SSBD))
- return sprintf(buf,
- "Mitigation: Speculative Store Bypass disabled via prctl\n");
- }
-
- return sprintf(buf, "Vulnerable\n");
-}
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6424584be01e..dcc165b3fc04 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -75,6 +75,7 @@
#include <asm/cpu_ops.h>
#include <asm/fpsimd.h>
#include <asm/mmu_context.h>
+#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/traps.h>
@@ -197,9 +198,9 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FCMA_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_API_SHIFT, 4, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_API_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_APA_SHIFT, 4, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_APA_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_DPB_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -227,7 +228,9 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI),
+ ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MTE_SHIFT, 4, ID_AA64PFR1_MTE_NI),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI),
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0),
ARM64_FTR_END,
@@ -487,7 +490,7 @@ static const struct arm64_ftr_bits ftr_id_pfr1[] = {
};
static const struct arm64_ftr_bits ftr_id_pfr2[] = {
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_CSV3_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -1111,6 +1114,7 @@ u64 read_sanitised_ftr_reg(u32 id)
return 0;
return regp->sys_val;
}
+EXPORT_SYMBOL_GPL(read_sanitised_ftr_reg);
#define read_sysreg_case(r) \
case r: return read_sysreg_s(r)
@@ -1443,6 +1447,7 @@ static inline void __cpu_enable_hw_dbm(void)
write_sysreg(tcr, tcr_el1);
isb();
+ local_flush_tlb_all();
}
static bool cpu_has_broken_dbm(void)
@@ -1583,48 +1588,6 @@ static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused)
WARN_ON(val & (7 << 27 | 7 << 21));
}
-#ifdef CONFIG_ARM64_SSBD
-static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr)
-{
- if (user_mode(regs))
- return 1;
-
- if (instr & BIT(PSTATE_Imm_shift))
- regs->pstate |= PSR_SSBS_BIT;
- else
- regs->pstate &= ~PSR_SSBS_BIT;
-
- arm64_skip_faulting_instruction(regs, 4);
- return 0;
-}
-
-static struct undef_hook ssbs_emulation_hook = {
- .instr_mask = ~(1U << PSTATE_Imm_shift),
- .instr_val = 0xd500401f | PSTATE_SSBS,
- .fn = ssbs_emulation_handler,
-};
-
-static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused)
-{
- static bool undef_hook_registered = false;
- static DEFINE_RAW_SPINLOCK(hook_lock);
-
- raw_spin_lock(&hook_lock);
- if (!undef_hook_registered) {
- register_undef_hook(&ssbs_emulation_hook);
- undef_hook_registered = true;
- }
- raw_spin_unlock(&hook_lock);
-
- if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
- sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
- arm64_set_ssbd_mitigation(false);
- } else {
- arm64_set_ssbd_mitigation(true);
- }
-}
-#endif /* CONFIG_ARM64_SSBD */
-
#ifdef CONFIG_ARM64_PAN
static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
{
@@ -1648,11 +1611,37 @@ static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused)
#endif /* CONFIG_ARM64_RAS_EXTN */
#ifdef CONFIG_ARM64_PTR_AUTH
-static bool has_address_auth(const struct arm64_cpu_capabilities *entry,
- int __unused)
+static bool has_address_auth_cpucap(const struct arm64_cpu_capabilities *entry, int scope)
{
- return __system_matches_cap(ARM64_HAS_ADDRESS_AUTH_ARCH) ||
- __system_matches_cap(ARM64_HAS_ADDRESS_AUTH_IMP_DEF);
+ int boot_val, sec_val;
+
+ /* We don't expect to be called with SCOPE_SYSTEM */
+ WARN_ON(scope == SCOPE_SYSTEM);
+ /*
+ * The ptr-auth feature levels are not intercompatible with lower
+ * levels. Hence we must match ptr-auth feature level of the secondary
+ * CPUs with that of the boot CPU. The level of boot cpu is fetched
+ * from the sanitised register whereas direct register read is done for
+ * the secondary CPUs.
+ * The sanitised feature state is guaranteed to match that of the
+ * boot CPU as a mismatched secondary CPU is parked before it gets
+ * a chance to update the state, with the capability.
+ */
+ boot_val = cpuid_feature_extract_field(read_sanitised_ftr_reg(entry->sys_reg),
+ entry->field_pos, entry->sign);
+ if (scope & SCOPE_BOOT_CPU)
+ return boot_val >= entry->min_field_value;
+ /* Now check for the secondary CPUs with SCOPE_LOCAL_CPU scope */
+ sec_val = cpuid_feature_extract_field(__read_sysreg_by_encoding(entry->sys_reg),
+ entry->field_pos, entry->sign);
+ return sec_val == boot_val;
+}
+
+static bool has_address_auth_metacap(const struct arm64_cpu_capabilities *entry,
+ int scope)
+{
+ return has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH], scope) ||
+ has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_IMP_DEF], scope);
}
static bool has_generic_auth(const struct arm64_cpu_capabilities *entry,
@@ -1702,6 +1691,22 @@ static void bti_enable(const struct arm64_cpu_capabilities *__unused)
}
#endif /* CONFIG_ARM64_BTI */
+#ifdef CONFIG_ARM64_MTE
+static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
+{
+ static bool cleared_zero_page = false;
+
+ /*
+ * Clear the tags in the zero page. This needs to be done via the
+ * linear map which has the Tagged attribute.
+ */
+ if (!cleared_zero_page) {
+ cleared_zero_page = true;
+ mte_clear_page_tags(lm_alias(empty_zero_page));
+ }
+}
+#endif /* CONFIG_ARM64_MTE */
+
/* Internal helper functions to match cpu capability type */
static bool
cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
@@ -1976,19 +1981,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.field_pos = ID_AA64ISAR0_CRC32_SHIFT,
.min_field_value = 1,
},
-#ifdef CONFIG_ARM64_SSBD
{
.desc = "Speculative Store Bypassing Safe (SSBS)",
.capability = ARM64_SSBS,
- .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
.matches = has_cpuid_feature,
.sys_reg = SYS_ID_AA64PFR1_EL1,
.field_pos = ID_AA64PFR1_SSBS_SHIFT,
.sign = FTR_UNSIGNED,
.min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY,
- .cpu_enable = cpu_enable_ssbs,
},
-#endif
#ifdef CONFIG_ARM64_CNP
{
.desc = "Common not Private translations",
@@ -2021,7 +2023,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.sign = FTR_UNSIGNED,
.field_pos = ID_AA64ISAR1_APA_SHIFT,
.min_field_value = ID_AA64ISAR1_APA_ARCHITECTED,
- .matches = has_cpuid_feature,
+ .matches = has_address_auth_cpucap,
},
{
.desc = "Address authentication (IMP DEF algorithm)",
@@ -2031,12 +2033,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.sign = FTR_UNSIGNED,
.field_pos = ID_AA64ISAR1_API_SHIFT,
.min_field_value = ID_AA64ISAR1_API_IMP_DEF,
- .matches = has_cpuid_feature,
+ .matches = has_address_auth_cpucap,
},
{
.capability = ARM64_HAS_ADDRESS_AUTH,
.type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
- .matches = has_address_auth,
+ .matches = has_address_auth_metacap,
},
{
.desc = "Generic authentication (architected algorithm)",
@@ -2121,6 +2123,19 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.sign = FTR_UNSIGNED,
},
#endif
+#ifdef CONFIG_ARM64_MTE
+ {
+ .desc = "Memory Tagging Extension",
+ .capability = ARM64_MTE,
+ .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
+ .matches = has_cpuid_feature,
+ .sys_reg = SYS_ID_AA64PFR1_EL1,
+ .field_pos = ID_AA64PFR1_MTE_SHIFT,
+ .min_field_value = ID_AA64PFR1_MTE,
+ .sign = FTR_UNSIGNED,
+ .cpu_enable = cpu_enable_mte,
+ },
+#endif /* CONFIG_ARM64_MTE */
{},
};
@@ -2237,6 +2252,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA),
HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG),
#endif
+#ifdef CONFIG_ARM64_MTE
+ HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_MTE_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_MTE, CAP_HWCAP, KERNEL_HWCAP_MTE),
+#endif /* CONFIG_ARM64_MTE */
{},
};
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index d0076c2159e6..6a7bb3729d60 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -43,94 +43,93 @@ static const char *icache_policy_str[] = {
unsigned long __icache_flags;
static const char *const hwcap_str[] = {
- "fp",
- "asimd",
- "evtstrm",
- "aes",
- "pmull",
- "sha1",
- "sha2",
- "crc32",
- "atomics",
- "fphp",
- "asimdhp",
- "cpuid",
- "asimdrdm",
- "jscvt",
- "fcma",
- "lrcpc",
- "dcpop",
- "sha3",
- "sm3",
- "sm4",
- "asimddp",
- "sha512",
- "sve",
- "asimdfhm",
- "dit",
- "uscat",
- "ilrcpc",
- "flagm",
- "ssbs",
- "sb",
- "paca",
- "pacg",
- "dcpodp",
- "sve2",
- "sveaes",
- "svepmull",
- "svebitperm",
- "svesha3",
- "svesm4",
- "flagm2",
- "frint",
- "svei8mm",
- "svef32mm",
- "svef64mm",
- "svebf16",
- "i8mm",
- "bf16",
- "dgh",
- "rng",
- "bti",
- /* reserved for "mte" */
- NULL
+ [KERNEL_HWCAP_FP] = "fp",
+ [KERNEL_HWCAP_ASIMD] = "asimd",
+ [KERNEL_HWCAP_EVTSTRM] = "evtstrm",
+ [KERNEL_HWCAP_AES] = "aes",
+ [KERNEL_HWCAP_PMULL] = "pmull",
+ [KERNEL_HWCAP_SHA1] = "sha1",
+ [KERNEL_HWCAP_SHA2] = "sha2",
+ [KERNEL_HWCAP_CRC32] = "crc32",
+ [KERNEL_HWCAP_ATOMICS] = "atomics",
+ [KERNEL_HWCAP_FPHP] = "fphp",
+ [KERNEL_HWCAP_ASIMDHP] = "asimdhp",
+ [KERNEL_HWCAP_CPUID] = "cpuid",
+ [KERNEL_HWCAP_ASIMDRDM] = "asimdrdm",
+ [KERNEL_HWCAP_JSCVT] = "jscvt",
+ [KERNEL_HWCAP_FCMA] = "fcma",
+ [KERNEL_HWCAP_LRCPC] = "lrcpc",
+ [KERNEL_HWCAP_DCPOP] = "dcpop",
+ [KERNEL_HWCAP_SHA3] = "sha3",
+ [KERNEL_HWCAP_SM3] = "sm3",
+ [KERNEL_HWCAP_SM4] = "sm4",
+ [KERNEL_HWCAP_ASIMDDP] = "asimddp",
+ [KERNEL_HWCAP_SHA512] = "sha512",
+ [KERNEL_HWCAP_SVE] = "sve",
+ [KERNEL_HWCAP_ASIMDFHM] = "asimdfhm",
+ [KERNEL_HWCAP_DIT] = "dit",
+ [KERNEL_HWCAP_USCAT] = "uscat",
+ [KERNEL_HWCAP_ILRCPC] = "ilrcpc",
+ [KERNEL_HWCAP_FLAGM] = "flagm",
+ [KERNEL_HWCAP_SSBS] = "ssbs",
+ [KERNEL_HWCAP_SB] = "sb",
+ [KERNEL_HWCAP_PACA] = "paca",
+ [KERNEL_HWCAP_PACG] = "pacg",
+ [KERNEL_HWCAP_DCPODP] = "dcpodp",
+ [KERNEL_HWCAP_SVE2] = "sve2",
+ [KERNEL_HWCAP_SVEAES] = "sveaes",
+ [KERNEL_HWCAP_SVEPMULL] = "svepmull",
+ [KERNEL_HWCAP_SVEBITPERM] = "svebitperm",
+ [KERNEL_HWCAP_SVESHA3] = "svesha3",
+ [KERNEL_HWCAP_SVESM4] = "svesm4",
+ [KERNEL_HWCAP_FLAGM2] = "flagm2",
+ [KERNEL_HWCAP_FRINT] = "frint",
+ [KERNEL_HWCAP_SVEI8MM] = "svei8mm",
+ [KERNEL_HWCAP_SVEF32MM] = "svef32mm",
+ [KERNEL_HWCAP_SVEF64MM] = "svef64mm",
+ [KERNEL_HWCAP_SVEBF16] = "svebf16",
+ [KERNEL_HWCAP_I8MM] = "i8mm",
+ [KERNEL_HWCAP_BF16] = "bf16",
+ [KERNEL_HWCAP_DGH] = "dgh",
+ [KERNEL_HWCAP_RNG] = "rng",
+ [KERNEL_HWCAP_BTI] = "bti",
+ [KERNEL_HWCAP_MTE] = "mte",
};
#ifdef CONFIG_COMPAT
+#define COMPAT_KERNEL_HWCAP(x) const_ilog2(COMPAT_HWCAP_ ## x)
static const char *const compat_hwcap_str[] = {
- "swp",
- "half",
- "thumb",
- "26bit",
- "fastmult",
- "fpa",
- "vfp",
- "edsp",
- "java",
- "iwmmxt",
- "crunch",
- "thumbee",
- "neon",
- "vfpv3",
- "vfpv3d16",
- "tls",
- "vfpv4",
- "idiva",
- "idivt",
- "vfpd32",
- "lpae",
- "evtstrm",
- NULL
+ [COMPAT_KERNEL_HWCAP(SWP)] = "swp",
+ [COMPAT_KERNEL_HWCAP(HALF)] = "half",
+ [COMPAT_KERNEL_HWCAP(THUMB)] = "thumb",
+ [COMPAT_KERNEL_HWCAP(26BIT)] = NULL, /* Not possible on arm64 */
+ [COMPAT_KERNEL_HWCAP(FAST_MULT)] = "fastmult",
+ [COMPAT_KERNEL_HWCAP(FPA)] = NULL, /* Not possible on arm64 */
+ [COMPAT_KERNEL_HWCAP(VFP)] = "vfp",
+ [COMPAT_KERNEL_HWCAP(EDSP)] = "edsp",
+ [COMPAT_KERNEL_HWCAP(JAVA)] = NULL, /* Not possible on arm64 */
+ [COMPAT_KERNEL_HWCAP(IWMMXT)] = NULL, /* Not possible on arm64 */
+ [COMPAT_KERNEL_HWCAP(CRUNCH)] = NULL, /* Not possible on arm64 */
+ [COMPAT_KERNEL_HWCAP(THUMBEE)] = NULL, /* Not possible on arm64 */
+ [COMPAT_KERNEL_HWCAP(NEON)] = "neon",
+ [COMPAT_KERNEL_HWCAP(VFPv3)] = "vfpv3",
+ [COMPAT_KERNEL_HWCAP(VFPV3D16)] = NULL, /* Not possible on arm64 */
+ [COMPAT_KERNEL_HWCAP(TLS)] = "tls",
+ [COMPAT_KERNEL_HWCAP(VFPv4)] = "vfpv4",
+ [COMPAT_KERNEL_HWCAP(IDIVA)] = "idiva",
+ [COMPAT_KERNEL_HWCAP(IDIVT)] = "idivt",
+ [COMPAT_KERNEL_HWCAP(VFPD32)] = NULL, /* Not possible on arm64 */
+ [COMPAT_KERNEL_HWCAP(LPAE)] = "lpae",
+ [COMPAT_KERNEL_HWCAP(EVTSTRM)] = "evtstrm",
};
+#define COMPAT_KERNEL_HWCAP2(x) const_ilog2(COMPAT_HWCAP2_ ## x)
static const char *const compat_hwcap2_str[] = {
- "aes",
- "pmull",
- "sha1",
- "sha2",
- "crc32",
- NULL
+ [COMPAT_KERNEL_HWCAP2(AES)] = "aes",
+ [COMPAT_KERNEL_HWCAP2(PMULL)] = "pmull",
+ [COMPAT_KERNEL_HWCAP2(SHA1)] = "sha1",
+ [COMPAT_KERNEL_HWCAP2(SHA2)] = "sha2",
+ [COMPAT_KERNEL_HWCAP2(CRC32)] = "crc32",
};
#endif /* CONFIG_COMPAT */
@@ -166,16 +165,25 @@ static int c_show(struct seq_file *m, void *v)
seq_puts(m, "Features\t:");
if (compat) {
#ifdef CONFIG_COMPAT
- for (j = 0; compat_hwcap_str[j]; j++)
- if (compat_elf_hwcap & (1 << j))
+ for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) {
+ if (compat_elf_hwcap & (1 << j)) {
+ /*
+ * Warn once if any feature should not
+ * have been present on arm64 platform.
+ */
+ if (WARN_ON_ONCE(!compat_hwcap_str[j]))
+ continue;
+
seq_printf(m, " %s", compat_hwcap_str[j]);
+ }
+ }
- for (j = 0; compat_hwcap2_str[j]; j++)
+ for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++)
if (compat_elf_hwcap2 & (1 << j))
seq_printf(m, " %s", compat_hwcap2_str[j]);
#endif /* CONFIG_COMPAT */
} else {
- for (j = 0; hwcap_str[j]; j++)
+ for (j = 0; j < ARRAY_SIZE(hwcap_str); j++)
if (cpu_have_feature(j))
seq_printf(m, " %s", hwcap_str[j]);
}
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 7310a4f7f993..fa76151de6ff 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -384,7 +384,7 @@ void __init debug_traps_init(void)
hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP,
TRAP_TRACE, "single-step handler");
hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP,
- TRAP_BRKPT, "ptrace BRK handler");
+ TRAP_BRKPT, "BRK handler");
}
/* Re-enable single step for syscall restarting. */
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index d3be9dbf5490..43d4c329775f 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -66,6 +66,13 @@ static void notrace el1_dbg(struct pt_regs *regs, unsigned long esr)
}
NOKPROBE_SYMBOL(el1_dbg);
+static void notrace el1_fpac(struct pt_regs *regs, unsigned long esr)
+{
+ local_daif_inherit(regs);
+ do_ptrauth_fault(regs, esr);
+}
+NOKPROBE_SYMBOL(el1_fpac);
+
asmlinkage void notrace el1_sync_handler(struct pt_regs *regs)
{
unsigned long esr = read_sysreg(esr_el1);
@@ -92,6 +99,9 @@ asmlinkage void notrace el1_sync_handler(struct pt_regs *regs)
case ESR_ELx_EC_BRK64:
el1_dbg(regs, esr);
break;
+ case ESR_ELx_EC_FPAC:
+ el1_fpac(regs, esr);
+ break;
default:
el1_inv(regs, esr);
}
@@ -227,6 +237,14 @@ static void notrace el0_svc(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(el0_svc);
+static void notrace el0_fpac(struct pt_regs *regs, unsigned long esr)
+{
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ do_ptrauth_fault(regs, esr);
+}
+NOKPROBE_SYMBOL(el0_fpac);
+
asmlinkage void notrace el0_sync_handler(struct pt_regs *regs)
{
unsigned long esr = read_sysreg(esr_el1);
@@ -272,6 +290,9 @@ asmlinkage void notrace el0_sync_handler(struct pt_regs *regs)
case ESR_ELx_EC_BRK64:
el0_dbg(regs, esr);
break;
+ case ESR_ELx_EC_FPAC:
+ el0_fpac(regs, esr);
+ break;
default:
el0_inv(regs, esr);
}
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
index f880dd63ddc3..2ca395c25448 100644
--- a/arch/arm64/kernel/entry-fpsimd.S
+++ b/arch/arm64/kernel/entry-fpsimd.S
@@ -32,6 +32,7 @@ SYM_FUNC_START(fpsimd_load_state)
SYM_FUNC_END(fpsimd_load_state)
#ifdef CONFIG_ARM64_SVE
+
SYM_FUNC_START(sve_save_state)
sve_save 0, x1, 2
ret
@@ -46,4 +47,28 @@ SYM_FUNC_START(sve_get_vl)
_sve_rdvl 0, 1
ret
SYM_FUNC_END(sve_get_vl)
+
+/*
+ * Load SVE state from FPSIMD state.
+ *
+ * x0 = pointer to struct fpsimd_state
+ * x1 = VQ - 1
+ *
+ * Each SVE vector will be loaded with the first 128-bits taken from FPSIMD
+ * and the rest zeroed. All the other SVE registers will be zeroed.
+ */
+SYM_FUNC_START(sve_load_from_fpsimd_state)
+ sve_load_vq x1, x2, x3
+ fpsimd_restore x0, 8
+ _for n, 0, 15, _sve_pfalse \n
+ _sve_wrffr 0
+ ret
+SYM_FUNC_END(sve_load_from_fpsimd_state)
+
+/* Zero all SVE registers but the first 128-bits of each vector */
+SYM_FUNC_START(sve_flush_live)
+ sve_flush
+ ret
+SYM_FUNC_END(sve_flush_live)
+
#endif /* CONFIG_ARM64_SVE */
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 55af8b504b65..f30007dff35f 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -132,9 +132,8 @@ alternative_else_nop_endif
* them if required.
*/
.macro apply_ssbd, state, tmp1, tmp2
-#ifdef CONFIG_ARM64_SSBD
-alternative_cb arm64_enable_wa2_handling
- b .L__asm_ssbd_skip\@
+alternative_cb spectre_v4_patch_fw_mitigation_enable
+ b .L__asm_ssbd_skip\@ // Patched to NOP
alternative_cb_end
ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1
cbz \tmp2, .L__asm_ssbd_skip\@
@@ -142,10 +141,35 @@ alternative_cb_end
tbnz \tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@
mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2
mov w1, #\state
-alternative_cb arm64_update_smccc_conduit
+alternative_cb spectre_v4_patch_fw_mitigation_conduit
nop // Patched to SMC/HVC #0
alternative_cb_end
.L__asm_ssbd_skip\@:
+ .endm
+
+ /* Check for MTE asynchronous tag check faults */
+ .macro check_mte_async_tcf, flgs, tmp
+#ifdef CONFIG_ARM64_MTE
+alternative_if_not ARM64_MTE
+ b 1f
+alternative_else_nop_endif
+ mrs_s \tmp, SYS_TFSRE0_EL1
+ tbz \tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f
+ /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */
+ orr \flgs, \flgs, #_TIF_MTE_ASYNC_FAULT
+ str \flgs, [tsk, #TSK_TI_FLAGS]
+ msr_s SYS_TFSRE0_EL1, xzr
+1:
+#endif
+ .endm
+
+ /* Clear the MTE asynchronous tag check faults */
+ .macro clear_mte_async_tcf
+#ifdef CONFIG_ARM64_MTE
+alternative_if ARM64_MTE
+ dsb ish
+ msr_s SYS_TFSRE0_EL1, xzr
+alternative_else_nop_endif
#endif
.endm
@@ -182,6 +206,8 @@ alternative_cb_end
ldr x19, [tsk, #TSK_TI_FLAGS]
disable_step_tsk x19, x20
+ /* Check for asynchronous tag check faults in user space */
+ check_mte_async_tcf x19, x22
apply_ssbd 1, x22, x23
ptrauth_keys_install_kernel tsk, x20, x22, x23
@@ -233,6 +259,13 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
str x20, [sp, #S_PMR_SAVE]
alternative_else_nop_endif
+ /* Re-enable tag checking (TCO set on exception entry) */
+#ifdef CONFIG_ARM64_MTE
+alternative_if ARM64_MTE
+ SET_PSTATE_TCO(0)
+alternative_else_nop_endif
+#endif
+
/*
* Registers that may be useful after this macro is invoked:
*
@@ -697,11 +730,9 @@ el0_irq_naked:
bl trace_hardirqs_off
#endif
-#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
tbz x22, #55, 1f
bl do_el0_irq_bp_hardening
1:
-#endif
irq_handler
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -744,6 +775,8 @@ SYM_CODE_START_LOCAL(ret_to_user)
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
finish_ret_to_user:
+ /* Ignore asynchronous tag check faults in the uaccess routines */
+ clear_mte_async_tcf
enable_step_tsk x1, x2
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
bl stackleak_erase
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 55c8f3ec6705..062b21f30f94 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -32,9 +32,11 @@
#include <linux/swab.h>
#include <asm/esr.h>
+#include <asm/exception.h>
#include <asm/fpsimd.h>
#include <asm/cpufeature.h>
#include <asm/cputype.h>
+#include <asm/neon.h>
#include <asm/processor.h>
#include <asm/simd.h>
#include <asm/sigcontext.h>
@@ -312,7 +314,7 @@ static void fpsimd_save(void)
* re-enter user with corrupt state.
* There's no way to recover, so kill it:
*/
- force_signal_inject(SIGKILL, SI_KERNEL, 0);
+ force_signal_inject(SIGKILL, SI_KERNEL, 0, 0);
return;
}
@@ -676,7 +678,7 @@ int sve_set_current_vl(unsigned long arg)
vl = arg & PR_SVE_VL_LEN_MASK;
flags = arg & ~vl;
- if (!system_supports_sve())
+ if (!system_supports_sve() || is_compat_task())
return -EINVAL;
ret = sve_set_vector_length(current, vl, flags);
@@ -689,7 +691,7 @@ int sve_set_current_vl(unsigned long arg)
/* PR_SVE_GET_VL */
int sve_get_current_vl(void)
{
- if (!system_supports_sve())
+ if (!system_supports_sve() || is_compat_task())
return -EINVAL;
return sve_prctl_status(0);
@@ -928,7 +930,7 @@ void fpsimd_release_task(struct task_struct *dead_task)
* the SVE access trap will be disabled the next time this task
* reaches ret_to_user.
*
- * TIF_SVE should be clear on entry: otherwise, task_fpsimd_load()
+ * TIF_SVE should be clear on entry: otherwise, fpsimd_restore_current_state()
* would have disabled the SVE access trap for userspace during
* ret_to_user, making an SVE access trap impossible in that case.
*/
@@ -936,7 +938,7 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs)
{
/* Even if we chose not to use SVE, the hardware could still trap: */
if (unlikely(!system_supports_sve()) || WARN_ON(is_compat_task())) {
- force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc);
+ force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
return;
}
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 037421c66b14..d8d9caf02834 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -36,14 +36,10 @@
#include "efi-header.S"
-#define __PHYS_OFFSET (KERNEL_START - TEXT_OFFSET)
+#define __PHYS_OFFSET KERNEL_START
-#if (TEXT_OFFSET & 0xfff) != 0
-#error TEXT_OFFSET must be at least 4KB aligned
-#elif (PAGE_OFFSET & 0x1fffff) != 0
+#if (PAGE_OFFSET & 0x1fffff) != 0
#error PAGE_OFFSET must be at least 2MB aligned
-#elif TEXT_OFFSET > 0x1fffff
-#error TEXT_OFFSET must be less than 2MB
#endif
/*
@@ -55,7 +51,7 @@
* x0 = physical address to the FDT blob.
*
* This code is mostly position independent so you call this at
- * __pa(PAGE_OFFSET + TEXT_OFFSET).
+ * __pa(PAGE_OFFSET).
*
* Note that the callee-saved registers are used for storing variables
* that are useful before the MMU is enabled. The allocations are described
@@ -77,7 +73,7 @@ _head:
b primary_entry // branch to kernel start, magic
.long 0 // reserved
#endif
- le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian
+ .quad 0 // Image load offset from start of RAM, little-endian
le64sym _kernel_size_le // Effective size of kernel image, little-endian
le64sym _kernel_flags_le // Informative flags, little-endian
.quad 0 // reserved
@@ -382,7 +378,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
* Map the kernel image (starting with PHYS_OFFSET).
*/
adrp x0, init_pg_dir
- mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text)
+ mov_q x5, KIMAGE_VADDR // compile time __va(_text)
add x5, x5, x23 // add KASLR displacement
mov x4, PTRS_PER_PGD
adrp x6, _end // runtime __pa(_end)
@@ -474,7 +470,7 @@ SYM_FUNC_END(__primary_switched)
.pushsection ".rodata", "a"
SYM_DATA_START(kimage_vaddr)
- .quad _text - TEXT_OFFSET
+ .quad _text
SYM_DATA_END(kimage_vaddr)
EXPORT_SYMBOL(kimage_vaddr)
.popsection
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 68e14152d6e9..42003774d261 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -21,7 +21,6 @@
#include <linux/sched.h>
#include <linux/suspend.h>
#include <linux/utsname.h>
-#include <linux/version.h>
#include <asm/barrier.h>
#include <asm/cacheflush.h>
@@ -31,6 +30,7 @@
#include <asm/kexec.h>
#include <asm/memory.h>
#include <asm/mmu_context.h>
+#include <asm/mte.h>
#include <asm/pgalloc.h>
#include <asm/pgtable-hwdef.h>
#include <asm/sections.h>
@@ -285,6 +285,117 @@ static int create_safe_exec_page(void *src_start, size_t length,
#define dcache_clean_range(start, end) __flush_dcache_area(start, (end - start))
+#ifdef CONFIG_ARM64_MTE
+
+static DEFINE_XARRAY(mte_pages);
+
+static int save_tags(struct page *page, unsigned long pfn)
+{
+ void *tag_storage, *ret;
+
+ tag_storage = mte_allocate_tag_storage();
+ if (!tag_storage)
+ return -ENOMEM;
+
+ mte_save_page_tags(page_address(page), tag_storage);
+
+ ret = xa_store(&mte_pages, pfn, tag_storage, GFP_KERNEL);
+ if (WARN(xa_is_err(ret), "Failed to store MTE tags")) {
+ mte_free_tag_storage(tag_storage);
+ return xa_err(ret);
+ } else if (WARN(ret, "swsusp: %s: Duplicate entry", __func__)) {
+ mte_free_tag_storage(ret);
+ }
+
+ return 0;
+}
+
+static void swsusp_mte_free_storage(void)
+{
+ XA_STATE(xa_state, &mte_pages, 0);
+ void *tags;
+
+ xa_lock(&mte_pages);
+ xas_for_each(&xa_state, tags, ULONG_MAX) {
+ mte_free_tag_storage(tags);
+ }
+ xa_unlock(&mte_pages);
+
+ xa_destroy(&mte_pages);
+}
+
+static int swsusp_mte_save_tags(void)
+{
+ struct zone *zone;
+ unsigned long pfn, max_zone_pfn;
+ int ret = 0;
+ int n = 0;
+
+ if (!system_supports_mte())
+ return 0;
+
+ for_each_populated_zone(zone) {
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+ struct page *page = pfn_to_online_page(pfn);
+
+ if (!page)
+ continue;
+
+ if (!test_bit(PG_mte_tagged, &page->flags))
+ continue;
+
+ ret = save_tags(page, pfn);
+ if (ret) {
+ swsusp_mte_free_storage();
+ goto out;
+ }
+
+ n++;
+ }
+ }
+ pr_info("Saved %d MTE pages\n", n);
+
+out:
+ return ret;
+}
+
+static void swsusp_mte_restore_tags(void)
+{
+ XA_STATE(xa_state, &mte_pages, 0);
+ int n = 0;
+ void *tags;
+
+ xa_lock(&mte_pages);
+ xas_for_each(&xa_state, tags, ULONG_MAX) {
+ unsigned long pfn = xa_state.xa_index;
+ struct page *page = pfn_to_online_page(pfn);
+
+ mte_restore_page_tags(page_address(page), tags);
+
+ mte_free_tag_storage(tags);
+ n++;
+ }
+ xa_unlock(&mte_pages);
+
+ pr_info("Restored %d MTE pages\n", n);
+
+ xa_destroy(&mte_pages);
+}
+
+#else /* CONFIG_ARM64_MTE */
+
+static int swsusp_mte_save_tags(void)
+{
+ return 0;
+}
+
+static void swsusp_mte_restore_tags(void)
+{
+}
+
+#endif /* CONFIG_ARM64_MTE */
+
int swsusp_arch_suspend(void)
{
int ret = 0;
@@ -302,6 +413,10 @@ int swsusp_arch_suspend(void)
/* make the crash dump kernel image visible/saveable */
crash_prepare_suspend();
+ ret = swsusp_mte_save_tags();
+ if (ret)
+ return ret;
+
sleep_cpu = smp_processor_id();
ret = swsusp_save();
} else {
@@ -315,6 +430,8 @@ int swsusp_arch_suspend(void)
dcache_clean_range(__hyp_text_start, __hyp_text_end);
}
+ swsusp_mte_restore_tags();
+
/* make the crash dump kernel image protected again */
crash_post_resume();
@@ -332,11 +449,7 @@ int swsusp_arch_suspend(void)
* mitigation off behind our back, let's set the state
* to what we expect it to be.
*/
- switch (arm64_get_ssbd_state()) {
- case ARM64_SSBD_FORCE_ENABLE:
- case ARM64_SSBD_KERNEL:
- arm64_set_ssbd_mitigation(true);
- }
+ spectre_v4_enable_mitigation(NULL);
}
local_daif_restore(flags);
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 8982b68289b7..61684a500914 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -61,16 +61,11 @@ __efistub__ctype = _ctype;
* memory mappings.
*/
-#define KVM_NVHE_ALIAS(sym) __kvm_nvhe_##sym = sym;
-
/* Alternative callbacks for init-time patching of nVHE hyp code. */
-KVM_NVHE_ALIAS(arm64_enable_wa2_handling);
KVM_NVHE_ALIAS(kvm_patch_vector_branch);
KVM_NVHE_ALIAS(kvm_update_va_mask);
/* Global kernel state accessed by nVHE hyp code. */
-KVM_NVHE_ALIAS(arm64_ssbd_callback_required);
-KVM_NVHE_ALIAS(kvm_host_data);
KVM_NVHE_ALIAS(kvm_vgic_global_state);
/* Kernel constant needed to compute idmap addresses. */
@@ -101,6 +96,8 @@ KVM_NVHE_ALIAS(vgic_v3_cpuif_trap);
/* Static key checked in pmr_sync(). */
#ifdef CONFIG_ARM64_PSEUDO_NMI
KVM_NVHE_ALIAS(gic_pmr_sync);
+/* Static key checked in GIC_PRIO_IRQOFF. */
+KVM_NVHE_ALIAS(gic_nonsecure_priorities);
#endif
/* EL2 exception handling */
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index c7d38c660372..7bc3ba897901 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -62,7 +62,6 @@
*/
#define HEAD_SYMBOLS \
DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \
- DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \
DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS);
#endif /* __ARM64_KERNEL_IMAGE_H */
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index a107375005bc..6c0de2f60ea9 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -60,16 +60,10 @@ bool __kprobes aarch64_insn_is_steppable_hint(u32 insn)
case AARCH64_INSN_HINT_XPACLRI:
case AARCH64_INSN_HINT_PACIA_1716:
case AARCH64_INSN_HINT_PACIB_1716:
- case AARCH64_INSN_HINT_AUTIA_1716:
- case AARCH64_INSN_HINT_AUTIB_1716:
case AARCH64_INSN_HINT_PACIAZ:
case AARCH64_INSN_HINT_PACIASP:
case AARCH64_INSN_HINT_PACIBZ:
case AARCH64_INSN_HINT_PACIBSP:
- case AARCH64_INSN_HINT_AUTIAZ:
- case AARCH64_INSN_HINT_AUTIASP:
- case AARCH64_INSN_HINT_AUTIBZ:
- case AARCH64_INSN_HINT_AUTIBSP:
case AARCH64_INSN_HINT_BTI:
case AARCH64_INSN_HINT_BTIC:
case AARCH64_INSN_HINT_BTIJ:
@@ -176,7 +170,7 @@ bool __kprobes aarch64_insn_uses_literal(u32 insn)
bool __kprobes aarch64_insn_is_branch(u32 insn)
{
- /* b, bl, cb*, tb*, b.cond, br, blr */
+ /* b, bl, cb*, tb*, ret*, b.cond, br*, blr* */
return aarch64_insn_is_b(insn) ||
aarch64_insn_is_bl(insn) ||
@@ -185,8 +179,11 @@ bool __kprobes aarch64_insn_is_branch(u32 insn)
aarch64_insn_is_tbz(insn) ||
aarch64_insn_is_tbnz(insn) ||
aarch64_insn_is_ret(insn) ||
+ aarch64_insn_is_ret_auth(insn) ||
aarch64_insn_is_br(insn) ||
+ aarch64_insn_is_br_auth(insn) ||
aarch64_insn_is_blr(insn) ||
+ aarch64_insn_is_blr_auth(insn) ||
aarch64_insn_is_bcond(insn);
}
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 04a327ccf84d..9cf2fb87584a 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -10,10 +10,10 @@
* Copyright (C) 2012 ARM Ltd.
*/
-#include <linux/kernel_stat.h>
#include <linux/irq.h>
#include <linux/memory.h>
#include <linux/smp.h>
+#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/irqchip.h>
#include <linux/kprobes.h>
@@ -22,20 +22,11 @@
#include <asm/daifflags.h>
#include <asm/vmap_stack.h>
-unsigned long irq_err_count;
-
/* Only access this in an NMI enter/exit */
DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts);
DEFINE_PER_CPU(unsigned long *, irq_stack_ptr);
-int arch_show_interrupts(struct seq_file *p, int prec)
-{
- show_ipi_list(p, prec);
- seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count);
- return 0;
-}
-
#ifdef CONFIG_VMAP_STACK
static void init_irq_stacks(void)
{
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index 361a1143e09e..5b0e67b93cdc 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -215,8 +215,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz)
phys_addr_t start, end;
nr_ranges = 1; /* for exclusion of crashkernel region */
- for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
- MEMBLOCK_NONE, &start, &end, NULL)
+ for_each_mem_range(i, &start, &end)
nr_ranges++;
cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL);
@@ -225,8 +224,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz)
cmem->max_nr_ranges = nr_ranges;
cmem->nr_ranges = 0;
- for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
- MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_mem_range(i, &start, &end) {
cmem->ranges[cmem->nr_ranges].start = start;
cmem->ranges[cmem->nr_ranges].end = end - 1;
cmem->nr_ranges++;
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
new file mode 100644
index 000000000000..52a0638ed967
--- /dev/null
+++ b/arch/arm64/kernel/mte.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 ARM Ltd.
+ */
+
+#include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/prctl.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/string.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/thread_info.h>
+#include <linux/uio.h>
+
+#include <asm/cpufeature.h>
+#include <asm/mte.h>
+#include <asm/ptrace.h>
+#include <asm/sysreg.h>
+
+static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
+{
+ pte_t old_pte = READ_ONCE(*ptep);
+
+ if (check_swap && is_swap_pte(old_pte)) {
+ swp_entry_t entry = pte_to_swp_entry(old_pte);
+
+ if (!non_swap_entry(entry) && mte_restore_tags(entry, page))
+ return;
+ }
+
+ mte_clear_page_tags(page_address(page));
+}
+
+void mte_sync_tags(pte_t *ptep, pte_t pte)
+{
+ struct page *page = pte_page(pte);
+ long i, nr_pages = compound_nr(page);
+ bool check_swap = nr_pages == 1;
+
+ /* if PG_mte_tagged is set, tags have already been initialised */
+ for (i = 0; i < nr_pages; i++, page++) {
+ if (!test_and_set_bit(PG_mte_tagged, &page->flags))
+ mte_sync_page_tags(page, ptep, check_swap);
+ }
+}
+
+int memcmp_pages(struct page *page1, struct page *page2)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = page_address(page1);
+ addr2 = page_address(page2);
+ ret = memcmp(addr1, addr2, PAGE_SIZE);
+
+ if (!system_supports_mte() || ret)
+ return ret;
+
+ /*
+ * If the page content is identical but at least one of the pages is
+ * tagged, return non-zero to avoid KSM merging. If only one of the
+ * pages is tagged, set_pte_at() may zero or change the tags of the
+ * other page via mte_sync_tags().
+ */
+ if (test_bit(PG_mte_tagged, &page1->flags) ||
+ test_bit(PG_mte_tagged, &page2->flags))
+ return addr1 != addr2;
+
+ return ret;
+}
+
+static void update_sctlr_el1_tcf0(u64 tcf0)
+{
+ /* ISB required for the kernel uaccess routines */
+ sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF0_MASK, tcf0);
+ isb();
+}
+
+static void set_sctlr_el1_tcf0(u64 tcf0)
+{
+ /*
+ * mte_thread_switch() checks current->thread.sctlr_tcf0 as an
+ * optimisation. Disable preemption so that it does not see
+ * the variable update before the SCTLR_EL1.TCF0 one.
+ */
+ preempt_disable();
+ current->thread.sctlr_tcf0 = tcf0;
+ update_sctlr_el1_tcf0(tcf0);
+ preempt_enable();
+}
+
+static void update_gcr_el1_excl(u64 incl)
+{
+ u64 excl = ~incl & SYS_GCR_EL1_EXCL_MASK;
+
+ /*
+ * Note that 'incl' is an include mask (controlled by the user via
+ * prctl()) while GCR_EL1 accepts an exclude mask.
+ * No need for ISB since this only affects EL0 currently, implicit
+ * with ERET.
+ */
+ sysreg_clear_set_s(SYS_GCR_EL1, SYS_GCR_EL1_EXCL_MASK, excl);
+}
+
+static void set_gcr_el1_excl(u64 incl)
+{
+ current->thread.gcr_user_incl = incl;
+ update_gcr_el1_excl(incl);
+}
+
+void flush_mte_state(void)
+{
+ if (!system_supports_mte())
+ return;
+
+ /* clear any pending asynchronous tag fault */
+ dsb(ish);
+ write_sysreg_s(0, SYS_TFSRE0_EL1);
+ clear_thread_flag(TIF_MTE_ASYNC_FAULT);
+ /* disable tag checking */
+ set_sctlr_el1_tcf0(SCTLR_EL1_TCF0_NONE);
+ /* reset tag generation mask */
+ set_gcr_el1_excl(0);
+}
+
+void mte_thread_switch(struct task_struct *next)
+{
+ if (!system_supports_mte())
+ return;
+
+ /* avoid expensive SCTLR_EL1 accesses if no change */
+ if (current->thread.sctlr_tcf0 != next->thread.sctlr_tcf0)
+ update_sctlr_el1_tcf0(next->thread.sctlr_tcf0);
+ update_gcr_el1_excl(next->thread.gcr_user_incl);
+}
+
+void mte_suspend_exit(void)
+{
+ if (!system_supports_mte())
+ return;
+
+ update_gcr_el1_excl(current->thread.gcr_user_incl);
+}
+
+long set_mte_ctrl(struct task_struct *task, unsigned long arg)
+{
+ u64 tcf0;
+ u64 gcr_incl = (arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT;
+
+ if (!system_supports_mte())
+ return 0;
+
+ switch (arg & PR_MTE_TCF_MASK) {
+ case PR_MTE_TCF_NONE:
+ tcf0 = SCTLR_EL1_TCF0_NONE;
+ break;
+ case PR_MTE_TCF_SYNC:
+ tcf0 = SCTLR_EL1_TCF0_SYNC;
+ break;
+ case PR_MTE_TCF_ASYNC:
+ tcf0 = SCTLR_EL1_TCF0_ASYNC;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (task != current) {
+ task->thread.sctlr_tcf0 = tcf0;
+ task->thread.gcr_user_incl = gcr_incl;
+ } else {
+ set_sctlr_el1_tcf0(tcf0);
+ set_gcr_el1_excl(gcr_incl);
+ }
+
+ return 0;
+}
+
+long get_mte_ctrl(struct task_struct *task)
+{
+ unsigned long ret;
+
+ if (!system_supports_mte())
+ return 0;
+
+ ret = task->thread.gcr_user_incl << PR_MTE_TAG_SHIFT;
+
+ switch (task->thread.sctlr_tcf0) {
+ case SCTLR_EL1_TCF0_NONE:
+ return PR_MTE_TCF_NONE;
+ case SCTLR_EL1_TCF0_SYNC:
+ ret |= PR_MTE_TCF_SYNC;
+ break;
+ case SCTLR_EL1_TCF0_ASYNC:
+ ret |= PR_MTE_TCF_ASYNC;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Access MTE tags in another process' address space as given in mm. Update
+ * the number of tags copied. Return 0 if any tags copied, error otherwise.
+ * Inspired by __access_remote_vm().
+ */
+static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
+ struct iovec *kiov, unsigned int gup_flags)
+{
+ struct vm_area_struct *vma;
+ void __user *buf = kiov->iov_base;
+ size_t len = kiov->iov_len;
+ int ret;
+ int write = gup_flags & FOLL_WRITE;
+
+ if (!access_ok(buf, len))
+ return -EFAULT;
+
+ if (mmap_read_lock_killable(mm))
+ return -EIO;
+
+ while (len) {
+ unsigned long tags, offset;
+ void *maddr;
+ struct page *page = NULL;
+
+ ret = get_user_pages_remote(mm, addr, 1, gup_flags, &page,
+ &vma, NULL);
+ if (ret <= 0)
+ break;
+
+ /*
+ * Only copy tags if the page has been mapped as PROT_MTE
+ * (PG_mte_tagged set). Otherwise the tags are not valid and
+ * not accessible to user. Moreover, an mprotect(PROT_MTE)
+ * would cause the existing tags to be cleared if the page
+ * was never mapped with PROT_MTE.
+ */
+ if (!test_bit(PG_mte_tagged, &page->flags)) {
+ ret = -EOPNOTSUPP;
+ put_page(page);
+ break;
+ }
+
+ /* limit access to the end of the page */
+ offset = offset_in_page(addr);
+ tags = min(len, (PAGE_SIZE - offset) / MTE_GRANULE_SIZE);
+
+ maddr = page_address(page);
+ if (write) {
+ tags = mte_copy_tags_from_user(maddr + offset, buf, tags);
+ set_page_dirty_lock(page);
+ } else {
+ tags = mte_copy_tags_to_user(buf, maddr + offset, tags);
+ }
+ put_page(page);
+
+ /* error accessing the tracer's buffer */
+ if (!tags)
+ break;
+
+ len -= tags;
+ buf += tags;
+ addr += tags * MTE_GRANULE_SIZE;
+ }
+ mmap_read_unlock(mm);
+
+ /* return an error if no tags copied */
+ kiov->iov_len = buf - kiov->iov_base;
+ if (!kiov->iov_len) {
+ /* check for error accessing the tracee's address space */
+ if (ret <= 0)
+ return -EIO;
+ else
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy MTE tags in another process' address space at 'addr' to/from tracer's
+ * iovec buffer. Return 0 on success. Inspired by ptrace_access_vm().
+ */
+static int access_remote_tags(struct task_struct *tsk, unsigned long addr,
+ struct iovec *kiov, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return -EPERM;
+
+ if (!tsk->ptrace || (current != tsk->parent) ||
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptracer_capable(tsk, mm->user_ns))) {
+ mmput(mm);
+ return -EPERM;
+ }
+
+ ret = __access_remote_tags(mm, addr, kiov, gup_flags);
+ mmput(mm);
+
+ return ret;
+}
+
+int mte_ptrace_copy_tags(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data)
+{
+ int ret;
+ struct iovec kiov;
+ struct iovec __user *uiov = (void __user *)data;
+ unsigned int gup_flags = FOLL_FORCE;
+
+ if (!system_supports_mte())
+ return -EIO;
+
+ if (get_user(kiov.iov_base, &uiov->iov_base) ||
+ get_user(kiov.iov_len, &uiov->iov_len))
+ return -EFAULT;
+
+ if (request == PTRACE_POKEMTETAGS)
+ gup_flags |= FOLL_WRITE;
+
+ /* align addr to the MTE tag granule */
+ addr &= MTE_GRANULE_MASK;
+
+ ret = access_remote_tags(child, addr, &kiov, gup_flags);
+ if (!ret)
+ ret = put_user(kiov.iov_len, &uiov->iov_len);
+
+ return ret;
+}
diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
index 295d66490584..c07d7a034941 100644
--- a/arch/arm64/kernel/paravirt.c
+++ b/arch/arm64/kernel/paravirt.c
@@ -50,16 +50,19 @@ static u64 pv_steal_clock(int cpu)
struct pv_time_stolen_time_region *reg;
reg = per_cpu_ptr(&stolen_time_region, cpu);
- if (!reg->kaddr) {
- pr_warn_once("stolen time enabled but not configured for cpu %d\n",
- cpu);
+
+ /*
+ * paravirt_steal_clock() may be called before the CPU
+ * online notification callback runs. Until the callback
+ * has run we just return zero.
+ */
+ if (!reg->kaddr)
return 0;
- }
return le64_to_cpu(READ_ONCE(reg->kaddr->stolen_time));
}
-static int stolen_time_dying_cpu(unsigned int cpu)
+static int stolen_time_cpu_down_prepare(unsigned int cpu)
{
struct pv_time_stolen_time_region *reg;
@@ -73,7 +76,7 @@ static int stolen_time_dying_cpu(unsigned int cpu)
return 0;
}
-static int init_stolen_time_cpu(unsigned int cpu)
+static int stolen_time_cpu_online(unsigned int cpu)
{
struct pv_time_stolen_time_region *reg;
struct arm_smccc_res res;
@@ -103,19 +106,20 @@ static int init_stolen_time_cpu(unsigned int cpu)
return 0;
}
-static int pv_time_init_stolen_time(void)
+static int __init pv_time_init_stolen_time(void)
{
int ret;
- ret = cpuhp_setup_state(CPUHP_AP_ARM_KVMPV_STARTING,
- "hypervisor/arm/pvtime:starting",
- init_stolen_time_cpu, stolen_time_dying_cpu);
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "hypervisor/arm/pvtime:online",
+ stolen_time_cpu_online,
+ stolen_time_cpu_down_prepare);
if (ret < 0)
return ret;
return 0;
}
-static bool has_pv_steal_clock(void)
+static bool __init has_pv_steal_clock(void)
{
struct arm_smccc_res res;
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index b0e03e052dd1..88ff471b0bce 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -137,11 +137,11 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
* whist unwinding the stackframe and is like a subroutine return so we use
* the PC.
*/
-static int callchain_trace(struct stackframe *frame, void *data)
+static bool callchain_trace(void *data, unsigned long pc)
{
struct perf_callchain_entry_ctx *entry = data;
- perf_callchain_store(entry, frame->pc);
- return 0;
+ perf_callchain_store(entry, pc);
+ return true;
}
void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 462f9a9cc44b..3605f77ad4df 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -69,6 +69,9 @@ static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
[C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL,
[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB,
+ [C(LL)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD,
+ [C(LL)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_LL_CACHE_RD,
+
[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED,
[C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
};
@@ -302,13 +305,33 @@ static struct attribute_group armv8_pmuv3_format_attr_group = {
.attrs = armv8_pmuv3_format_attrs,
};
+static ssize_t slots_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
+ u32 slots = cpu_pmu->reg_pmmir & ARMV8_PMU_SLOTS_MASK;
+
+ return snprintf(page, PAGE_SIZE, "0x%08x\n", slots);
+}
+
+static DEVICE_ATTR_RO(slots);
+
+static struct attribute *armv8_pmuv3_caps_attrs[] = {
+ &dev_attr_slots.attr,
+ NULL,
+};
+
+static struct attribute_group armv8_pmuv3_caps_attr_group = {
+ .name = "caps",
+ .attrs = armv8_pmuv3_caps_attrs,
+};
+
/*
* Perf Events' indices
*/
#define ARMV8_IDX_CYCLE_COUNTER 0
#define ARMV8_IDX_COUNTER0 1
-#define ARMV8_IDX_COUNTER_LAST(cpu_pmu) \
- (ARMV8_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1)
/*
@@ -348,6 +371,73 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event)
#define ARMV8_IDX_TO_COUNTER(x) \
(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
+/*
+ * This code is really good
+ */
+
+#define PMEVN_CASE(n, case_macro) \
+ case n: case_macro(n); break
+
+#define PMEVN_SWITCH(x, case_macro) \
+ do { \
+ switch (x) { \
+ PMEVN_CASE(0, case_macro); \
+ PMEVN_CASE(1, case_macro); \
+ PMEVN_CASE(2, case_macro); \
+ PMEVN_CASE(3, case_macro); \
+ PMEVN_CASE(4, case_macro); \
+ PMEVN_CASE(5, case_macro); \
+ PMEVN_CASE(6, case_macro); \
+ PMEVN_CASE(7, case_macro); \
+ PMEVN_CASE(8, case_macro); \
+ PMEVN_CASE(9, case_macro); \
+ PMEVN_CASE(10, case_macro); \
+ PMEVN_CASE(11, case_macro); \
+ PMEVN_CASE(12, case_macro); \
+ PMEVN_CASE(13, case_macro); \
+ PMEVN_CASE(14, case_macro); \
+ PMEVN_CASE(15, case_macro); \
+ PMEVN_CASE(16, case_macro); \
+ PMEVN_CASE(17, case_macro); \
+ PMEVN_CASE(18, case_macro); \
+ PMEVN_CASE(19, case_macro); \
+ PMEVN_CASE(20, case_macro); \
+ PMEVN_CASE(21, case_macro); \
+ PMEVN_CASE(22, case_macro); \
+ PMEVN_CASE(23, case_macro); \
+ PMEVN_CASE(24, case_macro); \
+ PMEVN_CASE(25, case_macro); \
+ PMEVN_CASE(26, case_macro); \
+ PMEVN_CASE(27, case_macro); \
+ PMEVN_CASE(28, case_macro); \
+ PMEVN_CASE(29, case_macro); \
+ PMEVN_CASE(30, case_macro); \
+ default: WARN(1, "Invalid PMEV* index\n"); \
+ } \
+ } while (0)
+
+#define RETURN_READ_PMEVCNTRN(n) \
+ return read_sysreg(pmevcntr##n##_el0)
+static unsigned long read_pmevcntrn(int n)
+{
+ PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN);
+ return 0;
+}
+
+#define WRITE_PMEVCNTRN(n) \
+ write_sysreg(val, pmevcntr##n##_el0)
+static void write_pmevcntrn(int n, unsigned long val)
+{
+ PMEVN_SWITCH(n, WRITE_PMEVCNTRN);
+}
+
+#define WRITE_PMEVTYPERN(n) \
+ write_sysreg(val, pmevtyper##n##_el0)
+static void write_pmevtypern(int n, unsigned long val)
+{
+ PMEVN_SWITCH(n, WRITE_PMEVTYPERN);
+}
+
static inline u32 armv8pmu_pmcr_read(void)
{
return read_sysreg(pmcr_el0);
@@ -365,28 +455,16 @@ static inline int armv8pmu_has_overflowed(u32 pmovsr)
return pmovsr & ARMV8_PMU_OVERFLOWED_MASK;
}
-static inline int armv8pmu_counter_valid(struct arm_pmu *cpu_pmu, int idx)
-{
- return idx >= ARMV8_IDX_CYCLE_COUNTER &&
- idx <= ARMV8_IDX_COUNTER_LAST(cpu_pmu);
-}
-
static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx)
{
return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx));
}
-static inline void armv8pmu_select_counter(int idx)
+static inline u32 armv8pmu_read_evcntr(int idx)
{
u32 counter = ARMV8_IDX_TO_COUNTER(idx);
- write_sysreg(counter, pmselr_el0);
- isb();
-}
-static inline u64 armv8pmu_read_evcntr(int idx)
-{
- armv8pmu_select_counter(idx);
- return read_sysreg(pmxevcntr_el0);
+ return read_pmevcntrn(counter);
}
static inline u64 armv8pmu_read_hw_counter(struct perf_event *event)
@@ -440,15 +518,11 @@ static u64 armv8pmu_unbias_long_counter(struct perf_event *event, u64 value)
static u64 armv8pmu_read_counter(struct perf_event *event)
{
- struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
u64 value = 0;
- if (!armv8pmu_counter_valid(cpu_pmu, idx))
- pr_err("CPU%u reading wrong counter %d\n",
- smp_processor_id(), idx);
- else if (idx == ARMV8_IDX_CYCLE_COUNTER)
+ if (idx == ARMV8_IDX_CYCLE_COUNTER)
value = read_sysreg(pmccntr_el0);
else
value = armv8pmu_read_hw_counter(event);
@@ -458,8 +532,9 @@ static u64 armv8pmu_read_counter(struct perf_event *event)
static inline void armv8pmu_write_evcntr(int idx, u64 value)
{
- armv8pmu_select_counter(idx);
- write_sysreg(value, pmxevcntr_el0);
+ u32 counter = ARMV8_IDX_TO_COUNTER(idx);
+
+ write_pmevcntrn(counter, value);
}
static inline void armv8pmu_write_hw_counter(struct perf_event *event,
@@ -477,16 +552,12 @@ static inline void armv8pmu_write_hw_counter(struct perf_event *event,
static void armv8pmu_write_counter(struct perf_event *event, u64 value)
{
- struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
value = armv8pmu_bias_long_counter(event, value);
- if (!armv8pmu_counter_valid(cpu_pmu, idx))
- pr_err("CPU%u writing wrong counter %d\n",
- smp_processor_id(), idx);
- else if (idx == ARMV8_IDX_CYCLE_COUNTER)
+ if (idx == ARMV8_IDX_CYCLE_COUNTER)
write_sysreg(value, pmccntr_el0);
else
armv8pmu_write_hw_counter(event, value);
@@ -494,9 +565,10 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value)
static inline void armv8pmu_write_evtype(int idx, u32 val)
{
- armv8pmu_select_counter(idx);
+ u32 counter = ARMV8_IDX_TO_COUNTER(idx);
+
val &= ARMV8_PMU_EVTYPE_MASK;
- write_sysreg(val, pmxevtyper_el0);
+ write_pmevtypern(counter, val);
}
static inline void armv8pmu_write_event_type(struct perf_event *event)
@@ -516,7 +588,10 @@ static inline void armv8pmu_write_event_type(struct perf_event *event)
armv8pmu_write_evtype(idx - 1, hwc->config_base);
armv8pmu_write_evtype(idx, chain_evt);
} else {
- armv8pmu_write_evtype(idx, hwc->config_base);
+ if (idx == ARMV8_IDX_CYCLE_COUNTER)
+ write_sysreg(hwc->config_base, pmccfiltr_el0);
+ else
+ armv8pmu_write_evtype(idx, hwc->config_base);
}
}
@@ -532,6 +607,11 @@ static u32 armv8pmu_event_cnten_mask(struct perf_event *event)
static inline void armv8pmu_enable_counter(u32 mask)
{
+ /*
+ * Make sure event configuration register writes are visible before we
+ * enable the counter.
+ * */
+ isb();
write_sysreg(mask, pmcntenset_el0);
}
@@ -550,6 +630,11 @@ static inline void armv8pmu_enable_event_counter(struct perf_event *event)
static inline void armv8pmu_disable_counter(u32 mask)
{
write_sysreg(mask, pmcntenclr_el0);
+ /*
+ * Make sure the effects of disabling the counter are visible before we
+ * start configuring the event.
+ */
+ isb();
}
static inline void armv8pmu_disable_event_counter(struct perf_event *event)
@@ -606,15 +691,10 @@ static inline u32 armv8pmu_getreset_flags(void)
static void armv8pmu_enable_event(struct perf_event *event)
{
- unsigned long flags;
- struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
- struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events);
-
/*
* Enable counter and interrupt, and set the counter to count
* the event that we're interested in.
*/
- raw_spin_lock_irqsave(&events->pmu_lock, flags);
/*
* Disable counter
@@ -622,7 +702,7 @@ static void armv8pmu_enable_event(struct perf_event *event)
armv8pmu_disable_event_counter(event);
/*
- * Set event (if destined for PMNx counters).
+ * Set event.
*/
armv8pmu_write_event_type(event);
@@ -635,21 +715,10 @@ static void armv8pmu_enable_event(struct perf_event *event)
* Enable counter
*/
armv8pmu_enable_event_counter(event);
-
- raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
static void armv8pmu_disable_event(struct perf_event *event)
{
- unsigned long flags;
- struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
- struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events);
-
- /*
- * Disable counter and interrupt
- */
- raw_spin_lock_irqsave(&events->pmu_lock, flags);
-
/*
* Disable counter
*/
@@ -659,30 +728,18 @@ static void armv8pmu_disable_event(struct perf_event *event)
* Disable interrupt for this counter
*/
armv8pmu_disable_event_irq(event);
-
- raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
static void armv8pmu_start(struct arm_pmu *cpu_pmu)
{
- unsigned long flags;
- struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events);
-
- raw_spin_lock_irqsave(&events->pmu_lock, flags);
/* Enable all counters */
armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
- raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
{
- unsigned long flags;
- struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events);
-
- raw_spin_lock_irqsave(&events->pmu_lock, flags);
/* Disable all counters */
armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
- raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
}
static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
@@ -735,20 +792,16 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
if (!armpmu_event_set_period(event))
continue;
+ /*
+ * Perf event overflow will queue the processing of the event as
+ * an irq_work which will be taken care of in the handling of
+ * IPI_IRQ_WORK.
+ */
if (perf_event_overflow(event, &data, regs))
cpu_pmu->disable(event);
}
armv8pmu_start(cpu_pmu);
- /*
- * Handle the pending perf events.
- *
- * Note: this call *must* be run with interrupts disabled. For
- * platforms that can have the PMU interrupts raised as an NMI, this
- * will not work.
- */
- irq_work_run();
-
return IRQ_HANDLED;
}
@@ -997,6 +1050,12 @@ static void __armv8pmu_probe_pmu(void *info)
bitmap_from_arr32(cpu_pmu->pmceid_ext_bitmap,
pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS);
+
+ /* store PMMIR_EL1 register for sysfs */
+ if (pmuver >= ID_AA64DFR0_PMUVER_8_4 && (pmceid_raw[1] & BIT(31)))
+ cpu_pmu->reg_pmmir = read_cpuid(PMMIR_EL1);
+ else
+ cpu_pmu->reg_pmmir = 0;
}
static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
@@ -1019,7 +1078,8 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
int (*map_event)(struct perf_event *event),
const struct attribute_group *events,
- const struct attribute_group *format)
+ const struct attribute_group *format,
+ const struct attribute_group *caps)
{
int ret = armv8pmu_probe_pmu(cpu_pmu);
if (ret)
@@ -1044,104 +1104,112 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
events : &armv8_pmuv3_events_attr_group;
cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = format ?
format : &armv8_pmuv3_format_attr_group;
+ cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ?
+ caps : &armv8_pmuv3_caps_attr_group;
return 0;
}
+static int armv8_pmu_init_nogroups(struct arm_pmu *cpu_pmu, char *name,
+ int (*map_event)(struct perf_event *event))
+{
+ return armv8_pmu_init(cpu_pmu, name, map_event, NULL, NULL, NULL);
+}
+
static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_pmuv3",
- armv8_pmuv3_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_pmuv3",
+ armv8_pmuv3_map_event);
}
static int armv8_a34_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a34",
- armv8_pmuv3_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a34",
+ armv8_pmuv3_map_event);
}
static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a35",
- armv8_a53_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a35",
+ armv8_a53_map_event);
}
static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a53",
- armv8_a53_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a53",
+ armv8_a53_map_event);
}
static int armv8_a55_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a55",
- armv8_pmuv3_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a55",
+ armv8_pmuv3_map_event);
}
static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a57",
- armv8_a57_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a57",
+ armv8_a57_map_event);
}
static int armv8_a65_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a65",
- armv8_pmuv3_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a65",
+ armv8_pmuv3_map_event);
}
static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a72",
- armv8_a57_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a72",
+ armv8_a57_map_event);
}
static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a73",
- armv8_a73_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a73",
+ armv8_a73_map_event);
}
static int armv8_a75_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a75",
- armv8_pmuv3_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a75",
+ armv8_pmuv3_map_event);
}
static int armv8_a76_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a76",
- armv8_pmuv3_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a76",
+ armv8_pmuv3_map_event);
}
static int armv8_a77_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cortex_a77",
- armv8_pmuv3_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a77",
+ armv8_pmuv3_map_event);
}
static int armv8_e1_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_neoverse_e1",
- armv8_pmuv3_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_e1",
+ armv8_pmuv3_map_event);
}
static int armv8_n1_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_neoverse_n1",
- armv8_pmuv3_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_n1",
+ armv8_pmuv3_map_event);
}
static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_cavium_thunder",
- armv8_thunder_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cavium_thunder",
+ armv8_thunder_map_event);
}
static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu)
{
- return armv8_pmu_init(cpu_pmu, "armv8_brcm_vulcan",
- armv8_vulcan_map_event, NULL, NULL);
+ return armv8_pmu_init_nogroups(cpu_pmu, "armv8_brcm_vulcan",
+ armv8_vulcan_map_event);
}
static const struct of_device_id armv8_pmu_of_device_ids[] = {
diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
index 666b225aeb3a..94e8718e7229 100644
--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
@@ -16,7 +16,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
/*
* Our handling of compat tasks (PERF_SAMPLE_REGS_ABI_32) is weird, but
- * we're stuck with it for ABI compatability reasons.
+ * we're stuck with it for ABI compatibility reasons.
*
* For a 32-bit consumer inspecting a 32-bit task, then it will look at
* the first 16 registers (see arch/arm/include/uapi/asm/perf_regs.h).
diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c
index 1e77736a4f66..adb955fd9bdd 100644
--- a/arch/arm64/kernel/pointer_auth.c
+++ b/arch/arm64/kernel/pointer_auth.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/prctl.h>
#include <linux/random.h>
@@ -17,6 +18,9 @@ int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg)
if (!system_supports_address_auth() && !system_supports_generic_auth())
return -EINVAL;
+ if (is_compat_thread(task_thread_info(tsk)))
+ return -EINVAL;
+
if (!arg) {
ptrauth_keys_init_user(keys);
return 0;
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c
index 263d5fba4c8a..104101f633b1 100644
--- a/arch/arm64/kernel/probes/decode-insn.c
+++ b/arch/arm64/kernel/probes/decode-insn.c
@@ -29,7 +29,8 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn)
aarch64_insn_is_msr_imm(insn) ||
aarch64_insn_is_msr_reg(insn) ||
aarch64_insn_is_exception(insn) ||
- aarch64_insn_is_eret(insn))
+ aarch64_insn_is_eret(insn) ||
+ aarch64_insn_is_eret_auth(insn))
return false;
/*
@@ -42,8 +43,10 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn)
!= AARCH64_INSN_SPCLREG_DAIF;
/*
- * The HINT instruction is is problematic when single-stepping,
- * except for the NOP case.
+ * The HINT instruction is steppable only if it is in whitelist
+ * and the rest of other such instructions are blocked for
+ * single stepping as they may cause exception or other
+ * unintended behaviour.
*/
if (aarch64_insn_is_hint(insn))
return aarch64_insn_is_steppable_hint(insn);
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index 5290f17a4d80..deba738142ed 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -464,87 +464,15 @@ int __init arch_populate_kprobe_blacklist(void)
void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs)
{
- struct kretprobe_instance *ri = NULL;
- struct hlist_head *head, empty_rp;
- struct hlist_node *tmp;
- unsigned long flags, orig_ret_address = 0;
- unsigned long trampoline_address =
- (unsigned long)&kretprobe_trampoline;
- kprobe_opcode_t *correct_ret_addr = NULL;
-
- INIT_HLIST_HEAD(&empty_rp);
- kretprobe_hash_lock(current, &head, &flags);
-
- /*
- * It is possible to have multiple instances associated with a given
- * task either because multiple functions in the call path have
- * return probes installed on them, and/or more than one
- * return probe was registered for a target function.
- *
- * We can handle this because:
- * - instances are always pushed into the head of the list
- * - when multiple return probes are registered for the same
- * function, the (chronologically) first instance's ret_addr
- * will be the real return address, and all the rest will
- * point to kretprobe_trampoline.
- */
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
-
- orig_ret_address = (unsigned long)ri->ret_addr;
-
- if (orig_ret_address != trampoline_address)
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- break;
- }
-
- kretprobe_assert(ri, orig_ret_address, trampoline_address);
-
- correct_ret_addr = ri->ret_addr;
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
-
- orig_ret_address = (unsigned long)ri->ret_addr;
- if (ri->rp && ri->rp->handler) {
- __this_cpu_write(current_kprobe, &ri->rp->kp);
- get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
- ri->ret_addr = correct_ret_addr;
- ri->rp->handler(ri, regs);
- __this_cpu_write(current_kprobe, NULL);
- }
-
- recycle_rp_inst(ri, &empty_rp);
-
- if (orig_ret_address != trampoline_address)
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- break;
- }
-
- kretprobe_hash_unlock(current, &flags);
-
- hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
- return (void *)orig_ret_address;
+ return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline,
+ (void *)kernel_stack_pointer(regs));
}
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
ri->ret_addr = (kprobe_opcode_t *)regs->regs[30];
+ ri->fp = (void *)kernel_stack_pointer(regs);
/* replace return addr (x30) with trampoline */
regs->regs[30] = (long)&kretprobe_trampoline;
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index f1804496b935..4784011cecac 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -21,6 +21,7 @@
#include <linux/lockdep.h>
#include <linux/mman.h>
#include <linux/mm.h>
+#include <linux/nospec.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/unistd.h>
@@ -52,6 +53,7 @@
#include <asm/exec.h>
#include <asm/fpsimd.h>
#include <asm/mmu_context.h>
+#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/pointer_auth.h>
#include <asm/stacktrace.h>
@@ -239,7 +241,7 @@ static void print_pstate(struct pt_regs *regs)
const char *btype_str = btypes[(pstate & PSR_BTYPE_MASK) >>
PSR_BTYPE_SHIFT];
- printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO BTYPE=%s)\n",
+ printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO %cTCO BTYPE=%s)\n",
pstate,
pstate & PSR_N_BIT ? 'N' : 'n',
pstate & PSR_Z_BIT ? 'Z' : 'z',
@@ -251,6 +253,7 @@ static void print_pstate(struct pt_regs *regs)
pstate & PSR_F_BIT ? 'F' : 'f',
pstate & PSR_PAN_BIT ? '+' : '-',
pstate & PSR_UAO_BIT ? '+' : '-',
+ pstate & PSR_TCO_BIT ? '+' : '-',
btype_str);
}
}
@@ -336,6 +339,7 @@ void flush_thread(void)
tls_thread_flush();
flush_ptrace_hw_breakpoint(current);
flush_tagged_addr_state();
+ flush_mte_state();
}
void release_thread(struct task_struct *dead_task)
@@ -368,6 +372,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
dst->thread.sve_state = NULL;
clear_tsk_thread_flag(dst, TIF_SVE);
+ /* clear any pending asynchronous tag fault raised by the parent */
+ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT);
+
return 0;
}
@@ -421,8 +428,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
cpus_have_const_cap(ARM64_HAS_UAO))
childregs->pstate |= PSR_UAO_BIT;
- if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
- set_ssbs_bit(childregs);
+ spectre_v4_enable_task_mitigation(p);
if (system_uses_irq_prio_masking())
childregs->pmr_save = GIC_PRIO_IRQON;
@@ -472,8 +478,6 @@ void uao_thread_switch(struct task_struct *next)
*/
static void ssbs_thread_switch(struct task_struct *next)
{
- struct pt_regs *regs = task_pt_regs(next);
-
/*
* Nothing to do for kernel threads, but 'regs' may be junk
* (e.g. idle task) so check the flags and bail early.
@@ -485,18 +489,10 @@ static void ssbs_thread_switch(struct task_struct *next)
* If all CPUs implement the SSBS extension, then we just need to
* context-switch the PSTATE field.
*/
- if (cpu_have_feature(cpu_feature(SSBS)))
+ if (cpus_have_const_cap(ARM64_SSBS))
return;
- /* If the mitigation is enabled, then we leave SSBS clear. */
- if ((arm64_get_ssbd_state() == ARM64_SSBD_FORCE_ENABLE) ||
- test_tsk_thread_flag(next, TIF_SSBD))
- return;
-
- if (compat_user_mode(regs))
- set_compat_ssbs_bit(regs);
- else if (user_mode(regs))
- set_ssbs_bit(regs);
+ spectre_v4_enable_task_mitigation(next);
}
/*
@@ -571,6 +567,13 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
*/
dsb(ish);
+ /*
+ * MTE thread switching must happen after the DSB above to ensure that
+ * any asynchronous tag check faults have been logged in the TFSR*_EL1
+ * registers.
+ */
+ mte_thread_switch(next);
+
/* the actual thread switch */
last = cpu_switch_to(prev, next);
@@ -620,6 +623,11 @@ void arch_setup_new_exec(void)
current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
ptrauth_thread_init_user(current);
+
+ if (task_spec_ssb_noexec(current)) {
+ arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS,
+ PR_SPEC_ENABLE);
+ }
}
#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
@@ -628,11 +636,18 @@ void arch_setup_new_exec(void)
*/
static unsigned int tagged_addr_disabled;
-long set_tagged_addr_ctrl(unsigned long arg)
+long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg)
{
- if (is_compat_task())
+ unsigned long valid_mask = PR_TAGGED_ADDR_ENABLE;
+ struct thread_info *ti = task_thread_info(task);
+
+ if (is_compat_thread(ti))
return -EINVAL;
- if (arg & ~PR_TAGGED_ADDR_ENABLE)
+
+ if (system_supports_mte())
+ valid_mask |= PR_MTE_TCF_MASK | PR_MTE_TAG_MASK;
+
+ if (arg & ~valid_mask)
return -EINVAL;
/*
@@ -642,20 +657,28 @@ long set_tagged_addr_ctrl(unsigned long arg)
if (arg & PR_TAGGED_ADDR_ENABLE && tagged_addr_disabled)
return -EINVAL;
- update_thread_flag(TIF_TAGGED_ADDR, arg & PR_TAGGED_ADDR_ENABLE);
+ if (set_mte_ctrl(task, arg) != 0)
+ return -EINVAL;
+
+ update_ti_thread_flag(ti, TIF_TAGGED_ADDR, arg & PR_TAGGED_ADDR_ENABLE);
return 0;
}
-long get_tagged_addr_ctrl(void)
+long get_tagged_addr_ctrl(struct task_struct *task)
{
- if (is_compat_task())
+ long ret = 0;
+ struct thread_info *ti = task_thread_info(task);
+
+ if (is_compat_thread(ti))
return -EINVAL;
- if (test_thread_flag(TIF_TAGGED_ADDR))
- return PR_TAGGED_ADDR_ENABLE;
+ if (test_ti_thread_flag(ti, TIF_TAGGED_ADDR))
+ ret = PR_TAGGED_ADDR_ENABLE;
- return 0;
+ ret |= get_mte_ctrl(task);
+
+ return ret;
}
/*
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
new file mode 100644
index 000000000000..25f3c80b5ffe
--- /dev/null
+++ b/arch/arm64/kernel/proton-pack.c
@@ -0,0 +1,790 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Handle detection, reporting and mitigation of Spectre v1, v2 and v4, as
+ * detailed at:
+ *
+ * https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability
+ *
+ * This code was originally written hastily under an awful lot of stress and so
+ * aspects of it are somewhat hacky. Unfortunately, changing anything in here
+ * instantly makes me feel ill. Thanks, Jann. Thann.
+ *
+ * Copyright (C) 2018 ARM Ltd, All Rights Reserved.
+ * Copyright (C) 2020 Google LLC
+ *
+ * "If there's something strange in your neighbourhood, who you gonna call?"
+ *
+ * Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org>
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
+#include <linux/sched/task_stack.h>
+
+#include <asm/spectre.h>
+#include <asm/traps.h>
+
+/*
+ * We try to ensure that the mitigation state can never change as the result of
+ * onlining a late CPU.
+ */
+static void update_mitigation_state(enum mitigation_state *oldp,
+ enum mitigation_state new)
+{
+ enum mitigation_state state;
+
+ do {
+ state = READ_ONCE(*oldp);
+ if (new <= state)
+ break;
+
+ /* Userspace almost certainly can't deal with this. */
+ if (WARN_ON(system_capabilities_finalized()))
+ break;
+ } while (cmpxchg_relaxed(oldp, state, new) != state);
+}
+
+/*
+ * Spectre v1.
+ *
+ * The kernel can't protect userspace for this one: it's each person for
+ * themselves. Advertise what we're doing and be done with it.
+ */
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+}
+
+/*
+ * Spectre v2.
+ *
+ * This one sucks. A CPU is either:
+ *
+ * - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2.
+ * - Mitigated in hardware and listed in our "safe list".
+ * - Mitigated in software by firmware.
+ * - Mitigated in software by a CPU-specific dance in the kernel and a
+ * firmware call at EL2.
+ * - Vulnerable.
+ *
+ * It's not unlikely for different CPUs in a big.LITTLE system to fall into
+ * different camps.
+ */
+static enum mitigation_state spectre_v2_state;
+
+static bool __read_mostly __nospectre_v2;
+static int __init parse_spectre_v2_param(char *str)
+{
+ __nospectre_v2 = true;
+ return 0;
+}
+early_param("nospectre_v2", parse_spectre_v2_param);
+
+static bool spectre_v2_mitigations_off(void)
+{
+ bool ret = __nospectre_v2 || cpu_mitigations_off();
+
+ if (ret)
+ pr_info_once("spectre-v2 mitigation disabled by command line option\n");
+
+ return ret;
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ switch (spectre_v2_state) {
+ case SPECTRE_UNAFFECTED:
+ return sprintf(buf, "Not affected\n");
+ case SPECTRE_MITIGATED:
+ return sprintf(buf, "Mitigation: Branch predictor hardening\n");
+ case SPECTRE_VULNERABLE:
+ fallthrough;
+ default:
+ return sprintf(buf, "Vulnerable\n");
+ }
+}
+
+static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void)
+{
+ u64 pfr0;
+ static const struct midr_range spectre_v2_safe_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
+ MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
+ MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
+ { /* sentinel */ }
+ };
+
+ /* If the CPU has CSV2 set, we're safe */
+ pfr0 = read_cpuid(ID_AA64PFR0_EL1);
+ if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT))
+ return SPECTRE_UNAFFECTED;
+
+ /* Alternatively, we have a list of unaffected CPUs */
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list))
+ return SPECTRE_UNAFFECTED;
+
+ return SPECTRE_VULNERABLE;
+}
+
+#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED (1)
+
+static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void)
+{
+ int ret;
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_ARCH_WORKAROUND_1, &res);
+
+ ret = res.a0;
+ switch (ret) {
+ case SMCCC_RET_SUCCESS:
+ return SPECTRE_MITIGATED;
+ case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
+ return SPECTRE_UNAFFECTED;
+ default:
+ fallthrough;
+ case SMCCC_RET_NOT_SUPPORTED:
+ return SPECTRE_VULNERABLE;
+ }
+}
+
+bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope)
+{
+ WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+
+ if (spectre_v2_get_cpu_hw_mitigation_state() == SPECTRE_UNAFFECTED)
+ return false;
+
+ if (spectre_v2_get_cpu_fw_mitigation_state() == SPECTRE_UNAFFECTED)
+ return false;
+
+ return true;
+}
+
+DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
+
+enum mitigation_state arm64_get_spectre_v2_state(void)
+{
+ return spectre_v2_state;
+}
+
+#ifdef CONFIG_KVM
+#include <asm/cacheflush.h>
+#include <asm/kvm_asm.h>
+
+atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
+
+static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
+ const char *hyp_vecs_end)
+{
+ void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K);
+ int i;
+
+ for (i = 0; i < SZ_2K; i += 0x80)
+ memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
+
+ __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
+}
+
+static void install_bp_hardening_cb(bp_hardening_cb_t fn)
+{
+ static DEFINE_RAW_SPINLOCK(bp_lock);
+ int cpu, slot = -1;
+ const char *hyp_vecs_start = __smccc_workaround_1_smc;
+ const char *hyp_vecs_end = __smccc_workaround_1_smc +
+ __SMCCC_WORKAROUND_1_SMC_SZ;
+
+ /*
+ * Vinz Clortho takes the hyp_vecs start/end "keys" at
+ * the door when we're a guest. Skip the hyp-vectors work.
+ */
+ if (!is_hyp_mode_available()) {
+ __this_cpu_write(bp_hardening_data.fn, fn);
+ return;
+ }
+
+ raw_spin_lock(&bp_lock);
+ for_each_possible_cpu(cpu) {
+ if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
+ slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
+ break;
+ }
+ }
+
+ if (slot == -1) {
+ slot = atomic_inc_return(&arm64_el2_vector_last_slot);
+ BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
+ __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
+ }
+
+ __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
+ __this_cpu_write(bp_hardening_data.fn, fn);
+ raw_spin_unlock(&bp_lock);
+}
+#else
+static void install_bp_hardening_cb(bp_hardening_cb_t fn)
+{
+ __this_cpu_write(bp_hardening_data.fn, fn);
+}
+#endif /* CONFIG_KVM */
+
+static void call_smc_arch_workaround_1(void)
+{
+ arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
+}
+
+static void call_hvc_arch_workaround_1(void)
+{
+ arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
+}
+
+static void qcom_link_stack_sanitisation(void)
+{
+ u64 tmp;
+
+ asm volatile("mov %0, x30 \n"
+ ".rept 16 \n"
+ "bl . + 4 \n"
+ ".endr \n"
+ "mov x30, %0 \n"
+ : "=&r" (tmp));
+}
+
+static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void)
+{
+ u32 midr = read_cpuid_id();
+ if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) &&
+ ((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1))
+ return NULL;
+
+ return qcom_link_stack_sanitisation;
+}
+
+static enum mitigation_state spectre_v2_enable_fw_mitigation(void)
+{
+ bp_hardening_cb_t cb;
+ enum mitigation_state state;
+
+ state = spectre_v2_get_cpu_fw_mitigation_state();
+ if (state != SPECTRE_MITIGATED)
+ return state;
+
+ if (spectre_v2_mitigations_off())
+ return SPECTRE_VULNERABLE;
+
+ switch (arm_smccc_1_1_get_conduit()) {
+ case SMCCC_CONDUIT_HVC:
+ cb = call_hvc_arch_workaround_1;
+ break;
+
+ case SMCCC_CONDUIT_SMC:
+ cb = call_smc_arch_workaround_1;
+ break;
+
+ default:
+ return SPECTRE_VULNERABLE;
+ }
+
+ /*
+ * Prefer a CPU-specific workaround if it exists. Note that we
+ * still rely on firmware for the mitigation at EL2.
+ */
+ cb = spectre_v2_get_sw_mitigation_cb() ?: cb;
+ install_bp_hardening_cb(cb);
+ return SPECTRE_MITIGATED;
+}
+
+void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
+{
+ enum mitigation_state state;
+
+ WARN_ON(preemptible());
+
+ state = spectre_v2_get_cpu_hw_mitigation_state();
+ if (state == SPECTRE_VULNERABLE)
+ state = spectre_v2_enable_fw_mitigation();
+
+ update_mitigation_state(&spectre_v2_state, state);
+}
+
+/*
+ * Spectre v4.
+ *
+ * If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is
+ * either:
+ *
+ * - Mitigated in hardware and listed in our "safe list".
+ * - Mitigated in hardware via PSTATE.SSBS.
+ * - Mitigated in software by firmware (sometimes referred to as SSBD).
+ *
+ * Wait, that doesn't sound so bad, does it? Keep reading...
+ *
+ * A major source of headaches is that the software mitigation is enabled both
+ * on a per-task basis, but can also be forced on for the kernel, necessitating
+ * both context-switch *and* entry/exit hooks. To make it even worse, some CPUs
+ * allow EL0 to toggle SSBS directly, which can end up with the prctl() state
+ * being stale when re-entering the kernel. The usual big.LITTLE caveats apply,
+ * so you can have systems that have both firmware and SSBS mitigations. This
+ * means we actually have to reject late onlining of CPUs with mitigations if
+ * all of the currently onlined CPUs are safelisted, as the mitigation tends to
+ * be opt-in for userspace. Yes, really, the cure is worse than the disease.
+ *
+ * The only good part is that if the firmware mitigation is present, then it is
+ * present for all CPUs, meaning we don't have to worry about late onlining of a
+ * vulnerable CPU if one of the boot CPUs is using the firmware mitigation.
+ *
+ * Give me a VAX-11/780 any day of the week...
+ */
+static enum mitigation_state spectre_v4_state;
+
+/* This is the per-cpu state tracking whether we need to talk to firmware */
+DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
+
+enum spectre_v4_policy {
+ SPECTRE_V4_POLICY_MITIGATION_DYNAMIC,
+ SPECTRE_V4_POLICY_MITIGATION_ENABLED,
+ SPECTRE_V4_POLICY_MITIGATION_DISABLED,
+};
+
+static enum spectre_v4_policy __read_mostly __spectre_v4_policy;
+
+static const struct spectre_v4_param {
+ const char *str;
+ enum spectre_v4_policy policy;
+} spectre_v4_params[] = {
+ { "force-on", SPECTRE_V4_POLICY_MITIGATION_ENABLED, },
+ { "force-off", SPECTRE_V4_POLICY_MITIGATION_DISABLED, },
+ { "kernel", SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, },
+};
+static int __init parse_spectre_v4_param(char *str)
+{
+ int i;
+
+ if (!str || !str[0])
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(spectre_v4_params); i++) {
+ const struct spectre_v4_param *param = &spectre_v4_params[i];
+
+ if (strncmp(str, param->str, strlen(param->str)))
+ continue;
+
+ __spectre_v4_policy = param->policy;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+early_param("ssbd", parse_spectre_v4_param);
+
+/*
+ * Because this was all written in a rush by people working in different silos,
+ * we've ended up with multiple command line options to control the same thing.
+ * Wrap these up in some helpers, which prefer disabling the mitigation if faced
+ * with contradictory parameters. The mitigation is always either "off",
+ * "dynamic" or "on".
+ */
+static bool spectre_v4_mitigations_off(void)
+{
+ bool ret = cpu_mitigations_off() ||
+ __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
+
+ if (ret)
+ pr_info_once("spectre-v4 mitigation disabled by command-line option\n");
+
+ return ret;
+}
+
+/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */
+static bool spectre_v4_mitigations_dynamic(void)
+{
+ return !spectre_v4_mitigations_off() &&
+ __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC;
+}
+
+static bool spectre_v4_mitigations_on(void)
+{
+ return !spectre_v4_mitigations_off() &&
+ __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED;
+}
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ switch (spectre_v4_state) {
+ case SPECTRE_UNAFFECTED:
+ return sprintf(buf, "Not affected\n");
+ case SPECTRE_MITIGATED:
+ return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n");
+ case SPECTRE_VULNERABLE:
+ fallthrough;
+ default:
+ return sprintf(buf, "Vulnerable\n");
+ }
+}
+
+enum mitigation_state arm64_get_spectre_v4_state(void)
+{
+ return spectre_v4_state;
+}
+
+static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void)
+{
+ static const struct midr_range spectre_v4_safe_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
+ MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
+ MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
+ { /* sentinel */ },
+ };
+
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list))
+ return SPECTRE_UNAFFECTED;
+
+ /* CPU features are detected first */
+ if (this_cpu_has_cap(ARM64_SSBS))
+ return SPECTRE_MITIGATED;
+
+ return SPECTRE_VULNERABLE;
+}
+
+static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void)
+{
+ int ret;
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_ARCH_WORKAROUND_2, &res);
+
+ ret = res.a0;
+ switch (ret) {
+ case SMCCC_RET_SUCCESS:
+ return SPECTRE_MITIGATED;
+ case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
+ fallthrough;
+ case SMCCC_RET_NOT_REQUIRED:
+ return SPECTRE_UNAFFECTED;
+ default:
+ fallthrough;
+ case SMCCC_RET_NOT_SUPPORTED:
+ return SPECTRE_VULNERABLE;
+ }
+}
+
+bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope)
+{
+ enum mitigation_state state;
+
+ WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+
+ state = spectre_v4_get_cpu_hw_mitigation_state();
+ if (state == SPECTRE_VULNERABLE)
+ state = spectre_v4_get_cpu_fw_mitigation_state();
+
+ return state != SPECTRE_UNAFFECTED;
+}
+
+static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr)
+{
+ if (user_mode(regs))
+ return 1;
+
+ if (instr & BIT(PSTATE_Imm_shift))
+ regs->pstate |= PSR_SSBS_BIT;
+ else
+ regs->pstate &= ~PSR_SSBS_BIT;
+
+ arm64_skip_faulting_instruction(regs, 4);
+ return 0;
+}
+
+static struct undef_hook ssbs_emulation_hook = {
+ .instr_mask = ~(1U << PSTATE_Imm_shift),
+ .instr_val = 0xd500401f | PSTATE_SSBS,
+ .fn = ssbs_emulation_handler,
+};
+
+static enum mitigation_state spectre_v4_enable_hw_mitigation(void)
+{
+ static bool undef_hook_registered = false;
+ static DEFINE_RAW_SPINLOCK(hook_lock);
+ enum mitigation_state state;
+
+ /*
+ * If the system is mitigated but this CPU doesn't have SSBS, then
+ * we must be on the safelist and there's nothing more to do.
+ */
+ state = spectre_v4_get_cpu_hw_mitigation_state();
+ if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS))
+ return state;
+
+ raw_spin_lock(&hook_lock);
+ if (!undef_hook_registered) {
+ register_undef_hook(&ssbs_emulation_hook);
+ undef_hook_registered = true;
+ }
+ raw_spin_unlock(&hook_lock);
+
+ if (spectre_v4_mitigations_off()) {
+ sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
+ asm volatile(SET_PSTATE_SSBS(1));
+ return SPECTRE_VULNERABLE;
+ }
+
+ /* SCTLR_EL1.DSSBS was initialised to 0 during boot */
+ asm volatile(SET_PSTATE_SSBS(0));
+ return SPECTRE_MITIGATED;
+}
+
+/*
+ * Patch a branch over the Spectre-v4 mitigation code with a NOP so that
+ * we fallthrough and check whether firmware needs to be called on this CPU.
+ */
+void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt,
+ __le32 *origptr,
+ __le32 *updptr, int nr_inst)
+{
+ BUG_ON(nr_inst != 1); /* Branch -> NOP */
+
+ if (spectre_v4_mitigations_off())
+ return;
+
+ if (cpus_have_final_cap(ARM64_SSBS))
+ return;
+
+ if (spectre_v4_mitigations_dynamic())
+ *updptr = cpu_to_le32(aarch64_insn_gen_nop());
+}
+
+/*
+ * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction
+ * to call into firmware to adjust the mitigation state.
+ */
+void __init spectre_v4_patch_fw_mitigation_conduit(struct alt_instr *alt,
+ __le32 *origptr,
+ __le32 *updptr, int nr_inst)
+{
+ u32 insn;
+
+ BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */
+
+ switch (arm_smccc_1_1_get_conduit()) {
+ case SMCCC_CONDUIT_HVC:
+ insn = aarch64_insn_get_hvc_value();
+ break;
+ case SMCCC_CONDUIT_SMC:
+ insn = aarch64_insn_get_smc_value();
+ break;
+ default:
+ return;
+ }
+
+ *updptr = cpu_to_le32(insn);
+}
+
+static enum mitigation_state spectre_v4_enable_fw_mitigation(void)
+{
+ enum mitigation_state state;
+
+ state = spectre_v4_get_cpu_fw_mitigation_state();
+ if (state != SPECTRE_MITIGATED)
+ return state;
+
+ if (spectre_v4_mitigations_off()) {
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL);
+ return SPECTRE_VULNERABLE;
+ }
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL);
+
+ if (spectre_v4_mitigations_dynamic())
+ __this_cpu_write(arm64_ssbd_callback_required, 1);
+
+ return SPECTRE_MITIGATED;
+}
+
+void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
+{
+ enum mitigation_state state;
+
+ WARN_ON(preemptible());
+
+ state = spectre_v4_enable_hw_mitigation();
+ if (state == SPECTRE_VULNERABLE)
+ state = spectre_v4_enable_fw_mitigation();
+
+ update_mitigation_state(&spectre_v4_state, state);
+}
+
+static void __update_pstate_ssbs(struct pt_regs *regs, bool state)
+{
+ u64 bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
+
+ if (state)
+ regs->pstate |= bit;
+ else
+ regs->pstate &= ~bit;
+}
+
+void spectre_v4_enable_task_mitigation(struct task_struct *tsk)
+{
+ struct pt_regs *regs = task_pt_regs(tsk);
+ bool ssbs = false, kthread = tsk->flags & PF_KTHREAD;
+
+ if (spectre_v4_mitigations_off())
+ ssbs = true;
+ else if (spectre_v4_mitigations_dynamic() && !kthread)
+ ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD);
+
+ __update_pstate_ssbs(regs, ssbs);
+}
+
+/*
+ * The Spectre-v4 mitigation can be controlled via a prctl() from userspace.
+ * This is interesting because the "speculation disabled" behaviour can be
+ * configured so that it is preserved across exec(), which means that the
+ * prctl() may be necessary even when PSTATE.SSBS can be toggled directly
+ * from userspace.
+ */
+static void ssbd_prctl_enable_mitigation(struct task_struct *task)
+{
+ task_clear_spec_ssb_noexec(task);
+ task_set_spec_ssb_disable(task);
+ set_tsk_thread_flag(task, TIF_SSBD);
+}
+
+static void ssbd_prctl_disable_mitigation(struct task_struct *task)
+{
+ task_clear_spec_ssb_noexec(task);
+ task_clear_spec_ssb_disable(task);
+ clear_tsk_thread_flag(task, TIF_SSBD);
+}
+
+static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ /* Enable speculation: disable mitigation */
+ /*
+ * Force disabled speculation prevents it from being
+ * re-enabled.
+ */
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+
+ /*
+ * If the mitigation is forced on, then speculation is forced
+ * off and we again prevent it from being re-enabled.
+ */
+ if (spectre_v4_mitigations_on())
+ return -EPERM;
+
+ ssbd_prctl_disable_mitigation(task);
+ break;
+ case PR_SPEC_FORCE_DISABLE:
+ /* Force disable speculation: force enable mitigation */
+ /*
+ * If the mitigation is forced off, then speculation is forced
+ * on and we prevent it from being disabled.
+ */
+ if (spectre_v4_mitigations_off())
+ return -EPERM;
+
+ task_set_spec_ssb_force_disable(task);
+ fallthrough;
+ case PR_SPEC_DISABLE:
+ /* Disable speculation: enable mitigation */
+ /* Same as PR_SPEC_FORCE_DISABLE */
+ if (spectre_v4_mitigations_off())
+ return -EPERM;
+
+ ssbd_prctl_enable_mitigation(task);
+ break;
+ case PR_SPEC_DISABLE_NOEXEC:
+ /* Disable speculation until execve(): enable mitigation */
+ /*
+ * If the mitigation state is forced one way or the other, then
+ * we must fail now before we try to toggle it on execve().
+ */
+ if (task_spec_ssb_force_disable(task) ||
+ spectre_v4_mitigations_off() ||
+ spectre_v4_mitigations_on()) {
+ return -EPERM;
+ }
+
+ ssbd_prctl_enable_mitigation(task);
+ task_set_spec_ssb_noexec(task);
+ break;
+ default:
+ return -ERANGE;
+ }
+
+ spectre_v4_enable_task_mitigation(task);
+ return 0;
+}
+
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+ unsigned long ctrl)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssbd_prctl_set(task, ctrl);
+ default:
+ return -ENODEV;
+ }
+}
+
+static int ssbd_prctl_get(struct task_struct *task)
+{
+ switch (spectre_v4_state) {
+ case SPECTRE_UNAFFECTED:
+ return PR_SPEC_NOT_AFFECTED;
+ case SPECTRE_MITIGATED:
+ if (spectre_v4_mitigations_on())
+ return PR_SPEC_NOT_AFFECTED;
+
+ if (spectre_v4_mitigations_dynamic())
+ break;
+
+ /* Mitigations are disabled, so we're vulnerable. */
+ fallthrough;
+ case SPECTRE_VULNERABLE:
+ fallthrough;
+ default:
+ return PR_SPEC_ENABLE;
+ }
+
+ /* Check the mitigation state for this task */
+ if (task_spec_ssb_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+
+ if (task_spec_ssb_noexec(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;
+
+ if (task_spec_ssb_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+}
+
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssbd_prctl_get(task);
+ default:
+ return -ENODEV;
+ }
+}
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index d8ebfd813e28..f49b349e16a3 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -34,6 +34,7 @@
#include <asm/cpufeature.h>
#include <asm/debug-monitors.h>
#include <asm/fpsimd.h>
+#include <asm/mte.h>
#include <asm/pointer_auth.h>
#include <asm/stacktrace.h>
#include <asm/syscall.h>
@@ -1032,6 +1033,35 @@ static int pac_generic_keys_set(struct task_struct *target,
#endif /* CONFIG_CHECKPOINT_RESTORE */
#endif /* CONFIG_ARM64_PTR_AUTH */
+#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
+static int tagged_addr_ctrl_get(struct task_struct *target,
+ const struct user_regset *regset,
+ struct membuf to)
+{
+ long ctrl = get_tagged_addr_ctrl(target);
+
+ if (IS_ERR_VALUE(ctrl))
+ return ctrl;
+
+ return membuf_write(&to, &ctrl, sizeof(ctrl));
+}
+
+static int tagged_addr_ctrl_set(struct task_struct *target, const struct
+ user_regset *regset, unsigned int pos,
+ unsigned int count, const void *kbuf, const
+ void __user *ubuf)
+{
+ int ret;
+ long ctrl;
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1);
+ if (ret)
+ return ret;
+
+ return set_tagged_addr_ctrl(target, ctrl);
+}
+#endif
+
enum aarch64_regset {
REGSET_GPR,
REGSET_FPR,
@@ -1051,6 +1081,9 @@ enum aarch64_regset {
REGSET_PACG_KEYS,
#endif
#endif
+#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
+ REGSET_TAGGED_ADDR_CTRL,
+#endif
};
static const struct user_regset aarch64_regsets[] = {
@@ -1148,6 +1181,16 @@ static const struct user_regset aarch64_regsets[] = {
},
#endif
#endif
+#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
+ [REGSET_TAGGED_ADDR_CTRL] = {
+ .core_note_type = NT_ARM_TAGGED_ADDR_CTRL,
+ .n = 1,
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .regset_get = tagged_addr_ctrl_get,
+ .set = tagged_addr_ctrl_set,
+ },
+#endif
};
static const struct user_regset_view user_aarch64_view = {
@@ -1691,6 +1734,12 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
+ switch (request) {
+ case PTRACE_PEEKMTETAGS:
+ case PTRACE_POKEMTETAGS:
+ return mte_ptrace_copy_tags(child, request, addr, data);
+ }
+
return ptrace_request(child, request, addr, data);
}
@@ -1793,7 +1842,7 @@ void syscall_trace_exit(struct pt_regs *regs)
* We also reserve IL for the kernel; SS is handled dynamically.
*/
#define SPSR_EL1_AARCH64_RES0_BITS \
- (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \
+ (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 26) | GENMASK_ULL(23, 22) | \
GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5))
#define SPSR_EL1_AARCH32_RES0_BITS \
(GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20))
diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S
index 542d6edc6806..84eec95ec06c 100644
--- a/arch/arm64/kernel/relocate_kernel.S
+++ b/arch/arm64/kernel/relocate_kernel.S
@@ -36,18 +36,6 @@ SYM_CODE_START(arm64_relocate_new_kernel)
mov x14, xzr /* x14 = entry ptr */
mov x13, xzr /* x13 = copy dest */
- /* Clear the sctlr_el2 flags. */
- mrs x0, CurrentEL
- cmp x0, #CurrentEL_EL2
- b.ne 1f
- mrs x0, sctlr_el2
- mov_q x1, SCTLR_ELx_FLAGS
- bic x0, x0, x1
- pre_disable_mmu_workaround
- msr sctlr_el2, x0
- isb
-1:
-
/* Check if the new image needs relocation. */
tbnz x16, IND_DONE_BIT, .Ldone
diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c
index a5e8b3b9d798..a6d18755652f 100644
--- a/arch/arm64/kernel/return_address.c
+++ b/arch/arm64/kernel/return_address.c
@@ -18,16 +18,16 @@ struct return_address_data {
void *addr;
};
-static int save_return_addr(struct stackframe *frame, void *d)
+static bool save_return_addr(void *d, unsigned long pc)
{
struct return_address_data *data = d;
if (!data->level) {
- data->addr = (void *)frame->pc;
- return 1;
+ data->addr = (void *)pc;
+ return false;
} else {
--data->level;
- return 0;
+ return true;
}
}
NOKPROBE_SYMBOL(save_return_addr);
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 53acbeca4f57..133257ffd859 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -217,7 +217,7 @@ static void __init request_standard_resources(void)
if (!standard_resources)
panic("%s: Failed to allocate %zu bytes\n", __func__, res_size);
- for_each_memblock(memory, region) {
+ for_each_mem_region(region) {
res = &standard_resources[i++];
if (memblock_is_nomap(region)) {
res->name = "reserved";
@@ -257,7 +257,7 @@ static int __init reserve_memblock_reserved_regions(void)
if (!memblock_is_region_reserved(mem->start, mem_size))
continue;
- for_each_reserved_mem_region(j, &r_start, &r_end) {
+ for_each_reserved_mem_range(j, &r_start, &r_end) {
resource_size_t start, end;
start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 3b4f31f35e45..a8184cad8890 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -244,7 +244,8 @@ static int preserve_sve_context(struct sve_context __user *ctx)
if (vq) {
/*
* This assumes that the SVE state has already been saved to
- * the task struct by calling preserve_fpsimd_context().
+ * the task struct by calling the function
+ * fpsimd_signal_preserve_current_state().
*/
err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET,
current->thread.sve_state,
@@ -748,6 +749,9 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
regs->pstate |= PSR_BTYPE_C;
}
+ /* TCO (Tag Check Override) always cleared for signal handlers */
+ regs->pstate &= ~PSR_TCO_BIT;
+
if (ka->sa.sa_flags & SA_RESTORER)
sigtramp = ka->sa.sa_restorer;
else
@@ -932,11 +936,16 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
if (thread_flags & _TIF_UPROBE)
uprobe_notify_resume(regs);
+ if (thread_flags & _TIF_MTE_ASYNC_FAULT) {
+ clear_thread_flag(TIF_MTE_ASYNC_FAULT);
+ send_sig_fault(SIGSEGV, SEGV_MTEAERR,
+ (void __user *)NULL, current);
+ }
+
if (thread_flags & _TIF_SIGPENDING)
do_signal(regs);
if (thread_flags & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
rseq_handle_notify_resume(NULL, regs);
}
diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S
index 1f93809528a4..d62447964ed9 100644
--- a/arch/arm64/kernel/smccc-call.S
+++ b/arch/arm64/kernel/smccc-call.S
@@ -9,7 +9,6 @@
#include <asm/assembler.h>
.macro SMCCC instr
- .cfi_startproc
\instr #0
ldr x4, [sp]
stp x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS]
@@ -21,7 +20,6 @@
b.ne 1f
str x6, [x4, ARM_SMCCC_QUIRK_STATE_OFFS]
1: ret
- .cfi_endproc
.endm
/*
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 355ee9eed4dd..82e75fc2c903 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -30,6 +30,7 @@
#include <linux/completion.h>
#include <linux/of.h>
#include <linux/irq_work.h>
+#include <linux/kernel_stat.h>
#include <linux/kexec.h>
#include <linux/kvm_host.h>
@@ -72,10 +73,18 @@ enum ipi_msg_type {
IPI_CPU_CRASH_STOP,
IPI_TIMER,
IPI_IRQ_WORK,
- IPI_WAKEUP
+ IPI_WAKEUP,
+ NR_IPI
};
+static int ipi_irq_base __read_mostly;
+static int nr_ipi __read_mostly = NR_IPI;
+static struct irq_desc *ipi_desc[NR_IPI] __read_mostly;
+
+static void ipi_setup(int cpu);
+
#ifdef CONFIG_HOTPLUG_CPU
+static void ipi_teardown(int cpu);
static int op_cpu_kill(unsigned int cpu);
#else
static inline int op_cpu_kill(unsigned int cpu)
@@ -237,6 +246,8 @@ asmlinkage notrace void secondary_start_kernel(void)
*/
notify_cpu_starting(cpu);
+ ipi_setup(cpu);
+
store_cpu_topology(cpu);
numa_add_cpu(cpu);
@@ -302,6 +313,7 @@ int __cpu_disable(void)
* and we must not schedule until we're ready to give up the cpu.
*/
set_cpu_online(cpu, false);
+ ipi_teardown(cpu);
/*
* OK - migrate IRQs away from this CPU
@@ -772,13 +784,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
}
}
-void (*__smp_cross_call)(const struct cpumask *, unsigned int);
-
-void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int))
-{
- __smp_cross_call = fn;
-}
-
static const char *ipi_types[NR_IPI] __tracepoint_string = {
#define S(x,s) [x] = s
S(IPI_RESCHEDULE, "Rescheduling interrupts"),
@@ -790,35 +795,25 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = {
S(IPI_WAKEUP, "CPU wake-up interrupts"),
};
-static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
-{
- trace_ipi_raise(target, ipi_types[ipinr]);
- __smp_cross_call(target, ipinr);
-}
+static void smp_cross_call(const struct cpumask *target, unsigned int ipinr);
-void show_ipi_list(struct seq_file *p, int prec)
+unsigned long irq_err_count;
+
+int arch_show_interrupts(struct seq_file *p, int prec)
{
unsigned int cpu, i;
for (i = 0; i < NR_IPI; i++) {
+ unsigned int irq = irq_desc_get_irq(ipi_desc[i]);
seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i,
prec >= 4 ? " " : "");
for_each_online_cpu(cpu)
- seq_printf(p, "%10u ",
- __get_irq_stat(cpu, ipi_irqs[i]));
+ seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
seq_printf(p, " %s\n", ipi_types[i]);
}
-}
-
-u64 smp_irq_stat_cpu(unsigned int cpu)
-{
- u64 sum = 0;
- int i;
- for (i = 0; i < NR_IPI; i++)
- sum += __get_irq_stat(cpu, ipi_irqs[i]);
-
- return sum;
+ seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count);
+ return 0;
}
void arch_send_call_function_ipi_mask(const struct cpumask *mask)
@@ -841,8 +836,7 @@ void arch_send_wakeup_ipi_mask(const struct cpumask *mask)
#ifdef CONFIG_IRQ_WORK
void arch_irq_work_raise(void)
{
- if (__smp_cross_call)
- smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK);
+ smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK);
}
#endif
@@ -890,15 +884,12 @@ static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
/*
* Main handler for inter-processor interrupts
*/
-void handle_IPI(int ipinr, struct pt_regs *regs)
+static void do_handle_IPI(int ipinr)
{
unsigned int cpu = smp_processor_id();
- struct pt_regs *old_regs = set_irq_regs(regs);
- if ((unsigned)ipinr < NR_IPI) {
+ if ((unsigned)ipinr < NR_IPI)
trace_ipi_entry_rcuidle(ipi_types[ipinr]);
- __inc_irq_stat(cpu, ipi_irqs[ipinr]);
- }
switch (ipinr) {
case IPI_RESCHEDULE:
@@ -906,21 +897,16 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
break;
case IPI_CALL_FUNC:
- irq_enter();
generic_smp_call_function_interrupt();
- irq_exit();
break;
case IPI_CPU_STOP:
- irq_enter();
local_cpu_stop();
- irq_exit();
break;
case IPI_CPU_CRASH_STOP:
if (IS_ENABLED(CONFIG_KEXEC_CORE)) {
- irq_enter();
- ipi_cpu_crash_stop(cpu, regs);
+ ipi_cpu_crash_stop(cpu, get_irq_regs());
unreachable();
}
@@ -928,17 +914,13 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
case IPI_TIMER:
- irq_enter();
tick_receive_broadcast();
- irq_exit();
break;
#endif
#ifdef CONFIG_IRQ_WORK
case IPI_IRQ_WORK:
- irq_enter();
irq_work_run();
- irq_exit();
break;
#endif
@@ -957,7 +939,66 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
if ((unsigned)ipinr < NR_IPI)
trace_ipi_exit_rcuidle(ipi_types[ipinr]);
- set_irq_regs(old_regs);
+}
+
+static irqreturn_t ipi_handler(int irq, void *data)
+{
+ do_handle_IPI(irq - ipi_irq_base);
+ return IRQ_HANDLED;
+}
+
+static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
+{
+ trace_ipi_raise(target, ipi_types[ipinr]);
+ __ipi_send_mask(ipi_desc[ipinr], target);
+}
+
+static void ipi_setup(int cpu)
+{
+ int i;
+
+ if (WARN_ON_ONCE(!ipi_irq_base))
+ return;
+
+ for (i = 0; i < nr_ipi; i++)
+ enable_percpu_irq(ipi_irq_base + i, 0);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void ipi_teardown(int cpu)
+{
+ int i;
+
+ if (WARN_ON_ONCE(!ipi_irq_base))
+ return;
+
+ for (i = 0; i < nr_ipi; i++)
+ disable_percpu_irq(ipi_irq_base + i);
+}
+#endif
+
+void __init set_smp_ipi_range(int ipi_base, int n)
+{
+ int i;
+
+ WARN_ON(n < NR_IPI);
+ nr_ipi = min(n, NR_IPI);
+
+ for (i = 0; i < nr_ipi; i++) {
+ int err;
+
+ err = request_percpu_irq(ipi_base + i, ipi_handler,
+ "IPI", &cpu_number);
+ WARN_ON(err);
+
+ ipi_desc[i] = irq_to_desc(ipi_base + i);
+ irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);
+ }
+
+ ipi_irq_base = ipi_base;
+
+ /* Setup the boot CPU immediately */
+ ipi_setup(smp_processor_id());
}
void smp_send_reschedule(int cpu)
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
index c8a3fee00c11..5892e79fa429 100644
--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -83,9 +83,9 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu)
/*
* We write the release address as LE regardless of the native
- * endianess of the kernel. Therefore, any boot-loaders that
+ * endianness of the kernel. Therefore, any boot-loaders that
* read this address need to convert this address to the
- * boot-loader's endianess before jumping. This is mandated by
+ * boot-loader's endianness before jumping. This is mandated by
* the boot protocol.
*/
writeq_relaxed(__pa_symbol(secondary_holding_pen), release_addr);
diff --git a/arch/arm64/kernel/ssbd.c b/arch/arm64/kernel/ssbd.c
deleted file mode 100644
index b26955f56750..000000000000
--- a/arch/arm64/kernel/ssbd.c
+++ /dev/null
@@ -1,129 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2018 ARM Ltd, All Rights Reserved.
- */
-
-#include <linux/compat.h>
-#include <linux/errno.h>
-#include <linux/prctl.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/thread_info.h>
-
-#include <asm/cpufeature.h>
-
-static void ssbd_ssbs_enable(struct task_struct *task)
-{
- u64 val = is_compat_thread(task_thread_info(task)) ?
- PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
-
- task_pt_regs(task)->pstate |= val;
-}
-
-static void ssbd_ssbs_disable(struct task_struct *task)
-{
- u64 val = is_compat_thread(task_thread_info(task)) ?
- PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
-
- task_pt_regs(task)->pstate &= ~val;
-}
-
-/*
- * prctl interface for SSBD
- */
-static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
-{
- int state = arm64_get_ssbd_state();
-
- /* Unsupported */
- if (state == ARM64_SSBD_UNKNOWN)
- return -ENODEV;
-
- /* Treat the unaffected/mitigated state separately */
- if (state == ARM64_SSBD_MITIGATED) {
- switch (ctrl) {
- case PR_SPEC_ENABLE:
- return -EPERM;
- case PR_SPEC_DISABLE:
- case PR_SPEC_FORCE_DISABLE:
- return 0;
- }
- }
-
- /*
- * Things are a bit backward here: the arm64 internal API
- * *enables the mitigation* when the userspace API *disables
- * speculation*. So much fun.
- */
- switch (ctrl) {
- case PR_SPEC_ENABLE:
- /* If speculation is force disabled, enable is not allowed */
- if (state == ARM64_SSBD_FORCE_ENABLE ||
- task_spec_ssb_force_disable(task))
- return -EPERM;
- task_clear_spec_ssb_disable(task);
- clear_tsk_thread_flag(task, TIF_SSBD);
- ssbd_ssbs_enable(task);
- break;
- case PR_SPEC_DISABLE:
- if (state == ARM64_SSBD_FORCE_DISABLE)
- return -EPERM;
- task_set_spec_ssb_disable(task);
- set_tsk_thread_flag(task, TIF_SSBD);
- ssbd_ssbs_disable(task);
- break;
- case PR_SPEC_FORCE_DISABLE:
- if (state == ARM64_SSBD_FORCE_DISABLE)
- return -EPERM;
- task_set_spec_ssb_disable(task);
- task_set_spec_ssb_force_disable(task);
- set_tsk_thread_flag(task, TIF_SSBD);
- ssbd_ssbs_disable(task);
- break;
- default:
- return -ERANGE;
- }
-
- return 0;
-}
-
-int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
- unsigned long ctrl)
-{
- switch (which) {
- case PR_SPEC_STORE_BYPASS:
- return ssbd_prctl_set(task, ctrl);
- default:
- return -ENODEV;
- }
-}
-
-static int ssbd_prctl_get(struct task_struct *task)
-{
- switch (arm64_get_ssbd_state()) {
- case ARM64_SSBD_UNKNOWN:
- return -ENODEV;
- case ARM64_SSBD_FORCE_ENABLE:
- return PR_SPEC_DISABLE;
- case ARM64_SSBD_KERNEL:
- if (task_spec_ssb_force_disable(task))
- return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
- if (task_spec_ssb_disable(task))
- return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
- return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- case ARM64_SSBD_FORCE_DISABLE:
- return PR_SPEC_ENABLE;
- default:
- return PR_SPEC_NOT_AFFECTED;
- }
-}
-
-int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
-{
- switch (which) {
- case PR_SPEC_STORE_BYPASS:
- return ssbd_prctl_get(task);
- default:
- return -ENODEV;
- }
-}
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 2dd8e3b8b94b..fa56af1a59c3 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -118,12 +118,12 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
NOKPROBE_SYMBOL(unwind_frame);
void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
- int (*fn)(struct stackframe *, void *), void *data)
+ bool (*fn)(void *, unsigned long), void *data)
{
while (1) {
int ret;
- if (fn(frame, data))
+ if (!fn(data, frame->pc))
break;
ret = unwind_frame(tsk, frame);
if (ret < 0)
@@ -132,84 +132,89 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
}
NOKPROBE_SYMBOL(walk_stackframe);
-#ifdef CONFIG_STACKTRACE
-struct stack_trace_data {
- struct stack_trace *trace;
- unsigned int no_sched_functions;
- unsigned int skip;
-};
-
-static int save_trace(struct stackframe *frame, void *d)
+static void dump_backtrace_entry(unsigned long where, const char *loglvl)
{
- struct stack_trace_data *data = d;
- struct stack_trace *trace = data->trace;
- unsigned long addr = frame->pc;
-
- if (data->no_sched_functions && in_sched_functions(addr))
- return 0;
- if (data->skip) {
- data->skip--;
- return 0;
- }
-
- trace->entries[trace->nr_entries++] = addr;
-
- return trace->nr_entries >= trace->max_entries;
+ printk("%s %pS\n", loglvl, (void *)where);
}
-void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
+void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
+ const char *loglvl)
{
- struct stack_trace_data data;
struct stackframe frame;
+ int skip = 0;
- data.trace = trace;
- data.skip = trace->skip;
- data.no_sched_functions = 0;
+ pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk);
- start_backtrace(&frame, regs->regs[29], regs->pc);
- walk_stackframe(current, &frame, save_trace, &data);
-}
-EXPORT_SYMBOL_GPL(save_stack_trace_regs);
+ if (regs) {
+ if (user_mode(regs))
+ return;
+ skip = 1;
+ }
-static noinline void __save_stack_trace(struct task_struct *tsk,
- struct stack_trace *trace, unsigned int nosched)
-{
- struct stack_trace_data data;
- struct stackframe frame;
+ if (!tsk)
+ tsk = current;
if (!try_get_task_stack(tsk))
return;
- data.trace = trace;
- data.skip = trace->skip;
- data.no_sched_functions = nosched;
-
- if (tsk != current) {
- start_backtrace(&frame, thread_saved_fp(tsk),
- thread_saved_pc(tsk));
- } else {
- /* We don't want this function nor the caller */
- data.skip += 2;
+ if (tsk == current) {
start_backtrace(&frame,
(unsigned long)__builtin_frame_address(0),
- (unsigned long)__save_stack_trace);
+ (unsigned long)dump_backtrace);
+ } else {
+ /*
+ * task blocked in __switch_to
+ */
+ start_backtrace(&frame,
+ thread_saved_fp(tsk),
+ thread_saved_pc(tsk));
}
- walk_stackframe(tsk, &frame, save_trace, &data);
+ printk("%sCall trace:\n", loglvl);
+ do {
+ /* skip until specified stack frame */
+ if (!skip) {
+ dump_backtrace_entry(frame.pc, loglvl);
+ } else if (frame.fp == regs->regs[29]) {
+ skip = 0;
+ /*
+ * Mostly, this is the case where this function is
+ * called in panic/abort. As exception handler's
+ * stack frame does not contain the corresponding pc
+ * at which an exception has taken place, use regs->pc
+ * instead.
+ */
+ dump_backtrace_entry(regs->pc, loglvl);
+ }
+ } while (!unwind_frame(tsk, &frame));
put_task_stack(tsk);
}
-void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl)
{
- __save_stack_trace(tsk, trace, 1);
+ dump_backtrace(NULL, tsk, loglvl);
+ barrier();
}
-EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
-void save_stack_trace(struct stack_trace *trace)
+#ifdef CONFIG_STACKTRACE
+
+void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
+ struct task_struct *task, struct pt_regs *regs)
{
- __save_stack_trace(current, trace, 0);
+ struct stackframe frame;
+
+ if (regs)
+ start_backtrace(&frame, regs->regs[29], regs->pc);
+ else if (task == current)
+ start_backtrace(&frame,
+ (unsigned long)__builtin_frame_address(0),
+ (unsigned long)arch_stack_walk);
+ else
+ start_backtrace(&frame, thread_saved_fp(task),
+ thread_saved_pc(task));
+
+ walk_stackframe(task, &frame, consume_entry, cookie);
}
-EXPORT_SYMBOL_GPL(save_stack_trace);
#endif
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index c1dee9066ff9..96cd347c7a46 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -10,6 +10,7 @@
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/exec.h>
+#include <asm/mte.h>
#include <asm/memory.h>
#include <asm/mmu_context.h>
#include <asm/smp_plat.h>
@@ -72,8 +73,10 @@ void notrace __cpu_suspend_exit(void)
* have turned the mitigation on. If the user has forcefully
* disabled it, make sure their wishes are obeyed.
*/
- if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
- arm64_set_ssbd_mitigation(false);
+ spectre_v4_enable_mitigation(NULL);
+
+ /* Restore additional MTE-specific configuration */
+ mte_suspend_exit();
}
/*
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index 5f0c04863d2c..e4c0dadf0d92 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -123,6 +123,16 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
local_daif_restore(DAIF_PROCCTX);
user_exit();
+ if (system_supports_mte() && (flags & _TIF_MTE_ASYNC_FAULT)) {
+ /*
+ * Process the asynchronous tag check fault before the actual
+ * syscall. do_notify_resume() will send a signal to userspace
+ * before the syscall is restarted.
+ */
+ regs->regs[0] = -ERESTARTNOINTR;
+ return;
+ }
+
if (has_syscall_work(flags)) {
/*
* The de-facto standard way to skip a system call using ptrace
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 0801a0f3c156..543c67cae02f 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -36,21 +36,23 @@ void store_cpu_topology(unsigned int cpuid)
if (mpidr & MPIDR_UP_BITMASK)
return;
- /* Create cpu topology mapping based on MPIDR. */
- if (mpidr & MPIDR_MT_BITMASK) {
- /* Multiprocessor system : Multi-threads per core */
- cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
- cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1);
- cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
- MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8;
- } else {
- /* Multiprocessor system : Single-thread per core */
- cpuid_topo->thread_id = -1;
- cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
- cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
- MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 |
- MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16;
- }
+ /*
+ * This would be the place to create cpu topology based on MPIDR.
+ *
+ * However, it cannot be trusted to depict the actual topology; some
+ * pieces of the architecture enforce an artificial cap on Aff0 values
+ * (e.g. GICv3's ICC_SGI1R_EL1 limits it to 15), leading to an
+ * artificial cycling of Aff1, Aff2 and Aff3 values. IOW, these end up
+ * having absolutely no relationship to the actual underlying system
+ * topology, and cannot be reasonably used as core / package ID.
+ *
+ * If the MT bit is set, Aff0 *could* be used to define a thread ID, but
+ * we still wouldn't be able to obtain a sane core ID. This means we
+ * need to entirely ignore MPIDR for any topology deduction.
+ */
+ cpuid_topo->thread_id = -1;
+ cpuid_topo->core_id = cpuid;
+ cpuid_topo->package_id = cpu_to_node(cpuid);
pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n",
cpuid, cpuid_topo->package_id, cpuid_topo->core_id,
@@ -246,6 +248,13 @@ static int __init init_amu_fie(void)
static_branch_enable(&amu_fie_key);
}
+ /*
+ * If the system is not fully invariant after AMU init, disable
+ * partial use of counters for frequency invariance.
+ */
+ if (!topology_scale_freq_invariant())
+ static_branch_disable(&amu_fie_key);
+
free_valid_mask:
free_cpumask_var(valid_cpus);
@@ -253,7 +262,7 @@ free_valid_mask:
}
late_initcall_sync(init_amu_fie);
-bool arch_freq_counters_available(struct cpumask *cpus)
+bool arch_freq_counters_available(const struct cpumask *cpus)
{
return amu_freq_invariant() &&
cpumask_subset(cpus, amu_fie_cpus);
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 13ebd5ca2070..8af4e0e85736 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -34,6 +34,7 @@
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
+#include <asm/extable.h>
#include <asm/insn.h>
#include <asm/kprobes.h>
#include <asm/traps.h>
@@ -53,11 +54,6 @@ static const char *handler[]= {
int show_unhandled_signals = 0;
-static void dump_backtrace_entry(unsigned long where, const char *loglvl)
-{
- printk("%s %pS\n", loglvl, (void *)where);
-}
-
static void dump_kernel_instr(const char *lvl, struct pt_regs *regs)
{
unsigned long addr = instruction_pointer(regs);
@@ -83,66 +79,6 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs)
printk("%sCode: %s\n", lvl, str);
}
-void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
- const char *loglvl)
-{
- struct stackframe frame;
- int skip = 0;
-
- pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk);
-
- if (regs) {
- if (user_mode(regs))
- return;
- skip = 1;
- }
-
- if (!tsk)
- tsk = current;
-
- if (!try_get_task_stack(tsk))
- return;
-
- if (tsk == current) {
- start_backtrace(&frame,
- (unsigned long)__builtin_frame_address(0),
- (unsigned long)dump_backtrace);
- } else {
- /*
- * task blocked in __switch_to
- */
- start_backtrace(&frame,
- thread_saved_fp(tsk),
- thread_saved_pc(tsk));
- }
-
- printk("%sCall trace:\n", loglvl);
- do {
- /* skip until specified stack frame */
- if (!skip) {
- dump_backtrace_entry(frame.pc, loglvl);
- } else if (frame.fp == regs->regs[29]) {
- skip = 0;
- /*
- * Mostly, this is the case where this function is
- * called in panic/abort. As exception handler's
- * stack frame does not contain the corresponding pc
- * at which an exception has taken place, use regs->pc
- * instead.
- */
- dump_backtrace_entry(regs->pc, loglvl);
- }
- } while (!unwind_frame(tsk, &frame));
-
- put_task_stack(tsk);
-}
-
-void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl)
-{
- dump_backtrace(NULL, tsk, loglvl);
- barrier();
-}
-
#ifdef CONFIG_PREEMPT
#define S_PREEMPT " PREEMPT"
#elif defined(CONFIG_PREEMPT_RT)
@@ -200,9 +136,9 @@ void die(const char *str, struct pt_regs *regs, int err)
oops_exit();
if (in_interrupt())
- panic("Fatal exception in interrupt");
+ panic("%s: Fatal exception in interrupt", str);
if (panic_on_oops)
- panic("Fatal exception");
+ panic("%s: Fatal exception", str);
raw_spin_unlock_irqrestore(&die_lock, flags);
@@ -412,7 +348,7 @@ exit:
return fn ? fn(regs, instr) : 1;
}
-void force_signal_inject(int signal, int code, unsigned long address)
+void force_signal_inject(int signal, int code, unsigned long address, unsigned int err)
{
const char *desc;
struct pt_regs *regs = current_pt_regs();
@@ -438,7 +374,7 @@ void force_signal_inject(int signal, int code, unsigned long address)
signal = SIGKILL;
}
- arm64_notify_die(desc, regs, signal, code, (void __user *)address, 0);
+ arm64_notify_die(desc, regs, signal, code, (void __user *)address, err);
}
/*
@@ -455,7 +391,7 @@ void arm64_notify_segfault(unsigned long addr)
code = SEGV_ACCERR;
mmap_read_unlock(current->mm);
- force_signal_inject(SIGSEGV, code, addr);
+ force_signal_inject(SIGSEGV, code, addr, 0);
}
void do_undefinstr(struct pt_regs *regs)
@@ -468,17 +404,28 @@ void do_undefinstr(struct pt_regs *regs)
return;
BUG_ON(!user_mode(regs));
- force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc);
+ force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
}
NOKPROBE_SYMBOL(do_undefinstr);
void do_bti(struct pt_regs *regs)
{
BUG_ON(!user_mode(regs));
- force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc);
+ force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
}
NOKPROBE_SYMBOL(do_bti);
+void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr)
+{
+ /*
+ * Unexpected FPAC exception or pointer authentication failure in
+ * the kernel: kill the task before it does any more harm.
+ */
+ BUG_ON(!user_mode(regs));
+ force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr);
+}
+NOKPROBE_SYMBOL(do_ptrauth_fault);
+
#define __user_cache_maint(insn, address, res) \
if (address >= user_addr_max()) { \
res = -EFAULT; \
@@ -528,7 +475,7 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs)
__user_cache_maint("ic ivau", address, ret);
break;
default:
- force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc);
+ force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
return;
}
@@ -581,7 +528,7 @@ static void mrs_handler(unsigned int esr, struct pt_regs *regs)
sysreg = esr_sys64_to_sysreg(esr);
if (do_emulate_mrs(regs, sysreg, rt) != 0)
- force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc);
+ force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
}
static void wfi_handler(unsigned int esr, struct pt_regs *regs)
@@ -775,6 +722,7 @@ static const char *esr_class_str[] = {
[ESR_ELx_EC_SYS64] = "MSR/MRS (AArch64)",
[ESR_ELx_EC_SVE] = "SVE",
[ESR_ELx_EC_ERET] = "ERET/ERETAA/ERETAB",
+ [ESR_ELx_EC_FPAC] = "FPAC",
[ESR_ELx_EC_IMP_DEF] = "EL3 IMP DEF",
[ESR_ELx_EC_IABT_LOW] = "IABT (lower EL)",
[ESR_ELx_EC_IABT_CUR] = "IABT (current EL)",
@@ -935,26 +883,6 @@ asmlinkage void enter_from_user_mode(void)
}
NOKPROBE_SYMBOL(enter_from_user_mode);
-void __pte_error(const char *file, int line, unsigned long val)
-{
- pr_err("%s:%d: bad pte %016lx.\n", file, line, val);
-}
-
-void __pmd_error(const char *file, int line, unsigned long val)
-{
- pr_err("%s:%d: bad pmd %016lx.\n", file, line, val);
-}
-
-void __pud_error(const char *file, int line, unsigned long val)
-{
- pr_err("%s:%d: bad pud %016lx.\n", file, line, val);
-}
-
-void __pgd_error(const char *file, int line, unsigned long val)
-{
- pr_err("%s:%d: bad pgd %016lx.\n", file, line, val);
-}
-
/* GENERIC_BUG traps */
int is_valid_bugaddr(unsigned long addr)
@@ -994,6 +922,21 @@ static struct break_hook bug_break_hook = {
.imm = BUG_BRK_IMM,
};
+static int reserved_fault_handler(struct pt_regs *regs, unsigned int esr)
+{
+ pr_err("%s generated an invalid instruction at %pS!\n",
+ in_bpf_jit(regs) ? "BPF JIT" : "Kernel text patching",
+ (void *)instruction_pointer(regs));
+
+ /* We cannot handle this */
+ return DBG_HOOK_ERROR;
+}
+
+static struct break_hook fault_break_hook = {
+ .fn = reserved_fault_handler,
+ .imm = FAULT_BRK_IMM,
+};
+
#ifdef CONFIG_KASAN_SW_TAGS
#define KASAN_ESR_RECOVER 0x20
@@ -1059,6 +1002,7 @@ int __init early_brk64(unsigned long addr, unsigned int esr,
void __init trap_init(void)
{
register_kernel_break_hook(&bug_break_hook);
+ register_kernel_break_hook(&fault_break_hook);
#ifdef CONFIG_KASAN_SW_TAGS
register_kernel_break_hook(&kasan_break_hook);
#endif
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index d4202a32abc9..debb8995d57f 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -30,15 +30,11 @@
#include <asm/vdso.h>
extern char vdso_start[], vdso_end[];
-#ifdef CONFIG_COMPAT_VDSO
extern char vdso32_start[], vdso32_end[];
-#endif /* CONFIG_COMPAT_VDSO */
enum vdso_abi {
VDSO_ABI_AA64,
-#ifdef CONFIG_COMPAT_VDSO
VDSO_ABI_AA32,
-#endif /* CONFIG_COMPAT_VDSO */
};
enum vvar_pages {
@@ -284,21 +280,17 @@ up_fail:
/*
* Create and map the vectors page for AArch32 tasks.
*/
-#ifdef CONFIG_COMPAT_VDSO
static int aarch32_vdso_mremap(const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma)
{
return __vdso_remap(VDSO_ABI_AA32, sm, new_vma);
}
-#endif /* CONFIG_COMPAT_VDSO */
enum aarch32_map {
AA32_MAP_VECTORS, /* kuser helpers */
-#ifdef CONFIG_COMPAT_VDSO
+ AA32_MAP_SIGPAGE,
AA32_MAP_VVAR,
AA32_MAP_VDSO,
-#endif
- AA32_MAP_SIGPAGE
};
static struct page *aarch32_vectors_page __ro_after_init;
@@ -309,7 +301,10 @@ static struct vm_special_mapping aarch32_vdso_maps[] = {
.name = "[vectors]", /* ABI */
.pages = &aarch32_vectors_page,
},
-#ifdef CONFIG_COMPAT_VDSO
+ [AA32_MAP_SIGPAGE] = {
+ .name = "[sigpage]", /* ABI */
+ .pages = &aarch32_sig_page,
+ },
[AA32_MAP_VVAR] = {
.name = "[vvar]",
.fault = vvar_fault,
@@ -319,11 +314,6 @@ static struct vm_special_mapping aarch32_vdso_maps[] = {
.name = "[vdso]",
.mremap = aarch32_vdso_mremap,
},
-#endif /* CONFIG_COMPAT_VDSO */
- [AA32_MAP_SIGPAGE] = {
- .name = "[sigpage]", /* ABI */
- .pages = &aarch32_sig_page,
- },
};
static int aarch32_alloc_kuser_vdso_page(void)
@@ -362,25 +352,25 @@ static int aarch32_alloc_sigpage(void)
return 0;
}
-#ifdef CONFIG_COMPAT_VDSO
static int __aarch32_alloc_vdso_pages(void)
{
+
+ if (!IS_ENABLED(CONFIG_COMPAT_VDSO))
+ return 0;
+
vdso_info[VDSO_ABI_AA32].dm = &aarch32_vdso_maps[AA32_MAP_VVAR];
vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO];
return __vdso_init(VDSO_ABI_AA32);
}
-#endif /* CONFIG_COMPAT_VDSO */
static int __init aarch32_alloc_vdso_pages(void)
{
int ret;
-#ifdef CONFIG_COMPAT_VDSO
ret = __aarch32_alloc_vdso_pages();
if (ret)
return ret;
-#endif
ret = aarch32_alloc_sigpage();
if (ret)
@@ -449,14 +439,12 @@ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
if (ret)
goto out;
-#ifdef CONFIG_COMPAT_VDSO
- ret = __setup_additional_pages(VDSO_ABI_AA32,
- mm,
- bprm,
- uses_interp);
- if (ret)
- goto out;
-#endif /* CONFIG_COMPAT_VDSO */
+ if (IS_ENABLED(CONFIG_COMPAT_VDSO)) {
+ ret = __setup_additional_pages(VDSO_ABI_AA32, mm, bprm,
+ uses_interp);
+ if (ret)
+ goto out;
+ }
ret = aarch32_sigreturn_setup(mm);
out:
@@ -497,8 +485,7 @@ static int __init vdso_init(void)
}
arch_initcall(vdso_init);
-int arch_setup_additional_pages(struct linux_binprm *bprm,
- int uses_interp)
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
struct mm_struct *mm = current->mm;
int ret;
@@ -506,11 +493,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = __setup_additional_pages(VDSO_ABI_AA64,
- mm,
- bprm,
- uses_interp);
-
+ ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp);
mmap_write_unlock(mm);
return ret;
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 45d5cfe46429..d65f52264aba 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -24,14 +24,13 @@ btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti
# routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so
# preparation in build-time C")).
ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \
- -Bsymbolic $(call ld-option, --no-eh-frame-hdr) --build-id -n \
+ -Bsymbolic $(call ld-option, --no-eh-frame-hdr) --build-id=sha1 -n \
$(btildflags-y) -T
ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18
ccflags-y += -DDISABLE_BRANCH_PROFILING
CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS)
-KBUILD_CFLAGS += $(DISABLE_LTO)
KASAN_SANITIZE := n
UBSAN_SANITIZE := n
OBJECT_FILES_NON_STANDARD := y
@@ -43,18 +42,11 @@ ifneq ($(c-gettimeofday-y),)
CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y)
endif
-# Clang versions less than 8 do not support -mcmodel=tiny
-ifeq ($(CONFIG_CC_IS_CLANG), y)
- ifeq ($(shell test $(CONFIG_CLANG_VERSION) -lt 80000; echo $$?),0)
- CFLAGS_REMOVE_vgettimeofday.o += -mcmodel=tiny
- endif
-endif
-
# Disable gcov profiling for VDSO code
GCOV_PROFILE := n
obj-y += vdso.o
-extra-y += vdso.lds
+targets += vdso.lds
CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
# Force dependency (incbin is bad)
diff --git a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh
index 0664acaf61ff..8b806eacd0a6 100755
--- a/arch/arm64/kernel/vdso/gen_vdso_offsets.sh
+++ b/arch/arm64/kernel/vdso/gen_vdso_offsets.sh
@@ -8,7 +8,7 @@
# Doing this inside the Makefile will break the $(filter-out) function,
# causing Kbuild to rebuild the vdso-offsets header file every time.
#
-# Author: Will Deacon <will.deacon@arm.com
+# Author: Will Deacon <will.deacon@arm.com>
#
LC_ALL=C
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index d6adb4677c25..7f96a1a9f68c 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -90,9 +90,9 @@ VDSO_CFLAGS += -O2
# Some useful compiler-dependent flags from top-level Makefile
VDSO_CFLAGS += $(call cc32-option,-Wdeclaration-after-statement,)
VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign)
-VDSO_CFLAGS += $(call cc32-option,-fno-strict-overflow)
+VDSO_CFLAGS += -fno-strict-overflow
VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes)
-VDSO_CFLAGS += $(call cc32-option,-Werror=date-time)
+VDSO_CFLAGS += -Werror=date-time
VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types)
# The 32-bit compiler does not provide 128-bit integers, which are used in
@@ -128,7 +128,7 @@ VDSO_LDFLAGS += -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1
VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
VDSO_LDFLAGS += -nostdlib -shared -mfloat-abi=soft
VDSO_LDFLAGS += -Wl,--hash-style=sysv
-VDSO_LDFLAGS += -Wl,--build-id
+VDSO_LDFLAGS += -Wl,--build-id=sha1
VDSO_LDFLAGS += $(call cc32-ldoption,-fuse-ld=bfd)
@@ -155,7 +155,7 @@ asm-obj-vdso := $(addprefix $(obj)/, $(asm-obj-vdso))
obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso)
obj-y += vdso.o
-extra-y += vdso.lds
+targets += vdso.lds
CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
# Force dependency (vdso.s includes vdso.so through incbin)
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 7cba7623fcec..6d78c041fdf6 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -6,9 +6,11 @@
*/
#define RO_EXCEPTION_TABLE_ALIGN 8
+#define RUNTIME_DISCARD_EXIT
#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>
+#include <asm/hyp_image.h>
#include <asm/kernel-pgtable.h>
#include <asm/memory.h>
#include <asm/page.h>
@@ -21,12 +23,23 @@ ENTRY(_text)
jiffies = jiffies_64;
+#ifdef CONFIG_KVM
#define HYPERVISOR_EXTABLE \
. = ALIGN(SZ_8); \
__start___kvm_ex_table = .; \
*(__kvm_ex_table) \
__stop___kvm_ex_table = .;
+#define HYPERVISOR_PERCPU_SECTION \
+ . = ALIGN(PAGE_SIZE); \
+ HYP_SECTION_NAME(.data..percpu) : { \
+ *(HYP_SECTION_NAME(.data..percpu)) \
+ }
+#else /* CONFIG_KVM */
+#define HYPERVISOR_EXTABLE
+#define HYPERVISOR_PERCPU_SECTION
+#endif
+
#define HYPERVISOR_TEXT \
/* \
* Align to 4 KB so that \
@@ -96,16 +109,13 @@ SECTIONS
* matching the same input section name. There is no documented
* order of matching.
*/
+ DISCARDS
/DISCARD/ : {
- EXIT_CALL
- *(.discard)
- *(.discard.*)
*(.interp .dynamic)
*(.dynsym .dynstr .hash .gnu.hash)
- *(.eh_frame)
}
- . = KIMAGE_VADDR + TEXT_OFFSET;
+ . = KIMAGE_VADDR;
.head.text : {
_text = .;
@@ -131,6 +141,14 @@ SECTIONS
*(.got) /* Global offset table */
}
+ /*
+ * Make sure that the .got.plt is either completely empty or it
+ * contains only the lazy dispatch entries.
+ */
+ .got.plt : { *(.got.plt) }
+ ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18,
+ "Unexpected GOT/PLT entries detected!")
+
. = ALIGN(SEGMENT_ALIGN);
_etext = .; /* End of text section */
@@ -190,6 +208,7 @@ SECTIONS
}
PERCPU_SECTION(L1_CACHE_BYTES)
+ HYPERVISOR_PERCPU_SECTION
.rela.dyn : ALIGN(8) {
*(.rela .rela*)
@@ -249,8 +268,22 @@ SECTIONS
_end = .;
STABS_DEBUG
+ DWARF_DEBUG
+ ELF_DETAILS
HEAD_SYMBOLS
+
+ /*
+ * Sections that should stay zero sized, which is safer to
+ * explicitly check instead of blindly discarding.
+ */
+ .plt : {
+ *(.plt) *(.plt.*) *(.iplt) *(.igot)
+ }
+ ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
+
+ .data.rel.ro : { *(.data.rel.ro) }
+ ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!")
}
#include "image-vars.h"
@@ -274,4 +307,4 @@ ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE,
/*
* If padding is applied before .head.text, virt<->phys conversions will fail.
*/
-ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned")
+ASSERT(_text == KIMAGE_VADDR, "HEAD is misaligned")
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 318c8f2df245..043756db8f6e 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -57,9 +57,6 @@ config KVM_ARM_PMU
Adds support for a virtual Performance Monitoring Unit (PMU) in
virtual machines.
-config KVM_INDIRECT_VECTORS
- def_bool HARDEN_BRANCH_PREDICTOR || RANDOMIZE_BASE
-
endif # KVM
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 99977c1972cc..1504c81fbf5d 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/
kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
$(KVM)/vfio.o $(KVM)/irqchip.o \
arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
- inject_fault.o regmap.o va_layout.o hyp.o handle_exit.o \
+ inject_fault.o regmap.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o \
vgic-sys-reg-v3.o fpsimd.o pmu.o \
aarch32.o arch_timer.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 46dc3d75cf13..f56122eedffc 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -46,8 +46,10 @@
__asm__(".arch_extension virt");
#endif
-DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
+
static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
/* The VMID used in the VTTBR */
static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
@@ -145,6 +147,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
{
int i;
+ bitmap_free(kvm->arch.pmu_filter);
+
kvm_vgic_destroy(kvm);
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
@@ -206,6 +210,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
*/
r = 1;
break;
+ case KVM_CAP_STEAL_TIME:
+ r = kvm_arm_pvtime_supported();
+ break;
default:
r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
break;
@@ -283,7 +290,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
static_branch_dec(&userspace_irqchip_in_use);
- kvm_mmu_free_memory_caches(vcpu);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
kvm_timer_vcpu_terminate(vcpu);
kvm_pmu_vcpu_destroy(vcpu);
@@ -1256,12 +1263,60 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
}
+static unsigned long nvhe_percpu_size(void)
+{
+ return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
+ (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
+}
+
+static unsigned long nvhe_percpu_order(void)
+{
+ unsigned long size = nvhe_percpu_size();
+
+ return size ? get_order(size) : 0;
+}
+
+static int kvm_map_vectors(void)
+{
+ /*
+ * SV2 = ARM64_SPECTRE_V2
+ * HEL2 = ARM64_HARDEN_EL2_VECTORS
+ *
+ * !SV2 + !HEL2 -> use direct vectors
+ * SV2 + !HEL2 -> use hardened vectors in place
+ * !SV2 + HEL2 -> allocate one vector slot and use exec mapping
+ * SV2 + HEL2 -> use hardened vectors and use exec mapping
+ */
+ if (cpus_have_const_cap(ARM64_SPECTRE_V2)) {
+ __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
+ __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
+ }
+
+ if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
+ phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs);
+ unsigned long size = __BP_HARDEN_HYP_VECS_SZ;
+
+ /*
+ * Always allocate a spare vector slot, as we don't
+ * know yet which CPUs have a BP hardening slot that
+ * we can reuse.
+ */
+ __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
+ BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
+ return create_hyp_exec_mappings(vect_pa, size,
+ &__kvm_bp_vect_base);
+ }
+
+ return 0;
+}
+
static void cpu_init_hyp_mode(void)
{
phys_addr_t pgd_ptr;
unsigned long hyp_stack_ptr;
unsigned long vector_ptr;
unsigned long tpidr_el2;
+ struct arm_smccc_res res;
/* Switch from the HYP stub to our own HYP init vector */
__hyp_set_vectors(kvm_get_idmap_vector());
@@ -1271,12 +1326,13 @@ static void cpu_init_hyp_mode(void)
* kernel's mapping to the linear mapping, and store it in tpidr_el2
* so that we can use adr_l to access per-cpu variables in EL2.
*/
- tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) -
- (unsigned long)kvm_ksym_ref(&kvm_host_data));
+ tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
+ (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
pgd_ptr = kvm_mmu_get_httbr();
hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
- vector_ptr = (unsigned long)kvm_get_hyp_vector();
+ hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr);
+ vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
/*
* Call initialization code, and switch to the full blown HYP code.
@@ -1285,14 +1341,16 @@ static void cpu_init_hyp_mode(void)
* cpus_have_const_cap() wrapper.
*/
BUG_ON(!system_capabilities_finalized());
- __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
+ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init),
+ pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res);
+ WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
/*
* Disabling SSBD on a non-VHE system requires us to enable SSBS
* at EL2.
*/
if (this_cpu_has_cap(ARM64_SSBS) &&
- arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
+ arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
kvm_call_hyp_nvhe(__kvm_enable_ssbs);
}
}
@@ -1305,10 +1363,12 @@ static void cpu_hyp_reset(void)
static void cpu_hyp_reinit(void)
{
- kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
+ kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
cpu_hyp_reset();
+ *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector();
+
if (is_kernel_in_hyp_mode())
kvm_timer_init_vhe();
else
@@ -1459,8 +1519,10 @@ static void teardown_hyp_mode(void)
int cpu;
free_hyp_pgds();
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+ free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
+ }
}
/**
@@ -1494,6 +1556,24 @@ static int init_hyp_mode(void)
}
/*
+ * Allocate and initialize pages for Hypervisor-mode percpu regions.
+ */
+ for_each_possible_cpu(cpu) {
+ struct page *page;
+ void *page_addr;
+
+ page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
+ if (!page) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ page_addr = page_address(page);
+ memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
+ kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
+ }
+
+ /*
* Map the Hyp-code called directly from the host
*/
err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
@@ -1537,22 +1617,21 @@ static int init_hyp_mode(void)
}
}
+ /*
+ * Map Hyp percpu pages
+ */
for_each_possible_cpu(cpu) {
- kvm_host_data_t *cpu_data;
+ char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
+ char *percpu_end = percpu_begin + nvhe_percpu_size();
- cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
- err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
+ err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
if (err) {
- kvm_err("Cannot map host CPU state: %d\n", err);
+ kvm_err("Cannot map hyp percpu region\n");
goto out_err;
}
}
- err = hyp_map_aux_data();
- if (err)
- kvm_err("Cannot map host auxiliary data: %d\n", err);
-
return 0;
out_err:
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
deleted file mode 100644
index 3c79a1124af2..000000000000
--- a/arch/arm64/kvm/hyp.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/linkage.h>
-
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-
-/*
- * u64 __kvm_call_hyp(void *hypfn, ...);
- *
- * This is not really a variadic function in the classic C-way and care must
- * be taken when calling this to ensure parameters are passed in registers
- * only, since the stack will change between the caller and the callee.
- *
- * Call the function with the first argument containing a pointer to the
- * function you wish to call in Hyp mode, and subsequent arguments will be
- * passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the
- * function pointer can be passed). The function being called must be mapped
- * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are
- * passed in x0.
- *
- * A function pointer with a value less than 0xfff has a special meaning,
- * and is used to implement hyp stubs in the same way as in
- * arch/arm64/kernel/hyp_stub.S.
- */
-SYM_FUNC_START(__kvm_call_hyp)
- hvc #0
- ret
-SYM_FUNC_END(__kvm_call_hyp)
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index f54f0e89a71c..4a81eddabcd8 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -10,5 +10,4 @@ subdir-ccflags-y := -I$(incdir) \
-DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_KVM) += vhe/ nvhe/
-obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o smccc_wa.o
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 76e7eaf4675e..b0afad7a99c6 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <asm/alternative.h>
-#include <asm/asm-offsets.h>
#include <asm/assembler.h>
#include <asm/fpsimdmacros.h>
#include <asm/kvm.h>
@@ -16,66 +15,28 @@
#include <asm/kvm_mmu.h>
#include <asm/kvm_ptrauth.h>
-#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
-#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8)
-
.text
/*
- * We treat x18 as callee-saved as the host may use it as a platform
- * register (e.g. for shadow call stack).
- */
-.macro save_callee_saved_regs ctxt
- str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
- stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
- stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
- stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
- stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
- stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
- stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
-.endm
-
-.macro restore_callee_saved_regs ctxt
- // We require \ctxt is not x18-x28
- ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
- ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
- ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
- ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
- ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
- ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
- ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
-.endm
-
-.macro save_sp_el0 ctxt, tmp
- mrs \tmp, sp_el0
- str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
-.endm
-
-.macro restore_sp_el0 ctxt, tmp
- ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
- msr sp_el0, \tmp
-.endm
-
-/*
- * u64 __guest_enter(struct kvm_vcpu *vcpu,
- * struct kvm_cpu_context *host_ctxt);
+ * u64 __guest_enter(struct kvm_vcpu *vcpu);
*/
SYM_FUNC_START(__guest_enter)
// x0: vcpu
- // x1: host context
- // x2-x17: clobbered by macros
+ // x1-x17: clobbered by macros
// x29: guest context
- // Store the host regs
+ adr_this_cpu x1, kvm_hyp_ctxt, x2
+
+ // Store the hyp regs
save_callee_saved_regs x1
- // Save the host's sp_el0
+ // Save hyp's sp_el0
save_sp_el0 x1, x2
- // Now the host state is stored if we have a pending RAS SError it must
- // affect the host. If any asynchronous exception is pending we defer
- // the guest entry. The DSB isn't necessary before v8.2 as any SError
- // would be fatal.
+ // Now the hyp state is stored if we have a pending RAS SError it must
+ // affect the host or hyp. If any asynchronous exception is pending we
+ // defer the guest entry. The DSB isn't necessary before v8.2 as any
+ // SError would be fatal.
alternative_if ARM64_HAS_RAS_EXTN
dsb nshst
isb
@@ -86,6 +47,8 @@ alternative_else_nop_endif
ret
1:
+ set_loaded_vcpu x0, x1, x2
+
add x29, x0, #VCPU_CONTEXT
// Macro ptrauth_switch_to_guest format:
@@ -116,6 +79,26 @@ alternative_else_nop_endif
eret
sb
+SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
+ // x2-x29,lr: vcpu regs
+ // vcpu x0-x1 on the stack
+
+ // If the hyp context is loaded, go straight to hyp_panic
+ get_loaded_vcpu x0, x1
+ cbz x0, hyp_panic
+
+ // The hyp context is saved so make sure it is restored to allow
+ // hyp_panic to run at hyp and, subsequently, panic to run in the host.
+ // This makes use of __guest_exit to avoid duplication but sets the
+ // return address to tail call into hyp_panic. As a side effect, the
+ // current state is saved to the guest context but it will only be
+ // accurate if the guest had been completely restored.
+ adr_this_cpu x0, kvm_hyp_ctxt, x1
+ adr x1, hyp_panic
+ str x1, [x0, #CPU_XREG_OFFSET(30)]
+
+ get_vcpu_ptr x1, x0
+
SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// x0: return code
// x1: vcpu
@@ -148,21 +131,23 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// Store the guest's sp_el0
save_sp_el0 x1, x2
- get_host_ctxt x2, x3
+ adr_this_cpu x2, kvm_hyp_ctxt, x3
- // Macro ptrauth_switch_to_guest format:
- // ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3)
+ // Macro ptrauth_switch_to_hyp format:
+ // ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3)
// The below macro to save/restore keys is not implemented in C code
// as it may cause Pointer Authentication key signing mismatch errors
// when this feature is enabled for kernel code.
- ptrauth_switch_to_host x1, x2, x3, x4, x5
+ ptrauth_switch_to_hyp x1, x2, x3, x4, x5
- // Restore the hosts's sp_el0
+ // Restore hyp's sp_el0
restore_sp_el0 x2, x3
- // Now restore the host regs
+ // Now restore the hyp regs
restore_callee_saved_regs x2
+ set_loaded_vcpu xzr, x1, x2
+
alternative_if ARM64_HAS_RAS_EXTN
// If we have the RAS extensions we can consume a pending error
// without an unmask-SError and isb. The ESB-instruction consumed any
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 46b4dab933d0..0a5b36eb54b3 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -12,7 +12,6 @@
#include <asm/cpufeature.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
#include <asm/mmu.h>
.macro save_caller_saved_regs_vect
@@ -41,20 +40,6 @@
.text
-.macro do_el2_call
- /*
- * Shuffle the parameters before calling the function
- * pointed to in x0. Assumes parameters in x[1,2,3].
- */
- str lr, [sp, #-16]!
- mov lr, x0
- mov x0, x1
- mov x1, x2
- mov x2, x3
- blr lr
- ldr lr, [sp], #16
-.endm
-
el1_sync: // Guest trapped into EL2
mrs x0, esr_el2
@@ -63,44 +48,6 @@ el1_sync: // Guest trapped into EL2
ccmp x0, #ESR_ELx_EC_HVC32, #4, ne
b.ne el1_trap
-#ifdef __KVM_NVHE_HYPERVISOR__
- mrs x1, vttbr_el2 // If vttbr is valid, the guest
- cbnz x1, el1_hvc_guest // called HVC
-
- /* Here, we're pretty sure the host called HVC. */
- ldp x0, x1, [sp], #16
-
- /* Check for a stub HVC call */
- cmp x0, #HVC_STUB_HCALL_NR
- b.hs 1f
-
- /*
- * Compute the idmap address of __kvm_handle_stub_hvc and
- * jump there. Since we use kimage_voffset, do not use the
- * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
- * (by loading it from the constant pool).
- *
- * Preserve x0-x4, which may contain stub parameters.
- */
- ldr x5, =__kvm_handle_stub_hvc
- ldr_l x6, kimage_voffset
-
- /* x5 = __pa(x5) */
- sub x5, x5, x6
- br x5
-
-1:
- /*
- * Perform the EL2 call
- */
- kern_hyp_va x0
- do_el2_call
-
- eret
- sb
-#endif /* __KVM_NVHE_HYPERVISOR__ */
-
-el1_hvc_guest:
/*
* Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1.
* The workaround has already been applied on the host,
@@ -116,35 +63,6 @@ el1_hvc_guest:
ARM_SMCCC_ARCH_WORKAROUND_2)
cbnz w1, el1_trap
-#ifdef CONFIG_ARM64_SSBD
-alternative_cb arm64_enable_wa2_handling
- b wa2_end
-alternative_cb_end
- get_vcpu_ptr x2, x0
- ldr x0, [x2, #VCPU_WORKAROUND_FLAGS]
-
- // Sanitize the argument and update the guest flags
- ldr x1, [sp, #8] // Guest's x1
- clz w1, w1 // Murphy's device:
- lsr w1, w1, #5 // w1 = !!w1 without using
- eor w1, w1, #1 // the flags...
- bfi x0, x1, #VCPU_WORKAROUND_2_FLAG_SHIFT, #1
- str x0, [x2, #VCPU_WORKAROUND_FLAGS]
-
- /* Check that we actually need to perform the call */
- hyp_ldr_this_cpu x0, arm64_ssbd_callback_required, x2
- cbz x0, wa2_end
-
- mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2
- smc #0
-
- /* Don't leak data from the SMC call */
- mov x3, xzr
-wa2_end:
- mov x2, xzr
- mov x1, xzr
-#endif
-
wa_epilogue:
mov x0, xzr
add sp, sp, #16
@@ -198,24 +116,7 @@ el2_error:
eret
sb
-#ifdef __KVM_NVHE_HYPERVISOR__
-SYM_FUNC_START(__hyp_do_panic)
- mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
- PSR_MODE_EL1h)
- msr spsr_el2, lr
- ldr lr, =panic
- msr elr_el2, lr
- eret
- sb
-SYM_FUNC_END(__hyp_do_panic)
-#endif
-
-SYM_CODE_START(__hyp_panic)
- get_host_ctxt x0, x1
- b hyp_panic
-SYM_CODE_END(__hyp_panic)
-
-.macro invalid_vector label, target = __hyp_panic
+.macro invalid_vector label, target = __guest_exit_panic
.align 2
SYM_CODE_START(\label)
b \target
@@ -227,7 +128,6 @@ SYM_CODE_END(\label)
invalid_vector el2t_irq_invalid
invalid_vector el2t_fiq_invalid
invalid_vector el2t_error_invalid
- invalid_vector el2h_sync_invalid
invalid_vector el2h_irq_invalid
invalid_vector el2h_fiq_invalid
invalid_vector el1_fiq_invalid
@@ -257,10 +157,9 @@ check_preamble_length 661b, 662b
.macro invalid_vect target
.align 7
661:
- b \target
nop
+ stp x0, x1, [sp, #-16]!
662:
- ldp x0, x1, [sp], #16
b \target
check_preamble_length 661b, 662b
@@ -288,7 +187,6 @@ SYM_CODE_START(__kvm_hyp_vector)
valid_vect el1_error // Error 32-bit EL1
SYM_CODE_END(__kvm_hyp_vector)
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
.macro hyp_ventry
.align 7
1: esb
@@ -338,4 +236,3 @@ SYM_CODE_START(__bp_harden_hyp_vecs)
1: .org __bp_harden_hyp_vecs + __BP_HARDEN_HYP_VECS_SZ
.org 1b
SYM_CODE_END(__bp_harden_hyp_vecs)
-#endif
diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
index 5e28ea6aa097..4ebe9f558f3a 100644
--- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
@@ -135,7 +135,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
return;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
guest_ctxt = &vcpu->arch.ctxt;
host_dbg = &vcpu->arch.host_debug_state.regs;
guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
@@ -154,7 +154,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
return;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
guest_ctxt = &vcpu->arch.ctxt;
host_dbg = &vcpu->arch.host_debug_state.regs;
guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 5b6b8fa00f0a..313a8fa3c721 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -126,11 +126,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
}
}
-static inline void __activate_vm(struct kvm_s2_mmu *mmu)
-{
- __load_guest_stage2(mmu);
-}
-
static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
{
u64 par, tmp;
@@ -377,6 +372,8 @@ static inline bool esr_is_ptrauth_trap(u32 esr)
ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \
} while(0)
+DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+
static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *ctxt;
@@ -386,7 +383,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
!esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
return false;
- ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ ctxt = this_cpu_ptr(&kvm_hyp_ctxt);
__ptrauth_save_key(ctxt, APIA);
__ptrauth_save_key(ctxt, APIB);
__ptrauth_save_key(ctxt, APDA);
@@ -449,7 +446,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT &&
kvm_vcpu_dabt_isvalid(vcpu) &&
!kvm_vcpu_abt_issea(vcpu) &&
- !kvm_vcpu_dabt_iss1tw(vcpu);
+ !kvm_vcpu_abt_iss1tw(vcpu);
if (valid) {
int ret = __vgic_v2_perform_cpuif_access(vcpu);
@@ -479,49 +476,15 @@ exit:
return false;
}
-static inline bool __needs_ssbd_off(struct kvm_vcpu *vcpu)
-{
- if (!cpus_have_final_cap(ARM64_SSBD))
- return false;
-
- return !(vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG);
-}
-
-static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_ARM64_SSBD
- /*
- * The host runs with the workaround always present. If the
- * guest wants it disabled, so be it...
- */
- if (__needs_ssbd_off(vcpu) &&
- __hyp_this_cpu_read(arm64_ssbd_callback_required))
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL);
-#endif
-}
-
-static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_ARM64_SSBD
- /*
- * If the guest has disabled the workaround, bring it back on.
- */
- if (__needs_ssbd_off(vcpu) &&
- __hyp_this_cpu_read(arm64_ssbd_callback_required))
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL);
-#endif
-}
-
static inline void __kvm_unexpected_el2_exception(void)
{
+ extern char __guest_exit_panic[];
unsigned long addr, fixup;
- struct kvm_cpu_context *host_ctxt;
struct exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
entry = hyp_symbol_addr(__start___kvm_ex_table);
end = hyp_symbol_addr(__stop___kvm_ex_table);
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
while (entry < end) {
addr = (unsigned long)&entry->insn + entry->insn;
@@ -536,7 +499,8 @@ static inline void __kvm_unexpected_el2_exception(void)
return;
}
- hyp_panic(host_ctxt);
+ /* Trigger a panic after restoring the hyp context. */
+ write_sysreg(__guest_exit_panic, elr_el2);
}
#endif /* __ARM64_KVM_HYP_SWITCH_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore
new file mode 100644
index 000000000000..695d73d0249e
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+hyp.lds
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index aef76487edc2..ddde15fe85f2 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -6,44 +6,50 @@
asflags-y := -D__KVM_NVHE_HYPERVISOR__
ccflags-y := -D__KVM_NVHE_HYPERVISOR__
-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o
+obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o
-obj-y := $(patsubst %.o,%.hyp.o,$(obj-y))
-extra-y := $(patsubst %.hyp.o,%.hyp.tmp.o,$(obj-y))
+##
+## Build rules for compiling nVHE hyp code
+## Output of this folder is `kvm_nvhe.o`, a partially linked object
+## file containing all nVHE hyp code and data.
+##
-$(obj)/%.hyp.tmp.o: $(src)/%.c FORCE
+hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y))
+obj-y := kvm_nvhe.o
+extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds
+
+# 1) Compile all source files to `.nvhe.o` object files. The file extension
+# avoids file name clashes for files shared with VHE.
+$(obj)/%.nvhe.o: $(src)/%.c FORCE
$(call if_changed_rule,cc_o_c)
-$(obj)/%.hyp.tmp.o: $(src)/%.S FORCE
+$(obj)/%.nvhe.o: $(src)/%.S FORCE
$(call if_changed_rule,as_o_S)
-$(obj)/%.hyp.o: $(obj)/%.hyp.tmp.o FORCE
- $(call if_changed,hypcopy)
-# Disable reordering functions by GCC (enabled at -O2).
-# This pass puts functions into '.text.*' sections to aid the linker
-# in optimizing ELF layout. See HYPCOPY comment below for more info.
-ccflags-y += $(call cc-option,-fno-reorder-functions)
+# 2) Compile linker script.
+$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE
+ $(call if_changed_dep,cpp_lds_S)
+
+# 3) Partially link all '.nvhe.o' files and apply the linker script.
+# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'.
+# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before
+# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to
+# keep the dependency on the target while avoiding an error from
+# GNU ld if the linker script is passed to it twice.
+LDFLAGS_kvm_nvhe.tmp.o := -r -T
+$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
+ $(call if_changed,ld)
+
+# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
+# Prefixes names of ELF symbols with '__kvm_nvhe_'.
+$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE
+ $(call if_changed,hypcopy)
# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
-# and relevant ELF section names to avoid clashes with VHE code/data.
-#
-# Hyp code is assumed to be in the '.text' section of the input object
-# files (with the exception of specialized sections such as
-# '.hyp.idmap.text'). This assumption may be broken by a compiler that
-# divides code into sections like '.text.unlikely' so as to optimize
-# ELF layout. HYPCOPY checks that no such sections exist in the input
-# using `objdump`, otherwise they would be linked together with other
-# kernel code and not memory-mapped correctly at runtime.
+# to avoid clashes with VHE code/data.
quiet_cmd_hypcopy = HYPCOPY $@
- cmd_hypcopy = \
- if $(OBJDUMP) -h $< | grep -F '.text.'; then \
- echo "$@: function reordering not supported in nVHE hyp code" >&2; \
- /bin/false; \
- fi; \
- $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ \
- --rename-section=.text=.hyp.text \
- $< $@
+ cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
# Remove ftrace and Shadow Call Stack CFLAGS.
# This is equivalent to the 'notrace' and '__noscs' annotations.
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
new file mode 100644
index 000000000000..ff9a0f547b9f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/assembler.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+ .text
+
+SYM_FUNC_START(__host_exit)
+ stp x0, x1, [sp, #-16]!
+
+ get_host_ctxt x0, x1
+
+ ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
+
+ /* Store the host regs x2 and x3 */
+ stp x2, x3, [x0, #CPU_XREG_OFFSET(2)]
+
+ /* Retrieve the host regs x0-x1 from the stack */
+ ldp x2, x3, [sp], #16 // x0, x1
+
+ /* Store the host regs x0-x1 and x4-x17 */
+ stp x2, x3, [x0, #CPU_XREG_OFFSET(0)]
+ stp x4, x5, [x0, #CPU_XREG_OFFSET(4)]
+ stp x6, x7, [x0, #CPU_XREG_OFFSET(6)]
+ stp x8, x9, [x0, #CPU_XREG_OFFSET(8)]
+ stp x10, x11, [x0, #CPU_XREG_OFFSET(10)]
+ stp x12, x13, [x0, #CPU_XREG_OFFSET(12)]
+ stp x14, x15, [x0, #CPU_XREG_OFFSET(14)]
+ stp x16, x17, [x0, #CPU_XREG_OFFSET(16)]
+
+ /* Store the host regs x18-x29, lr */
+ save_callee_saved_regs x0
+
+ /* Save the host context pointer in x29 across the function call */
+ mov x29, x0
+ bl handle_trap
+
+ /* Restore host regs x0-x17 */
+ ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
+ ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
+ ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
+ ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)]
+
+ /* x0-7 are use for panic arguments */
+__host_enter_for_panic:
+ ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)]
+ ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)]
+ ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)]
+ ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)]
+ ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)]
+
+ /* Restore host regs x18-x29, lr */
+ restore_callee_saved_regs x29
+
+ /* Do not touch any register after this! */
+__host_enter_without_restoring:
+ eret
+ sb
+SYM_FUNC_END(__host_exit)
+
+/*
+ * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
+ */
+SYM_FUNC_START(__hyp_do_panic)
+ /* Load the format arguments into x1-7 */
+ mov x6, x3
+ get_vcpu_ptr x7, x3
+
+ mrs x3, esr_el2
+ mrs x4, far_el2
+ mrs x5, hpfar_el2
+
+ /* Prepare and exit to the host's panic funciton. */
+ mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
+ PSR_MODE_EL1h)
+ msr spsr_el2, lr
+ ldr lr, =panic
+ msr elr_el2, lr
+
+ /*
+ * Set the panic format string and enter the host, conditionally
+ * restoring the host context.
+ */
+ cmp x0, xzr
+ ldr x0, =__hyp_panic_string
+ b.eq __host_enter_without_restoring
+ b __host_enter_for_panic
+SYM_FUNC_END(__hyp_do_panic)
+
+.macro host_el1_sync_vect
+ .align 7
+.L__vect_start\@:
+ stp x0, x1, [sp, #-16]!
+ mrs x0, esr_el2
+ lsr x0, x0, #ESR_ELx_EC_SHIFT
+ cmp x0, #ESR_ELx_EC_HVC64
+ ldp x0, x1, [sp], #16
+ b.ne __host_exit
+
+ /* Check for a stub HVC call */
+ cmp x0, #HVC_STUB_HCALL_NR
+ b.hs __host_exit
+
+ /*
+ * Compute the idmap address of __kvm_handle_stub_hvc and
+ * jump there. Since we use kimage_voffset, do not use the
+ * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
+ * (by loading it from the constant pool).
+ *
+ * Preserve x0-x4, which may contain stub parameters.
+ */
+ ldr x5, =__kvm_handle_stub_hvc
+ ldr_l x6, kimage_voffset
+
+ /* x5 = __pa(x5) */
+ sub x5, x5, x6
+ br x5
+.L__vect_end\@:
+.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
+ .error "host_el1_sync_vect larger than vector entry"
+.endif
+.endm
+
+.macro invalid_host_el2_vect
+ .align 7
+ /* If a guest is loaded, panic out of it. */
+ stp x0, x1, [sp, #-16]!
+ get_loaded_vcpu x0, x1
+ cbnz x0, __guest_exit_panic
+ add sp, sp, #16
+
+ /*
+ * The panic may not be clean if the exception is taken before the host
+ * context has been saved by __host_exit or after the hyp context has
+ * been partially clobbered by __host_enter.
+ */
+ b hyp_panic
+.endm
+
+.macro invalid_host_el1_vect
+ .align 7
+ mov x0, xzr /* restore_host = false */
+ mrs x1, spsr_el2
+ mrs x2, elr_el2
+ mrs x3, par_el1
+ b __hyp_do_panic
+.endm
+
+/*
+ * The host vector does not use an ESB instruction in order to avoid consuming
+ * SErrors that should only be consumed by the host. Guest entry is deferred by
+ * __guest_enter if there are any pending asynchronous exceptions so hyp will
+ * always return to the host without having consumerd host SErrors.
+ *
+ * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the
+ * host knows about the EL2 vectors already, and there is no point in hiding
+ * them.
+ */
+ .align 11
+SYM_CODE_START(__kvm_hyp_host_vector)
+ invalid_host_el2_vect // Synchronous EL2t
+ invalid_host_el2_vect // IRQ EL2t
+ invalid_host_el2_vect // FIQ EL2t
+ invalid_host_el2_vect // Error EL2t
+
+ invalid_host_el2_vect // Synchronous EL2h
+ invalid_host_el2_vect // IRQ EL2h
+ invalid_host_el2_vect // FIQ EL2h
+ invalid_host_el2_vect // Error EL2h
+
+ host_el1_sync_vect // Synchronous 64-bit EL1
+ invalid_host_el1_vect // IRQ 64-bit EL1
+ invalid_host_el1_vect // FIQ 64-bit EL1
+ invalid_host_el1_vect // Error 64-bit EL1
+
+ invalid_host_el1_vect // Synchronous 32-bit EL1
+ invalid_host_el1_vect // IRQ 32-bit EL1
+ invalid_host_el1_vect // FIQ 32-bit EL1
+ invalid_host_el1_vect // Error 32-bit EL1
+SYM_CODE_END(__kvm_hyp_host_vector)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index d9434e90c06d..47224dc62c51 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -4,11 +4,13 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <linux/arm-smccc.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/assembler.h>
#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/pgtable-hwdef.h>
#include <asm/sysreg.h>
@@ -44,27 +46,37 @@ __invalid:
b .
/*
- * x0: HYP pgd
- * x1: HYP stack
- * x2: HYP vectors
- * x3: per-CPU offset
+ * x0: SMCCC function ID
+ * x1: HYP pgd
+ * x2: per-CPU offset
+ * x3: HYP stack
+ * x4: HYP vectors
*/
__do_hyp_init:
/* Check for a stub HVC call */
cmp x0, #HVC_STUB_HCALL_NR
b.lo __kvm_handle_stub_hvc
- phys_to_ttbr x4, x0
+ /* Set tpidr_el2 for use by HYP to free a register */
+ msr tpidr_el2, x2
+
+ mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
+ cmp x0, x2
+ b.eq 1f
+ mov x0, #SMCCC_RET_NOT_SUPPORTED
+ eret
+
+1: phys_to_ttbr x0, x1
alternative_if ARM64_HAS_CNP
- orr x4, x4, #TTBR_CNP_BIT
+ orr x0, x0, #TTBR_CNP_BIT
alternative_else_nop_endif
- msr ttbr0_el2, x4
+ msr ttbr0_el2, x0
- mrs x4, tcr_el1
- mov_q x5, TCR_EL2_MASK
- and x4, x4, x5
- mov x5, #TCR_EL2_RES1
- orr x4, x4, x5
+ mrs x0, tcr_el1
+ mov_q x1, TCR_EL2_MASK
+ and x0, x0, x1
+ mov x1, #TCR_EL2_RES1
+ orr x0, x0, x1
/*
* The ID map may be configured to use an extended virtual address
@@ -80,18 +92,18 @@ alternative_else_nop_endif
*
* So use the same T0SZ value we use for the ID map.
*/
- ldr_l x5, idmap_t0sz
- bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
+ ldr_l x1, idmap_t0sz
+ bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
/*
* Set the PS bits in TCR_EL2.
*/
- tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6
+ tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
- msr tcr_el2, x4
+ msr tcr_el2, x0
- mrs x4, mair_el1
- msr mair_el2, x4
+ mrs x0, mair_el1
+ msr mair_el2, x0
isb
/* Invalidate the stale TLBs from Bootloader */
@@ -103,25 +115,22 @@ alternative_else_nop_endif
* as well as the EE bit on BE. Drop the A flag since the compiler
* is allowed to generate unaligned accesses.
*/
- mov_q x4, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
-CPU_BE( orr x4, x4, #SCTLR_ELx_EE)
+ mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
+CPU_BE( orr x0, x0, #SCTLR_ELx_EE)
alternative_if ARM64_HAS_ADDRESS_AUTH
- mov_q x5, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
+ mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
- orr x4, x4, x5
+ orr x0, x0, x1
alternative_else_nop_endif
- msr sctlr_el2, x4
+ msr sctlr_el2, x0
isb
/* Set the stack and new vectors */
- kern_hyp_va x1
- mov sp, x1
- msr vbar_el2, x2
-
- /* Set tpidr_el2 for use by HYP */
- msr tpidr_el2, x3
+ mov sp, x3
+ msr vbar_el2, x4
/* Hello, World! */
+ mov x0, #SMCCC_RET_SUCCESS
eret
SYM_CODE_END(__kvm_hyp_init)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
new file mode 100644
index 000000000000..e2eafe2c93af
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <hyp/switch.h>
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+#include <kvm/arm_hypercalls.h>
+
+static void handle_host_hcall(unsigned long func_id,
+ struct kvm_cpu_context *host_ctxt)
+{
+ unsigned long ret = 0;
+
+ switch (func_id) {
+ case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1;
+
+ ret = __kvm_vcpu_run(kern_hyp_va(vcpu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context):
+ __kvm_flush_vm_context();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+ phys_addr_t ipa = host_ctxt->regs.regs[2];
+ int level = host_ctxt->regs.regs[3];
+
+ __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+
+ __kvm_tlb_flush_vmid(kern_hyp_va(mmu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+
+ __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): {
+ u64 cntvoff = host_ctxt->regs.regs[1];
+
+ __kvm_timer_set_cntvoff(cntvoff);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs):
+ __kvm_enable_ssbs();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2):
+ ret = __vgic_v3_get_ich_vtr_el2();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr):
+ ret = __vgic_v3_read_vmcr();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): {
+ u32 vmcr = host_ctxt->regs.regs[1];
+
+ __vgic_v3_write_vmcr(vmcr);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs):
+ __vgic_v3_init_lrs();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2):
+ ret = __kvm_get_mdcr_el2();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+
+ __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+
+ __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
+ break;
+ }
+ default:
+ /* Invalid host HVC. */
+ host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED;
+ return;
+ }
+
+ host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS;
+ host_ctxt->regs.regs[1] = ret;
+}
+
+void handle_trap(struct kvm_cpu_context *host_ctxt)
+{
+ u64 esr = read_sysreg_el2(SYS_ESR);
+ unsigned long func_id;
+
+ if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)
+ hyp_panic();
+
+ func_id = host_ctxt->regs.regs[0];
+ handle_host_hcall(func_id, host_ctxt);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
new file mode 100644
index 000000000000..bb2d986ff696
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 Google LLC.
+ * Written by David Brazdil <dbrazdil@google.com>
+ *
+ * Linker script used for partial linking of nVHE EL2 object files.
+ */
+
+#include <asm/hyp_image.h>
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/memory.h>
+
+SECTIONS {
+ HYP_SECTION(.text)
+ HYP_SECTION_NAME(.data..percpu) : {
+ PERCPU_INPUT(L1_CACHE_BYTES)
+ }
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 0970442d2dbc..a457a0306e03 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -27,6 +27,11 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
+/* Non-VHE specific context */
+DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
+
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -42,6 +47,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
}
write_sysreg(val, cptr_el2);
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
@@ -60,6 +66,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
+ extern char __kvm_hyp_host_vector[];
u64 mdcr_el2;
___deactivate_traps(vcpu);
@@ -91,9 +98,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(mdcr_el2, mdcr_el2);
write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
+ write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
-static void __deactivate_vm(struct kvm_vcpu *vcpu)
+static void __load_host_stage2(void)
{
write_sysreg(0, vttbr_el2);
}
@@ -173,9 +181,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
pmr_sync();
}
- vcpu = kern_hyp_va(vcpu);
-
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
@@ -194,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__sysreg32_restore_state(vcpu);
__sysreg_restore_state_nvhe(guest_ctxt);
- __activate_vm(kern_hyp_va(vcpu->arch.hw_mmu));
+ __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu));
__activate_traps(vcpu);
__hyp_vgic_restore_state(vcpu);
@@ -202,24 +208,20 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__debug_switch_to_guest(vcpu);
- __set_guest_arch_workaround_state(vcpu);
-
do {
/* Jump in the fire! */
- exit_code = __guest_enter(vcpu, host_ctxt);
+ exit_code = __guest_enter(vcpu);
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
- __set_host_arch_workaround_state(vcpu);
-
__sysreg_save_state_nvhe(guest_ctxt);
__sysreg32_save_state(vcpu);
__timer_disable_traps(vcpu);
__hyp_vgic_save_state(vcpu);
__deactivate_traps(vcpu);
- __deactivate_vm(vcpu);
+ __load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
@@ -239,35 +241,31 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
if (system_uses_irq_prio_masking())
gic_write_pmr(GIC_PRIO_IRQOFF);
+ host_ctxt->__hyp_running_vcpu = NULL;
+
return exit_code;
}
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
+void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
- struct kvm_vcpu *vcpu = host_ctxt->__hyp_running_vcpu;
- unsigned long str_va;
+ bool restore_host = true;
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ vcpu = host_ctxt->__hyp_running_vcpu;
- if (read_sysreg(vttbr_el2)) {
+ if (vcpu) {
__timer_disable_traps(vcpu);
__deactivate_traps(vcpu);
- __deactivate_vm(vcpu);
+ __load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
}
- /*
- * Force the panic string to be loaded from the literal pool,
- * making sure it is a kernel address and not a PC-relative
- * reference.
- */
- asm volatile("ldr %0, =%1" : "=r" (str_va) : "S" (__hyp_panic_string));
-
- __hyp_do_panic(str_va,
- spsr, elr,
- read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR),
- read_sysreg(hpfar_el2), par, vcpu);
+ __hyp_do_panic(restore_host, spsr, elr, par);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 69eae608d670..39ca71ab8866 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -31,7 +31,14 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
isb();
}
+ /*
+ * __load_guest_stage2() includes an ISB only when the AT
+ * workaround is applied. Take care of the opposite condition,
+ * ensuring that we always have an ISB, but not two ISBs back
+ * to back.
+ */
__load_guest_stage2(mmu);
+ asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
}
static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
@@ -54,7 +61,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ishst);
/* Switch to requested VMID */
- mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt);
/*
@@ -108,7 +114,6 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
dsb(ishst);
/* Switch to requested VMID */
- mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt);
__tlbi(vmalls12e1is);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
new file mode 100644
index 000000000000..0cdf6e461cbd
--- /dev/null
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -0,0 +1,892 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
+ * No bombay mix was harmed in the writing of this file.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#include <linux/bitfield.h>
+#include <asm/kvm_pgtable.h>
+
+#define KVM_PGTABLE_MAX_LEVELS 4U
+
+#define KVM_PTE_VALID BIT(0)
+
+#define KVM_PTE_TYPE BIT(1)
+#define KVM_PTE_TYPE_BLOCK 0
+#define KVM_PTE_TYPE_PAGE 1
+#define KVM_PTE_TYPE_TABLE 1
+
+#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
+#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
+
+#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
+
+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
+#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
+
+#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
+
+struct kvm_pgtable_walk_data {
+ struct kvm_pgtable *pgt;
+ struct kvm_pgtable_walker *walker;
+
+ u64 addr;
+ u64 end;
+};
+
+static u64 kvm_granule_shift(u32 level)
+{
+ /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
+ return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
+}
+
+static u64 kvm_granule_size(u32 level)
+{
+ return BIT(kvm_granule_shift(level));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+ u64 granule = kvm_granule_size(level);
+
+ /*
+ * Reject invalid block mappings and don't bother with 4TB mappings for
+ * 52-bit PAs.
+ */
+ if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+ return false;
+
+ if (granule > (end - addr))
+ return false;
+
+ return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+}
+
+static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
+{
+ u64 shift = kvm_granule_shift(level);
+ u64 mask = BIT(PAGE_SHIFT - 3) - 1;
+
+ return (data->addr >> shift) & mask;
+}
+
+static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
+{
+ u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
+ u64 mask = BIT(pgt->ia_bits) - 1;
+
+ return (addr & mask) >> shift;
+}
+
+static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
+{
+ return __kvm_pgd_page_idx(data->pgt, data->addr);
+}
+
+static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
+{
+ struct kvm_pgtable pgt = {
+ .ia_bits = ia_bits,
+ .start_level = start_level,
+ };
+
+ return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
+}
+
+static bool kvm_pte_valid(kvm_pte_t pte)
+{
+ return pte & KVM_PTE_VALID;
+}
+
+static bool kvm_pte_table(kvm_pte_t pte, u32 level)
+{
+ if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+ return false;
+
+ if (!kvm_pte_valid(pte))
+ return false;
+
+ return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
+}
+
+static u64 kvm_pte_to_phys(kvm_pte_t pte)
+{
+ u64 pa = pte & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
+
+ return pa;
+}
+
+static kvm_pte_t kvm_phys_to_pte(u64 pa)
+{
+ kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
+
+ return pte;
+}
+
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+{
+ return __va(kvm_pte_to_phys(pte));
+}
+
+static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+{
+ kvm_pte_t pte = *ptep;
+ WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+}
+
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+{
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+
+ pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
+ pte |= KVM_PTE_VALID;
+
+ WARN_ON(kvm_pte_valid(old));
+ smp_store_release(ptep, pte);
+}
+
+static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
+ u32 level)
+{
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+ u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
+ KVM_PTE_TYPE_BLOCK;
+
+ pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
+ pte |= FIELD_PREP(KVM_PTE_TYPE, type);
+ pte |= KVM_PTE_VALID;
+
+ /* Tolerate KVM recreating the exact same mapping. */
+ if (kvm_pte_valid(old))
+ return old == pte;
+
+ smp_store_release(ptep, pte);
+ return true;
+}
+
+static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
+ u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag)
+{
+ struct kvm_pgtable_walker *walker = data->walker;
+ return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *pgtable, u32 level);
+
+static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *ptep, u32 level)
+{
+ int ret = 0;
+ u64 addr = data->addr;
+ kvm_pte_t *childp, pte = *ptep;
+ bool table = kvm_pte_table(pte, level);
+ enum kvm_pgtable_walk_flags flags = data->walker->flags;
+
+ if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_TABLE_PRE);
+ }
+
+ if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_LEAF);
+ pte = *ptep;
+ table = kvm_pte_table(pte, level);
+ }
+
+ if (ret)
+ goto out;
+
+ if (!table) {
+ data->addr += kvm_granule_size(level);
+ goto out;
+ }
+
+ childp = kvm_pte_follow(pte);
+ ret = __kvm_pgtable_walk(data, childp, level + 1);
+ if (ret)
+ goto out;
+
+ if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_TABLE_POST);
+ }
+
+out:
+ return ret;
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *pgtable, u32 level)
+{
+ u32 idx;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
+ return -EINVAL;
+
+ for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
+ kvm_pte_t *ptep = &pgtable[idx];
+
+ if (data->addr >= data->end)
+ break;
+
+ ret = __kvm_pgtable_visit(data, ptep, level);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
+{
+ u32 idx;
+ int ret = 0;
+ struct kvm_pgtable *pgt = data->pgt;
+ u64 limit = BIT(pgt->ia_bits);
+
+ if (data->addr > limit || data->end > limit)
+ return -ERANGE;
+
+ if (!pgt->pgd)
+ return -EINVAL;
+
+ for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
+ kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
+
+ ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_pgtable_walker *walker)
+{
+ struct kvm_pgtable_walk_data walk_data = {
+ .pgt = pgt,
+ .addr = ALIGN_DOWN(addr, PAGE_SIZE),
+ .end = PAGE_ALIGN(walk_data.addr + size),
+ .walker = walker,
+ };
+
+ return _kvm_pgtable_walk(&walk_data);
+}
+
+struct hyp_map_data {
+ u64 phys;
+ kvm_pte_t attr;
+};
+
+static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
+ struct hyp_map_data *data)
+{
+ bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+ u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
+ kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
+ u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
+ u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
+ KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
+
+ if (!(prot & KVM_PGTABLE_PROT_R))
+ return -EINVAL;
+
+ if (prot & KVM_PGTABLE_PROT_X) {
+ if (prot & KVM_PGTABLE_PROT_W)
+ return -EINVAL;
+
+ if (device)
+ return -EINVAL;
+ } else {
+ attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
+ }
+
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
+ attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
+ data->attr = attr;
+ return 0;
+}
+
+static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep, struct hyp_map_data *data)
+{
+ u64 granule = kvm_granule_size(level), phys = data->phys;
+
+ if (!kvm_block_mapping_supported(addr, end, phys, level))
+ return false;
+
+ WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+ data->phys += granule;
+ return true;
+}
+
+static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ kvm_pte_t *childp;
+
+ if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
+ return 0;
+
+ if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+ return -EINVAL;
+
+ childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!childp)
+ return -ENOMEM;
+
+ kvm_set_table_pte(ptep, childp);
+ return 0;
+}
+
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+ enum kvm_pgtable_prot prot)
+{
+ int ret;
+ struct hyp_map_data map_data = {
+ .phys = ALIGN_DOWN(phys, PAGE_SIZE),
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = hyp_map_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = &map_data,
+ };
+
+ ret = hyp_map_set_prot_attr(prot, &map_data);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ dsb(ishst);
+ isb();
+ return ret;
+}
+
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
+{
+ u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
+
+ pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pgt->pgd)
+ return -ENOMEM;
+
+ pgt->ia_bits = va_bits;
+ pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
+ pgt->mmu = NULL;
+ return 0;
+}
+
+static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ free_page((unsigned long)kvm_pte_follow(*ptep));
+ return 0;
+}
+
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = hyp_free_walker,
+ .flags = KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ free_page((unsigned long)pgt->pgd);
+ pgt->pgd = NULL;
+}
+
+struct stage2_map_data {
+ u64 phys;
+ kvm_pte_t attr;
+
+ kvm_pte_t *anchor;
+
+ struct kvm_s2_mmu *mmu;
+ struct kvm_mmu_memory_cache *memcache;
+};
+
+static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
+ struct stage2_map_data *data)
+{
+ bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+ kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
+ PAGE_S2_MEMATTR(NORMAL);
+ u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+
+ if (!(prot & KVM_PGTABLE_PROT_X))
+ attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ else if (device)
+ return -EINVAL;
+
+ if (prot & KVM_PGTABLE_PROT_R)
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+ if (prot & KVM_PGTABLE_PROT_W)
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ data->attr = attr;
+ return 0;
+}
+
+static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ u64 granule = kvm_granule_size(level), phys = data->phys;
+
+ if (!kvm_block_mapping_supported(addr, end, phys, level))
+ return false;
+
+ if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
+ goto out;
+
+ /* There's an existing valid leaf entry, so perform break-before-make */
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+ kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
+out:
+ data->phys += granule;
+ return true;
+}
+
+static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ if (data->anchor)
+ return 0;
+
+ if (!kvm_block_mapping_supported(addr, end, data->phys, level))
+ return 0;
+
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
+ data->anchor = ptep;
+ return 0;
+}
+
+static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ kvm_pte_t *childp, pte = *ptep;
+ struct page *page = virt_to_page(ptep);
+
+ if (data->anchor) {
+ if (kvm_pte_valid(pte))
+ put_page(page);
+
+ return 0;
+ }
+
+ if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
+ goto out_get_page;
+
+ if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+ return -EINVAL;
+
+ if (!data->memcache)
+ return -ENOMEM;
+
+ childp = kvm_mmu_memory_cache_alloc(data->memcache);
+ if (!childp)
+ return -ENOMEM;
+
+ /*
+ * If we've run into an existing block mapping then replace it with
+ * a table. Accesses beyond 'end' that fall within the new table
+ * will be mapped lazily.
+ */
+ if (kvm_pte_valid(pte)) {
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+ put_page(page);
+ }
+
+ kvm_set_table_pte(ptep, childp);
+
+out_get_page:
+ get_page(page);
+ return 0;
+}
+
+static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ int ret = 0;
+
+ if (!data->anchor)
+ return 0;
+
+ free_page((unsigned long)kvm_pte_follow(*ptep));
+ put_page(virt_to_page(ptep));
+
+ if (data->anchor == ptep) {
+ data->anchor = NULL;
+ ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
+ }
+
+ return ret;
+}
+
+/*
+ * This is a little fiddly, as we use all three of the walk flags. The idea
+ * is that the TABLE_PRE callback runs for table entries on the way down,
+ * looking for table entries which we could conceivably replace with a
+ * block entry for this mapping. If it finds one, then it sets the 'anchor'
+ * field in 'struct stage2_map_data' to point at the table entry, before
+ * clearing the entry to zero and descending into the now detached table.
+ *
+ * The behaviour of the LEAF callback then depends on whether or not the
+ * anchor has been set. If not, then we're not using a block mapping higher
+ * up the table and we perform the mapping at the existing leaves instead.
+ * If, on the other hand, the anchor _is_ set, then we drop references to
+ * all valid leaves so that the pages beneath the anchor can be freed.
+ *
+ * Finally, the TABLE_POST callback does nothing if the anchor has not
+ * been set, but otherwise frees the page-table pages while walking back up
+ * the page-table, installing the block entry when it revisits the anchor
+ * pointer and clearing the anchor to NULL.
+ */
+static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ struct stage2_map_data *data = arg;
+
+ switch (flag) {
+ case KVM_PGTABLE_WALK_TABLE_PRE:
+ return stage2_map_walk_table_pre(addr, end, level, ptep, data);
+ case KVM_PGTABLE_WALK_LEAF:
+ return stage2_map_walk_leaf(addr, end, level, ptep, data);
+ case KVM_PGTABLE_WALK_TABLE_POST:
+ return stage2_map_walk_table_post(addr, end, level, ptep, data);
+ }
+
+ return -EINVAL;
+}
+
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ struct kvm_mmu_memory_cache *mc)
+{
+ int ret;
+ struct stage2_map_data map_data = {
+ .phys = ALIGN_DOWN(phys, PAGE_SIZE),
+ .mmu = pgt->mmu,
+ .memcache = mc,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_map_walker,
+ .flags = KVM_PGTABLE_WALK_TABLE_PRE |
+ KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_TABLE_POST,
+ .arg = &map_data,
+ };
+
+ ret = stage2_map_set_prot_attr(prot, &map_data);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ dsb(ishst);
+ return ret;
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+ if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ return;
+
+ __flush_dcache_area(addr, size);
+}
+
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+ u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte);
+ return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ struct kvm_s2_mmu *mmu = arg;
+ kvm_pte_t pte = *ptep, *childp = NULL;
+ bool need_flush = false;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ if (kvm_pte_table(pte, level)) {
+ childp = kvm_pte_follow(pte);
+
+ if (page_count(virt_to_page(childp)) != 1)
+ return 0;
+ } else if (stage2_pte_cacheable(pte)) {
+ need_flush = true;
+ }
+
+ /*
+ * This is similar to the map() path in that we unmap the entire
+ * block entry and rely on the remaining portions being faulted
+ * back lazily.
+ */
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+ put_page(virt_to_page(ptep));
+
+ if (need_flush) {
+ stage2_flush_dcache(kvm_pte_follow(pte),
+ kvm_granule_size(level));
+ }
+
+ if (childp)
+ free_page((unsigned long)childp);
+
+ return 0;
+}
+
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_unmap_walker,
+ .arg = pgt->mmu,
+ .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
+struct stage2_attr_data {
+ kvm_pte_t attr_set;
+ kvm_pte_t attr_clr;
+ kvm_pte_t pte;
+ u32 level;
+};
+
+static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+ struct stage2_attr_data *data = arg;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ data->level = level;
+ data->pte = pte;
+ pte &= ~data->attr_clr;
+ pte |= data->attr_set;
+
+ /*
+ * We may race with the CPU trying to set the access flag here,
+ * but worst-case the access flag update gets lost and will be
+ * set on the next access instead.
+ */
+ if (data->pte != pte)
+ WRITE_ONCE(*ptep, pte);
+
+ return 0;
+}
+
+static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, kvm_pte_t attr_set,
+ kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
+ u32 *level)
+{
+ int ret;
+ kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
+ struct stage2_attr_data data = {
+ .attr_set = attr_set & attr_mask,
+ .attr_clr = attr_clr & attr_mask,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_attr_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ if (ret)
+ return ret;
+
+ if (orig_pte)
+ *orig_pte = data.pte;
+
+ if (level)
+ *level = data.level;
+ return 0;
+}
+
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ return stage2_update_leaf_attrs(pgt, addr, size, 0,
+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
+ NULL, NULL);
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
+ &pte, NULL);
+ dsb(ishst);
+ return pte;
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
+ &pte, NULL);
+ /*
+ * "But where's the TLBI?!", you scream.
+ * "Over in the core code", I sigh.
+ *
+ * See the '->clear_flush_young()' callback on the KVM mmu notifier.
+ */
+ return pte;
+}
+
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
+ return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+}
+
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_prot prot)
+{
+ int ret;
+ u32 level;
+ kvm_pte_t set = 0, clr = 0;
+
+ if (prot & KVM_PGTABLE_PROT_R)
+ set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+ if (prot & KVM_PGTABLE_PROT_W)
+ set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+ if (prot & KVM_PGTABLE_PROT_X)
+ clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+
+ ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
+ if (!ret)
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
+ return ret;
+}
+
+static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+
+ if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
+ return 0;
+
+ stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
+ return 0;
+}
+
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_flush_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ return 0;
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
+{
+ size_t pgd_sz;
+ u64 vtcr = kvm->arch.vtcr;
+ u32 ia_bits = VTCR_EL2_IPA(vtcr);
+ u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+ u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+
+ pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
+ pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO);
+ if (!pgt->pgd)
+ return -ENOMEM;
+
+ pgt->ia_bits = ia_bits;
+ pgt->start_level = start_level;
+ pgt->mmu = &kvm->arch.mmu;
+
+ /* Ensure zeroed PGD pages are visible to the hardware walker */
+ dsb(ishst);
+ return 0;
+}
+
+static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ put_page(virt_to_page(ptep));
+
+ if (kvm_pte_table(pte, level))
+ free_page((unsigned long)kvm_pte_follow(pte));
+
+ return 0;
+}
+
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ size_t pgd_sz;
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_free_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
+ free_pages_exact(pgt->pgd, pgd_sz);
+ pgt->pgd = NULL;
+}
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index c1da4f86ccac..fe69de16dadc 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -28,6 +28,11 @@
const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
+/* VHE specific context */
+DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
+
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -59,7 +64,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
write_sysreg(val, cpacr_el1);
- write_sysreg(kvm_get_hyp_vector(), vbar_el1);
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
}
NOKPROBE_SYMBOL(__activate_traps);
@@ -108,7 +113,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt;
u64 exit_code;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
@@ -120,28 +125,24 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
* HCR_EL2.TGE.
*
* We have already configured the guest's stage 1 translation in
- * kvm_vcpu_load_sysregs_vhe above. We must now call __activate_vm
- * before __activate_traps, because __activate_vm configures
- * stage 2 translation, and __activate_traps clear HCR_EL2.TGE
- * (among other things).
+ * kvm_vcpu_load_sysregs_vhe above. We must now call
+ * __load_guest_stage2 before __activate_traps, because
+ * __load_guest_stage2 configures stage 2 translation, and
+ * __activate_traps clear HCR_EL2.TGE (among other things).
*/
- __activate_vm(vcpu->arch.hw_mmu);
+ __load_guest_stage2(vcpu->arch.hw_mmu);
__activate_traps(vcpu);
sysreg_restore_guest_state_vhe(guest_ctxt);
__debug_switch_to_guest(vcpu);
- __set_guest_arch_workaround_state(vcpu);
-
do {
/* Jump in the fire! */
- exit_code = __guest_enter(vcpu, host_ctxt);
+ exit_code = __guest_enter(vcpu);
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
- __set_host_arch_workaround_state(vcpu);
-
sysreg_save_guest_state_vhe(guest_ctxt);
__deactivate_traps(vcpu);
@@ -192,10 +193,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
return ret;
}
-static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
- struct kvm_cpu_context *host_ctxt)
+static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
{
+ struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
vcpu = host_ctxt->__hyp_running_vcpu;
__deactivate_traps(vcpu);
@@ -208,13 +211,13 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
}
NOKPROBE_SYMBOL(__hyp_call_panic);
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
+void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
- __hyp_call_panic(spsr, elr, par, host_ctxt);
+ __hyp_call_panic(spsr, elr, par);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 996471e4c138..2a0b8c88d74f 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -66,7 +66,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
__sysreg_save_user_state(host_ctxt);
/*
@@ -100,7 +100,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
deactivate_traps_vhe_put();
__sysreg_save_el1_state(guest_ctxt);
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 550dfa3e53cd..9824025ccc5c 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -24,27 +24,36 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
feature = smccc_get_arg1(vcpu);
switch (feature) {
case ARM_SMCCC_ARCH_WORKAROUND_1:
- switch (kvm_arm_harden_branch_predictor()) {
- case KVM_BP_HARDEN_UNKNOWN:
+ switch (arm64_get_spectre_v2_state()) {
+ case SPECTRE_VULNERABLE:
break;
- case KVM_BP_HARDEN_WA_NEEDED:
+ case SPECTRE_MITIGATED:
val = SMCCC_RET_SUCCESS;
break;
- case KVM_BP_HARDEN_NOT_REQUIRED:
+ case SPECTRE_UNAFFECTED:
val = SMCCC_RET_NOT_REQUIRED;
break;
}
break;
case ARM_SMCCC_ARCH_WORKAROUND_2:
- switch (kvm_arm_have_ssbd()) {
- case KVM_SSBD_FORCE_DISABLE:
- case KVM_SSBD_UNKNOWN:
+ switch (arm64_get_spectre_v4_state()) {
+ case SPECTRE_VULNERABLE:
break;
- case KVM_SSBD_KERNEL:
- val = SMCCC_RET_SUCCESS;
- break;
- case KVM_SSBD_FORCE_ENABLE:
- case KVM_SSBD_MITIGATED:
+ case SPECTRE_MITIGATED:
+ /*
+ * SSBS everywhere: Indicate no firmware
+ * support, as the SSBS support will be
+ * indicated to the guest and the default is
+ * safe.
+ *
+ * Otherwise, expose a permanent mitigation
+ * to the guest, and hide SSBS so that the
+ * guest stays protected.
+ */
+ if (cpus_have_final_cap(ARM64_SSBS))
+ break;
+ fallthrough;
+ case SPECTRE_UNAFFECTED:
val = SMCCC_RET_NOT_REQUIRED;
break;
}
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index ebfdfc27b2bd..34a96ab244fa 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -202,6 +202,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
/**
* kvm_inject_undefined - inject an undefined instruction into the guest
+ * @vcpu: The vCPU in which to inject the exception
*
* It is assumed that this code is called from the VCPU thread and that the
* VCPU therefore is not currently executing guest code.
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index ba00bcc0c884..19aacc7d64de 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -14,6 +14,7 @@
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
@@ -21,9 +22,7 @@
#include "trace.h"
-static pgd_t *boot_hyp_pgd;
-static pgd_t *hyp_pgd;
-static pgd_t *merged_hyp_pgd;
+static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
static unsigned long hyp_idmap_start;
@@ -32,16 +31,42 @@ static phys_addr_t hyp_idmap_vector;
static unsigned long io_map_base;
-#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
-#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
-#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
-
-static bool is_iomap(unsigned long flags)
+/*
+ * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
+ * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
+ * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
+ * long will also starve other vCPUs. We have to also make sure that the page
+ * tables are not freed while we released the lock.
+ */
+static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
+ phys_addr_t end,
+ int (*fn)(struct kvm_pgtable *, u64, u64),
+ bool resched)
{
- return flags & KVM_S2PTE_FLAG_IS_IOMAP;
+ int ret;
+ u64 next;
+
+ do {
+ struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ if (!pgt)
+ return -EINVAL;
+
+ next = stage2_pgd_addr_end(kvm, addr, end);
+ ret = fn(pgt, addr, next - addr);
+ if (ret)
+ break;
+
+ if (resched && next != end)
+ cond_resched_lock(&kvm->mmu_lock);
+ } while (addr = next, addr != end);
+
+ return ret;
}
+#define stage2_apply_range_resched(kvm, addr, end, fn) \
+ stage2_apply_range(kvm, addr, end, fn, true)
+
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -58,154 +83,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
}
-static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
- int level)
-{
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
-}
-
-/*
- * D-Cache management functions. They take the page table entries by
- * value, as they are flushing the cache using the kernel mapping (or
- * kmap on 32bit).
- */
-static void kvm_flush_dcache_pte(pte_t pte)
-{
- __kvm_flush_dcache_pte(pte);
-}
-
-static void kvm_flush_dcache_pmd(pmd_t pmd)
-{
- __kvm_flush_dcache_pmd(pmd);
-}
-
-static void kvm_flush_dcache_pud(pud_t pud)
-{
- __kvm_flush_dcache_pud(pud);
-}
-
static bool kvm_is_device_pfn(unsigned long pfn)
{
return !pfn_valid(pfn);
}
-/**
- * stage2_dissolve_pmd() - clear and flush huge PMD entry
- * @mmu: pointer to mmu structure to operate on
- * @addr: IPA
- * @pmd: pmd pointer for IPA
- *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
-{
- if (!pmd_thp_or_huge(*pmd))
- return;
-
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
- put_page(virt_to_page(pmd));
-}
-
-/**
- * stage2_dissolve_pud() - clear and flush huge PUD entry
- * @mmu: pointer to mmu structure to operate on
- * @addr: IPA
- * @pud: pud pointer for IPA
- *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
-{
- struct kvm *kvm = mmu->kvm;
-
- if (!stage2_pud_huge(kvm, *pudp))
- return;
-
- stage2_pud_clear(kvm, pudp);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- put_page(virt_to_page(pudp));
-}
-
-static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
- stage2_pgd_clear(kvm, pgd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_p4d_free(kvm, p4d_table);
- put_page(virt_to_page(pgd));
-}
-
-static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
- stage2_p4d_clear(kvm, p4d);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_pud_free(kvm, pud_table);
- put_page(virt_to_page(p4d));
-}
-
-static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
-
- VM_BUG_ON(stage2_pud_huge(kvm, *pud));
- stage2_pud_clear(kvm, pud);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_pmd_free(kvm, pmd_table);
- put_page(virt_to_page(pud));
-}
-
-static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
-{
- pte_t *pte_table = pte_offset_kernel(pmd, 0);
- VM_BUG_ON(pmd_thp_or_huge(*pmd));
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- free_page((unsigned long)pte_table);
- put_page(virt_to_page(pmd));
-}
-
-static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
-{
- WRITE_ONCE(*ptep, new_pte);
- dsb(ishst);
-}
-
-static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
-{
- WRITE_ONCE(*pmdp, new_pmd);
- dsb(ishst);
-}
-
-static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
-{
- kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
-}
-
-static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
-{
- WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
- dsb(ishst);
-}
-
-static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
-{
- WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
- dsb(ishst);
-}
-
-static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
-{
-#ifndef __PAGETABLE_P4D_FOLDED
- WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
- dsb(ishst);
-#endif
-}
-
/*
* Unmapping vs dcache management:
*
@@ -223,120 +105,19 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
* end up writing old data to disk.
*
* This is why right after unmapping a page/section and invalidating
- * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
- * the IO subsystem will never hit in the cache.
+ * the corresponding TLBs, we flush to make sure the IO subsystem will
+ * never hit in the cache.
*
* This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
* we then fully enforce cacheability of RAM, no matter what the guest
* does.
*/
-static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
- phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t start_addr = addr;
- pte_t *pte, *start_pte;
-
- start_pte = pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- pte_t old_pte = *pte;
-
- kvm_set_pte(pte, __pte(0));
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
-
- /* No need to invalidate the cache for device mappings */
- if (!kvm_is_device_pfn(pte_pfn(old_pte)))
- kvm_flush_dcache_pte(old_pte);
-
- put_page(virt_to_page(pte));
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-
- if (stage2_pte_table_empty(mmu->kvm, start_pte))
- clear_stage2_pmd_entry(mmu, pmd, start_addr);
-}
-
-static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- pmd_t *pmd, *start_pmd;
-
- start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd)) {
- pmd_t old_pmd = *pmd;
-
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
-
- kvm_flush_dcache_pmd(old_pmd);
-
- put_page(virt_to_page(pmd));
- } else {
- unmap_stage2_ptes(mmu, pmd, addr, next);
- }
- }
- } while (pmd++, addr = next, addr != end);
-
- if (stage2_pmd_table_empty(kvm, start_pmd))
- clear_stage2_pud_entry(mmu, pud, start_addr);
-}
-
-static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- pud_t *pud, *start_pud;
-
- start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud)) {
- pud_t old_pud = *pud;
-
- stage2_pud_clear(kvm, pud);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- kvm_flush_dcache_pud(old_pud);
- put_page(virt_to_page(pud));
- } else {
- unmap_stage2_pmds(mmu, pud, addr, next);
- }
- }
- } while (pud++, addr = next, addr != end);
-
- if (stage2_pud_table_empty(kvm, start_pud))
- clear_stage2_p4d_entry(mmu, p4d, start_addr);
-}
-
-static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- p4d_t *p4d, *start_p4d;
-
- start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- unmap_stage2_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-
- if (stage2_p4d_table_empty(kvm, start_p4d))
- clear_stage2_pgd_entry(mmu, pgd, start_addr);
-}
-
/**
* unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm: The VM pointer
+ * @mmu: The KVM stage-2 MMU pointer
* @start: The intermediate physical base address of the range to unmap
* @size: The size of the area to unmap
+ * @may_block: Whether or not we are permitted to block
*
* Clear a range of stage-2 mappings, lowering the various ref-counts. Must
* be called while holding mmu_lock (unless for freeing the stage2 pgd before
@@ -347,32 +128,12 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
bool may_block)
{
struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- phys_addr_t addr = start, end = start + size;
- phys_addr_t next;
+ phys_addr_t end = start + size;
assert_spin_locked(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- /*
- * Make sure the page table is still active, as another thread
- * could have possibly freed the page table, while we released
- * the lock.
- */
- if (!READ_ONCE(mmu->pgd))
- break;
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (!stage2_pgd_none(kvm, *pgd))
- unmap_stage2_p4ds(mmu, pgd, addr, next);
- /*
- * If the range is too large, release the kvm->mmu_lock
- * to prevent starvation and lockup detector warnings.
- */
- if (may_block && next != end)
- cond_resched_lock(&kvm->mmu_lock);
- } while (pgd++, addr = next, addr != end);
+ WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
+ may_block));
}
static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
@@ -380,89 +141,13 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
__unmap_stage2_range(mmu, start, size, true);
}
-static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
- phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte;
-
- pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
- kvm_flush_dcache_pte(*pte);
- } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd;
- phys_addr_t next;
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd))
- kvm_flush_dcache_pmd(*pmd);
- else
- stage2_flush_ptes(mmu, pmd, addr, next);
- }
- } while (pmd++, addr = next, addr != end);
-}
-
-static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- phys_addr_t next;
-
- pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud))
- kvm_flush_dcache_pud(*pud);
- else
- stage2_flush_pmds(mmu, pud, addr, next);
- }
- } while (pud++, addr = next, addr != end);
-}
-
-static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- phys_addr_t next;
-
- p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- stage2_flush_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-}
-
static void stage2_flush_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- phys_addr_t next;
- pgd_t *pgd;
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (!stage2_pgd_none(kvm, *pgd))
- stage2_flush_p4ds(mmu, pgd, addr, next);
-
- if (next != end)
- cond_resched_lock(&kvm->mmu_lock);
- } while (pgd++, addr = next, addr != end);
+ stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
}
/**
@@ -489,338 +174,28 @@ static void stage2_flush_vm(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
}
-static void clear_hyp_pgd_entry(pgd_t *pgd)
-{
- p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);
- pgd_clear(pgd);
- p4d_free(NULL, p4d_table);
- put_page(virt_to_page(pgd));
-}
-
-static void clear_hyp_p4d_entry(p4d_t *p4d)
-{
- pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);
- VM_BUG_ON(p4d_huge(*p4d));
- p4d_clear(p4d);
- pud_free(NULL, pud_table);
- put_page(virt_to_page(p4d));
-}
-
-static void clear_hyp_pud_entry(pud_t *pud)
-{
- pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
- VM_BUG_ON(pud_huge(*pud));
- pud_clear(pud);
- pmd_free(NULL, pmd_table);
- put_page(virt_to_page(pud));
-}
-
-static void clear_hyp_pmd_entry(pmd_t *pmd)
-{
- pte_t *pte_table = pte_offset_kernel(pmd, 0);
- VM_BUG_ON(pmd_thp_or_huge(*pmd));
- pmd_clear(pmd);
- pte_free_kernel(NULL, pte_table);
- put_page(virt_to_page(pmd));
-}
-
-static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte, *start_pte;
-
- start_pte = pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- kvm_set_pte(pte, __pte(0));
- put_page(virt_to_page(pte));
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-
- if (hyp_pte_table_empty(start_pte))
- clear_hyp_pmd_entry(pmd);
-}
-
-static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- pmd_t *pmd, *start_pmd;
-
- start_pmd = pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- /* Hyp doesn't use huge pmds */
- if (!pmd_none(*pmd))
- unmap_hyp_ptes(pmd, addr, next);
- } while (pmd++, addr = next, addr != end);
-
- if (hyp_pmd_table_empty(start_pmd))
- clear_hyp_pud_entry(pud);
-}
-
-static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- pud_t *pud, *start_pud;
-
- start_pud = pud = pud_offset(p4d, addr);
- do {
- next = pud_addr_end(addr, end);
- /* Hyp doesn't use huge puds */
- if (!pud_none(*pud))
- unmap_hyp_pmds(pud, addr, next);
- } while (pud++, addr = next, addr != end);
-
- if (hyp_pud_table_empty(start_pud))
- clear_hyp_p4d_entry(p4d);
-}
-
-static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- p4d_t *p4d, *start_p4d;
-
- start_p4d = p4d = p4d_offset(pgd, addr);
- do {
- next = p4d_addr_end(addr, end);
- /* Hyp doesn't use huge p4ds */
- if (!p4d_none(*p4d))
- unmap_hyp_puds(p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-
- if (hyp_p4d_table_empty(start_p4d))
- clear_hyp_pgd_entry(pgd);
-}
-
-static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
-{
- return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
-}
-
-static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
- phys_addr_t start, u64 size)
-{
- pgd_t *pgd;
- phys_addr_t addr = start, end = start + size;
- phys_addr_t next;
-
- /*
- * We don't unmap anything from HYP, except at the hyp tear down.
- * Hence, we don't have to invalidate the TLBs here.
- */
- pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
- do {
- next = pgd_addr_end(addr, end);
- if (!pgd_none(*pgd))
- unmap_hyp_p4ds(pgd, addr, next);
- } while (pgd++, addr = next, addr != end);
-}
-
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
- __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
-}
-
-static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
- __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
-}
-
/**
* free_hyp_pgds - free Hyp-mode page tables
- *
- * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
- * therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the idmap range.
- *
- * boot_hyp_pgd should only map the idmap range, and is only used in
- * the extended idmap case.
*/
void free_hyp_pgds(void)
{
- pgd_t *id_pgd;
-
mutex_lock(&kvm_hyp_pgd_mutex);
-
- id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
-
- if (id_pgd) {
- /* In case we never called hyp_mmu_init() */
- if (!io_map_base)
- io_map_base = hyp_idmap_start;
- unmap_hyp_idmap_range(id_pgd, io_map_base,
- hyp_idmap_start + PAGE_SIZE - io_map_base);
- }
-
- if (boot_hyp_pgd) {
- free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
- boot_hyp_pgd = NULL;
- }
-
- if (hyp_pgd) {
- unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
- (uintptr_t)high_memory - PAGE_OFFSET);
-
- free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
- hyp_pgd = NULL;
- }
- if (merged_hyp_pgd) {
- clear_page(merged_hyp_pgd);
- free_page((unsigned long)merged_hyp_pgd);
- merged_hyp_pgd = NULL;
+ if (hyp_pgtable) {
+ kvm_pgtable_hyp_destroy(hyp_pgtable);
+ kfree(hyp_pgtable);
}
-
mutex_unlock(&kvm_hyp_pgd_mutex);
}
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pte_t *pte;
- unsigned long addr;
-
- addr = start;
- do {
- pte = pte_offset_kernel(pmd, addr);
- kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
- get_page(virt_to_page(pte));
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
-}
-
-static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pmd_t *pmd;
- pte_t *pte;
- unsigned long addr, next;
-
- addr = start;
- do {
- pmd = pmd_offset(pud, addr);
-
- BUG_ON(pmd_sect(*pmd));
-
- if (pmd_none(*pmd)) {
- pte = pte_alloc_one_kernel(NULL);
- if (!pte) {
- kvm_err("Cannot allocate Hyp pte\n");
- return -ENOMEM;
- }
- kvm_pmd_populate(pmd, pte);
- get_page(virt_to_page(pmd));
- }
-
- next = pmd_addr_end(addr, end);
-
- create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pud_t *pud;
- pmd_t *pmd;
- unsigned long addr, next;
- int ret;
-
- addr = start;
- do {
- pud = pud_offset(p4d, addr);
-
- if (pud_none_or_clear_bad(pud)) {
- pmd = pmd_alloc_one(NULL, addr);
- if (!pmd) {
- kvm_err("Cannot allocate Hyp pmd\n");
- return -ENOMEM;
- }
- kvm_pud_populate(pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- next = pud_addr_end(addr, end);
- ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
- if (ret)
- return ret;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
+static int __create_hyp_mappings(unsigned long start, unsigned long size,
+ unsigned long phys, enum kvm_pgtable_prot prot)
{
- p4d_t *p4d;
- pud_t *pud;
- unsigned long addr, next;
- int ret;
-
- addr = start;
- do {
- p4d = p4d_offset(pgd, addr);
-
- if (p4d_none(*p4d)) {
- pud = pud_alloc_one(NULL, addr);
- if (!pud) {
- kvm_err("Cannot allocate Hyp pud\n");
- return -ENOMEM;
- }
- kvm_p4d_populate(p4d, pud);
- get_page(virt_to_page(p4d));
- }
-
- next = p4d_addr_end(addr, end);
- ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
- if (ret)
- return ret;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
- unsigned long start, unsigned long end,
- unsigned long pfn, pgprot_t prot)
-{
- pgd_t *pgd;
- p4d_t *p4d;
- unsigned long addr, next;
- int err = 0;
+ int err;
mutex_lock(&kvm_hyp_pgd_mutex);
- addr = start & PAGE_MASK;
- end = PAGE_ALIGN(end);
- do {
- pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-
- if (pgd_none(*pgd)) {
- p4d = p4d_alloc_one(NULL, addr);
- if (!p4d) {
- kvm_err("Cannot allocate Hyp p4d\n");
- err = -ENOMEM;
- goto out;
- }
- kvm_pgd_populate(pgd, p4d);
- get_page(virt_to_page(pgd));
- }
-
- next = pgd_addr_end(addr, end);
- err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
- if (err)
- goto out;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-out:
+ err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
mutex_unlock(&kvm_hyp_pgd_mutex);
+
return err;
}
@@ -845,7 +220,7 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
* in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
* physical pages.
*/
-int create_hyp_mappings(void *from, void *to, pgprot_t prot)
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
phys_addr_t phys_addr;
unsigned long virt_addr;
@@ -862,9 +237,7 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
int err;
phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
- err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
- virt_addr, virt_addr + PAGE_SIZE,
- __phys_to_pfn(phys_addr),
+ err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
prot);
if (err)
return err;
@@ -874,9 +247,9 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
}
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
- unsigned long *haddr, pgprot_t prot)
+ unsigned long *haddr,
+ enum kvm_pgtable_prot prot)
{
- pgd_t *pgd = hyp_pgd;
unsigned long base;
int ret = 0;
@@ -908,17 +281,11 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
if (ret)
goto out;
- if (__kvm_cpu_uses_extended_idmap())
- pgd = boot_hyp_pgd;
-
- ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
- base, base + size,
- __phys_to_pfn(phys_addr), prot);
+ ret = __create_hyp_mappings(base, size, phys_addr, prot);
if (ret)
goto out;
*haddr = base + offset_in_page(phys_addr);
-
out:
return ret;
}
@@ -989,47 +356,48 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
* @kvm: The pointer to the KVM structure
* @mmu: The pointer to the s2 MMU structure
*
- * Allocates only the stage-2 HW PGD level table(s) of size defined by
- * stage2_pgd_size(mmu->kvm).
- *
+ * Allocates only the stage-2 HW PGD level table(s).
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
*/
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
- phys_addr_t pgd_phys;
- pgd_t *pgd;
- int cpu;
+ int cpu, err;
+ struct kvm_pgtable *pgt;
- if (mmu->pgd != NULL) {
+ if (mmu->pgt != NULL) {
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
- /* Allocate the HW PGD, making sure that each page gets its own refcount */
- pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
- if (!pgd)
+ pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
+ if (!pgt)
return -ENOMEM;
- pgd_phys = virt_to_phys(pgd);
- if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
- return -EINVAL;
+ err = kvm_pgtable_stage2_init(pgt, kvm);
+ if (err)
+ goto out_free_pgtable;
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
if (!mmu->last_vcpu_ran) {
- free_pages_exact(pgd, stage2_pgd_size(kvm));
- return -ENOMEM;
+ err = -ENOMEM;
+ goto out_destroy_pgtable;
}
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
mmu->kvm = kvm;
- mmu->pgd = pgd;
- mmu->pgd_phys = pgd_phys;
+ mmu->pgt = pgt;
+ mmu->pgd_phys = __pa(pgt->pgd);
mmu->vmid.vmid_gen = 0;
-
return 0;
+
+out_destroy_pgtable:
+ kvm_pgtable_stage2_destroy(pgt);
+out_free_pgtable:
+ kfree(pgt);
+ return err;
}
static void stage2_unmap_memslot(struct kvm *kvm,
@@ -1102,363 +470,21 @@ void stage2_unmap_vm(struct kvm *kvm)
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
struct kvm *kvm = mmu->kvm;
- void *pgd = NULL;
+ struct kvm_pgtable *pgt = NULL;
spin_lock(&kvm->mmu_lock);
- if (mmu->pgd) {
- unmap_stage2_range(mmu, 0, kvm_phys_size(kvm));
- pgd = READ_ONCE(mmu->pgd);
- mmu->pgd = NULL;
- }
- spin_unlock(&kvm->mmu_lock);
-
- /* Free the HW pgd, one page at a time */
- if (pgd) {
- free_pages_exact(pgd, stage2_pgd_size(kvm));
+ pgt = mmu->pgt;
+ if (pgt) {
+ mmu->pgd_phys = 0;
+ mmu->pgt = NULL;
free_percpu(mmu->last_vcpu_ran);
}
-}
-
-static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- p4d_t *p4d;
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- if (stage2_pgd_none(kvm, *pgd)) {
- if (!cache)
- return NULL;
- p4d = kvm_mmu_memory_cache_alloc(cache);
- stage2_pgd_populate(kvm, pgd, p4d);
- get_page(virt_to_page(pgd));
- }
-
- return stage2_p4d_offset(kvm, pgd, addr);
-}
-
-static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- pud_t *pud;
-
- p4d = stage2_get_p4d(mmu, cache, addr);
- if (stage2_p4d_none(kvm, *p4d)) {
- if (!cache)
- return NULL;
- pud = kvm_mmu_memory_cache_alloc(cache);
- stage2_p4d_populate(kvm, p4d, pud);
- get_page(virt_to_page(p4d));
- }
-
- return stage2_pud_offset(kvm, p4d, addr);
-}
-
-static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- pmd_t *pmd;
-
- pud = stage2_get_pud(mmu, cache, addr);
- if (!pud || stage2_pud_huge(kvm, *pud))
- return NULL;
-
- if (stage2_pud_none(kvm, *pud)) {
- if (!cache)
- return NULL;
- pmd = kvm_mmu_memory_cache_alloc(cache);
- stage2_pud_populate(kvm, pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- return stage2_pmd_offset(kvm, pud, addr);
-}
-
-static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pmd_t *new_pmd)
-{
- pmd_t *pmd, old_pmd;
-
-retry:
- pmd = stage2_get_pmd(mmu, cache, addr);
- VM_BUG_ON(!pmd);
-
- old_pmd = *pmd;
- /*
- * Multiple vcpus faulting on the same PMD entry, can
- * lead to them sequentially updating the PMD with the
- * same value. Following the break-before-make
- * (pmd_clear() followed by tlb_flush()) process can
- * hinder forward progress due to refaults generated
- * on missing translations.
- *
- * Skip updating the page table if the entry is
- * unchanged.
- */
- if (pmd_val(old_pmd) == pmd_val(*new_pmd))
- return 0;
-
- if (pmd_present(old_pmd)) {
- /*
- * If we already have PTE level mapping for this block,
- * we must unmap it to avoid inconsistent TLB state and
- * leaking the table page. We could end up in this situation
- * if the memory slot was marked for dirty logging and was
- * reverted, leaving PTE level mappings for the pages accessed
- * during the period. So, unmap the PTE level mapping for this
- * block and retry, as we could have released the upper level
- * table in the process.
- *
- * Normal THP split/merge follows mmu_notifier callbacks and do
- * get handled accordingly.
- */
- if (!pmd_thp_or_huge(old_pmd)) {
- unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
- goto retry;
- }
- /*
- * Mapping in huge pages should only happen through a
- * fault. If a page is merged into a transparent huge
- * page, the individual subpages of that huge page
- * should be unmapped through MMU notifiers before we
- * get here.
- *
- * Merging of CompoundPages is not supported; they
- * should become splitting first, unmapped, merged,
- * and mapped back in on-demand.
- */
- WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
- } else {
- get_page(virt_to_page(pmd));
- }
-
- kvm_set_pmd(pmd, *new_pmd);
- return 0;
-}
-
-static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pud_t *new_pudp)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pudp, old_pud;
-
-retry:
- pudp = stage2_get_pud(mmu, cache, addr);
- VM_BUG_ON(!pudp);
-
- old_pud = *pudp;
-
- /*
- * A large number of vcpus faulting on the same stage 2 entry,
- * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
- * Skip updating the page tables if there is no change.
- */
- if (pud_val(old_pud) == pud_val(*new_pudp))
- return 0;
-
- if (stage2_pud_present(kvm, old_pud)) {
- /*
- * If we already have table level mapping for this block, unmap
- * the range for this block and retry.
- */
- if (!stage2_pud_huge(kvm, old_pud)) {
- unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
- goto retry;
- }
-
- WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
- stage2_pud_clear(kvm, pudp);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- } else {
- get_page(virt_to_page(pudp));
- }
-
- kvm_set_pud(pudp, *new_pudp);
- return 0;
-}
-
-/*
- * stage2_get_leaf_entry - walk the stage2 VM page tables and return
- * true if a valid and present leaf-entry is found. A pointer to the
- * leaf-entry is returned in the appropriate level variable - pudpp,
- * pmdpp, ptepp.
- */
-static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
- pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- *pudpp = NULL;
- *pmdpp = NULL;
- *ptepp = NULL;
-
- pudp = stage2_get_pud(mmu, NULL, addr);
- if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
- return false;
-
- if (stage2_pud_huge(kvm, *pudp)) {
- *pudpp = pudp;
- return true;
- }
-
- pmdp = stage2_pmd_offset(kvm, pudp, addr);
- if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
- return false;
-
- if (pmd_thp_or_huge(*pmdp)) {
- *pmdpp = pmdp;
- return true;
- }
-
- ptep = pte_offset_kernel(pmdp, addr);
- if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
- return false;
-
- *ptepp = ptep;
- return true;
-}
-
-static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
-{
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
- bool found;
-
- found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
- if (!found)
- return false;
-
- if (pudp)
- return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
- else if (pmdp)
- return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
- else
- return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
-}
-
-static int stage2_set_pte(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pte_t *new_pte,
- unsigned long flags)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte, old_pte;
- bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
- bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
-
- VM_BUG_ON(logging_active && !cache);
-
- /* Create stage-2 page table mapping - Levels 0 and 1 */
- pud = stage2_get_pud(mmu, cache, addr);
- if (!pud) {
- /*
- * Ignore calls from kvm_set_spte_hva for unallocated
- * address ranges.
- */
- return 0;
- }
-
- /*
- * While dirty page logging - dissolve huge PUD, then continue
- * on to allocate page.
- */
- if (logging_active)
- stage2_dissolve_pud(mmu, addr, pud);
-
- if (stage2_pud_none(kvm, *pud)) {
- if (!cache)
- return 0; /* ignore calls from kvm_set_spte_hva */
- pmd = kvm_mmu_memory_cache_alloc(cache);
- stage2_pud_populate(kvm, pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
- if (!pmd) {
- /*
- * Ignore calls from kvm_set_spte_hva for unallocated
- * address ranges.
- */
- return 0;
- }
-
- /*
- * While dirty page logging - dissolve huge PMD, then continue on to
- * allocate page.
- */
- if (logging_active)
- stage2_dissolve_pmd(mmu, addr, pmd);
-
- /* Create stage-2 page mappings - Level 2 */
- if (pmd_none(*pmd)) {
- if (!cache)
- return 0; /* ignore calls from kvm_set_spte_hva */
- pte = kvm_mmu_memory_cache_alloc(cache);
- kvm_pmd_populate(pmd, pte);
- get_page(virt_to_page(pmd));
- }
-
- pte = pte_offset_kernel(pmd, addr);
-
- if (iomap && pte_present(*pte))
- return -EFAULT;
-
- /* Create 2nd stage page table mapping - Level 3 */
- old_pte = *pte;
- if (pte_present(old_pte)) {
- /* Skip page table update if there is no change */
- if (pte_val(old_pte) == pte_val(*new_pte))
- return 0;
-
- kvm_set_pte(pte, __pte(0));
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
- } else {
- get_page(virt_to_page(pte));
- }
-
- kvm_set_pte(pte, *new_pte);
- return 0;
-}
+ spin_unlock(&kvm->mmu_lock);
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
- if (pte_young(*pte)) {
- *pte = pte_mkold(*pte);
- return 1;
+ if (pgt) {
+ kvm_pgtable_stage2_destroy(pgt);
+ kfree(pgt);
}
- return 0;
-}
-#else
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
- return __ptep_test_and_clear_young(pte);
-}
-#endif
-
-static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
-{
- return stage2_ptep_test_and_clear_young((pte_t *)pmd);
-}
-
-static int stage2_pudp_test_and_clear_young(pud_t *pud)
-{
- return stage2_ptep_test_and_clear_young((pte_t *)pud);
}
/**
@@ -1468,169 +494,52 @@ static int stage2_pudp_test_and_clear_young(pud_t *pud)
* @guest_ipa: The IPA at which to insert the mapping
* @pa: The physical address of the device
* @size: The size of the mapping
+ * @writable: Whether or not to create a writable mapping
*/
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable)
{
- phys_addr_t addr, end;
+ phys_addr_t addr;
int ret = 0;
- unsigned long pfn;
struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
+ struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
+ KVM_PGTABLE_PROT_R |
+ (writable ? KVM_PGTABLE_PROT_W : 0);
- end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
- pfn = __phys_to_pfn(pa);
-
- for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
- pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
-
- if (writable)
- pte = kvm_s2pte_mkwrite(pte);
+ size += offset_in_page(guest_ipa);
+ guest_ipa &= PAGE_MASK;
+ for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
ret = kvm_mmu_topup_memory_cache(&cache,
kvm_mmu_cache_min_pages(kvm));
if (ret)
- goto out;
+ break;
+
spin_lock(&kvm->mmu_lock);
- ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte,
- KVM_S2PTE_FLAG_IS_IOMAP);
+ ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
+ &cache);
spin_unlock(&kvm->mmu_lock);
if (ret)
- goto out;
+ break;
- pfn++;
+ pa += PAGE_SIZE;
}
-out:
kvm_mmu_free_memory_cache(&cache);
return ret;
}
/**
- * stage2_wp_ptes - write protect PMD range
- * @pmd: pointer to pmd entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte;
-
- pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- if (!kvm_s2pte_readonly(pte))
- kvm_set_s2pte_readonly(pte);
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-/**
- * stage2_wp_pmds - write protect PUD range
- * kvm: kvm instance for the VM
- * @pud: pointer to pud entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd;
- phys_addr_t next;
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
-
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd)) {
- if (!kvm_s2pmd_readonly(pmd))
- kvm_set_s2pmd_readonly(pmd);
- } else {
- stage2_wp_ptes(pmd, addr, next);
- }
- }
- } while (pmd++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_puds - write protect P4D range
- * @p4d: pointer to p4d entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- phys_addr_t next;
-
- pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud)) {
- if (!kvm_s2pud_readonly(pud))
- kvm_set_s2pud_readonly(pud);
- } else {
- stage2_wp_pmds(mmu, pud, addr, next);
- }
- }
- } while (pud++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_p4ds - write protect PGD range
- * @pgd: pointer to pgd entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- phys_addr_t next;
-
- p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- stage2_wp_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-}
-
-/**
* stage2_wp_range() - write protect stage2 memory region range
- * @kvm: The KVM pointer
+ * @mmu: The KVM stage-2 MMU pointer
* @addr: Start address of range
* @end: End address of range
*/
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- phys_addr_t next;
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- /*
- * Release kvm_mmu_lock periodically if the memory region is
- * large. Otherwise, we may see kernel panics with
- * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
- * CONFIG_LOCKDEP. Additionally, holding the lock too long
- * will also starve other vCPUs. We have to also make sure
- * that the page tables are not freed while we released
- * the lock.
- */
- cond_resched_lock(&kvm->mmu_lock);
- if (!READ_ONCE(mmu->pgd))
- break;
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (stage2_pgd_present(kvm, *pgd))
- stage2_wp_p4ds(mmu, pgd, addr, next);
- } while (pgd++, addr = next, addr != end);
+ stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
}
/**
@@ -1833,23 +742,24 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
{
- int ret;
+ int ret = 0;
bool write_fault, writable, force_pte = false;
- bool exec_fault, needs_exec;
+ bool exec_fault;
+ bool device = false;
unsigned long mmu_seq;
- gfn_t gfn = fault_ipa >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
short vma_shift;
+ gfn_t gfn;
kvm_pfn_t pfn;
- pgprot_t mem_type = PAGE_S2;
bool logging_active = memslot_is_logging(memslot);
- unsigned long vma_pagesize, flags = 0;
- struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
+ unsigned long vma_pagesize;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+ struct kvm_pgtable *pgt;
write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
+ exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_BUG_ON(write_fault && exec_fault);
if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
@@ -1871,30 +781,41 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else
vma_shift = PAGE_SHIFT;
- vma_pagesize = 1ULL << vma_shift;
if (logging_active ||
- (vma->vm_flags & VM_PFNMAP) ||
- !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+ (vma->vm_flags & VM_PFNMAP)) {
force_pte = true;
- vma_pagesize = PAGE_SIZE;
+ vma_shift = PAGE_SHIFT;
}
- /*
- * The stage2 has a minimum of 2 level table (For arm64 see
- * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
- * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
- * As for PUD huge maps, we must make sure that we have at least
- * 3 levels, i.e, PMD is not folded.
- */
- if (vma_pagesize == PMD_SIZE ||
- (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
- gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
+ if (vma_shift == PUD_SHIFT &&
+ !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+ vma_shift = PMD_SHIFT;
+
+ if (vma_shift == PMD_SHIFT &&
+ !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+ force_pte = true;
+ vma_shift = PAGE_SHIFT;
+ }
+
+ vma_pagesize = 1UL << vma_shift;
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
+ fault_ipa &= ~(vma_pagesize - 1);
+
+ gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
- /* We need minimum second+third level pages */
- ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
- if (ret)
- return ret;
+ /*
+ * Permission faults just need to update the existing leaf entry,
+ * and so normally don't require allocations from the memcache. The
+ * only exception to this is when dirty logging is enabled at runtime
+ * and a write fault needs to collapse a block entry into a table.
+ */
+ if (fault_status != FSC_PERM || (logging_active && write_fault)) {
+ ret = kvm_mmu_topup_memory_cache(memcache,
+ kvm_mmu_cache_min_pages(kvm));
+ if (ret)
+ return ret;
+ }
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
@@ -1917,28 +838,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
if (kvm_is_device_pfn(pfn)) {
- mem_type = PAGE_S2_DEVICE;
- flags |= KVM_S2PTE_FLAG_IS_IOMAP;
- } else if (logging_active) {
- /*
- * Faults on pages in a memslot with logging enabled
- * should not be mapped with huge pages (it introduces churn
- * and performance degradation), so force a pte mapping.
- */
- flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
-
+ device = true;
+ } else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
* fault.
*/
- if (!write_fault)
- writable = false;
+ writable = false;
}
- if (exec_fault && is_iomap(flags))
+ if (exec_fault && device)
return -ENOEXEC;
spin_lock(&kvm->mmu_lock);
+ pgt = vcpu->arch.hw_mmu->pgt;
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
@@ -1949,62 +862,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (vma_pagesize == PAGE_SIZE && !force_pte)
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
&pfn, &fault_ipa);
- if (writable)
+ if (writable) {
+ prot |= KVM_PGTABLE_PROT_W;
kvm_set_pfn_dirty(pfn);
+ mark_page_dirty(kvm, gfn);
+ }
- if (fault_status != FSC_PERM && !is_iomap(flags))
+ if (fault_status != FSC_PERM && !device)
clean_dcache_guest_page(pfn, vma_pagesize);
- if (exec_fault)
+ if (exec_fault) {
+ prot |= KVM_PGTABLE_PROT_X;
invalidate_icache_guest_page(pfn, vma_pagesize);
+ }
- /*
- * If we took an execution fault we have made the
- * icache/dcache coherent above and should now let the s2
- * mapping be executable.
- *
- * Write faults (!exec_fault && FSC_PERM) are orthogonal to
- * execute permissions, and we preserve whatever we have.
- */
- needs_exec = exec_fault ||
- (fault_status == FSC_PERM &&
- stage2_is_exec(mmu, fault_ipa, vma_pagesize));
-
- if (vma_pagesize == PUD_SIZE) {
- pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
-
- new_pud = kvm_pud_mkhuge(new_pud);
- if (writable)
- new_pud = kvm_s2pud_mkwrite(new_pud);
-
- if (needs_exec)
- new_pud = kvm_s2pud_mkexec(new_pud);
-
- ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
- } else if (vma_pagesize == PMD_SIZE) {
- pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
-
- new_pmd = kvm_pmd_mkhuge(new_pmd);
-
- if (writable)
- new_pmd = kvm_s2pmd_mkwrite(new_pmd);
-
- if (needs_exec)
- new_pmd = kvm_s2pmd_mkexec(new_pmd);
+ if (device)
+ prot |= KVM_PGTABLE_PROT_DEVICE;
+ else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+ prot |= KVM_PGTABLE_PROT_X;
- ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
+ if (fault_status == FSC_PERM && !(logging_active && writable)) {
+ ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
} else {
- pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
-
- if (writable) {
- new_pte = kvm_s2pte_mkwrite(new_pte);
- mark_page_dirty(kvm, gfn);
- }
-
- if (needs_exec)
- new_pte = kvm_s2pte_mkexec(new_pte);
-
- ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
+ ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
+ __pfn_to_phys(pfn), prot,
+ memcache);
}
out_unlock:
@@ -2014,46 +896,23 @@ out_unlock:
return ret;
}
-/*
- * Resolve the access fault by making the page young again.
- * Note that because the faulting entry is guaranteed not to be
- * cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
- */
+/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- kvm_pfn_t pfn;
- bool pfn_valid = false;
+ pte_t pte;
+ kvm_pte_t kpte;
+ struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
spin_lock(&vcpu->kvm->mmu_lock);
-
- if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
- goto out;
-
- if (pud) { /* HugeTLB */
- *pud = kvm_s2pud_mkyoung(*pud);
- pfn = kvm_pud_pfn(*pud);
- pfn_valid = true;
- } else if (pmd) { /* THP, HugeTLB */
- *pmd = pmd_mkyoung(*pmd);
- pfn = pmd_pfn(*pmd);
- pfn_valid = true;
- } else {
- *pte = pte_mkyoung(*pte); /* Just a page... */
- pfn = pte_pfn(*pte);
- pfn_valid = true;
- }
-
-out:
+ mmu = vcpu->arch.hw_mmu;
+ kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
spin_unlock(&vcpu->kvm->mmu_lock);
- if (pfn_valid)
- kvm_set_pfn_accessed(pfn);
+
+ pte = __pte(kpte);
+ if (pte_valid(pte))
+ kvm_set_pfn_accessed(pte_pfn(pte));
}
/**
@@ -2125,7 +984,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out;
}
- if (kvm_vcpu_dabt_iss1tw(vcpu)) {
+ if (kvm_vcpu_abt_iss1tw(vcpu)) {
kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
ret = 1;
goto out_unlock;
@@ -2224,7 +1083,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat
int kvm_unmap_hva_range(struct kvm *kvm,
unsigned long start, unsigned long end, unsigned flags)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_unmap_hva_range(start, end);
@@ -2234,28 +1093,27 @@ int kvm_unmap_hva_range(struct kvm *kvm,
static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pte_t *pte = (pte_t *)data;
+ kvm_pfn_t *pfn = (kvm_pfn_t *)data;
WARN_ON(size != PAGE_SIZE);
+
/*
- * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
- * flag clear because MMU notifiers will have unmapped a huge PMD before
- * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
- * therefore stage2_set_pte() never needs to clear out a huge PMD
- * through this calling path.
+ * The MMU notifiers will have unmapped a huge PMD before calling
+ * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+ * therefore we never need to clear out a huge PMD through this
+ * calling path and a memcache is not required.
*/
- stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0);
+ kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
+ __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
return 0;
}
-
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
unsigned long end = hva + PAGE_SIZE;
kvm_pfn_t pfn = pte_pfn(pte);
- pte_t stage2_pte;
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_set_spte_hva(hva);
@@ -2265,51 +1123,30 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
- stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
- handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
-
+ handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
return 0;
}
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ pte_t pte;
+ kvm_pte_t kpte;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
- return 0;
-
- if (pud)
- return stage2_pudp_test_and_clear_young(pud);
- else if (pmd)
- return stage2_pmdp_test_and_clear_young(pmd);
- else
- return stage2_ptep_test_and_clear_young(pte);
+ kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+ pte = __pte(kpte);
+ return pte_valid(pte) && pte_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
- return 0;
-
- if (pud)
- return kvm_s2pud_young(*pud);
- else if (pmd)
- return pmd_young(*pmd);
- else
- return pte_young(*pte);
+ return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
}
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_age_hva(start, end);
return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
@@ -2317,24 +1154,16 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_test_age_hva(hva);
return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
kvm_test_age_hva_handler, NULL);
}
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
- kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
-}
-
phys_addr_t kvm_mmu_get_httbr(void)
{
- if (__kvm_cpu_uses_extended_idmap())
- return virt_to_phys(merged_hyp_pgd);
- else
- return virt_to_phys(hyp_pgd);
+ return __pa(hyp_pgtable->pgd);
}
phys_addr_t kvm_get_idmap_vector(void)
@@ -2342,15 +1171,11 @@ phys_addr_t kvm_get_idmap_vector(void)
return hyp_idmap_vector;
}
-static int kvm_map_idmap_text(pgd_t *pgd)
+static int kvm_map_idmap_text(void)
{
- int err;
-
- /* Create the idmap in the boot page tables */
- err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
- hyp_idmap_start, hyp_idmap_end,
- __phys_to_pfn(hyp_idmap_start),
- PAGE_HYP_EXEC);
+ unsigned long size = hyp_idmap_end - hyp_idmap_start;
+ int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
+ PAGE_HYP_EXEC);
if (err)
kvm_err("Failed to idmap %lx-%lx\n",
hyp_idmap_start, hyp_idmap_end);
@@ -2361,6 +1186,7 @@ static int kvm_map_idmap_text(pgd_t *pgd)
int kvm_mmu_init(void)
{
int err;
+ u32 hyp_va_bits;
hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -2374,6 +1200,8 @@ int kvm_mmu_init(void)
*/
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
+ hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+ kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
kvm_debug("HYP VA range: %lx:%lx\n",
kern_hyp_va(PAGE_OFFSET),
@@ -2391,43 +1219,30 @@ int kvm_mmu_init(void)
goto out;
}
- hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
- if (!hyp_pgd) {
- kvm_err("Hyp mode PGD not allocated\n");
+ hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
+ if (!hyp_pgtable) {
+ kvm_err("Hyp mode page-table not allocated\n");
err = -ENOMEM;
goto out;
}
- if (__kvm_cpu_uses_extended_idmap()) {
- boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- hyp_pgd_order);
- if (!boot_hyp_pgd) {
- kvm_err("Hyp boot PGD not allocated\n");
- err = -ENOMEM;
- goto out;
- }
-
- err = kvm_map_idmap_text(boot_hyp_pgd);
- if (err)
- goto out;
+ err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
+ if (err)
+ goto out_free_pgtable;
- merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- if (!merged_hyp_pgd) {
- kvm_err("Failed to allocate extra HYP pgd\n");
- goto out;
- }
- __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
- hyp_idmap_start);
- } else {
- err = kvm_map_idmap_text(hyp_pgd);
- if (err)
- goto out;
- }
+ err = kvm_map_idmap_text();
+ if (err)
+ goto out_destroy_pgtable;
io_map_base = hyp_idmap_start;
return 0;
+
+out_destroy_pgtable:
+ kvm_pgtable_hyp_destroy(hyp_pgtable);
+out_free_pgtable:
+ kfree(hyp_pgtable);
+ hyp_pgtable = NULL;
out:
- free_hyp_pgds();
return err;
}
@@ -2531,7 +1346,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
if (ret)
unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
- else
+ else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
stage2_flush_memslot(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
out:
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index f0d0312c0a55..2ed5ef8f274b 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -20,6 +20,21 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
+static u32 kvm_pmu_event_mask(struct kvm *kvm)
+{
+ switch (kvm->arch.pmuver) {
+ case 1: /* ARMv8.0 */
+ return GENMASK(9, 0);
+ case 4: /* ARMv8.1 */
+ case 5: /* ARMv8.4 */
+ case 6: /* ARMv8.5 */
+ return GENMASK(15, 0);
+ default: /* Shouldn't be here, just for sanity */
+ WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver);
+ return 0;
+ }
+}
+
/**
* kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
* @vcpu: The vcpu pointer
@@ -100,7 +115,7 @@ static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
return false;
reg = PMEVTYPER0_EL0 + select_idx;
- eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
+ eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm);
return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
}
@@ -269,6 +284,7 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
kvm_pmu_release_perf_event(&pmu->pmc[i]);
+ irq_work_sync(&vcpu->arch.pmu.overflow_work);
}
u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
@@ -434,6 +450,22 @@ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
}
/**
+ * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding
+ * to the event.
+ * This is why we need a callback to do it once outside of the NMI context.
+ */
+static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_pmu *pmu;
+
+ pmu = container_of(work, struct kvm_pmu, overflow_work);
+ vcpu = kvm_pmc_to_vcpu(pmu->pmc);
+
+ kvm_vcpu_kick(vcpu);
+}
+
+/**
* When the perf event overflows, set the overflow status and inform the vcpu.
*/
static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
@@ -465,7 +497,11 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
if (kvm_pmu_overflow_status(vcpu)) {
kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
- kvm_vcpu_kick(vcpu);
+
+ if (!in_nmi())
+ kvm_vcpu_kick(vcpu);
+ else
+ irq_work_queue(&vcpu->arch.pmu.overflow_work);
}
cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD);
@@ -495,7 +531,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
/* PMSWINC only applies to ... SW_INC! */
type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
- type &= ARMV8_PMU_EVTYPE_EVENT;
+ type &= kvm_pmu_event_mask(vcpu->kvm);
if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
continue;
@@ -578,11 +614,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
data = __vcpu_sys_reg(vcpu, reg);
kvm_pmu_stop_counter(vcpu, pmc);
- eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
+ if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+ eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
+ else
+ eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
- /* Software increment event does't need to be backed by a perf event */
- if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
- pmc->idx != ARMV8_PMU_CYCLE_IDX)
+ /* Software increment event doesn't need to be backed by a perf event */
+ if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR)
+ return;
+
+ /*
+ * If we have a filter in place and that the event isn't allowed, do
+ * not install a perf event either.
+ */
+ if (vcpu->kvm->arch.pmu_filter &&
+ !test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
return;
memset(&attr, 0, sizeof(struct perf_event_attr));
@@ -594,8 +640,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
attr.exclude_hv = 1; /* Don't count EL2 events */
attr.exclude_host = 1; /* Don't count host events */
- attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
- ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
+ attr.config = eventsel;
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
@@ -679,17 +724,95 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
u64 select_idx)
{
- u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
+ u64 reg, mask;
+
+ mask = ARMV8_PMU_EVTYPE_MASK;
+ mask &= ~ARMV8_PMU_EVTYPE_EVENT;
+ mask |= kvm_pmu_event_mask(vcpu->kvm);
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
- __vcpu_sys_reg(vcpu, reg) = event_type;
+ __vcpu_sys_reg(vcpu, reg) = data & mask;
kvm_pmu_update_pmc_chained(vcpu, select_idx);
kvm_pmu_create_perf_event(vcpu, select_idx);
}
+static int kvm_pmu_probe_pmuver(void)
+{
+ struct perf_event_attr attr = { };
+ struct perf_event *event;
+ struct arm_pmu *pmu;
+ int pmuver = 0xf;
+
+ /*
+ * Create a dummy event that only counts user cycles. As we'll never
+ * leave this function with the event being live, it will never
+ * count anything. But it allows us to probe some of the PMU
+ * details. Yes, this is terrible.
+ */
+ attr.type = PERF_TYPE_RAW;
+ attr.size = sizeof(attr);
+ attr.pinned = 1;
+ attr.disabled = 0;
+ attr.exclude_user = 0;
+ attr.exclude_kernel = 1;
+ attr.exclude_hv = 1;
+ attr.exclude_host = 1;
+ attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
+ attr.sample_period = GENMASK(63, 0);
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ kvm_pmu_perf_overflow, &attr);
+
+ if (IS_ERR(event)) {
+ pr_err_once("kvm: pmu event creation failed %ld\n",
+ PTR_ERR(event));
+ return 0xf;
+ }
+
+ if (event->pmu) {
+ pmu = to_arm_pmu(event->pmu);
+ if (pmu->pmuver)
+ pmuver = pmu->pmuver;
+ }
+
+ perf_event_disable(event);
+ perf_event_release_kernel(event);
+
+ return pmuver;
+}
+
+u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
+{
+ unsigned long *bmap = vcpu->kvm->arch.pmu_filter;
+ u64 val, mask = 0;
+ int base, i;
+
+ if (!pmceid1) {
+ val = read_sysreg(pmceid0_el0);
+ base = 0;
+ } else {
+ val = read_sysreg(pmceid1_el0);
+ base = 32;
+ }
+
+ if (!bmap)
+ return val;
+
+ for (i = 0; i < 32; i += 8) {
+ u64 byte;
+
+ byte = bitmap_get_value8(bmap, base + i);
+ mask |= byte << i;
+ byte = bitmap_get_value8(bmap, 0x4000 + base + i);
+ mask |= byte << (32 + i);
+ }
+
+ return val & mask;
+}
+
bool kvm_arm_support_pmu_v3(void)
{
/*
@@ -735,15 +858,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
{
- if (!kvm_arm_support_pmu_v3())
- return -ENODEV;
-
- if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
- return -ENXIO;
-
- if (vcpu->arch.pmu.created)
- return -EBUSY;
-
if (irqchip_in_kernel(vcpu->kvm)) {
int ret;
@@ -764,6 +878,9 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
return ret;
}
+ init_irq_work(&vcpu->arch.pmu.overflow_work,
+ kvm_pmu_perf_overflow_notify_vcpu);
+
vcpu->arch.pmu.created = true;
return 0;
}
@@ -796,6 +913,19 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
+ if (!kvm_arm_support_pmu_v3() ||
+ !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ return -ENODEV;
+
+ if (vcpu->arch.pmu.created)
+ return -EBUSY;
+
+ if (!vcpu->kvm->arch.pmuver)
+ vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver();
+
+ if (vcpu->kvm->arch.pmuver == 0xf)
+ return -ENODEV;
+
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ: {
int __user *uaddr = (int __user *)(long)attr->addr;
@@ -804,9 +934,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
- if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
- return -ENODEV;
-
if (get_user(irq, uaddr))
return -EFAULT;
@@ -824,6 +951,53 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
vcpu->arch.pmu.irq_num = irq;
return 0;
}
+ case KVM_ARM_VCPU_PMU_V3_FILTER: {
+ struct kvm_pmu_event_filter __user *uaddr;
+ struct kvm_pmu_event_filter filter;
+ int nr_events;
+
+ nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1;
+
+ uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr;
+
+ if (copy_from_user(&filter, uaddr, sizeof(filter)))
+ return -EFAULT;
+
+ if (((u32)filter.base_event + filter.nevents) > nr_events ||
+ (filter.action != KVM_PMU_EVENT_ALLOW &&
+ filter.action != KVM_PMU_EVENT_DENY))
+ return -EINVAL;
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ if (!vcpu->kvm->arch.pmu_filter) {
+ vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL);
+ if (!vcpu->kvm->arch.pmu_filter) {
+ mutex_unlock(&vcpu->kvm->lock);
+ return -ENOMEM;
+ }
+
+ /*
+ * The default depends on the first applied filter.
+ * If it allows events, the default is to deny.
+ * Conversely, if the first filter denies a set of
+ * events, the default is to allow.
+ */
+ if (filter.action == KVM_PMU_EVENT_ALLOW)
+ bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events);
+ else
+ bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events);
+ }
+
+ if (filter.action == KVM_PMU_EVENT_ALLOW)
+ bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
+ else
+ bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
+
+ mutex_unlock(&vcpu->kvm->lock);
+
+ return 0;
+ }
case KVM_ARM_VCPU_PMU_V3_INIT:
return kvm_arm_pmu_v3_init(vcpu);
}
@@ -860,6 +1034,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ:
case KVM_ARM_VCPU_PMU_V3_INIT:
+ case KVM_ARM_VCPU_PMU_V3_FILTER:
if (kvm_arm_support_pmu_v3() &&
test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
return 0;
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 3c224162b3dd..faf32a44ba04 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -31,9 +31,9 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr)
*/
void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
{
- struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
+ struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
- if (!kvm_pmu_switch_needed(attr))
+ if (!ctx || !kvm_pmu_switch_needed(attr))
return;
if (!attr->exclude_host)
@@ -47,7 +47,10 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
*/
void kvm_clr_pmu_events(u32 clr)
{
- struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
+ struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
+
+ if (!ctx)
+ return;
ctx->pmu_events.events_host &= ~clr;
ctx->pmu_events.events_guest &= ~clr;
@@ -173,7 +176,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
return;
preempt_disable();
- host = this_cpu_ptr(&kvm_host_data);
+ host = this_cpu_ptr_hyp_sym(kvm_host_data);
events_guest = host->pmu_events.events_guest;
events_host = host->pmu_events.events_host;
@@ -193,7 +196,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
if (!has_vhe())
return;
- host = this_cpu_ptr(&kvm_host_data);
+ host = this_cpu_ptr_hyp_sym(kvm_host_data);
events_guest = host->pmu_events.events_guest;
events_host = host->pmu_events.events_host;
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index 83415e96b589..db4056ecccfd 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -425,27 +425,30 @@ static int get_kernel_wa_level(u64 regid)
{
switch (regid) {
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
- switch (kvm_arm_harden_branch_predictor()) {
- case KVM_BP_HARDEN_UNKNOWN:
+ switch (arm64_get_spectre_v2_state()) {
+ case SPECTRE_VULNERABLE:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
- case KVM_BP_HARDEN_WA_NEEDED:
+ case SPECTRE_MITIGATED:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
- case KVM_BP_HARDEN_NOT_REQUIRED:
+ case SPECTRE_UNAFFECTED:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
}
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
- switch (kvm_arm_have_ssbd()) {
- case KVM_SSBD_FORCE_DISABLE:
- return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
- case KVM_SSBD_KERNEL:
- return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL;
- case KVM_SSBD_FORCE_ENABLE:
- case KVM_SSBD_MITIGATED:
+ switch (arm64_get_spectre_v4_state()) {
+ case SPECTRE_MITIGATED:
+ /*
+ * As for the hypercall discovery, we pretend we
+ * don't have any FW mitigation if SSBS is there at
+ * all times.
+ */
+ if (cpus_have_final_cap(ARM64_SSBS))
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
+ fallthrough;
+ case SPECTRE_UNAFFECTED:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
- case KVM_SSBD_UNKNOWN:
- default:
- return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN;
+ case SPECTRE_VULNERABLE:
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
}
}
@@ -462,14 +465,8 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
val = kvm_psci_version(vcpu, vcpu->kvm);
break;
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
- val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
- break;
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
-
- if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
- kvm_arm_get_vcpu_workaround_2_flag(vcpu))
- val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED;
break;
default:
return -ENOENT;
@@ -527,34 +524,35 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
return -EINVAL;
- wa_level = val & KVM_REG_FEATURE_LEVEL_MASK;
-
- if (get_kernel_wa_level(reg->id) < wa_level)
- return -EINVAL;
-
/* The enabled bit must not be set unless the level is AVAIL. */
- if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
- wa_level != val)
+ if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) &&
+ (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL)
return -EINVAL;
- /* Are we finished or do we need to check the enable bit ? */
- if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL)
- return 0;
-
/*
- * If this kernel supports the workaround to be switched on
- * or off, make sure it matches the requested setting.
+ * Map all the possible incoming states to the only two we
+ * really want to deal with.
*/
- switch (wa_level) {
- case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
- kvm_arm_set_vcpu_workaround_2_flag(vcpu,
- val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED);
+ switch (val & KVM_REG_FEATURE_LEVEL_MASK) {
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
+ wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
break;
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
- kvm_arm_set_vcpu_workaround_2_flag(vcpu, true);
+ wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
break;
+ default:
+ return -EINVAL;
}
+ /*
+ * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the
+ * other way around.
+ */
+ if (get_kernel_wa_level(reg->id) < wa_level)
+ return -EINVAL;
+
return 0;
default:
return -ENOENT;
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
index f7b52ce1557e..920ac43077ad 100644
--- a/arch/arm64/kvm/pvtime.c
+++ b/arch/arm64/kvm/pvtime.c
@@ -13,25 +13,22 @@
void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- u64 steal;
- __le64 steal_le;
- u64 offset;
- int idx;
u64 base = vcpu->arch.steal.base;
+ u64 last_steal = vcpu->arch.steal.last_steal;
+ u64 offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
+ u64 steal = 0;
+ int idx;
if (base == GPA_INVALID)
return;
- /* Let's do the local bookkeeping */
- steal = vcpu->arch.steal.steal;
- steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal;
- vcpu->arch.steal.last_steal = current->sched_info.run_delay;
- vcpu->arch.steal.steal = steal;
-
- steal_le = cpu_to_le64(steal);
idx = srcu_read_lock(&kvm->srcu);
- offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
- kvm_put_guest(kvm, base + offset, steal_le, u64);
+ if (!kvm_get_guest(kvm, base + offset, steal)) {
+ steal = le64_to_cpu(steal);
+ vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay);
+ steal += vcpu->arch.steal.last_steal - last_steal;
+ kvm_put_guest(kvm, base + offset, cpu_to_le64(steal));
+ }
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -43,7 +40,8 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
switch (feature) {
case ARM_SMCCC_HV_PV_TIME_FEATURES:
case ARM_SMCCC_HV_PV_TIME_ST:
- val = SMCCC_RET_SUCCESS;
+ if (vcpu->arch.steal.base != GPA_INVALID)
+ val = SMCCC_RET_SUCCESS;
break;
}
@@ -64,7 +62,6 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
* Start counting stolen time from the time the guest requests
* the feature enabled.
*/
- vcpu->arch.steal.steal = 0;
vcpu->arch.steal.last_steal = current->sched_info.run_delay;
idx = srcu_read_lock(&kvm->srcu);
@@ -74,7 +71,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
return base;
}
-static bool kvm_arm_pvtime_supported(void)
+bool kvm_arm_pvtime_supported(void)
{
return !!sched_info_on();
}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index ee33875c5c2a..f32490229a4c 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -319,10 +319,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
vcpu->arch.reset_state.reset = false;
}
- /* Default workaround setup is enabled (if supported) */
- if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL)
- vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
-
/* Reset timer */
ret = kvm_timer_vcpu_reset(vcpu);
out:
@@ -339,7 +335,7 @@ u32 get_kvm_ipa_limit(void)
int kvm_set_ipa_limit(void)
{
- unsigned int ipa_max, pa_max, va_max, parange, tgran_2;
+ unsigned int parange, tgran_2;
u64 mmfr0;
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
@@ -376,39 +372,11 @@ int kvm_set_ipa_limit(void)
break;
}
- pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
-
- /* Clamp the IPA limit to the PA size supported by the kernel */
- ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
- /*
- * Since our stage2 table is dependent on the stage1 page table code,
- * we must always honor the following condition:
- *
- * Number of levels in Stage1 >= Number of levels in Stage2.
- *
- * So clamp the ipa limit further down to limit the number of levels.
- * Since we can concatenate upto 16 tables at entry level, we could
- * go upto 4bits above the maximum VA addressable with the current
- * number of levels.
- */
- va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
- va_max += 4;
-
- if (va_max < ipa_max)
- ipa_max = va_max;
-
- /*
- * If the final limit is lower than the real physical address
- * limit of the CPUs, report the reason.
- */
- if (ipa_max < pa_max)
- pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
- (va_max < pa_max) ? "Virtual" : "Physical");
-
- WARN(ipa_max < KVM_PHYS_SHIFT,
- "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
- kvm_ipa_limit = ipa_max;
- kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+ kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
+ WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
+ "KVM IPA Size Limit (%d bits) is smaller than default size\n",
+ kvm_ipa_limit);
+ kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
return 0;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 077293b5115f..d9117bc56237 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -769,10 +769,7 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (pmu_access_el0_disabled(vcpu))
return false;
- if (!(p->Op2 & 1))
- pmceid = read_sysreg(pmceid0_el0);
- else
- pmceid = read_sysreg(pmceid1_el0);
+ pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1));
p->regval = pmceid;
@@ -1131,6 +1128,11 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
if (!vcpu_has_sve(vcpu))
val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
val &= ~(0xfUL << ID_AA64PFR0_AMU_SHIFT);
+ if (!(val & (0xfUL << ID_AA64PFR0_CSV2_SHIFT)) &&
+ arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
+ val |= (1UL << ID_AA64PFR0_CSV2_SHIFT);
+ } else if (id == SYS_ID_AA64PFR1_EL1) {
+ val &= ~(0xfUL << ID_AA64PFR1_MTE_SHIFT);
} else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) {
val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) |
(0xfUL << ID_AA64ISAR1_API_SHIFT) |
@@ -1382,6 +1384,13 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return true;
}
+static bool access_mte_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ kvm_inject_undefined(vcpu);
+ return false;
+}
+
/* sys_reg_desc initialiser for known cpufeature ID registers */
#define ID_SANITISED(name) { \
SYS_DESC(SYS_##name), \
@@ -1547,6 +1556,10 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
{ SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 },
{ SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },
+
+ { SYS_DESC(SYS_RGSR_EL1), access_mte_regs },
+ { SYS_DESC(SYS_GCR_EL1), access_mte_regs },
+
{ SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility },
{ SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
{ SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 },
@@ -1571,6 +1584,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi },
{ SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi },
+ { SYS_DESC(SYS_TFSR_EL1), access_mte_regs },
+ { SYS_DESC(SYS_TFSRE0_EL1), access_mte_regs },
+
{ SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 },
{ SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index 4691053c5ee4..ff0444352bba 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -23,7 +23,7 @@ TRACE_EVENT(kvm_entry,
__entry->vcpu_pc = vcpu_pc;
),
- TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
+ TP_printk("PC: 0x%016lx", __entry->vcpu_pc)
);
TRACE_EVENT(kvm_exit,
@@ -42,7 +42,7 @@ TRACE_EVENT(kvm_exit,
__entry->vcpu_pc = vcpu_pc;
),
- TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
+ TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%016lx",
__print_symbolic(__entry->ret, kvm_arm_exception_type),
__entry->esr_ec,
__print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
@@ -69,7 +69,7 @@ TRACE_EVENT(kvm_guest_fault,
__entry->ipa = ipa;
),
- TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
+ TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#016lx",
__entry->ipa, __entry->hsr,
__entry->hxfar, __entry->vcpu_pc)
);
@@ -131,7 +131,7 @@ TRACE_EVENT(kvm_mmio_emulate,
__entry->cpsr = cpsr;
),
- TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
+ TP_printk("Emulate MMIO at: 0x%016lx (instr: %08lx, cpsr: %08lx)",
__entry->vcpu_pc, __entry->instr, __entry->cpsr)
);
@@ -149,7 +149,7 @@ TRACE_EVENT(kvm_unmap_hva_range,
__entry->end = end;
),
- TP_printk("mmu notifier unmap range: %#08lx -- %#08lx",
+ TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
__entry->start, __entry->end)
);
@@ -165,7 +165,7 @@ TRACE_EVENT(kvm_set_spte_hva,
__entry->hva = hva;
),
- TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
+ TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
);
TRACE_EVENT(kvm_age_hva,
@@ -182,7 +182,7 @@ TRACE_EVENT(kvm_age_hva,
__entry->end = end;
),
- TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
+ TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
__entry->start, __entry->end)
);
@@ -198,7 +198,7 @@ TRACE_EVENT(kvm_test_age_hva,
__entry->hva = hva;
),
- TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
+ TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
);
TRACE_EVENT(kvm_set_way_flush,
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h
index 2c56d1e0f5bd..8d78acc4fba7 100644
--- a/arch/arm64/kvm/trace_handle_exit.h
+++ b/arch/arm64/kvm/trace_handle_exit.h
@@ -22,7 +22,7 @@ TRACE_EVENT(kvm_wfx_arm64,
__entry->is_wfe = is_wfe;
),
- TP_printk("guest executed wf%c at: 0x%08lx",
+ TP_printk("guest executed wf%c at: 0x%016lx",
__entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
);
@@ -42,7 +42,7 @@ TRACE_EVENT(kvm_hvc_arm64,
__entry->imm = imm;
),
- TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
+ TP_printk("HVC at 0x%016lx (r0: 0x%016lx, imm: 0x%lx)",
__entry->vcpu_pc, __entry->r0, __entry->imm)
);
@@ -135,7 +135,7 @@ TRACE_EVENT(trap_reg,
__entry->write_value = write_value;
),
- TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
+ TP_printk("%s %s reg %d (0x%016llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
);
TRACE_EVENT(kvm_handle_sys_reg,
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index b13a9e3f99dd..f38c40a76251 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -260,34 +260,14 @@ static int vgic_debug_show(struct seq_file *s, void *v)
return 0;
}
-static const struct seq_operations vgic_debug_seq_ops = {
+static const struct seq_operations vgic_debug_sops = {
.start = vgic_debug_start,
.next = vgic_debug_next,
.stop = vgic_debug_stop,
.show = vgic_debug_show
};
-static int debug_open(struct inode *inode, struct file *file)
-{
- int ret;
- ret = seq_open(file, &vgic_debug_seq_ops);
- if (!ret) {
- struct seq_file *seq;
- /* seq_open will have modified file->private_data */
- seq = file->private_data;
- seq->private = inode->i_private;
- }
-
- return ret;
-};
-
-static const struct file_operations vgic_debug_fops = {
- .owner = THIS_MODULE,
- .open = debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release
-};
+DEFINE_SEQ_ATTRIBUTE(vgic_debug);
void vgic_debug_init(struct kvm *kvm)
{
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 5c786b915cd3..52d6f24f65dc 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -1001,8 +1001,8 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
raw_spin_lock_irqsave(&irq->irq_lock, flags);
/*
- * An access targetting Group0 SGIs can only generate
- * those, while an access targetting Group1 SGIs can
+ * An access targeting Group0 SGIs can only generate
+ * those, while an access targeting Group1 SGIs can
* generate interrupts of either group.
*/
if (!irq->group || allow_group1) {
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 76e2d85789ed..9cdf39a94a63 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -662,7 +662,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
if (likely(cpu_if->vgic_sre))
kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
- kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if));
+ kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if);
if (has_vhe())
__vgic_v3_activate_traps(cpu_if);
@@ -686,7 +686,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
vgic_v3_vmcr_sync(vcpu);
- kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if));
+ kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
if (has_vhe())
__vgic_v3_deactivate_traps(cpu_if);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 2fc253466dbf..d31e1169d9b8 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -16,3 +16,5 @@ lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+
+obj-$(CONFIG_ARM64_MTE) += mte.o
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
new file mode 100644
index 000000000000..03ca6d8b8670
--- /dev/null
+++ b/arch/arm64/lib/mte.S
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 ARM Ltd.
+ */
+#include <linux/linkage.h>
+
+#include <asm/alternative.h>
+#include <asm/assembler.h>
+#include <asm/mte.h>
+#include <asm/page.h>
+#include <asm/sysreg.h>
+
+ .arch armv8.5-a+memtag
+
+/*
+ * multitag_transfer_size - set \reg to the block size that is accessed by the
+ * LDGM/STGM instructions.
+ */
+ .macro multitag_transfer_size, reg, tmp
+ mrs_s \reg, SYS_GMID_EL1
+ ubfx \reg, \reg, #SYS_GMID_EL1_BS_SHIFT, #SYS_GMID_EL1_BS_SIZE
+ mov \tmp, #4
+ lsl \reg, \tmp, \reg
+ .endm
+
+/*
+ * Clear the tags in a page
+ * x0 - address of the page to be cleared
+ */
+SYM_FUNC_START(mte_clear_page_tags)
+ multitag_transfer_size x1, x2
+1: stgm xzr, [x0]
+ add x0, x0, x1
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 1b
+ ret
+SYM_FUNC_END(mte_clear_page_tags)
+
+/*
+ * Copy the tags from the source page to the destination one
+ * x0 - address of the destination page
+ * x1 - address of the source page
+ */
+SYM_FUNC_START(mte_copy_page_tags)
+ mov x2, x0
+ mov x3, x1
+ multitag_transfer_size x5, x6
+1: ldgm x4, [x3]
+ stgm x4, [x2]
+ add x2, x2, x5
+ add x3, x3, x5
+ tst x2, #(PAGE_SIZE - 1)
+ b.ne 1b
+ ret
+SYM_FUNC_END(mte_copy_page_tags)
+
+/*
+ * Read tags from a user buffer (one tag per byte) and set the corresponding
+ * tags at the given kernel address. Used by PTRACE_POKEMTETAGS.
+ * x0 - kernel address (to)
+ * x1 - user buffer (from)
+ * x2 - number of tags/bytes (n)
+ * Returns:
+ * x0 - number of tags read/set
+ */
+SYM_FUNC_START(mte_copy_tags_from_user)
+ mov x3, x1
+ cbz x2, 2f
+1:
+ uao_user_alternative 2f, ldrb, ldtrb, w4, x1, 0
+ lsl x4, x4, #MTE_TAG_SHIFT
+ stg x4, [x0], #MTE_GRANULE_SIZE
+ add x1, x1, #1
+ subs x2, x2, #1
+ b.ne 1b
+
+ // exception handling and function return
+2: sub x0, x1, x3 // update the number of tags set
+ ret
+SYM_FUNC_END(mte_copy_tags_from_user)
+
+/*
+ * Get the tags from a kernel address range and write the tag values to the
+ * given user buffer (one tag per byte). Used by PTRACE_PEEKMTETAGS.
+ * x0 - user buffer (to)
+ * x1 - kernel address (from)
+ * x2 - number of tags/bytes (n)
+ * Returns:
+ * x0 - number of tags read/set
+ */
+SYM_FUNC_START(mte_copy_tags_to_user)
+ mov x3, x0
+ cbz x2, 2f
+1:
+ ldg x4, [x1]
+ ubfx x4, x4, #MTE_TAG_SHIFT, #MTE_TAG_SIZE
+ uao_user_alternative 2f, strb, sttrb, w4, x0, 0
+ add x0, x0, #1
+ add x1, x1, #MTE_GRANULE_SIZE
+ subs x2, x2, #1
+ b.ne 1b
+
+ // exception handling and function return
+2: sub x0, x0, x3 // update the number of tags copied
+ ret
+SYM_FUNC_END(mte_copy_tags_to_user)
+
+/*
+ * Save the tags in a page
+ * x0 - page address
+ * x1 - tag storage
+ */
+SYM_FUNC_START(mte_save_page_tags)
+ multitag_transfer_size x7, x5
+1:
+ mov x2, #0
+2:
+ ldgm x5, [x0]
+ orr x2, x2, x5
+ add x0, x0, x7
+ tst x0, #0xFF // 16 tag values fit in a register,
+ b.ne 2b // which is 16*16=256 bytes
+
+ str x2, [x1], #8
+
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 1b
+
+ ret
+SYM_FUNC_END(mte_save_page_tags)
+
+/*
+ * Restore the tags in a page
+ * x0 - page address
+ * x1 - tag storage
+ */
+SYM_FUNC_START(mte_restore_page_tags)
+ multitag_transfer_size x7, x5
+1:
+ ldr x2, [x1], #8
+2:
+ stgm x2, [x0]
+ add x0, x0, x7
+ tst x0, #0xFF
+ b.ne 2b
+
+ tst x0, #(PAGE_SIZE - 1)
+ b.ne 1b
+
+ ret
+SYM_FUNC_END(mte_restore_page_tags)
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index d91030f0ffee..5ead3c3de3b6 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -4,10 +4,11 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
ioremap.o mmap.o pgd.o mmu.o \
context.o proc.o pageattr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_PTDUMP_CORE) += dump.o
+obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o
obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
+obj-$(CONFIG_ARM64_MTE) += mteswap.o
KASAN_SANITIZE_physaddr.o += n
obj-$(CONFIG_KASAN) += kasan_init.o
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 9b11c096a042..001737a8f309 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -27,6 +27,10 @@ static DEFINE_PER_CPU(atomic64_t, active_asids);
static DEFINE_PER_CPU(u64, reserved_asids);
static cpumask_t tlb_flush_pending;
+static unsigned long max_pinned_asids;
+static unsigned long nr_pinned_asids;
+static unsigned long *pinned_asid_map;
+
#define ASID_MASK (~GENMASK(asid_bits - 1, 0))
#define ASID_FIRST_VERSION (1UL << asid_bits)
@@ -72,7 +76,7 @@ void verify_cpu_asid_bits(void)
}
}
-static void set_kpti_asid_bits(void)
+static void set_kpti_asid_bits(unsigned long *map)
{
unsigned int len = BITS_TO_LONGS(NUM_USER_ASIDS) * sizeof(unsigned long);
/*
@@ -81,13 +85,15 @@ static void set_kpti_asid_bits(void)
* is set, then the ASID will map only userspace. Thus
* mark even as reserved for kernel.
*/
- memset(asid_map, 0xaa, len);
+ memset(map, 0xaa, len);
}
static void set_reserved_asid_bits(void)
{
- if (arm64_kernel_unmapped_at_el0())
- set_kpti_asid_bits();
+ if (pinned_asid_map)
+ bitmap_copy(asid_map, pinned_asid_map, NUM_USER_ASIDS);
+ else if (arm64_kernel_unmapped_at_el0())
+ set_kpti_asid_bits(asid_map);
else
bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
}
@@ -166,6 +172,14 @@ static u64 new_context(struct mm_struct *mm)
return newasid;
/*
+ * If it is pinned, we can keep using it. Note that reserved
+ * takes priority, because even if it is also pinned, we need to
+ * update the generation into the reserved_asids.
+ */
+ if (refcount_read(&mm->context.pinned))
+ return newasid;
+
+ /*
* We had a valid ASID in a previous life, so try to re-use
* it if possible.
*/
@@ -256,6 +270,71 @@ switch_mm_fastpath:
cpu_switch_mm(mm->pgd, mm);
}
+unsigned long arm64_mm_context_get(struct mm_struct *mm)
+{
+ unsigned long flags;
+ u64 asid;
+
+ if (!pinned_asid_map)
+ return 0;
+
+ raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+
+ asid = atomic64_read(&mm->context.id);
+
+ if (refcount_inc_not_zero(&mm->context.pinned))
+ goto out_unlock;
+
+ if (nr_pinned_asids >= max_pinned_asids) {
+ asid = 0;
+ goto out_unlock;
+ }
+
+ if (!asid_gen_match(asid)) {
+ /*
+ * We went through one or more rollover since that ASID was
+ * used. Ensure that it is still valid, or generate a new one.
+ */
+ asid = new_context(mm);
+ atomic64_set(&mm->context.id, asid);
+ }
+
+ nr_pinned_asids++;
+ __set_bit(asid2idx(asid), pinned_asid_map);
+ refcount_set(&mm->context.pinned, 1);
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+
+ asid &= ~ASID_MASK;
+
+ /* Set the equivalent of USER_ASID_BIT */
+ if (asid && arm64_kernel_unmapped_at_el0())
+ asid |= 1;
+
+ return asid;
+}
+EXPORT_SYMBOL_GPL(arm64_mm_context_get);
+
+void arm64_mm_context_put(struct mm_struct *mm)
+{
+ unsigned long flags;
+ u64 asid = atomic64_read(&mm->context.id);
+
+ if (!pinned_asid_map)
+ return;
+
+ raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+
+ if (refcount_dec_and_test(&mm->context.pinned)) {
+ __clear_bit(asid2idx(asid), pinned_asid_map);
+ nr_pinned_asids--;
+ }
+
+ raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+}
+EXPORT_SYMBOL_GPL(arm64_mm_context_put);
+
/* Errata workaround post TTBRx_EL1 update. */
asmlinkage void post_ttbr_update_workaround(void)
{
@@ -296,8 +375,11 @@ static int asids_update_limit(void)
{
unsigned long num_available_asids = NUM_USER_ASIDS;
- if (arm64_kernel_unmapped_at_el0())
+ if (arm64_kernel_unmapped_at_el0()) {
num_available_asids /= 2;
+ if (pinned_asid_map)
+ set_kpti_asid_bits(pinned_asid_map);
+ }
/*
* Expect allocation after rollover to fail if we don't have at least
* one more ASID than CPUs. ASID #0 is reserved for init_mm.
@@ -305,6 +387,13 @@ static int asids_update_limit(void)
WARN_ON(num_available_asids - 1 <= num_possible_cpus());
pr_info("ASID allocator initialised with %lu entries\n",
num_available_asids);
+
+ /*
+ * There must always be an ASID available after rollover. Ensure that,
+ * even if all CPUs have a reserved ASID and the maximum number of ASIDs
+ * are pinned, there still is at least one empty slot in the ASID map.
+ */
+ max_pinned_asids = num_available_asids - num_possible_cpus() - 2;
return 0;
}
arch_initcall(asids_update_limit);
@@ -319,13 +408,17 @@ static int asids_init(void)
panic("Failed to allocate bitmap for %lu ASIDs\n",
NUM_USER_ASIDS);
+ pinned_asid_map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS),
+ sizeof(*pinned_asid_map), GFP_KERNEL);
+ nr_pinned_asids = 0;
+
/*
* We cannot call set_reserved_asid_bits() here because CPU
* caps are not finalized yet, so it is safer to assume KPTI
* and reserve kernel ASID's from beginning.
*/
if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0))
- set_kpti_asid_bits();
+ set_kpti_asid_bits(asid_map);
return 0;
}
early_initcall(asids_init);
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index 2ee7b73433a5..70a71f38b6a9 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -6,21 +6,32 @@
* Copyright (C) 2012 ARM Ltd.
*/
+#include <linux/bitops.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
+#include <asm/mte.h>
-void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr)
+void copy_highpage(struct page *to, struct page *from)
{
- struct page *page = virt_to_page(kto);
+ struct page *kto = page_address(to);
+ struct page *kfrom = page_address(from);
+
copy_page(kto, kfrom);
- flush_dcache_page(page);
+
+ if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) {
+ set_bit(PG_mte_tagged, &to->flags);
+ mte_copy_page_tags(kto, kfrom);
+ }
}
-EXPORT_SYMBOL_GPL(__cpu_copy_user_page);
+EXPORT_SYMBOL(copy_highpage);
-void __cpu_clear_user_page(void *kaddr, unsigned long vaddr)
+void copy_user_highpage(struct page *to, struct page *from,
+ unsigned long vaddr, struct vm_area_struct *vma)
{
- clear_page(kaddr);
+ copy_highpage(to, from);
+ flush_dcache_page(to);
}
-EXPORT_SYMBOL_GPL(__cpu_clear_user_page);
+EXPORT_SYMBOL_GPL(copy_user_highpage);
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 6c45350e33aa..93e87b287556 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -6,7 +6,7 @@
#include <linux/gfp.h>
#include <linux/cache.h>
-#include <linux/dma-noncoherent.h>
+#include <linux/dma-map-ops.h>
#include <linux/dma-iommu.h>
#include <xen/xen.h>
#include <xen/swiotlb-xen.h>
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index eee1732ab6cd..aa0060178343 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -14,9 +14,7 @@ int fixup_exception(struct pt_regs *regs)
if (!fixup)
return 0;
- if (IS_ENABLED(CONFIG_BPF_JIT) &&
- regs->pc >= BPF_JIT_REGION_START &&
- regs->pc < BPF_JIT_REGION_END)
+ if (in_bpf_jit(regs))
return arm64_bpf_fixup_exception(fixup, regs);
regs->pc = (unsigned long)&fixup->fixup + fixup->fixup;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index f07333e86c2f..94c99c1c19e3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -218,7 +218,9 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
} while (pteval != old_pteval);
- flush_tlb_fix_spurious_fault(vma, address);
+ /* Invalidate a stale read-only entry */
+ if (dirty)
+ flush_tlb_page(vma, address);
return 1;
}
@@ -641,6 +643,13 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
return 0;
}
+static int do_tag_check_fault(unsigned long addr, unsigned int esr,
+ struct pt_regs *regs)
+{
+ do_bad_area(addr, esr, regs);
+ return 0;
+}
+
static const struct fault_info fault_info[] = {
{ do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" },
{ do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" },
@@ -659,7 +668,7 @@ static const struct fault_info fault_info[] = {
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
{ do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" },
- { do_bad, SIGKILL, SI_KERNEL, "unknown 17" },
+ { do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 18" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 19" },
{ do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" },
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 481d22c32a2e..095540667f0f 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -21,8 +21,7 @@
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/dma-direct.h>
-#include <linux/dma-mapping.h>
-#include <linux/dma-contiguous.h>
+#include <linux/dma-map-ops.h>
#include <linux/efi.h>
#include <linux/swiotlb.h>
#include <linux/vmalloc.h>
@@ -54,12 +53,6 @@
s64 memstart_addr __ro_after_init = -1;
EXPORT_SYMBOL(memstart_addr);
-s64 physvirt_offset __ro_after_init;
-EXPORT_SYMBOL(physvirt_offset);
-
-struct page *vmemmap __ro_after_init;
-EXPORT_SYMBOL(vmemmap);
-
/*
* We create both ZONE_DMA and ZONE_DMA32. ZONE_DMA covers the first 1G of
* memory as some devices, namely the Raspberry Pi 4, have peripherals with
@@ -290,20 +283,6 @@ void __init arm64_memblock_init(void)
memstart_addr = round_down(memblock_start_of_DRAM(),
ARM64_MEMSTART_ALIGN);
- physvirt_offset = PHYS_OFFSET - PAGE_OFFSET;
-
- vmemmap = ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT));
-
- /*
- * If we are running with a 52-bit kernel VA config on a system that
- * does not support it, we have to offset our vmemmap and physvirt_offset
- * s.t. we avoid the 52-bit portion of the direct linear map
- */
- if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52)) {
- vmemmap += (_PAGE_OFFSET(48) - _PAGE_OFFSET(52)) >> PAGE_SHIFT;
- physvirt_offset = PHYS_OFFSET - _PAGE_OFFSET(48);
- }
-
/*
* Remove the memory that we will not be able to cover with the
* linear mapping. Take care not to clip the kernel which may be
@@ -319,6 +298,16 @@ void __init arm64_memblock_init(void)
}
/*
+ * If we are running with a 52-bit kernel VA config on a system that
+ * does not support it, we have to place the available physical
+ * memory in the 48-bit addressable part of the linear region, i.e.,
+ * we have to move it upward. Since memstart_addr represents the
+ * physical address of PAGE_OFFSET, we have to *subtract* from it.
+ */
+ if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
+ memstart_addr -= _PAGE_OFFSET(48) - _PAGE_OFFSET(52);
+
+ /*
* Apply the memory limit if it was set. Since the kernel may be loaded
* high up in memory, add back the kernel region that must be accessible
* via the linear mapping.
@@ -429,6 +418,8 @@ void __init bootmem_init(void)
arm64_hugetlb_cma_reserve();
#endif
+ dma_pernuma_cma_reserve();
+
/*
* sparse_init() tries to allocate memory from memblock, so must be
* done after the fixed reservations
@@ -471,12 +462,10 @@ static inline void free_memmap(unsigned long start_pfn, unsigned long end_pfn)
*/
static void __init free_unused_memmap(void)
{
- unsigned long start, prev_end = 0;
- struct memblock_region *reg;
-
- for_each_memblock(memory, reg) {
- start = __phys_to_pfn(reg->base);
+ unsigned long start, end, prev_end = 0;
+ int i;
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
#ifdef CONFIG_SPARSEMEM
/*
* Take care not to free memmap entries that don't exist due
@@ -496,8 +485,7 @@ static void __init free_unused_memmap(void)
* memmap entries are valid from the bank end aligned to
* MAX_ORDER_NR_PAGES.
*/
- prev_end = ALIGN(__phys_to_pfn(reg->base + reg->size),
- MAX_ORDER_NR_PAGES);
+ prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
}
#ifdef CONFIG_SPARSEMEM
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 7291b26ce788..b24e43d20667 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -212,8 +212,8 @@ void __init kasan_init(void)
{
u64 kimg_shadow_start, kimg_shadow_end;
u64 mod_shadow_start, mod_shadow_end;
- struct memblock_region *reg;
- int i;
+ phys_addr_t pa_start, pa_end;
+ u64 i;
kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK;
kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(_end));
@@ -246,9 +246,9 @@ void __init kasan_init(void)
kasan_populate_early_shadow((void *)mod_shadow_end,
(void *)kimg_shadow_start);
- for_each_memblock(memory, reg) {
- void *start = (void *)__phys_to_virt(reg->base);
- void *end = (void *)__phys_to_virt(reg->base + reg->size);
+ for_each_mem_range(i, &pa_start, &pa_end) {
+ void *start = (void *)__phys_to_virt(pa_start);
+ void *end = (void *)__phys_to_virt(pa_end);
if (start >= end)
break;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 75df62fea1b6..beff3ad8c7f8 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -43,7 +43,7 @@
u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
-u64 __section(".mmuoff.data.write") vabits_actual;
+u64 __section(.mmuoff.data.write) vabits_actual;
EXPORT_SYMBOL(vabits_actual);
u64 kimage_voffset __ro_after_init;
@@ -122,7 +122,7 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
* The following mapping attributes may be updated in live
* kernel mappings without the need for break-before-make.
*/
- static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
+ pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
/* creating or taking down mappings is always safe */
if (old == 0 || new == 0)
@@ -136,6 +136,17 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
if (old & ~new & PTE_NG)
return false;
+ /*
+ * Changing the memory type between Normal and Normal-Tagged is safe
+ * since Tagged is considered a permission attribute from the
+ * mismatched attribute aliases perspective.
+ */
+ if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
+ (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) &&
+ ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
+ (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)))
+ mask |= PTE_ATTRINDX_MASK;
+
return ((old ^ new) & ~mask) == 0;
}
@@ -462,8 +473,9 @@ static void __init map_mem(pgd_t *pgdp)
{
phys_addr_t kernel_start = __pa_symbol(_text);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
- struct memblock_region *reg;
+ phys_addr_t start, end;
int flags = 0;
+ u64 i;
if (rodata_full || debug_pagealloc_enabled())
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -482,16 +494,15 @@ static void __init map_mem(pgd_t *pgdp)
#endif
/* map all the memory banks */
- for_each_memblock(memory, reg) {
- phys_addr_t start = reg->base;
- phys_addr_t end = start + reg->size;
-
+ for_each_mem_range(i, &start, &end) {
if (start >= end)
break;
- if (memblock_is_nomap(reg))
- continue;
-
- __map_memblock(pgdp, start, end, PAGE_KERNEL, flags);
+ /*
+ * The linear map must allow allocation tags reading/writing
+ * if MTE is present. Otherwise, it has the same attributes as
+ * PAGE_KERNEL.
+ */
+ __map_memblock(pgdp, start, end, PAGE_KERNEL_TAGGED, flags);
}
/*
diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
new file mode 100644
index 000000000000..c52c1847079c
--- /dev/null
+++ b/arch/arm64/mm/mteswap.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/pagemap.h>
+#include <linux/xarray.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <asm/mte.h>
+
+static DEFINE_XARRAY(mte_pages);
+
+void *mte_allocate_tag_storage(void)
+{
+ /* tags granule is 16 bytes, 2 tags stored per byte */
+ return kmalloc(PAGE_SIZE / 16 / 2, GFP_KERNEL);
+}
+
+void mte_free_tag_storage(char *storage)
+{
+ kfree(storage);
+}
+
+int mte_save_tags(struct page *page)
+{
+ void *tag_storage, *ret;
+
+ if (!test_bit(PG_mte_tagged, &page->flags))
+ return 0;
+
+ tag_storage = mte_allocate_tag_storage();
+ if (!tag_storage)
+ return -ENOMEM;
+
+ mte_save_page_tags(page_address(page), tag_storage);
+
+ /* page_private contains the swap entry.val set in do_swap_page */
+ ret = xa_store(&mte_pages, page_private(page), tag_storage, GFP_KERNEL);
+ if (WARN(xa_is_err(ret), "Failed to store MTE tags")) {
+ mte_free_tag_storage(tag_storage);
+ return xa_err(ret);
+ } else if (ret) {
+ /* Entry is being replaced, free the old entry */
+ mte_free_tag_storage(ret);
+ }
+
+ return 0;
+}
+
+bool mte_restore_tags(swp_entry_t entry, struct page *page)
+{
+ void *tags = xa_load(&mte_pages, entry.val);
+
+ if (!tags)
+ return false;
+
+ mte_restore_page_tags(page_address(page), tags);
+
+ return true;
+}
+
+void mte_invalidate_tags(int type, pgoff_t offset)
+{
+ swp_entry_t entry = swp_entry(type, offset);
+ void *tags = xa_erase(&mte_pages, entry.val);
+
+ mte_free_tag_storage(tags);
+}
+
+void mte_invalidate_tags_area(int type)
+{
+ swp_entry_t entry = swp_entry(type, 0);
+ swp_entry_t last_entry = swp_entry(type + 1, 0);
+ void *tags;
+
+ XA_STATE(xa_state, &mte_pages, entry.val);
+
+ xa_lock(&mte_pages);
+ xas_for_each(&xa_state, tags, last_entry.val - 1) {
+ __xa_erase(&mte_pages, xa_state.xa_index);
+ mte_free_tag_storage(tags);
+ }
+ xa_unlock(&mte_pages);
+}
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 73f8b49d485c..a8303bc6b62a 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -46,7 +46,11 @@ EXPORT_SYMBOL(node_to_cpumask_map);
*/
const struct cpumask *cpumask_of_node(int node)
{
- if (WARN_ON(node >= nr_node_ids))
+
+ if (node == NUMA_NO_NODE)
+ return cpu_all_mask;
+
+ if (WARN_ON(node < 0 || node >= nr_node_ids))
return cpu_none_mask;
if (WARN_ON(node_to_cpumask_map[node] == NULL))
@@ -350,7 +354,7 @@ static int __init numa_register_nodes(void)
struct memblock_region *mblk;
/* Check that valid nid is set to memblks */
- for_each_memblock(memory, mblk) {
+ for_each_mem_region(mblk) {
int mblk_nid = memblock_get_region_node(mblk);
if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
@@ -423,19 +427,16 @@ out_free_distance:
*/
static int __init dummy_numa_init(void)
{
+ phys_addr_t start = memblock_start_of_DRAM();
+ phys_addr_t end = memblock_end_of_DRAM();
int ret;
- struct memblock_region *mblk;
if (numa_off)
pr_info("NUMA disabled\n"); /* Forced off on command line. */
- pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n",
- memblock_start_of_DRAM(), memblock_end_of_DRAM() - 1);
-
- for_each_memblock(memory, mblk) {
- ret = numa_add_memblk(0, mblk->base, mblk->base + mblk->size);
- if (!ret)
- continue;
+ pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", start, end - 1);
+ ret = numa_add_memblk(0, start, end);
+ if (ret) {
pr_err("NUMA init failed\n");
return ret;
}
@@ -448,7 +449,7 @@ static int __init dummy_numa_init(void)
* arm64_numa_init() - Initialize NUMA
*
* Try each configured NUMA initialization method until one succeeds. The
- * last fallback is dummy single node config encomapssing whole memory.
+ * last fallback is dummy single node config encompassing whole memory.
*/
void __init arm64_numa_init(void)
{
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 23f648c2a199..1b94f5b82654 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
#include <asm/set_memory.h>
#include <asm/tlbflush.h>
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 796e47a571e6..23c326a06b2d 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -18,6 +18,7 @@
#include <asm/cpufeature.h>
#include <asm/alternative.h>
#include <asm/smp.h>
+#include <asm/sysreg.h>
#ifdef CONFIG_ARM64_64K_PAGES
#define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K
@@ -44,14 +45,18 @@
#define TCR_KASAN_FLAGS 0
#endif
-/* Default MAIR_EL1 */
+/*
+ * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and
+ * changed during __cpu_setup to Normal Tagged if the system supports MTE.
+ */
#define MAIR_EL1_SET \
(MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \
MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \
MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) | \
MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \
MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \
- MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT))
+ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) | \
+ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED))
#ifdef CONFIG_CPU_PM
/**
@@ -421,6 +426,29 @@ SYM_FUNC_START(__cpu_setup)
* Memory region attributes
*/
mov_q x5, MAIR_EL1_SET
+#ifdef CONFIG_ARM64_MTE
+ /*
+ * Update MAIR_EL1, GCR_EL1 and TFSR*_EL1 if MTE is supported
+ * (ID_AA64PFR1_EL1[11:8] > 1).
+ */
+ mrs x10, ID_AA64PFR1_EL1
+ ubfx x10, x10, #ID_AA64PFR1_MTE_SHIFT, #4
+ cmp x10, #ID_AA64PFR1_MTE
+ b.lt 1f
+
+ /* Normal Tagged memory type at the corresponding MAIR index */
+ mov x10, #MAIR_ATTR_NORMAL_TAGGED
+ bfi x5, x10, #(8 * MT_NORMAL_TAGGED), #8
+
+ /* initialize GCR_EL1: all non-zero tags excluded by default */
+ mov x10, #(SYS_GCR_EL1_RRND | SYS_GCR_EL1_EXCL_MASK)
+ msr_s SYS_GCR_EL1, x10
+
+ /* clear any pending tag check faults in TFSR*_EL1 */
+ msr_s SYS_TFSR_EL1, xzr
+ msr_s SYS_TFSRE0_EL1, xzr
+1:
+#endif
msr mair_el1, x5
/*
* Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/ptdump.c
index 0b8da1cc1c07..807dc634bbd2 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -41,6 +41,8 @@ static struct addr_marker address_markers[] = {
{ 0 /* KASAN_SHADOW_START */, "Kasan shadow start" },
{ KASAN_SHADOW_END, "Kasan shadow end" },
#endif
+ { BPF_JIT_REGION_START, "BPF start" },
+ { BPF_JIT_REGION_END, "BPF end" },
{ MODULES_VADDR, "Modules start" },
{ MODULES_END, "Modules end" },
{ VMALLOC_START, "vmalloc() area" },
@@ -169,6 +171,10 @@ static const struct prot_bits pte_bits[] = {
.mask = PTE_ATTRINDX_MASK,
.val = PTE_ATTRINDX(MT_NORMAL),
.set = "MEM/NORMAL",
+ }, {
+ .mask = PTE_ATTRINDX_MASK,
+ .val = PTE_ATTRINDX(MT_NORMAL_TAGGED),
+ .set = "MEM/NORMAL-TAGGED",
}
};
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index f8912e45be7a..ef9f1d5e989d 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -143,14 +143,17 @@ static inline void emit_addr_mov_i64(const int reg, const u64 val,
}
}
-static inline int bpf2a64_offset(int bpf_to, int bpf_from,
+static inline int bpf2a64_offset(int bpf_insn, int off,
const struct jit_ctx *ctx)
{
- int to = ctx->offset[bpf_to];
- /* -1 to account for the Branch instruction */
- int from = ctx->offset[bpf_from] - 1;
-
- return to - from;
+ /* BPF JMP offset is relative to the next instruction */
+ bpf_insn++;
+ /*
+ * Whereas arm64 branch instructions encode the offset
+ * from the branch itself, so we must subtract 1 from the
+ * instruction offset.
+ */
+ return ctx->offset[bpf_insn + off] - (ctx->offset[bpf_insn] - 1);
}
static void jit_fill_hole(void *area, unsigned int size)
@@ -642,7 +645,7 @@ emit_bswap_uxt:
/* JUMP off */
case BPF_JMP | BPF_JA:
- jmp_offset = bpf2a64_offset(i + off, i, ctx);
+ jmp_offset = bpf2a64_offset(i, off, ctx);
check_imm26(jmp_offset);
emit(A64_B(jmp_offset), ctx);
break;
@@ -669,7 +672,7 @@ emit_bswap_uxt:
case BPF_JMP32 | BPF_JSLE | BPF_X:
emit(A64_CMP(is64, dst, src), ctx);
emit_cond_jmp:
- jmp_offset = bpf2a64_offset(i + off, i, ctx);
+ jmp_offset = bpf2a64_offset(i, off, ctx);
check_imm19(jmp_offset);
switch (BPF_OP(code)) {
case BPF_JEQ:
@@ -908,10 +911,21 @@ static int build_body(struct jit_ctx *ctx, bool extra_pass)
const struct bpf_prog *prog = ctx->prog;
int i;
+ /*
+ * - offset[0] offset of the end of prologue,
+ * start of the 1st instruction.
+ * - offset[1] - offset of the end of 1st instruction,
+ * start of the 2nd instruction
+ * [....]
+ * - offset[3] - offset of the end of 3rd instruction,
+ * start of 4th instruction
+ */
for (i = 0; i < prog->len; i++) {
const struct bpf_insn *insn = &prog->insnsi[i];
int ret;
+ if (ctx->image == NULL)
+ ctx->offset[i] = ctx->idx;
ret = build_insn(insn, ctx, extra_pass);
if (ret > 0) {
i++;
@@ -919,11 +933,16 @@ static int build_body(struct jit_ctx *ctx, bool extra_pass)
ctx->offset[i] = ctx->idx;
continue;
}
- if (ctx->image == NULL)
- ctx->offset[i] = ctx->idx;
if (ret)
return ret;
}
+ /*
+ * offset is allocated with prog->len + 1 so fill in
+ * the last element with the offset after the last
+ * instruction (end of program)
+ */
+ if (ctx->image == NULL)
+ ctx->offset[i] = ctx->idx;
return 0;
}
@@ -1002,7 +1021,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
memset(&ctx, 0, sizeof(ctx));
ctx.prog = prog;
- ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
+ ctx.offset = kcalloc(prog->len + 1, sizeof(int), GFP_KERNEL);
if (ctx.offset == NULL) {
prog = orig_prog;
goto out_off;
@@ -1089,7 +1108,7 @@ skip_init_ctx:
prog->jited_len = prog_size;
if (!prog->is_func || extra_pass) {
- bpf_prog_fill_jited_linfo(prog, ctx.offset);
+ bpf_prog_fill_jited_linfo(prog, ctx.offset + 1);
out_off:
kfree(ctx.offset);
kfree(jit_data);