author		Catalin Marinas <catalin.marinas@arm.com>	2024-03-07 19:05:29 +0000
committer	Catalin Marinas <catalin.marinas@arm.com>	2024-03-07 19:05:29 +0000
commit		88f0912253ea47a2bde36e0820f0b9c025d389ad (patch)
tree		fb477daef419d79cc960de26157e5de2a6fc9b69
parent		0c5ade742e91d7bf3a508bf6223deb7410009b6d (diff)
parent		27f2b9fcddc76d542ac339febf2af55b67f610ca (diff)
Merge branch 'for-next/stage1-lpa2' into for-next/core
* for-next/stage1-lpa2: (48 commits)
  : Add support for LPA2 and WXN at stage 1
arm64/mm: Avoid ID mapping of kpti flag if it is no longer needed
arm64/mm: Use generic __pud_free() helper in pud_free() implementation
arm64: gitignore: ignore relacheck
arm64: Use Signed/Unsigned enums for TGRAN{4,16,64} and VARange
arm64: mm: Make PUD folding check in set_pud() a runtime check
arm64: mm: add support for WXN memory translation attribute
mm: add arch hook to validate mmap() prot flags
arm64: defconfig: Enable LPA2 support
arm64: Enable 52-bit virtual addressing for 4k and 16k granule configs
arm64: kvm: avoid CONFIG_PGTABLE_LEVELS for runtime levels
arm64: ptdump: Deal with translation levels folded at runtime
arm64: ptdump: Disregard unaddressable VA space
arm64: mm: Add support for folding PUDs at runtime
arm64: kasan: Reduce minimum shadow alignment and enable 5 level paging
arm64: mm: Add 5 level paging support to fixmap and swapper handling
arm64: Enable LPA2 at boot if supported by the system
arm64: mm: add LPA2 and 5 level paging support to G-to-nG conversion
arm64: mm: Add definitions to support 5 levels of paging
arm64: mm: Add LPA2 support to phys<->pte conversion routines
arm64: mm: Wire up TCR.DS bit to PTE shareability fields
...
 55 files changed, 1949 insertions(+), 1124 deletions(-)
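The new PGTABLE_LEVELS defaults in the Kconfig hunk below all fall out of the ARM64_HW_PGTABLE_LEVELS() formula from pgtable-hwdef.h: levels = (va_bits - 4) / (PAGE_SHIFT - 3). A standalone user-space sketch of that arithmetic (the helper name is mine, not part of the series):

#include <stdio.h>

/* levels = (va_bits - 4) / (PAGE_SHIFT - 3), cf. ARM64_HW_PGTABLE_LEVELS() */
static int hw_pgtable_levels(int va_bits, int page_shift)
{
	return (va_bits - 4) / (page_shift - 3);
}

int main(void)
{
	printf("4K/39:  %d\n", hw_pgtable_levels(39, 12));	/* 3 */
	printf("4K/48:  %d\n", hw_pgtable_levels(48, 12));	/* 4 */
	printf("4K/52:  %d\n", hw_pgtable_levels(52, 12));	/* 5, new LPA2 default */
	printf("16K/47: %d\n", hw_pgtable_levels(47, 14));	/* 3 */
	printf("16K/52: %d\n", hw_pgtable_levels(52, 14));	/* 4, new LPA2 default */
	printf("64K/52: %d\n", hw_pgtable_levels(52, 16));	/* 3, LVA */
	return 0;
}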
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e14e92eb5ba5..6daceefcedfc 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -165,7 +165,7 @@ config ARM64
 	select HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
-	select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
+	select HAVE_ARCH_KASAN
 	select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
 	select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN
 	select HAVE_ARCH_KASAN_HW_TAGS if (HAVE_ARCH_KASAN && ARM64_MTE)
@@ -370,7 +370,9 @@ config PGTABLE_LEVELS
 	default 3 if ARM64_64K_PAGES && (ARM64_VA_BITS_48 || ARM64_VA_BITS_52)
 	default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
 	default 3 if ARM64_16K_PAGES && ARM64_VA_BITS_47
+	default 4 if ARM64_16K_PAGES && (ARM64_VA_BITS_48 || ARM64_VA_BITS_52)
 	default 4 if !ARM64_64K_PAGES && ARM64_VA_BITS_48
+	default 5 if ARM64_4K_PAGES && ARM64_VA_BITS_52

 config ARCH_SUPPORTS_UPROBES
 	def_bool y
@@ -398,13 +400,13 @@ config BUILTIN_RETURN_ADDRESS_STRIPS_PAC
 config KASAN_SHADOW_OFFSET
 	hex
 	depends on KASAN_GENERIC || KASAN_SW_TAGS
-	default 0xdfff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS
-	default 0xdfffc00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS
+	default 0xdfff800000000000 if (ARM64_VA_BITS_48 || (ARM64_VA_BITS_52 && !ARM64_16K_PAGES)) && !KASAN_SW_TAGS
+	default 0xdfffc00000000000 if (ARM64_VA_BITS_47 || ARM64_VA_BITS_52) && ARM64_16K_PAGES && !KASAN_SW_TAGS
 	default 0xdffffe0000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS
 	default 0xdfffffc000000000 if ARM64_VA_BITS_39 && !KASAN_SW_TAGS
 	default 0xdffffff800000000 if ARM64_VA_BITS_36 && !KASAN_SW_TAGS
-	default 0xefff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && KASAN_SW_TAGS
-	default 0xefffc00000000000 if ARM64_VA_BITS_47 && KASAN_SW_TAGS
+	default 0xefff800000000000 if (ARM64_VA_BITS_48 || (ARM64_VA_BITS_52 && !ARM64_16K_PAGES)) && KASAN_SW_TAGS
+	default 0xefffc00000000000 if (ARM64_VA_BITS_47 || ARM64_VA_BITS_52) && ARM64_16K_PAGES && KASAN_SW_TAGS
 	default 0xeffffe0000000000 if ARM64_VA_BITS_42 && KASAN_SW_TAGS
 	default 0xefffffc000000000 if ARM64_VA_BITS_39 && KASAN_SW_TAGS
 	default 0xeffffff800000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS
@@ -1280,9 +1282,7 @@ endchoice

 choice
 	prompt "Virtual address space size"
-	default ARM64_VA_BITS_39 if ARM64_4K_PAGES
-	default ARM64_VA_BITS_47 if ARM64_16K_PAGES
-	default ARM64_VA_BITS_42 if ARM64_64K_PAGES
+	default ARM64_VA_BITS_52
 	help
 	  Allows choosing one of multiple possible virtual address
 	  space sizes. The level of translation table is determined by
@@ -1309,7 +1309,7 @@ config ARM64_VA_BITS_48

 config ARM64_VA_BITS_52
 	bool "52-bit"
-	depends on ARM64_64K_PAGES && (ARM64_PAN || !ARM64_SW_TTBR0_PAN)
+	depends on ARM64_PAN || !ARM64_SW_TTBR0_PAN
 	help
 	  Enable 52-bit virtual addressing for userspace when explicitly
 	  requested via a hint to mmap(). The kernel will also use 52-bit
@@ -1356,10 +1356,11 @@ choice

 config ARM64_PA_BITS_48
 	bool "48-bit"
+	depends on ARM64_64K_PAGES || !ARM64_VA_BITS_52

 config ARM64_PA_BITS_52
-	bool "52-bit (ARMv8.2)"
-	depends on ARM64_64K_PAGES
+	bool "52-bit"
+	depends on ARM64_64K_PAGES || ARM64_VA_BITS_52
 	depends on ARM64_PAN || !ARM64_SW_TTBR0_PAN
 	help
 	  Enable support for a 52-bit physical address space, introduced as
@@ -1376,6 +1377,10 @@ config ARM64_PA_BITS
 	default 48 if ARM64_PA_BITS_48
 	default 52 if ARM64_PA_BITS_52

+config ARM64_LPA2
+	def_bool y
+	depends on ARM64_PA_BITS_52 && !ARM64_64K_PAGES
+
 choice
 	prompt "Endianness"
 	default CPU_LITTLE_ENDIAN
@@ -1602,6 +1607,17 @@ config RODATA_FULL_DEFAULT_ENABLED
 	  This requires the linear region to be mapped down to pages,
 	  which may adversely affect performance in some cases.

+config ARM64_WXN
+	bool "Enable WXN attribute so all writable mappings are non-exec"
+	help
+	  Set the WXN bit in the SCTLR system register so that all writable
+	  mappings are treated as if the PXN/UXN bit is set as well.
+	  If this is set to Y, it can still be disabled at runtime by
+	  passing 'arm64.nowxn' on the kernel command line.
+
+	  This should only be set if no software needs to be supported that
+	  relies on being able to execute from writable mappings.
+
 config ARM64_SW_TTBR0_PAN
 	bool "Emulate Privileged Access Never using TTBR0_EL1 switching"
 	help
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index e6cf3e5d63c3..f086b0624ec8 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -76,7 +76,6 @@ CONFIG_ARCH_VEXPRESS=y
 CONFIG_ARCH_VISCONTI=y
 CONFIG_ARCH_XGENE=y
 CONFIG_ARCH_ZYNQMP=y
-CONFIG_ARM64_VA_BITS_48=y
 CONFIG_SCHED_MC=y
 CONFIG_SCHED_SMT=y
 CONFIG_NUMA=y
diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h
index ecdb3cfcd0f8..8babfbe31f95 100644
--- a/arch/arm64/include/asm/archrandom.h
+++ b/arch/arm64/include/asm/archrandom.h
@@ -129,6 +129,4 @@ static inline bool __init __early_cpu_has_rndr(void)
 	return (ftr >> ID_AA64ISAR0_EL1_RNDR_SHIFT) & 0xf;
 }

-u64 kaslr_early_init(void *fdt);
-
 #endif /* _ASM_ARCHRANDOM_H */
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 96b18a707507..ab8b396428da 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -342,20 +342,6 @@ alternative_cb_end
 	.endm

 /*
- * idmap_get_t0sz - get the T0SZ value needed to cover the ID map
- *
- * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
- * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
- * this number conveniently equals the number of leading zeroes in
- * the physical address of _end.
- */
-	.macro	idmap_get_t0sz, reg
-	adrp	\reg, _end
-	orr	\reg, \reg, #(1 << VA_BITS_MIN) - 1
-	clz	\reg, \reg
-	.endm
-
-/*
  * tcr_compute_pa_size - set TCR.(I)PS to the highest supported
  * ID_AA64MMFR0_EL1.PARange value
  *
@@ -586,18 +572,27 @@ alternative_endif
 	.endm

 /*
- * Offset ttbr1 to allow for 48-bit kernel VAs set with 52-bit PTRS_PER_PGD.
+ * If the kernel is built for 52-bit virtual addressing but the hardware only
+ * supports 48 bits, we cannot program the pgdir address into TTBR1 directly,
+ * but we have to add an offset so that the TTBR1 address corresponds with the
+ * pgdir entry that covers the lowest 48-bit addressable VA.
+ *
+ * Note that this trick is only used for LVA/64k pages - LPA2/4k pages uses an
+ * additional paging level, and on LPA2/16k pages, we would end up with a root
+ * level table with only 2 entries, which is suboptimal in terms of TLB
+ * utilization, so there we fall back to 47 bits of translation if LPA2 is not
+ * supported.
+ *
  * orr is used as it can cover the immediate value (and is idempotent).
- * In future this may be nop'ed out when dealing with 52-bit kernel VAs.
  * 	ttbr: Value of ttbr to set, modified.
  */
 	.macro	offset_ttbr1, ttbr, tmp
-#ifdef CONFIG_ARM64_VA_BITS_52
-	mrs_s	\tmp, SYS_ID_AA64MMFR2_EL1
-	and	\tmp, \tmp, #(0xf << ID_AA64MMFR2_EL1_VARange_SHIFT)
-	cbnz	\tmp, .Lskipoffs_\@
-	orr	\ttbr, \ttbr, #TTBR1_BADDR_4852_OFFSET
-.Lskipoffs_\@ :
+#if defined(CONFIG_ARM64_VA_BITS_52) && !defined(CONFIG_ARM64_LPA2)
+	mrs	\tmp, tcr_el1
+	and	\tmp, \tmp, #TCR_T1SZ_MASK
+	cmp	\tmp, #TCR_T1SZ(VA_BITS_MIN)
+	orr	\tmp, \ttbr, #TTBR1_BADDR_4852_OFFSET
+	csel	\ttbr, \tmp, \ttbr, eq
 #endif
 	.endm

@@ -619,25 +614,13 @@
 	.macro	phys_to_pte, pte, phys
 #ifdef CONFIG_ARM64_PA_BITS_52
-	/*
-	 * We assume \phys is 64K aligned and this is guaranteed by only
-	 * supporting this configuration with 64K pages.
-	 */
-	orr	\pte, \phys, \phys, lsr #36
-	and	\pte, \pte, #PTE_ADDR_MASK
+	orr	\pte, \phys, \phys, lsr #PTE_ADDR_HIGH_SHIFT
+	and	\pte, \pte, #PHYS_TO_PTE_ADDR_MASK
 #else
 	mov	\pte, \phys
 #endif
 	.endm

-	.macro	pte_to_phys, phys, pte
-	and	\phys, \pte, #PTE_ADDR_MASK
-#ifdef CONFIG_ARM64_PA_BITS_52
-	orr	\phys, \phys, \phys, lsl #PTE_ADDR_HIGH_SHIFT
-	and	\phys, \phys, GENMASK_ULL(PHYS_MASK_SHIFT - 1, PAGE_SHIFT)
-#endif
-	.endm
-
 /*
  * tcr_clear_errata_bits - Clear TCR bits that trigger an errata on this CPU.
  */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 34fcdbc65d7d..66ba0801f7b7 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -17,6 +17,8 @@

 #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR	0
 #define ARM64_SW_FEATURE_OVERRIDE_HVHE		4
+#define ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF	8
+#define ARM64_SW_FEATURE_OVERRIDE_NOWXN		12

 #ifndef __ASSEMBLY__

@@ -910,7 +912,9 @@ static inline unsigned int get_vmid_bits(u64 mmfr1)
 s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur);
 struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id);

+extern struct arm64_ftr_override id_aa64mmfr0_override;
 extern struct arm64_ftr_override id_aa64mmfr1_override;
+extern struct arm64_ftr_override id_aa64mmfr2_override;
 extern struct arm64_ftr_override id_aa64pfr0_override;
 extern struct arm64_ftr_override id_aa64pfr1_override;
 extern struct arm64_ftr_override id_aa64zfr0_override;
@@ -920,9 +924,121 @@ extern struct arm64_ftr_override id_aa64isar2_override;

 extern struct arm64_ftr_override arm64_sw_feature_override;

+static inline
+u64 arm64_apply_feature_override(u64 val, int feat, int width,
+				 const struct arm64_ftr_override *override)
+{
+	u64 oval = override->val;
+
+	/*
+	 * When it encounters an invalid override (e.g., an override that
+	 * cannot be honoured due to a missing CPU feature), the early idreg
+	 * override code will set the mask to 0x0 and the value to non-zero for
+	 * the field in question. In order to determine whether the override is
+	 * valid or not for the field we are interested in, we first need to
+	 * disregard bits belonging to other fields.
+	 */
+	oval &= GENMASK_ULL(feat + width - 1, feat);
+
+	/*
+	 * The override is valid if all value bits are accounted for in the
+	 * mask. If so, replace the masked bits with the override value.
+	 */
+	if (oval == (oval & override->mask)) {
+		val &= ~override->mask;
+		val |= oval;
+	}
+
+	/* Extract the field from the updated value */
+	return cpuid_feature_extract_unsigned_field(val, feat);
+}
+
+static inline bool arm64_test_sw_feature_override(int feat)
+{
+	/*
+	 * Software features are pseudo CPU features that have no underlying
+	 * CPUID system register value to apply the override to.
+	 */
+	return arm64_apply_feature_override(0, feat, 4,
+					    &arm64_sw_feature_override);
+}
+
+static inline bool kaslr_disabled_cmdline(void)
+{
+	return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOKASLR);
+}
+
+static inline bool arm64_wxn_enabled(void)
+{
+	if (!IS_ENABLED(CONFIG_ARM64_WXN))
+		return false;
+	return !arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOWXN);
+}
+
 u32 get_kvm_ipa_limit(void);
 void dump_cpu_features(void);

+static inline bool cpu_has_bti(void)
+{
+	if (!IS_ENABLED(CONFIG_ARM64_BTI))
+		return false;
+
+	return arm64_apply_feature_override(read_cpuid(ID_AA64PFR1_EL1),
+					    ID_AA64PFR1_EL1_BT_SHIFT, 4,
+					    &id_aa64pfr1_override);
+}
+
+static inline bool cpu_has_pac(void)
+{
+	u64 isar1, isar2;
+
+	if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH))
+		return false;
+
+	isar1 = read_cpuid(ID_AA64ISAR1_EL1);
+	isar2 = read_cpuid(ID_AA64ISAR2_EL1);
+
+	if (arm64_apply_feature_override(isar1, ID_AA64ISAR1_EL1_APA_SHIFT, 4,
+					 &id_aa64isar1_override))
+		return true;
+
+	if (arm64_apply_feature_override(isar1, ID_AA64ISAR1_EL1_API_SHIFT, 4,
+					 &id_aa64isar1_override))
+		return true;
+
+	return arm64_apply_feature_override(isar2, ID_AA64ISAR2_EL1_APA3_SHIFT, 4,
+					    &id_aa64isar2_override);
+}
+
+static inline bool cpu_has_lva(void)
+{
+	u64 mmfr2;
+
+	mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1);
+	mmfr2 &= ~id_aa64mmfr2_override.mask;
+	mmfr2 |= id_aa64mmfr2_override.val;
+	return cpuid_feature_extract_unsigned_field(mmfr2,
+						    ID_AA64MMFR2_EL1_VARange_SHIFT);
+}
+
+static inline bool cpu_has_lpa2(void)
+{
+#ifdef CONFIG_ARM64_LPA2
+	u64 mmfr0;
+	int feat;
+
+	mmfr0 = read_sysreg(id_aa64mmfr0_el1);
+	mmfr0 &= ~id_aa64mmfr0_override.mask;
+	mmfr0 |= id_aa64mmfr0_override.val;
+	feat = cpuid_feature_extract_signed_field(mmfr0,
+						  ID_AA64MMFR0_EL1_TGRAN_SHIFT);
+
+	return feat >= ID_AA64MMFR0_EL1_TGRAN_LPA2;
+#else
+	return false;
+#endif
+}
+
 #endif /* __ASSEMBLY__ */

 #endif
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 353fe08546cf..81606bf7d5ac 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -117,15 +117,9 @@
 #define ESR_ELx_FSC_ACCESS	(0x08)
 #define ESR_ELx_FSC_FAULT	(0x04)
 #define ESR_ELx_FSC_PERM	(0x0C)
-#define ESR_ELx_FSC_SEA_TTW0	(0x14)
-#define ESR_ELx_FSC_SEA_TTW1	(0x15)
-#define ESR_ELx_FSC_SEA_TTW2	(0x16)
-#define ESR_ELx_FSC_SEA_TTW3	(0x17)
+#define ESR_ELx_FSC_SEA_TTW(n)	(0x14 + (n))
 #define ESR_ELx_FSC_SECC	(0x18)
-#define ESR_ELx_FSC_SECC_TTW0	(0x1c)
-#define ESR_ELx_FSC_SECC_TTW1	(0x1d)
-#define ESR_ELx_FSC_SECC_TTW2	(0x1e)
-#define ESR_ELx_FSC_SECC_TTW3	(0x1f)
+#define ESR_ELx_FSC_SECC_TTW(n)	(0x1c + (n))

 /* ISS field definitions for Data Aborts */
 #define ESR_ELx_ISV_SHIFT	(24)
@@ -394,6 +388,9 @@ static inline bool esr_is_data_abort(unsigned long esr)

 static inline bool esr_fsc_is_translation_fault(unsigned long esr)
 {
+	/* Translation fault, level -1 */
+	if ((esr & ESR_ELx_FSC) == 0b101011)
+		return true;
 	return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT;
 }
diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index 58c294a96676..87e307804b99 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -87,6 +87,7 @@ enum fixed_addresses {
 	FIX_PTE,
 	FIX_PMD,
 	FIX_PUD,
+	FIX_P4D,
 	FIX_PGD,

 	__end_of_fixed_addresses
@@ -100,7 +101,6 @@ enum fixed_addresses {
 #define FIXMAP_PAGE_IO	__pgprot(PROT_DEVICE_nGnRE)

 void __init early_fixmap_init(void);
-void __init fixmap_copy(pgd_t *pgdir);

 #define __early_set_fixmap __set_fixmap
diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h
index 7eefc525a9df..e1b57c13f8a4 100644
--- a/arch/arm64/include/asm/kasan.h
+++ b/arch/arm64/include/asm/kasan.h
@@ -17,11 +17,9 @@

 asmlinkage void kasan_early_init(void);
 void kasan_init(void);
-void kasan_copy_shadow(pgd_t *pgdir);

 #else
 static inline void kasan_init(void) { }
-static inline void kasan_copy_shadow(pgd_t *pgdir) { }
 #endif

 #endif
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 83ddb14b95a5..bf05a77873a4 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -13,28 +13,27 @@
 #include <asm/sparsemem.h>

 /*
- * The linear mapping and the start of memory are both 2M aligned (per
- * the arm64 booting.txt requirements). Hence we can use section mapping
- * with 4K (section size = 2M) but not with 16K (section size = 32M) or
- * 64K (section size = 512M).
+ * The physical and virtual addresses of the start of the kernel image are
+ * equal modulo 2 MiB (per the arm64 booting.txt requirements). Hence we can
+ * use section mapping with 4K (section size = 2M) but not with 16K (section
+ * size = 32M) or 64K (section size = 512M).
  */
-
-/*
- * The idmap and swapper page tables need some space reserved in the kernel
- * image. Both require pgd, pud (4 levels only) and pmd tables to (section)
- * map the kernel. With the 64K page configuration, swapper and idmap need to
- * map to pte level. The swapper also maps the FDT (see __create_page_tables
- * for more information). Note that the number of ID map translation levels
- * could be increased on the fly if system RAM is out of reach for the default
- * VA range, so pages required to map highest possible PA are reserved in all
- * cases.
- */
-#ifdef CONFIG_ARM64_4K_PAGES
-#define SWAPPER_PGTABLE_LEVELS	(CONFIG_PGTABLE_LEVELS - 1)
+#if defined(PMD_SIZE) && PMD_SIZE <= MIN_KIMG_ALIGN
+#define SWAPPER_BLOCK_SHIFT	PMD_SHIFT
+#define SWAPPER_SKIP_LEVEL	1
 #else
-#define SWAPPER_PGTABLE_LEVELS	(CONFIG_PGTABLE_LEVELS)
+#define SWAPPER_BLOCK_SHIFT	PAGE_SHIFT
+#define SWAPPER_SKIP_LEVEL	0
 #endif
+#define SWAPPER_BLOCK_SIZE	(UL(1) << SWAPPER_BLOCK_SHIFT)
+#define SWAPPER_TABLE_SHIFT	(SWAPPER_BLOCK_SHIFT + PAGE_SHIFT - 3)
+
+#define SWAPPER_PGTABLE_LEVELS		(CONFIG_PGTABLE_LEVELS - SWAPPER_SKIP_LEVEL)
+#define INIT_IDMAP_PGTABLE_LEVELS	(IDMAP_LEVELS - SWAPPER_SKIP_LEVEL)
+
+#define IDMAP_VA_BITS		48
+#define IDMAP_LEVELS		ARM64_HW_PGTABLE_LEVELS(IDMAP_VA_BITS)
+#define IDMAP_ROOT_LEVEL	(4 - IDMAP_LEVELS)

 /*
  * A relocatable kernel may execute from an address that differs from the one at
@@ -50,57 +49,39 @@
 #define EARLY_ENTRIES(vstart, vend, shift, add) \
 	(SPAN_NR_ENTRIES(vstart, vend, shift) + (add))

-#define EARLY_PGDS(vstart, vend, add) (EARLY_ENTRIES(vstart, vend, PGDIR_SHIFT, add))
-
-#if SWAPPER_PGTABLE_LEVELS > 3
-#define EARLY_PUDS(vstart, vend, add) (EARLY_ENTRIES(vstart, vend, PUD_SHIFT, add))
-#else
-#define EARLY_PUDS(vstart, vend, add) (0)
-#endif
+#define EARLY_LEVEL(lvl, lvls, vstart, vend, add) \
+	(lvls > lvl ? EARLY_ENTRIES(vstart, vend, SWAPPER_BLOCK_SHIFT + lvl * (PAGE_SHIFT - 3), add) : 0)

-#if SWAPPER_PGTABLE_LEVELS > 2
-#define EARLY_PMDS(vstart, vend, add) (EARLY_ENTRIES(vstart, vend, SWAPPER_TABLE_SHIFT, add))
-#else
-#define EARLY_PMDS(vstart, vend, add) (0)
-#endif
+#define EARLY_PAGES(lvls, vstart, vend, add) (1	/* PGDIR page */ \
+	+ EARLY_LEVEL(3, (lvls), (vstart), (vend), add) /* each entry needs a next level page table */ \
+	+ EARLY_LEVEL(2, (lvls), (vstart), (vend), add) /* each entry needs a next level page table */ \
+	+ EARLY_LEVEL(1, (lvls), (vstart), (vend), add))/* each entry needs a next level page table */
+#define INIT_DIR_SIZE (PAGE_SIZE * (EARLY_PAGES(SWAPPER_PGTABLE_LEVELS, KIMAGE_VADDR, _end, EXTRA_PAGE) \
+				    + EARLY_SEGMENT_EXTRA_PAGES))

-#define EARLY_PAGES(vstart, vend, add) ( 1			/* PGDIR page */				\
-			+ EARLY_PGDS((vstart), (vend), add) 	/* each PGDIR needs a next level page table */	\
-			+ EARLY_PUDS((vstart), (vend), add)	/* each PUD needs a next level page table */	\
-			+ EARLY_PMDS((vstart), (vend), add))	/* each PMD needs a next level page table */
-#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end, EXTRA_PAGE))
+#define INIT_IDMAP_DIR_PAGES	(EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, KIMAGE_VADDR, _end, 1))
+#define INIT_IDMAP_DIR_SIZE	((INIT_IDMAP_DIR_PAGES + EARLY_IDMAP_EXTRA_PAGES) * PAGE_SIZE)

-/* the initial ID map may need two extra pages if it needs to be extended */
-#if VA_BITS < 48
-#define INIT_IDMAP_DIR_SIZE	((INIT_IDMAP_DIR_PAGES + 2) * PAGE_SIZE)
-#else
-#define INIT_IDMAP_DIR_SIZE	(INIT_IDMAP_DIR_PAGES * PAGE_SIZE)
-#endif
-#define INIT_IDMAP_DIR_PAGES	EARLY_PAGES(KIMAGE_VADDR, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE, 1)
+#define INIT_IDMAP_FDT_PAGES	(EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, 0UL, UL(MAX_FDT_SIZE), 1) - 1)
+#define INIT_IDMAP_FDT_SIZE	((INIT_IDMAP_FDT_PAGES + EARLY_IDMAP_EXTRA_FDT_PAGES) * PAGE_SIZE)

-/* Initial memory map size */
-#ifdef CONFIG_ARM64_4K_PAGES
-#define SWAPPER_BLOCK_SHIFT	PMD_SHIFT
-#define SWAPPER_BLOCK_SIZE	PMD_SIZE
-#define SWAPPER_TABLE_SHIFT	PUD_SHIFT
-#else
-#define SWAPPER_BLOCK_SHIFT	PAGE_SHIFT
-#define SWAPPER_BLOCK_SIZE	PAGE_SIZE
-#define SWAPPER_TABLE_SHIFT	PMD_SHIFT
-#endif
+/* The number of segments in the kernel image (text, rodata, inittext, initdata, data+bss) */
+#define KERNEL_SEGMENT_COUNT	5

+#if SWAPPER_BLOCK_SIZE > SEGMENT_ALIGN
+#define EARLY_SEGMENT_EXTRA_PAGES (KERNEL_SEGMENT_COUNT + 1)
 /*
- * Initial memory map attributes.
+ * The initial ID map consists of the kernel image, mapped as two separate
+ * segments, and may appear misaligned wrt the swapper block size. This means
+ * we need 3 additional pages. The DT could straddle a swapper block boundary,
+ * so it may need 2.
  */
-#define SWAPPER_PTE_FLAGS	(PTE_TYPE_PAGE | PTE_AF | PTE_SHARED | PTE_UXN)
-#define SWAPPER_PMD_FLAGS	(PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S | PTE_UXN)
-
-#ifdef CONFIG_ARM64_4K_PAGES
-#define SWAPPER_RW_MMUFLAGS	(PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS | PTE_WRITE)
-#define SWAPPER_RX_MMUFLAGS	(SWAPPER_RW_MMUFLAGS | PMD_SECT_RDONLY)
+#define EARLY_IDMAP_EXTRA_PAGES		3
+#define EARLY_IDMAP_EXTRA_FDT_PAGES	2
 #else
-#define SWAPPER_RW_MMUFLAGS	(PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS | PTE_WRITE)
-#define SWAPPER_RX_MMUFLAGS	(SWAPPER_RW_MMUFLAGS | PTE_RDONLY)
+#define EARLY_SEGMENT_EXTRA_PAGES	0
+#define EARLY_IDMAP_EXTRA_PAGES		0
+#define EARLY_IDMAP_EXTRA_FDT_PAGES	0
 #endif

 #endif	/* __ASM_KERNEL_PGTABLE_H */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index b804fe832184..6f5b41c70103 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -425,15 +425,9 @@ static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu)
 {
 	switch (kvm_vcpu_trap_get_fault(vcpu)) {
 	case ESR_ELx_FSC_EXTABT:
-	case ESR_ELx_FSC_SEA_TTW0:
-	case ESR_ELx_FSC_SEA_TTW1:
-	case ESR_ELx_FSC_SEA_TTW2:
-	case ESR_ELx_FSC_SEA_TTW3:
+	case ESR_ELx_FSC_SEA_TTW(-1) ... ESR_ELx_FSC_SEA_TTW(3):
 	case ESR_ELx_FSC_SECC:
-	case ESR_ELx_FSC_SECC_TTW0:
-	case ESR_ELx_FSC_SECC_TTW1:
-	case ESR_ELx_FSC_SECC_TTW2:
-	case ESR_ELx_FSC_SECC_TTW3:
+	case ESR_ELx_FSC_SECC_TTW(-1) ... ESR_ELx_FSC_SECC_TTW(3):
 		return true;
 	default:
 		return false;
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 60904a6c4b42..b850b1b91471 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -54,7 +54,11 @@
 #define FIXADDR_TOP		(-UL(SZ_8M))

 #if VA_BITS > 48
+#ifdef CONFIG_ARM64_16K_PAGES
+#define VA_BITS_MIN		(47)
+#else
 #define VA_BITS_MIN		(48)
+#endif
 #else
 #define VA_BITS_MIN		(VA_BITS)
 #endif
@@ -209,9 +213,20 @@
 #include <asm/boot.h>
 #include <asm/bug.h>
 #include <asm/sections.h>
+#include <asm/sysreg.h>
+
+static inline u64 __pure read_tcr(void)
+{
+	u64 tcr;
+
+	// read_sysreg() uses asm volatile, so avoid it here
+	asm("mrs %0, tcr_el1" : "=r"(tcr));
+	return tcr;
+}

 #if VA_BITS > 48
-extern u64			vabits_actual;
+// For reasons of #include hell, we can't use TCR_T1SZ_OFFSET/TCR_T1SZ_MASK here
+#define vabits_actual		(64 - ((read_tcr() >> 16) & 63))
 #else
 #define vabits_actual		((u64)VA_BITS)
 #endif
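The memory.h hunk above turns vabits_actual from a variable into a TCR_EL1 read: T1SZ occupies bits [21:16], and the effective kernel VA size is 64 - T1SZ. A user-space model of the decode (register values are made up for illustration):

#include <stdio.h>
#include <stdint.h>

static unsigned int vabits_actual(uint64_t tcr)
{
	return 64 - ((tcr >> 16) & 63);	/* T1SZ field, TCR_EL1[21:16] */
}

int main(void)
{
	printf("T1SZ=12 -> %u-bit VAs\n", vabits_actual(12ULL << 16)); /* 52 */
	printf("T1SZ=16 -> %u-bit VAs\n", vabits_actual(16ULL << 16)); /* 48 */
	printf("T1SZ=17 -> %u-bit VAs\n", vabits_actual(17ULL << 16)); /* 47 */
	return 0;
}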
diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index 5966ee4a6154..6d4940342ba7 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -35,11 +35,40 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
 }
 #define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)

+static inline bool arm64_check_wx_prot(unsigned long prot,
+				       struct task_struct *tsk)
+{
+	/*
+	 * When we are running with SCTLR_ELx.WXN==1, writable mappings are
+	 * implicitly non-executable. This means we should reject such mappings
+	 * when user space attempts to create them using mmap() or mprotect().
+	 */
+	if (arm64_wxn_enabled() &&
+	    ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))) {
+		/*
+		 * User space libraries such as libffi carry elaborate
+		 * heuristics to decide whether it is worth it to even attempt
+		 * to create writable executable mappings, as PaX or selinux
+		 * enabled systems will outright reject it. They will usually
+		 * fall back to something else (e.g., two separate shared
+		 * mmap()s of a temporary file) on failure.
+		 */
+		pr_info_ratelimited(
+			"process %s (%d) attempted to create PROT_WRITE+PROT_EXEC mapping\n",
+			tsk->comm, tsk->pid);
+		return false;
+	}
+	return true;
+}
+
 static inline bool arch_validate_prot(unsigned long prot,
 	unsigned long addr __always_unused)
 {
 	unsigned long supported = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM;

+	if (!arm64_check_wx_prot(prot, current))
+		return false;
+
 	if (system_supports_bti())
 		supported |= PROT_BTI;

@@ -50,6 +79,13 @@ static inline bool arch_validate_prot(unsigned long prot,
 }
 #define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr)

+static inline bool arch_validate_mmap_prot(unsigned long prot,
+					   unsigned long addr)
+{
+	return arm64_check_wx_prot(prot, current);
+}
+#define arch_validate_mmap_prot arch_validate_mmap_prot
+
 static inline bool arch_validate_flags(unsigned long vm_flags)
 {
 	if (!system_supports_mte())
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 2fcf51231d6e..65977c7783c5 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -71,10 +71,46 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			       pgprot_t prot, bool page_mappings_only);
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
 extern void mark_linear_text_alias_ro(void);
-extern bool kaslr_requires_kpti(void);
+
+/*
+ * This check is triggered during the early boot before the cpufeature
+ * is initialised. Checking the status on the local CPU allows the boot
+ * CPU to detect the need for non-global mappings and thus avoiding a
+ * pagetable re-write after all the CPUs are booted. This check will be
+ * anyway run on individual CPUs, allowing us to get the consistent
+ * state once the SMP CPUs are up and thus make the switch to non-global
+ * mappings if required.
+ */
+static inline bool kaslr_requires_kpti(void)
+{
+	/*
+	 * E0PD does a similar job to KPTI so can be used instead
+	 * where available.
+	 */
+	if (IS_ENABLED(CONFIG_ARM64_E0PD)) {
+		u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1);
+		if (cpuid_feature_extract_unsigned_field(mmfr2,
+						ID_AA64MMFR2_EL1_E0PD_SHIFT))
+			return false;
+	}
+
+	/*
+	 * Systems affected by Cavium erratum 24756 are incompatible
+	 * with KPTI.
+	 */
+	if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) {
+		extern const struct midr_range cavium_erratum_27456_cpus[];
+
+		if (is_midr_in_range_list(read_cpuid_id(),
+					  cavium_erratum_27456_cpus))
+			return false;
+	}
+
+	return true;
+}

 #define INIT_MM_CONTEXT(name)	\
-	.pgd = init_pg_dir,
+	.pgd = swapper_pg_dir,

 #endif	/* !__ASSEMBLY__ */
 #endif
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 9ce4200508b1..f0fe2d09d139 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -20,13 +20,41 @@
 #include <asm/cpufeature.h>
 #include <asm/daifflags.h>
 #include <asm/proc-fns.h>
-#include <asm-generic/mm_hooks.h>
 #include <asm/cputype.h>
 #include <asm/sysreg.h>
 #include <asm/tlbflush.h>

 extern bool rodata_full;

+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+				struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+}
+
+static inline void arch_unmap(struct mm_struct *mm,
+			unsigned long start, unsigned long end)
+{
+}
+
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+		bool write, bool execute, bool foreign)
+{
+	if (IS_ENABLED(CONFIG_ARM64_WXN) && execute &&
+	    (vma->vm_flags & (VM_WRITE | VM_EXEC)) == (VM_WRITE | VM_EXEC)) {
+		pr_warn_ratelimited(
+			"process %s (%d) attempted to execute from writable memory\n",
+			current->comm, current->pid);
+		/* disallow unless the nowxn override is set */
+		return !arm64_wxn_enabled();
+	}
+	return true;
+}
+
 static inline void contextidr_thread_switch(struct task_struct *next)
 {
 	if (!IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR))
@@ -61,11 +89,9 @@ static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm)
 }

 /*
- * TCR.T0SZ value to use when the ID map is active. Usually equals
- * TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in
- * physical memory, in which case it will be smaller.
+ * TCR.T0SZ value to use when the ID map is active.
  */
-extern int idmap_t0sz;
+#define idmap_t0sz	TCR_T0SZ(IDMAP_VA_BITS)

 /*
  * Ensure TCR.T0SZ is set to the provided value.
@@ -110,18 +136,13 @@ static inline void cpu_uninstall_idmap(void)
 		cpu_switch_mm(mm->pgd, mm);
 }

-static inline void __cpu_install_idmap(pgd_t *idmap)
+static inline void cpu_install_idmap(void)
 {
 	cpu_set_reserved_ttbr0();
 	local_flush_tlb_all();
 	cpu_set_idmap_tcr_t0sz();

-	cpu_switch_mm(lm_alias(idmap), &init_mm);
-}
-
-static inline void cpu_install_idmap(void)
-{
-	__cpu_install_idmap(idmap_pg_dir);
+	cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm);
 }

 /*
@@ -148,51 +169,21 @@ static inline void cpu_install_ttbr0(phys_addr_t ttbr0, unsigned long t0sz)
 	isb();
 }

-/*
- * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
- * avoiding the possibility of conflicting TLB entries being allocated.
- */
-static inline void __cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap, bool cnp)
-{
-	typedef void (ttbr_replace_func)(phys_addr_t);
-	extern ttbr_replace_func idmap_cpu_replace_ttbr1;
-	ttbr_replace_func *replace_phys;
-	unsigned long daif;
-
-	/* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */
-	phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
-
-	if (cnp)
-		ttbr1 |= TTBR_CNP_BIT;
-
-	replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
-
-	__cpu_install_idmap(idmap);
-
-	/*
-	 * We really don't want to take *any* exceptions while TTBR1 is
-	 * in the process of being replaced so mask everything.
-	 */
-	daif = local_daif_save();
-	replace_phys(ttbr1);
-	local_daif_restore(daif);
-
-	cpu_uninstall_idmap();
-}
+void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp);

 static inline void cpu_enable_swapper_cnp(void)
 {
-	__cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir, true);
+	__cpu_replace_ttbr1(lm_alias(swapper_pg_dir), true);
 }

-static inline void cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap)
+static inline void cpu_replace_ttbr1(pgd_t *pgdp)
 {
 	/*
 	 * Only for early TTBR1 replacement before cpucaps are finalized and
 	 * before we've decided whether to use CNP.
 	 */
 	WARN_ON(system_capabilities_finalized());

-	__cpu_replace_ttbr1(pgdp, idmap, false);
+	__cpu_replace_ttbr1(pgdp, false);
 }

 /*
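The mman.h and mmu_context.h hooks above mean that, on a CONFIG_ARM64_WXN=y kernel without the arm64.nowxn override, user space can no longer create PROT_WRITE+PROT_EXEC mappings. A user-space test sketch; the exact errno is not spelled out in the hunk, so the failure mode here is my assumption:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(PROT_WRITE|PROT_EXEC)"); /* expected with WXN on */
		return 1;
	}
	printf("W+X mapping created at %p (WXN off or unsupported)\n", p);
	munmap(p, 4096);
	return 0;
}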
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 237224484d0f..8ff5f2a2579e 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -14,6 +14,7 @@
 #include <asm/tlbflush.h>

 #define __HAVE_ARCH_PGD_FREE
+#define __HAVE_ARCH_PUD_FREE
 #include <asm-generic/pgalloc.h>

 #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
@@ -43,7 +44,8 @@ static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)

 static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
 {
-	set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot));
+	if (pgtable_l4_enabled())
+		set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot));
 }

 static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp)
@@ -53,6 +55,13 @@ static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp)
 	p4dval |= (mm == &init_mm) ? P4D_TABLE_UXN : P4D_TABLE_PXN;
 	__p4d_populate(p4dp, __pa(pudp), p4dval);
 }
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+	if (!pgtable_l4_enabled())
+		return;
+	__pud_free(mm, pud);
+}
 #else
 static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
 {
@@ -60,6 +69,47 @@ static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
 }
 #endif	/* CONFIG_PGTABLE_LEVELS > 3 */

+#if CONFIG_PGTABLE_LEVELS > 4
+
+static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot)
+{
+	if (pgtable_l5_enabled())
+		set_pgd(pgdp, __pgd(__phys_to_pgd_val(p4dp) | prot));
+}
+
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp)
+{
+	pgdval_t pgdval = PGD_TYPE_TABLE;
+
+	pgdval |= (mm == &init_mm) ? PGD_TABLE_UXN : PGD_TABLE_PXN;
+	__pgd_populate(pgdp, __pa(p4dp), pgdval);
+}
+
+static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	gfp_t gfp = GFP_PGTABLE_USER;
+
+	if (mm == &init_mm)
+		gfp = GFP_PGTABLE_KERNEL;
+	return (p4d_t *)get_zeroed_page(gfp);
+}
+
+static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
+{
+	if (!pgtable_l5_enabled())
+		return;
+	BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
+	free_page((unsigned long)p4d);
+}
+
+#define __p4d_free_tlb(tlb, p4d, addr)  p4d_free((tlb)->mm, p4d)
+#else
+static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot)
+{
+	BUILD_BUG();
+}
+#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
+
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index e4944d517c99..ef207a0d4f0d 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -26,10 +26,10 @@
 #define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 3))

 /*
- * Size mapped by an entry at level n ( 0 <= n <= 3)
+ * Size mapped by an entry at level n ( -1 <= n <= 3)
  * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits
  * in the final page. The maximum number of translation levels supported by
- * the architecture is 4. Hence, starting at level n, we have further
+ * the architecture is 5. Hence, starting at level n, we have further
  * ((4 - n) - 1) levels of translation excluding the offset within the page.
  * So, the total number of bits mapped by an entry at level n is :
  *
@@ -62,9 +62,16 @@
 #define PTRS_PER_PUD		(1 << (PAGE_SHIFT - 3))
 #endif

+#if CONFIG_PGTABLE_LEVELS > 4
+#define P4D_SHIFT		ARM64_HW_PGTABLE_LEVEL_SHIFT(0)
+#define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK		(~(P4D_SIZE-1))
+#define PTRS_PER_P4D		(1 << (PAGE_SHIFT - 3))
+#endif
+
 /*
  * PGDIR_SHIFT determines the size a top-level page table entry can map
- * (depending on the configuration, this level can be 0, 1 or 2).
+ * (depending on the configuration, this level can be -1, 0, 1 or 2).
  */
 #define PGDIR_SHIFT		ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - CONFIG_PGTABLE_LEVELS)
 #define PGDIR_SIZE		(_AC(1, UL) << PGDIR_SHIFT)
@@ -87,6 +94,15 @@
 /*
  * Hardware page table definitions.
  *
+ * Level -1 descriptor (PGD).
+ */
+#define PGD_TYPE_TABLE		(_AT(pgdval_t, 3) << 0)
+#define PGD_TABLE_BIT		(_AT(pgdval_t, 1) << 1)
+#define PGD_TYPE_MASK		(_AT(pgdval_t, 3) << 0)
+#define PGD_TABLE_PXN		(_AT(pgdval_t, 1) << 59)
+#define PGD_TABLE_UXN		(_AT(pgdval_t, 1) << 60)
+
+/*
  * Level 0 descriptor (P4D).
  */
 #define P4D_TYPE_TABLE		(_AT(p4dval_t, 3) << 0)
@@ -155,13 +171,17 @@
 #define PTE_PXN			(_AT(pteval_t, 1) << 53)	/* Privileged XN */
 #define PTE_UXN			(_AT(pteval_t, 1) << 54)	/* User XN */

-#define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
+#define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (50 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
 #ifdef CONFIG_ARM64_PA_BITS_52
+#ifdef CONFIG_ARM64_64K_PAGES
 #define PTE_ADDR_HIGH		(_AT(pteval_t, 0xf) << 12)
-#define PTE_ADDR_MASK		(PTE_ADDR_LOW | PTE_ADDR_HIGH)
 #define PTE_ADDR_HIGH_SHIFT	36
+#define PHYS_TO_PTE_ADDR_MASK	(PTE_ADDR_LOW | PTE_ADDR_HIGH)
 #else
-#define PTE_ADDR_MASK		PTE_ADDR_LOW
+#define PTE_ADDR_HIGH		(_AT(pteval_t, 0x3) << 8)
+#define PTE_ADDR_HIGH_SHIFT	42
+#define PHYS_TO_PTE_ADDR_MASK	GENMASK_ULL(49, 8)
+#endif
 #endif

 /*
@@ -284,6 +304,7 @@
 #define TCR_E0PD1		(UL(1) << 56)
 #define TCR_TCMA0		(UL(1) << 57)
 #define TCR_TCMA1		(UL(1) << 58)
+#define TCR_DS			(UL(1) << 59)

 /*
  * TTBR.
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 483dbfa39c4c..dd9ee67d1d87 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -30,8 +30,8 @@
 #define _PROT_DEFAULT		(PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
 #define _PROT_SECT_DEFAULT	(PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)

-#define PROT_DEFAULT		(_PROT_DEFAULT | PTE_MAYBE_NG)
-#define PROT_SECT_DEFAULT	(_PROT_SECT_DEFAULT | PMD_MAYBE_NG)
+#define PROT_DEFAULT		(PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF)
+#define PROT_SECT_DEFAULT	(PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF)

 #define PROT_DEVICE_nGnRnE	(PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE))
 #define PROT_DEVICE_nGnRE	(PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE))
@@ -57,10 +57,6 @@
 #define _PAGE_READONLY_EXEC	(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN)
 #define _PAGE_EXECONLY		(_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN)

-#ifdef __ASSEMBLY__
-#define PTE_MAYBE_NG	0
-#endif
-
 #ifndef __ASSEMBLY__

 #include <asm/cpufeature.h>
@@ -71,7 +67,19 @@ extern bool arm64_use_ng_mappings;
 #define PTE_MAYBE_NG		(arm64_use_ng_mappings ? PTE_NG : 0)
 #define PMD_MAYBE_NG		(arm64_use_ng_mappings ? PMD_SECT_NG : 0)

+#ifndef CONFIG_ARM64_LPA2
 #define lpa2_is_enabled()	false
+#define PTE_MAYBE_SHARED	PTE_SHARED
+#define PMD_MAYBE_SHARED	PMD_SECT_S
+#else
+static inline bool __pure lpa2_is_enabled(void)
+{
+	return read_tcr() & TCR_DS;
+}
+
+#define PTE_MAYBE_SHARED	(lpa2_is_enabled() ? 0 : PTE_SHARED)
+#define PMD_MAYBE_SHARED	(lpa2_is_enabled() ? 0 : PMD_SECT_S)
+#endif

 /*
  * If we have userspace only BTI we don't want to mark kernel pages
diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
index b8f158ae2527..6d6d4065b0cb 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -36,6 +36,12 @@ typedef struct { pudval_t pud; } pud_t;
 #define __pud(x)	((pud_t) { (x) } )
 #endif

+#if CONFIG_PGTABLE_LEVELS > 4
+typedef struct { p4dval_t p4d; } p4d_t;
+#define p4d_val(x)	((x).p4d)
+#define __p4d(x)	((p4d_t) { (x) } )
+#endif
+
 typedef struct { pgdval_t pgd; } pgd_t;
 #define pgd_val(x)	((x).pgd)
 #define __pgd(x)	((pgd_t) { (x) } )
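For 52-bit PAs on the 4K/16K granules, the pgtable-hwdef.h hunk above folds PA[51:50] into pte bits [9:8] - the old shareability field, which becomes unused once TCR_EL1.DS is set - and the pgtable.h hunk below strips PTE_MAYBE_SHARED before extracting the address. A round-trip sketch of the 4K-granule packing (constants copied from the hunk, GENMASK_ULL reimplemented for user space):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l)	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))
#define PTE_ADDR_LOW		GENMASK_ULL(49, 12)	/* 4K pages */
#define PTE_ADDR_HIGH		GENMASK_ULL(9, 8)
#define PTE_ADDR_HIGH_SHIFT	42
#define PHYS_TO_PTE_ADDR_MASK	GENMASK_ULL(49, 8)

static uint64_t phys_to_pte(uint64_t phys)	/* cf. __phys_to_pte_val() */
{
	return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PHYS_TO_PTE_ADDR_MASK;
}

static uint64_t pte_to_phys(uint64_t pte)	/* cf. __pte_to_phys() */
{
	return (pte & PTE_ADDR_LOW) |
	       ((pte & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT);
}

int main(void)
{
	uint64_t phys = (3ULL << 50) | 0x12345000;	/* PA[51:50] set */
	uint64_t pte = phys_to_pte(phys);

	printf("phys %#llx -> pte %#llx\n", (unsigned long long)phys,
	       (unsigned long long)pte);
	assert(pte_to_phys(pte) == phys);
	return 0;
}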
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 522c21348ae8..8bec85350865 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -80,15 +80,16 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 #ifdef CONFIG_ARM64_PA_BITS_52
 static inline phys_addr_t __pte_to_phys(pte_t pte)
 {
+	pte_val(pte) &= ~PTE_MAYBE_SHARED;
 	return (pte_val(pte) & PTE_ADDR_LOW) |
 		((pte_val(pte) & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT);
 }

 static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
 {
-	return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PTE_ADDR_MASK;
+	return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PHYS_TO_PTE_ADDR_MASK;
 }
 #else
-#define __pte_to_phys(pte)	(pte_val(pte) & PTE_ADDR_MASK)
+#define __pte_to_phys(pte)	(pte_val(pte) & PTE_ADDR_LOW)
 #define __phys_to_pte_val(phys)	(phys)
 #endif
@@ -620,12 +621,12 @@ static inline bool pud_table(pud_t pud) { return true; }
 				 PUD_TYPE_TABLE)
 #endif

-extern pgd_t init_pg_dir[PTRS_PER_PGD];
+extern pgd_t init_pg_dir[];
 extern pgd_t init_pg_end[];
-extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
-extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
-extern pgd_t tramp_pg_dir[PTRS_PER_PGD];
-extern pgd_t reserved_pg_dir[PTRS_PER_PGD];
+extern pgd_t swapper_pg_dir[];
+extern pgd_t idmap_pg_dir[];
+extern pgd_t tramp_pg_dir[];
+extern pgd_t reserved_pg_dir[];

 extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd);
@@ -698,14 +699,14 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 #define pud_user(pud)		pte_user(pud_pte(pud))
 #define pud_user_exec(pud)	pte_user_exec(pud_pte(pud))

+static inline bool pgtable_l4_enabled(void);
+
 static inline void set_pud(pud_t *pudp, pud_t pud)
 {
-#ifdef __PAGETABLE_PUD_FOLDED
-	if (in_swapper_pgdir(pudp)) {
+	if (!pgtable_l4_enabled() && in_swapper_pgdir(pudp)) {
 		set_swapper_pgd((pgd_t *)pudp, __pgd(pud_val(pud)));
 		return;
 	}
-#endif /* __PAGETABLE_PUD_FOLDED */

 	WRITE_ONCE(*pudp, pud);
@@ -758,12 +759,27 @@ static inline pmd_t *pud_pgtable(pud_t pud)

 #if CONFIG_PGTABLE_LEVELS > 3

+static __always_inline bool pgtable_l4_enabled(void)
+{
+	if (CONFIG_PGTABLE_LEVELS > 4 || !IS_ENABLED(CONFIG_ARM64_LPA2))
+		return true;
+	if (!alternative_has_cap_likely(ARM64_ALWAYS_BOOT))
+		return vabits_actual == VA_BITS;
+	return alternative_has_cap_unlikely(ARM64_HAS_VA52);
+}
+
+static inline bool mm_pud_folded(const struct mm_struct *mm)
+{
+	return !pgtable_l4_enabled();
+}
+#define mm_pud_folded  mm_pud_folded
+
 #define pud_ERROR(e)	\
 	pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e))

-#define p4d_none(p4d)		(!p4d_val(p4d))
-#define p4d_bad(p4d)		(!(p4d_val(p4d) & 2))
-#define p4d_present(p4d)	(p4d_val(p4d))
+#define p4d_none(p4d)		(pgtable_l4_enabled() && !p4d_val(p4d))
+#define p4d_bad(p4d)		(pgtable_l4_enabled() && !(p4d_val(p4d) & 2))
+#define p4d_present(p4d)	(!p4d_none(p4d))

 static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
@@ -779,7 +795,8 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)

 static inline void p4d_clear(p4d_t *p4dp)
 {
-	set_p4d(p4dp, __p4d(0));
+	if (pgtable_l4_enabled())
+		set_p4d(p4dp, __p4d(0));
 }

 static inline phys_addr_t p4d_page_paddr(p4d_t p4d)
@@ -787,27 +804,75 @@
 	return __p4d_to_phys(p4d);
 }

+#define pud_index(addr)		(((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+
+static inline pud_t *p4d_to_folded_pud(p4d_t *p4dp, unsigned long addr)
+{
+	return (pud_t *)PTR_ALIGN_DOWN(p4dp, PAGE_SIZE) + pud_index(addr);
+}
+
 static inline pud_t *p4d_pgtable(p4d_t p4d)
 {
 	return (pud_t *)__va(p4d_page_paddr(p4d));
 }

-/* Find an entry in the first-level page table. */
-#define pud_offset_phys(dir, addr)	(p4d_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t))
+static inline phys_addr_t pud_offset_phys(p4d_t *p4dp, unsigned long addr)
+{
+	BUG_ON(!pgtable_l4_enabled());

-#define pud_set_fixmap(addr)		((pud_t *)set_fixmap_offset(FIX_PUD, addr))
-#define pud_set_fixmap_offset(p4d, addr)	pud_set_fixmap(pud_offset_phys(p4d, addr))
-#define pud_clear_fixmap()		clear_fixmap(FIX_PUD)
+	return p4d_page_paddr(READ_ONCE(*p4dp)) + pud_index(addr) * sizeof(pud_t);
+}

-#define p4d_page(p4d)		pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d)))
+static inline
+pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long addr)
+{
+	if (!pgtable_l4_enabled())
+		return p4d_to_folded_pud(p4dp, addr);
+	return (pud_t *)__va(p4d_page_paddr(p4d)) + pud_index(addr);
+}
+#define pud_offset_lockless pud_offset_lockless
+
+static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long addr)
+{
+	return pud_offset_lockless(p4dp, READ_ONCE(*p4dp), addr);
+}
+#define pud_offset	pud_offset
+
+static inline pud_t *pud_set_fixmap(unsigned long addr)
+{
+	if (!pgtable_l4_enabled())
+		return NULL;
+	return (pud_t *)set_fixmap_offset(FIX_PUD, addr);
+}
+
+static inline pud_t *pud_set_fixmap_offset(p4d_t *p4dp, unsigned long addr)
+{
+	if (!pgtable_l4_enabled())
+		return p4d_to_folded_pud(p4dp, addr);
+	return pud_set_fixmap(pud_offset_phys(p4dp, addr));
+}
+
+static inline void pud_clear_fixmap(void)
+{
+	if (pgtable_l4_enabled())
+		clear_fixmap(FIX_PUD);
+}

 /* use ONLY for statically allocated translation tables */
-#define pud_offset_kimg(dir,addr)	((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr))))
+static inline pud_t *pud_offset_kimg(p4d_t *p4dp, u64 addr)
+{
+	if (!pgtable_l4_enabled())
+		return p4d_to_folded_pud(p4dp, addr);
+	return (pud_t *)__phys_to_kimg(pud_offset_phys(p4dp, addr));
+}
+
+#define p4d_page(p4d)		pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d)))

 #else

+static inline bool pgtable_l4_enabled(void) { return false; }
+
 #define p4d_page_paddr(p4d)	({ BUILD_BUG(); 0;})
-#define pgd_page_paddr(pgd)	({ BUILD_BUG(); 0;})

 /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
 #define pud_set_fixmap(addr)		NULL
@@ -818,6 +883,122 @@ static inline pud_t *p4d_pgtable(p4d_t p4d)

 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */

+#if CONFIG_PGTABLE_LEVELS > 4
+
+static __always_inline bool pgtable_l5_enabled(void)
+{
+	if (!alternative_has_cap_likely(ARM64_ALWAYS_BOOT))
+		return vabits_actual == VA_BITS;
+	return alternative_has_cap_unlikely(ARM64_HAS_VA52);
+}
+
+static inline bool mm_p4d_folded(const struct mm_struct *mm)
+{
+	return !pgtable_l5_enabled();
+}
+#define mm_p4d_folded  mm_p4d_folded
+
+#define p4d_ERROR(e)	\
+	pr_err("%s:%d: bad p4d %016llx.\n", __FILE__, __LINE__, p4d_val(e))
+
+#define pgd_none(pgd)		(pgtable_l5_enabled() && !pgd_val(pgd))
+#define pgd_bad(pgd)		(pgtable_l5_enabled() && !(pgd_val(pgd) & 2))
+#define pgd_present(pgd)	(!pgd_none(pgd))
+
+static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	if (in_swapper_pgdir(pgdp)) {
+		set_swapper_pgd(pgdp, __pgd(pgd_val(pgd)));
+		return;
+	}
+
+	WRITE_ONCE(*pgdp, pgd);
+	dsb(ishst);
+	isb();
+}
+
+static inline void pgd_clear(pgd_t *pgdp)
+{
+	if (pgtable_l5_enabled())
+		set_pgd(pgdp, __pgd(0));
+}
+
+static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
+{
+	return __pgd_to_phys(pgd);
+}
+
+#define p4d_index(addr)		(((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1))
+
+static inline p4d_t *pgd_to_folded_p4d(pgd_t *pgdp, unsigned long addr)
+{
+	return (p4d_t *)PTR_ALIGN_DOWN(pgdp, PAGE_SIZE) + p4d_index(addr);
+}
+
+static inline phys_addr_t p4d_offset_phys(pgd_t *pgdp, unsigned long addr)
+{
+	BUG_ON(!pgtable_l5_enabled());
+
+	return pgd_page_paddr(READ_ONCE(*pgdp)) + p4d_index(addr) * sizeof(p4d_t);
+}
+
+static inline
+p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long addr)
+{
+	if (!pgtable_l5_enabled())
+		return pgd_to_folded_p4d(pgdp, addr);
+	return (p4d_t *)__va(pgd_page_paddr(pgd)) + p4d_index(addr);
+}
+#define p4d_offset_lockless p4d_offset_lockless
+
+static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long addr)
+{
+	return p4d_offset_lockless(pgdp, READ_ONCE(*pgdp), addr);
+}
+
+static inline p4d_t *p4d_set_fixmap(unsigned long addr)
+{
+	if (!pgtable_l5_enabled())
+		return NULL;
+	return (p4d_t *)set_fixmap_offset(FIX_P4D, addr);
+}
+
+static inline p4d_t *p4d_set_fixmap_offset(pgd_t *pgdp, unsigned long addr)
+{
+	if (!pgtable_l5_enabled())
+		return pgd_to_folded_p4d(pgdp, addr);
+	return p4d_set_fixmap(p4d_offset_phys(pgdp, addr));
+}
+
+static inline void p4d_clear_fixmap(void)
+{
+	if (pgtable_l5_enabled())
+		clear_fixmap(FIX_P4D);
+}
+
+/* use ONLY for statically allocated translation tables */
+static inline p4d_t *p4d_offset_kimg(pgd_t *pgdp, u64 addr)
+{
+	if (!pgtable_l5_enabled())
+		return pgd_to_folded_p4d(pgdp, addr);
+	return (p4d_t *)__phys_to_kimg(p4d_offset_phys(pgdp, addr));
+}
+
+#define pgd_page(pgd)		pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd)))
+
+#else
+
+static inline bool pgtable_l5_enabled(void) { return false; }
+
+/* Match p4d_offset folding in <asm/generic/pgtable-nop4d.h> */
+#define p4d_set_fixmap(addr)		NULL
+#define p4d_set_fixmap_offset(p4dp, addr)	((p4d_t *)p4dp)
+#define p4d_clear_fixmap()
+
+#define p4d_offset_kimg(dir,addr)	((p4d_t *)dir)
+
+#endif  /* CONFIG_PGTABLE_LEVELS > 4 */
+
 #define pgd_ERROR(e)	\
 	pr_err("%s:%d: bad pgd %016llx.\n", __FILE__, __LINE__, pgd_val(e))
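The pgtable.h rework above makes PUD and P4D folding a runtime decision: when the 52-bit capability is absent, pud_offset_lockless()/p4d_offset_lockless() never dereference the upper-level entry - they realign the entry pointer down to the start of the containing table page and index that same table with the next level's address bits. A toy user-space model of the p4d_to_folded_pud() trick (table size and shift assume a 4K granule; everything else is mine):

#include <stdio.h>
#include <stdint.h>

#define ENTRIES		512	/* 4K granule: 2^(PAGE_SHIFT - 3) entries */
#define PUD_SHIFT	30

static uint64_t root_table[ENTRIES] __attribute__((aligned(4096)));

/* cf. p4d_to_folded_pud(): realign to the table base, index by pud_index() */
static uint64_t *folded_pud(uint64_t *p4dp, uint64_t addr)
{
	uint64_t *base = (uint64_t *)((uintptr_t)p4dp & ~(uintptr_t)4095);

	return base + ((addr >> PUD_SHIFT) & (ENTRIES - 1));
}

int main(void)
{
	uint64_t addr = 0x0000004012345678ULL;
	uint64_t *slot = folded_pud(&root_table[7], addr);	/* any entry */

	printf("pud slot index: %td\n", slot - root_table);
	return 0;
}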
diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h
index 3fdae5fe3142..2e010ea76be2 100644
--- a/arch/arm64/include/asm/scs.h
+++ b/arch/arm64/include/asm/scs.h
@@ -33,37 +33,11 @@
 #include <asm/cpufeature.h>

 #ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS
-static inline bool should_patch_pac_into_scs(void)
-{
-	u64 reg;
-
-	/*
-	 * We only enable the shadow call stack dynamically if we are running
-	 * on a system that does not implement PAC or BTI. PAC and SCS provide
-	 * roughly the same level of protection, and BTI relies on the PACIASP
-	 * instructions serving as landing pads, preventing us from patching
-	 * those instructions into something else.
-	 */
-	reg = read_sysreg_s(SYS_ID_AA64ISAR1_EL1);
-	if (SYS_FIELD_GET(ID_AA64ISAR1_EL1, APA, reg) |
-	    SYS_FIELD_GET(ID_AA64ISAR1_EL1, API, reg))
-		return false;
-
-	reg = read_sysreg_s(SYS_ID_AA64ISAR2_EL1);
-	if (SYS_FIELD_GET(ID_AA64ISAR2_EL1, APA3, reg))
-		return false;
-
-	if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) {
-		reg = read_sysreg_s(SYS_ID_AA64PFR1_EL1);
-		if (reg & (0xf << ID_AA64PFR1_EL1_BT_SHIFT))
-			return false;
-	}
-	return true;
-}
-
 static inline void dynamic_scs_init(void)
 {
-	if (should_patch_pac_into_scs()) {
+	extern bool __pi_dynamic_scs_is_enabled;
+
+	if (__pi_dynamic_scs_is_enabled) {
 		pr_info("Enabling dynamic shadow call stack\n");
 		static_branch_enable(&dynamic_scs_enabled);
 	}
@@ -72,8 +46,8 @@ static inline void dynamic_scs_init(void)
 static inline void dynamic_scs_init(void) {}
 #endif

-int scs_patch(const u8 eh_frame[], int size);
-asmlinkage void scs_patch_vmlinux(void);
+int __pi_scs_patch(const u8 eh_frame[], int size);
+asmlinkage void __pi_scs_patch_vmlinux(void);

 #endif /* __ASSEMBLY __ */
diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h
index 2e4d7da74fb8..ba269a7a3201 100644
--- a/arch/arm64/include/asm/setup.h
+++ b/arch/arm64/include/asm/setup.h
@@ -7,9 +7,6 @@

 #include <uapi/asm/setup.h>

-void *get_early_fdt_ptr(void);
-void early_fdt_map(u64 dt_phys);
-
 /*
  * These two variables are used in the head.S file.
  */
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 0150deb332af..a947c6e784ed 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -103,6 +103,9 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
 {
 	struct ptdesc *ptdesc = virt_to_ptdesc(pudp);

+	if (!pgtable_l4_enabled())
+		return;
+
 	pagetable_pud_dtor(ptdesc);
 	tlb_remove_ptdesc(tlb, ptdesc);
 }
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 467cb7117273..14b4a179bad3 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -33,8 +33,7 @@
 obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
 			   return_address.o cpuinfo.o cpu_errata.o		\
 			   cpufeature.o alternative.o cacheinfo.o		\
 			   smp.o smp_spin_table.o topology.o smccc-call.o	\
-			   syscall.o proton-pack.o idreg-override.o idle.o	\
-			   patching.o
+			   syscall.o proton-pack.o idle.o patching.o pi/

 obj-$(CONFIG_COMPAT)			+= sys32.o signal32.o			\
 					   sys_compat.o
@@ -57,7 +56,7 @@ obj-$(CONFIG_ACPI)			+= acpi.o
 obj-$(CONFIG_ACPI_NUMA)			+= acpi_numa.o
 obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL)	+= acpi_parking_protocol.o
 obj-$(CONFIG_PARAVIRT)			+= paravirt.o
-obj-$(CONFIG_RANDOMIZE_BASE)		+= kaslr.o pi/
+obj-$(CONFIG_RANDOMIZE_BASE)		+= kaslr.o
 obj-$(CONFIG_HIBERNATION)		+= hibernate.o hibernate-asm.o
 obj-$(CONFIG_ELF_CORE)			+= elfcore.o
 obj-$(CONFIG_KEXEC_CORE)		+= machine_kexec.o relocate_kernel.o	\
@@ -72,14 +71,6 @@ obj-$(CONFIG_ARM64_PTR_AUTH)		+= pointer_auth.o
 obj-$(CONFIG_ARM64_MTE)			+= mte.o
 obj-y					+= vdso-wrap.o
 obj-$(CONFIG_COMPAT_VDSO)		+= vdso32-wrap.o
-obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS)	+= patch-scs.o
-
-# We need to prevent the SCS patching code from patching itself. Using
-# -mbranch-protection=none here to avoid the patchable PAC opcodes from being
-# generated triggers an issue with full LTO on Clang, which stops emitting PAC
-# instructions altogether. So disable LTO as well for the compilation unit.
-CFLAGS_patch-scs.o			+= -mbranch-protection=none
-CFLAGS_REMOVE_patch-scs.o		+= $(CC_FLAGS_LTO)

 # Force dependency (vdso*-wrap.S includes vdso.so through incbin)
 $(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6e1cca7b2098..d6679d8b737e 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -688,13 +688,15 @@ static const struct arm64_ftr_bits ftr_raz[] = {
 #define ARM64_FTR_REG(id, table)		\
 	__ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override)

-struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override;
-struct arm64_ftr_override __ro_after_init id_aa64pfr0_override;
-struct arm64_ftr_override __ro_after_init id_aa64pfr1_override;
-struct arm64_ftr_override __ro_after_init id_aa64zfr0_override;
-struct arm64_ftr_override __ro_after_init id_aa64smfr0_override;
-struct arm64_ftr_override __ro_after_init id_aa64isar1_override;
-struct arm64_ftr_override __ro_after_init id_aa64isar2_override;
+struct arm64_ftr_override id_aa64mmfr0_override;
+struct arm64_ftr_override id_aa64mmfr1_override;
+struct arm64_ftr_override id_aa64mmfr2_override;
+struct arm64_ftr_override id_aa64pfr0_override;
+struct arm64_ftr_override id_aa64pfr1_override;
+struct arm64_ftr_override id_aa64zfr0_override;
+struct arm64_ftr_override id_aa64smfr0_override;
+struct arm64_ftr_override id_aa64isar1_override;
+struct arm64_ftr_override id_aa64isar2_override;

 struct arm64_ftr_override arm64_sw_feature_override;

@@ -755,10 +757,12 @@ static const struct __ftr_reg_entry {
 	ARM64_FTR_REG(SYS_ID_AA64ISAR3_EL1, ftr_id_aa64isar3),

 	/* Op1 = 0, CRn = 0, CRm = 7 */
-	ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0),
+	ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0,
+			       &id_aa64mmfr0_override),
 	ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1,
 			       &id_aa64mmfr1_override),
-	ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2),
+	ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2,
+			       &id_aa64mmfr2_override),
 	ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3),

 	/* Op1 = 1, CRn = 0, CRm = 0 */
@@ -1669,46 +1673,6 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope)
 	return has_cpuid_feature(entry, scope);
 }

-/*
- * This check is triggered during the early boot before the cpufeature
- * is initialised. Checking the status on the local CPU allows the boot
- * CPU to detect the need for non-global mappings and thus avoiding a
- * pagetable re-write after all the CPUs are booted. This check will be
- * anyway run on individual CPUs, allowing us to get the consistent
- * state once the SMP CPUs are up and thus make the switch to non-global
- * mappings if required.
- */
-bool kaslr_requires_kpti(void)
-{
-	if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE))
-		return false;
-
-	/*
-	 * E0PD does a similar job to KPTI so can be used instead
-	 * where available.
-	 */
-	if (IS_ENABLED(CONFIG_ARM64_E0PD)) {
-		u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1);
-		if (cpuid_feature_extract_unsigned_field(mmfr2,
-						ID_AA64MMFR2_EL1_E0PD_SHIFT))
-			return false;
-	}
-
-	/*
-	 * Systems affected by Cavium erratum 24756 are incompatible
-	 * with KPTI.
-	 */
-	if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) {
-		extern const struct midr_range cavium_erratum_27456_cpus[];
-
-		if (is_midr_in_range_list(read_cpuid_id(),
-					  cavium_erratum_27456_cpus))
-			return false;
-	}
-
-	return kaslr_enabled();
-}
-
 static bool __meltdown_safe = true;
 static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */

@@ -1761,7 +1725,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
 	}

 	/* Useful for KASLR robustness */
-	if (kaslr_requires_kpti()) {
+	if (kaslr_enabled() && kaslr_requires_kpti()) {
 		if (!__kpti_forced) {
 			str = "KASLR";
 			__kpti_forced = 1;
@@ -1850,6 +1814,11 @@ static int __init __kpti_install_ng_mappings(void *__unused)
 	pgd_t *kpti_ng_temp_pgd;
 	u64 alloc = 0;

+	if (levels == 5 && !pgtable_l5_enabled())
+		levels = 4;
+	else if (levels == 4 && !pgtable_l4_enabled())
+		levels = 3;
+
 	remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings);

 	if (!cpu) {
@@ -1863,9 +1832,9 @@ static int __init __kpti_install_ng_mappings(void *__unused)
 		//
 		// The physical pages are laid out as follows:
 		//
-		// +--------+-/-------+-/------ +-\\--------+
-		// :  PTE[] : | PMD[] : | PUD[] : || PGD[]  :
-		// +--------+-\-------+-\------ +-//--------+
+		// +--------+-/-------+-/------ +-/------ +-\\\--------+
+		// :  PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[]  :
+		// +--------+-\-------+-\------ +-\------ +-///--------+
 		//      ^
 		// The first page is mapped into this hierarchy at a PMD_SHIFT
 		// aligned virtual address, so that we can manipulate the PTE
@@ -2091,14 +2060,7 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap,
 static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
 			  int __unused)
 {
-	u64 val;
-
-	val = read_sysreg(id_aa64mmfr1_el1);
-	if (!cpuid_feature_extract_unsigned_field(val, ID_AA64MMFR1_EL1_VH_SHIFT))
-		return false;
-
-	val = arm64_sw_feature_override.val & arm64_sw_feature_override.mask;
-	return cpuid_feature_extract_unsigned_field(val, ARM64_SW_FEATURE_OVERRIDE_HVHE);
+	return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE);
 }

 #ifdef CONFIG_ARM64_PAN
@@ -2796,6 +2758,24 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_enable_fpmr,
 		ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, FPMR, IMP)
 	},
+#ifdef CONFIG_ARM64_VA_BITS_52
+	{
+		.capability = ARM64_HAS_VA52,
+		.type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
+		.matches = has_cpuid_feature,
+#ifdef CONFIG_ARM64_64K_PAGES
+		.desc = "52-bit Virtual Addressing (LVA)",
+		ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, VARange, 52)
+#else
+		.desc = "52-bit Virtual Addressing (LPA2)",
+#ifdef CONFIG_ARM64_4K_PAGES
+		ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN4, 52_BIT)
+#else
+		ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)
+#endif
+#endif
+	},
+#endif
 	{},
 };
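cpufeature.c above registers overrides for ID_AA64MMFR0/2_EL1 so the early PI code and cpu_has_lva()/cpu_has_lpa2() agree on the effective ID register values: the boot-time idreg parser records (val, mask) pairs that are applied over the sanitised register before a field is extracted. A user-space model of the mechanism (the field position is real - VARange is bits [19:16] of ID_AA64MMFR2_EL1 - but the override itself is hypothetical):

#include <stdio.h>
#include <stdint.h>

struct ftr_override {
	uint64_t val;
	uint64_t mask;
};

int main(void)
{
	const int varange_shift = 16;		/* ID_AA64MMFR2_EL1.VARange */
	uint64_t mmfr2 = 1ULL << varange_shift;	/* HW claims 52-bit VA support */

	/* a command-line override forcing the field to 0 (hypothetical) */
	struct ftr_override ov = { .val = 0, .mask = 0xfULL << varange_shift };

	mmfr2 = (mmfr2 & ~ov.mask) | (ov.val & ov.mask);
	printf("effective VARange = %u\n",
	       (unsigned int)((mmfr2 >> varange_shift) & 0xf));	/* 0 */
	return 0;
}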
start_kernel() supported VA size - * x28 create_idmap() callee preserved temp register */ SYM_CODE_START(primary_entry) bl record_mmu_state bl preserve_boot_args - bl create_idmap + + adrp x1, early_init_stack + mov sp, x1 + mov x29, xzr + adrp x0, init_idmap_pg_dir + mov x1, xzr + bl __pi_create_init_idmap + + /* + * If the page tables have been populated with non-cacheable + * accesses (MMU disabled), invalidate those tables again to + * remove any speculatively loaded cache lines. + */ + cbnz x19, 0f + dmb sy + mov x1, x0 // end of used region + adrp x0, init_idmap_pg_dir + adr_l x2, dcache_inval_poc + blr x2 + b 1f /* * If we entered with the MMU and caches on, clean the ID mapped part * of the primary boot code to the PoC so we can safely execute it with * the MMU off. */ - cbz x19, 0f - adrp x0, __idmap_text_start +0: adrp x0, __idmap_text_start adr_l x1, __idmap_text_end adr_l x2, dcache_clean_poc blr x2 -0: mov x0, x19 + +1: mov x0, x19 bl init_kernel_el // w0=cpu_boot_mode mov x20, x0 @@ -111,14 +125,6 @@ SYM_CODE_START(primary_entry) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ -#if VA_BITS > 48 - mrs_s x0, SYS_ID_AA64MMFR2_EL1 - tst x0, ID_AA64MMFR2_EL1_VARange_MASK - mov x0, #VA_BITS - mov x25, #VA_BITS_MIN - csel x25, x25, x0, eq - mov x0, x25 -#endif bl __cpu_setup // initialise processor b __primary_switch SYM_CODE_END(primary_entry) @@ -177,267 +183,6 @@ SYM_CODE_START_LOCAL(preserve_boot_args) ret SYM_CODE_END(preserve_boot_args) -SYM_FUNC_START_LOCAL(clear_page_tables) - /* - * Clear the init page tables. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - sub x2, x1, x0 - mov x1, xzr - b __pi_memset // tail call -SYM_FUNC_END(clear_page_tables) - -/* - * Macro to populate page table entries, these entries can be pointers to the next level - * or last level entries pointing to physical memory. - * - * tbl: page table address - * rtbl: pointer to page table or physical memory - * index: start index to write - * eindex: end index to write - [index, eindex] written to - * flags: flags for pagetable entry to or in - * inc: increment to rtbl between each entry - * tmp1: temporary variable - * - * Preserves: tbl, eindex, flags, inc - * Corrupts: index, tmp1 - * Returns: rtbl - */ - .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1 -.Lpe\@: phys_to_pte \tmp1, \rtbl - orr \tmp1, \tmp1, \flags // tmp1 = table entry - str \tmp1, [\tbl, \index, lsl #3] - add \rtbl, \rtbl, \inc // rtbl = pa next level - add \index, \index, #1 - cmp \index, \eindex - b.ls .Lpe\@ - .endm - -/* - * Compute indices of table entries from virtual address range. If multiple entries - * were needed in the previous page table level then the next page table level is assumed - * to be composed of multiple pages. (This effectively scales the end index). - * - * vstart: virtual address of start of range - * vend: virtual address of end of range - we map [vstart, vend] - * shift: shift used to transform virtual address into index - * order: #imm 2log(number of entries in page table) - * istart: index in table corresponding to vstart - * iend: index in table corresponding to vend - * count: On entry: how many extra entries were required in previous level, scales - * our end index. 
- * On exit: returns how many extra entries required for next page table level - * - * Preserves: vstart, vend - * Returns: istart, iend, count - */ - .macro compute_indices, vstart, vend, shift, order, istart, iend, count - ubfx \istart, \vstart, \shift, \order - ubfx \iend, \vend, \shift, \order - add \iend, \iend, \count, lsl \order - sub \count, \iend, \istart - .endm - -/* - * Map memory for specified virtual address range. Each level of page table needed supports - * multiple entries. If a level requires n entries the next page table level is assumed to be - * formed from n pages. - * - * tbl: location of page table - * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) - * vstart: virtual address of start of range - * vend: virtual address of end of range - we map [vstart, vend - 1] - * flags: flags to use to map last level entries - * phys: physical address corresponding to vstart - physical memory is contiguous - * order: #imm 2log(number of entries in PGD table) - * - * If extra_shift is set, an extra level will be populated if the end address does - * not fit in 'extra_shift' bits. This assumes vend is in the TTBR0 range. - * - * Temporaries: istart, iend, tmp, count, sv - these need to be different registers - * Preserves: vstart, flags - * Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv - */ - .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv, extra_shift - sub \vend, \vend, #1 - add \rtbl, \tbl, #PAGE_SIZE - mov \count, #0 - - .ifnb \extra_shift - tst \vend, #~((1 << (\extra_shift)) - 1) - b.eq .L_\@ - compute_indices \vstart, \vend, #\extra_shift, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv - .endif -.L_\@: - compute_indices \vstart, \vend, #PGDIR_SHIFT, #\order, \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv - -#if SWAPPER_PGTABLE_LEVELS > 3 - compute_indices \vstart, \vend, #PUD_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv -#endif - -#if SWAPPER_PGTABLE_LEVELS > 2 - compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv -#endif - - compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - bic \rtbl, \phys, #SWAPPER_BLOCK_SIZE - 1 - populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp - .endm - -/* - * Remap a subregion created with the map_memory macro with modified attributes - * or output address. The entire remapped region must have been covered in the - * invocation of map_memory. 
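The index arithmetic done by the compute_indices macro documented above can be modelled in C roughly as follows (a sketch with illustrative names, mirroring the ubfx/add/sub sequence):

/* Sketch of compute_indices: derive table indices covering [vstart, vend]. */
static void compute_indices_example(u64 vstart, u64 vend, int shift, int order,
				    u64 *istart, u64 *iend, u64 *count)
{
	*istart = (vstart >> shift) & ((1UL << order) - 1);	/* ubfx */
	*iend   = (vend   >> shift) & ((1UL << order) - 1);	/* ubfx */
	*iend  += *count << order;	/* scale by extra entries from the previous level */
	*count  = *iend - *istart;	/* extra entries needed at the next level */
}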
- * - * x0: last level table address (returned in first argument to map_memory) - * x1: start VA of the existing mapping - * x2: start VA of the region to update - * x3: end VA of the region to update (exclusive) - * x4: start PA associated with the region to update - * x5: attributes to set on the updated region - * x6: order of the last level mappings - */ -SYM_FUNC_START_LOCAL(remap_region) - sub x3, x3, #1 // make end inclusive - - // Get the index offset for the start of the last level table - lsr x1, x1, x6 - bfi x1, xzr, #0, #PAGE_SHIFT - 3 - - // Derive the start and end indexes into the last level table - // associated with the provided region - lsr x2, x2, x6 - lsr x3, x3, x6 - sub x2, x2, x1 - sub x3, x3, x1 - - mov x1, #1 - lsl x6, x1, x6 // block size at this level - - populate_entries x0, x4, x2, x3, x5, x6, x7 - ret -SYM_FUNC_END(remap_region) - -SYM_FUNC_START_LOCAL(create_idmap) - mov x28, lr - /* - * The ID map carries a 1:1 mapping of the physical address range - * covered by the loaded image, which could be anywhere in DRAM. This - * means that the required size of the VA (== PA) space is decided at - * boot time, and could be more than the configured size of the VA - * space for ordinary kernel and user space mappings. - * - * There are three cases to consider here: - * - 39 <= VA_BITS < 48, and the ID map needs up to 48 VA bits to cover - * the placement of the image. In this case, we configure one extra - * level of translation on the fly for the ID map only. (This case - * also covers 42-bit VA/52-bit PA on 64k pages). - * - * - VA_BITS == 48, and the ID map needs more than 48 VA bits. This can - * only happen when using 64k pages, in which case we need to extend - * the root level table rather than add a level. Note that we can - * treat this case as 'always extended' as long as we take care not - * to program an unsupported T0SZ value into the TCR register. - * - * - Combinations that would require two additional levels of - * translation are not supported, e.g., VA_BITS==36 on 16k pages, or - * VA_BITS==39/4k pages with 5-level paging, where the input address - * requires more than 47 or 48 bits, respectively. - */ -#if (VA_BITS < 48) -#define IDMAP_PGD_ORDER (VA_BITS - PGDIR_SHIFT) -#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) - - /* - * If VA_BITS < 48, we have to configure an additional table level. - * First, we have to verify our assumption that the current value of - * VA_BITS was chosen such that all translation levels are fully - * utilised, and that lowering T0SZ will always result in an additional - * translation level to be configured. - */ -#if VA_BITS != EXTRA_SHIFT -#error "Mismatch between VA_BITS and page size/number of translation levels" -#endif -#else -#define IDMAP_PGD_ORDER (PHYS_MASK_SHIFT - PGDIR_SHIFT) -#define EXTRA_SHIFT - /* - * If VA_BITS == 48, we don't have to configure an additional - * translation level, but the top-level table has more entries. 
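The #error sanity check above is plain arithmetic: one extra translation level resolves PAGE_SHIFT - 3 additional VA bits on top of PGDIR_SHIFT. Worked through for the sub-48-bit configurations this (removed) code supported, with values following from the standard arm64 page-table geometry:

/*
 * EXTRA_SHIFT = PGDIR_SHIFT + PAGE_SHIFT - 3 must equal VA_BITS:
 *
 *   4K pages, VA_BITS=39: PGDIR_SHIFT = 30, 30 + 12 - 3 = 39  (ok)
 *  16K pages, VA_BITS=47: PGDIR_SHIFT = 36, 36 + 14 - 3 = 47  (ok)
 *  64K pages, VA_BITS=42: PGDIR_SHIFT = 29, 29 + 16 - 3 = 42  (ok)
 *
 * i.e. each level resolves PAGE_SHIFT - 3 bits, so lowering T0SZ by that
 * amount adds exactly one fully utilised extra level for the ID map.
 */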
- */ -#endif - adrp x0, init_idmap_pg_dir - adrp x3, _text - adrp x6, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE - mov_q x7, SWAPPER_RX_MMUFLAGS - - map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT - - /* Remap the kernel page tables r/w in the ID map */ - adrp x1, _text - adrp x2, init_pg_dir - adrp x3, init_pg_end - bic x4, x2, #SWAPPER_BLOCK_SIZE - 1 - mov_q x5, SWAPPER_RW_MMUFLAGS - mov x6, #SWAPPER_BLOCK_SHIFT - bl remap_region - - /* Remap the FDT after the kernel image */ - adrp x1, _text - adrp x22, _end + SWAPPER_BLOCK_SIZE - bic x2, x22, #SWAPPER_BLOCK_SIZE - 1 - bfi x22, x21, #0, #SWAPPER_BLOCK_SHIFT // remapped FDT address - add x3, x2, #MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE - bic x4, x21, #SWAPPER_BLOCK_SIZE - 1 - mov_q x5, SWAPPER_RW_MMUFLAGS - mov x6, #SWAPPER_BLOCK_SHIFT - bl remap_region - - /* - * Since the page tables have been populated with non-cacheable - * accesses (MMU disabled), invalidate those tables again to - * remove any speculatively loaded cache lines. - */ - cbnz x19, 0f // skip cache invalidation if MMU is on - dmb sy - - adrp x0, init_idmap_pg_dir - adrp x1, init_idmap_pg_end - bl dcache_inval_poc -0: ret x28 -SYM_FUNC_END(create_idmap) - -SYM_FUNC_START_LOCAL(create_kernel_mapping) - adrp x0, init_pg_dir - mov_q x5, KIMAGE_VADDR // compile time __va(_text) -#ifdef CONFIG_RELOCATABLE - add x5, x5, x23 // add KASLR displacement -#endif - adrp x6, _end // runtime __pa(_end) - adrp x3, _text // runtime __pa(_text) - sub x6, x6, x3 // _end - _text - add x6, x6, x5 // runtime __va(_end) - mov_q x7, SWAPPER_RW_MMUFLAGS - - map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 - - dsb ishst // sync with page table walker - ret -SYM_FUNC_END(create_kernel_mapping) - /* * Initialize CPU registers with task-specific and cpu-specific context. * @@ -489,34 +234,9 @@ SYM_FUNC_START_LOCAL(__primary_switched) mov x0, x20 bl set_cpu_boot_mode_flag - // Clear BSS - adr_l x0, __bss_start - mov x1, xzr - adr_l x2, __bss_stop - sub x2, x2, x0 - bl __pi_memset - dsb ishst // Make zero page visible to PTW - -#if VA_BITS > 48 - adr_l x8, vabits_actual // Set this early so KASAN early init - str x25, [x8] // ... observes the correct value - dc civac, x8 // Make visible to booting secondaries -#endif - -#ifdef CONFIG_RANDOMIZE_BASE - adrp x5, memstart_offset_seed // Save KASLR linear map seed - strh w24, [x5, :lo12:memstart_offset_seed] -#endif #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif - mov x0, x21 // pass FDT address in x0 - bl early_fdt_map // Try mapping the FDT early - mov x0, x20 // pass the full boot status - bl init_feature_override // Parse cpu feature overrides -#ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS - bl scs_patch_vmlinux -#endif mov x0, x20 bl finalise_el2 // Prefer VHE if possible ldp x29, x30, [sp], #16 @@ -643,10 +363,13 @@ SYM_FUNC_START_LOCAL(secondary_startup) * Common entry point for secondary CPUs. 
*/ mov x20, x0 // preserve boot mode + +#ifdef CONFIG_ARM64_VA_BITS_52 +alternative_if ARM64_HAS_VA52 bl __cpu_secondary_check52bitva -#if VA_BITS > 48 - ldr_l x0, vabits_actual +alternative_else_nop_endif #endif + bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir adrp x2, idmap_pg_dir @@ -749,15 +472,18 @@ SYM_FUNC_START(__enable_mmu) ret SYM_FUNC_END(__enable_mmu) +#ifdef CONFIG_ARM64_VA_BITS_52 SYM_FUNC_START(__cpu_secondary_check52bitva) -#if VA_BITS > 48 - ldr_l x0, vabits_actual - cmp x0, #52 - b.ne 2f - +#ifndef CONFIG_ARM64_LPA2 mrs_s x0, SYS_ID_AA64MMFR2_EL1 and x0, x0, ID_AA64MMFR2_EL1_VARange_MASK cbnz x0, 2f +#else + mrs x0, id_aa64mmfr0_el1 + sbfx x0, x0, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 + cmp x0, #ID_AA64MMFR0_EL1_TGRAN_LPA2 + b.ge 2f +#endif update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_52_BIT_VA, x0, x1 @@ -765,9 +491,9 @@ SYM_FUNC_START(__cpu_secondary_check52bitva) wfi b 1b -#endif 2: ret SYM_FUNC_END(__cpu_secondary_check52bitva) +#endif SYM_FUNC_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ @@ -779,123 +505,18 @@ SYM_FUNC_START_LOCAL(__no_granule_support) b 1b SYM_FUNC_END(__no_granule_support) -#ifdef CONFIG_RELOCATABLE -SYM_FUNC_START_LOCAL(__relocate_kernel) - /* - * Iterate over each entry in the relocation table, and apply the - * relocations in place. - */ - adr_l x9, __rela_start - adr_l x10, __rela_end - mov_q x11, KIMAGE_VADDR // default virtual offset - add x11, x11, x23 // actual virtual offset - -0: cmp x9, x10 - b.hs 1f - ldp x12, x13, [x9], #24 - ldr x14, [x9, #-8] - cmp w13, #R_AARCH64_RELATIVE - b.ne 0b - add x14, x14, x23 // relocate - str x14, [x12, x23] - b 0b - -1: -#ifdef CONFIG_RELR - /* - * Apply RELR relocations. - * - * RELR is a compressed format for storing relative relocations. The - * encoded sequence of entries looks like: - * [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBB1 ... ] - * - * i.e. start with an address, followed by any number of bitmaps. The - * address entry encodes 1 relocation. The subsequent bitmap entries - * encode up to 63 relocations each, at subsequent offsets following - * the last address entry. - * - * The bitmap entries must have 1 in the least significant bit. The - * assumption here is that an address cannot have 1 in lsb. Odd - * addresses are not supported. Any odd addresses are stored in the RELA - * section, which is handled above. - * - * Excluding the least significant bit in the bitmap, each non-zero - * bit in the bitmap represents a relocation to be applied to - * a corresponding machine word that follows the base address - * word. The second least significant bit represents the machine - * word immediately following the initial address, and each bit - * that follows represents the next word, in linear order. As such, - * a single bitmap can encode up to 63 relocations in a 64-bit object. - * - * In this implementation we store the address of the next RELR table - * entry in x9, the address being relocated by the current address or - * bitmap entry in x13 and the address being relocated by the current - * bit in x14. 
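A short worked example of the RELR encoding just described, with illustrative addresses:

/*
 * Worked example: relocate the words at 0x1000, 0x1008 and 0x1018.
 *
 *   entry 0: 0x1000 - lsb clear: address entry; relocates the word at
 *                     0x1000 and sets the base for the next bitmap
 *   entry 1: 0xb    - lsb set: bitmap 0b1011; bit 1 covers 0x1008,
 *                     bit 2 covers 0x1010 (clear, so skipped), and
 *                     bit 3 covers 0x1018
 *
 * One bitmap entry covers the 63 words after the base address, so a
 * dense run of relocations costs roughly one bit each instead of the
 * 24 bytes an Elf64_Rela entry takes.
 */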
- */ - adr_l x9, __relr_start - adr_l x10, __relr_end - -2: cmp x9, x10 - b.hs 7f - ldr x11, [x9], #8 - tbnz x11, #0, 3f // branch to handle bitmaps - add x13, x11, x23 - ldr x12, [x13] // relocate address entry - add x12, x12, x23 - str x12, [x13], #8 // adjust to start of bitmap - b 2b - -3: mov x14, x13 -4: lsr x11, x11, #1 - cbz x11, 6f - tbz x11, #0, 5f // skip bit if not set - ldr x12, [x14] // relocate bit - add x12, x12, x23 - str x12, [x14] - -5: add x14, x14, #8 // move to next bit's address - b 4b - -6: /* - * Move to the next bitmap's address. 8 is the word size, and 63 is the - * number of significant bits in a bitmap entry. - */ - add x13, x13, #(8 * 63) - b 2b - -7: -#endif - ret - -SYM_FUNC_END(__relocate_kernel) -#endif - SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir adrp x2, init_idmap_pg_dir bl __enable_mmu -#ifdef CONFIG_RELOCATABLE - adrp x23, KERNEL_START - and x23, x23, MIN_KIMG_ALIGN - 1 -#ifdef CONFIG_RANDOMIZE_BASE - mov x0, x22 - adrp x1, init_pg_end + + adrp x1, early_init_stack mov sp, x1 mov x29, xzr - bl __pi_kaslr_early_init - and x24, x0, #SZ_2M - 1 // capture memstart offset seed - bic x0, x0, #SZ_2M - 1 - orr x23, x23, x0 // record kernel offset -#endif -#endif - bl clear_page_tables - bl create_kernel_mapping + mov x0, x20 // pass the full boot status + mov x1, x21 // pass the FDT + bl __pi_early_map_kernel // Map and relocate the kernel - adrp x1, init_pg_dir - load_ttbr1 x1, x1, x2 -#ifdef CONFIG_RELOCATABLE - bl __relocate_kernel -#endif ldr x8, =__primary_switched adrp x0, KERNEL_START // __pa(KERNEL_START) br x8 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index e931ce078a00..ba4f8f7d6a91 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -36,7 +36,40 @@ PROVIDE(__pi___memcpy = __pi_memcpy); PROVIDE(__pi___memmove = __pi_memmove); PROVIDE(__pi___memset = __pi_memset); -PROVIDE(__pi_vabits_actual = vabits_actual); +PROVIDE(__pi_id_aa64isar1_override = id_aa64isar1_override); +PROVIDE(__pi_id_aa64isar2_override = id_aa64isar2_override); +PROVIDE(__pi_id_aa64mmfr0_override = id_aa64mmfr0_override); +PROVIDE(__pi_id_aa64mmfr1_override = id_aa64mmfr1_override); +PROVIDE(__pi_id_aa64mmfr2_override = id_aa64mmfr2_override); +PROVIDE(__pi_id_aa64pfr0_override = id_aa64pfr0_override); +PROVIDE(__pi_id_aa64pfr1_override = id_aa64pfr1_override); +PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); +PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); +PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); +PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings); +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus); +#endif +PROVIDE(__pi__ctype = _ctype); +PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); + +PROVIDE(__pi_init_idmap_pg_dir = init_idmap_pg_dir); +PROVIDE(__pi_init_idmap_pg_end = init_idmap_pg_end); +PROVIDE(__pi_init_pg_dir = init_pg_dir); +PROVIDE(__pi_init_pg_end = init_pg_end); +PROVIDE(__pi_swapper_pg_dir = swapper_pg_dir); + +PROVIDE(__pi__text = _text); +PROVIDE(__pi__stext = _stext); +PROVIDE(__pi__etext = _etext); +PROVIDE(__pi___start_rodata = __start_rodata); +PROVIDE(__pi___inittext_begin = __inittext_begin); +PROVIDE(__pi___inittext_end = __inittext_end); +PROVIDE(__pi___initdata_begin = __initdata_begin); +PROVIDE(__pi___initdata_end = __initdata_end); +PROVIDE(__pi__data = _data); +PROVIDE(__pi___bss_start = __bss_start); +PROVIDE(__pi__end = _end); #ifdef 
CONFIG_KVM diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 12c7f3c8ba76..1da3e25f9d9e 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -16,9 +16,7 @@ bool __ro_after_init __kaslr_is_enabled = false; void __init kaslr_init(void) { - if (cpuid_feature_extract_unsigned_field(arm64_sw_feature_override.val & - arm64_sw_feature_override.mask, - ARM64_SW_FEATURE_OVERRIDE_NOKASLR)) { + if (kaslr_disabled_cmdline()) { pr_info("KASLR disabled on command line\n"); return; } diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index dd851297596e..47e0be610bb6 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -595,7 +595,7 @@ int module_finalize(const Elf_Ehdr *hdr, if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); if (s) - scs_patch((void *)s->sh_addr, s->sh_size); + __pi_scs_patch((void *)s->sh_addr, s->sh_size); } return module_init_ftrace_plt(hdr, sechdrs, me); diff --git a/arch/arm64/kernel/pi/.gitignore b/arch/arm64/kernel/pi/.gitignore new file mode 100644 index 000000000000..efb29b663e85 --- /dev/null +++ b/arch/arm64/kernel/pi/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +relacheck diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index c844a0546d7f..4393b41f0b71 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -11,6 +11,9 @@ KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ -fno-asynchronous-unwind-tables -fno-unwind-tables \ $(call cc-option,-fno-addrsig) +# this code may run with the MMU off so disable unaligned accesses +CFLAGS_map_range.o += -mstrict-align + # remove SCS flags from all objects in this directory KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) # disable LTO @@ -22,14 +25,26 @@ KCSAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n +hostprogs := relacheck + +quiet_cmd_piobjcopy = $(quiet_cmd_objcopy) + cmd_piobjcopy = $(cmd_objcopy) && $(obj)/relacheck $(@) $(<) + $(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \ - --remove-section=.note.gnu.property \ - --prefix-alloc-sections=.init -$(obj)/%.pi.o: $(obj)/%.o FORCE - $(call if_changed,objcopy) + --remove-section=.note.gnu.property +$(obj)/%.pi.o: $(obj)/%.o $(obj)/relacheck FORCE + $(call if_changed,piobjcopy) + +# ensure that all the lib- code ends up as __init code and data +$(obj)/lib-%.pi.o: OBJCOPYFLAGS += --prefix-alloc-sections=.init $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE $(call if_changed_rule,cc_o_c) -obj-y := kaslr_early.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o -extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) +obj-y := idreg-override.pi.o \ + map_kernel.pi.o map_range.pi.o \ + lib-fdt.pi.o lib-fdt_ro.pi.o +obj-$(CONFIG_RELOCATABLE) += relocate.pi.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o +obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.pi.o +extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index e30fd9e32ef3..bccfee34f62f 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -14,6 +14,8 @@ #include <asm/cpufeature.h> #include <asm/setup.h> +#include "pi.h" + #define FTR_DESC_NAME_LEN 20 #define FTR_DESC_FIELD_LEN 10 #define FTR_ALIAS_NAME_LEN 30 @@ -21,15 +23,6 @@ static u64 __boot_status __initdata; -// temporary __prel64 related definitions -// to be removed when this code is moved under pi/ - -#define __prel64_initconst __initconst - 
-#define PREL64(type, name) union { type *name; } - -#define prel64_pointer(__d) (__d) - typedef bool filter_t(u64 val); struct ftr_set_desc { @@ -66,6 +59,35 @@ static const struct ftr_set_desc mmfr1 __prel64_initconst = { }, }; + +static bool __init mmfr2_varange_filter(u64 val) +{ + int __maybe_unused feat; + + if (val) + return false; + +#ifdef CONFIG_ARM64_LPA2 + feat = cpuid_feature_extract_signed_field(read_sysreg(id_aa64mmfr0_el1), + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + if (feat >= ID_AA64MMFR0_EL1_TGRAN_LPA2) { + id_aa64mmfr0_override.val |= + (ID_AA64MMFR0_EL1_TGRAN_LPA2 - 1) << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + id_aa64mmfr0_override.mask |= 0xfU << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + } +#endif + return true; +} + +static const struct ftr_set_desc mmfr2 __prel64_initconst = { + .name = "id_aa64mmfr2", + .override = &id_aa64mmfr2_override, + .fields = { + FIELD("varange", ID_AA64MMFR2_EL1_VARange_SHIFT, mmfr2_varange_filter), + {} + }, +}; + static bool __init pfr0_sve_filter(u64 val) { /* @@ -166,6 +188,8 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { .fields = { FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), + FIELD("rodataoff", ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF, NULL), + FIELD("nowxn", ARM64_SW_FEATURE_OVERRIDE_NOWXN, NULL), {} }, }; @@ -173,6 +197,7 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { static const PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { { &mmfr1 }, + { &mmfr2 }, { &pfr0 }, { &pfr1 }, { &isar1 }, @@ -197,6 +222,9 @@ static const struct { { "arm64.nomops", "id_aa64isar2.mops=0" }, { "arm64.nomte", "id_aa64pfr1.mte=0" }, { "nokaslr", "arm64_sw.nokaslr=1" }, + { "rodata=off", "arm64_sw.rodataoff=1 arm64_sw.nowxn=1" }, + { "arm64.nolva", "id_aa64mmfr2.varange=0" }, + { "arm64.nowxn", "arm64_sw.nowxn=1" }, }; static int __init parse_hexdigit(const char *p, u64 *v) @@ -313,42 +341,35 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) } while (1); } -static __init const u8 *get_bootargs_cmdline(void) +static __init const u8 *get_bootargs_cmdline(const void *fdt, int node) { + static char const bootargs[] __initconst = "bootargs"; const u8 *prop; - void *fdt; - int node; - fdt = get_early_fdt_ptr(); - if (!fdt) - return NULL; - - node = fdt_path_offset(fdt, "/chosen"); if (node < 0) return NULL; - prop = fdt_getprop(fdt, node, "bootargs", NULL); + prop = fdt_getprop(fdt, node, bootargs, NULL); if (!prop) return NULL; return strlen(prop) ? 
prop : NULL; } -static __init void parse_cmdline(void) +static __init void parse_cmdline(const void *fdt, int chosen) { - const u8 *prop = get_bootargs_cmdline(); + static char const cmdline[] __initconst = CONFIG_CMDLINE; + const u8 *prop = get_bootargs_cmdline(fdt, chosen); if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) - __parse_cmdline(CONFIG_CMDLINE, true); + __parse_cmdline(cmdline, true); if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) __parse_cmdline(prop, true); } -/* Keep checkers quiet */ -void init_feature_override(u64 boot_status); - -asmlinkage void __init init_feature_override(u64 boot_status) +void __init init_feature_override(u64 boot_status, const void *fdt, + int chosen) { struct arm64_ftr_override *override; const struct ftr_set_desc *reg; @@ -364,7 +385,7 @@ asmlinkage void __init init_feature_override(u64 boot_status) __boot_status = boot_status; - parse_cmdline(); + parse_cmdline(fdt, chosen); for (i = 0; i < ARRAY_SIZE(regs); i++) { reg = prel64_pointer(regs[i].reg); @@ -373,3 +394,10 @@ asmlinkage void __init init_feature_override(u64 boot_status) (unsigned long)(override + 1)); } } + +char * __init skip_spaces(const char *str) +{ + while (isspace(*str)) + ++str; + return (char *)str; +} diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index b9e0bb4bc6a9..0257b43819db 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -16,68 +16,21 @@ #include <asm/memory.h> #include <asm/pgtable.h> -/* taken from lib/string.c */ -static char *__strstr(const char *s1, const char *s2) -{ - size_t l1, l2; - - l2 = strlen(s2); - if (!l2) - return (char *)s1; - l1 = strlen(s1); - while (l1 >= l2) { - l1--; - if (!memcmp(s1, s2, l2)) - return (char *)s1; - s1++; - } - return NULL; -} -static bool cmdline_contains_nokaslr(const u8 *cmdline) -{ - const u8 *str; - - str = __strstr(cmdline, "nokaslr"); - return str == cmdline || (str > cmdline && *(str - 1) == ' '); -} - -static bool is_kaslr_disabled_cmdline(void *fdt) -{ - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - int node; - const u8 *prop; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; - - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; - - if (cmdline_contains_nokaslr(prop)) - return true; +#include "pi.h" - if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) - goto out; +extern u16 memstart_offset_seed; - return false; - } -out: - return cmdline_contains_nokaslr(CONFIG_CMDLINE); -} - -static u64 get_kaslr_seed(void *fdt) +static u64 __init get_kaslr_seed(void *fdt, int node) { - int node, len; + static char const seed_str[] __initconst = "kaslr-seed"; fdt64_t *prop; u64 ret; + int len; - node = fdt_path_offset(fdt, "/chosen"); if (node < 0) return 0; - prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + prop = fdt_getprop_w(fdt, node, seed_str, &len); if (!prop || len != sizeof(u64)) return 0; @@ -86,20 +39,22 @@ static u64 get_kaslr_seed(void *fdt) return ret; } -asmlinkage u64 kaslr_early_init(void *fdt) +u64 __init kaslr_early_init(void *fdt, int chosen) { u64 seed, range; - if (is_kaslr_disabled_cmdline(fdt)) + if (kaslr_disabled_cmdline()) return 0; - seed = get_kaslr_seed(fdt); + seed = get_kaslr_seed(fdt, chosen); if (!seed) { if (!__early_cpu_has_rndr() || !__arm64_rndr((unsigned long *)&seed)) return 0; } + memstart_offset_seed = seed & U16_MAX; + /* * OK, so we are proceeding with KASLR enabled. Calculate a suitable * kernel image offset from the seed. 
Let's place the kernel in the diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c new file mode 100644 index 000000000000..cac1e1f63c44 --- /dev/null +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/init.h> +#include <linux/libfdt.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/sizes.h> +#include <linux/string.h> + +#include <asm/memory.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> + +#include "pi.h" + +extern const u8 __eh_frame_start[], __eh_frame_end[]; + +extern void idmap_cpu_replace_ttbr1(void *pgdir); + +static void __init map_segment(pgd_t *pg_dir, u64 *pgd, u64 va_offset, + void *start, void *end, pgprot_t prot, + bool may_use_cont, int root_level) +{ + map_range(pgd, ((u64)start + va_offset) & ~PAGE_OFFSET, + ((u64)end + va_offset) & ~PAGE_OFFSET, (u64)start, + prot, root_level, (pte_t *)pg_dir, may_use_cont, 0); +} + +static void __init unmap_segment(pgd_t *pg_dir, u64 va_offset, void *start, + void *end, int root_level) +{ + map_segment(pg_dir, NULL, va_offset, start, end, __pgprot(0), + false, root_level); +} + +static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) +{ + bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS); + bool twopass = IS_ENABLED(CONFIG_RELOCATABLE); + u64 pgdp = (u64)init_pg_dir + PAGE_SIZE; + pgprot_t text_prot = PAGE_KERNEL_ROX; + pgprot_t data_prot = PAGE_KERNEL; + pgprot_t prot; + + /* + * External debuggers may need to write directly to the text mapping to + * install SW breakpoints. Allow this (only) when explicitly requested + * with rodata=off. + */ + if (arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF)) + text_prot = PAGE_KERNEL_EXEC; + + /* + * We only enable the shadow call stack dynamically if we are running + * on a system that does not implement PAC or BTI. PAC and SCS provide + * roughly the same level of protection, and BTI relies on the PACIASP + * instructions serving as landing pads, preventing us from patching + * those instructions into something else. + */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && cpu_has_pac()) + enable_scs = false; + + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && cpu_has_bti()) { + enable_scs = false; + + /* + * If we have a CPU that supports BTI and a kernel built for + * BTI then mark the kernel executable text as guarded pages + * now so we don't have to rewrite the page tables later. + */ + text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP); + } + + /* Map all code read-write on the first pass if needed */ + twopass |= enable_scs; + prot = twopass ? 
data_prot : text_prot; + + map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, + !twopass, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, + __inittext_begin, data_prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __inittext_begin, + __inittext_end, prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __initdata_begin, + __initdata_end, data_prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, _data, _end, data_prot, + true, root_level); + dsb(ishst); + + idmap_cpu_replace_ttbr1(init_pg_dir); + + if (twopass) { + if (IS_ENABLED(CONFIG_RELOCATABLE)) + relocate_kernel(kaslr_offset); + + if (enable_scs) { + scs_patch(__eh_frame_start + va_offset, + __eh_frame_end - __eh_frame_start); + asm("ic ialluis"); + + dynamic_scs_is_enabled = true; + } + + /* + * Unmap the text region before remapping it, to avoid + * potential TLB conflicts when creating the contiguous + * descriptors. + */ + unmap_segment(init_pg_dir, va_offset, _stext, _etext, + root_level); + dsb(ishst); + isb(); + __tlbi(vmalle1); + isb(); + + /* + * Remap these segments with different permissions + * No new page table allocations should be needed + */ + map_segment(init_pg_dir, NULL, va_offset, _stext, _etext, + text_prot, true, root_level); + map_segment(init_pg_dir, NULL, va_offset, __inittext_begin, + __inittext_end, text_prot, false, root_level); + } + + /* Copy the root page table to its final location */ + memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE); + dsb(ishst); + idmap_cpu_replace_ttbr1(swapper_pg_dir); +} + +static void noinline __section(".idmap.text") disable_wxn(void) +{ + u64 sctlr = read_sysreg(sctlr_el1) & ~SCTLR_ELx_WXN; + + /* + * We cannot safely clear the WXN bit while the MMU and caches are on, + * so turn the MMU off, flush the TLBs and turn it on again but with + * the WXN bit cleared this time. + */ + asm(" msr sctlr_el1, %0 ;" + " isb ;" + " tlbi vmalle1 ;" + " dsb nsh ;" + " isb ;" + " msr sctlr_el1, %1 ;" + " isb ;" + :: "r"(sctlr & ~SCTLR_ELx_M), "r"(sctlr)); +} + +static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) +{ + u64 sctlr = read_sysreg(sctlr_el1); + u64 tcr = read_sysreg(tcr_el1) | TCR_DS; + + asm(" msr sctlr_el1, %0 ;" + " isb ;" + " msr ttbr0_el1, %1 ;" + " msr tcr_el1, %2 ;" + " isb ;" + " tlbi vmalle1 ;" + " dsb nsh ;" + " isb ;" + " msr sctlr_el1, %3 ;" + " isb ;" + :: "r"(sctlr & ~SCTLR_ELx_M), "r"(ttbr), "r"(tcr), "r"(sctlr)); +} + +static void __init remap_idmap_for_lpa2(void) +{ + /* clear the bits that change meaning once LPA2 is turned on */ + pteval_t mask = PTE_SHARED; + + /* + * We have to clear bits [9:8] in all block or page descriptors in the + * initial ID map, as otherwise they will be (mis)interpreted as + * physical address bits once we flick the LPA2 switch (TCR.DS). Since + * we cannot manipulate live descriptors in that way without creating + * potential TLB conflicts, let's create another temporary ID map in a + * LPA2 compatible fashion, and update the initial ID map while running + * from that. + */ + create_init_idmap(init_pg_dir, mask); + dsb(ishst); + set_ttbr0_for_lpa2((u64)init_pg_dir); + + /* + * Recreate the initial ID map with the same granularity as before. + * Don't bother with the FDT, we no longer need it after this. 
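For context (a reference note on the LPA2 descriptor format, not kernel code): setting TCR.DS repurposes descriptor bits [9:8], which is why create_init_idmap() is passed the PTE_SHARED mask set up at the top of this function:

/*
 * Block/page descriptor bits [9:8], before and after TCR.DS is set:
 *
 *   DS == 0: bits [9:8] = SH[1:0]   (PTE_SHARED == 0b11 << 8, inner shareable)
 *   DS == 1: bits [9:8] = OA[51:50] (high output address bits)
 *
 * A stale shareability value of 0b11 would therefore be reinterpreted
 * as physical address bits 51:50 once DS is flipped, so every valid
 * descriptor must have PTE_SHARED cleared before the switch.
 */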
+ */ + memset(init_idmap_pg_dir, 0, + (u64)init_idmap_pg_end - (u64)init_idmap_pg_dir); + + create_init_idmap(init_idmap_pg_dir, mask); + dsb(ishst); + + /* switch back to the updated initial ID map */ + set_ttbr0_for_lpa2((u64)init_idmap_pg_dir); + + /* wipe the temporary ID map from memory */ + memset(init_pg_dir, 0, (u64)init_pg_end - (u64)init_pg_dir); +} + +static void __init map_fdt(u64 fdt) +{ + static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); + u64 efdt = fdt + MAX_FDT_SIZE; + u64 ptep = (u64)ptes; + + /* + * Map up to MAX_FDT_SIZE bytes, but avoid overlap with + * the kernel image. + */ + map_range(&ptep, fdt, (u64)_text > fdt ? min((u64)_text, efdt) : efdt, + fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL, + (pte_t *)init_idmap_pg_dir, false, 0); + dsb(ishst); +} + +asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) +{ + static char const chosen_str[] __initconst = "/chosen"; + u64 va_base, pa_base = (u64)&_text; + u64 kaslr_offset = pa_base % MIN_KIMG_ALIGN; + int root_level = 4 - CONFIG_PGTABLE_LEVELS; + int va_bits = VA_BITS; + int chosen; + + map_fdt((u64)fdt); + + /* Clear BSS and the initial page tables */ + memset(__bss_start, 0, (u64)init_pg_end - (u64)__bss_start); + + /* Parse the command line for CPU feature overrides */ + chosen = fdt_path_offset(fdt, chosen_str); + init_feature_override(boot_status, fdt, chosen); + + if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) { + va_bits = VA_BITS_MIN; + } else if (IS_ENABLED(CONFIG_ARM64_LPA2) && !cpu_has_lpa2()) { + va_bits = VA_BITS_MIN; + root_level++; + } + + if (va_bits > VA_BITS_MIN) + sysreg_clear_set(tcr_el1, TCR_T1SZ_MASK, TCR_T1SZ(va_bits)); + + if (IS_ENABLED(CONFIG_ARM64_WXN) && + arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOWXN)) + disable_wxn(); + + /* + * The virtual KASLR displacement modulo 2 MiB is decided by the + * physical placement of the image, as otherwise, we might not be able + * to create the early kernel mapping using 2 MiB block descriptors. So + * take the low bits of the KASLR offset from the physical address, and + * fill in the high bits from the seed. 
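Worked through with illustrative numbers (on arm64, MIN_KIMG_ALIGN is 2 MiB):

/*
 * Example: image loaded at PA 0x40680000, seed 0x123456789abcdef0.
 *
 *   kaslr_offset  = 0x40680000 % SZ_2M       = 0x80000
 *   kaslr_offset |= seed & ~(SZ_2M - 1)        (high bits from the seed)
 *
 * The 2 MiB-aligned part of the displacement is free to randomize, but
 * the sub-2 MiB part must match the physical placement, or the early
 * mapping could not use 2 MiB block descriptors.
 */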
+ */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + u64 kaslr_seed = kaslr_early_init(fdt, chosen); + + if (kaslr_seed && kaslr_requires_kpti()) + arm64_use_ng_mappings = true; + + kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1); + } + + if (IS_ENABLED(CONFIG_ARM64_LPA2) && va_bits > VA_BITS_MIN) + remap_idmap_for_lpa2(); + + va_base = KIMAGE_VADDR + kaslr_offset; + map_kernel(kaslr_offset, va_base - pa_base, root_level); +} diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c new file mode 100644 index 000000000000..5410b2cac590 --- /dev/null +++ b/arch/arm64/kernel/pi/map_range.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/types.h> +#include <linux/sizes.h> + +#include <asm/memory.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> + +#include "pi.h" + +/** + * map_range - Map a contiguous range of physical pages into virtual memory + * + * @pte: Address of physical pointer to array of pages to + * allocate page tables from + * @start: Virtual address of the start of the range + * @end: Virtual address of the end of the range (exclusive) + * @pa: Physical address of the start of the range + * @prot: Access permissions of the range + * @level: Translation level for the mapping + * @tbl: The level @level page table to create the mappings in + * @may_use_cont: Whether the use of the contiguous attribute is allowed + * @va_offset: Offset between a physical page and its current mapping + * in the VA space + */ +void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, + int level, pte_t *tbl, bool may_use_cont, u64 va_offset) +{ + u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; + u64 protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + int lshift = (3 - level) * (PAGE_SHIFT - 3); + u64 lmask = (PAGE_SIZE << lshift) - 1; + + start &= PAGE_MASK; + pa &= PAGE_MASK; + + /* Advance tbl to the entry that covers start */ + tbl += (start >> (lshift + PAGE_SHIFT)) % PTRS_PER_PTE; + + /* + * Set the right block/page bits for this level unless we are + * clearing the mapping + */ + if (protval) + protval |= (level < 3) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; + + while (start < end) { + u64 next = min((start | lmask) + 1, PAGE_ALIGN(end)); + + if (level < 3 && (start | next | pa) & lmask) { + /* + * This chunk needs a finer grained mapping. Create a + * table mapping if necessary and recurse. 
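To make the level arithmetic concrete, with 4k pages (PAGE_SHIFT == 12) the quantities computed at the top of map_range() work out as follows:

/*
 * With PAGE_SHIFT == 12, lshift = (3 - level) * 9:
 *
 *   level 0: lshift = 27, each entry covers 4K << 27 = 512 GiB
 *   level 1: lshift = 18, each entry covers 4K << 18 =   1 GiB
 *   level 2: lshift =  9, each entry covers 4K <<  9 =   2 MiB
 *   level 3: lshift =  0, each entry covers 4K <<  0 =   4 KiB
 *
 * So a chunk that is not lmask-aligned at level 2 (i.e. not a whole,
 * aligned 2 MiB block) recurses into a level 3 table of 4 KiB pages.
 */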
+ */ + if (pte_none(*tbl)) { + *tbl = __pte(__phys_to_pte_val(*pte) | + PMD_TYPE_TABLE | PMD_TABLE_UXN); + *pte += PTRS_PER_PTE * sizeof(pte_t); + } + map_range(pte, start, next, pa, prot, level + 1, + (pte_t *)(__pte_to_phys(*tbl) + va_offset), + may_use_cont, va_offset); + } else { + /* + * Start a contiguous range if start and pa are + * suitably aligned + */ + if (((start | pa) & cmask) == 0 && may_use_cont) + protval |= PTE_CONT; + + /* + * Clear the contiguous attribute if the remaining + * range does not cover a contiguous block + */ + if ((end & ~cmask) <= start) + protval &= ~PTE_CONT; + + /* Put down a block or page mapping */ + *tbl = __pte(__phys_to_pte_val(pa) | protval); + } + pa += next - start; + start = next; + tbl++; + } +} + +asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, pteval_t clrmask) +{ + u64 ptep = (u64)pg_dir + PAGE_SIZE; + pgprot_t text_prot = PAGE_KERNEL_ROX; + pgprot_t data_prot = PAGE_KERNEL; + + pgprot_val(text_prot) &= ~clrmask; + pgprot_val(data_prot) &= ~clrmask; + + map_range(&ptep, (u64)_stext, (u64)__initdata_begin, (u64)_stext, + text_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + map_range(&ptep, (u64)__initdata_begin, (u64)_end, (u64)__initdata_begin, + data_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + + return ptep; +} diff --git a/arch/arm64/kernel/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index a1fe4b4ff591..49d8b40e61bc 100644 --- a/arch/arm64/kernel/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -4,16 +4,17 @@ * Author: Ard Biesheuvel <ardb@google.com> */ -#include <linux/bug.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/linkage.h> -#include <linux/printk.h> #include <linux/types.h> -#include <asm/cacheflush.h> #include <asm/scs.h> +#include "pi.h" + +bool dynamic_scs_is_enabled; + // // This minimal DWARF CFI parser is partially based on the code in // arch/arc/kernel/unwind.c, and on the document below: @@ -49,8 +50,6 @@ #define DW_CFA_GNU_negative_offset_extended 0x2f #define DW_CFA_hi_user 0x3f -extern const u8 __eh_frame_start[], __eh_frame_end[]; - enum { PACIASP = 0xd503233f, AUTIASP = 0xd50323bf, @@ -81,7 +80,11 @@ static void __always_inline scs_patch_loc(u64 loc) */ return; } - dcache_clean_pou(loc, loc + sizeof(u32)); + if (IS_ENABLED(CONFIG_ARM64_WORKAROUND_CLEAN_CACHE)) + asm("dc civac, %0" :: "r"(loc)); + else + asm(ALTERNATIVE("dc cvau, %0", "nop", ARM64_HAS_CACHE_IDC) + :: "r"(loc)); } /* @@ -128,10 +131,10 @@ struct eh_frame { }; }; -static int noinstr scs_handle_fde_frame(const struct eh_frame *frame, - bool fde_has_augmentation_data, - int code_alignment_factor, - bool dry_run) +static int scs_handle_fde_frame(const struct eh_frame *frame, + bool fde_has_augmentation_data, + int code_alignment_factor, + bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; u64 loc = (u64)offset_to_ptr(&frame->initial_loc); @@ -198,14 +201,13 @@ static int noinstr scs_handle_fde_frame(const struct eh_frame *frame, break; default: - pr_err("unhandled opcode: %02x in FDE frame %lx\n", opcode[-1], (uintptr_t)frame); return -ENOEXEC; } } return 0; } -int noinstr scs_patch(const u8 eh_frame[], int size) +int scs_patch(const u8 eh_frame[], int size) { const u8 *p = eh_frame; @@ -250,13 +252,3 @@ int noinstr scs_patch(const u8 eh_frame[], int size) } return 0; } - -asmlinkage void __init scs_patch_vmlinux(void) -{ - if (!should_patch_pac_into_scs()) - return; - - WARN_ON(scs_patch(__eh_frame_start, __eh_frame_end - __eh_frame_start)); - icache_inval_all_pou(); - 
isb(); -} diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h new file mode 100644 index 000000000000..c91e5e965cd3 --- /dev/null +++ b/arch/arm64/kernel/pi/pi.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel <ardb@google.com> + +#include <linux/types.h> + +#define __prel64_initconst __section(".init.rodata.prel64") + +#define PREL64(type, name) union { type *name; prel64_t name ## _prel; } + +#define prel64_pointer(__d) (typeof(__d))prel64_to_pointer(&__d##_prel) + +typedef volatile signed long prel64_t; + +static inline void *prel64_to_pointer(const prel64_t *offset) +{ + if (!*offset) + return NULL; + return (void *)offset + *offset; +} + +extern bool dynamic_scs_is_enabled; + +extern pgd_t init_idmap_pg_dir[], init_idmap_pg_end[]; + +void init_feature_override(u64 boot_status, const void *fdt, int chosen); +u64 kaslr_early_init(void *fdt, int chosen); +void relocate_kernel(u64 offset); +int scs_patch(const u8 eh_frame[], int size); + +void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, + int level, pte_t *tbl, bool may_use_cont, u64 va_offset); + +asmlinkage void early_map_kernel(u64 boot_status, void *fdt); + +asmlinkage u64 create_init_idmap(pgd_t *pgd, pteval_t clrmask); diff --git a/arch/arm64/kernel/pi/relacheck.c b/arch/arm64/kernel/pi/relacheck.c new file mode 100644 index 000000000000..b0cd4d0d275b --- /dev/null +++ b/arch/arm64/kernel/pi/relacheck.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 - Google LLC + * Author: Ard Biesheuvel <ardb@google.com> + */ + +#include <elf.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define HOST_ORDER ELFDATA2LSB +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define HOST_ORDER ELFDATA2MSB +#endif + +static Elf64_Ehdr *ehdr; +static Elf64_Shdr *shdr; +static const char *strtab; +static bool swap; + +static uint64_t swab_elfxword(uint64_t val) +{ + return swap ? __builtin_bswap64(val) : val; +} + +static uint32_t swab_elfword(uint32_t val) +{ + return swap ? __builtin_bswap32(val) : val; +} + +static uint16_t swab_elfhword(uint16_t val) +{ + return swap ? 
__builtin_bswap16(val) : val; +} + +int main(int argc, char *argv[]) +{ + struct stat stat; + int fd, ret; + + if (argc < 3) { + fprintf(stderr, "file arguments missing\n"); + exit(EXIT_FAILURE); + } + + fd = open(argv[1], O_RDWR); + if (fd < 0) { + fprintf(stderr, "failed to open %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ret = fstat(fd, &stat); + if (ret < 0) { + fprintf(stderr, "failed to stat() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ehdr = mmap(0, stat.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ehdr == MAP_FAILED) { + fprintf(stderr, "failed to mmap() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + swap = ehdr->e_ident[EI_DATA] != HOST_ORDER; + shdr = (void *)ehdr + swab_elfxword(ehdr->e_shoff); + strtab = (void *)ehdr + + swab_elfxword(shdr[swab_elfhword(ehdr->e_shstrndx)].sh_offset); + + for (int i = 0; i < swab_elfhword(ehdr->e_shnum); i++) { + unsigned long info, flags; + bool prel64 = false; + Elf64_Rela *rela; + int numrela; + + if (swab_elfword(shdr[i].sh_type) != SHT_RELA) + continue; + + /* only consider RELA sections operating on data */ + info = swab_elfword(shdr[i].sh_info); + flags = swab_elfxword(shdr[info].sh_flags); + if ((flags & (SHF_ALLOC | SHF_EXECINSTR)) != SHF_ALLOC) + continue; + + /* + * We generally don't permit ABS64 relocations in the code that + * runs before relocation processing occurs. If statically + * initialized absolute symbol references are unavoidable, they + * may be emitted into a *.rodata.prel64 section and they will + * be converted to place-relative 64-bit references. This + * requires special handling in the referring code. + */ + if (strstr(strtab + swab_elfword(shdr[info].sh_name), + ".rodata.prel64")) { + prel64 = true; + } + + rela = (void *)ehdr + swab_elfxword(shdr[i].sh_offset); + numrela = swab_elfxword(shdr[i].sh_size) / sizeof(*rela); + + for (int j = 0; j < numrela; j++) { + uint64_t info = swab_elfxword(rela[j].r_info); + + if (ELF64_R_TYPE(info) != R_AARCH64_ABS64) + continue; + + if (prel64) { + /* convert ABS64 into PREL64 */ + info ^= R_AARCH64_ABS64 ^ R_AARCH64_PREL64; + rela[j].r_info = swab_elfxword(info); + } else { + fprintf(stderr, + "Unexpected absolute relocations detected in %s\n", + argv[2]); + close(fd); + unlink(argv[1]); + exit(EXIT_FAILURE); + } + } + } + close(fd); + return 0; +} diff --git a/arch/arm64/kernel/pi/relocate.c b/arch/arm64/kernel/pi/relocate.c new file mode 100644 index 000000000000..2407d2696398 --- /dev/null +++ b/arch/arm64/kernel/pi/relocate.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Authors: Ard Biesheuvel <ardb@google.com> +// Peter Collingbourne <pcc@google.com> + +#include <linux/elf.h> +#include <linux/init.h> +#include <linux/types.h> + +#include "pi.h" + +extern const Elf64_Rela rela_start[], rela_end[]; +extern const u64 relr_start[], relr_end[]; + +void __init relocate_kernel(u64 offset) +{ + u64 *place = NULL; + + for (const Elf64_Rela *rela = rela_start; rela < rela_end; rela++) { + if (ELF64_R_TYPE(rela->r_info) != R_AARCH64_RELATIVE) + continue; + *(u64 *)(rela->r_offset + offset) = rela->r_addend + offset; + } + + if (!IS_ENABLED(CONFIG_RELR) || !offset) + return; + + /* + * Apply RELR relocations. + * + * RELR is a compressed format for storing relative relocations. The + * encoded sequence of entries looks like: + * [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBB1 ... ] + * + * i.e. start with an address, followed by any number of bitmaps. The + * address entry encodes 1 relocation. 
The subsequent bitmap entries + * encode up to 63 relocations each, at subsequent offsets following + * the last address entry. + * + * The bitmap entries must have 1 in the least significant bit. The + * assumption here is that an address cannot have 1 in lsb. Odd + * addresses are not supported. Any odd addresses are stored in the + * RELA section, which is handled above. + * + * With the exception of the least significant bit, each bit in the + * bitmap corresponds with a machine word that follows the base address + * word, and the bit value indicates whether or not a relocation needs + * to be applied to it. The second least significant bit represents the + * machine word immediately following the initial address, and each bit + * that follows represents the next word, in linear order. As such, a + * single bitmap can encode up to 63 relocations in a 64-bit object. + */ + for (const u64 *relr = relr_start; relr < relr_end; relr++) { + if ((*relr & 1) == 0) { + place = (u64 *)(*relr + offset); + *place++ += offset; + } else { + for (u64 *p = place, r = *relr >> 1; r; p++, r >>= 1) + if (r & 1) + *p += offset; + place += 63; + } + } +} diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index ab43bfa85368..65a052bf741f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -166,21 +166,6 @@ static void __init smp_build_mpidr_hash(void) pr_warn("Large number of MPIDR hash buckets detected\n"); } -static void *early_fdt_ptr __initdata; - -void __init *get_early_fdt_ptr(void) -{ - return early_fdt_ptr; -} - -asmlinkage void __init early_fdt_map(u64 dt_phys) -{ - int fdt_size; - - early_fixmap_init(); - early_fdt_ptr = fixmap_remap_fdt(dt_phys, &fdt_size, PAGE_KERNEL); -} - static void __init setup_machine_fdt(phys_addr_t dt_phys) { int size; @@ -298,13 +283,6 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) kaslr_init(); - /* - * If know now we are going to need KPTI then use non-global - * mappings from the start, avoiding the cost of rewriting - * everything later. - */ - arm64_use_ng_mappings = kaslr_requires_kpti(); - early_fixmap_init(); early_ioremap_init(); diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 2aa5129d8253..f093cdf71be1 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -102,9 +102,6 @@ SYM_CODE_START(cpu_resume) mov x0, xzr bl init_kernel_el mov x19, x0 // preserve boot mode -#if VA_BITS > 48 - ldr_l x0, vabits_actual -#endif bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 3cd7e76cc562..755a22d4f840 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -126,9 +126,9 @@ jiffies = jiffies_64; #ifdef CONFIG_UNWIND_TABLES #define UNWIND_DATA_SECTIONS \ .eh_frame : { \ - __eh_frame_start = .; \ + __pi___eh_frame_start = .; \ *(.eh_frame) \ - __eh_frame_end = .; \ + __pi___eh_frame_end = .; \ } #else #define UNWIND_DATA_SECTIONS @@ -270,15 +270,15 @@ SECTIONS HYPERVISOR_RELOC_SECTION .rela.dyn : ALIGN(8) { - __rela_start = .; + __pi_rela_start = .; *(.rela .rela*) - __rela_end = .; + __pi_rela_end = .; } .relr.dyn : ALIGN(8) { - __relr_start = .; + __pi_relr_start = .; *(.relr.dyn) - __relr_end = .; + __pi_relr_end = .; } . = ALIGN(SEGMENT_ALIGN); @@ -311,12 +311,17 @@ SECTIONS __pecoff_data_rawsize = ABSOLUTE(. 
- __initdata_begin); _edata = .; + /* start of zero-init region */ BSS_SECTION(SBSS_ALIGN, 0, 0) . = ALIGN(PAGE_SIZE); init_pg_dir = .; . += INIT_DIR_SIZE; init_pg_end = .; + /* end of zero-init region */ + + . += SZ_4K; /* stack for the early C runtime */ + early_init_stack = .; . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d14504821b79..cd9456a03e38 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -805,7 +805,7 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) .pgd = (kvm_pteref_t)kvm->mm->pgd, .ia_bits = vabits_actual, .start_level = (KVM_PGTABLE_LAST_LEVEL - - CONFIG_PGTABLE_LEVELS + 1), + ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), .mm_ops = &kvm_user_mm_ops, }; unsigned long flags; @@ -1874,16 +1874,9 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); /* - * The ID map may be configured to use an extended virtual address - * range. This is only the case if system RAM is out of range for the - * currently configured page size and VA_BITS_MIN, in which case we will - * also need the extended virtual range for the HYP ID map, or we won't - * be able to enable the EL2 MMU. - * - * However, in some cases the ID map may be configured for fewer than - * the number of VA bits used by the regular kernel stage 1. This - * happens when VA_BITS=52 and the kernel image is placed in PA space - * below 48 bits. + * The ID map is always configured for 48 bits of translation, which + * may be fewer than the number of VA bits used by the regular kernel + * stage 1, when VA_BITS=52. * * At EL2, there is only one TTBR register, and we can't switch between * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom @@ -1894,7 +1887,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) * 1 VA bits to assure that the hypervisor can both ID map its code page * and map any kernel memory. */ - idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + idmap_bits = IDMAP_VA_BITS; kernel_bits = vabits_actual; *hyp_va_bits = max(idmap_bits, kernel_bits); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 55f6455a8284..60265ede48fe 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -257,16 +257,14 @@ static bool is_el1_data_abort(unsigned long esr) static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { - unsigned long fsc_type = esr & ESR_ELx_FSC_TYPE; - if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr)) return false; - if (fsc_type == ESR_ELx_FSC_PERM) + if (esr_fsc_is_permission_fault(esr)) return true; if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan()) - return fsc_type == ESR_ELx_FSC_FAULT && + return esr_fsc_is_translation_fault(esr) && (regs->pstate & PSR_PAN_BIT); return false; @@ -279,8 +277,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, unsigned long flags; u64 par, dfsc; - if (!is_el1_data_abort(esr) || - (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT) + if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr)) return false; local_irq_save(flags); @@ -301,7 +298,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, * treat the translation fault as spurious. 
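esr_fsc_is_translation_fault() itself is not shown in this hunk. Judging from the fault_info table below, translation faults occupy FSC values 0x04-0x07 (levels 0-3) plus 0x2b (level -1), so the predicate plausibly reduces to the following sketch (the helper name and exact range check here are assumptions, not the kernel's definition):

/* Sketch only: FSC values inferred from the fault_info table below. */
static inline bool esr_fsc_is_translation_fault_sketch(unsigned long esr)
{
	unsigned long fsc = esr & ESR_ELx_FSC;	/* low 6 bits of the ESR */

	/* 0x04..0x07: translation fault at level 0..3 */
	/* 0x2b: translation fault at level -1 (LPA2 5-level lookup) */
	return (fsc >= 0x04 && fsc <= 0x07) || fsc == 0x2b;
}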
*/ dfsc = FIELD_GET(SYS_PAR_EL1_FST, par); - return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT; + return !esr_fsc_is_translation_fault(dfsc); } static void die_kernel_fault(const char *msg, unsigned long addr, @@ -368,11 +365,6 @@ static bool is_el1_mte_sync_tag_check_fault(unsigned long esr) return false; } -static bool is_translation_fault(unsigned long esr) -{ - return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT; -} - static void __do_kernel_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { @@ -405,7 +397,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (is_translation_fault(esr) && + if (esr_fsc_is_translation_fault(esr) && kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) return; @@ -782,18 +774,18 @@ static const struct fault_info fault_info[] = { { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 8" }, + { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 12" }, + { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, { do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" }, { do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 18" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 19" }, + { do_sea, SIGKILL, SI_KERNEL, "level -1 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" }, @@ -801,7 +793,7 @@ static const struct fault_info fault_info[] = { { do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" }, // Reserved when RAS is implemented { do_bad, SIGKILL, SI_KERNEL, "unknown 25" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 26" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 27" }, + { do_sea, SIGKILL, SI_KERNEL, "level -1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented @@ -815,9 +807,9 @@ static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "unknown 38" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 39" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 40" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 41" }, + { do_bad, SIGKILL, SI_KERNEL, "level -1 address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 42" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 43" }, + { 
diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c
index 6fc17b2e1714..d22506e9c7fd 100644
--- a/arch/arm64/mm/fixmap.c
+++ b/arch/arm64/mm/fixmap.c
@@ -104,7 +104,7 @@ void __init early_fixmap_init(void)
 	unsigned long end = FIXADDR_TOP;
 
 	pgd_t *pgdp = pgd_offset_k(addr);
-	p4d_t *p4dp = p4d_offset(pgdp, addr);
+	p4d_t *p4dp = p4d_offset_kimg(pgdp, addr);
 
 	early_fixmap_init_pud(p4dp, addr, end);
 }
@@ -170,37 +170,3 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
 
 	return dt_virt;
 }
-
-/*
- * Copy the fixmap region into a new pgdir.
- */
-void __init fixmap_copy(pgd_t *pgdir)
-{
-	if (!READ_ONCE(pgd_val(*pgd_offset_pgd(pgdir, FIXADDR_TOT_START)))) {
-		/*
-		 * The fixmap falls in a separate pgd to the kernel, and doesn't
-		 * live in the carveout for the swapper_pg_dir. We can simply
-		 * re-use the existing dir for the fixmap.
-		 */
-		set_pgd(pgd_offset_pgd(pgdir, FIXADDR_TOT_START),
-			READ_ONCE(*pgd_offset_k(FIXADDR_TOT_START)));
-	} else if (CONFIG_PGTABLE_LEVELS > 3) {
-		pgd_t *bm_pgdp;
-		p4d_t *bm_p4dp;
-		pud_t *bm_pudp;
-		/*
-		 * The fixmap shares its top level pgd entry with the kernel
-		 * mapping. This can really only occur when we are running
-		 * with 16k/4 levels, so we can simply reuse the pud level
-		 * entry instead.
-		 */
-		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
-		bm_pgdp = pgd_offset_pgd(pgdir, FIXADDR_TOT_START);
-		bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_TOT_START);
-		bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_TOT_START);
-		pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd));
-		pud_clear_fixmap();
-	} else {
-		BUG();
-	}
-}
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 74c1db8ce271..0f427b50fdc3 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -238,7 +238,7 @@ void __init arm64_memblock_init(void)
 	 * physical address of PAGE_OFFSET, we have to *subtract* from it.
 	 */
 	if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
-		memstart_addr -= _PAGE_OFFSET(48) - _PAGE_OFFSET(52);
+		memstart_addr -= _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52);
 
 	/*
 	 * Apply the memory limit if it was set. Since the kernel may be loaded
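The init.c one-liner above is a correctness fix: with 16k pages the fallback VA size is 47 bits, not 48, so hard-coding _PAGE_OFFSET(48) miscomputes the linear-map shift when a 52-bit VA kernel runs on hardware without it. Using the arm64 definition _PAGE_OFFSET(va) = -(1UL << va), a quick standalone check of the delta for both fallback geometries (values derived here, not quoted from the kernel):

#include <stdio.h>

#define _PAGE_OFFSET(va)	(-(1UL << (va)))

int main(void)
{
	/* 4k/64k page kernels fall back to 48 bits, 16k kernels to 47 */
	for (unsigned int vabits_actual = 47; vabits_actual <= 48; vabits_actual++) {
		unsigned long delta = _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52);

		printf("vabits_actual=%u: memstart_addr shifts by 0x%lx\n",
		       vabits_actual, delta);	/* 0xf8000..., 0xf0000... */
	}
	return 0;
}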
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 4c7ad574b946..fbddbf9faf19 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -23,7 +23,7 @@
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 
-static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
+static pgd_t tmp_pg_dir[PTRS_PER_PTE] __initdata __aligned(PAGE_SIZE);
 
 /*
  * The p*d_populate functions call virt_to_phys implicitly so they can't be used
@@ -99,6 +99,19 @@ static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node,
 	return early ? pud_offset_kimg(p4dp, addr) : pud_offset(p4dp, addr);
 }
 
+static p4d_t *__init kasan_p4d_offset(pgd_t *pgdp, unsigned long addr, int node,
+				      bool early)
+{
+	if (pgd_none(READ_ONCE(*pgdp))) {
+		phys_addr_t p4d_phys = early ?
+				__pa_symbol(kasan_early_shadow_p4d)
+					: kasan_alloc_zeroed_page(node);
+		__pgd_populate(pgdp, p4d_phys, PGD_TYPE_TABLE);
+	}
+
+	return early ? p4d_offset_kimg(pgdp, addr) : p4d_offset(pgdp, addr);
+}
+
 static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,
 				      unsigned long end, int node, bool early)
 {
@@ -144,12 +157,12 @@ static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr,
 				      unsigned long end, int node, bool early)
 {
 	unsigned long next;
-	p4d_t *p4dp = p4d_offset(pgdp, addr);
+	p4d_t *p4dp = kasan_p4d_offset(pgdp, addr, node, early);
 
 	do {
 		next = p4d_addr_end(addr, end);
 		kasan_pud_populate(p4dp, addr, next, node, early);
-	} while (p4dp++, addr = next, addr != end);
+	} while (p4dp++, addr = next, addr != end && p4d_none(READ_ONCE(*p4dp)));
 }
 
 static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
@@ -165,19 +178,48 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
 	} while (pgdp++, addr = next, addr != end);
 }
 
+#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS > 4
+#define SHADOW_ALIGN	P4D_SIZE
+#else
+#define SHADOW_ALIGN	PUD_SIZE
+#endif
+
+/*
+ * Return whether 'addr' is aligned to the size covered by a root level
+ * descriptor.
+ */
+static bool __init root_level_aligned(u64 addr)
+{
+	int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * (PAGE_SHIFT - 3);
+
+	return (addr % (PAGE_SIZE << shift)) == 0;
+}
+
 /* The early shadow maps everything to a single page of zeroes */
 asmlinkage void __init kasan_early_init(void)
 {
 	BUILD_BUG_ON(KASAN_SHADOW_OFFSET !=
 		KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT)));
-	/*
-	 * We cannot check the actual value of KASAN_SHADOW_START during build,
-	 * as it depends on vabits_actual. As a best-effort approach, check
-	 * potential values calculated based on VA_BITS and VA_BITS_MIN.
-	 */
-	BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS), PGDIR_SIZE));
-	BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS_MIN), PGDIR_SIZE));
-	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE));
+	BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS), SHADOW_ALIGN));
+	BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS_MIN), SHADOW_ALIGN));
+	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, SHADOW_ALIGN));
+
+	if (!root_level_aligned(KASAN_SHADOW_START)) {
+		/*
+		 * The start address is misaligned, and so the next level table
+		 * will be shared with the linear region. This can happen with
+		 * 4 or 5 level paging, so install a generic pte_t[] as the
+		 * next level. This prevents the kasan_pgd_populate call below
+		 * from inserting an entry that refers to the shared KASAN zero
+		 * shadow pud_t[]/p4d_t[], which could end up getting corrupted
+		 * when the linear region is mapped.
+		 */
+		static pte_t tbl[PTRS_PER_PTE] __page_aligned_bss;
+		pgd_t *pgdp = pgd_offset_k(KASAN_SHADOW_START);
+
+		set_pgd(pgdp, __pgd(__pa_symbol(tbl) | PGD_TYPE_TABLE));
+	}
+
 	kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE,
 			   true);
 }
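root_level_aligned() above is the pivot of the new KASAN logic, so its arithmetic is worth spelling out. A self-contained rendering with two representative geometries; the formula is taken from the diff, and the levels computation mirrors the kernel's ARM64_HW_PGTABLE_LEVELS() definition, ((va_bits - 4) / (PAGE_SHIFT - 3)), treated here as a given:

#include <stdbool.h>
#include <stdio.h>

/* same formula as the kernel's ARM64_HW_PGTABLE_LEVELS() */
#define HW_PGTABLE_LEVELS(page_shift, va_bits) \
	(((va_bits) - 4) / ((page_shift) - 3))

static bool root_level_aligned(unsigned int page_shift, unsigned int va_bits,
			       unsigned long long addr)
{
	int shift = (HW_PGTABLE_LEVELS(page_shift, va_bits) - 1) *
		    (page_shift - 3);

	/* size covered by one root level descriptor */
	return (addr % (1ULL << (page_shift + shift))) == 0;
}

int main(void)
{
	/* 4k pages, 48-bit VA: 4 levels, root descriptor covers 2^39 bytes */
	printf("%d\n", root_level_aligned(12, 48, 1ULL << 39));	/* 1 */
	/* 4k pages, 52-bit VA: 5 levels, root descriptor covers 2^48 bytes */
	printf("%d\n", root_level_aligned(12, 52, 1ULL << 39));	/* 0 */
	return 0;
}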
@@ -190,34 +232,74 @@ static void __init kasan_map_populate(unsigned long start, unsigned long end,
 }
 
 /*
- * Copy the current shadow region into a new pgdir.
+ * Return the descriptor index of 'addr' in the root level table
  */
-void __init kasan_copy_shadow(pgd_t *pgdir)
+static int __init root_level_idx(u64 addr)
 {
-	pgd_t *pgdp, *pgdp_new, *pgdp_end;
+	/*
+	 * On 64k pages, the TTBR1 range root tables are extended for 52-bit
+	 * virtual addressing, and TTBR1 will simply point to the pgd_t entry
+	 * that covers the start of the 48-bit addressable VA space if LVA is
+	 * not implemented. This means we need to index the table as usual,
+	 * instead of masking off bits based on vabits_actual.
+	 */
+	u64 vabits = IS_ENABLED(CONFIG_ARM64_64K_PAGES) ? VA_BITS
+							: vabits_actual;
+	int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * (PAGE_SHIFT - 3);
 
-	pgdp = pgd_offset_k(KASAN_SHADOW_START);
-	pgdp_end = pgd_offset_k(KASAN_SHADOW_END);
-	pgdp_new = pgd_offset_pgd(pgdir, KASAN_SHADOW_START);
-	do {
-		set_pgd(pgdp_new, READ_ONCE(*pgdp));
-	} while (pgdp++, pgdp_new++, pgdp != pgdp_end);
+	return (addr & ~_PAGE_OFFSET(vabits)) >> (shift + PAGE_SHIFT);
+}
+
+/*
+ * Clone a next level table from swapper_pg_dir into tmp_pg_dir
+ */
+static void __init clone_next_level(u64 addr, pgd_t *tmp_pg_dir, pud_t *pud)
+{
+	int idx = root_level_idx(addr);
+	pgd_t pgd = READ_ONCE(swapper_pg_dir[idx]);
+	pud_t *pudp = (pud_t *)__phys_to_kimg(__pgd_to_phys(pgd));
+
+	memcpy(pud, pudp, PAGE_SIZE);
+	tmp_pg_dir[idx] = __pgd(__phys_to_pgd_val(__pa_symbol(pud)) |
+				PUD_TYPE_TABLE);
 }
 
-static void __init clear_pgds(unsigned long start,
-			unsigned long end)
+/*
+ * Return the descriptor index of 'addr' in the next level table
+ */
+static int __init next_level_idx(u64 addr)
 {
-	/*
-	 * Remove references to kasan page tables from
-	 * swapper_pg_dir. pgd_clear() can't be used
-	 * here because it's nop on 2,3-level pagetable setups
-	 */
-	for (; start < end; start += PGDIR_SIZE)
-		set_pgd(pgd_offset_k(start), __pgd(0));
+	int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * (PAGE_SHIFT - 3);
+
+	return (addr >> (shift + PAGE_SHIFT)) % PTRS_PER_PTE;
+}
+
+/*
+ * Dereference the table descriptor at 'pgd_idx' and clear the entries from
+ * 'start' to 'end' (exclusive) from the table.
+ */
+static void __init clear_next_level(int pgd_idx, int start, int end)
+{
+	pgd_t pgd = READ_ONCE(swapper_pg_dir[pgd_idx]);
+	pud_t *pudp = (pud_t *)__phys_to_kimg(__pgd_to_phys(pgd));
+
+	memset(&pudp[start], 0, (end - start) * sizeof(pud_t));
+}
+
+static void __init clear_shadow(u64 start, u64 end)
+{
+	int l = root_level_idx(start), m = root_level_idx(end);
+
+	if (!root_level_aligned(start))
+		clear_next_level(l++, next_level_idx(start), PTRS_PER_PTE);
+	if (!root_level_aligned(end))
+		clear_next_level(m, 0, next_level_idx(end));
+	memset(&swapper_pg_dir[l], 0, (m - l) * sizeof(pgd_t));
 }
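The two index helpers above carve a VA into a root-level slot and a next-level slot. A toy rendering of the same math, assuming 4k pages and a 52-bit VA (so 5 levels and PTRS_PER_PTE = 512); the address constructed in main() is arbitrary and only serves to show which descriptor each helper selects:

#include <stdio.h>

#define PAGE_SHIFT	12
#define VA_BITS		52
#define LEVELS		((VA_BITS - 4) / (PAGE_SHIFT - 3))	/* 5 */
#define PAGE_OFFSET	(-(1ULL << VA_BITS))	/* _PAGE_OFFSET(52) */

static int root_level_idx(unsigned long long addr)
{
	int shift = (LEVELS - 1) * (PAGE_SHIFT - 3);	/* 36 */

	return (addr & ~PAGE_OFFSET) >> (shift + PAGE_SHIFT);
}

static int next_level_idx(unsigned long long addr)
{
	int shift = (LEVELS - 2) * (PAGE_SHIFT - 3);	/* 27 */

	return (addr >> (shift + PAGE_SHIFT)) % 512;	/* PTRS_PER_PTE */
}

int main(void)
{
	unsigned long long addr = PAGE_OFFSET + (5ULL << 48) + (3ULL << 39);

	/* descriptor 5 at the root, descriptor 3 one level down */
	printf("root=%d next=%d\n", root_level_idx(addr), next_level_idx(addr));
	return 0;
}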
 
 static void __init kasan_init_shadow(void)
 {
+	static pud_t pud[2][PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
 	u64 kimg_shadow_start, kimg_shadow_end;
 	u64 mod_shadow_start;
 	u64 vmalloc_shadow_end;
@@ -239,10 +321,23 @@ static void __init kasan_init_shadow(void)
 	 * setup will be finished.
 	 */
 	memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir));
+
+	/*
+	 * If the start or end address of the shadow region is not aligned to
+	 * the root level size, we have to allocate a temporary next-level table
+	 * in each case, clone the next level of descriptors, and install the
+	 * table into tmp_pg_dir. Note that with 5 levels of paging, the next
+	 * level will in fact be p4d_t, but that makes no difference in this
+	 * case.
+	 */
+	if (!root_level_aligned(KASAN_SHADOW_START))
+		clone_next_level(KASAN_SHADOW_START, tmp_pg_dir, pud[0]);
+	if (!root_level_aligned(KASAN_SHADOW_END))
+		clone_next_level(KASAN_SHADOW_END, tmp_pg_dir, pud[1]);
 	dsb(ishst);
-	cpu_replace_ttbr1(lm_alias(tmp_pg_dir), idmap_pg_dir);
+	cpu_replace_ttbr1(lm_alias(tmp_pg_dir));
 
-	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+	clear_shadow(KASAN_SHADOW_START, KASAN_SHADOW_END);
 
 	kasan_map_populate(kimg_shadow_start, kimg_shadow_end,
 			   early_pfn_to_nid(virt_to_pfn(lm_alias(KERNEL_START))));
@@ -276,7 +371,7 @@ static void __init kasan_init_shadow(void)
 				    PAGE_KERNEL_RO));
 
 	memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE);
-	cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir);
+	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 }
 
 static void __init kasan_init_depth(void)
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 645fe60d000f..642bdf908b22 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -73,6 +73,10 @@ static int __init adjust_protection_map(void)
 		protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY;
 	}
 
+	if (lpa2_is_enabled())
+		for (int i = 0; i < ARRAY_SIZE(protection_map); i++)
+			pgprot_val(protection_map[i]) &= ~PTE_SHARED;
+
 	return 0;
 }
 arch_initcall(adjust_protection_map);
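The adjust_protection_map() hunk exists because FEAT_LPA2 repurposes the shareability field: with TCR_ELx.DS set, descriptor bits [9:8] carry output address bits [51:50] instead of SH[1:0], so leaving PTE_SHARED set in the protection map would corrupt the physical address of any mapping above 2^50. A toy illustration of that bit overlap (the bit positions are architectural; the macro is local to this sketch):

#include <stdio.h>

#define PTE_SHARED	(3ULL << 8)	/* SH[1:0] == inner shareable */

int main(void)
{
	/* with TCR.DS=1, OA[51:50] land in descriptor bits [9:8] */
	unsigned long long pa = 3ULL << 50;		/* PA bits 51:50 set */
	unsigned long long desc_bits = (pa >> 42) & (3ULL << 8);

	printf("OA[51:50] occupy 0x%llx, colliding with PTE_SHARED? %d\n",
	       desc_bits, desc_bits == PTE_SHARED);	/* yes: same bits */
	return 0;
}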
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 1ac7467d34c9..bf5b1c426ad0 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -45,18 +45,13 @@
 #define NO_CONT_MAPPINGS	BIT(1)
 #define NO_EXEC_MAPPINGS	BIT(2)	/* assumes FEAT_HPDS is not used */
 
-int idmap_t0sz __ro_after_init;
-
-#if VA_BITS > 48
-u64 vabits_actual __ro_after_init = VA_BITS_MIN;
-EXPORT_SYMBOL(vabits_actual);
-#endif
-
 u64 kimage_voffset __ro_after_init;
 EXPORT_SYMBOL(kimage_voffset);
 
 u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };
 
+static bool rodata_is_rw __ro_after_init = true;
+
 /*
  * The booting CPU updates the failed status @__early_cpu_boot_status,
  * with MMU turned off.
@@ -73,10 +68,21 @@ EXPORT_SYMBOL(empty_zero_page);
 static DEFINE_SPINLOCK(swapper_pgdir_lock);
 static DEFINE_MUTEX(fixmap_lock);
 
-void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
+void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
 {
 	pgd_t *fixmap_pgdp;
 
+	/*
+	 * Don't bother with the fixmap if swapper_pg_dir is still mapped
+	 * writable in the kernel mapping.
+	 */
+	if (rodata_is_rw) {
+		WRITE_ONCE(*pgdp, pgd);
+		dsb(ishst);
+		isb();
+		return;
+	}
+
 	spin_lock(&swapper_pgdir_lock);
 	fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
 	WRITE_ONCE(*fixmap_pgdp, pgd);
@@ -307,15 +313,14 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 	} while (addr = next, addr != end);
 }
 
-static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
+static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 			   phys_addr_t phys, pgprot_t prot,
 			   phys_addr_t (*pgtable_alloc)(int),
 			   int flags)
 {
 	unsigned long next;
-	pud_t *pudp;
-	p4d_t *p4dp = p4d_offset(pgdp, addr);
 	p4d_t p4d = READ_ONCE(*p4dp);
+	pud_t *pudp;
 
 	if (p4d_none(p4d)) {
 		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
@@ -363,6 +368,46 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
 	pud_clear_fixmap();
 }
 
+static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
+			   phys_addr_t phys, pgprot_t prot,
+			   phys_addr_t (*pgtable_alloc)(int),
+			   int flags)
+{
+	unsigned long next;
+	pgd_t pgd = READ_ONCE(*pgdp);
+	p4d_t *p4dp;
+
+	if (pgd_none(pgd)) {
+		pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN;
+		phys_addr_t p4d_phys;
+
+		if (flags & NO_EXEC_MAPPINGS)
+			pgdval |= PGD_TABLE_PXN;
+		BUG_ON(!pgtable_alloc);
+		p4d_phys = pgtable_alloc(P4D_SHIFT);
+		__pgd_populate(pgdp, p4d_phys, pgdval);
+		pgd = READ_ONCE(*pgdp);
+	}
+	BUG_ON(pgd_bad(pgd));
+
+	p4dp = p4d_set_fixmap_offset(pgdp, addr);
+	do {
+		p4d_t old_p4d = READ_ONCE(*p4dp);
+
+		next = p4d_addr_end(addr, end);
+
+		alloc_init_pud(p4dp, addr, next, phys, prot,
+			       pgtable_alloc, flags);
+
+		BUG_ON(p4d_val(old_p4d) != 0 &&
+		       p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));
+
+		phys += next - addr;
+	} while (p4dp++, addr = next, addr != end);
+
+	p4d_clear_fixmap();
+}
+
 static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
 					unsigned long virt, phys_addr_t size,
 					pgprot_t prot,
@@ -385,7 +430,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
 
 	do {
 		next = pgd_addr_end(addr, end);
-		alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
+		alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
 			       flags);
 		phys += next - addr;
 	} while (pgdp++, addr = next, addr != end);
@@ -576,8 +621,12 @@ static void __init map_mem(pgd_t *pgdp)
 	 * entries at any level are being shared between the linear region and
 	 * the vmalloc region. Check whether this is true for the PGD level, in
 	 * which case it is guaranteed to be true for all other levels as well.
+	 * (Unless we are running with support for LPA2, in which case the
+	 * entire reduced VA space is covered by a single pgd_t which will have
+	 * been populated without the PXNTable attribute by the time we get here.)
 	 */
-	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
+	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) &&
+		     pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1);
 
 	early_kfence_pool = arm64_kfence_alloc_pool();
 
@@ -630,15 +679,16 @@ void mark_rodata_ro(void)
 	 * to cover NOTES and EXCEPTION_TABLE.
 	 */
 	section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
+	WRITE_ONCE(rodata_is_rw, false);
 	update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
 			    section_size, PAGE_KERNEL_RO);
 
 	debug_checkwx();
 }
 
-static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
-				      pgprot_t prot, struct vm_struct *vma,
-				      int flags, unsigned long vm_flags)
+static void __init declare_vma(struct vm_struct *vma,
+			       void *va_start, void *va_end,
+			       unsigned long vm_flags)
 {
 	phys_addr_t pa_start = __pa_symbol(va_start);
 	unsigned long size = va_end - va_start;
@@ -646,9 +696,6 @@ static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
 	BUG_ON(!PAGE_ALIGNED(pa_start));
 	BUG_ON(!PAGE_ALIGNED(size));
 
-	__create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot,
-			     early_pgtable_alloc, flags);
-
 	if (!(vm_flags & VM_NO_GUARD))
 		size += PAGE_SIZE;
 
@@ -661,12 +708,12 @@ static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
 	vm_area_add_early(vma);
 }
 
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 static pgprot_t kernel_exec_prot(void)
 {
 	return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 }
 
-#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 static int __init map_entry_trampoline(void)
 {
 	int i;
@@ -701,80 +748,36 @@ core_initcall(map_entry_trampoline);
 #endif
 
 /*
- * Open coded check for BTI, only for use to determine configuration
- * for early mappings for before the cpufeature code has run.
+ * Declare the VMA areas for the kernel
  */
-static bool arm64_early_this_cpu_has_bti(void)
+static void __init declare_kernel_vmas(void)
 {
-	u64 pfr1;
+	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
 
-	if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
-		return false;
-
-	pfr1 = __read_sysreg_by_encoding(SYS_ID_AA64PFR1_EL1);
-	return cpuid_feature_extract_unsigned_field(pfr1,
-						    ID_AA64PFR1_EL1_BT_SHIFT);
+	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
+	declare_vma(&vmlinux_seg[4], _data, _end, 0);
 }
 
-/*
- * Create fine-grained mappings for the kernel.
- */
-static void __init map_kernel(pgd_t *pgdp)
-{
-	static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
-				vmlinux_initdata, vmlinux_data;
+void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
+		    int level, pte_t *tbl, bool may_use_cont, u64 va_offset);
 
-	/*
-	 * External debuggers may need to write directly to the text
-	 * mapping to install SW breakpoints. Allow this (only) when
-	 * explicitly requested with rodata=off.
-	 */
-	pgprot_t text_prot = kernel_exec_prot();
-
-	/*
-	 * If we have a CPU that supports BTI and a kernel built for
-	 * BTI then mark the kernel executable text as guarded pages
-	 * now so we don't have to rewrite the page tables later.
-	 */
-	if (arm64_early_this_cpu_has_bti())
-		text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);
-
-	/*
-	 * Only rodata will be remapped with different permissions later on,
-	 * all other segments are allowed to use contiguous mappings.
-	 */
-	map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0,
-			   VM_NO_GUARD);
-	map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
-			   &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
-	map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
-			   &vmlinux_inittext, 0, VM_NO_GUARD);
-	map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
-			   &vmlinux_initdata, 0, VM_NO_GUARD);
-	map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
-
-	fixmap_copy(pgdp);
-	kasan_copy_shadow(pgdp);
-}
+static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
+	  kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
 
 static void __init create_idmap(void)
 {
 	u64 start = __pa_symbol(__idmap_text_start);
-	u64 size = __pa_symbol(__idmap_text_end) - start;
-	pgd_t *pgd = idmap_pg_dir;
-	u64 pgd_phys;
-
-	/* check if we need an additional level of translation */
-	if (VA_BITS < 48 && idmap_t0sz < (64 - VA_BITS_MIN)) {
-		pgd_phys = early_pgtable_alloc(PAGE_SHIFT);
-		set_pgd(&idmap_pg_dir[start >> VA_BITS],
-			__pgd(pgd_phys | P4D_TYPE_TABLE));
-		pgd = __va(pgd_phys);
-	}
-	__create_pgd_mapping(pgd, start, start, size, PAGE_KERNEL_ROX,
-			     early_pgtable_alloc, 0);
+	u64 end   = __pa_symbol(__idmap_text_end);
+	u64 ptep  = __pa_symbol(idmap_ptes);
+
+	__pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
+		       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
+		       __phys_to_virt(ptep) - ptep);
 
-	if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) {
+	if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) {
 		extern u32 __idmap_kpti_flag;
 		u64 pa = __pa_symbol(&__idmap_kpti_flag);
 
@@ -782,32 +785,21 @@ static void __init create_idmap(void)
 		 * The KPTI G-to-nG conversion code needs a read-write mapping
 		 * of its synchronization flag in the ID map.
 		 */
-		__create_pgd_mapping(pgd, pa, pa, sizeof(u32), PAGE_KERNEL,
-				     early_pgtable_alloc, 0);
+		ptep = __pa_symbol(kpti_ptes);
+		__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
+			       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
+			       __phys_to_virt(ptep) - ptep);
 	}
 }
 
 void __init paging_init(void)
 {
-	pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
-	extern pgd_t init_idmap_pg_dir[];
-
-	idmap_t0sz = 63UL - __fls(__pa_symbol(_end) | GENMASK(VA_BITS_MIN - 1, 0));
-
-	map_kernel(pgdp);
-	map_mem(pgdp);
-
-	pgd_clear_fixmap();
-
-	cpu_replace_ttbr1(lm_alias(swapper_pg_dir), init_idmap_pg_dir);
-	init_mm.pgd = swapper_pg_dir;
-
-	memblock_phys_free(__pa_symbol(init_pg_dir),
-			   __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
+	map_mem(swapper_pg_dir);
 
 	memblock_allow_resize();
 
 	create_idmap();
+	declare_kernel_vmas();
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -1073,10 +1065,10 @@ static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
 		free_empty_pmd_table(pudp, addr, next, floor, ceiling);
 	} while (addr = next, addr < end);
 
-	if (CONFIG_PGTABLE_LEVELS <= 3)
+	if (!pgtable_l4_enabled())
 		return;
 
-	if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
+	if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK))
 		return;
 
 	/*
@@ -1099,8 +1091,8 @@ static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
 				 unsigned long end, unsigned long floor,
 				 unsigned long ceiling)
 {
-	unsigned long next;
 	p4d_t *p4dp, p4d;
+	unsigned long i, next, start = addr;
 
 	do {
 		next = p4d_addr_end(addr, end);
@@ -1112,6 +1104,27 @@ static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
 		WARN_ON(!p4d_present(p4d));
 		free_empty_pud_table(p4dp, addr, next, floor, ceiling);
 	} while (addr = next, addr < end);
+
+	if (!pgtable_l5_enabled())
+		return;
+
+	if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
+		return;
+
+	/*
+	 * Check whether we can free the p4d page if the rest of the
+	 * entries are empty. Overlap with other regions have been
+	 * handled by the floor/ceiling check.
+	 */
+	p4dp = p4d_offset(pgdp, 0UL);
+	for (i = 0; i < PTRS_PER_P4D; i++) {
+		if (!p4d_none(READ_ONCE(p4dp[i])))
+			return;
+	}
+
+	pgd_clear(pgdp);
+	__flush_tlb_kernel_pgtable(start);
+	free_hotplug_pgtable_page(virt_to_page(p4dp));
 }
 
 static void free_empty_tables(unsigned long addr, unsigned long end,
@@ -1196,6 +1209,12 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 	return 1;
 }
 
+#ifndef __PAGETABLE_P4D_FOLDED
+void p4d_clear_huge(p4d_t *p4dp)
+{
+}
+#endif
+
 int pud_clear_huge(pud_t *pudp)
 {
 	if (!pud_sect(READ_ONCE(*pudp)))
@@ -1486,3 +1505,35 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte
 {
 	set_pte_at(vma->vm_mm, addr, ptep, pte);
 }
+
+/*
+ * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
+ * avoiding the possibility of conflicting TLB entries being allocated.
+ */
+void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
+{
+	typedef void (ttbr_replace_func)(phys_addr_t);
+	extern ttbr_replace_func idmap_cpu_replace_ttbr1;
+	ttbr_replace_func *replace_phys;
+	unsigned long daif;
+
+	/* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */
+	phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
+
+	if (cnp)
+		ttbr1 |= TTBR_CNP_BIT;
+
+	replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
+
+	cpu_install_idmap();
+
+	/*
+	 * We really don't want to take *any* exceptions while TTBR1 is
+	 * in the process of being replaced so mask everything.
+	 */
+	daif = local_daif_save();
+	replace_phys(ttbr1);
+	local_daif_restore(daif);
+
+	cpu_uninstall_idmap();
+}
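__cpu_replace_ttbr1() above now takes the CnP decision as a parameter, since the kasan code switches to a short-lived tmp_pg_dir and back. A plausible caller-side wrapper, shown only to illustrate the intended call pattern — this is a sketch, not the actual helper from asm/mmu_context.h, and the CnP policy encoded here is an assumption:

/* sketch of a wrapper; the real one lives in asm/mmu_context.h */
static inline void cpu_replace_ttbr1(pgd_t *pgdp)
{
	/*
	 * Assumed policy: only advertise the permanent kernel page tables
	 * as common-not-private; a transient pgd such as KASAN's
	 * tmp_pg_dir should not be shared between CPUs via CnP.
	 */
	bool cnp = system_supports_cnp() && pgdp == lm_alias(swapper_pg_dir);

	__cpu_replace_ttbr1(pgdp, cnp);
}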
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 4a64089e5771..0c501cabc238 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -17,11 +17,22 @@
 
 static struct kmem_cache *pgd_cache __ro_after_init;
 
+static bool pgdir_is_page_size(void)
+{
+	if (PGD_SIZE == PAGE_SIZE)
+		return true;
+	if (CONFIG_PGTABLE_LEVELS == 4)
+		return !pgtable_l4_enabled();
+	if (CONFIG_PGTABLE_LEVELS == 5)
+		return !pgtable_l5_enabled();
+	return false;
+}
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	gfp_t gfp = GFP_PGTABLE_USER;
 
-	if (PGD_SIZE == PAGE_SIZE)
+	if (pgdir_is_page_size())
 		return (pgd_t *)__get_free_page(gfp);
 	else
 		return kmem_cache_alloc(pgd_cache, gfp);
@@ -29,7 +40,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	if (PGD_SIZE == PAGE_SIZE)
+	if (pgdir_is_page_size())
 		free_page((unsigned long)pgd);
 	else
 		kmem_cache_free(pgd_cache, pgd);
@@ -37,7 +48,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 void __init pgtable_cache_init(void)
 {
-	if (PGD_SIZE == PAGE_SIZE)
+	if (pgdir_is_page_size())
 		return;
 
 #ifdef CONFIG_ARM64_PA_BITS_52
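pgdir_is_page_size() folds two concerns into one predicate: the build-time geometry (PGD_SIZE == PAGE_SIZE) and runtime level folding, where a kernel built for 4 or 5 levels boots with fewer, so the configured PGD_SIZE no longer describes what TTBR1 actually walks. A compact way to see the decision, with the runtime folding predicates stubbed out — the stub values model one scenario (a 5-level build on non-LPA2 hardware) and are assumptions for illustration:

#include <stdbool.h>
#include <stdio.h>

#define CONFIG_PGTABLE_LEVELS	5	/* e.g. a 4k-page, 52-bit VA build */

/* stubbed runtime state: LPA2 absent, 52-bit VA fell back to 48-bit */
static bool pgtable_l5_enabled(void) { return false; }
static bool pgtable_l4_enabled(void) { return true; }

static bool pgdir_is_page_size(bool pgd_size_is_page)
{
	if (pgd_size_is_page)
		return true;
	if (CONFIG_PGTABLE_LEVELS == 4)
		return !pgtable_l4_enabled();
	if (CONFIG_PGTABLE_LEVELS == 5)
		return !pgtable_l5_enabled();
	return false;
}

int main(void)
{
	/*
	 * With level 5 folded at runtime, the top level table must still be
	 * a full page, even though the build-time PGD_SIZE says otherwise.
	 */
	printf("page-sized pgd: %d\n", pgdir_is_page_size(false));	/* 1 */
	return 0;
}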
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index f66c37a1610e..bfd2ad896108 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -195,27 +195,36 @@ SYM_TYPED_FUNC_START(idmap_cpu_replace_ttbr1)
 	ret
 SYM_FUNC_END(idmap_cpu_replace_ttbr1)
+SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1)
 	.popsection
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 
-#define KPTI_NG_PTE_FLAGS	(PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS | PTE_WRITE)
+#define KPTI_NG_PTE_FLAGS	(PTE_ATTRINDX(MT_NORMAL) | PTE_TYPE_PAGE | \
+				 PTE_AF | PTE_SHARED | PTE_UXN | PTE_WRITE)
 
 	.pushsection ".idmap.text", "a"
 
+	.macro	pte_to_phys, phys, pte
+	and	\phys, \pte, #PTE_ADDR_LOW
+#ifdef CONFIG_ARM64_PA_BITS_52
+	and	\pte, \pte, #PTE_ADDR_HIGH
+	orr	\phys, \phys, \pte, lsl #PTE_ADDR_HIGH_SHIFT
+#endif
+	.endm
+
 	.macro	kpti_mk_tbl_ng, type, num_entries
 	add	end_\type\()p, cur_\type\()p, #\num_entries * 8
 .Ldo_\type:
-	ldr	\type, [cur_\type\()p]		// Load the entry
+	ldr	\type, [cur_\type\()p], #8	// Load the entry and advance
 	tbz	\type, #0, .Lnext_\type		// Skip invalid and
 	tbnz	\type, #11, .Lnext_\type	// non-global entries
 	orr	\type, \type, #PTE_NG		// Same bit for blocks and pages
-	str	\type, [cur_\type\()p]		// Update the entry
+	str	\type, [cur_\type\()p, #-8]	// Update the entry
 	.ifnc	\type, pte
 	tbnz	\type, #1, .Lderef_\type
 	.endif
.Lnext_\type:
-	add	cur_\type\()p, cur_\type\()p, #8
 	cmp	cur_\type\()p, end_\type\()p
 	b.ne	.Ldo_\type
 	.endm
@@ -225,18 +234,18 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
	 * fixmap slot associated with the current level.
 	 */
 	.macro	kpti_map_pgtbl, type, level
-	str	xzr, [temp_pte, #8 * (\level + 1)]	// break before make
+	str	xzr, [temp_pte, #8 * (\level + 2)]	// break before make
 	dsb	nshst
-	add	pte, temp_pte, #PAGE_SIZE * (\level + 1)
+	add	pte, temp_pte, #PAGE_SIZE * (\level + 2)
 	lsr	pte, pte, #12
 	tlbi	vaae1, pte
 	dsb	nsh
 	isb
 
 	phys_to_pte pte, cur_\type\()p
-	add	cur_\type\()p, temp_pte, #PAGE_SIZE * (\level + 1)
+	add	cur_\type\()p, temp_pte, #PAGE_SIZE * (\level + 2)
 	orr	pte, pte, pte_flags
-	str	pte, [temp_pte, #8 * (\level + 1)]
+	str	pte, [temp_pte, #8 * (\level + 2)]
 	dsb	nshst
 	.endm
 
@@ -269,6 +278,8 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings)
 	end_ptep	.req	x15
 	pte		.req	x16
 	valid		.req	x17
+	cur_p4dp	.req	x19
+	end_p4dp	.req	x20
 
 	mov	x5, x3				// preserve temp_pte arg
 	mrs	swapper_ttb, ttbr1_el1
@@ -276,6 +287,12 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings)
 
 	cbnz	cpu, __idmap_kpti_secondary
 
+#if CONFIG_PGTABLE_LEVELS > 4
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	stp	x19, x20, [sp, #16]
+#endif
+
 	/* We're the boot CPU. Wait for the others to catch up */
 	sevl
1:	wfe
@@ -293,9 +310,32 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings)
 	mov_q	pte_flags, KPTI_NG_PTE_FLAGS
 
 	/* Everybody is enjoying the idmap, so we can rewrite swapper. */
+
+#ifdef CONFIG_ARM64_LPA2
+	/*
+	 * If LPA2 support is configured, but 52-bit virtual addressing is not
+	 * enabled at runtime, we will fall back to one level of paging less,
+	 * and so we have to walk swapper_pg_dir as if we dereferenced its
+	 * address from a PGD level entry, and terminate the PGD level loop
+	 * right after.
+	 */
+	adrp	pgd, swapper_pg_dir	// walk &swapper_pg_dir at the next level
+	mov	cur_pgdp, end_pgdp	// must be equal to terminate the PGD loop
+alternative_if_not ARM64_HAS_VA52
+	b	.Lderef_pgd		// skip to the next level
+alternative_else_nop_endif
+	/*
+	 * LPA2 based 52-bit virtual addressing requires 52-bit physical
+	 * addressing to be enabled as well. In this case, the shareability
+	 * bits are repurposed as physical address bits, and should not be
+	 * set in pte_flags.
+	 */
+	bic	pte_flags, pte_flags, #PTE_SHARED
+#endif
+
 	/* PGD */
 	adrp	cur_pgdp, swapper_pg_dir
-	kpti_map_pgtbl	pgd, 0
+	kpti_map_pgtbl	pgd, -1
 	kpti_mk_tbl_ng	pgd, PTRS_PER_PGD
 
 	/* Ensure all the updated entries are visible to secondary CPUs */
@@ -308,16 +348,33 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings)
 
 	/* Set the flag to zero to indicate that we're all done */
 	str	wzr, [flag_ptr]
+#if CONFIG_PGTABLE_LEVELS > 4
+	ldp	x19, x20, [sp, #16]
+	ldp	x29, x30, [sp], #32
+#endif
 	ret
 
.Lderef_pgd:
+	/* P4D */
+	.if		CONFIG_PGTABLE_LEVELS > 4
+	p4d		.req	x30
+	pte_to_phys	cur_p4dp, pgd
+	kpti_map_pgtbl	p4d, 0
+	kpti_mk_tbl_ng	p4d, PTRS_PER_P4D
+	b		.Lnext_pgd
+	.else		/* CONFIG_PGTABLE_LEVELS <= 4 */
+	p4d		.req	pgd
+	.set		.Lnext_p4d, .Lnext_pgd
+	.endif
+
+.Lderef_p4d:
 	/* PUD */
 	.if		CONFIG_PGTABLE_LEVELS > 3
 	pud		.req	x10
-	pte_to_phys	cur_pudp, pgd
+	pte_to_phys	cur_pudp, p4d
 	kpti_map_pgtbl	pud, 1
 	kpti_mk_tbl_ng	pud, PTRS_PER_PUD
-	b		.Lnext_pgd
+	b		.Lnext_p4d
 	.else		/* CONFIG_PGTABLE_LEVELS <= 3 */
 	pud		.req	pgd
 	.set		.Lnext_pud, .Lnext_pgd
@@ -361,6 +418,9 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings)
 	.unreq	end_ptep
 	.unreq	pte
 	.unreq	valid
+	.unreq	cur_p4dp
+	.unreq	end_p4dp
+	.unreq	p4d
 
 	/* Secondary CPUs end up here */
__idmap_kpti_secondary:
@@ -395,8 +455,6 @@ SYM_FUNC_END(idmap_kpti_install_ng_mappings)
 *
 *	Initialise the processor for turning the MMU on.
 *
- * Input:
- *	x0 - actual number of VA bits (ignored unless VA_BITS > 48)
 * Output:
 *	Return in x0 the value of the SCTLR_EL1 register.
 */
@@ -420,20 +478,21 @@ SYM_FUNC_START(__cpu_setup)
 	mair	.req	x17
 	tcr	.req	x16
 	mov_q	mair, MAIR_EL1_SET
-	mov_q	tcr, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
-			TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
-			TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS
+	mov_q	tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \
+		     TCR_SMP_FLAGS | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
+		     TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS
 
 	tcr_clear_errata_bits tcr, x9, x5
 
 #ifdef CONFIG_ARM64_VA_BITS_52
-	sub		x9, xzr, x0
-	add		x9, x9, #64
+	mov		x9, #64 - VA_BITS
+alternative_if ARM64_HAS_VA52
 	tcr_set_t1sz	tcr, x9
-#else
-	idmap_get_t0sz	x9
+#ifdef CONFIG_ARM64_LPA2
+	orr		tcr, tcr, #TCR_DS
+#endif
+alternative_else_nop_endif
 #endif
-	tcr_set_t0sz	tcr, x9
 
 	/*
 	 * Set the IPS bits in TCR_EL1.
@@ -458,11 +517,26 @@ SYM_FUNC_START(__cpu_setup)
 	ubfx	x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4
 	cbz	x1, .Lskip_indirection
 
+	/*
+	 * The PROT_* macros describing the various memory types may resolve to
+	 * C expressions if they include the PTE_MAYBE_* macros, and so they
+	 * can only be used from C code. The PIE_E* constants below are also
+	 * defined in terms of those macros, but will mask out those
+	 * PTE_MAYBE_* constants, whether they are set or not. So #define them
+	 * as 0x0 here so we can evaluate the PIE_E* constants in asm context.
+	 */
+
+#define PTE_MAYBE_NG		0
+#define PTE_MAYBE_SHARED	0
+
 	mov_q	x0, PIE_E0
 	msr	REG_PIRE0_EL1, x0
 	mov_q	x0, PIE_E1
 	msr	REG_PIR_EL1, x0
 
+#undef PTE_MAYBE_NG
+#undef PTE_MAYBE_SHARED
+
 	mov	x0, TCR2_EL1x_PIE
 	msr	REG_TCR2_EL1, x0
 
@@ -472,6 +546,12 @@ SYM_FUNC_START(__cpu_setup)
 	 * Prepare SCTLR
 	 */
 	mov_q	x0, INIT_SCTLR_EL1_MMU_ON
+#ifdef CONFIG_ARM64_WXN
+	ldr_l	x1, arm64_sw_feature_override + FTR_OVR_VAL_OFFSET
+	tst	x1, #0xf << ARM64_SW_FEATURE_OVERRIDE_NOWXN
+	orr	x1, x0, #SCTLR_ELx_WXN
+	csel	x0, x0, x1, ne
+#endif
 	ret					// return to head.S
 
 	.unreq	mair
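The __cpu_setup changes are easier to follow as arithmetic: T0SZ now describes the fixed 48-bit ID map, T1SZ starts at the conservative VA_BITS_MIN, and only when the ARM64_HAS_VA52 capability is detected is T1SZ rewritten for the full VA_BITS (plus TCR.DS for LPA2 granules). A worked rendering of the field values — TxSZ = 64 - VAbits is architectural, IDMAP_VA_BITS = 48 is an assumption consistent with the kvm/mmu.c hunk earlier, and VA_BITS_MIN = 48 corresponds to a 4k-page 52-bit build:

#include <stdio.h>

int main(void)
{
	const unsigned int idmap_va_bits = 48;	/* IDMAP_VA_BITS */
	const unsigned int va_bits_min   = 48;	/* 4k pages, 52-bit build */
	const unsigned int va_bits       = 52;

	unsigned int t0sz = 64 - idmap_va_bits;	/* 16, fixed at boot */
	unsigned int t1sz = 64 - va_bits_min;	/* 16, conservative boot value */

	/* patched in at runtime only when ARM64_HAS_VA52 is detected */
	unsigned int t1sz_va52 = 64 - va_bits;	/* 12 */

	printf("T0SZ=%u T1SZ(boot)=%u T1SZ(VA52)=%u\n", t0sz, t1sz, t1sz_va52);
	return 0;
}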
"PMD" : "PGD", + .name = "PMD", .bits = pte_bits, .num = ARRAY_SIZE(pte_bits), }, { /* pte */ @@ -227,6 +228,11 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, static const char units[] = "KMGTPE"; u64 prot = 0; + /* check if the current level has been folded dynamically */ + if ((level == 1 && mm_p4d_folded(st->mm)) || + (level == 2 && mm_pud_folded(st->mm))) + level = 0; + if (level >= 0) prot = val & pg_level[level].mask; @@ -288,6 +294,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info) st = (struct pg_state){ .seq = s, .marker = info->markers, + .mm = info->mm, .level = -1, .ptdump = { .note_page = note_page, @@ -313,7 +320,6 @@ static void __init ptdump_initialize(void) static struct ptdump_info kernel_ptdump_info __ro_after_init = { .mm = &init_mm, - .base_addr = PAGE_OFFSET, }; void ptdump_check_wx(void) @@ -329,7 +335,7 @@ void ptdump_check_wx(void) .ptdump = { .note_page = note_page, .range = (struct ptdump_range[]) { - {PAGE_OFFSET, ~0UL}, + {_PAGE_OFFSET(vabits_actual), ~0UL}, {0, 0} } } @@ -370,6 +376,7 @@ static int __init ptdump_init(void) static struct addr_marker address_markers[ARRAY_SIZE(m)] __ro_after_init; kernel_ptdump_info.markers = memcpy(address_markers, m, sizeof(m)); + kernel_ptdump_info.base_addr = page_offset; ptdump_initialize(); ptdump_debugfs_register(&kernel_ptdump_info, "kernel_page_tables"); diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 63283550c8e8..46ae9252bc3f 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -51,6 +51,7 @@ HAS_STAGE2_FWB HAS_TCR2 HAS_TIDCP1 HAS_TLB_RANGE +HAS_VA52 HAS_VIRT_HOST_EXTN HAS_WFXT HW_DBM diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 508224a0e078..3fc1650a329e 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1573,16 +1573,16 @@ Enum 35:32 TGRAN16_2 0b0010 IMP 0b0011 52_BIT EndEnum -Enum 31:28 TGRAN4 +SignedEnum 31:28 TGRAN4 0b0000 IMP 0b0001 52_BIT 0b1111 NI EndEnum -Enum 27:24 TGRAN64 +SignedEnum 27:24 TGRAN64 0b0000 IMP 0b1111 NI EndEnum -Enum 23:20 TGRAN16 +UnsignedEnum 23:20 TGRAN16 0b0000 NI 0b0001 IMP 0b0010 52_BIT @@ -1730,7 +1730,7 @@ Enum 23:20 CCIDX 0b0000 32 0b0001 64 EndEnum -Enum 19:16 VARange +UnsignedEnum 19:16 VARange 0b0000 48 0b0001 52 EndEnum diff --git a/include/linux/mman.h b/include/linux/mman.h index dc7048824be8..ec5e7f606e43 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -124,6 +124,21 @@ static inline bool arch_validate_flags(unsigned long flags) #define arch_validate_flags arch_validate_flags #endif +#ifndef arch_validate_mmap_prot +/* + * This is called from mmap(), which ignores unknown prot bits so the default + * is to accept anything. + * + * Returns true if the prot flags are valid + */ +static inline bool arch_validate_mmap_prot(unsigned long prot, + unsigned long addr) +{ + return true; +} +#define arch_validate_mmap_prot arch_validate_mmap_prot +#endif + /* * Optimisation macro. It is equivalent to: * (x & bit1) ? bit2 : 0 diff --git a/mm/mmap.c b/mm/mmap.c index d89770eaab6b..977a8c3fd9f5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1229,6 +1229,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; + if (!arch_validate_mmap_prot(prot, addr)) + return -EACCES; + /* force arch specific MAP_FIXED handling in get_unmapped_area */ if (flags & MAP_FIXED_NOREPLACE) flags |= MAP_FIXED; |