Diffstat (limited to 'arch/s390')
85 files changed, 2631 insertions, 1434 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 7fd08755a1f9..933771b0b07a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -187,6 +187,7 @@ config S390 select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES + select HAVE_RETHOOK select HAVE_KVM select HAVE_LIVEPATCH select HAVE_MEMBLOCK_PHYS_MAP diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index d52c3e2e16bc..47a397da0498 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -35,7 +35,7 @@ endif CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char -obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o +obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o vmem.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o obj-y += version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 70418389414d..58ce701d6110 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -8,10 +8,36 @@ #ifndef __ASSEMBLY__ +struct machine_info { + unsigned char has_edat1 : 1; + unsigned char has_edat2 : 1; + unsigned char has_nx : 1; +}; + +struct vmlinux_info { + unsigned long default_lma; + unsigned long entry; + unsigned long image_size; /* does not include .bss */ + unsigned long bss_size; /* uncompressed image .bss size */ + unsigned long bootdata_off; + unsigned long bootdata_size; + unsigned long bootdata_preserved_off; + unsigned long bootdata_preserved_size; + unsigned long dynsym_start; + unsigned long rela_dyn_start; + unsigned long rela_dyn_end; + unsigned long amode31_size; + unsigned long init_mm_off; + unsigned long swapper_pg_dir_off; + unsigned long invalid_pg_dir_off; +}; + void startup_kernel(void); -unsigned long detect_memory(void); +unsigned long detect_memory(unsigned long *safe_addr); +void mem_detect_set_usable_limit(unsigned long limit); bool is_ipl_block_dump(void); void store_ipl_parmblock(void); +unsigned long read_ipl_report(unsigned long safe_addr); void setup_boot_command_line(void); void parse_boot_command_line(void); void verify_facilities(void); @@ -19,7 +45,12 @@ void print_missing_facilities(void); void sclp_early_setup_buffer(void); void print_pgm_check_info(void); unsigned long get_random_base(unsigned long safe_addr); +void setup_vmem(unsigned long asce_limit); +unsigned long vmem_estimate_memory_needs(unsigned long online_mem_total); void __printf(1, 2) decompressor_printk(const char *fmt, ...); +void error(char *m); + +extern struct machine_info machine; /* Symbols defined by linker scripts */ extern const char kernel_version[]; @@ -31,8 +62,13 @@ extern char __boot_data_start[], __boot_data_end[]; extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; extern char _decompressor_syms_start[], _decompressor_syms_end[]; extern char _stack_start[], _stack_end[]; +extern char _end[]; +extern unsigned char _compressed_start[]; +extern unsigned char _compressed_end[]; +extern struct vmlinux_info _vmlinux_info; +#define vmlinux _vmlinux_info -unsigned long read_ipl_report(unsigned long safe_offset); +#define __abs_lowcore_pa(x) (((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore)) #endif /* __ASSEMBLY__ */ #endif /* BOOT_BOOT_H */ diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c index b519a1f045d8..d762733a0753 100644 --- a/arch/s390/boot/decompressor.c +++ 
b/arch/s390/boot/decompressor.c @@ -11,6 +11,7 @@ #include <linux/string.h> #include <asm/page.h> #include "decompressor.h" +#include "boot.h" /* * gzip declarations diff --git a/arch/s390/boot/decompressor.h b/arch/s390/boot/decompressor.h index f75cc31a77dd..92b81d2ea35d 100644 --- a/arch/s390/boot/decompressor.h +++ b/arch/s390/boot/decompressor.h @@ -2,37 +2,11 @@ #ifndef BOOT_COMPRESSED_DECOMPRESSOR_H #define BOOT_COMPRESSED_DECOMPRESSOR_H -#include <linux/stddef.h> - #ifdef CONFIG_KERNEL_UNCOMPRESSED static inline void *decompress_kernel(void) { return NULL; } #else void *decompress_kernel(void); #endif unsigned long mem_safe_offset(void); -void error(char *m); - -struct vmlinux_info { - unsigned long default_lma; - void (*entry)(void); - unsigned long image_size; /* does not include .bss */ - unsigned long bss_size; /* uncompressed image .bss size */ - unsigned long bootdata_off; - unsigned long bootdata_size; - unsigned long bootdata_preserved_off; - unsigned long bootdata_preserved_size; - unsigned long dynsym_start; - unsigned long rela_dyn_start; - unsigned long rela_dyn_end; - unsigned long amode31_size; -}; - -/* Symbols defined by linker scripts */ -extern char _end[]; -extern unsigned char _compressed_start[]; -extern unsigned char _compressed_end[]; -extern char _vmlinux_info[]; - -#define vmlinux (*(struct vmlinux_info *)_vmlinux_info) #endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */ diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c index e8d74d4f62aa..3e3d846400b4 100644 --- a/arch/s390/boot/kaslr.c +++ b/arch/s390/boot/kaslr.c @@ -132,7 +132,7 @@ static unsigned long count_valid_kernel_positions(unsigned long kernel_size, unsigned long start, end, pos = 0; int i; - for_each_mem_detect_block(i, &start, &end) { + for_each_mem_detect_usable_block(i, &start, &end) { if (_min >= end) continue; if (start >= _max) @@ -153,7 +153,7 @@ static unsigned long position_to_address(unsigned long pos, unsigned long kernel unsigned long start, end; int i; - for_each_mem_detect_block(i, &start, &end) { + for_each_mem_detect_usable_block(i, &start, &end) { if (_min >= end) continue; if (start >= _max) @@ -172,26 +172,20 @@ static unsigned long position_to_address(unsigned long pos, unsigned long kernel unsigned long get_random_base(unsigned long safe_addr) { + unsigned long usable_total = get_mem_detect_usable_total(); unsigned long memory_limit = get_mem_detect_end(); unsigned long base_pos, max_pos, kernel_size; - unsigned long kasan_needs; int i; - memory_limit = min(memory_limit, ident_map_size); - /* * Avoid putting kernel in the end of physical memory - * which kasan will use for shadow memory and early pgtable - * mapping allocations. + * which vmem and kasan code will use for shadow memory and + * pgtable mapping allocations. 
*/ - memory_limit -= kasan_estimate_memory_needs(memory_limit); + memory_limit -= kasan_estimate_memory_needs(usable_total); + memory_limit -= vmem_estimate_memory_needs(usable_total); - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_data.start && initrd_data.size) { - if (safe_addr < initrd_data.start + initrd_data.size) - safe_addr = initrd_data.start + initrd_data.size; - } safe_addr = ALIGN(safe_addr, THREAD_SIZE); - kernel_size = vmlinux.image_size + vmlinux.bss_size; if (safe_addr + kernel_size > memory_limit) return 0; diff --git a/arch/s390/boot/mem_detect.c b/arch/s390/boot/mem_detect.c index 7fa1a32ea0f3..35f4ba11f7fd 100644 --- a/arch/s390/boot/mem_detect.c +++ b/arch/s390/boot/mem_detect.c @@ -16,29 +16,10 @@ struct mem_detect_info __bootdata(mem_detect); #define ENTRIES_EXTENDED_MAX \ (256 * (1020 / 2) * sizeof(struct mem_detect_block)) -/* - * To avoid corrupting old kernel memory during dump, find lowest memory - * chunk possible either right after the kernel end (decompressed kernel) or - * after initrd (if it is present and there is no hole between the kernel end - * and initrd) - */ -static void *mem_detect_alloc_extended(void) -{ - unsigned long offset = ALIGN(mem_safe_offset(), sizeof(u64)); - - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_data.start && initrd_data.size && - initrd_data.start < offset + ENTRIES_EXTENDED_MAX) - offset = ALIGN(initrd_data.start + initrd_data.size, sizeof(u64)); - - return (void *)offset; -} - static struct mem_detect_block *__get_mem_detect_block_ptr(u32 n) { if (n < MEM_INLINED_ENTRIES) return &mem_detect.entries[n]; - if (unlikely(!mem_detect.entries_extended)) - mem_detect.entries_extended = mem_detect_alloc_extended(); return &mem_detect.entries_extended[n - MEM_INLINED_ENTRIES]; } @@ -147,7 +128,7 @@ static int tprot(unsigned long addr) return rc; } -static void search_mem_end(void) +static unsigned long search_mem_end(void) { unsigned long range = 1 << (MAX_PHYSMEM_BITS - 20); /* in 1MB blocks */ unsigned long offset = 0; @@ -159,33 +140,52 @@ static void search_mem_end(void) if (!tprot(pivot << 20)) offset = pivot; } - - add_mem_detect_block(0, (offset + 1) << 20); + return (offset + 1) << 20; } -unsigned long detect_memory(void) +unsigned long detect_memory(unsigned long *safe_addr) { - unsigned long max_physmem_end; + unsigned long max_physmem_end = 0; sclp_early_get_memsize(&max_physmem_end); + mem_detect.entries_extended = (struct mem_detect_block *)ALIGN(*safe_addr, sizeof(u64)); if (!sclp_early_read_storage_info()) { mem_detect.info_source = MEM_DETECT_SCLP_STOR_INFO; - return max_physmem_end; - } - - if (!diag260()) { + } else if (!diag260()) { mem_detect.info_source = MEM_DETECT_DIAG260; - return max_physmem_end; - } - - if (max_physmem_end) { + max_physmem_end = max_physmem_end ?: get_mem_detect_end(); + } else if (max_physmem_end) { add_mem_detect_block(0, max_physmem_end); mem_detect.info_source = MEM_DETECT_SCLP_READ_INFO; - return max_physmem_end; + } else { + max_physmem_end = search_mem_end(); + add_mem_detect_block(0, max_physmem_end); + mem_detect.info_source = MEM_DETECT_BIN_SEARCH; } - search_mem_end(); - mem_detect.info_source = MEM_DETECT_BIN_SEARCH; - return get_mem_detect_end(); + if (mem_detect.count > MEM_INLINED_ENTRIES) { + *safe_addr += (mem_detect.count - MEM_INLINED_ENTRIES) * + sizeof(struct mem_detect_block); + } + + return max_physmem_end; +} + +void mem_detect_set_usable_limit(unsigned long limit) +{ + struct mem_detect_block *block; + int i; + + /* make sure mem_detect.usable ends up within 
online memory block */ + for (i = 0; i < mem_detect.count; i++) { + block = __get_mem_detect_block_ptr(i); + if (block->start >= limit) + break; + if (block->end >= limit) { + mem_detect.usable = limit; + break; + } + mem_detect.usable = block->end; + } } diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 47ca3264c023..11413f0baabc 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -3,6 +3,7 @@ #include <linux/elf.h> #include <asm/boot_data.h> #include <asm/sections.h> +#include <asm/maccess.h> #include <asm/cpu_mf.h> #include <asm/setup.h> #include <asm/kasan.h> @@ -11,6 +12,7 @@ #include <asm/diag.h> #include <asm/uv.h> #include <asm/abs_lowcore.h> +#include <asm/mem_detect.h> #include "decompressor.h" #include "boot.h" #include "uv.h" @@ -18,6 +20,7 @@ unsigned long __bootdata_preserved(__kaslr_offset); unsigned long __bootdata_preserved(__abs_lowcore); unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); unsigned long __bootdata(__amode31_base); unsigned long __bootdata_preserved(VMALLOC_START); unsigned long __bootdata_preserved(VMALLOC_END); @@ -33,6 +36,8 @@ u64 __bootdata_preserved(stfle_fac_list[16]); u64 __bootdata_preserved(alt_stfle_fac_list[16]); struct oldmem_data __bootdata_preserved(oldmem_data); +struct machine_info machine; + void error(char *x) { sclp_early_printk("\n\n"); @@ -42,6 +47,20 @@ void error(char *x) disabled_wait(); } +static void detect_facilities(void) +{ + if (test_facility(8)) { + machine.has_edat1 = 1; + __ctl_set_bit(0, 23); + } + if (test_facility(78)) + machine.has_edat2 = 1; + if (!noexec_disabled && test_facility(130)) { + machine.has_nx = 1; + __ctl_set_bit(0, 20); + } +} + static void setup_lpp(void) { S390_lowcore.current_pid = 0; @@ -57,16 +76,17 @@ unsigned long mem_safe_offset(void) } #endif -static void rescue_initrd(unsigned long addr) +static unsigned long rescue_initrd(unsigned long safe_addr) { if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD)) - return; + return safe_addr; if (!initrd_data.start || !initrd_data.size) - return; - if (addr <= initrd_data.start) - return; - memmove((void *)addr, (void *)initrd_data.start, initrd_data.size); - initrd_data.start = addr; + return safe_addr; + if (initrd_data.start < safe_addr) { + memmove((void *)safe_addr, (void *)initrd_data.start, initrd_data.size); + initrd_data.start = safe_addr; + } + return initrd_data.start + initrd_data.size; } static void copy_bootdata(void) @@ -150,9 +170,10 @@ static void setup_ident_map_size(unsigned long max_physmem_end) #endif } -static void setup_kernel_memory_layout(void) +static unsigned long setup_kernel_memory_layout(void) { unsigned long vmemmap_start; + unsigned long asce_limit; unsigned long rte_size; unsigned long pages; unsigned long vmax; @@ -167,10 +188,10 @@ static void setup_kernel_memory_layout(void) vmalloc_size > _REGION2_SIZE || vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN > _REGION2_SIZE) { - vmax = _REGION1_SIZE; + asce_limit = _REGION1_SIZE; rte_size = _REGION2_SIZE; } else { - vmax = _REGION2_SIZE; + asce_limit = _REGION2_SIZE; rte_size = _REGION3_SIZE; } /* @@ -178,7 +199,7 @@ static void setup_kernel_memory_layout(void) * secure storage limit, so that any vmalloc allocation * we do could be used to back secure guest storage. 
*/ - vmax = adjust_to_uv_max(vmax); + vmax = adjust_to_uv_max(asce_limit); #ifdef CONFIG_KASAN /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); @@ -207,6 +228,8 @@ static void setup_kernel_memory_layout(void) /* make sure vmemmap doesn't overlay with vmalloc area */ VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); vmemmap = (struct page *)vmemmap_start; + + return asce_limit; } /* @@ -240,19 +263,25 @@ static void offset_vmlinux_info(unsigned long offset) vmlinux.rela_dyn_start += offset; vmlinux.rela_dyn_end += offset; vmlinux.dynsym_start += offset; + vmlinux.init_mm_off += offset; + vmlinux.swapper_pg_dir_off += offset; + vmlinux.invalid_pg_dir_off += offset; } static unsigned long reserve_amode31(unsigned long safe_addr) { __amode31_base = PAGE_ALIGN(safe_addr); - return safe_addr + vmlinux.amode31_size; + return __amode31_base + vmlinux.amode31_size; } void startup_kernel(void) { + unsigned long max_physmem_end; unsigned long random_lma; unsigned long safe_addr; + unsigned long asce_limit; void *img; + psw_t psw; initrd_data.start = parmarea.initrd_start; initrd_data.size = parmarea.initrd_size; @@ -265,14 +294,17 @@ void startup_kernel(void) safe_addr = reserve_amode31(safe_addr); safe_addr = read_ipl_report(safe_addr); uv_query_info(); - rescue_initrd(safe_addr); + safe_addr = rescue_initrd(safe_addr); sclp_early_read_info(); setup_boot_command_line(); parse_boot_command_line(); + detect_facilities(); sanitize_prot_virt_host(); - setup_ident_map_size(detect_memory()); + max_physmem_end = detect_memory(&safe_addr); + setup_ident_map_size(max_physmem_end); setup_vmalloc_size(); - setup_kernel_memory_layout(); + asce_limit = setup_kernel_memory_layout(); + mem_detect_set_usable_limit(ident_map_size); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) { random_lma = get_random_base(safe_addr); @@ -289,9 +321,23 @@ void startup_kernel(void) } else if (__kaslr_offset) memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size); + /* + * The order of the following operations is important: + * + * - handle_relocs() must follow clear_bss_section() to establish static + * memory references to data in .bss to be used by setup_vmem() + * (i.e init_mm.pgd) + * + * - setup_vmem() must follow handle_relocs() to be able using + * static memory references to data in .bss (i.e init_mm.pgd) + * + * - copy_bootdata() must follow setup_vmem() to propagate changes to + * bootdata made by setup_vmem() + */ clear_bss_section(); - copy_bootdata(); handle_relocs(__kaslr_offset); + setup_vmem(asce_limit); + copy_bootdata(); if (__kaslr_offset) { /* @@ -303,5 +349,11 @@ void startup_kernel(void) if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) memset(img, 0, vmlinux.image_size); } - vmlinux.entry(); + + /* + * Jump to the decompressed kernel entry point and switch DAT mode on. 
+ */ + psw.addr = vmlinux.entry; + psw.mask = PSW_KERNEL_BITS; + __load_psw(psw); } diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c new file mode 100644 index 000000000000..4d1d0d8e99cb --- /dev/null +++ b/arch/s390/boot/vmem.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sched/task.h> +#include <linux/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/facility.h> +#include <asm/sections.h> +#include <asm/mem_detect.h> +#include <asm/maccess.h> +#include <asm/abs_lowcore.h> +#include "decompressor.h" +#include "boot.h" + +#define init_mm (*(struct mm_struct *)vmlinux.init_mm_off) +#define swapper_pg_dir vmlinux.swapper_pg_dir_off +#define invalid_pg_dir vmlinux.invalid_pg_dir_off + +/* + * Mimic virt_to_kpte() in lack of init_mm symbol. Skip pmd NULL check though. + */ +static inline pte_t *__virt_to_kpte(unsigned long va) +{ + return pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va), va); +} + +unsigned long __bootdata_preserved(s390_invalid_asce); +unsigned long __bootdata(pgalloc_pos); +unsigned long __bootdata(pgalloc_end); +unsigned long __bootdata(pgalloc_low); + +enum populate_mode { + POPULATE_NONE, + POPULATE_ONE2ONE, + POPULATE_ABS_LOWCORE, +}; + +static void boot_check_oom(void) +{ + if (pgalloc_pos < pgalloc_low) + error("out of memory on boot\n"); +} + +static void pgtable_populate_init(void) +{ + unsigned long initrd_end; + unsigned long kernel_end; + + kernel_end = vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size; + pgalloc_low = round_up(kernel_end, PAGE_SIZE); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { + initrd_end = round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); + pgalloc_low = max(pgalloc_low, initrd_end); + } + + pgalloc_end = round_down(get_mem_detect_end(), PAGE_SIZE); + pgalloc_pos = pgalloc_end; + + boot_check_oom(); +} + +static void *boot_alloc_pages(unsigned int order) +{ + unsigned long size = PAGE_SIZE << order; + + pgalloc_pos -= size; + pgalloc_pos = round_down(pgalloc_pos, size); + + boot_check_oom(); + + return (void *)pgalloc_pos; +} + +static void *boot_crst_alloc(unsigned long val) +{ + unsigned long *table; + + table = boot_alloc_pages(CRST_ALLOC_ORDER); + if (table) + crst_table_init(table, val); + return table; +} + +static pte_t *boot_pte_alloc(void) +{ + static void *pte_leftover; + pte_t *pte; + + BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); + + if (!pte_leftover) { + pte_leftover = boot_alloc_pages(0); + pte = pte_leftover + _PAGE_TABLE_SIZE; + } else { + pte = pte_leftover; + pte_leftover = NULL; + } + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +static unsigned long _pa(unsigned long addr, enum populate_mode mode) +{ + switch (mode) { + case POPULATE_NONE: + return -1; + case POPULATE_ONE2ONE: + return addr; + case POPULATE_ABS_LOWCORE: + return __abs_lowcore_pa(addr); + default: + return -1; + } +} + +static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat2 && + IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE; +} + +static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end) +{ + return machine.has_edat1 && + IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE; +} + +static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next; + pte_t *pte, entry; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (pte_none(*pte)) { + 
entry = __pte(_pa(addr, mode)); + entry = set_pte_bit(entry, PAGE_KERNEL_EXEC); + set_pte(pte, entry); + } + } +} + +static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next; + pmd_t *pmd, entry; + pte_t *pte; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) { + if (can_large_pmd(pmd, addr, next)) { + entry = __pmd(_pa(addr, mode)); + entry = set_pmd_bit(entry, SEGMENT_KERNEL_EXEC); + set_pmd(pmd, entry); + continue; + } + pte = boot_pte_alloc(); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + continue; + } + pgtable_pte_populate(pmd, addr, next, mode); + } +} + +static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next; + pud_t *pud, entry; + pmd_t *pmd; + + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (pud_none(*pud)) { + if (can_large_pud(pud, addr, next)) { + entry = __pud(_pa(addr, mode)); + entry = set_pud_bit(entry, REGION3_KERNEL_EXEC); + set_pud(pud, entry); + continue; + } + pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY); + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { + continue; + } + pgtable_pmd_populate(pud, addr, next, mode); + } +} + +static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + unsigned long next; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) { + pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY); + p4d_populate(&init_mm, p4d, pud); + } + pgtable_pud_populate(p4d, addr, next, mode); + } +} + +static void pgtable_populate(unsigned long addr, unsigned long end, enum populate_mode mode) +{ + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + + pgd = pgd_offset(&init_mm, addr); + for (; addr < end; addr = next, pgd++) { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgd)) { + p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY); + pgd_populate(&init_mm, pgd, p4d); + } + pgtable_p4d_populate(pgd, addr, next, mode); + } +} + +void setup_vmem(unsigned long asce_limit) +{ + unsigned long start, end; + unsigned long asce_type; + unsigned long asce_bits; + int i; + + if (asce_limit == _REGION1_SIZE) { + asce_type = _REGION2_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + asce_type = _REGION3_ENTRY_EMPTY; + asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } + s390_invalid_asce = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + + crst_table_init((unsigned long *)swapper_pg_dir, asce_type); + crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); + + /* + * To allow prefixing the lowcore must be mapped with 4KB pages. + * To prevent creation of a large page at address 0 first map + * the lowcore and create the identity mapping only afterwards. 
+ */ + pgtable_populate_init(); + pgtable_populate(0, sizeof(struct lowcore), POPULATE_ONE2ONE); + for_each_mem_detect_usable_block(i, &start, &end) + pgtable_populate(start, end, POPULATE_ONE2ONE); + pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore), + POPULATE_ABS_LOWCORE); + pgtable_populate(__memcpy_real_area, __memcpy_real_area + PAGE_SIZE, + POPULATE_NONE); + memcpy_real_ptep = __virt_to_kpte(__memcpy_real_area); + + S390_lowcore.kernel_asce = swapper_pg_dir | asce_bits; + S390_lowcore.user_asce = s390_invalid_asce; + + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + __ctl_load(S390_lowcore.user_asce, 7, 7); + __ctl_load(S390_lowcore.kernel_asce, 13, 13); + + init_mm.context.asce = S390_lowcore.kernel_asce; +} + +unsigned long vmem_estimate_memory_needs(unsigned long online_mem_total) +{ + unsigned long pages = DIV_ROUND_UP(online_mem_total, PAGE_SIZE); + + return DIV_ROUND_UP(pages, _PAGE_ENTRIES) * _PAGE_TABLE_SIZE * 2; +} diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 526c3f40f6a2..c773820e4af9 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -398,10 +398,6 @@ static int xts_aes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, if (err) return err; - /* In fips mode only 128 bit or 256 bit keys are valid */ - if (fips_enabled && key_len != 32 && key_len != 64) - return -EINVAL; - /* Pick the correct function code based on the key length */ fc = (key_len == 32) ? CPACF_KM_XTS_128 : (key_len == 64) ? CPACF_KM_XTS_256 : 0; diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c index 1f2d40993c4d..a8a2407381af 100644 --- a/arch/s390/crypto/arch_random.c +++ b/arch/s390/crypto/arch_random.c @@ -10,6 +10,7 @@ #include <linux/atomic.h> #include <linux/random.h> #include <linux/static_key.h> +#include <asm/archrandom.h> #include <asm/cpacf.h> DEFINE_STATIC_KEY_FALSE(s390_arch_random_available); diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index a279b7d23a5e..29dc827e0fe8 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -474,7 +474,7 @@ static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key, return rc; /* - * xts_check_key verifies the key length is not odd and makes + * xts_verify_key verifies the key length is not odd and makes * sure that the two keys are not the same. 
This can be done * on the two protected keys as well */ diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h index 4c61b14ee928..6f264b79e377 100644 --- a/arch/s390/include/asm/abs_lowcore.h +++ b/arch/s390/include/asm/abs_lowcore.h @@ -7,11 +7,21 @@ #define ABS_LOWCORE_MAP_SIZE (NR_CPUS * sizeof(struct lowcore)) extern unsigned long __abs_lowcore; -extern bool abs_lowcore_mapped; -struct lowcore *get_abs_lowcore(unsigned long *flags); -void put_abs_lowcore(struct lowcore *lc, unsigned long flags); int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc); void abs_lowcore_unmap(int cpu); +static inline struct lowcore *get_abs_lowcore(void) +{ + int cpu; + + cpu = get_cpu(); + return ((struct lowcore *)__abs_lowcore) + cpu; +} + +static inline void put_abs_lowcore(struct lowcore *lc) +{ + put_cpu(); +} + #endif /* _ASM_S390_ABS_LOWCORE_H */ diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index f508f5025e38..57a2d6518d27 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -239,7 +239,10 @@ static inline struct ap_queue_status ap_aqic(ap_qid_t qid, union { unsigned long value; struct ap_qirq_ctrl qirqctrl; - struct ap_queue_status status; + struct { + u32 _pad; + struct ap_queue_status status; + }; } reg1; unsigned long reg2 = pa_ind; @@ -253,7 +256,7 @@ static inline struct ap_queue_status ap_aqic(ap_qid_t qid, " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ : [reg1] "+&d" (reg1) : [reg0] "d" (reg0), [reg2] "d" (reg2) - : "cc", "0", "1", "2"); + : "cc", "memory", "0", "1", "2"); return reg1.status; } @@ -290,7 +293,10 @@ static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit, unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22); union { unsigned long value; - struct ap_queue_status status; + struct { + u32 _pad; + struct ap_queue_status status; + }; } reg1; unsigned long reg2; diff --git a/arch/s390/include/asm/asm-extable.h b/arch/s390/include/asm/asm-extable.h index b74f1070ddb2..55a02a153dfc 100644 --- a/arch/s390/include/asm/asm-extable.h +++ b/arch/s390/include/asm/asm-extable.h @@ -12,6 +12,7 @@ #define EX_TYPE_UA_STORE 3 #define EX_TYPE_UA_LOAD_MEM 4 #define EX_TYPE_UA_LOAD_REG 5 +#define EX_TYPE_UA_LOAD_REGPAIR 6 #define EX_DATA_REG_ERR_SHIFT 0 #define EX_DATA_REG_ERR GENMASK(3, 0) @@ -85,4 +86,7 @@ #define EX_TABLE_UA_LOAD_REG(_fault, _target, _regerr, _regzero) \ __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REG, _regerr, _regzero, 0) +#define EX_TABLE_UA_LOAD_REGPAIR(_fault, _target, _regerr, _regzero) \ + __EX_TABLE_UA(__ex_table, _fault, _target, EX_TYPE_UA_LOAD_REGPAIR, _regerr, _regzero, 0) + #endif /* __ASM_EXTABLE_H */ diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index bd1596810cc1..91d261751d25 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -15,6 +15,7 @@ #include <asm/fcx.h> #include <asm/irq.h> #include <asm/schid.h> +#include <linux/mutex.h> /* structs from asm/cio.h */ struct irb; @@ -87,6 +88,7 @@ struct ccw_device { spinlock_t *ccwlock; /* private: */ struct ccw_device_private *private; /* cio private information */ + struct mutex reg_mutex; /* public: */ struct ccw_device_id id; struct ccw_driver *drv; diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h index 84c3f0d576c5..3f26416c2ad8 100644 --- a/arch/s390/include/asm/cmpxchg.h +++ b/arch/s390/include/asm/cmpxchg.h @@ -88,67 +88,90 @@ static __always_inline unsigned long __cmpxchg(unsigned long 
address, unsigned long old, unsigned long new, int size) { - unsigned long prev, tmp; - int shift; - switch (size) { - case 1: + case 1: { + unsigned int prev, shift, mask; + shift = (3 ^ (address & 3)) << 3; address ^= address & 3; + old = (old & 0xff) << shift; + new = (new & 0xff) << shift; + mask = ~(0xff << shift); asm volatile( - " l %0,%2\n" - "0: nr %0,%5\n" - " lr %1,%0\n" - " or %0,%3\n" - " or %1,%4\n" - " cs %0,%1,%2\n" - " jnl 1f\n" - " xr %1,%0\n" - " nr %1,%5\n" - " jnz 0b\n" + " l %[prev],%[address]\n" + " nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "0: lr %[tmp],%[prev]\n" + " cs %[prev],%[new],%[address]\n" + " jnl 1f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jz 0b\n" "1:" - : "=&d" (prev), "=&d" (tmp), "+Q" (*(int *) address) - : "d" ((old & 0xff) << shift), - "d" ((new & 0xff) << shift), - "d" (~(0xff << shift)) - : "memory", "cc"); + : [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (old), + [new] "+&d" (new), + [mask] "+&d" (mask) + :: "memory", "cc"); return prev >> shift; - case 2: + } + case 2: { + unsigned int prev, shift, mask; + shift = (2 ^ (address & 2)) << 3; address ^= address & 2; + old = (old & 0xffff) << shift; + new = (new & 0xffff) << shift; + mask = ~(0xffff << shift); asm volatile( - " l %0,%2\n" - "0: nr %0,%5\n" - " lr %1,%0\n" - " or %0,%3\n" - " or %1,%4\n" - " cs %0,%1,%2\n" - " jnl 1f\n" - " xr %1,%0\n" - " nr %1,%5\n" - " jnz 0b\n" + " l %[prev],%[address]\n" + " nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "0: lr %[tmp],%[prev]\n" + " cs %[prev],%[new],%[address]\n" + " jnl 1f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jz 0b\n" "1:" - : "=&d" (prev), "=&d" (tmp), "+Q" (*(int *) address) - : "d" ((old & 0xffff) << shift), - "d" ((new & 0xffff) << shift), - "d" (~(0xffff << shift)) - : "memory", "cc"); + : [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (old), + [new] "+&d" (new), + [mask] "+&d" (mask) + :: "memory", "cc"); return prev >> shift; - case 4: + } + case 4: { + unsigned int prev = old; + asm volatile( - " cs %0,%3,%1\n" - : "=&d" (prev), "+Q" (*(int *) address) - : "0" (old), "d" (new) + " cs %[prev],%[new],%[address]\n" + : [prev] "+&d" (prev), + [address] "+Q" (*(int *)address) + : [new] "d" (new) : "memory", "cc"); return prev; - case 8: + } + case 8: { + unsigned long prev = old; + asm volatile( - " csg %0,%3,%1\n" - : "=&d" (prev), "+QS" (*(long *) address) - : "0" (old), "d" (new) + " csg %[prev],%[new],%[address]\n" + : [prev] "+&d" (prev), + [address] "+QS" (*(long *)address) + : [new] "d" (new) : "memory", "cc"); return prev; } + } __cmpxchg_called_with_bad_pointer(); return old; } diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h deleted file mode 100644 index f87a4788c19c..000000000000 --- a/arch/s390/include/asm/cpu_mcf.h +++ /dev/null @@ -1,112 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Counter facility support definitions for the Linux perf - * - * Copyright IBM Corp. 
2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - */ -#ifndef _ASM_S390_CPU_MCF_H -#define _ASM_S390_CPU_MCF_H - -#include <linux/perf_event.h> -#include <asm/cpu_mf.h> - -enum cpumf_ctr_set { - CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ - CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ - CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ - CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ - CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ - - /* Maximum number of counter sets */ - CPUMF_CTR_SET_MAX, -}; - -#define CPUMF_LCCTL_ENABLE_SHIFT 16 -#define CPUMF_LCCTL_ACTCTL_SHIFT 0 - -static inline void ctr_set_enable(u64 *state, u64 ctrsets) -{ - *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; -} - -static inline void ctr_set_disable(u64 *state, u64 ctrsets) -{ - *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); -} - -static inline void ctr_set_start(u64 *state, u64 ctrsets) -{ - *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; -} - -static inline void ctr_set_stop(u64 *state, u64 ctrsets) -{ - *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); -} - -static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest) -{ - switch (set) { - case CPUMF_CTR_SET_BASIC: - return stcctm(BASIC, range, dest); - case CPUMF_CTR_SET_USER: - return stcctm(PROBLEM_STATE, range, dest); - case CPUMF_CTR_SET_CRYPTO: - return stcctm(CRYPTO_ACTIVITY, range, dest); - case CPUMF_CTR_SET_EXT: - return stcctm(EXTENDED, range, dest); - case CPUMF_CTR_SET_MT_DIAG: - return stcctm(MT_DIAG_CLEARING, range, dest); - case CPUMF_CTR_SET_MAX: - return 3; - } - return 3; -} - -struct cpu_cf_events { - struct cpumf_ctr_info info; - atomic_t ctr_set[CPUMF_CTR_SET_MAX]; - atomic64_t alert; - u64 state; /* For perf_event_open SVC */ - u64 dev_state; /* For /dev/hwctr */ - unsigned int flags; - size_t used; /* Bytes used in data */ - size_t usedss; /* Bytes used in start/stop */ - unsigned char start[PAGE_SIZE]; /* Counter set at event add */ - unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */ - unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */ - unsigned int sets; /* # Counter set saved in memory */ -}; -DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events); - -bool kernel_cpumcf_avail(void); -int __kernel_cpumcf_begin(void); -unsigned long kernel_cpumcf_alert(int clear); -void __kernel_cpumcf_end(void); - -static inline int kernel_cpumcf_begin(void) -{ - if (!cpum_cf_avail()) - return -ENODEV; - - preempt_disable(); - return __kernel_cpumcf_begin(); -} -static inline void kernel_cpumcf_end(void) -{ - __kernel_cpumcf_end(); - preempt_enable(); -} - -/* Return true if store counter set multiple instruction is available */ -static inline int stccm_avail(void) -{ - return test_facility(142); -} - -size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, - struct cpumf_ctr_info *info); -int cfset_online_cpu(unsigned int cpu); -int cfset_offline_cpu(unsigned int cpu); -#endif /* _ASM_S390_CPU_MCF_H */ diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index efa103b52a1a..7e417d7de568 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -42,7 +42,6 @@ static inline int cpum_sf_avail(void) return test_facility(40) && test_facility(68); } - struct cpumf_ctr_info { u16 cfvn; u16 auth_ctl; @@ -275,56 +274,4 @@ static inline int lsctl(struct hws_lsctl_request_block *req) return cc ? 
-EINVAL : 0; } - -/* Sampling control helper functions */ - -#include <linux/time.h> - -static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi, - unsigned long freq) -{ - return (USEC_PER_SEC / freq) * qsi->cpu_speed; -} - -static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, - unsigned long rate) -{ - return USEC_PER_SEC * qsi->cpu_speed / rate; -} - -/* Return TOD timestamp contained in an trailer entry */ -static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te) -{ - /* TOD in STCKE format */ - if (te->header.t) - return *((unsigned long long *) &te->timestamp[1]); - - /* TOD in STCK format */ - return *((unsigned long long *) &te->timestamp[0]); -} - -/* Return pointer to trailer entry of an sample data block */ -static inline unsigned long *trailer_entry_ptr(unsigned long v) -{ - void *ret; - - ret = (void *) v; - ret += PAGE_SIZE; - ret -= sizeof(struct hws_trailer_entry); - - return (unsigned long *) ret; -} - -/* Return true if the entry in the sample data block table (sdbt) - * is a link to the next sdbt */ -static inline int is_link_entry(unsigned long *s) -{ - return *s & 0x1ul ? 1 : 0; -} - -/* Return pointer to the linked sdbt */ -static inline unsigned long *get_next_sdbt(unsigned long *s) -{ - return (unsigned long *) (*s & ~0x1ul); -} #endif /* _ASM_S390_CPU_MF_H */ diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 1d389847b588..30bb3ec4e5fc 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -11,30 +11,11 @@ #include <linux/types.h> #include <asm/timex.h> -#define CPUTIME_PER_USEC 4096ULL -#define CPUTIME_PER_SEC (CPUTIME_PER_USEC * USEC_PER_SEC) - -/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ - -#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) - -/* - * Convert cputime to microseconds. - */ -static inline u64 cputime_to_usecs(const u64 cputime) -{ - return cputime >> 12; -} - /* * Convert cputime to nanoseconds. 
*/ #define cputime_to_nsecs(cputime) tod_to_ns(cputime) -u64 arch_cpu_idle_time(int cpu); - -#define arch_idle_time(cpu) arch_cpu_idle_time(cpu) - void account_idle_time_irq(void); #endif /* _S390_CPUTIME_H */ diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 56e99c286d12..674a939f16ee 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -12,6 +12,7 @@ #include <linux/if_ether.h> #include <linux/percpu.h> #include <asm/asm-extable.h> +#include <asm/cio.h> enum diag_stat_enum { DIAG_STAT_X008, @@ -20,6 +21,7 @@ enum diag_stat_enum { DIAG_STAT_X014, DIAG_STAT_X044, DIAG_STAT_X064, + DIAG_STAT_X08C, DIAG_STAT_X09C, DIAG_STAT_X0DC, DIAG_STAT_X204, @@ -79,10 +81,20 @@ struct diag210 { u8 vrdccrty; /* real device type (output) */ u8 vrdccrmd; /* real device model (output) */ u8 vrdccrft; /* real device feature (output) */ -} __attribute__((packed, aligned(4))); +} __packed __aligned(4); extern int diag210(struct diag210 *addr); +struct diag8c { + u8 flags; + u8 num_partitions; + u16 width; + u16 height; + u8 data[0]; +} __packed __aligned(4); + +extern int diag8c(struct diag8c *out, struct ccw_dev_id *devno); + /* bit is set in flags, when physical cpu info is included in diag 204 data */ #define DIAG204_LPAR_PHYS_FLG 0x80 #define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ @@ -318,6 +330,7 @@ struct diag_ops { int (*diag210)(struct diag210 *addr); int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode); int (*diag14)(unsigned long rx, unsigned long ry1, unsigned long subcode); + int (*diag8c)(struct diag8c *addr, struct ccw_dev_id *devno, size_t len); void (*diag0c)(struct hypfs_diag0c_entry *entry); void (*diag308_reset)(void); }; @@ -330,5 +343,6 @@ int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode); int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode); void _diag0c_amode31(struct hypfs_diag0c_entry *entry); void _diag308_reset_amode31(void); +int _diag8c_amode31(struct diag8c *addr, struct ccw_dev_id *devno, size_t len); #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/fpu/internal.h b/arch/s390/include/asm/fpu/internal.h index 4a71dbbf76fb..bbdadb1c9efc 100644 --- a/arch/s390/include/asm/fpu/internal.h +++ b/arch/s390/include/asm/fpu/internal.h @@ -27,7 +27,7 @@ static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs) int i; for (i = 0; i < __NUM_FPRS; i++) - fprs[i] = *(freg_t *)(vxrs + i); + fprs[i].ui = vxrs[i].high; } static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs) @@ -35,7 +35,7 @@ static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs) int i; for (i = 0; i < __NUM_FPRS; i++) - *(freg_t *)(vxrs + i) = fprs[i]; + vxrs[i].high = fprs[i].ui; } static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu) diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h index 40eae2c08d61..59fcc3c72edf 100644 --- a/arch/s390/include/asm/idals.h +++ b/arch/s390/include/asm/idals.h @@ -23,6 +23,9 @@ #define IDA_SIZE_LOG 12 /* 11 for 2k , 12 for 4k */ #define IDA_BLOCK_SIZE (1L<<IDA_SIZE_LOG) +#define IDA_2K_SIZE_LOG 11 +#define IDA_2K_BLOCK_SIZE (1L << IDA_2K_SIZE_LOG) + /* * Test if an address/length pair needs an idal list. */ @@ -43,6 +46,15 @@ static inline unsigned int idal_nr_words(void *vaddr, unsigned int length) } /* + * Return the number of 2K IDA words needed for an address/length pair. 
+ */ +static inline unsigned int idal_2k_nr_words(void *vaddr, unsigned int length) +{ + return ((__pa(vaddr) & (IDA_2K_BLOCK_SIZE - 1)) + length + + (IDA_2K_BLOCK_SIZE - 1)) >> IDA_2K_SIZE_LOG; +} + +/* * Create the list of idal words for an address/length pair. */ static inline unsigned long *idal_create_words(unsigned long *idaws, diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index 5cea629c548e..09f763b9eb40 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -10,16 +10,12 @@ #include <linux/types.h> #include <linux/device.h> -#include <linux/seqlock.h> struct s390_idle_data { - seqcount_t seqcount; unsigned long idle_count; unsigned long idle_time; unsigned long clock_idle_enter; - unsigned long clock_idle_exit; unsigned long timer_idle_enter; - unsigned long timer_idle_exit; unsigned long mt_cycles_enter[8]; }; @@ -27,6 +23,5 @@ extern struct device_attribute dev_attr_idle_count; extern struct device_attribute dev_attr_idle_time_us; void psw_idle(struct s390_idle_data *data, unsigned long psw_mask); -void psw_idle_exit(void); #endif /* _S390_IDLE_H */ diff --git a/arch/s390/include/asm/kasan.h b/arch/s390/include/asm/kasan.h index 2768d5db181f..e5cfc81d5b61 100644 --- a/arch/s390/include/asm/kasan.h +++ b/arch/s390/include/asm/kasan.h @@ -14,17 +14,15 @@ #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) extern void kasan_early_init(void); -extern void kasan_copy_shadow_mapping(void); -extern void kasan_free_early_identity(void); /* * Estimate kasan memory requirements, which it will reserve * at the very end of available physical memory. To estimate * that, we take into account that kasan would require * 1/8 of available physical memory (for shadow memory) + - * creating page tables for the whole memory + shadow memory - * region (1 + 1/8). To keep page tables estimates simple take - * the double of combined ptes size. + * creating page tables for the shadow memory region. + * To keep page tables estimates simple take the double of + * combined ptes size. * * physmem parameter has to be already adjusted if not entire physical memory * would be used (e.g. due to effect of "mem=" option). 
@@ -36,15 +34,13 @@ static inline unsigned long kasan_estimate_memory_needs(unsigned long physmem) /* for shadow memory */ kasan_needs = round_up(physmem / 8, PAGE_SIZE); /* for paging structures */ - pages = DIV_ROUND_UP(physmem + kasan_needs, PAGE_SIZE); + pages = DIV_ROUND_UP(kasan_needs, PAGE_SIZE); kasan_needs += DIV_ROUND_UP(pages, _PAGE_ENTRIES) * _PAGE_TABLE_SIZE * 2; return kasan_needs; } #else static inline void kasan_early_init(void) { } -static inline void kasan_copy_shadow_mapping(void) { } -static inline void kasan_free_early_identity(void) { } static inline unsigned long kasan_estimate_memory_needs(unsigned long physmem) { return 0; } #endif diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index 598095f4b924..83f732ca3af4 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -70,8 +70,6 @@ struct kprobe_ctlblk { }; void arch_remove_kprobe(struct kprobe *p); -void __kretprobe_trampoline(void); -void trampoline_probe_handler(struct pt_regs *regs); int kprobe_fault_handler(struct pt_regs *regs, int trapnr); int kprobe_exceptions_notify(struct notifier_block *self, diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h index c7fa838cf6b9..cfec3141fdba 100644 --- a/arch/s390/include/asm/maccess.h +++ b/arch/s390/include/asm/maccess.h @@ -7,7 +7,7 @@ struct iov_iter; extern unsigned long __memcpy_real_area; -void memcpy_real_init(void); +extern pte_t *memcpy_real_ptep; size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count); int memcpy_real(void *dest, unsigned long src, size_t count); #ifdef CONFIG_CRASH_DUMP diff --git a/arch/s390/include/asm/mem_detect.h b/arch/s390/include/asm/mem_detect.h index a7c922a69050..f9e7354036d2 100644 --- a/arch/s390/include/asm/mem_detect.h +++ b/arch/s390/include/asm/mem_detect.h @@ -30,6 +30,7 @@ struct mem_detect_block { struct mem_detect_info { u32 count; u8 info_source; + unsigned long usable; struct mem_detect_block entries[MEM_INLINED_ENTRIES]; struct mem_detect_block *entries_extended; }; @@ -38,7 +39,7 @@ extern struct mem_detect_info mem_detect; void add_mem_detect_block(u64 start, u64 end); static inline int __get_mem_detect_block(u32 n, unsigned long *start, - unsigned long *end) + unsigned long *end, bool respect_usable_limit) { if (n >= mem_detect.count) { *start = 0; @@ -53,21 +54,41 @@ static inline int __get_mem_detect_block(u32 n, unsigned long *start, *start = (unsigned long)mem_detect.entries_extended[n - MEM_INLINED_ENTRIES].start; *end = (unsigned long)mem_detect.entries_extended[n - MEM_INLINED_ENTRIES].end; } + + if (respect_usable_limit && mem_detect.usable) { + if (*start >= mem_detect.usable) + return -1; + if (*end > mem_detect.usable) + *end = mem_detect.usable; + } return 0; } /** - * for_each_mem_detect_block - early online memory range iterator + * for_each_mem_detect_usable_block - early online memory range iterator * @i: an integer used as loop variable * @p_start: ptr to unsigned long for start address of the range * @p_end: ptr to unsigned long for end address of the range * - * Walks over detected online memory ranges. + * Walks over detected online memory ranges below usable limit. 
*/ -#define for_each_mem_detect_block(i, p_start, p_end) \ - for (i = 0, __get_mem_detect_block(i, p_start, p_end); \ - i < mem_detect.count; \ - i++, __get_mem_detect_block(i, p_start, p_end)) +#define for_each_mem_detect_usable_block(i, p_start, p_end) \ + for (i = 0; !__get_mem_detect_block(i, p_start, p_end, true); i++) + +/* Walks over all detected online memory ranges disregarding usable limit. */ +#define for_each_mem_detect_block(i, p_start, p_end) \ + for (i = 0; !__get_mem_detect_block(i, p_start, p_end, false); i++) + +static inline unsigned long get_mem_detect_usable_total(void) +{ + unsigned long start, end, total = 0; + int i; + + for_each_mem_detect_usable_block(i, &start, &end) + total += end - start; + + return total; +} static inline void get_mem_detect_reserved(unsigned long *start, unsigned long *size) @@ -84,8 +105,10 @@ static inline unsigned long get_mem_detect_end(void) unsigned long start; unsigned long end; + if (mem_detect.usable) + return mem_detect.usable; if (mem_detect.count) { - __get_mem_detect_block(mem_detect.count - 1, &start, &end); + __get_mem_detect_block(mem_detect.count - 1, &start, &end, false); return end; } return 0; diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 61dea67bb9c7..8a2a3b5d1e29 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -73,9 +73,8 @@ static inline void copy_page(void *to, void *from) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) /* * These are used to make use of C type-checking.. diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b26cbf1c533c..2c70b4d1263d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -23,6 +23,7 @@ #include <asm/uv.h> extern pgd_t swapper_pg_dir[]; +extern pgd_t invalid_pg_dir[]; extern void paging_init(void); extern unsigned long s390_invalid_asce; @@ -181,6 +182,8 @@ static inline int is_module_addr(void *addr) #define _PAGE_SOFT_DIRTY 0x000 #endif +#define _PAGE_SW_BITS 0xffUL /* All SW bits */ + #define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE /* SW pte exclusive swap bit */ /* Set of bits not changed in pte_modify */ @@ -188,6 +191,12 @@ static inline int is_module_addr(void *addr) _PAGE_YOUNG | _PAGE_SOFT_DIRTY) /* + * Mask of bits that must not be changed with RDP. Allow only _PAGE_PROTECT + * HW bit and all SW bits. + */ +#define _PAGE_RDP_MASK ~(_PAGE_PROTECT | _PAGE_SW_BITS) + +/* * handle_pte_fault uses pte_present and pte_none to find out the pte type * WITHOUT holding the page table lock. The _PAGE_PRESENT bit is used to * distinguish present from not-present ptes. 
It is changed only with the page @@ -477,6 +486,12 @@ static inline int is_module_addr(void *addr) _REGION3_ENTRY_YOUNG | \ _REGION_ENTRY_PROTECT | \ _REGION_ENTRY_NOEXEC) +#define REGION3_KERNEL_EXEC __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_LARGE | \ + _REGION3_ENTRY_READ | \ + _REGION3_ENTRY_WRITE | \ + _REGION3_ENTRY_YOUNG | \ + _REGION3_ENTRY_DIRTY) static inline bool mm_p4d_folded(struct mm_struct *mm) { @@ -812,7 +827,6 @@ static inline int pmd_protnone(pmd_t pmd) } #endif -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; @@ -1045,6 +1059,19 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_NODAT 0x400 #define IPTE_GUEST_ASCE 0x800 +static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, + unsigned long opt, unsigned long asce, + int local) +{ + unsigned long pto; + + pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1); + asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]" + : "+m" (*ptep) + : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt), + [asce] "a" (asce), [m4] "i" (local)); +} + static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, unsigned long opt, unsigned long asce, int local) @@ -1195,6 +1222,42 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte)); } +/* + * Check if PTEs only differ in _PAGE_PROTECT HW bit, but also allow SW PTE + * bits in the comparison. Those might change e.g. because of dirty and young + * tracking. + */ +static inline int pte_allow_rdp(pte_t old, pte_t new) +{ + /* + * Only allow changes from RO to RW + */ + if (!(pte_val(old) & _PAGE_PROTECT) || pte_val(new) & _PAGE_PROTECT) + return 0; + + return (pte_val(old) & _PAGE_RDP_MASK) == (pte_val(new) & _PAGE_RDP_MASK); +} + +static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, + unsigned long address) +{ + /* + * RDP might not have propagated the PTE protection reset to all CPUs, + * so there could be spurious TLB protection faults. + * NOTE: This will also be called when a racing pagetable update on + * another thread already installed the correct PTE. Both cases cannot + * really be distinguished. + * Therefore, only do the local TLB flush when RDP can be used, to avoid + * unnecessary overhead. 
+ */ + if (MACHINE_HAS_RDP) + asm volatile("ptlb" : : : "memory"); +} +#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault + +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new); + #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS static inline int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -1202,7 +1265,10 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, { if (pte_same(*ptep, entry)) return 0; - ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); + if (MACHINE_HAS_RDP && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry)) + ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry); + else + ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); return 1; } diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index c907f747d2a0..e98d9650764b 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -44,29 +44,46 @@ typedef long (*sys_call_ptr_t)(struct pt_regs *regs); -static inline void set_cpu_flag(int flag) +static __always_inline void set_cpu_flag(int flag) { S390_lowcore.cpu_flags |= (1UL << flag); } -static inline void clear_cpu_flag(int flag) +static __always_inline void clear_cpu_flag(int flag) { S390_lowcore.cpu_flags &= ~(1UL << flag); } -static inline int test_cpu_flag(int flag) +static __always_inline bool test_cpu_flag(int flag) { - return !!(S390_lowcore.cpu_flags & (1UL << flag)); + return S390_lowcore.cpu_flags & (1UL << flag); +} + +static __always_inline bool test_and_set_cpu_flag(int flag) +{ + if (test_cpu_flag(flag)) + return true; + set_cpu_flag(flag); + return false; +} + +static __always_inline bool test_and_clear_cpu_flag(int flag) +{ + if (!test_cpu_flag(flag)) + return false; + clear_cpu_flag(flag); + return true; } /* * Test CIF flag of another CPU. The caller needs to ensure that * CPU hotplug can not happen, e.g. by disabling preemption. 
*/ -static inline int test_cpu_flag_of(int flag, int cpu) +static __always_inline bool test_cpu_flag_of(int flag, int cpu) { struct lowcore *lc = lowcore_ptr[cpu]; - return !!(lc->cpu_flags & (1UL << flag)); + + return lc->cpu_flags & (1UL << flag); } #define arch_needs_cpu() test_cpu_flag(CIF_NOHZ_DELAY) diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 8bae33ab320a..bfb8c3cb8aee 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -26,7 +26,7 @@ #ifndef __ASSEMBLY__ #define PSW_KERNEL_BITS (PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_HOME | \ - PSW_MASK_EA | PSW_MASK_BA) + PSW_MASK_EA | PSW_MASK_BA | PSW_MASK_DAT) #define PSW_USER_BITS (PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | \ PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \ PSW_MASK_PSTATE | PSW_ASC_PRIMARY) diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 77e6506898f5..3a1f8825bc7d 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -34,6 +34,7 @@ #define MACHINE_FLAG_GS BIT(16) #define MACHINE_FLAG_SCC BIT(17) #define MACHINE_FLAG_PCI_MIO BIT(18) +#define MACHINE_FLAG_RDP BIT(19) #define LPP_MAGIC BIT(31) #define LPP_PID_MASK _AC(0xffffffff, UL) @@ -73,6 +74,10 @@ extern unsigned int zlib_dfltcc_support; extern int noexec_disabled; extern unsigned long ident_map_size; +extern unsigned long pgalloc_pos; +extern unsigned long pgalloc_end; +extern unsigned long pgalloc_low; +extern unsigned long __amode31_base; /* The Write Back bit position in the physaddr is given by the SLPC PCI */ extern unsigned long mio_wb_bit_mask; @@ -95,6 +100,7 @@ extern unsigned long mio_wb_bit_mask; #define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) #define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC) #define MACHINE_HAS_PCI_MIO (S390_lowcore.machine_flags & MACHINE_FLAG_PCI_MIO) +#define MACHINE_HAS_RDP (S390_lowcore.machine_flags & MACHINE_FLAG_RDP) /* * Console mode. Override with conmode= diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index fde7e6b1df48..9286430fe729 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -7,36 +7,13 @@ #ifndef _ASM_S390_SYSCALL_WRAPPER_H #define _ASM_S390_SYSCALL_WRAPPER_H -#define __SC_TYPE(t, a) t - -#define SYSCALL_PT_ARG6(regs, m, t1, t2, t3, t4, t5, t6)\ - SYSCALL_PT_ARG5(regs, m, t1, t2, t3, t4, t5), \ - m(t6, (regs->gprs[7])) - -#define SYSCALL_PT_ARG5(regs, m, t1, t2, t3, t4, t5) \ - SYSCALL_PT_ARG4(regs, m, t1, t2, t3, t4), \ - m(t5, (regs->gprs[6])) - -#define SYSCALL_PT_ARG4(regs, m, t1, t2, t3, t4) \ - SYSCALL_PT_ARG3(regs, m, t1, t2, t3), \ - m(t4, (regs->gprs[5])) - -#define SYSCALL_PT_ARG3(regs, m, t1, t2, t3) \ - SYSCALL_PT_ARG2(regs, m, t1, t2), \ - m(t3, (regs->gprs[4])) - -#define SYSCALL_PT_ARG2(regs, m, t1, t2) \ - SYSCALL_PT_ARG1(regs, m, t1), \ - m(t2, (regs->gprs[3])) - -#define SYSCALL_PT_ARG1(regs, m, t1) \ - m(t1, (regs->orig_gpr2)) - -#define SYSCALL_PT_ARGS(x, ...) SYSCALL_PT_ARG##x(__VA_ARGS__) +/* Mapping of registers to parameters for syscalls */ +#define SC_S390_REGS_TO_ARGS(x, ...) 
\ + __MAP(x, __SC_ARGS \ + ,, regs->orig_gpr2,, regs->gprs[3],, regs->gprs[4] \ + ,, regs->gprs[5],, regs->gprs[6],, regs->gprs[7]) #ifdef CONFIG_COMPAT -#define __SC_COMPAT_TYPE(t, a) \ - __typeof(__builtin_choose_expr(sizeof(t) > 4, 0L, (t)0)) a #define __SC_COMPAT_CAST(t, a) \ ({ \ @@ -56,34 +33,31 @@ (t)__ReS; \ }) -#define __S390_SYS_STUBx(x, name, ...) \ - long __s390_sys##name(struct pt_regs *regs); \ - ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ - long __s390_sys##name(struct pt_regs *regs) \ - { \ - long ret = __do_sys##name(SYSCALL_PT_ARGS(x, regs, \ - __SC_COMPAT_CAST, __MAP(x, __SC_TYPE, __VA_ARGS__))); \ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ - } - /* * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias * named __s390x_sys_*() */ #define COMPAT_SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ long __s390_compat_sys_##sname(void); \ ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \ long __s390_compat_sys_##sname(void) #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ + long __s390_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__s390_sys_##sname, ERRNO); \ long __s390x_sys_##sname(void); \ ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ + static inline long __do_sys_##sname(void); \ long __s390_sys_##sname(void) \ - __attribute__((alias(__stringify(__s390x_sys_##sname)))); \ - long __s390x_sys_##sname(void) + { \ + return __do_sys_##sname(); \ + } \ + long __s390x_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + static inline long __do_sys_##sname(void) #define COND_SYSCALL(name) \ cond_syscall(__s390x_sys_##name); \ @@ -94,24 +68,20 @@ SYSCALL_ALIAS(__s390_sys_##name, sys_ni_posix_timers) #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ - __diag_push(); \ - __diag_ignore(GCC, 8, "-Wattribute-alias", \ - "Type aliasing is used to sanitize syscall arguments"); \ long __s390_compat_sys##name(struct pt_regs *regs); \ - long __s390_compat_sys##name(struct pt_regs *regs) \ - __attribute__((alias(__stringify(__se_compat_sys##name)))); \ ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ - static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - long __se_compat_sys##name(struct pt_regs *regs); \ - long __se_compat_sys##name(struct pt_regs *regs) \ + static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ + long __s390_compat_sys##name(struct pt_regs *regs) \ { \ - long ret = __do_compat_sys##name(SYSCALL_PT_ARGS(x, regs, __SC_DELOUSE, \ - __MAP(x, __SC_TYPE, __VA_ARGS__))); \ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ + return __se_compat_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ } \ - __diag_pop(); \ - static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + static inline long __se_compat_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_compat_sys##name(__MAP(x, __SC_DELOUSE, __VA_ARGS__)); \ + } \ + static inline long __do_compat_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) /* * As some compat syscalls may not be implemented, we need to expand @@ -124,42 +94,58 @@ #define COMPAT_SYS_NI(name) \ SYSCALL_ALIAS(__s390_compat_sys_##name, sys_ni_posix_timers) -#else /* CONFIG_COMPAT */ +#define __S390_SYS_STUBx(x, name, ...) 
\ + long __s390_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ + static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + long __s390_sys##name(struct pt_regs *regs) \ + { \ + return ___se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long ___se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_sys##name(__MAP(x, __SC_COMPAT_CAST, __VA_ARGS__)); \ + } -#define __S390_SYS_STUBx(x, fullname, name, ...) +#else /* CONFIG_COMPAT */ #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ long __s390x_sys_##sname(void); \ ALLOW_ERROR_INJECTION(__s390x_sys_##sname, ERRNO); \ - long __s390x_sys_##sname(void) + static inline long __do_sys_##sname(void); \ + long __s390x_sys_##sname(void) \ + { \ + return __do_sys_##sname(); \ + } \ + static inline long __do_sys_##sname(void) #define COND_SYSCALL(name) \ cond_syscall(__s390x_sys_##name) #define SYS_NI(name) \ - SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); + SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers) + +#define __S390_SYS_STUBx(x, fullname, name, ...) #endif /* CONFIG_COMPAT */ -#define __SYSCALL_DEFINEx(x, name, ...) \ - __diag_push(); \ - __diag_ignore(GCC, 8, "-Wattribute-alias", \ - "Type aliasing is used to sanitize syscall arguments"); \ - long __s390x_sys##name(struct pt_regs *regs) \ - __attribute__((alias(__stringify(__se_sys##name)))); \ - ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ - static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - long __se_sys##name(struct pt_regs *regs); \ - __S390_SYS_STUBx(x, name, __VA_ARGS__) \ - long __se_sys##name(struct pt_regs *regs) \ - { \ - long ret = __do_sys##name(SYSCALL_PT_ARGS(x, regs, \ - __SC_CAST, __MAP(x, __SC_TYPE, __VA_ARGS__))); \ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ - } \ - __diag_pop(); \ - static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +#define __SYSCALL_DEFINEx(x, name, ...) 
\ + long __s390x_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ + static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \ + static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \ + __S390_SYS_STUBx(x, name, __VA_ARGS__); \ + long __s390x_sys##name(struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_S390_REGS_TO_ARGS(x, __VA_ARGS__)); \ + } \ + static inline long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \ + { \ + __MAP(x, __SC_TEST, __VA_ARGS__); \ + return __do_sys##name(__MAP(x, __SC_CAST, __VA_ARGS__)); \ + } \ + static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)) #endif /* _ASM_S390_SYSCALL_WRAPPER_H */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index f7038b800cc3..8a8c64a678c4 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -390,4 +390,212 @@ do { \ goto err_label; \ } while (0) +void __cmpxchg_user_key_called_with_bad_pointer(void); + +#define CMPXCHG_USER_KEY_MAX_LOOPS 128 + +static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval, + __uint128_t old, __uint128_t new, + unsigned long key, int size) +{ + int rc = 0; + + switch (size) { + case 1: { + unsigned int prev, shift, mask, _old, _new; + unsigned long count; + + shift = (3 ^ (address & 3)) << 3; + address ^= address & 3; + _old = ((unsigned int)old & 0xff) << shift; + _new = ((unsigned int)new & 0xff) << shift; + mask = ~(0xff << shift); + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] "+Q" (*(int *)address), + [tmp] "+&d" (_old), + [new] "+&d" (_new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + *(unsigned char *)uval = prev >> shift; + if (!count) + rc = -EAGAIN; + return rc; + } + case 2: { + unsigned int prev, shift, mask, _old, _new; + unsigned long count; + + shift = (2 ^ (address & 2)) << 3; + address ^= address & 2; + _old = ((unsigned int)old & 0xffff) << shift; + _new = ((unsigned int)new & 0xffff) << shift; + mask = ~(0xffff << shift); + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + " llill %[count],%[max_loops]\n" + "0: l %[prev],%[address]\n" + "1: nr %[prev],%[mask]\n" + " xilf %[mask],0xffffffff\n" + " or %[new],%[prev]\n" + " or %[prev],%[tmp]\n" + "2: lr %[tmp],%[prev]\n" + "3: cs %[prev],%[new],%[address]\n" + "4: jnl 5f\n" + " xr %[tmp],%[prev]\n" + " xr %[new],%[tmp]\n" + " nr %[tmp],%[mask]\n" + " jnz 5f\n" + " brct %[count],2b\n" + "5: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(3b, 5b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(4b, 5b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "=&d" (prev), + [address] 
"+Q" (*(int *)address), + [tmp] "+&d" (_old), + [new] "+&d" (_new), + [mask] "+&d" (mask), + [count] "=a" (count) + : [key] "%[count]" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY), + [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) + : "memory", "cc"); + *(unsigned short *)uval = prev >> shift; + if (!count) + rc = -EAGAIN; + return rc; + } + case 4: { + unsigned int prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: cs %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+Q" (*(int *)address) + : [new] "d" ((unsigned int)new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(unsigned int *)uval = prev; + return rc; + } + case 8: { + unsigned long prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: csg %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(long *)address) + : [new] "d" ((unsigned long)new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(unsigned long *)uval = prev; + return rc; + } + case 16: { + __uint128_t prev = old; + + asm volatile( + " spka 0(%[key])\n" + " sacf 256\n" + "0: cdsg %[prev],%[new],%[address]\n" + "1: sacf 768\n" + " spka %[default_key]\n" + EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev]) + EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev]) + : [rc] "+&d" (rc), + [prev] "+&d" (prev), + [address] "+QS" (*(__int128_t *)address) + : [new] "d" (new), + [key] "a" (key << 4), + [default_key] "J" (PAGE_DEFAULT_KEY) + : "memory", "cc"); + *(__uint128_t *)uval = prev; + return rc; + } + } + __cmpxchg_user_key_called_with_bad_pointer(); + return rc; +} + +/** + * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys + * @ptr: User space address of value to compare to @old and exchange with + * @new. Must be aligned to sizeof(*@ptr). + * @uval: Address where the old value of *@ptr is written to. + * @old: Old value. Compared to the content pointed to by @ptr in order to + * determine if the exchange occurs. The old value read from *@ptr is + * written to *@uval. + * @new: New value to place at *@ptr. + * @key: Access key to use for checking storage key protection. + * + * Perform a cmpxchg on a user space target, honoring storage key protection. + * @key alone determines how key checking is performed, neither + * storage-protection-override nor fetch-protection-override apply. + * The caller must compare *@uval and @old to determine if values have been + * exchanged. In case of an exception *@uval is set to zero. 
+ * + * Return: 0: cmpxchg executed + * -EFAULT: an exception happened when trying to access *@ptr + * -EAGAIN: maxed out number of retries (byte and short only) + */ +#define cmpxchg_user_key(ptr, uval, old, new, key) \ +({ \ + __typeof__(ptr) __ptr = (ptr); \ + __typeof__(uval) __uval = (uval); \ + \ + BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \ + might_fault(); \ + __chk_user_ptr(__ptr); \ + __cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \ + (old), (new), (key), sizeof(*(__ptr))); \ +}) + #endif /* __S390_UACCESS_H */ diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h index 02462e7100c1..b8ecf04e3468 100644 --- a/arch/s390/include/asm/unwind.h +++ b/arch/s390/include/asm/unwind.h @@ -4,7 +4,7 @@ #include <linux/sched.h> #include <linux/ftrace.h> -#include <linux/kprobes.h> +#include <linux/rethook.h> #include <linux/llist.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> @@ -43,13 +43,15 @@ struct unwind_state { bool error; }; -/* Recover the return address modified by kretprobe and ftrace_graph. */ +/* Recover the return address modified by rethook and ftrace_graph. */ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long ip) { ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *)state->sp); - if (is_kretprobe_trampoline(ip)) - ip = kretprobe_find_ret_addr(state->task, (void *)state->sp, &state->kr_cur); +#ifdef CONFIG_RETHOOK + if (is_rethook_trampoline(ip)) + ip = rethook_find_ret_addr(state->task, state->sp, &state->kr_cur); +#endif return ip; } diff --git a/arch/s390/include/uapi/asm/fs3270.h b/arch/s390/include/uapi/asm/fs3270.h new file mode 100644 index 000000000000..c4bc1108af6a --- /dev/null +++ b/arch/s390/include/uapi/asm/fs3270.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_UAPI_FS3270_H +#define __ASM_S390_UAPI_FS3270_H + +#include <linux/types.h> +#include <asm/ioctl.h> + +/* ioctls for fullscreen 3270 */ +#define TUBICMD _IO('3', 3) /* set ccw command for fs reads. */ +#define TUBOCMD _IO('3', 4) /* set ccw command for fs writes. */ +#define TUBGETI _IO('3', 7) /* get ccw command for fs reads. */ +#define TUBGETO _IO('3', 8) /* get ccw command for fs writes. 
*/ +#define TUBGETMOD _IO('3', 13) /* get characteristics like model, cols, rows */ + +/* For TUBGETMOD */ +struct raw3270_iocb { + __u16 model; + __u16 line_cnt; + __u16 col_cnt; + __u16 pf_cnt; + __u16 re_cnt; + __u16 map; +}; + +#endif /* __ASM_S390_UAPI_FS3270_H */ diff --git a/arch/s390/include/uapi/asm/raw3270.h b/arch/s390/include/uapi/asm/raw3270.h new file mode 100644 index 000000000000..6676f102bd50 --- /dev/null +++ b/arch/s390/include/uapi/asm/raw3270.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_UAPI_RAW3270_H +#define __ASM_S390_UAPI_RAW3270_H + +/* Local Channel Commands */ +#define TC_WRITE 0x01 /* Write */ +#define TC_RDBUF 0x02 /* Read Buffer */ +#define TC_EWRITE 0x05 /* Erase write */ +#define TC_READMOD 0x06 /* Read modified */ +#define TC_EWRITEA 0x0d /* Erase write alternate */ +#define TC_WRITESF 0x11 /* Write structured field */ + +/* Buffer Control Orders */ +#define TO_GE 0x08 /* Graphics Escape */ +#define TO_SF 0x1d /* Start field */ +#define TO_SBA 0x11 /* Set buffer address */ +#define TO_IC 0x13 /* Insert cursor */ +#define TO_PT 0x05 /* Program tab */ +#define TO_RA 0x3c /* Repeat to address */ +#define TO_SFE 0x29 /* Start field extended */ +#define TO_EUA 0x12 /* Erase unprotected to address */ +#define TO_MF 0x2c /* Modify field */ +#define TO_SA 0x28 /* Set attribute */ + +/* Field Attribute Bytes */ +#define TF_INPUT 0x40 /* Visible input */ +#define TF_INPUTN 0x4c /* Invisible input */ +#define TF_INMDT 0xc1 /* Visible, Set-MDT */ +#define TF_LOG 0x60 + +/* Character Attribute Bytes */ +#define TAT_RESET 0x00 +#define TAT_FIELD 0xc0 +#define TAT_EXTHI 0x41 +#define TAT_FGCOLOR 0x42 +#define TAT_CHARS 0x43 +#define TAT_BGCOLOR 0x45 +#define TAT_TRANS 0x46 + +/* Extended-Highlighting Bytes */ +#define TAX_RESET 0x00 +#define TAX_BLINK 0xf1 +#define TAX_REVER 0xf2 +#define TAX_UNDER 0xf4 + +/* Reset value */ +#define TAR_RESET 0x00 + +/* Color values */ +#define TAC_RESET 0x00 +#define TAC_BLUE 0xf1 +#define TAC_RED 0xf2 +#define TAC_PINK 0xf3 +#define TAC_GREEN 0xf4 +#define TAC_TURQ 0xf5 +#define TAC_YELLOW 0xf6 +#define TAC_WHITE 0xf7 +#define TAC_DEFAULT 0x00 + +/* Write Control Characters */ +#define TW_NONE 0x40 /* No particular action */ +#define TW_KR 0xc2 /* Keyboard restore */ +#define TW_PLUSALARM 0x04 /* Add this bit for alarm */ + +#define RAW3270_FIRSTMINOR 1 /* First minor number */ +#define RAW3270_MAXDEVS 255 /* Max number of 3270 devices */ + +#define AID_CLEAR 0x6d +#define AID_ENTER 0x7d +#define AID_PF3 0xf3 +#define AID_PF7 0xf7 +#define AID_PF8 0xf8 +#define AID_READ_PARTITION 0x88 + +#endif /* __ASM_S390_UAPI_RAW3270_H */ diff --git a/arch/s390/include/uapi/asm/types.h b/arch/s390/include/uapi/asm/types.h index da034c606314..84457dbb26b4 100644 --- a/arch/s390/include/uapi/asm/types.h +++ b/arch/s390/include/uapi/asm/types.h @@ -12,15 +12,18 @@ #ifndef __ASSEMBLY__ -/* A address type so that arithmetic can be done on it & it can be upgraded to - 64 bit when necessary -*/ -typedef unsigned long addr_t; +typedef unsigned long addr_t; typedef __signed__ long saddr_t; typedef struct { - __u32 u[4]; -} __vector128; + union { + struct { + __u64 high; + __u64 low; + }; + __u32 u[4]; + }; +} __attribute__((packed, aligned(4))) __vector128; #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index d83713f67530..f4785abe1b9f 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -85,7 
+85,8 @@ struct ica_rsa_modexpo_crt { struct CPRBX { __u16 cprb_len; /* CPRB length 220 */ __u8 cprb_ver_id; /* CPRB version id. 0x02 */ - __u8 _pad_000[3]; /* Alignment pad bytes */ + __u8 ctfm; /* Command Type Filtering Mask */ + __u8 pad_000[2]; /* Alignment pad bytes */ __u8 func_id[2]; /* function id 0x5432 */ __u8 cprb_flags[4]; /* Flags */ __u32 req_parml; /* request parameter buffer len */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 5e6a23299790..8983837b3565 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KPROBES) += kprobes_insn_page.o obj-$(CONFIG_KPROBES) += mcount.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o @@ -69,7 +70,7 @@ obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o -obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c index fb92e8ed0525..f9efc54ec4b7 100644 --- a/arch/s390/kernel/abs_lowcore.c +++ b/arch/s390/kernel/abs_lowcore.c @@ -3,12 +3,7 @@ #include <linux/pgtable.h> #include <asm/abs_lowcore.h> -#define ABS_LOWCORE_UNMAPPED 1 -#define ABS_LOWCORE_LAP_ON 2 -#define ABS_LOWCORE_IRQS_ON 4 - unsigned long __bootdata_preserved(__abs_lowcore); -bool __ro_after_init abs_lowcore_mapped; int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc) { @@ -49,47 +44,3 @@ void abs_lowcore_unmap(int cpu) addr += PAGE_SIZE; } } - -struct lowcore *get_abs_lowcore(unsigned long *flags) -{ - unsigned long irq_flags; - union ctlreg0 cr0; - int cpu; - - *flags = 0; - cpu = get_cpu(); - if (abs_lowcore_mapped) { - return ((struct lowcore *)__abs_lowcore) + cpu; - } else { - if (cpu != 0) - panic("Invalid unmapped absolute lowcore access\n"); - local_irq_save(irq_flags); - if (!irqs_disabled_flags(irq_flags)) - *flags |= ABS_LOWCORE_IRQS_ON; - __ctl_store(cr0.val, 0, 0); - if (cr0.lap) { - *flags |= ABS_LOWCORE_LAP_ON; - __ctl_clear_bit(0, 28); - } - *flags |= ABS_LOWCORE_UNMAPPED; - return lowcore_ptr[0]; - } -} - -void put_abs_lowcore(struct lowcore *lc, unsigned long flags) -{ - if (abs_lowcore_mapped) { - if (flags) - panic("Invalid mapped absolute lowcore release\n"); - } else { - if (smp_processor_id() != 0) - panic("Invalid mapped absolute lowcore access\n"); - if (!(flags & ABS_LOWCORE_UNMAPPED)) - panic("Invalid unmapped absolute lowcore release\n"); - if (flags & ABS_LOWCORE_LAP_ON) - __ctl_set_bit(0, 28); - if (flags & ABS_LOWCORE_IRQS_ON) - local_irq_enable(); - } - put_cpu(); -} diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c index 7ee3651d00ab..56254fa06f99 100644 --- a/arch/s390/kernel/cache.c +++ b/arch/s390/kernel/cache.c @@ -46,7 +46,7 @@ struct cache_info { #define CACHE_MAX_LEVEL 8 union cache_topology { struct cache_info ci[CACHE_MAX_LEVEL]; - unsigned long long raw; + unsigned long raw; }; static const char * const cache_type_string[] = { diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index eee1ad3e1b29..cecedd01d4ec 100644 --- a/arch/s390/kernel/compat_signal.c +++ 
b/arch/s390/kernel/compat_signal.c @@ -139,7 +139,7 @@ static int save_sigregs_ext32(struct pt_regs *regs, /* Save vector registers to signal stack */ if (MACHINE_HAS_VX) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + vxrs[i] = current->thread.fpu.vxrs[i].low; if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, @@ -173,7 +173,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + current->thread.fpu.vxrs[i].low = vxrs[i]; } return 0; } diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index c13b1455ec8c..8a617be28bb4 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -110,7 +110,7 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs) /* Copy lower halves of vector registers 0-15 */ for (i = 0; i < 16; i++) - memcpy(&sa->vxrs_low[i], &vxrs[i].u[2], 8); + sa->vxrs_low[i] = vxrs[i].low; /* Copy vector registers 16-31 */ memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128)); } diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index a778714e4d8b..82079f2d8583 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -35,6 +35,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X014] = { .code = 0x014, .name = "Spool File Services" }, [DIAG_STAT_X044] = { .code = 0x044, .name = "Voluntary Timeslice End" }, [DIAG_STAT_X064] = { .code = 0x064, .name = "NSS Manipulation" }, + [DIAG_STAT_X08C] = { .code = 0x08c, .name = "Access 3270 Display Device Information" }, [DIAG_STAT_X09C] = { .code = 0x09c, .name = "Relinquish Timeslice" }, [DIAG_STAT_X0DC] = { .code = 0x0dc, .name = "Appldata Control" }, [DIAG_STAT_X204] = { .code = 0x204, .name = "Logical-CPU Utilization" }, @@ -57,12 +58,16 @@ struct diag_ops __amode31_ref diag_amode31_ops = { .diag26c = _diag26c_amode31, .diag14 = _diag14_amode31, .diag0c = _diag0c_amode31, + .diag8c = _diag8c_amode31, .diag308_reset = _diag308_reset_amode31 }; static struct diag210 _diag210_tmp_amode31 __section(".amode31.data"); struct diag210 __amode31_ref *__diag210_tmp_amode31 = &_diag210_tmp_amode31; + +static struct diag8c _diag8c_tmp_amode31 __section(".amode31.data"); +static struct diag8c __amode31_ref *__diag8c_tmp_amode31 = &_diag8c_tmp_amode31; + static int show_diag_stat(struct seq_file *m, void *v) { struct diag_stat *stat; @@ -194,6 +199,27 @@ int diag210(struct diag210 *addr) } EXPORT_SYMBOL(diag210); +/* + * Diagnose 8C: Get information about a 3270 display device + */ +int diag8c(struct diag8c *addr, struct ccw_dev_id *devno) +{ + static DEFINE_SPINLOCK(diag8c_lock); + unsigned long flags; + int ccode; + + spin_lock_irqsave(&diag8c_lock, flags); + + diag_stat_inc(DIAG_STAT_X08C); + ccode = diag_amode31_ops.diag8c(__diag8c_tmp_amode31, devno, sizeof(*addr)); + + *addr = *__diag8c_tmp_amode31; + spin_unlock_irqrestore(&diag8c_lock, flags); + + return ccode; +} +EXPORT_SYMBOL(diag8c); + int diag224(void *ptr) { int rc = -EOPNOTSUPP; diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 6030fdd6997b..59eba19ae0f2 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -18,6 +18,7 @@ #include <linux/uaccess.h> #include <linux/kernel.h> #include <asm/asm-extable.h> +#include <linux/memblock.h> #include <asm/diag.h> #include <asm/ebcdic.h>
#include <asm/ipl.h> @@ -160,9 +161,7 @@ static noinline __init void setup_lowcore_early(void) psw_t psw; psw.addr = (unsigned long)early_pgm_check_handler; - psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA; - if (IS_ENABLED(CONFIG_KASAN)) - psw.mask |= PSW_MASK_DAT; + psw.mask = PSW_KERNEL_BITS; S390_lowcore.program_new_psw = psw; S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; } @@ -227,6 +226,8 @@ static __init void detect_machine_facilities(void) S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO; /* the control bit is set during PCI initialization */ } + if (test_facility(194)) + S390_lowcore.machine_flags |= MACHINE_FLAG_RDP; } static inline void save_vector_registers(void) @@ -288,7 +289,6 @@ static void __init sort_amode31_extable(void) void __init startup_init(void) { - sclp_early_adjust_va(); reset_tod_clock(); check_image_bootable(); time_early_init(); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 0f423e9df095..c8d8c9960936 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -137,19 +137,13 @@ _LPP_OFFSET = __LC_LPP lgr %r14,\reg larl %r13,\start slgr %r14,%r13 -#ifdef CONFIG_AS_IS_LLVM clgfrl %r14,.Lrange_size\@ -#else - clgfi %r14,\end - \start -#endif jhe \outside_label -#ifdef CONFIG_AS_IS_LLVM .section .rodata, "a" .align 4 .Lrange_size\@: .long \end - \start .previous -#endif .endm .macro SIEEXIT diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 995ec7449feb..34674e38826b 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -73,6 +73,5 @@ extern struct exception_table_entry _stop_amode31_ex_table[]; #define __amode31_data __section(".amode31.data") #define __amode31_ref __section(".amode31.refs") extern long _start_amode31_refs[], _end_amode31_refs[]; -extern unsigned long __amode31_base; #endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index d7b8b6ad574d..3b3bf8329e6c 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -25,6 +25,7 @@ ENTRY(startup_continue) larl %r14,init_task stg %r14,__LC_CURRENT larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD-__PT_SIZE + brasl %r14,sclp_early_adjust_va # allow sclp_early_printk #ifdef CONFIG_KASAN brasl %r14,kasan_early_init #endif diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 4bf1ee293f2b..38e267c7bff7 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -12,9 +12,9 @@ #include <linux/notifier.h> #include <linux/init.h> #include <linux/cpu.h> -#include <linux/sched/cputime.h> #include <trace/events/power.h> #include <asm/cpu_mf.h> +#include <asm/cputime.h> #include <asm/nmi.h> #include <asm/smp.h> #include "entry.h" @@ -24,117 +24,61 @@ static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); void account_idle_time_irq(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + unsigned long idle_time; u64 cycles_new[8]; int i; - clear_cpu_flag(CIF_ENABLED_WAIT); if (smp_cpu_mtid) { stcctm(MT_DIAG, smp_cpu_mtid, cycles_new); for (i = 0; i < smp_cpu_mtid; i++) this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); } - idle->clock_idle_exit = S390_lowcore.int_clock; - idle->timer_idle_exit = S390_lowcore.sys_enter_timer; + idle_time = S390_lowcore.int_clock - idle->clock_idle_enter; S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock; - S390_lowcore.last_update_clock = idle->clock_idle_exit; + S390_lowcore.last_update_clock = S390_lowcore.int_clock; 
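+	/* idle_time is a TOD-clock delta; TOD bit 51 equals 1 microsecond, hence the ">> 12" when idle_time_us is reported via sysfs below. */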
S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter; - S390_lowcore.last_update_timer = idle->timer_idle_exit; + S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer; + + /* Account time spent with enabled wait psw loaded as idle time. */ + WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); + WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1); + account_idle_time(cputime_to_nsecs(idle_time)); } -void arch_cpu_idle(void) +void noinstr arch_cpu_idle(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - unsigned long idle_time; unsigned long psw_mask; /* Wait for external, I/O or machine check interrupt. */ - psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; clear_cpu_flag(CIF_NOHZ_DELAY); /* psw_idle() returns with interrupts disabled. */ psw_idle(idle, psw_mask); - - /* Account time spent with enabled wait psw loaded as idle time. */ - raw_write_seqcount_begin(&idle->seqcount); - idle_time = idle->clock_idle_exit - idle->clock_idle_enter; - idle->clock_idle_enter = idle->clock_idle_exit = 0ULL; - idle->idle_time += idle_time; - idle->idle_count++; - account_idle_time(cputime_to_nsecs(idle_time)); - raw_write_seqcount_end(&idle->seqcount); - raw_local_irq_enable(); } static ssize_t show_idle_count(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned long idle_count; - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_count = READ_ONCE(idle->idle_count); - if (READ_ONCE(idle->clock_idle_enter)) - idle_count++; - } while (read_seqcount_retry(&idle->seqcount, seq)); - return sprintf(buf, "%lu\n", idle_count); + + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_count)); } DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL); static ssize_t show_idle_time(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { - unsigned long now, idle_time, idle_enter, idle_exit, in_idle; struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_time = READ_ONCE(idle->idle_time); - idle_enter = READ_ONCE(idle->clock_idle_enter); - idle_exit = READ_ONCE(idle->clock_idle_exit); - } while (read_seqcount_retry(&idle->seqcount, seq)); - in_idle = 0; - now = get_tod_clock(); - if (idle_enter) { - if (idle_exit) { - in_idle = idle_exit - idle_enter; - } else if (now > idle_enter) { - in_idle = now - idle_enter; - } - } - idle_time += in_idle; - return sprintf(buf, "%lu\n", idle_time >> 12); -} -DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); -u64 arch_cpu_idle_time(int cpu) -{ - struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); - unsigned long now, idle_enter, idle_exit, in_idle; - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_enter = READ_ONCE(idle->clock_idle_enter); - idle_exit = READ_ONCE(idle->clock_idle_exit); - } while (read_seqcount_retry(&idle->seqcount, seq)); - in_idle = 0; - now = get_tod_clock(); - if (idle_enter) { - if (idle_exit) { - in_idle = idle_exit - idle_enter; - } else if (now > idle_enter) { - in_idle = now - idle_enter; - } - } - return cputime_to_nsecs(in_idle); + return sysfs_emit(buf, "%lu\n", 
READ_ONCE(idle->idle_time) >> 12); } +DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); void arch_cpu_idle_enter(void) { diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index fbd646dbf440..5f0f5c86963a 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -593,6 +593,7 @@ static struct attribute *ipl_eckd_attrs[] = { &sys_ipl_type_attr.attr, &sys_ipl_eckd_bootprog_attr.attr, &sys_ipl_eckd_br_chr_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_secure_attr.attr, &sys_ipl_has_secure_attr.attr, @@ -888,23 +889,27 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, return len; } -/* FCP wrapper */ -static ssize_t reipl_fcp_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_fcp, page); -} - -static ssize_t reipl_fcp_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_fcp, buf, len); -} - -static struct kobj_attribute sys_reipl_fcp_loadparm_attr = - __ATTR(loadparm, 0644, reipl_fcp_loadparm_show, - reipl_fcp_loadparm_store); +#define DEFINE_GENERIC_LOADPARM(name) \ +static ssize_t reipl_##name##_loadparm_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *page) \ +{ \ + return reipl_generic_loadparm_show(reipl_block_##name, page); \ +} \ +static ssize_t reipl_##name##_loadparm_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return reipl_generic_loadparm_store(reipl_block_##name, buf, len); \ +} \ +static struct kobj_attribute sys_reipl_##name##_loadparm_attr = \ + __ATTR(loadparm, 0644, reipl_##name##_loadparm_show, \ + reipl_##name##_loadparm_store) + +DEFINE_GENERIC_LOADPARM(fcp); +DEFINE_GENERIC_LOADPARM(nvme); +DEFINE_GENERIC_LOADPARM(ccw); +DEFINE_GENERIC_LOADPARM(nss); +DEFINE_GENERIC_LOADPARM(eckd); static ssize_t reipl_fcp_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) @@ -994,24 +999,6 @@ DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n", reipl_block_nvme->nvme.br_lba); -/* nvme wrapper */ -static ssize_t reipl_nvme_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_nvme, page); -} - -static ssize_t reipl_nvme_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_nvme, buf, len); -} - -static struct kobj_attribute sys_reipl_nvme_loadparm_attr = - __ATTR(loadparm, 0644, reipl_nvme_loadparm_show, - reipl_nvme_loadparm_store); - static struct attribute *reipl_nvme_attrs[] = { &sys_reipl_nvme_fid_attr.attr, &sys_reipl_nvme_nsid_attr.attr, @@ -1047,38 +1034,6 @@ static struct kobj_attribute sys_reipl_nvme_clear_attr = /* CCW reipl device attributes */ DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); -/* NSS wrapper */ -static ssize_t reipl_nss_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_nss, page); -} - -static ssize_t reipl_nss_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_nss, buf, len); -} - -/* CCW wrapper */ -static ssize_t reipl_ccw_loadparm_show(struct kobject *kobj, - struct 
kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_ccw, page); -} - -static ssize_t reipl_ccw_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_ccw, buf, len); -} - -static struct kobj_attribute sys_reipl_ccw_loadparm_attr = - __ATTR(loadparm, 0644, reipl_ccw_loadparm_show, - reipl_ccw_loadparm_store); - static ssize_t reipl_ccw_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { @@ -1176,6 +1131,7 @@ static struct attribute *reipl_eckd_attrs[] = { &sys_reipl_eckd_device_attr.attr, &sys_reipl_eckd_bootprog_attr.attr, &sys_reipl_eckd_br_chr_attr.attr, + &sys_reipl_eckd_loadparm_attr.attr, NULL, }; @@ -1194,7 +1150,7 @@ static ssize_t reipl_eckd_clear_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - if (strtobool(buf, &reipl_eckd_clear) < 0) + if (kstrtobool(buf, &reipl_eckd_clear) < 0) return -EINVAL; return len; } @@ -1251,10 +1207,6 @@ static struct kobj_attribute sys_reipl_nss_name_attr = __ATTR(name, 0644, reipl_nss_name_show, reipl_nss_name_store); -static struct kobj_attribute sys_reipl_nss_loadparm_attr = - __ATTR(loadparm, 0644, reipl_nss_loadparm_show, - reipl_nss_loadparm_store); - static struct attribute *reipl_nss_attrs[] = { &sys_reipl_nss_name_attr.attr, &sys_reipl_nss_loadparm_attr.attr, @@ -1986,15 +1938,14 @@ static void dump_reipl_run(struct shutdown_trigger *trigger) { unsigned long ipib = (unsigned long) reipl_block_actual; struct lowcore *abs_lc; - unsigned long flags; unsigned int csum; csum = (__force unsigned int) csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0); - abs_lc = get_abs_lowcore(&flags); + abs_lc = get_abs_lowcore(); abs_lc->ipib = ipib; abs_lc->ipib_checksum = csum; - put_abs_lowcore(abs_lc, flags); + put_abs_lowcore(abs_lc); dump_run(trigger); } diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 45393919fe61..b020ff17d206 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -136,7 +136,7 @@ void noinstr do_io_irq(struct pt_regs *regs) { irqentry_state_t state = irqentry_enter(regs); struct pt_regs *old_regs = set_irq_regs(regs); - int from_idle; + bool from_idle; irq_enter_rcu(); @@ -146,7 +146,7 @@ void noinstr do_io_irq(struct pt_regs *regs) current->thread.last_break = regs->last_break; } - from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); @@ -171,7 +171,7 @@ void noinstr do_ext_irq(struct pt_regs *regs) { irqentry_state_t state = irqentry_enter(regs); struct pt_regs *old_regs = set_irq_regs(regs); - int from_idle; + bool from_idle; irq_enter_rcu(); @@ -185,7 +185,7 @@ void noinstr do_ext_irq(struct pt_regs *regs) regs->int_parm = S390_lowcore.ext_params; regs->int_parm_long = S390_lowcore.ext_params2; - from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 401f9c933ff9..5e713f318de3 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -281,16 +281,6 @@ static void pop_kprobe(struct kprobe_ctlblk *kcb) } NOKPROBE_SYMBOL(pop_kprobe); -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - ri->ret_addr = (kprobe_opcode_t 
*)regs->gprs[14]; - ri->fp = (void *)regs->gprs[15]; - - /* Replace the return addr with trampoline addr */ - regs->gprs[14] = (unsigned long)&__kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) { switch (kcb->kprobe_status) { @@ -371,26 +361,6 @@ static int kprobe_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_handler); -void arch_kretprobe_fixup_return(struct pt_regs *regs, - kprobe_opcode_t *correct_ret_addr) -{ - /* Replace fake return address with real one. */ - regs->gprs[14] = (unsigned long)correct_ret_addr; -} -NOKPROBE_SYMBOL(arch_kretprobe_fixup_return); - -/* - * Called from __kretprobe_trampoline - */ -void trampoline_probe_handler(struct pt_regs *regs) -{ - kretprobe_trampoline_handler(regs, (void *)regs->gprs[15]); -} -NOKPROBE_SYMBOL(trampoline_probe_handler); - -/* assembler function that handles the kretprobes must not be probed itself */ -NOKPROBE_SYMBOL(__kretprobe_trampoline); - /* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 4579b42286d5..2a8e73266428 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -224,7 +224,6 @@ void machine_kexec_cleanup(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { struct lowcore *abs_lc; - unsigned long flags; VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); @@ -232,9 +231,9 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("SAMODE31=%lx\n", __samode31); vmcoreinfo_append_str("EAMODE31=%lx\n", __eamode31); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - abs_lc = get_abs_lowcore(&flags); + abs_lc = get_abs_lowcore(); abs_lc->vmcore_info = paddr_vmcoreinfo_note(); - put_abs_lowcore(abs_lc, flags); + put_abs_lowcore(abs_lc); } void machine_shutdown(void) diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 4786bfe02144..43ff91073d2a 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -135,9 +135,9 @@ SYM_FUNC_END(return_to_handler) #endif #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_KPROBES +#ifdef CONFIG_RETHOOK -SYM_FUNC_START(__kretprobe_trampoline) +SYM_FUNC_START(arch_rethook_trampoline) stg %r14,(__SF_GPRS+8*8)(%r15) lay %r15,-STACK_FRAME_SIZE(%r15) @@ -152,16 +152,16 @@ SYM_FUNC_START(__kretprobe_trampoline) epsw %r2,%r3 risbg %r3,%r2,0,31,32 stg %r3,STACK_PTREGS_PSW(%r15) - larl %r1,__kretprobe_trampoline + larl %r1,arch_rethook_trampoline stg %r1,STACK_PTREGS_PSW+8(%r15) lay %r2,STACK_PTREGS(%r15) - brasl %r14,trampoline_probe_handler + brasl %r14,arch_rethook_trampoline_callback mvc __SF_EMPTY(16,%r7),STACK_PTREGS_PSW(%r15) lmg %r0,%r15,STACK_PTREGS_GPRS(%r15) lpswe __SF_EMPTY(%r15) -SYM_FUNC_END(__kretprobe_trampoline) +SYM_FUNC_END(arch_rethook_trampoline) -#endif /* CONFIG_KPROBES */ +#endif /* CONFIG_RETHOOK */ diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index ec0bd9457e90..6e1824141b29 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -59,15 +59,14 @@ void os_info_entry_add(int nr, void *ptr, u64 size) void __init os_info_init(void) { struct lowcore *abs_lc; - unsigned long flags; os_info.version_major = OS_INFO_VERSION_MAJOR; os_info.version_minor = OS_INFO_VERSION_MINOR; os_info.magic = OS_INFO_MAGIC; os_info.csum = os_info_csum(&os_info); - abs_lc = 
get_abs_lowcore(&flags); + abs_lc = get_abs_lowcore(); abs_lc->os_info = __pa(&os_info); - put_abs_lowcore(abs_lc, flags); + put_abs_lowcore(abs_lc); } #ifdef CONFIG_CRASH_DUMP diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index f043a7ff220b..c9ab971498d6 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,7 +2,7 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2021 + * Copyright IBM Corp. 2012, 2023 * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> * Thomas Richter <tmricht@linux.ibm.com> */ @@ -16,11 +16,82 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/miscdevice.h> +#include <linux/perf_event.h> -#include <asm/cpu_mcf.h> +#include <asm/cpu_mf.h> #include <asm/hwctrset.h> #include <asm/debug.h> +enum cpumf_ctr_set { + CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ + CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ + CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ + CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ + CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ + + /* Maximum number of counter sets */ + CPUMF_CTR_SET_MAX, +}; + +#define CPUMF_LCCTL_ENABLE_SHIFT 16 +#define CPUMF_LCCTL_ACTCTL_SHIFT 0 + +static inline void ctr_set_enable(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; +} + +static inline void ctr_set_disable(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); +} + +static inline void ctr_set_start(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; +} + +static inline void ctr_set_stop(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); +} + +static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest) +{ + switch (set) { + case CPUMF_CTR_SET_BASIC: + return stcctm(BASIC, range, dest); + case CPUMF_CTR_SET_USER: + return stcctm(PROBLEM_STATE, range, dest); + case CPUMF_CTR_SET_CRYPTO: + return stcctm(CRYPTO_ACTIVITY, range, dest); + case CPUMF_CTR_SET_EXT: + return stcctm(EXTENDED, range, dest); + case CPUMF_CTR_SET_MT_DIAG: + return stcctm(MT_DIAG_CLEARING, range, dest); + case CPUMF_CTR_SET_MAX: + return 3; + } + return 3; +} + +struct cpu_cf_events { + struct cpumf_ctr_info info; + atomic_t ctr_set[CPUMF_CTR_SET_MAX]; + u64 state; /* For perf_event_open SVC */ + u64 dev_state; /* For /dev/hwctr */ + unsigned int flags; + size_t used; /* Bytes used in data */ + size_t usedss; /* Bytes used in start/stop */ + unsigned char start[PAGE_SIZE]; /* Counter set at event add */ + unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */ + unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */ + unsigned int sets; /* # Counter set saved in memory */ +}; + +/* Per-CPU event structure for the counter facility */ +static DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events); + static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ static debug_info_t *cf_dbg; @@ -112,6 +183,53 @@ static void cfdiag_trailer(struct cf_trailer_entry *te) te->timestamp = get_tod_clock_fast(); } +/* + * Return the maximum possible counter set size (in number of 8 byte counters) + * depending on type and model number. 
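+ * (cfvn and csvn used below are the counter facility first and second
+ * version numbers that QCTRI reports in struct cpumf_ctr_info.)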
+ */ +static size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, + struct cpumf_ctr_info *info) +{ + size_t ctrset_size = 0; + + switch (ctrset) { + case CPUMF_CTR_SET_BASIC: + if (info->cfvn >= 1) + ctrset_size = 6; + break; + case CPUMF_CTR_SET_USER: + if (info->cfvn == 1) + ctrset_size = 6; + else if (info->cfvn >= 3) + ctrset_size = 2; + break; + case CPUMF_CTR_SET_CRYPTO: + if (info->csvn >= 1 && info->csvn <= 5) + ctrset_size = 16; + else if (info->csvn == 6 || info->csvn == 7) + ctrset_size = 20; + break; + case CPUMF_CTR_SET_EXT: + if (info->csvn == 1) + ctrset_size = 32; + else if (info->csvn == 2) + ctrset_size = 48; + else if (info->csvn >= 3 && info->csvn <= 5) + ctrset_size = 128; + else if (info->csvn == 6 || info->csvn == 7) + ctrset_size = 160; + break; + case CPUMF_CTR_SET_MT_DIAG: + if (info->csvn > 3) + ctrset_size = 48; + break; + case CPUMF_CTR_SET_MAX: + break; + } + + return ctrset_size; +} + /* Read a counter set. The counter set number determines the counter set and * the CPUM-CF first and second version number determine the number of * available counters in each counter set. @@ -388,6 +506,47 @@ static void cpumf_pmu_disable(struct pmu *pmu) cpuhw->flags &= ~PMU_F_ENABLED; } +#define PMC_INIT 0UL +#define PMC_RELEASE 1UL + +static void cpum_cf_setup_cpu(void *flags) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + + switch ((unsigned long)flags) { + case PMC_INIT: + memset(&cpuhw->info, 0, sizeof(cpuhw->info)); + qctri(&cpuhw->info); + cpuhw->flags |= PMU_F_RESERVED; + break; + + case PMC_RELEASE: + cpuhw->flags &= ~PMU_F_RESERVED; + break; + } + + /* Disable CPU counter sets */ + lcctl(0); + debug_sprintf_event(cf_dbg, 5, "%s flags %#x flags %#x state %#llx\n", + __func__, *(int *)flags, cpuhw->flags, + cpuhw->state); +} + +/* Initialize the CPU-measurement counter facility */ +static int __kernel_cpumcf_begin(void) +{ + on_each_cpu(cpum_cf_setup_cpu, (void *)PMC_INIT, 1); + irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); + + return 0; +} + +/* Release the CPU-measurement counter facility */ +static void __kernel_cpumcf_end(void) +{ + on_each_cpu(cpum_cf_setup_cpu, (void *)PMC_RELEASE, 1); + irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); +} /* Number of perf events counting hardware events */ static atomic_t num_events = ATOMIC_INIT(0); @@ -397,12 +556,10 @@ static DEFINE_MUTEX(pmc_reserve_mutex); /* Release the PMU if event is the last perf event */ static void hw_perf_event_destroy(struct perf_event *event) { - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_events) == 0) - __kernel_cpumcf_end(); - mutex_unlock(&pmc_reserve_mutex); - } + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_events) == 0) + __kernel_cpumcf_end(); + mutex_unlock(&pmc_reserve_mutex); } /* CPUMF <-> perf event mappings for kernel+userspace (basic set) */ @@ -434,6 +591,12 @@ static void cpumf_hw_inuse(void) mutex_unlock(&pmc_reserve_mutex); } +static int is_userspace_event(u64 ev) +{ + return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev; +} + static int __hw_perf_event_init(struct perf_event *event, unsigned int type) { struct perf_event_attr *attr = &event->attr; @@ -456,19 +619,26 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) if (is_sampling_event(event)) /* No sampling support */ return -ENOENT; ev = attr->config; - /* Count user space (problem-state) only */ if 
(!attr->exclude_user && attr->exclude_kernel) { - if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_user[ev]; - - /* No support for kernel space counters only */ + /* + * Count user space (problem-state) only + * Handle events 32 and 33 as 0:u and 1:u + */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_user[ev]; + } } else if (!attr->exclude_kernel && attr->exclude_user) { + /* No support for kernel space counters only */ return -EOPNOTSUPP; - } else { /* Count user and kernel space */ - if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_basic[ev]; + } else { + /* Count user and kernel space, incl. events 32 + 33 */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_basic[ev]; + } } break; @@ -662,9 +832,7 @@ static int cfdiag_push_sample(struct perf_event *event, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = cpuhw->usedss; raw.frag.data = cpuhw->stop; - raw.size = raw.frag.size; - data.raw = &raw; - data.sample_flags |= PERF_SAMPLE_RAW; + perf_sample_save_raw_data(&data, &raw); } overflow = perf_event_overflow(event, &data, ®s); @@ -763,31 +931,120 @@ static struct pmu cpumf_pmu = { .read = cpumf_pmu_read, }; +static int cpum_cf_setup(unsigned int cpu, unsigned long flags) +{ + local_irq_disable(); + cpum_cf_setup_cpu((void *)flags); + local_irq_enable(); + return 0; +} + +static int cfset_online_cpu(unsigned int cpu); +static int cpum_cf_online_cpu(unsigned int cpu) +{ + debug_sprintf_event(cf_dbg, 4, "%s cpu %d in_irq %ld\n", __func__, + cpu, in_interrupt()); + cpum_cf_setup(cpu, PMC_INIT); + return cfset_online_cpu(cpu); +} + +static int cfset_offline_cpu(unsigned int cpu); +static int cpum_cf_offline_cpu(unsigned int cpu) +{ + debug_sprintf_event(cf_dbg, 4, "%s cpu %d\n", __func__, cpu); + cfset_offline_cpu(cpu); + return cpum_cf_setup(cpu, PMC_RELEASE); +} + +/* Return true if store counter set multiple instruction is available */ +static inline int stccm_avail(void) +{ + return test_facility(142); +} + +/* CPU-measurement alerts for the counter facility */ +static void cpumf_measurement_alert(struct ext_code ext_code, + unsigned int alert, unsigned long unused) +{ + struct cpu_cf_events *cpuhw; + + if (!(alert & CPU_MF_INT_CF_MASK)) + return; + + inc_irq_stat(IRQEXT_CMC); + cpuhw = this_cpu_ptr(&cpu_cf_events); + + /* + * Measurement alerts are shared and might happen when the PMU + * is not reserved. Ignore these alerts in this case. 
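+ * (Note: the removed common layer also stored the alert for in-kernel
+ * users via kernel_cpumcf_alert(); the handler below processes the
+ * alert completely and simply drops it.)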
+ */ + if (!(cpuhw->flags & PMU_F_RESERVED)) + return; + + /* counter authorization change alert */ + if (alert & CPU_MF_INT_CF_CACA) + qctri(&cpuhw->info); + + /* loss of counter data alert */ + if (alert & CPU_MF_INT_CF_LCDA) + pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); + + /* loss of MT counter data alert */ + if (alert & CPU_MF_INT_CF_MTDA) + pr_warn("CPU[%i] MT counter data was lost\n", + smp_processor_id()); +} + static int cfset_init(void); static int __init cpumf_pmu_init(void) { int rc; - if (!kernel_cpumcf_avail()) + if (!cpum_cf_avail()) return -ENODEV; + /* + * Clear bit 15 of cr0 to unauthorize problem-state to + * extract measurement counters + */ + ctl_clear_bit(0, 48); + + /* register handler for measurement-alert interruptions */ + rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + if (rc) { + pr_err("Registering for CPU-measurement alerts failed with rc=%i\n", rc); + return rc; + } + /* Setup s390dbf facility */ cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); if (!cf_dbg) { pr_err("Registration of s390dbf(cpum_cf) failed\n"); - return -ENOMEM; + rc = -ENOMEM; + goto out1; } debug_register_view(cf_dbg, &debug_sprintf_view); cpumf_pmu.attr_groups = cpumf_cf_event_group(); rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); if (rc) { - debug_unregister_view(cf_dbg, &debug_sprintf_view); - debug_unregister(cf_dbg); pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + goto out2; } else if (stccm_avail()) { /* Setup counter set device */ cfset_init(); } + + rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, + "perf/s390/cf:online", + cpum_cf_online_cpu, cpum_cf_offline_cpu); + return rc; + +out2: + debug_unregister_view(cf_dbg, &debug_sprintf_view); + debug_unregister(cf_dbg); +out1: + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, cpumf_measurement_alert); return rc; } @@ -1005,7 +1262,6 @@ static int cfset_all_start(struct cfset_request *req) return rc; } - /* Return the maximum required space for all possible CPUs in case one * CPU will be onlined during the START, READ, STOP cycles. * To find out the size of the counter sets, any one CPU will do. They @@ -1268,7 +1524,7 @@ static struct miscdevice cfset_dev = { /* Hotplug add of a CPU. Scan through all active processes and add * that CPU to the list of CPUs supplied with ioctl(..., START, ...). */ -int cfset_online_cpu(unsigned int cpu) +static int cfset_online_cpu(unsigned int cpu) { struct cfset_call_on_cpu_parm p; struct cfset_request *rp; @@ -1288,7 +1544,7 @@ int cfset_online_cpu(unsigned int cpu) /* Hotplug remove of a CPU. Scan through all active processes and clear * that CPU from the list of CPUs supplied with ioctl(..., START, ...). */ -int cfset_offline_cpu(unsigned int cpu) +static int cfset_offline_cpu(unsigned int cpu) { struct cfset_call_on_cpu_parm p; struct cfset_request *rp; diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c deleted file mode 100644 index 8ee48672233f..000000000000 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ /dev/null @@ -1,233 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * CPU-Measurement Counter Facility Support - Common Layer - * - * Copyright IBM Corp. 
2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - */ -#define KMSG_COMPONENT "cpum_cf_common" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <asm/ctl_reg.h> -#include <asm/irq.h> -#include <asm/cpu_mcf.h> - -/* Per-CPU event structure for the counter facility */ -DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = { - .ctr_set = { - [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_MT_DIAG] = ATOMIC_INIT(0), - }, - .alert = ATOMIC64_INIT(0), - .state = 0, - .dev_state = 0, - .flags = 0, - .used = 0, - .usedss = 0, - .sets = 0 -}; -/* Indicator whether the CPU-Measurement Counter Facility Support is ready */ -static bool cpum_cf_initalized; - -/* CPU-measurement alerts for the counter facility */ -static void cpumf_measurement_alert(struct ext_code ext_code, - unsigned int alert, unsigned long unused) -{ - struct cpu_cf_events *cpuhw; - - if (!(alert & CPU_MF_INT_CF_MASK)) - return; - - inc_irq_stat(IRQEXT_CMC); - cpuhw = this_cpu_ptr(&cpu_cf_events); - - /* Measurement alerts are shared and might happen when the PMU - * is not reserved. Ignore these alerts in this case. */ - if (!(cpuhw->flags & PMU_F_RESERVED)) - return; - - /* counter authorization change alert */ - if (alert & CPU_MF_INT_CF_CACA) - qctri(&cpuhw->info); - - /* loss of counter data alert */ - if (alert & CPU_MF_INT_CF_LCDA) - pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); - - /* loss of MT counter data alert */ - if (alert & CPU_MF_INT_CF_MTDA) - pr_warn("CPU[%i] MT counter data was lost\n", - smp_processor_id()); - - /* store alert for special handling by in-kernel users */ - atomic64_or(alert, &cpuhw->alert); -} - -#define PMC_INIT 0 -#define PMC_RELEASE 1 -static void cpum_cf_setup_cpu(void *flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - switch (*((int *) flags)) { - case PMC_INIT: - memset(&cpuhw->info, 0, sizeof(cpuhw->info)); - qctri(&cpuhw->info); - cpuhw->flags |= PMU_F_RESERVED; - break; - - case PMC_RELEASE: - cpuhw->flags &= ~PMU_F_RESERVED; - break; - } - - /* Disable CPU counter sets */ - lcctl(0); -} - -bool kernel_cpumcf_avail(void) -{ - return cpum_cf_initalized; -} -EXPORT_SYMBOL(kernel_cpumcf_avail); - -/* Initialize the CPU-measurement counter facility */ -int __kernel_cpumcf_begin(void) -{ - int flags = PMC_INIT; - - on_each_cpu(cpum_cf_setup_cpu, &flags, 1); - irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - return 0; -} -EXPORT_SYMBOL(__kernel_cpumcf_begin); - -/* Obtain the CPU-measurement alerts for the counter facility */ -unsigned long kernel_cpumcf_alert(int clear) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - unsigned long alert; - - alert = atomic64_read(&cpuhw->alert); - if (clear) - atomic64_set(&cpuhw->alert, 0); - - return alert; -} -EXPORT_SYMBOL(kernel_cpumcf_alert); - -/* Release the CPU-measurement counter facility */ -void __kernel_cpumcf_end(void) -{ - int flags = PMC_RELEASE; - - on_each_cpu(cpum_cf_setup_cpu, &flags, 1); - irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); -} -EXPORT_SYMBOL(__kernel_cpumcf_end); - -static int cpum_cf_setup(unsigned int cpu, int flags) -{ - local_irq_disable(); - cpum_cf_setup_cpu(&flags); - local_irq_enable(); - return 0; -} - -static int 
cpum_cf_online_cpu(unsigned int cpu) -{ - cpum_cf_setup(cpu, PMC_INIT); - return cfset_online_cpu(cpu); -} - -static int cpum_cf_offline_cpu(unsigned int cpu) -{ - cfset_offline_cpu(cpu); - return cpum_cf_setup(cpu, PMC_RELEASE); -} - -/* Return the maximum possible counter set size (in number of 8 byte counters) - * depending on type and model number. - */ -size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, - struct cpumf_ctr_info *info) -{ - size_t ctrset_size = 0; - - switch (ctrset) { - case CPUMF_CTR_SET_BASIC: - if (info->cfvn >= 1) - ctrset_size = 6; - break; - case CPUMF_CTR_SET_USER: - if (info->cfvn == 1) - ctrset_size = 6; - else if (info->cfvn >= 3) - ctrset_size = 2; - break; - case CPUMF_CTR_SET_CRYPTO: - if (info->csvn >= 1 && info->csvn <= 5) - ctrset_size = 16; - else if (info->csvn == 6 || info->csvn == 7) - ctrset_size = 20; - break; - case CPUMF_CTR_SET_EXT: - if (info->csvn == 1) - ctrset_size = 32; - else if (info->csvn == 2) - ctrset_size = 48; - else if (info->csvn >= 3 && info->csvn <= 5) - ctrset_size = 128; - else if (info->csvn == 6 || info->csvn == 7) - ctrset_size = 160; - break; - case CPUMF_CTR_SET_MT_DIAG: - if (info->csvn > 3) - ctrset_size = 48; - break; - case CPUMF_CTR_SET_MAX: - break; - } - - return ctrset_size; -} - -static int __init cpum_cf_init(void) -{ - int rc; - - if (!cpum_cf_avail()) - return -ENODEV; - - /* clear bit 15 of cr0 to unauthorize problem-state to - * extract measurement counters */ - ctl_clear_bit(0, 48); - - /* register handler for measurement-alert interruptions */ - rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, - cpumf_measurement_alert); - if (rc) { - pr_err("Registering for CPU-measurement alerts " - "failed with rc=%i\n", rc); - return rc; - } - - rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, - "perf/s390/cf:online", - cpum_cf_online_cpu, cpum_cf_offline_cpu); - if (!rc) - cpum_cf_initalized = true; - - return rc; -} -early_initcall(cpum_cf_init); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index ce886a03545a..79904a839fb9 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -22,6 +22,7 @@ #include <asm/irq.h> #include <asm/debug.h> #include <asm/timex.h> +#include <asm-generic/io.h> /* Minimum number of sample-data-block-tables: * At least one table is required for the sampling buffer structure. 
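For orientation: the hunk below treats a sample-data-block table (SDBT) as a page of 8-byte entries, where an entry with bit 0 clear holds the physical address of a sample data block (SDB) and an entry with bit 0 set is a tagged link to the next SDBT page. A hypothetical walker built only on the two helpers added below, is_link_entry() and get_next_sdbt(); the wrapper name and control flow are illustrative, not part of the patch:

	static unsigned long *sdbt_next_slot(unsigned long *entry)
	{
		/* Tagged entry: follow the link to the start of the next SDBT page. */
		if (is_link_entry(entry))
			return get_next_sdbt(entry);
		/* Plain SDB entry: the next slot sits directly behind this one. */
		return entry + 1;
	}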
@@ -99,6 +100,57 @@ static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf); /* Debug feature */ static debug_info_t *sfdbg; +/* Sampling control helper functions */ +static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi, + unsigned long freq) +{ + return (USEC_PER_SEC / freq) * qsi->cpu_speed; +} + +static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, + unsigned long rate) +{ + return USEC_PER_SEC * qsi->cpu_speed / rate; +} + +/* Return TOD timestamp contained in a trailer entry */ +static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te) +{ + /* TOD in STCKE format */ + if (te->header.t) + return *((unsigned long long *)&te->timestamp[1]); + + /* TOD in STCK format */ + return *((unsigned long long *)&te->timestamp[0]); +} + +/* Return pointer to trailer entry of a sample data block */ +static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v) +{ + void *ret; + + ret = (void *)v; + ret += PAGE_SIZE; + ret -= sizeof(struct hws_trailer_entry); + + return ret; +} + +/* + * Return true if the entry in the sample data block table (sdbt) + * is a link to the next sdbt + */ +static inline int is_link_entry(unsigned long *s) +{ + return *s & 0x1UL ? 1 : 0; +} + +/* Return pointer to the linked sdbt */ +static inline unsigned long *get_next_sdbt(unsigned long *s) +{ + return phys_to_virt(*s & ~0x1UL); +} + /* * sf_disable() - Switch off sampling facility */ @@ -150,7 +202,7 @@ static void free_sampling_buffer(struct sf_buffer *sfb) } else { /* Process SDB pointer */ if (*curr) { - free_page(*curr); + free_page((unsigned long)phys_to_virt(*curr)); curr++; } } @@ -170,11 +222,11 @@ static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags) sdb = get_zeroed_page(gfp_flags); if (!sdb) return -ENOMEM; - te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + te = trailer_entry_ptr(sdb); te->header.a = 1; /* Link SDB into the sample-data-block-table */ - *sdbt = sdb; + *sdbt = virt_to_phys((void *)sdb); return 0; } @@ -233,7 +285,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, } sfb->num_sdbt++; /* Link current page to tail of chain */ - *tail = (unsigned long)(void *) new + 1; + *tail = virt_to_phys((void *)new) + 1; tail_prev = tail; tail = new; } @@ -263,7 +315,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, } /* Link sampling buffer to its origin */ - *tail = (unsigned long) sfb->sdbt + 1; + *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; debug_sprintf_event(sfdbg, 4, "%s: new buffer" @@ -301,7 +353,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) * realloc_sampling_buffer() invocation.
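 * Note that the link entries written here hold physical addresses with
 * bit 0 set (see is_link_entry() above): the sampling hardware walks the
 * SDB tables with real addresses, hence the virt_to_phys() conversions
 * throughout these hunks.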
*/ sfb->tail = sfb->sdbt; - *sfb->tail = (unsigned long)(void *) sfb->sdbt + 1; + *sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1; /* Allocate requested number of sample-data-blocks */ rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL); @@ -557,9 +609,6 @@ static void setup_pmc_cpu(void *flags) if (err) pr_err("Switching off the sampling facility failed " "with rc %i\n", err); - debug_sprintf_event(sfdbg, 5, - "%s: initialized: cpuhw %p\n", __func__, - cpusf); break; case PMC_RELEASE: cpusf->flags &= ~PMU_F_RESERVED; @@ -569,9 +618,6 @@ static void setup_pmc_cpu(void *flags) "with rc %i\n", err); } else deallocate_buffers(cpusf); - debug_sprintf_event(sfdbg, 5, - "%s: released: cpuhw %p\n", __func__, - cpusf); break; } if (err) @@ -672,7 +718,8 @@ static void cpumsf_output_event_pid(struct perf_event *event, /* Protect callchain buffers, tasks */ rcu_read_lock(); - perf_prepare_sample(&header, data, event, regs); + perf_prepare_sample(data, event, regs); + perf_prepare_header(&header, data, event, regs); if (perf_output_begin(&handle, data, event, header.size)) goto out; @@ -1176,8 +1223,8 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, struct hws_trailer_entry *te; struct hws_basic_entry *sample; - te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); - sample = (struct hws_basic_entry *) *sdbt; + te = trailer_entry_ptr((unsigned long)sdbt); + sample = (struct hws_basic_entry *)sdbt; while ((unsigned long *) sample < (unsigned long *) te) { /* Check for an empty sample */ if (!sample->def || sample->LS) @@ -1258,7 +1305,7 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) union hws_trailer_header old, prev, new; struct hw_perf_event *hwc = &event->hw; struct hws_trailer_entry *te; - unsigned long *sdbt; + unsigned long *sdbt, sdb; int done; /* @@ -1275,7 +1322,8 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) done = event_overflow = sampl_overflow = num_sdb = 0; while (!done) { /* Get the trailer entry of the sample-data-block */ - te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); + sdb = (unsigned long)phys_to_virt(*sdbt); + te = trailer_entry_ptr(sdb); /* Leave loop if no more work to do (block full indicator) */ if (!te->header.f) { @@ -1293,16 +1341,17 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) sampl_overflow += te->header.overflow; /* Timestamps are valid for full sample-data-blocks only */ - debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx " + debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx/%#lx " "overflow %llu timestamp %#llx\n", - __func__, (unsigned long)sdbt, te->header.overflow, + __func__, sdb, (unsigned long)sdbt, + te->header.overflow, (te->header.f) ? trailer_timestamp(te) : 0ULL); /* Collect all samples from a single sample-data-block and * flag if an (perf) event overflow happened. If so, the PMU * is stopped and remaining samples will be discarded. */ - hw_collect_samples(event, sdbt, &event_overflow); + hw_collect_samples(event, (unsigned long *)sdb, &event_overflow); num_sdb++; /* Reset trailer (using compare-double-and-swap) */ @@ -1360,10 +1409,26 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) OVERFLOW_REG(hwc), num_sdb); } -#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb) -#define AUX_SDB_NUM(aux, start, end) (end >= start ? 
end - start + 1 : 0) -#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark) -#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark) +static inline unsigned long aux_sdb_index(struct aux_buffer *aux, + unsigned long i) +{ + return i % aux->sfb.num_sdb; +} + +static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end) +{ + return end >= start ? end - start + 1 : 0; +} + +static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->alert_mark); +} + +static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->empty_mark); +} /* * Get trailer entry by index of SDB. @@ -1373,9 +1438,9 @@ static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux, { unsigned long sdb; - index = AUX_SDB_INDEX(aux, index); + index = aux_sdb_index(aux, index); sdb = aux->sdb_index[index]; - return (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + return trailer_entry_ptr(sdb); } /* @@ -1397,7 +1462,7 @@ static void aux_output_end(struct perf_output_handle *handle) if (!aux) return; - range_scan = AUX_SDB_NUM_ALERT(aux); + range_scan = aux_sdb_num_alert(aux); for (i = 0, idx = aux->head; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); if (!te->header.f) @@ -1427,9 +1492,7 @@ static int aux_output_begin(struct perf_output_handle *handle, struct aux_buffer *aux, struct cpu_hw_sf *cpuhw) { - unsigned long range; - unsigned long i, range_scan, idx; - unsigned long head, base, offset; + unsigned long range, i, range_scan, idx, head, base, offset; struct hws_trailer_entry *te; if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) @@ -1448,8 +1511,8 @@ static int aux_output_begin(struct perf_output_handle *handle, "%s: range %ld head %ld alert %ld empty %ld\n", __func__, range, aux->head, aux->alert_mark, aux->empty_mark); - if (range > AUX_SDB_NUM_EMPTY(aux)) { - range_scan = range - AUX_SDB_NUM_EMPTY(aux); + if (range > aux_sdb_num_empty(aux)) { + range_scan = range - aux_sdb_num_empty(aux); idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); @@ -1467,11 +1530,11 @@ static int aux_output_begin(struct perf_output_handle *handle, te->header.a = 1; /* Reset hardware buffer head */ - head = AUX_SDB_INDEX(aux, aux->head); + head = aux_sdb_index(aux, aux->head); base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE]; offset = head % CPUM_SF_SDB_PER_TABLE; - cpuhw->lsctl.tear = base + offset * sizeof(unsigned long); - cpuhw->lsctl.dear = aux->sdb_index[head]; + cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long); + cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]); debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld " "index %ld tear %#lx dear %#lx\n", __func__, @@ -1549,7 +1612,7 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld " "empty %ld\n", __func__, range, aux->head, aux->alert_mark, aux->empty_mark); - if (range <= AUX_SDB_NUM_EMPTY(aux)) + if (range <= aux_sdb_num_empty(aux)) /* * No need to scan. All SDBs in range are marked as empty. * Just set alert indicator. Should check race with hardware @@ -1570,7 +1633,7 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, * Start scanning from one SDB behind empty_mark. If the new alert * indicator fall into this range, set it. 
*/ - range_scan = range - AUX_SDB_NUM_EMPTY(aux); + range_scan = range - aux_sdb_num_empty(aux); idx_old = idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); @@ -1617,7 +1680,7 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) return; /* Inform user space new data arrived */ - size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__, size >> PAGE_SHIFT); perf_aux_output_end(handle, size); @@ -1659,7 +1722,7 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) "overflow %lld\n", __func__, aux->head, range, overflow); } else { - size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; perf_aux_output_end(&cpuhw->handle, size); debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " "already full, try another\n", @@ -1701,7 +1764,7 @@ static void aux_sdb_init(unsigned long sdb) { struct hws_trailer_entry *te; - te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + te = trailer_entry_ptr(sdb); /* Save clock base */ te->clock_base = 1; @@ -1781,18 +1844,18 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, goto no_sdbt; aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new; /* Link current page to tail of chain */ - *tail = (unsigned long)(void *) new + 1; + *tail = virt_to_phys(new) + 1; tail = new; } /* Tail is the entry in a SDBT */ - *tail = (unsigned long)pages[i]; + *tail = virt_to_phys(pages[i]); aux->sdb_index[i] = (unsigned long)pages[i]; aux_sdb_init((unsigned long)pages[i]); } sfb->num_sdb = nr_pages; /* Link the last entry in the SDBT to the first SDBT */ - *tail = (unsigned long) sfb->sdbt + 1; + *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; /* @@ -1932,7 +1995,7 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) cpuhw->lsctl.h = 1; cpuhw->lsctl.interval = SAMPL_RATE(&event->hw); if (!SAMPL_DIAG_MODE(&event->hw)) { - cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt; + cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt); cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt; TEAR_REG(&event->hw) = (unsigned long) cpuhw->sfb.sdbt; } diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 985e243a2ed8..a7b339c4fd7c 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -362,9 +362,7 @@ static int paicrypt_push_sample(void) if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = rawsize; raw.frag.data = cpump->save; - raw.size = raw.frag.size; - data.raw = &raw; - data.sample_flags |= PERF_SAMPLE_RAW; + perf_sample_save_raw_data(&data, &raw); } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index 1138f57baae3..fcea307d7529 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -16,8 +16,8 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/io.h> +#include <linux/perf_event.h> -#include <asm/cpu_mcf.h> #include <asm/ctl_reg.h> #include <asm/pai.h> #include <asm/debug.h> @@ -451,9 +451,7 @@ static int paiext_push_sample(void) if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = rawsize; raw.frag.data = cpump->save; - raw.size = raw.frag.size; - data.raw = &raw; - data.sample_flags |= PERF_SAMPLE_RAW; + perf_sample_save_raw_data(&data, &raw); } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/s390/kernel/process.c 
b/arch/s390/kernel/process.c index 3f5d2db0b854..67df64ef4839 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -147,8 +147,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (unlikely(args->fn)) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); - frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | + PSW_MASK_EXT | PSW_MASK_MCHECK; frame->childregs.psw.addr = (unsigned long)__ret_from_fork; frame->childregs.gprs[9] = (unsigned long)args->fn; diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 53e0209229f8..cf9659e13f03 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -990,7 +990,7 @@ static int s390_vxrs_low_get(struct task_struct *target, if (target == current) save_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); + vxrs[i] = target->thread.fpu.vxrs[i].low; return membuf_write(&to, vxrs, sizeof(vxrs)); } @@ -1008,12 +1008,12 @@ static int s390_vxrs_low_set(struct task_struct *target, save_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); + vxrs[i] = target->thread.fpu.vxrs[i].low; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); if (rc == 0) for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i]; + target->thread.fpu.vxrs[i].low = vxrs[i]; return rc; } diff --git a/arch/s390/kernel/rethook.c b/arch/s390/kernel/rethook.c new file mode 100644 index 000000000000..af10e6bdd34e --- /dev/null +++ b/arch/s390/kernel/rethook.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/rethook.h> +#include <linux/kprobes.h> +#include "rethook.h" + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + rh->ret_addr = regs->gprs[14]; + rh->frame = regs->gprs[15]; + + /* Replace the return addr with trampoline addr */ + regs->gprs[14] = (unsigned long)&arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); + +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* Replace fake return address with real one. 
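+ * (The fake address is arch_rethook_trampoline, planted in gpr 14 by
+ * arch_rethook_prepare(); the rethook core recovers the original return
+ * address and passes it in as correct_ret_addr.)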
*/ + regs->gprs[14] = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +/* + * Called from arch_rethook_trampoline + */ +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + return rethook_trampoline_handler(regs, regs->gprs[15]); +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* assembler function that handles the rethook must not be probed itself */ +NOKPROBE_SYMBOL(arch_rethook_trampoline); diff --git a/arch/s390/kernel/rethook.h b/arch/s390/kernel/rethook.h new file mode 100644 index 000000000000..32f069eed3f3 --- /dev/null +++ b/arch/s390/kernel/rethook.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __S390_RETHOOK_H +#define __S390_RETHOOK_H + +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs); + +#endif diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 696c9e007a36..8ec5cdf9dadc 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -149,6 +149,9 @@ int __bootdata(noexec_disabled); unsigned long __bootdata(ident_map_size); struct mem_detect_info __bootdata(mem_detect); struct initrd_data __bootdata(initrd_data); +unsigned long __bootdata(pgalloc_pos); +unsigned long __bootdata(pgalloc_end); +unsigned long __bootdata(pgalloc_low); unsigned long __bootdata_preserved(__kaslr_offset); unsigned long __bootdata(__amode31_base); @@ -411,15 +414,10 @@ void __init arch_call_rest_init(void) call_on_stack_noreturn(rest_init, stack); } -static void __init setup_lowcore_dat_off(void) +static void __init setup_lowcore(void) { - unsigned long int_psw_mask = PSW_KERNEL_BITS; - struct lowcore *abs_lc, *lc; + struct lowcore *lc, *abs_lc; unsigned long mcck_stack; - unsigned long flags; - - if (IS_ENABLED(CONFIG_KASAN)) - int_psw_mask |= PSW_MASK_DAT; /* * Setup lowcore for boot cpu @@ -430,17 +428,17 @@ static void __init setup_lowcore_dat_off(void) panic("%s: Failed to allocate %zu bytes align=%zx\n", __func__, sizeof(*lc), sizeof(*lc)); - lc->restart_psw.mask = PSW_KERNEL_BITS; - lc->restart_psw.addr = (unsigned long) restart_int_handler; - lc->external_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; + lc->restart_psw.addr = __pa(restart_int_handler); + lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->external_new_psw.addr = (unsigned long) ext_int_handler; - lc->svc_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->svc_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->svc_new_psw.addr = (unsigned long) system_call; - lc->program_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->program_new_psw.addr = (unsigned long) pgm_check_handler; - lc->mcck_new_psw.mask = int_psw_mask; + lc->mcck_new_psw.mask = PSW_KERNEL_BITS; lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler; - lc->io_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->io_new_psw.addr = (unsigned long) io_int_handler; lc->clock_comparator = clock_comparator_max; lc->nodat_stack = ((unsigned long) &init_thread_union) @@ -477,15 +475,7 @@ static void __init setup_lowcore_dat_off(void) lc->restart_fn = (unsigned long) do_restart; lc->restart_data = 0; lc->restart_source = -1U; - - abs_lc = get_abs_lowcore(&flags); - abs_lc->restart_stack = lc->restart_stack; - abs_lc->restart_fn = lc->restart_fn; - abs_lc->restart_data = lc->restart_data; - abs_lc->restart_source = lc->restart_source; - abs_lc->restart_psw 
= lc->restart_psw; - abs_lc->mcesad = lc->mcesad; - put_abs_lowcore(abs_lc, flags); + __ctl_store(lc->cregs_save_area, 0, 15); mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); if (!mcck_stack) @@ -499,34 +489,25 @@ static void __init setup_lowcore_dat_off(void) lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; + lc->kernel_asce = S390_lowcore.kernel_asce; + lc->user_asce = S390_lowcore.user_asce; + + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = lc->restart_stack; + abs_lc->restart_fn = lc->restart_fn; + abs_lc->restart_data = lc->restart_data; + abs_lc->restart_source = lc->restart_source; + abs_lc->restart_psw = lc->restart_psw; + abs_lc->restart_flags = RESTART_FLAG_CTLREGS; + memcpy(abs_lc->cregs_save_area, lc->cregs_save_area, sizeof(abs_lc->cregs_save_area)); + abs_lc->program_new_psw = lc->program_new_psw; + abs_lc->mcesad = lc->mcesad; + put_abs_lowcore(abs_lc); set_prefix(__pa(lc)); lowcore_ptr[0] = lc; -} - -static void __init setup_lowcore_dat_on(void) -{ - struct lowcore *abs_lc; - unsigned long flags; - int i; - - __ctl_clear_bit(0, 28); - S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.mcck_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT; - __ctl_set_bit(0, 28); - __ctl_store(S390_lowcore.cregs_save_area, 0, 15); - if (abs_lowcore_map(0, lowcore_ptr[0], true)) + if (abs_lowcore_map(0, lowcore_ptr[0], false)) panic("Couldn't setup absolute lowcore"); - abs_lowcore_mapped = true; - abs_lc = get_abs_lowcore(&flags); - abs_lc->restart_flags = RESTART_FLAG_CTLREGS; - abs_lc->program_new_psw = S390_lowcore.program_new_psw; - for (i = 0; i < 16; i++) - abs_lc->cregs_save_area[i] = S390_lowcore.cregs_save_area[i]; - put_abs_lowcore(abs_lc, flags); } static struct resource code_resource = { @@ -619,7 +600,6 @@ static void __init setup_resources(void) static void __init setup_memory_end(void) { - memblock_remove(ident_map_size, PHYS_ADDR_MAX - ident_map_size); max_pfn = max_low_pfn = PFN_DOWN(ident_map_size); pr_notice("The maximum memory size is %luMB\n", ident_map_size >> 20); } @@ -651,6 +631,14 @@ static struct notifier_block kdump_mem_nb = { #endif /* + * Reserve page tables created by decompressor + */ +static void __init reserve_pgtables(void) +{ + memblock_reserve(pgalloc_pos, pgalloc_end - pgalloc_pos); +} + +/* * Reserve memory for kdump kernel to be loaded with kexec */ static void __init reserve_crashkernel(void) @@ -784,10 +772,10 @@ static void __init memblock_add_mem_detect_info(void) get_mem_info_source(), mem_detect.info_source); /* keep memblock lists close to the kernel */ memblock_set_bottom_up(true); - for_each_mem_detect_block(i, &start, &end) { + for_each_mem_detect_usable_block(i, &start, &end) memblock_add(start, end - start); + for_each_mem_detect_block(i, &start, &end) memblock_physmem_add(start, end - start); - } memblock_set_bottom_up(false); memblock_set_node(0, ULONG_MAX, &memblock.memory, 0); } @@ -1005,6 +993,7 @@ void __init setup_arch(char **cmdline_p) setup_control_program_code(); /* Do some memory reservations *before* memory is added to memblock */ + reserve_pgtables(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); @@ -1039,7 +1028,7 @@ void __init setup_arch(char **cmdline_p) #endif setup_resources(); - setup_lowcore_dat_off(); + setup_lowcore(); 
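	/*
	 * A hedged aside on the PSW changes in the setup_lowcore() hunk above:
	 * with the decompressor now creating the kernel page tables and
	 * entering the kernel with DAT already enabled, PSW_KERNEL_BITS is
	 * assumed to include PSW_MASK_DAT, so the interrupt PSWs no longer
	 * OR it in explicitly:
	 *
	 *	lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK;
	 *
	 * and only the restart PSW, which has to run with DAT off, masks it
	 * back out:
	 *
	 *	lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT;
	 */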
smp_fill_possible_mask(); cpu_detect_mhz_feature(); cpu_init(); @@ -1051,15 +1040,14 @@ void __init setup_arch(char **cmdline_p) static_branch_enable(&cpu_has_bear); /* - * Create kernel page tables and switch to virtual addressing. + * Create kernel page tables. */ paging_init(); - memcpy_real_init(); + /* * After paging_init created the kernel page table, the new PSWs * in lowcore can now run with DAT enabled. */ - setup_lowcore_dat_on(); #ifdef CONFIG_CRASH_DUMP smp_save_dump_ipl_cpu(); #endif diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 38258f817048..d63557d3868c 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -184,7 +184,7 @@ static int save_sigregs_ext(struct pt_regs *regs, /* Save vector registers to signal stack */ if (MACHINE_HAS_VX) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + vxrs[i] = current->thread.fpu.vxrs[i].low; if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, @@ -210,7 +210,7 @@ static int restore_sigregs_ext(struct pt_regs *regs, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + current->thread.fpu.vxrs[i].low = vxrs[i]; } return 0; } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 0031325ce4bc..23c427284773 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -323,11 +323,10 @@ static void pcpu_delegate(struct pcpu *pcpu, { struct lowcore *lc, *abs_lc; unsigned int source_cpu; - unsigned long flags; lc = lowcore_ptr[pcpu - pcpu_devices]; source_cpu = stap(); - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + if (pcpu->address == source_cpu) { call_on_stack(2, stack, void, __pcpu_delegate, pcpu_delegate_fn *, func, void *, data); @@ -341,12 +340,12 @@ static void pcpu_delegate(struct pcpu *pcpu, lc->restart_data = (unsigned long)data; lc->restart_source = source_cpu; } else { - abs_lc = get_abs_lowcore(&flags); + abs_lc = get_abs_lowcore(); abs_lc->restart_stack = stack; abs_lc->restart_fn = (unsigned long)func; abs_lc->restart_data = (unsigned long)data; abs_lc->restart_source = source_cpu; - put_abs_lowcore(abs_lc, flags); + put_abs_lowcore(abs_lc); } __bpon(); asm volatile( @@ -488,7 +487,7 @@ void smp_send_stop(void) int cpu; /* Disable all interrupts/machine checks */ - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + __load_psw_mask(PSW_KERNEL_BITS); trace_hardirqs_off(); debug_set_critical(); @@ -593,7 +592,6 @@ void smp_ctl_set_clear_bit(int cr, int bit, bool set) { struct ec_creg_mask_parms parms = { .cr = cr, }; struct lowcore *abs_lc; - unsigned long flags; u64 ctlreg; if (set) { @@ -604,11 +602,11 @@ void smp_ctl_set_clear_bit(int cr, int bit, bool set) parms.andval = ~(1UL << bit); } spin_lock(&ctl_lock); - abs_lc = get_abs_lowcore(&flags); + abs_lc = get_abs_lowcore(); ctlreg = abs_lc->cregs_save_area[cr]; ctlreg = (ctlreg & parms.andval) | parms.orval; abs_lc->cregs_save_area[cr] = ctlreg; - put_abs_lowcore(abs_lc, flags); + put_abs_lowcore(abs_lc); spin_unlock(&ctl_lock); on_each_cpu(smp_ctl_bit_callback, &parms, 1); } diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 7ee455e8e3d5..0787010139f7 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -40,12 +40,12 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (!addr) return -EINVAL; -#ifdef CONFIG_KPROBES +#ifdef 
CONFIG_RETHOOK /* - * Mark stacktraces with kretprobed functions on them + * Mark stacktraces with rethook functions on them * as unreliable. */ - if (state.ip == (unsigned long)__kretprobe_trampoline) + if (state.ip == (unsigned long)arch_rethook_trampoline) return -EINVAL; #endif diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S index 2c8b14cc5556..e0f01ce251f5 100644 --- a/arch/s390/kernel/text_amode31.S +++ b/arch/s390/kernel/text_amode31.S @@ -63,6 +63,19 @@ ENTRY(_diag210_amode31) ENDPROC(_diag210_amode31) /* + * int diag8c(struct diag8c *addr, struct ccw_dev_id *devno, size_t len) +*/ +ENTRY(_diag8c_amode31) + llgf %r3,0(%r3) + sam31 + diag %r2,%r4,0x8c +.Ldiag8c_ex: + sam64 + lgfr %r2,%r3 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag8c_ex, .Ldiag8c_ex) +ENDPROC(_diag8c_amode31) +/* * int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode) */ ENTRY(_diag26c_amode31) diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index ff7bf4432229..bbaefd84f15e 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -59,11 +59,9 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (!vma_is_special_mapping(vma, &vvar_mapping)) continue; - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); break; } mmap_read_unlock(mm); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index cbf9c1b0beda..b653ba8d51e6 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -44,7 +44,6 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT @@ -217,6 +216,9 @@ SECTIONS QUAD(__rela_dyn_start) /* rela_dyn_start */ QUAD(__rela_dyn_end) /* rela_dyn_end */ QUAD(_eamode31 - _samode31) /* amode31_size */ + QUAD(init_mm) + QUAD(swapper_pg_dir) + QUAD(invalid_pg_dir) } :NONE /* Debugging sections.
*/ @@ -228,5 +230,6 @@ SECTIONS DISCARDS /DISCARD/ : { *(.eh_frame) + *(.interp) } } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 9436f3053b88..e0a88dcaf5cb 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -7,13 +7,13 @@ */ #include <linux/kernel_stat.h> -#include <linux/sched/cputime.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/timex.h> #include <linux/types.h> #include <linux/time.h> #include <asm/alternative.h> +#include <asm/cputime.h> #include <asm/vtimer.h> #include <asm/vtime.h> #include <asm/cpu_mf.h> diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c index 5a053b393d5c..7231bf97b93a 100644 --- a/arch/s390/lib/test_unwind.c +++ b/arch/s390/lib/test_unwind.c @@ -47,7 +47,7 @@ static void print_backtrace(char *bt) static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, unsigned long sp) { - int frame_count, prev_is_func2, seen_func2_func1, seen_kretprobe_trampoline; + int frame_count, prev_is_func2, seen_func2_func1, seen_arch_rethook_trampoline; const int max_frames = 128; struct unwind_state state; size_t bt_pos = 0; @@ -63,7 +63,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, frame_count = 0; prev_is_func2 = 0; seen_func2_func1 = 0; - seen_kretprobe_trampoline = 0; + seen_arch_rethook_trampoline = 0; unwind_for_each_frame(&state, task, regs, sp) { unsigned long addr = unwind_get_return_address(&state); char sym[KSYM_SYMBOL_LEN]; @@ -89,8 +89,8 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, if (prev_is_func2 && str_has_prefix(sym, "unwindme_func1")) seen_func2_func1 = 1; prev_is_func2 = str_has_prefix(sym, "unwindme_func2"); - if (str_has_prefix(sym, "__kretprobe_trampoline+0x0/")) - seen_kretprobe_trampoline = 1; + if (str_has_prefix(sym, "arch_rethook_trampoline+0x0/")) + seen_arch_rethook_trampoline = 1; } /* Check the results. 
*/ @@ -106,8 +106,8 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, kunit_err(current_test, "Maximum number of frames exceeded\n"); ret = -EINVAL; } - if (seen_kretprobe_trampoline) { - kunit_err(current_test, "__kretprobe_trampoline+0x0 in unwinding results\n"); + if (seen_arch_rethook_trampoline) { + kunit_err(current_test, "arch_rethook_trampoline+0x0 in unwinding results\n"); ret = -EINVAL; } if (ret || force_bt) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 9953819d7959..ba5f80268878 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -33,10 +33,6 @@ enum address_markers_idx { #endif IDENTITY_AFTER_NR, IDENTITY_AFTER_END_NR, -#ifdef CONFIG_KASAN - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, -#endif VMEMMAP_NR, VMEMMAP_END_NR, VMALLOC_NR, @@ -47,6 +43,10 @@ enum address_markers_idx { ABS_LOWCORE_END_NR, MEMCPY_REAL_NR, MEMCPY_REAL_END_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif }; static struct addr_marker address_markers[] = { @@ -62,10 +62,6 @@ static struct addr_marker address_markers[] = { #endif [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, -#ifdef CONFIG_KASAN - [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, - [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, -#endif [VMEMMAP_NR] = {0, "vmemmap Area Start"}, [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, [VMALLOC_NR] = {0, "vmalloc Area Start"}, @@ -76,6 +72,10 @@ static struct addr_marker address_markers[] = { [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, +#ifdef CONFIG_KASAN + [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, + [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, +#endif { -1, NULL } }; diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c index 1e4d2187541a..fe87291df95d 100644 --- a/arch/s390/mm/extable.c +++ b/arch/s390/mm/extable.c @@ -47,13 +47,16 @@ static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struc return true; } -static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, + bool pair, struct pt_regs *regs) { unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); regs->gprs[reg_err] = -EFAULT; regs->gprs[reg_zero] = 0; + if (pair) + regs->gprs[reg_zero + 1] = 0; regs->psw.addr = extable_fixup(ex); return true; } @@ -75,7 +78,9 @@ bool fixup_exception(struct pt_regs *regs) case EX_TYPE_UA_LOAD_MEM: return ex_handler_ua_load_mem(ex, regs); case EX_TYPE_UA_LOAD_REG: - return ex_handler_ua_load_reg(ex, regs); + return ex_handler_ua_load_reg(ex, false, regs); + case EX_TYPE_UA_LOAD_REGPAIR: + return ex_handler_ua_load_reg(ex, true, regs); } panic("invalid exception table entry"); } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 9649d9382e0a..a2632fd97d00 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -46,11 +46,15 @@ #define __SUBCODE_MASK 0x0600 #define __PF_RES_FIELD 0x8000000000000000ULL -#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000) -#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000) -#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000) 
-#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000) -#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000) +/* + * Allocate private vm_fault_reason from top. Please make sure it won't + * collide with vm_fault_reason. + */ +#define VM_FAULT_BADCONTEXT ((__force vm_fault_t)0x80000000) +#define VM_FAULT_BADMAP ((__force vm_fault_t)0x40000000) +#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x20000000) +#define VM_FAULT_SIGNAL ((__force vm_fault_t)0x10000000) +#define VM_FAULT_PFAULT ((__force vm_fault_t)0x8000000) enum fault_type { KERNEL_FAULT, @@ -96,6 +100,20 @@ static enum fault_type get_fault_type(struct pt_regs *regs) return KERNEL_FAULT; } +static unsigned long get_fault_address(struct pt_regs *regs) +{ + unsigned long trans_exc_code = regs->int_parm_long; + + return trans_exc_code & __FAIL_ADDR_MASK; +} + +static bool fault_is_write(struct pt_regs *regs) +{ + unsigned long trans_exc_code = regs->int_parm_long; + + return (trans_exc_code & store_indication) == 0x400; +} + static int bad_address(void *p) { unsigned long dummy; @@ -228,15 +246,26 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code) (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); } -static noinline void do_no_context(struct pt_regs *regs) +static noinline void do_no_context(struct pt_regs *regs, vm_fault_t fault) { + enum fault_type fault_type; + unsigned long address; + bool is_write; + if (fixup_exception(regs)) return; + fault_type = get_fault_type(regs); + if ((fault_type == KERNEL_FAULT) && (fault == VM_FAULT_BADCONTEXT)) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; + } /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ - if (get_fault_type(regs) == KERNEL_FAULT) + if (fault_type == KERNEL_FAULT) printk(KERN_ALERT "Unable to handle kernel pointer dereference" " in virtual kernel address space\n"); else @@ -255,7 +284,7 @@ static noinline void do_low_address(struct pt_regs *regs) die (regs, "Low-address protection"); } - do_no_context(regs); + do_no_context(regs, VM_FAULT_BADACCESS); } static noinline void do_sigbus(struct pt_regs *regs) @@ -286,28 +315,28 @@ static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault) fallthrough; case VM_FAULT_BADCONTEXT: case VM_FAULT_PFAULT: - do_no_context(regs); + do_no_context(regs, fault); break; case VM_FAULT_SIGNAL: if (!user_mode(regs)) - do_no_context(regs); + do_no_context(regs, fault); break; default: /* fault & VM_FAULT_ERROR */ if (fault & VM_FAULT_OOM) { if (!user_mode(regs)) - do_no_context(regs); + do_no_context(regs, fault); else pagefault_out_of_memory(); } else if (fault & VM_FAULT_SIGSEGV) { /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) - do_no_context(regs); + do_no_context(regs, fault); else do_sigsegv(regs, SEGV_MAPERR); } else if (fault & VM_FAULT_SIGBUS) { /* Kernel mode? 
Handle exceptions or die */ if (!user_mode(regs)) - do_no_context(regs); + do_no_context(regs, fault); else do_sigbus(regs); } else @@ -334,7 +363,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) struct mm_struct *mm; struct vm_area_struct *vma; enum fault_type type; - unsigned long trans_exc_code; unsigned long address; unsigned int flags; vm_fault_t fault; @@ -351,9 +379,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) return 0; mm = tsk->mm; - trans_exc_code = regs->int_parm_long; - address = trans_exc_code & __FAIL_ADDR_MASK; - is_write = (trans_exc_code & store_indication) == 0x400; + address = get_fault_address(regs); + is_write = fault_is_write(regs); /* * Verify that the fault happened in user space, that @@ -364,8 +391,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) type = get_fault_type(regs); switch (type) { case KERNEL_FAULT: - if (kfence_handle_page_fault(address, is_write, regs)) - return 0; goto out; case USER_FAULT: case GMAP_FAULT: diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 74e1d873dce0..5a716bdcba05 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -722,7 +722,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) if (is_vm_hugetlb_page(vma)) continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size); + zap_page_range_single(vma, vmaddr, size, NULL); } mmap_read_unlock(gmap->mm); } @@ -2522,8 +2522,7 @@ static inline void thp_split_mm(struct mm_struct *mm) VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; @@ -2588,14 +2587,18 @@ int gmap_mark_unmergeable(void) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + unsigned long vm_flags; int ret; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { + /* Copy vm_flags to avoid partial modifications in ksm_madvise */ + vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags); + MADV_UNMERGEABLE, &vm_flags); if (ret) return ret; + vm_flags_reset(vma, vm_flags); } mm->def_flags &= ~VM_MERGEABLE; return 0; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 30ab55f868f6..144447d5cb4c 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -52,9 +52,9 @@ #include <linux/virtio_config.h> pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); -static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); -unsigned long s390_invalid_asce; +unsigned long __bootdata_preserved(s390_invalid_asce); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); @@ -93,37 +93,8 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - psw_t psw; - - s390_invalid_asce = (unsigned long)invalid_pg_dir; - s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); - init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > _REGION2_SIZE) { - asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = 
_REGION3_ENTRY_EMPTY; - } - init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.kernel_asce = init_mm.context.asce; - S390_lowcore.user_asce = s390_invalid_asce; - crst_table_init((unsigned long *) init_mm.pgd, pgd_type); - vmem_map_init(); - kasan_copy_shadow_mapping(); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.user_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); - kasan_free_early_identity(); + vmem_map_init(); sparse_init(); zone_dma_bits = 31; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index 9f988d4582ed..ef89a5f26853 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kasan.h> #include <linux/sched/task.h> -#include <linux/memblock.h> #include <linux/pgtable.h> #include <asm/pgalloc.h> #include <asm/kasan.h> @@ -15,16 +14,11 @@ static unsigned long segment_pos __initdata; static unsigned long segment_low __initdata; -static unsigned long pgalloc_pos __initdata; -static unsigned long pgalloc_low __initdata; -static unsigned long pgalloc_freeable __initdata; static bool has_edat __initdata; static bool has_nx __initdata; #define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) -static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); - static void __init kasan_early_panic(const char *reason) { sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n"); @@ -39,7 +33,7 @@ static void * __init kasan_early_alloc_segment(void) if (segment_pos < segment_low) kasan_early_panic("out of memory during initialisation\n"); - return (void *)segment_pos; + return __va(segment_pos); } static void * __init kasan_early_alloc_pages(unsigned int order) @@ -49,7 +43,7 @@ static void * __init kasan_early_alloc_pages(unsigned int order) if (pgalloc_pos < pgalloc_low) kasan_early_panic("out of memory during initialisation\n"); - return (void *)pgalloc_pos; + return __va(pgalloc_pos); } static void * __init kasan_early_crst_alloc(unsigned long val) @@ -81,35 +75,37 @@ static pte_t * __init kasan_early_pte_alloc(void) } enum populate_mode { - POPULATE_ONE2ONE, POPULATE_MAP, POPULATE_ZERO_SHADOW, POPULATE_SHALLOW }; + +static inline pgprot_t pgprot_clear_bit(pgprot_t pgprot, unsigned long bit) +{ + return __pgprot(pgprot_val(pgprot) & ~bit); +} + static void __init kasan_early_pgtable_populate(unsigned long address, unsigned long end, enum populate_mode mode) { - unsigned long pgt_prot_zero, pgt_prot, sgt_prot; + pgprot_t pgt_prot_zero = PAGE_KERNEL_RO; + pgprot_t pgt_prot = PAGE_KERNEL; + pgprot_t sgt_prot = SEGMENT_KERNEL; pgd_t *pg_dir; p4d_t *p4_dir; pud_t *pu_dir; pmd_t *pm_dir; pte_t *pt_dir; + pmd_t pmd; + pte_t pte; - pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO); - if (!has_nx) - pgt_prot_zero &= ~_PAGE_NOEXEC; - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - if (!has_nx || mode == POPULATE_ONE2ONE) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; + if (!has_nx) { + pgt_prot_zero = pgprot_clear_bit(pgt_prot_zero, _PAGE_NOEXEC); + pgt_prot = pgprot_clear_bit(pgt_prot, _PAGE_NOEXEC); + sgt_prot = pgprot_clear_bit(sgt_prot, _SEGMENT_ENTRY_NOEXEC); } - /* - * The first 1MB of 1:1 mapping is mapped with 4KB 
pages - */ while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { @@ -166,16 +162,13 @@ static void __init kasan_early_pgtable_populate(unsigned long address, pmd_populate(&init_mm, pm_dir, kasan_early_shadow_pte); address = (address + PMD_SIZE) & PMD_MASK; continue; - } else if (has_edat && address) { - void *page; - - if (mode == POPULATE_ONE2ONE) { - page = (void *)address; - } else { - page = kasan_early_alloc_segment(); - memset(page, 0, _SEGMENT_SIZE); - } - set_pmd(pm_dir, __pmd(__pa(page) | sgt_prot)); + } else if (has_edat) { + void *page = kasan_early_alloc_segment(); + + memset(page, 0, _SEGMENT_SIZE); + pmd = __pmd(__pa(page)); + pmd = set_pmd_bit(pmd, sgt_prot); + set_pmd(pm_dir, pmd); address = (address + PMD_SIZE) & PMD_MASK; continue; } @@ -192,18 +185,18 @@ static void __init kasan_early_pgtable_populate(unsigned long address, void *page; switch (mode) { - case POPULATE_ONE2ONE: - page = (void *)address; - set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); - break; case POPULATE_MAP: page = kasan_early_alloc_pages(0); memset(page, 0, PAGE_SIZE); - set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); + pte = __pte(__pa(page)); + pte = set_pte_bit(pte, pgt_prot); + set_pte(pt_dir, pte); break; case POPULATE_ZERO_SHADOW: page = kasan_early_shadow_page; - set_pte(pt_dir, __pte(__pa(page) | pgt_prot_zero)); + pte = __pte(__pa(page)); + pte = set_pte_bit(pte, pgt_prot_zero); + set_pte(pt_dir, pte); break; case POPULATE_SHALLOW: /* should never happen */ @@ -214,29 +207,6 @@ static void __init kasan_early_pgtable_populate(unsigned long address, } } -static void __init kasan_set_pgd(pgd_t *pgd, unsigned long asce_type) -{ - unsigned long asce_bits; - - asce_bits = asce_type | _ASCE_TABLE_LENGTH; - S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); -} - -static void __init kasan_enable_dat(void) -{ - psw_t psw; - - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); -} - static void __init kasan_early_detect_facilities(void) { if (test_facility(8)) { @@ -251,153 +221,81 @@ static void __init kasan_early_detect_facilities(void) void __init kasan_early_init(void) { - unsigned long shadow_alloc_size; - unsigned long initrd_end; - unsigned long memsize; - unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO); - pte_t pte_z; + pte_t pte_z = __pte(__pa(kasan_early_shadow_page) | pgprot_val(PAGE_KERNEL_RO)); pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); + unsigned long untracked_end = MODULES_VADDR; + unsigned long shadow_alloc_size; + unsigned long start, end; + int i; kasan_early_detect_facilities(); if (!has_nx) - pgt_prot &= ~_PAGE_NOEXEC; - pte_z = __pte(__pa(kasan_early_shadow_page) | pgt_prot); - - memsize = get_mem_detect_end(); - if (!memsize) - kasan_early_panic("cannot detect physical memory size\n"); - /* - * Kasan currently supports standby memory but only if it follows - * online memory (default allocation), i.e. no memory holes. - * - memsize represents end of online memory - * - ident_map_size represents online + standby and memory limits - * accounted. - * Kasan maps "memsize" right away. 
- * [0, memsize] - as identity mapping - * [__sha(0), __sha(memsize)] - shadow memory for identity mapping - * The rest [memsize, ident_map_size] if memsize < ident_map_size - * could be mapped/unmapped dynamically later during memory hotplug. - */ - memsize = min(memsize, ident_map_size); + pte_z = clear_pte_bit(pte_z, __pgprot(_PAGE_NOEXEC)); BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY); /* init kasan zero shadow */ - crst_table_init((unsigned long *)kasan_early_shadow_p4d, - p4d_val(p4d_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pud, - pud_val(pud_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pmd, - pmd_val(pmd_z)); + crst_table_init((unsigned long *)kasan_early_shadow_p4d, p4d_val(p4d_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pud, pud_val(pud_z)); + crst_table_init((unsigned long *)kasan_early_shadow_pmd, pmd_val(pmd_z)); memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); - shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT; - pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { - initrd_end = - round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); - pgalloc_low = max(pgalloc_low, initrd_end); - } - - if (pgalloc_low + shadow_alloc_size > memsize) - kasan_early_panic("out of memory during initialisation\n"); - if (has_edat) { - segment_pos = round_down(memsize, _SEGMENT_SIZE); + shadow_alloc_size = get_mem_detect_usable_total() >> KASAN_SHADOW_SCALE_SHIFT; + segment_pos = round_down(pgalloc_pos, _SEGMENT_SIZE); segment_low = segment_pos - shadow_alloc_size; + segment_low = round_down(segment_low, _SEGMENT_SIZE); pgalloc_pos = segment_low; - } else { - pgalloc_pos = memsize; } - init_mm.pgd = early_pg_dir; /* * Current memory layout: - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * | | / | | - * +- end of ram ----+ / +----------------+ - * | ... gap ... | / | | - * | |/ | kasan | - * +- shadow start --+ | zero | - * | 1/8 addr space | | page | - * +- shadow end -+ | mapping | - * | ... gap ... |\ | (untracked) | - * +- vmalloc area -+ \ | | - * | vmalloc_size | \ | | - * +- modules vaddr -+ \ +----------------+ - * | 2Gb | \| unmapped | allocated per module - * +-----------------+ +- shadow end ---+ + * +- 0 -------------+ +- shadow start -+ + * |1:1 ident mapping| /|1/8 of ident map| + * | | / | | + * +-end of ident map+ / +----------------+ + * | ... gap ... | / | kasan | + * | | / | zero page | + * +- vmalloc area -+ / | mapping | + * | vmalloc_size | / | (untracked) | + * +- modules vaddr -+ / +----------------+ + * | 2Gb |/ | unmapped | allocated per module + * +- shadow start -+ +----------------+ + * | 1/8 addr space | | zero pg mapping| (untracked) + * +- shadow end ----+---------+- shadow end ---+ * * Current memory layout (KASAN_VMALLOC): - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * | | / | | - * +- end of ram ----+ / +----------------+ - * | ... gap ... | / | kasan | - * | |/ | zero | - * +- shadow start --+ | page | - * | 1/8 addr space | | mapping | - * +- shadow end -+ | (untracked) | - * | ... gap ... 
|\ | | - * +- vmalloc area -+ \ +- vmalloc area -+ - * | vmalloc_size | \ |shallow populate| - * +- modules vaddr -+ \ +- modules area -+ - * | 2Gb | \|shallow populate| - * +-----------------+ +- shadow end ---+ + * +- 0 -------------+ +- shadow start -+ + * |1:1 ident mapping| /|1/8 of ident map| + * | | / | | + * +-end of ident map+ / +----------------+ + * | ... gap ... | / | kasan zero page| (untracked) + * | | / | mapping | + * +- vmalloc area -+ / +----------------+ + * | vmalloc_size | / |shallow populate| + * +- modules vaddr -+ / +----------------+ + * | 2Gb |/ |shallow populate| + * +- shadow start -+ +----------------+ + * | 1/8 addr space | | zero pg mapping| (untracked) + * +- shadow end ----+---------+- shadow end ---+ */ /* populate kasan shadow (for identity mapping and zero page mapping) */ - kasan_early_pgtable_populate(__sha(0), __sha(memsize), POPULATE_MAP); + for_each_mem_detect_usable_block(i, &start, &end) + kasan_early_pgtable_populate(__sha(start), __sha(end), POPULATE_MAP); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + untracked_end = VMALLOC_START; /* shallowly populate kasan shadow for vmalloc and modules */ kasan_early_pgtable_populate(__sha(VMALLOC_START), __sha(MODULES_END), POPULATE_SHALLOW); } /* populate kasan shadow for untracked memory */ - kasan_early_pgtable_populate(__sha(ident_map_size), - IS_ENABLED(CONFIG_KASAN_VMALLOC) ? - __sha(VMALLOC_START) : - __sha(MODULES_VADDR), + kasan_early_pgtable_populate(__sha(ident_map_size), __sha(untracked_end), POPULATE_ZERO_SHADOW); kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE), POPULATE_ZERO_SHADOW); - /* memory allocated for identity mapping structs will be freed later */ - pgalloc_freeable = pgalloc_pos; - /* populate identity mapping */ - kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE); - kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2); - kasan_enable_dat(); /* enable kasan */ init_task.kasan_depth = 0; - memblock_reserve(pgalloc_pos, memsize - pgalloc_pos); sclp_early_printk("KernelAddressSanitizer initialized\n"); } - -void __init kasan_copy_shadow_mapping(void) -{ - /* - * At this point we are still running on early pages setup early_pg_dir, - * while swapper_pg_dir has just been initialized with identity mapping. - * Carry over shadow memory region from early_pg_dir to swapper_pg_dir. 
- */ - - pgd_t *pg_dir_src; - pgd_t *pg_dir_dst; - p4d_t *p4_dir_src; - p4d_t *p4_dir_dst; - - pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); - pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START); - p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); - p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); - memcpy(p4_dir_dst, p4_dir_src, - (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); -} - -void __init kasan_free_early_identity(void) -{ - memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); -} diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 4824d1cd33d8..d02a61620cfa 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -21,7 +21,7 @@ #include <asm/maccess.h> unsigned long __bootdata_preserved(__memcpy_real_area); -static __ro_after_init pte_t *memcpy_real_ptep; +pte_t *__bootdata_preserved(memcpy_real_ptep); static DEFINE_MUTEX(memcpy_real_mutex); static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) @@ -68,28 +68,17 @@ notrace void *s390_kernel_write(void *dst, const void *src, size_t size) long copied; spin_lock_irqsave(&s390_kernel_write_lock, flags); - if (!(flags & PSW_MASK_DAT)) { - memcpy(dst, src, size); - } else { - while (size) { - copied = s390_kernel_write_odd(tmp, src, size); - tmp += copied; - src += copied; - size -= copied; - } + while (size) { + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; + src += copied; + size -= copied; } spin_unlock_irqrestore(&s390_kernel_write_lock, flags); return dst; } -void __init memcpy_real_init(void) -{ - memcpy_real_ptep = vmem_get_alloc_pte(__memcpy_real_area, true); - if (!memcpy_real_ptep) - panic("Couldn't setup memcpy real area"); -} - size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { size_t len, copied, res = 0; @@ -162,7 +151,6 @@ void *xlate_dev_mem_ptr(phys_addr_t addr) void *ptr = phys_to_virt(addr); void *bounce = ptr; struct lowcore *abs_lc; - unsigned long flags; unsigned long size; int this_cpu, cpu; @@ -178,10 +166,10 @@ void *xlate_dev_mem_ptr(phys_addr_t addr) goto out; size = PAGE_SIZE - (addr & ~PAGE_MASK); if (addr < sizeof(struct lowcore)) { - abs_lc = get_abs_lowcore(&flags); + abs_lc = get_abs_lowcore(); ptr = (void *)abs_lc + addr; memcpy(bounce, ptr, size); - put_abs_lowcore(abs_lc, flags); + put_abs_lowcore(abs_lc); } else if (cpu == this_cpu) { ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); memcpy(bounce, ptr, size); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4909dcd762e8..6effb24de6d9 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -302,6 +302,31 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_xchg_direct); +/* + * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that + * RDP can be used instead of IPTE. See also comments at pte_allow_rdp(). + */ +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new) +{ + preempt_disable(); + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_rdp(addr, ptep, 0, 0, 1); + else + __ptep_rdp(addr, ptep, 0, 0, 0); + /* + * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That + * means it is still valid and active, and must not be changed according + * to the architecture. But writing a new value that only differs in SW + * bits is allowed. 
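+ * A minimal caller sketch (hedged; the exact conditions live in
+ * pte_allow_rdp()):
+ *
+ *	if (pte_allow_rdp(old, new))
+ *		ptep_reset_dat_prot(mm, addr, ptep, new);
+ *	else
+ *		ptep_xchg_direct(mm, addr, ptep, new);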
+ */ + set_pte(ptep, new); + atomic_dec(&mm->context.flush_count); + preempt_enable(); +} +EXPORT_SYMBOL(ptep_reset_dat_prot); + pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index ee1a97078527..4113a7ffa149 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -11,6 +11,7 @@ #include <linux/list.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/sort.h> #include <asm/cacheflush.h> #include <asm/nospec-branch.h> #include <asm/pgalloc.h> @@ -296,10 +297,7 @@ static void try_free_pmd_table(pud_t *pud, unsigned long start) /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ if (end > VMALLOC_START) return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif + pmd = pmd_offset(pud, start); for (i = 0; i < PTRS_PER_PMD; i++, pmd++) if (!pmd_none(*pmd)) @@ -371,10 +369,6 @@ static void try_free_pud_table(p4d_t *p4d, unsigned long start) /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ if (end > VMALLOC_START) return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif pud = pud_offset(p4d, start); for (i = 0; i < PTRS_PER_PUD; i++, pud++) { @@ -425,10 +419,6 @@ static void try_free_p4d_table(pgd_t *pgd, unsigned long start) /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ if (end > VMALLOC_START) return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif p4d = p4d_offset(pgd, start); for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { @@ -657,6 +647,23 @@ void vmem_unmap_4k_page(unsigned long addr) mutex_unlock(&vmem_mutex); } +static int __init memblock_region_cmp(const void *a, const void *b) +{ + const struct memblock_region *r1 = a; + const struct memblock_region *r2 = b; + + if (r1->base < r2->base) + return -1; + if (r1->base > r2->base) + return 1; + return 0; +} + +static void __init memblock_region_swap(void *a, void *b, int size) +{ + swap(*(struct memblock_region *)a, *(struct memblock_region *)b); +} + /* * map whole physical memory to virtual memory (identity mapping) * we reserve enough space in the vmalloc area for vmemmap to hotplug @@ -664,11 +671,68 @@ void vmem_unmap_4k_page(unsigned long addr) */ void __init vmem_map_init(void) { + struct memblock_region memory_rwx_regions[] = { + { + .base = 0, + .size = sizeof(struct lowcore), + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + { + .base = __pa(_stext), + .size = _etext - _stext, + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + { + .base = __pa(_sinittext), + .size = _einittext - _sinittext, + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + { + .base = __stext_amode31, + .size = __etext_amode31 - __stext_amode31, + .flags = MEMBLOCK_NONE, +#ifdef CONFIG_NUMA + .nid = NUMA_NO_NODE, +#endif + }, + }; + struct memblock_type memory_rwx = { + .regions = memory_rwx_regions, + .cnt = ARRAY_SIZE(memory_rwx_regions), + .max = ARRAY_SIZE(memory_rwx_regions), + }; phys_addr_t base, end; u64 i; - for_each_mem_range(i, &base, &end) - vmem_add_range(base, end - base); + /* + * Set RW+NX attribute on all memory, except regions enumerated with + * memory_rwx exclude type. These regions need different attributes, + * which are enforced afterwards. 
+ sort(&memory_rwx_regions, + ARRAY_SIZE(memory_rwx_regions), sizeof(memory_rwx_regions[0]), + memblock_region_cmp, memblock_region_swap); + __for_each_mem_range(i, &memblock.memory, &memory_rwx, + NUMA_NO_NODE, MEMBLOCK_NONE, &base, &end, NULL) { + __set_memory((unsigned long)__va(base), + (end - base) >> PAGE_SHIFT, + SET_MEMORY_RW | SET_MEMORY_NX); + } + __set_memory((unsigned long)_stext, (unsigned long)(_etext - _stext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); @@ -678,15 +742,14 @@ void __init vmem_map_init(void) __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, + __set_memory(__stext_amode31, + (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - /* lowcore requires 4k mapping for real addresses / prefixing */ - set_memory_4k(0, LC_PAGES); - /* lowcore must be executable for LPSWE */ - if (!static_key_enabled(&cpu_has_bear)) - set_memory_x(0, 1); + if (static_key_enabled(&cpu_has_bear)) + set_memory_nx(0, 1); + set_memory_nx(PAGE_SIZE, 1); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index af35052d06ed..d0846ba818ee 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -30,6 +30,7 @@ #include <asm/facility.h> #include <asm/nospec-branch.h> #include <asm/set_memory.h> +#include <asm/text-patching.h> #include "bpf_jit.h" struct bpf_jit { @@ -50,12 +51,13 @@ struct bpf_jit { int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ int excnt; /* Number of exception table entries */ + int prologue_plt_ret; /* Return address for prologue hotpatch PLT */ + int prologue_plt; /* Start of prologue hotpatch PLT */ }; #define SEEN_MEM BIT(0) /* use mem[] for temporary storage */ #define SEEN_LITERAL BIT(1) /* code uses literals */ #define SEEN_FUNC BIT(2) /* calls C functions */ -#define SEEN_TAIL_CALL BIT(3) /* code uses tail calls */ #define SEEN_STACK (SEEN_FUNC | SEEN_MEM) /* @@ -68,6 +70,10 @@ struct bpf_jit { #define REG_0 REG_W0 /* Register 0 */ #define REG_1 REG_W1 /* Register 1 */ #define REG_2 BPF_REG_1 /* Register 2 */ +#define REG_3 BPF_REG_2 /* Register 3 */ +#define REG_4 BPF_REG_3 /* Register 4 */ +#define REG_7 BPF_REG_6 /* Register 7 */ +#define REG_8 BPF_REG_7 /* Register 8 */ #define REG_14 BPF_REG_0 /* Register 14 */ /* @@ -507,20 +513,58 @@ static void bpf_skip(struct bpf_jit *jit, int size) }
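The next hunk introduces a 32-byte PLT: 14 bytes of code (two 6-byte lgrl loads plus a 2-byte br), padding to an 8-byte boundary, then the two literal pointers that the loads fetch. A C view of that layout, assuming exactly the asm emitted below (a sketch, not kernel code):

#include <string.h>
#include <stdint.h>

struct bpf_plt_view {
	uint8_t  insns[16];	/* lgrl %r0,...; lgrl %r1,...; br %r1; pad */
	uint64_t ret;		/* loaded into %r0: the return address     */
	uint64_t target;	/* loaded into %r1: the branch target      */
};

/* Mirrors bpf_jit_plt(): copy the template, then fill in the literals. */
static void plt_fill(struct bpf_plt_view *plt, const void *tmpl,
		     void *ret, void *target)
{
	memcpy(plt, tmpl, sizeof(*plt));
	plt->ret = (uint64_t)ret;
	plt->target = (uint64_t)target;
}

Because only the two trailing quadwords carry state, the instruction bytes of the PLT stay stable while bpf_arch_text_poke() later swaps the target pointer.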
 /* + * PLT for hotpatchable calls. The calling convention is the same as for the + * ftrace hotpatch trampolines: %r0 is return address, %r1 is clobbered. + */ +extern const char bpf_plt[]; +extern const char bpf_plt_ret[]; +extern const char bpf_plt_target[]; +extern const char bpf_plt_end[]; +#define BPF_PLT_SIZE 32 +asm( + ".pushsection .rodata\n" + " .align 8\n" + "bpf_plt:\n" + " lgrl %r0,bpf_plt_ret\n" + " lgrl %r1,bpf_plt_target\n" + " br %r1\n" + " .align 8\n" + "bpf_plt_ret: .quad 0\n" + "bpf_plt_target: .quad 0\n" + "bpf_plt_end:\n" + " .popsection\n" +); + +static void bpf_jit_plt(void *plt, void *ret, void *target) +{ + memcpy(plt, bpf_plt, BPF_PLT_SIZE); + *(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret; + *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target; +} + +/* * Emit function prologue * * Save registers and create stack frame if necessary. - * See stack frame layout desription in "bpf_jit.h"! + * See stack frame layout description in "bpf_jit.h"! */ -static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) +static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, + u32 stack_depth) { - if (jit->seen & SEEN_TAIL_CALL) { + /* No-op for hotpatching */ + /* brcl 0,prologue_plt */ + EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt); + jit->prologue_plt_ret = jit->prg; + + if (fp->aux->func_idx == 0) { + /* Initialize the tail call counter in the main program. */ /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */ _EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT); } else { /* - * There are no tail calls. Insert nops in order to have - * tail_call_start at a predictable offset. + * Skip the tail call counter initialization in subprograms. + * Insert nops in order to have tail_call_start at a + * predictable offset. */ bpf_skip(jit, 6); } @@ -558,6 +602,43 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) } /* + * Emit an expoline for a jump that follows + */ +static void emit_expoline(struct bpf_jit *jit) +{ + /* exrl %r0,.+10 */ + EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); + /* j . */ + EMIT4_PCREL(0xa7f40000, 0); +} + +/* + * Emit __s390_indirect_jump_r1 thunk if necessary + */ +static void emit_r1_thunk(struct bpf_jit *jit) +{ + if (nospec_uses_trampoline()) { + jit->r1_thunk_ip = jit->prg; + emit_expoline(jit); + /* br %r1 */ + _EMIT2(0x07f1); + } +} + +/* + * Call r1 either directly or via __s390_indirect_jump_r1 thunk + */ +static void call_r1(struct bpf_jit *jit) +{ + if (nospec_uses_trampoline()) + /* brasl %r14,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); + else + /* basr %r14,%r1 */ + EMIT2(0x0d00, REG_14, REG_1); +} + +/* * Function epilogue */ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) @@ -570,25 +651,20 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) if (nospec_uses_trampoline()) { jit->r14_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r14 thunk */ - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - /* j . */ - EMIT4_PCREL(0xa7f40000, 0); + emit_expoline(jit); } /* br %r14 */ _EMIT2(0x07fe); - if ((nospec_uses_trampoline()) && - (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) { - jit->r1_thunk_ip = jit->prg; - /* Generate __s390_indirect_jump_r1 thunk */ - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - /* j . 
*/ - EMIT4_PCREL(0xa7f40000, 0); - /* br %r1 */ - _EMIT2(0x07f1); - } + if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) + emit_r1_thunk(jit); + + jit->prg = ALIGN(jit->prg, 8); + jit->prologue_plt = jit->prg; + if (jit->prg_buf) + bpf_jit_plt(jit->prg_buf + jit->prg, + jit->prg_buf + jit->prologue_plt_ret, NULL); + jit->prg += BPF_PLT_SIZE; } static int get_probe_mem_regno(const u8 *insn) @@ -663,6 +739,34 @@ static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp, } /* + * Sign-extend the register if necessary + */ +static int sign_extend(struct bpf_jit *jit, int r, u8 size, u8 flags) +{ + if (!(flags & BTF_FMODEL_SIGNED_ARG)) + return 0; + + switch (size) { + case 1: + /* lgbr %r,%r */ + EMIT4(0xb9060000, r, r); + return 0; + case 2: + /* lghr %r,%r */ + EMIT4(0xb9070000, r, r); + return 0; + case 4: + /* lgfr %r,%r */ + EMIT4(0xb9140000, r, r); + return 0; + case 8: + return 0; + default: + return -1; + } +} + +/* * Compile one eBPF instruction into s390x code * * NOTE: Use noinline because for gcov (-fprofile-arcs) gcc allocates a lot of @@ -1297,9 +1401,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, */ case BPF_JMP | BPF_CALL: { - u64 func; + const struct btf_func_model *m; bool func_addr_fixed; - int ret; + int j, ret; + u64 func; ret = bpf_jit_get_func_addr(fp, insn, extra_pass, &func, &func_addr_fixed); @@ -1308,15 +1413,38 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; + /* + * Copy the tail call counter to where the callee expects it. + * + * Note 1: The callee can increment the tail call counter, but + * we do not load it back, since the x86 JIT does not do this + * either. + * + * Note 2: We assume that the verifier does not let us call the + * main program, which clears the tail call counter on entry. + */ + /* mvc STK_OFF_TCCNT(4,%r15),N(%r15) */ + _EMIT6(0xd203f000 | STK_OFF_TCCNT, + 0xf000 | (STK_OFF_TCCNT + STK_OFF + stack_depth)); +
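A side note on sign_extend() above: the lgbr/lghr/lgfr instructions it emits behave like C integer casts applied to the low bits of a 64-bit register. An illustrative equivalent:

#include <stdint.h>

static int64_t sext(uint64_t reg, unsigned int size)
{
	switch (size) {
	case 1: return (int8_t)reg;	/* lgbr */
	case 2: return (int16_t)reg;	/* lghr */
	case 4: return (int32_t)reg;	/* lgfr */
	case 8: return (int64_t)reg;	/* already 64 bits wide */
	default: return 0;		/* sign_extend() returns -1 here */
	}
}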
+ /* Sign-extend the kfunc arguments. */ + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + m = bpf_jit_find_kfunc_model(fp, insn); + if (!m) + return -1; + + for (j = 0; j < m->nr_args; j++) { + if (sign_extend(jit, BPF_REG_1 + j, + m->arg_size[j], + m->arg_flags[j])) + return -1; + } + } + /* lgrl %w1,func */ EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); - if (nospec_uses_trampoline()) { - /* brasl %r14,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); - } else { - /* basr %r14,%w1 */ - EMIT2(0x0d00, REG_14, REG_W1); - } + /* %r1() */ + call_r1(jit); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); break; @@ -1329,10 +1457,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, * B1: pointer to ctx * B2: pointer to bpf_array * B3: index in bpf_array - */ - jit->seen |= SEEN_TAIL_CALL; - - /* + * * if (index >= array->map.max_entries) * goto out; */ @@ -1393,8 +1518,16 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* lg %r1,bpf_func(%r1) */ EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_1, REG_0, offsetof(struct bpf_prog, bpf_func)); - /* bc 0xf,tail_call_start(%r1) */ - _EMIT4(0x47f01000 + jit->tail_call_start); + if (nospec_uses_trampoline()) { + jit->seen |= SEEN_FUNC; + /* aghi %r1,tail_call_start */ + EMIT4_IMM(0xa70b0000, REG_1, jit->tail_call_start); + /* brcl 0xf,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->r1_thunk_ip); + } else { + /* bc 0xf,tail_call_start(%r1) */ + _EMIT4(0x47f01000 + jit->tail_call_start); + } /* out: */ if (jit->prg_buf) { *(u16 *)(jit->prg_buf + patch_1_clrj + 2) = @@ -1688,7 +1821,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, jit->prg = 0; jit->excnt = 0; - bpf_jit_prologue(jit, stack_depth); + bpf_jit_prologue(jit, fp, stack_depth); if (bpf_set_addr(jit, 0) < 0) return -1; for (i = 0; i < fp->len; i += insn_count) { @@ -1768,6 +1901,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_jit jit; int pass; + if (WARN_ON_ONCE(bpf_plt_end - bpf_plt != BPF_PLT_SIZE)) + return orig_fp; + if (!fp->jit_requested) return orig_fp; @@ -1859,3 +1995,508 @@ out: tmp : orig_fp); return fp; } + +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} + +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + struct { + u16 opc; + s32 disp; + } __packed insn; + char expected_plt[BPF_PLT_SIZE]; + char current_plt[BPF_PLT_SIZE]; + char *plt; + int err; + + /* Verify the branch to be patched. */ + err = copy_from_kernel_nofault(&insn, ip, sizeof(insn)); + if (err < 0) + return err; + if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0))) + return -EINVAL; + + if (t == BPF_MOD_JUMP && + insn.disp == ((char *)new_addr - (char *)ip) >> 1) { + /* + * The branch already points to the destination, + * there is no PLT. + */ + } else { + /* Verify the PLT. */ + plt = (char *)ip + (insn.disp << 1); + err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE); + if (err < 0) + return err; + bpf_jit_plt(expected_plt, (char *)ip + 6, old_addr); + if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE)) + return -EINVAL; + /* Adjust the call address. */ + s390_kernel_write(plt + (bpf_plt_target - bpf_plt), + &new_addr, sizeof(void *)); + } + + /* Adjust the mask of the branch. */ + insn.opc = 0xc004 | (new_addr ? 0xf0 : 0); + s390_kernel_write((char *)ip + 1, (char *)&insn.opc + 1, 1); + + /* Make the new code visible to the other CPUs. */ + text_poke_sync_lock(); + + return 0; +}
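Two encodings in bpf_arch_text_poke() above deserve a gloss. A brcl instruction begins with 0xc0, then a byte combining the condition mask (high nibble; 0 = never taken, i.e. a nop, 0xf = always taken) with 0x4 in the low nibble, followed by a 32-bit displacement counted in halfwords from the instruction address. Sketch helpers (illustrative, not kernel API):

#include <stdint.h>
#include <stdbool.h>

/* First halfword of "brcl MASK,TARGET", as checked and rewritten above. */
static uint16_t brcl_opc(bool enabled)
{
	return 0xc004 | (enabled ? 0xf0 : 0);
}

/* RIL displacement: halfword-scaled, relative to the instruction start. */
static int32_t brcl_disp(const void *ip, const void *target)
{
	return (int32_t)(((const char *)target - (const char *)ip) >> 1);
}

Note how the function patches only one byte (the mask) to enable or disable the branch, and only one data word (the PLT target) to redirect it; neither step rewrites a whole instruction.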
+ +struct bpf_tramp_jit { + struct bpf_jit common; + int orig_stack_args_off;/* Offset of arguments placed on stack by the + * func_addr's original caller + */ + int stack_size; /* Trampoline stack size */ + int stack_args_off; /* Offset of stack arguments for calling + * func_addr, has to be at the top + */ + int reg_args_off; /* Offset of register arguments for calling + * func_addr + */ + int ip_off; /* For bpf_get_func_ip(), has to be at + * (ctx - 16) + */ + int arg_cnt_off; /* For bpf_get_func_arg_cnt(), has to be at + * (ctx - 8) + */ + int bpf_args_off; /* Offset of BPF_PROG context, which consists + * of BPF arguments followed by return value + */ + int retval_off; /* Offset of return value (see above) */ + int r7_r8_off; /* Offset of saved %r7 and %r8, which are used + * for __bpf_prog_enter() return value and + * func_addr respectively + */ + int r14_off; /* Offset of saved %r14 */ + int run_ctx_off; /* Offset of struct bpf_tramp_run_ctx */ + int do_fexit; /* do_fexit: label */ +}; + +static void load_imm64(struct bpf_jit *jit, int dst_reg, u64 val) +{ + /* llihf %dst_reg,val_hi */ + EMIT6_IMM(0xc00e0000, dst_reg, (val >> 32)); + /* oilf %dst_reg,val_lo */ + EMIT6_IMM(0xc00d0000, dst_reg, val); +} + +static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + struct bpf_tramp_link *tlink, bool save_ret) +{ + struct bpf_jit *jit = &tjit->common; + int cookie_off = tjit->run_ctx_off + + offsetof(struct bpf_tramp_run_ctx, bpf_cookie); + struct bpf_prog *p = tlink->link.prog; + int patch; + + /* + * run_ctx.cookie = tlink->cookie; + */ + + /* %r0 = tlink->cookie */ + load_imm64(jit, REG_W0, tlink->cookie); + /* stg %r0,cookie_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W0, REG_0, REG_15, cookie_off); + + /* + * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0) + * goto skip; + */ + + /* %r1 = __bpf_prog_enter */ + load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p)); + /* %r2 = p */ + load_imm64(jit, REG_2, (u64)p); + /* la %r3,run_ctx_off(%r15) */ + EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off); + /* %r1() */ + call_r1(jit); + /* ltgr %r7,%r2 */ + EMIT4(0xb9020000, REG_7, REG_2); + /* brcl 8,skip */ + patch = jit->prg; + EMIT6_PCREL_RILC(0xc0040000, 8, 0); + + /* + * retval = bpf_func(args, p->insnsi); + */ + + /* %r1 = p->bpf_func */ + load_imm64(jit, REG_1, (u64)p->bpf_func); + /* la %r2,bpf_args_off(%r15) */ + EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off); + /* %r3 = p->insnsi */ + if (!p->jited) + load_imm64(jit, REG_3, (u64)p->insnsi); + /* %r1() */ + call_r1(jit); + /* stg %r2,retval_off(%r15) */ + if (save_ret) { + if (sign_extend(jit, REG_2, m->ret_size, m->ret_flags)) + return -1; + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, + tjit->retval_off); + } + + /* skip: */ + if (jit->prg_buf) + *(u32 *)&jit->prg_buf[patch + 2] = (jit->prg - patch) >> 1; + + /* + * __bpf_prog_exit(p, start, &run_ctx); + */ + + /* %r1 = __bpf_prog_exit */ + load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p)); + /* %r2 = p */ + load_imm64(jit, REG_2, (u64)p); + /* lgr %r3,%r7 */ + EMIT4(0xb9040000, REG_3, REG_7); + /* la %r4,run_ctx_off(%r15) */ + EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off); + /* %r1() */ + call_r1(jit); + + return 0; +} + +static int alloc_stack(struct bpf_tramp_jit *tjit, size_t size) +{ + int stack_offset = tjit->stack_size; + + tjit->stack_size += size; + return stack_offset; +}
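alloc_stack() is a bump allocator: each call hands out the current frame size as an offset and grows the frame. A toy model of how the layout offsets accumulate (sizes invented; the 160-byte starting value stands in for the fixed s390x register save area):

#include <stdio.h>
#include <stddef.h>

struct layout { size_t stack_size; };

static size_t alloc_slot(struct layout *l, size_t size)
{
	size_t off = l->stack_size;	/* current top becomes the offset */

	l->stack_size += size;		/* grow the frame */
	return off;
}

int main(void)
{
	struct layout l = { .stack_size = 160 };	/* stand-in overhead */
	size_t retval_off = alloc_slot(&l, 8);
	size_t r7_r8_off = alloc_slot(&l, 16);

	printf("retval@%zu r7_r8@%zu frame=%zu\n",
	       retval_off, r7_r8_off, l.stack_size);
	return 0;
}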
+ +/* ABI uses %r2 - %r6 for parameter passing. */ +#define MAX_NR_REG_ARGS 5 + +/* The "L" field of the "mvc" instruction is 8 bits. */ +#define MAX_MVC_SIZE 256 +#define MAX_NR_STACK_ARGS (MAX_MVC_SIZE / sizeof(u64)) + +/* -mfentry generates a 6-byte nop on s390x. */ +#define S390X_PATCH_SIZE 6 + +static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, + struct bpf_tramp_jit *tjit, + const struct btf_func_model *m, + u32 flags, + struct bpf_tramp_links *tlinks, + void *func_addr) +{ + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + int nr_bpf_args, nr_reg_args, nr_stack_args; + struct bpf_jit *jit = &tjit->common; + int arg, bpf_arg_off; + int i, j; + + /* Support as many stack arguments as the "mvc" instruction can handle. */ + nr_reg_args = min_t(int, m->nr_args, MAX_NR_REG_ARGS); + nr_stack_args = m->nr_args - nr_reg_args; + if (nr_stack_args > MAX_NR_STACK_ARGS) + return -ENOTSUPP; + + /* Return to %r14, since func_addr and %r0 are not available. */ + if (!func_addr && !(flags & BPF_TRAMP_F_ORIG_STACK)) + flags |= BPF_TRAMP_F_SKIP_FRAME; + + /* + * Compute how many arguments we need to pass to BPF programs. + * BPF ABI mirrors that of x86_64: arguments that are 16 bytes or + * smaller are packed into 1 or 2 registers; larger arguments are + * passed via pointers. + * In s390x ABI, arguments that are 8 bytes or smaller are packed into + * a register; larger arguments are passed via pointers. + * We need to deal with this difference. + */ + nr_bpf_args = 0; + for (i = 0; i < m->nr_args; i++) { + if (m->arg_size[i] <= 8) + nr_bpf_args += 1; + else if (m->arg_size[i] <= 16) + nr_bpf_args += 2; + else + return -ENOTSUPP; + } + + /* + * Calculate the stack layout. + */ + + /* Reserve STACK_FRAME_OVERHEAD bytes for the callees. */ + tjit->stack_size = STACK_FRAME_OVERHEAD; + tjit->stack_args_off = alloc_stack(tjit, nr_stack_args * sizeof(u64)); + tjit->reg_args_off = alloc_stack(tjit, nr_reg_args * sizeof(u64)); + tjit->ip_off = alloc_stack(tjit, sizeof(u64)); + tjit->arg_cnt_off = alloc_stack(tjit, sizeof(u64)); + tjit->bpf_args_off = alloc_stack(tjit, nr_bpf_args * sizeof(u64)); + tjit->retval_off = alloc_stack(tjit, sizeof(u64)); + tjit->r7_r8_off = alloc_stack(tjit, 2 * sizeof(u64)); + tjit->r14_off = alloc_stack(tjit, sizeof(u64)); + tjit->run_ctx_off = alloc_stack(tjit, + sizeof(struct bpf_tramp_run_ctx));
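The nr_bpf_args loop above implements the ABI bridging described in its comment: a native argument of up to 8 bytes occupies one 8-byte BPF slot, up to 16 bytes occupies two, and anything larger is rejected. As a standalone sketch:

/* Slots one native s390x argument occupies in the BPF context array. */
static int bpf_arg_slots(unsigned int arg_size)
{
	if (arg_size <= 8)
		return 1;	/* one register / one slot  */
	if (arg_size <= 16)
		return 2;	/* spilled across two slots */
	return -1;		/* mirrors -ENOTSUPP above  */
}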
+ /* The caller has already reserved STACK_FRAME_OVERHEAD bytes. */ + tjit->stack_size -= STACK_FRAME_OVERHEAD; + tjit->orig_stack_args_off = tjit->stack_size + STACK_FRAME_OVERHEAD; + + /* aghi %r15,-stack_size */ + EMIT4_IMM(0xa70b0000, REG_15, -tjit->stack_size); + /* stmg %r2,%rN,reg_args_off(%r15) */ + if (nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + for (i = 0, j = 0; i < m->nr_args; i++) { + if (i < MAX_NR_REG_ARGS) + arg = REG_2 + i; + else + arg = tjit->orig_stack_args_off + + (i - MAX_NR_REG_ARGS) * sizeof(u64); + bpf_arg_off = tjit->bpf_args_off + j * sizeof(u64); + if (m->arg_size[i] <= 8) { + if (i < MAX_NR_REG_ARGS) + /* stg %arg,bpf_arg_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, arg, + REG_0, REG_15, bpf_arg_off); + else + /* mvc bpf_arg_off(8,%r15),arg(%r15) */ + _EMIT6(0xd207f000 | bpf_arg_off, + 0xf000 | arg); + j += 1; + } else { + if (i < MAX_NR_REG_ARGS) { + /* mvc bpf_arg_off(16,%r15),0(%arg) */ + _EMIT6(0xd20ff000 | bpf_arg_off, + reg2hex[arg] << 12); + } else { + /* lg %r1,arg(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_0, + REG_15, arg); + /* mvc bpf_arg_off(16,%r15),0(%r1) */ + _EMIT6(0xd20ff000 | bpf_arg_off, 0x1000); + } + j += 2; + } + } + /* stmg %r7,%r8,r7_r8_off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x0024, REG_7, REG_8, REG_15, + tjit->r7_r8_off); + /* stg %r14,r14_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_14, REG_0, REG_15, tjit->r14_off); + + if (flags & BPF_TRAMP_F_ORIG_STACK) { + /* + * The ftrace trampoline puts the return address (which is the + * address of the original function + S390X_PATCH_SIZE) into + * %r0; see ftrace_shared_hotpatch_trampoline_br and + * ftrace_init_nop() for details. + */ + + /* lgr %r8,%r0 */ + EMIT4(0xb9040000, REG_8, REG_0); + } else { + /* %r8 = func_addr + S390X_PATCH_SIZE */ + load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE); + } + + /* + * ip = func_addr; + * arg_cnt = nr_bpf_args; + */ + + if (flags & BPF_TRAMP_F_IP_ARG) { + /* %r0 = func_addr */ + load_imm64(jit, REG_0, (u64)func_addr); + /* stg %r0,ip_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, + tjit->ip_off); + } + /* lghi %r0,nr_bpf_args */ + EMIT4_IMM(0xa7090000, REG_0, nr_bpf_args); + /* stg %r0,arg_cnt_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_0, REG_0, REG_15, + tjit->arg_cnt_off); + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * __bpf_tramp_enter(im); + */ + + /* %r1 = __bpf_tramp_enter */ + load_imm64(jit, REG_1, (u64)__bpf_tramp_enter); + /* %r2 = im */ + load_imm64(jit, REG_2, (u64)im); + /* %r1() */ + call_r1(jit); + } + + for (i = 0; i < fentry->nr_links; i++) + if (invoke_bpf_prog(tjit, m, fentry->links[i], + flags & BPF_TRAMP_F_RET_FENTRY_RET)) + return -EINVAL; + + if (fmod_ret->nr_links) { + /* + * retval = 0; + */ + + /* xc retval_off(8,%r15),retval_off(%r15) */ + _EMIT6(0xd707f000 | tjit->retval_off, + 0xf000 | tjit->retval_off); + + for (i = 0; i < fmod_ret->nr_links; i++) { + if (invoke_bpf_prog(tjit, m, fmod_ret->links[i], true)) + return -EINVAL; + + /* + * if (retval) + * goto do_fexit; + */ + + /* ltg %r0,retval_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0002, REG_0, REG_0, REG_15, + tjit->retval_off); + /* brcl 7,do_fexit */ + EMIT6_PCREL_RILC(0xc0040000, 7, tjit->do_fexit); + } + } + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * retval = func_addr(args); + */ + + /* lmg %r2,%rN,reg_args_off(%r15) */ + if (nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off);
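Ahead of the mvc emit that follows: mvc stores the byte count minus one in its 8-bit L field (hence MAX_MVC_SIZE == 256), so the emitted word contains nr_stack_args * sizeof(u64) - 1. A sketch of the first 32 bits, assuming the %r15-based operands used here:

#include <stdint.h>

/* 0xd2 = mvc opcode; L = length - 1; base %r15 (0xf) plus displacement. */
static uint32_t mvc_word0(unsigned int len_bytes, uint16_t disp1)
{
	return 0xd2000000 | ((len_bytes - 1) << 16) | 0xf000 | (disp1 & 0xfff);
}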
+ /* mvc stack_args_off(N,%r15),orig_stack_args_off(%r15) */ + if (nr_stack_args) + _EMIT6(0xd200f000 | + (nr_stack_args * sizeof(u64) - 1) << 16 | + tjit->stack_args_off, + 0xf000 | tjit->orig_stack_args_off); + /* lgr %r1,%r8 */ + EMIT4(0xb9040000, REG_1, REG_8); + /* %r1() */ + call_r1(jit); + /* stg %r2,retval_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, + tjit->retval_off); + + im->ip_after_call = jit->prg_buf + jit->prg; + + /* + * The following nop will be patched by bpf_tramp_image_put(). + */ + + /* brcl 0,im->ip_epilogue */ + EMIT6_PCREL_RILC(0xc0040000, 0, (u64)im->ip_epilogue); + } + + /* do_fexit: */ + tjit->do_fexit = jit->prg; + for (i = 0; i < fexit->nr_links; i++) + if (invoke_bpf_prog(tjit, m, fexit->links[i], false)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = jit->prg_buf + jit->prg; + + /* + * __bpf_tramp_exit(im); + */ + + /* %r1 = __bpf_tramp_exit */ + load_imm64(jit, REG_1, (u64)__bpf_tramp_exit); + /* %r2 = im */ + load_imm64(jit, REG_2, (u64)im); + /* %r1() */ + call_r1(jit); + } + + /* lmg %r2,%rN,reg_args_off(%r15) */ + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && nr_reg_args) + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_2, + REG_2 + (nr_reg_args - 1), REG_15, + tjit->reg_args_off); + /* lgr %r1,%r8 */ + if (!(flags & BPF_TRAMP_F_SKIP_FRAME)) + EMIT4(0xb9040000, REG_1, REG_8); + /* lmg %r7,%r8,r7_r8_off(%r15) */ + EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15, + tjit->r7_r8_off); + /* lg %r14,r14_off(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_14, REG_0, REG_15, tjit->r14_off); + /* lg %r2,retval_off(%r15) */ + if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET)) + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15, + tjit->retval_off); + /* aghi %r15,stack_size */ + EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); + /* Emit an expoline for the following indirect jump. */ + if (nospec_uses_trampoline()) + emit_expoline(jit); + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* br %r14 */ + _EMIT2(0x07fe); + else + /* br %r1 */ + _EMIT2(0x07f1); + + emit_r1_thunk(jit); + + return 0; +} + +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, + void *image_end, const struct btf_func_model *m, + u32 flags, struct bpf_tramp_links *tlinks, + void *func_addr) +{ + struct bpf_tramp_jit tjit; + int ret; + int i; + + for (i = 0; i < 2; i++) { + if (i == 0) { + /* Compute offsets, check whether the code fits. */ + memset(&tjit, 0, sizeof(tjit)); + } else { + /* Generate the code. */ + tjit.common.prg = 0; + tjit.common.prg_buf = image; + } + ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, + tlinks, func_addr); + if (ret < 0) + return ret; + if (tjit.common.prg > (char *)image_end - (char *)image) + /* + * Use the same error code as for exceeding + * BPF_MAX_TRAMP_LINKS. + */ + return -E2BIG; + } + + return ret; +} + +bool bpf_jit_supports_subprog_tailcalls(void) +{ + return true; +}
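The two-pass structure of arch_prepare_bpf_trampoline() above generalizes to any emitter: run the generator once without a buffer purely to measure the code, then, if it fits, run it again to actually write. A miniature of the pattern (a sketch, not kernel code):

#include <stddef.h>

struct emitter {
	unsigned char *buf;	/* NULL during the sizing pass */
	size_t len;
};

static void emit_byte(struct emitter *e, unsigned char b)
{
	if (e->buf)		/* write only when a buffer exists */
		e->buf[e->len] = b;
	e->len++;		/* always account for the size */
}

static int generate(struct emitter *e)
{
	emit_byte(e, 0x07);	/* stand-in for real code generation */
	emit_byte(e, 0xfe);
	return 0;
}

static int prepare(unsigned char *image, size_t image_size)
{
	struct emitter e = { .buf = NULL, .len = 0 };

	generate(&e);			/* pass 1: compute the size */
	if (e.len > image_size)
		return -1;		/* analogous to -E2BIG above */
	e.buf = image;
	e.len = 0;
	return generate(&e);		/* pass 2: emit for real */
}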