diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-03 13:58:10 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-03 13:58:10 -0700 |
commit | 45365a06aa305c9eca1cbf48aef48a7a0cea3b4e (patch) | |
tree | aa3e1cdbe77aab1ac5f16cbaaa8ed9147df82efe /arch | |
parent | cdc8fcb49905c0b67e355e027cb462ee168ffaa3 (diff) | |
parent | 9a996c67a65d937b23408e56935ef23404c9418e (diff) |
Merge tag 's390-5.9-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
Pull s390 updates from Heiko Carstens:
- Add support for function error injection.
- Add support for custom exception handlers, as required by
BPF_PROBE_MEM.
- Add support for BPF_PROBE_MEM.
- Add trace events for idle enter / exit for the s390 specific idle
implementation.
- Remove unused zcore memmmap device.
- Remove unused "raw view" from s390 debug feature.
- AP bus + zcrypt device driver code refactoring.
- Provide cex4 cca sysfs attributes for cex3 for zcrypt device driver.
- Expose only minimal interface to walk physmem for mm/memblock. This
is a common code change and it has been agreed on with Mike Rapoport
and Andrew Morton that this can go upstream via the s390 tree.
- Rework of the s390 vmem/vmmemap code to allow for future memory hot
remove.
- Get rid of FORCE_MAX_ZONEORDER to finally allow for order-10
allocations again, instead of only order-8 allocations.
- Various small improvements and fixes.
* tag 's390-5.9-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (48 commits)
s390/vmemmap: coding style updates
s390/vmemmap: avoid memset(PAGE_UNUSED) when adding consecutive sections
s390/vmemmap: remember unused sub-pmd ranges
s390/vmemmap: fallback to PTEs if mapping large PMD fails
s390/vmem: cleanup empty page tables
s390/vmemmap: take the vmem_mutex when populating/freeing
s390/vmemmap: cleanup when vmemmap_populate() fails
s390/vmemmap: extend modify_pagetable() to handle vmemmap
s390/vmem: consolidate vmem_add_range() and vmem_remove_range()
s390/vmem: rename vmem_add_mem() to vmem_add_range()
s390: enable HAVE_FUNCTION_ERROR_INJECTION
s390/pci: clarify comment in s390_mmio_read/write
s390/time: improve comparison for tod steering
s390/time: select CLOCKSOURCE_VALIDATE_LAST_CYCLE
s390/time: use CLOCKSOURCE_MASK
s390/bpf: implement BPF_PROBE_MEM
s390/kernel: expand exception table logic to allow new handling options
s390/kernel: unify EX_TABLE* implementations
s390/mm: allow order 10 allocations
s390/mm: avoid trimming to MAX_ORDER
...
Diffstat (limited to 'arch')
33 files changed, 812 insertions, 583 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c7d7ede6300c..9cfd8de907cb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -102,7 +102,6 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE - select ARCH_KEEP_MEMBLOCK select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING @@ -126,6 +125,7 @@ config S390 select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_VMALLOC + select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY @@ -145,6 +145,7 @@ config S390 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FENTRY select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if FUTEX @@ -626,10 +627,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y -config FORCE_MAX_ZONEORDER - int - default "9" - config MAX_PHYSMEM_BITS int "Maximum size of supported physical memory in bits (42-53)" range 42 53 diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 5503217366ec..a363d30ce739 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -129,8 +129,7 @@ static void appldata_get_os_data(void *data) os_data->nr_cpus = j; - new_size = sizeof(struct appldata_os_data) + - (os_data->nr_cpus * sizeof(struct appldata_os_per_cpu)); + new_size = struct_size(os_data, os_cpu, os_data->nr_cpus); if (ops.size != new_size) { if (ops.active) { rc = appldata_diag(APPLDATA_RECORD_OS_ID, @@ -165,8 +164,7 @@ static int __init appldata_os_init(void) { int rc, max_size; - max_size = sizeof(struct appldata_os_data) + - (num_possible_cpus() * sizeof(struct appldata_os_per_cpu)); + max_size = struct_size(appldata_os_data, os_cpu, num_possible_cpus()); if (max_size > APPLDATA_MAX_REC_SIZE) { pr_err("Maximum OS record size %i exceeds the maximum " "record size %i\n", max_size, APPLDATA_MAX_REC_SIZE); diff --git a/arch/s390/include/asm/asm-const.h b/arch/s390/include/asm/asm-const.h new file mode 100644 index 000000000000..11f615eb0066 --- /dev/null +++ b/arch/s390/include/asm/asm-const.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ASM_CONST_H +#define _ASM_S390_ASM_CONST_H + +#ifdef __ASSEMBLY__ +# define stringify_in_c(...) __VA_ARGS__ +#else +/* This version of stringify will deal with commas... */ +# define __stringify_in_c(...) #__VA_ARGS__ +# define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " " +#endif +#endif /* _ASM_S390_ASM_CONST_H */ diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 310134015541..17a26261f288 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -12,7 +12,7 @@ #include <linux/kernel.h> #include <linux/time.h> #include <linux/refcount.h> -#include <uapi/asm/debug.h> +#include <linux/fs.h> #define DEBUG_MAX_LEVEL 6 /* debug levels range from 0 to 6 */ #define DEBUG_OFF_LEVEL -1 /* level where debug is switched off */ @@ -26,6 +26,21 @@ #define DEBUG_DATA(entry) (char *)(entry + 1) /* data is stored behind */ /* the entry information */ +#define __DEBUG_FEATURE_VERSION 2 /* version of debug feature */ + +struct __debug_entry { + union { + struct { + unsigned long clock : 52; + unsigned long exception : 1; + unsigned long level : 3; + unsigned long cpuid : 8; + } fields; + unsigned long stck; + } id; + void *caller; +} __packed; + typedef struct __debug_entry debug_entry_t; struct debug_view; @@ -82,7 +97,6 @@ struct debug_view { }; extern struct debug_view debug_hex_ascii_view; -extern struct debug_view debug_raw_view; extern struct debug_view debug_sprintf_view; /* do NOT use the _common functions */ diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h index ae27f756b409..3beb294fd553 100644 --- a/arch/s390/include/asm/extable.h +++ b/arch/s390/include/asm/extable.h @@ -1,12 +1,20 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __S390_EXTABLE_H #define __S390_EXTABLE_H + +#include <asm/ptrace.h> +#include <linux/compiler.h> + /* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. + * The exception table consists of three addresses: + * + * - Address of an instruction that is allowed to fault. + * - Address at which the program should continue. + * - Optional address of handler that takes pt_regs * argument and runs in + * interrupt context. + * + * No registers are modified, so it is entirely up to the continuation code + * to figure out what to do. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -17,6 +25,7 @@ struct exception_table_entry { int insn, fixup; + long handler; }; extern struct exception_table_entry *__start_dma_ex_table; @@ -29,6 +38,39 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x) return (unsigned long)&x->fixup + x->fixup; } +typedef bool (*ex_handler_t)(const struct exception_table_entry *, + struct pt_regs *); + +static inline ex_handler_t +ex_fixup_handler(const struct exception_table_entry *x) +{ + if (likely(!x->handler)) + return NULL; + return (ex_handler_t)((unsigned long)&x->handler + x->handler); +} + +static inline bool ex_handle(const struct exception_table_entry *x, + struct pt_regs *regs) +{ + ex_handler_t handler = ex_fixup_handler(x); + + if (unlikely(handler)) + return handler(x, regs); + regs->psw.addr = extable_fixup(x); + return true; +} + #define ARCH_HAS_RELATIVE_EXTABLE +static inline void swap_ex_entry_fixup(struct exception_table_entry *a, + struct exception_table_entry *b, + struct exception_table_entry tmp, + int delta) +{ + a->fixup = b->fixup + delta; + b->fixup = tmp.fixup - delta; + a->handler = b->handler + delta; + b->handler = tmp.handler - delta; +} + #endif diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index 7f22262b0e46..a0a7a2c72bd4 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -2,38 +2,27 @@ #ifndef __ASM_LINKAGE_H #define __ASM_LINKAGE_H +#include <asm/asm-const.h> #include <linux/stringify.h> #define __ALIGN .align 4, 0x07 #define __ALIGN_STR __stringify(__ALIGN) -#ifndef __ASSEMBLY__ - /* * Helper macro for exception table entries */ -#define EX_TABLE(_fault, _target) \ - ".section __ex_table,\"a\"\n" \ - ".align 4\n" \ - ".long (" #_fault ") - .\n" \ - ".long (" #_target ") - .\n" \ - ".previous\n" - -#else /* __ASSEMBLY__ */ -#define EX_TABLE(_fault, _target) \ - .section __ex_table,"a" ; \ - .align 4 ; \ - .long (_fault) - . ; \ - .long (_target) - . ; \ - .previous +#define __EX_TABLE(_section, _fault, _target) \ + stringify_in_c(.section _section,"a";) \ + stringify_in_c(.align 8;) \ + stringify_in_c(.long (_fault) - .;) \ + stringify_in_c(.long (_target) - .;) \ + stringify_in_c(.quad 0;) \ + stringify_in_c(.previous) -#define EX_TABLE_DMA(_fault, _target) \ - .section .dma.ex_table, "a" ; \ - .align 4 ; \ - .long (_fault) - . ; \ - .long (_target) - . ; \ - .previous +#define EX_TABLE(_fault, _target) \ + __EX_TABLE(__ex_table, _fault, _target) +#define EX_TABLE_DMA(_fault, _target) \ + __EX_TABLE(.dma.ex_table, _fault, _target) -#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h index 419fac7a62c0..f62cd3ed2d44 100644 --- a/arch/s390/include/asm/pci_dma.h +++ b/arch/s390/include/asm/pci_dma.h @@ -131,12 +131,6 @@ static inline void validate_st_entry(unsigned long *entry) *entry |= ZPCI_TABLE_VALID; } -static inline void invalidate_table_entry(unsigned long *entry) -{ - *entry &= ~ZPCI_TABLE_VALID_MASK; - *entry |= ZPCI_TABLE_INVALID; -} - static inline void invalidate_pt_entry(unsigned long *entry) { WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID); @@ -173,11 +167,6 @@ static inline int pt_entry_isvalid(unsigned long entry) return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID; } -static inline int entry_isprotected(unsigned long entry) -{ - return (entry & ZPCI_TABLE_PROT_MASK) == ZPCI_TABLE_PROTECTED; -} - static inline unsigned long *get_rt_sto(unsigned long entry) { return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 19d603bd1f36..7eb01a5459cd 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1669,7 +1669,7 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define kern_addr_valid(addr) (1) extern int vmem_add_mapping(unsigned long start, unsigned long size); -extern int vmem_remove_mapping(unsigned long start, unsigned long size); +extern void vmem_remove_mapping(unsigned long start, unsigned long size); extern int s390_enable_sie(void); extern int s390_enable_skey(void); extern void s390_reset_cmma(struct mm_struct *mm); diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index f009a13afe71..16b3e4396312 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -184,5 +184,10 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) return regs->gprs[15]; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->gprs[2] = rc; +} + #endif /* __ASSEMBLY__ */ #endif /* _S390_PTRACE_H */ diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 7326f110d48c..20b37b059e2b 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -54,6 +54,10 @@ static inline int smp_get_base_cpu(int cpu) return cpu - (cpu % (smp_cpu_mtid + 1)); } +static inline void smp_cpus_done(unsigned int max_cpus) +{ +} + extern int smp_rescan_cpus(void); extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 3c3d6fe8e2f0..1320f4213d80 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -30,7 +30,7 @@ }) #define __S390_SYS_STUBx(x, name, ...) \ - asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ + asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));\ ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ { \ @@ -46,7 +46,7 @@ #define COMPAT_SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ asmlinkage long __s390_compat_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__s390_compat__sys_##sname, ERRNO); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO); \ asmlinkage long __s390_compat_sys_##sname(void) #define SYSCALL_DEFINE0(sname) \ @@ -72,7 +72,7 @@ asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_compat_sys##name)))); \ - ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 6bf3a45ccfec..289aaff4d365 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -49,11 +49,6 @@ static inline void set_clock_comparator(__u64 time) asm volatile("sckc %0" : : "Q" (time)); } -static inline void store_clock_comparator(__u64 *time) -{ - asm volatile("stckc %0" : "=Q" (*time)); -} - void clock_comparator_work(void); void __init time_early_init(void); diff --git a/arch/s390/include/uapi/asm/debug.h b/arch/s390/include/uapi/asm/debug.h deleted file mode 100644 index c7c564d9aea4..000000000000 --- a/arch/s390/include/uapi/asm/debug.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S/390 debug facility - * - * Copyright IBM Corp. 1999, 2000 - */ - -#ifndef _UAPIDEBUG_H -#define _UAPIDEBUG_H - -#include <linux/fs.h> - -/* Note: - * struct __debug_entry must be defined outside of #ifdef __KERNEL__ - * in order to allow a user program to analyze the 'raw'-view. - */ - -struct __debug_entry{ - union { - struct { - unsigned long long clock:52; - unsigned long long exception:1; - unsigned long long level:3; - unsigned long long cpuid:8; - } fields; - - unsigned long long stck; - } id; - void* caller; -} __attribute__((packed)); - - -#define __DEBUG_FEATURE_VERSION 2 /* version of debug feature */ - -#endif /* _UAPIDEBUG_H */ diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index 5a2177e96e88..22fd202856bc 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -36,12 +36,12 @@ * - length(n_modulus) = inputdatalength */ struct ica_rsa_modexpo { - char __user *inputdata; - unsigned int inputdatalength; - char __user *outputdata; - unsigned int outputdatalength; - char __user *b_key; - char __user *n_modulus; + __u8 __user *inputdata; + __u32 inputdatalength; + __u8 __user *outputdata; + __u32 outputdatalength; + __u8 __user *b_key; + __u8 __user *n_modulus; }; /** @@ -59,15 +59,15 @@ struct ica_rsa_modexpo { * - length(u_mult_inv) = inputdatalength/2 + 8 */ struct ica_rsa_modexpo_crt { - char __user *inputdata; - unsigned int inputdatalength; - char __user *outputdata; - unsigned int outputdatalength; - char __user *bp_key; - char __user *bq_key; - char __user *np_prime; - char __user *nq_prime; - char __user *u_mult_inv; + __u8 __user *inputdata; + __u32 inputdatalength; + __u8 __user *outputdata; + __u32 outputdatalength; + __u8 __user *bp_key; + __u8 __user *bq_key; + __u8 __user *np_prime; + __u8 __user *nq_prime; + __u8 __user *u_mult_inv; }; /** @@ -83,67 +83,67 @@ struct ica_rsa_modexpo_crt { * key block */ struct CPRBX { - unsigned short cprb_len; /* CPRB length 220 */ - unsigned char cprb_ver_id; /* CPRB version id. 0x02 */ - unsigned char pad_000[3]; /* Alignment pad bytes */ - unsigned char func_id[2]; /* function id 0x5432 */ - unsigned char cprb_flags[4]; /* Flags */ - unsigned int req_parml; /* request parameter buffer len */ - unsigned int req_datal; /* request data buffer */ - unsigned int rpl_msgbl; /* reply message block length */ - unsigned int rpld_parml; /* replied parameter block len */ - unsigned int rpl_datal; /* reply data block len */ - unsigned int rpld_datal; /* replied data block len */ - unsigned int req_extbl; /* request extension block len */ - unsigned char pad_001[4]; /* reserved */ - unsigned int rpld_extbl; /* replied extension block len */ - unsigned char padx000[16 - sizeof(char *)]; - unsigned char *req_parmb; /* request parm block 'address' */ - unsigned char padx001[16 - sizeof(char *)]; - unsigned char *req_datab; /* request data block 'address' */ - unsigned char padx002[16 - sizeof(char *)]; - unsigned char *rpl_parmb; /* reply parm block 'address' */ - unsigned char padx003[16 - sizeof(char *)]; - unsigned char *rpl_datab; /* reply data block 'address' */ - unsigned char padx004[16 - sizeof(char *)]; - unsigned char *req_extb; /* request extension block 'addr'*/ - unsigned char padx005[16 - sizeof(char *)]; - unsigned char *rpl_extb; /* reply extension block 'address'*/ - unsigned short ccp_rtcode; /* server return code */ - unsigned short ccp_rscode; /* server reason code */ - unsigned int mac_data_len; /* Mac Data Length */ - unsigned char logon_id[8]; /* Logon Identifier */ - unsigned char mac_value[8]; /* Mac Value */ - unsigned char mac_content_flgs;/* Mac content flag byte */ - unsigned char pad_002; /* Alignment */ - unsigned short domain; /* Domain */ - unsigned char usage_domain[4];/* Usage domain */ - unsigned char cntrl_domain[4];/* Control domain */ - unsigned char S390enf_mask[4];/* S/390 enforcement mask */ - unsigned char pad_004[36]; /* reserved */ + __u16 cprb_len; /* CPRB length 220 */ + __u8 cprb_ver_id; /* CPRB version id. 0x02 */ + __u8 pad_000[3]; /* Alignment pad bytes */ + __u8 func_id[2]; /* function id 0x5432 */ + __u8 cprb_flags[4]; /* Flags */ + __u32 req_parml; /* request parameter buffer len */ + __u32 req_datal; /* request data buffer */ + __u32 rpl_msgbl; /* reply message block length */ + __u32 rpld_parml; /* replied parameter block len */ + __u32 rpl_datal; /* reply data block len */ + __u32 rpld_datal; /* replied data block len */ + __u32 req_extbl; /* request extension block len */ + __u8 pad_001[4]; /* reserved */ + __u32 rpld_extbl; /* replied extension block len */ + __u8 padx000[16 - sizeof(__u8 *)]; + __u8 __user *req_parmb; /* request parm block 'address' */ + __u8 padx001[16 - sizeof(__u8 *)]; + __u8 __user *req_datab; /* request data block 'address' */ + __u8 padx002[16 - sizeof(__u8 *)]; + __u8 __user *rpl_parmb; /* reply parm block 'address' */ + __u8 padx003[16 - sizeof(__u8 *)]; + __u8 __user *rpl_datab; /* reply data block 'address' */ + __u8 padx004[16 - sizeof(__u8 *)]; + __u8 __user *req_extb; /* request extension block 'addr'*/ + __u8 padx005[16 - sizeof(__u8 *)]; + __u8 __user *rpl_extb; /* reply extension block 'address'*/ + __u16 ccp_rtcode; /* server return code */ + __u16 ccp_rscode; /* server reason code */ + __u32 mac_data_len; /* Mac Data Length */ + __u8 logon_id[8]; /* Logon Identifier */ + __u8 mac_value[8]; /* Mac Value */ + __u8 mac_content_flgs; /* Mac content flag byte */ + __u8 pad_002; /* Alignment */ + __u16 domain; /* Domain */ + __u8 usage_domain[4]; /* Usage domain */ + __u8 cntrl_domain[4]; /* Control domain */ + __u8 S390enf_mask[4]; /* S/390 enforcement mask */ + __u8 pad_004[36]; /* reserved */ } __attribute__((packed)); /** * xcRB */ struct ica_xcRB { - unsigned short agent_ID; - unsigned int user_defined; - unsigned short request_ID; - unsigned int request_control_blk_length; - unsigned char padding1[16 - sizeof(char *)]; - char __user *request_control_blk_addr; - unsigned int request_data_length; - char padding2[16 - sizeof(char *)]; - char __user *request_data_address; - unsigned int reply_control_blk_length; - char padding3[16 - sizeof(char *)]; - char __user *reply_control_blk_addr; - unsigned int reply_data_length; - char padding4[16 - sizeof(char *)]; - char __user *reply_data_addr; - unsigned short priority_window; - unsigned int status; + __u16 agent_ID; + __u32 user_defined; + __u16 request_ID; + __u32 request_control_blk_length; + __u8 _padding1[16 - sizeof(__u8 *)]; + __u8 __user *request_control_blk_addr; + __u32 request_data_length; + __u8 _padding2[16 - sizeof(__u8 *)]; + __u8 __user *request_data_address; + __u32 reply_control_blk_length; + __u8 _padding3[16 - sizeof(__u8 *)]; + __u8 __user *reply_control_blk_addr; + __u32 reply_data_length; + __u8 __padding4[16 - sizeof(__u8 *)]; + __u8 __user *reply_data_addr; + __u16 priority_window; + __u32 status; } __attribute__((packed)); /** diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index f96a5857bbfd..c42ce348103c 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -549,8 +549,7 @@ static int get_mem_chunk_cnt(void) int cnt = 0; u64 idx; - for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE, - MEMBLOCK_NONE, NULL, NULL, NULL) + for_each_physmem_range(idx, &oldmem_type, NULL, NULL) cnt++; return cnt; } @@ -563,8 +562,7 @@ static void loads_init(Elf64_Phdr *phdr, u64 loads_offset) phys_addr_t start, end; u64 idx; - for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) { + for_each_physmem_range(idx, &oldmem_type, &start, &end) { phdr->p_filesz = end - start; phdr->p_type = PT_LOAD; phdr->p_offset = start; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 263075a1af36..beb4b44a11d1 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -90,27 +90,11 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, size_t user_buf_size, loff_t *offset); static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, char *out_buf, const char *in_buf); -static int debug_raw_format_fn(debug_info_t *id, - struct debug_view *view, char *out_buf, - const char *in_buf); -static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf); - static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, char *out_buf, debug_sprintf_entry_t *curr_event); /* globals */ -struct debug_view debug_raw_view = { - "raw", - NULL, - &debug_raw_header_fn, - &debug_raw_format_fn, - NULL, - NULL -}; -EXPORT_SYMBOL(debug_raw_view); - struct debug_view debug_hex_ascii_view = { "hex_ascii", NULL, @@ -1386,32 +1370,6 @@ out: } /* - * prints debug header in raw format - */ -static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf) -{ - int rc; - - rc = sizeof(debug_entry_t); - memcpy(out_buf, entry, sizeof(debug_entry_t)); - return rc; -} - -/* - * prints debug data in raw format - */ -static int debug_raw_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf) -{ - int rc; - - rc = id->buf_size; - memcpy(out_buf, in_buf, id->buf_size); - return rc; -} - -/* * prints debug data in hex/ascii format */ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 969b35b177dd..23edf196d3dc 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -370,7 +370,7 @@ EXPORT_SYMBOL(sie_exit) /* * SVC interrupt handler routine. System calls are synchronous events and - * are executed with interrupts enabled. + * are entered with interrupts disabled. */ ENTRY(system_call) diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 0d7fbdfe995a..88bb42ca5008 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/sched/cputime.h> +#include <trace/events/power.h> #include <asm/nmi.h> #include <asm/smp.h> #include "entry.h" @@ -32,11 +33,12 @@ void enabled_wait(void) PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; clear_cpu_flag(CIF_NOHZ_DELAY); + trace_cpu_idle_rcuidle(1, smp_processor_id()); local_irq_save(flags); /* Call the assembler magic in entry.S */ psw_idle(idle, psw_mask); local_irq_restore(flags); - + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); /* Account time spent with enabled wait psw loaded as idle time. */ write_seqcount_begin(&idle->seqcount); diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 548d0ea9808d..d2a71d872638 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -523,10 +523,8 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) * zero, try to fix up. */ entry = s390_search_extables(regs->psw.addr); - if (entry) { - regs->psw.addr = extable_fixup(entry); + if (entry && ex_handle(entry, regs)) return 1; - } /* * fixup_exception() could not handle it, diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 452502f9a0d9..3b895971c3d0 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -167,7 +167,7 @@ static struct timer_list lgr_timer; */ static void lgr_timer_set(void) { - mod_timer(&lgr_timer, jiffies + LGR_TIMER_INTERVAL_SECS * HZ); + mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC)); } /* diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 07aa15ba43b3..0c4194d407ac 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -1127,14 +1127,6 @@ void __init setup_arch(char **cmdline_p) free_mem_detect_info(); remove_oldmem(); - /* - * Make sure all chunks are MAX_ORDER aligned so we don't need the - * extra checks that HOLES_IN_ZONE would require. - * - * Is this still required? - */ - memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT)); - if (is_prot_virt_host()) setup_uv(); setup_memory_end(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index e6be63ff162a..f685a38f166d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -1012,10 +1012,6 @@ void __init smp_prepare_boot_cpu(void) smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); } -void __init smp_cpus_done(unsigned int max_cpus) -{ -} - void __init smp_setup_processor_id(void) { pcpu_devices[0].address = stap(); @@ -1145,6 +1141,7 @@ static int smp_cpu_online(unsigned int cpu) return sysfs_create_group(&s->kobj, &cpu_online_attr_group); } + static int smp_cpu_pre_down(unsigned int cpu) { struct device *s = &per_cpu(cpu_device, cpu)->dev; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index b1113b519432..513e59d08a55 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -237,7 +237,7 @@ static u64 read_tod_clock(struct clocksource *cs) preempt_disable(); /* protect from changes to steering parameters */ now = get_tod_clock(); adj = tod_steering_end - now; - if (unlikely((s64) adj >= 0)) + if (unlikely((s64) adj > 0)) /* * manually steer by 1 cycle every 2^16 cycles. This * corresponds to shifting the tod delta by 15. 1s is @@ -253,7 +253,7 @@ static struct clocksource clocksource_tod = { .name = "tod", .rating = 400, .read = read_tod_clock, - .mask = -1ULL, + .mask = CLOCKSOURCE_MASK(64), .mult = 1000, .shift = 12, .flags = CLOCK_SOURCE_IS_CONTINUOUS, @@ -669,7 +669,7 @@ static void stp_work_fn(struct work_struct *work) * There is a usable clock but the synchonization failed. * Retry after a second. */ - mod_timer(&stp_timer, jiffies + HZ); + mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC)); out_unlock: mutex_unlock(&stp_work_mutex); @@ -683,7 +683,7 @@ static struct bus_type stp_subsys = { .dev_name = "stp", }; -static ssize_t stp_ctn_id_show(struct device *dev, +static ssize_t ctn_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -693,9 +693,9 @@ static ssize_t stp_ctn_id_show(struct device *dev, *(unsigned long long *) stp_info.ctnid); } -static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL); +static DEVICE_ATTR_RO(ctn_id); -static ssize_t stp_ctn_type_show(struct device *dev, +static ssize_t ctn_type_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -704,9 +704,9 @@ static ssize_t stp_ctn_type_show(struct device *dev, return sprintf(buf, "%i\n", stp_info.ctn); } -static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL); +static DEVICE_ATTR_RO(ctn_type); -static ssize_t stp_dst_offset_show(struct device *dev, +static ssize_t dst_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -715,9 +715,9 @@ static ssize_t stp_dst_offset_show(struct device *dev, return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); } -static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL); +static DEVICE_ATTR_RO(dst_offset); -static ssize_t stp_leap_seconds_show(struct device *dev, +static ssize_t leap_seconds_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -726,9 +726,9 @@ static ssize_t stp_leap_seconds_show(struct device *dev, return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); } -static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL); +static DEVICE_ATTR_RO(leap_seconds); -static ssize_t stp_stratum_show(struct device *dev, +static ssize_t stratum_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -737,9 +737,9 @@ static ssize_t stp_stratum_show(struct device *dev, return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); } -static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL); +static DEVICE_ATTR_RO(stratum); -static ssize_t stp_time_offset_show(struct device *dev, +static ssize_t time_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -748,9 +748,9 @@ static ssize_t stp_time_offset_show(struct device *dev, return sprintf(buf, "%i\n", (int) stp_info.tto); } -static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL); +static DEVICE_ATTR_RO(time_offset); -static ssize_t stp_time_zone_offset_show(struct device *dev, +static ssize_t time_zone_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -759,10 +759,9 @@ static ssize_t stp_time_zone_offset_show(struct device *dev, return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); } -static DEVICE_ATTR(time_zone_offset, 0400, - stp_time_zone_offset_show, NULL); +static DEVICE_ATTR_RO(time_zone_offset); -static ssize_t stp_timing_mode_show(struct device *dev, +static ssize_t timing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -771,9 +770,9 @@ static ssize_t stp_timing_mode_show(struct device *dev, return sprintf(buf, "%i\n", stp_info.tmd); } -static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL); +static DEVICE_ATTR_RO(timing_mode); -static ssize_t stp_timing_state_show(struct device *dev, +static ssize_t timing_state_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -782,16 +781,16 @@ static ssize_t stp_timing_state_show(struct device *dev, return sprintf(buf, "%i\n", stp_info.tst); } -static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL); +static DEVICE_ATTR_RO(timing_state); -static ssize_t stp_online_show(struct device *dev, +static ssize_t online_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%i\n", stp_online); } -static ssize_t stp_online_store(struct device *dev, +static ssize_t online_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -817,18 +816,14 @@ static ssize_t stp_online_store(struct device *dev, * Can't use DEVICE_ATTR because the attribute should be named * stp/online but dev_attr_online already exists in this file .. */ -static struct device_attribute dev_attr_stp_online = { - .attr = { .name = "online", .mode = 0600 }, - .show = stp_online_show, - .store = stp_online_store, -}; +static DEVICE_ATTR_RW(online); static struct device_attribute *stp_attributes[] = { &dev_attr_ctn_id, &dev_attr_ctn_type, &dev_attr_dst_offset, &dev_attr_leap_seconds, - &dev_attr_stp_online, + &dev_attr_online, &dev_attr_stratum, &dev_attr_time_offset, &dev_attr_time_zone_offset, diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 332b542548cd..ca47141a5be9 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -356,9 +356,9 @@ static atomic_t topology_poll = ATOMIC_INIT(0); static void set_topology_timer(void) { if (atomic_add_unless(&topology_poll, -1, 0)) - mod_timer(&topology_timer, jiffies + HZ / 10); + mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100)); else - mod_timer(&topology_timer, jiffies + HZ * 60); + mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC)); } void topology_expect_change(void) diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index ff9cc4c3290e..8d1e8a1a97df 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -50,11 +50,8 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) } else { const struct exception_table_entry *fixup; fixup = s390_search_extables(regs->psw.addr); - if (fixup) - regs->psw.addr = extable_fixup(fixup); - else { + if (!fixup || !ex_handle(fixup, regs)) die(regs, str); - } } } @@ -251,7 +248,7 @@ void monitor_event_exception(struct pt_regs *regs) case BUG_TRAP_TYPE_NONE: fixup = s390_search_extables(regs->psw.addr); if (fixup) - regs->psw.addr = extable_fixup(fixup); + ex_handle(fixup, regs); break; case BUG_TRAP_TYPE_WARN: break; diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 28fd66d558ff..678333936f78 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -14,3 +14,5 @@ KASAN_SANITIZE_uaccess.o := n obj-$(CONFIG_S390_UNWIND_SELFTEST) += test_unwind.o CFLAGS_test_unwind.o += -fno-optimize-sibling-calls + +lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/s390/lib/error-inject.c b/arch/s390/lib/error-inject.c new file mode 100644 index 000000000000..8c9d4da87eef --- /dev/null +++ b/arch/s390/lib/error-inject.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ +#include <asm/ptrace.h> +#include <linux/error-injection.h> +#include <linux/kprobes.h> + +void override_function_with_return(struct pt_regs *regs) +{ + /* + * Emulate 'br 14'. 'regs' is captured by kprobes on entry to some + * kernel function. + */ + regs->psw.addr = regs->gprs[14]; +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 36bce727897b..5c15ae3daf71 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -189,7 +189,7 @@ static void cmm_set_timer(void) del_timer(&cmm_timer); return; } - mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ); + mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC)); } static void cmm_timer_fn(struct timer_list *unused) diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 9e0aa7aa03ba..5060956b8e7d 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -313,15 +313,10 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long goto out_free; } - rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); - - if (rc) - goto out_free; - seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (seg->res == NULL) { rc = -ENOMEM; - goto out_shared; + goto out_free; } seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; seg->res->start = seg->start_addr; @@ -335,12 +330,17 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long if (rc == SEG_TYPE_SC || ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; + + /* Check for overlapping resources before adding the mapping. */ if (request_resource(&iomem_resource, seg->res)) { rc = -EBUSY; - kfree(seg->res); - goto out_shared; + goto out_free_resource; } + rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + if (rc) + goto out_resource; + if (do_nonshared) diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, &start_addr, &end_addr); @@ -351,14 +351,14 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); rc = diag_cc; - goto out_resource; + goto out_mapping; } if (diag_cc > 1) { pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr); rc = dcss_diag_translate_rc(end_addr); dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); - goto out_resource; + goto out_mapping; } seg->start_addr = start_addr; seg->end = end_addr; @@ -377,11 +377,12 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long (void*) seg->end, segtype_string[seg->vm_segtype]); } goto out; + out_mapping: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_resource: release_resource(seg->res); + out_free_resource: kfree(seg->res); - out_shared: - vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_free: kfree(seg); out: @@ -400,8 +401,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long * -EIO : could not perform query or load diagnose * -ENOENT : no such segment * -EOPNOTSUPP: multi-part segment cannot be used with linux - * -ENOSPC : segment cannot be used (overlaps with storage) - * -EBUSY : segment can temporarily not be used (overlaps with dcss) + * -EBUSY : segment cannot be used (overlaps with dcss or storage) * -ERANGE : segment cannot be used (exceeds kernel mapping range) * -EPERM : segment is currently loaded with incompatible permissions * -ENOMEM : out of memory @@ -626,10 +626,6 @@ void segment_warning(int rc, char *seg_name) pr_err("DCSS %s has multiple page ranges and cannot be " "loaded or queried\n", seg_name); break; - case -ENOSPC: - pr_err("DCSS %s overlaps with used storage and cannot " - "be loaded\n", seg_name); - break; case -EBUSY: pr_err("%s needs used memory resources and cannot be " "loaded or queried\n", seg_name); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index d53c2e2ea1fd..aebf9183bedd 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -255,10 +255,8 @@ static noinline void do_no_context(struct pt_regs *regs) /* Are we prepared to handle this kernel fault? */ fixup = s390_search_extables(regs->psw.addr); - if (fixup) { - regs->psw.addr = extable_fixup(fixup); + if (fixup && ex_handle(fixup, regs)) return; - } /* * Oops. The kernel tried to access some bad page. We'll have to @@ -376,7 +374,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suprression) + * 04 Protection -> Write-Protection (suppression) * 10 Segment translation -> Not present (nullification) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 8b6282cf7d13..1aed1a4dfc2d 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -20,14 +20,6 @@ static DEFINE_MUTEX(vmem_mutex); -struct memory_segment { - struct list_head list; - unsigned long start; - unsigned long size; -}; - -static LIST_HEAD(mem_segs); - static void __ref *vmem_alloc_pages(unsigned int order) { unsigned long size = PAGE_SIZE << order; @@ -37,6 +29,15 @@ static void __ref *vmem_alloc_pages(unsigned int order) return (void *) memblock_phys_alloc(size, size); } +static void vmem_free_pages(unsigned long addr, int order) +{ + /* We don't expect boot memory to be removed ever. */ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(phys_to_page(addr)))) + return; + free_pages(addr, order); +} + void *vmem_crst_alloc(unsigned long val) { unsigned long *table; @@ -62,332 +63,486 @@ pte_t __ref *vmem_pte_alloc(void) return pte; } +static void vmem_pte_free(unsigned long *table) +{ + /* We don't expect boot memory to be removed ever. */ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(virt_to_page(table)))) + return; + page_table_free(&init_mm, table); +} + +#define PAGE_UNUSED 0xFD + /* - * Add a physical memory range to the 1:1 mapping. + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges + * from unused_pmd_start to next PMD_SIZE boundary. */ -static int vmem_add_mem(unsigned long start, unsigned long size) +static unsigned long unused_pmd_start; + +static void vmemmap_flush_unused_pmd(void) { - unsigned long pgt_prot, sgt_prot, r3_prot; - unsigned long pages4k, pages1m, pages2g; - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - int ret = -ENOMEM; + if (!unused_pmd_start) + return; + memset(__va(unused_pmd_start), PAGE_UNUSED, + ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start); + unused_pmd_start = 0; +} + +static void __vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * As we expect to add in the same granularity as we remove, it's + * sufficient to mark only some piece used to block the memmap page from + * getting removed (just in case the memmap never gets initialized, + * e.g., because the memory block never gets onlined). + */ + memset(__va(start), 0, sizeof(struct page)); +} - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - r3_prot = pgprot_val(REGION3_KERNEL); - if (!MACHINE_HAS_NX) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - r3_prot &= ~_REGION_ENTRY_NOEXEC; +static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * We only optimize if the new used range directly follows the + * previously unused range (esp., when populating consecutive sections). + */ + if (unused_pmd_start == start) { + unused_pmd_start = end; + if (likely(IS_ALIGNED(unused_pmd_start, PMD_SIZE))) + unused_pmd_start = 0; + return; } - pages4k = pages1m = pages2g = 0; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); - if (!p4_dir) - goto out; - pgd_populate(&init_mm, pg_dir, p4_dir); - } - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); - if (!pu_dir) - goto out; - p4d_populate(&init_mm, p4_dir, pu_dir); - } - pu_dir = pud_offset(p4_dir, address); - if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && - !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) && - !debug_pagealloc_enabled()) { - pud_val(*pu_dir) = address | r3_prot; - address += PUD_SIZE; - pages2g++; - continue; - } - if (pud_none(*pu_dir)) { - pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); - if (!pm_dir) - goto out; - pud_populate(&init_mm, pu_dir, pm_dir); - } - pm_dir = pmd_offset(pu_dir, address); - if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && - !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) && - !debug_pagealloc_enabled()) { - pmd_val(*pm_dir) = address | sgt_prot; - address += PMD_SIZE; - pages1m++; + vmemmap_flush_unused_pmd(); + __vmemmap_use_sub_pmd(start, end); +} + +static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) +{ + void *page = __va(ALIGN_DOWN(start, PMD_SIZE)); + + vmemmap_flush_unused_pmd(); + + /* Could be our memmap page is filled with PAGE_UNUSED already ... */ + __vmemmap_use_sub_pmd(start, end); + + /* Mark the unused parts of the new memmap page PAGE_UNUSED. */ + if (!IS_ALIGNED(start, PMD_SIZE)) + memset(page, PAGE_UNUSED, start - __pa(page)); + /* + * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of + * consecutive sections. Remember for the last added PMD the last + * unused range in the populated PMD. + */ + if (!IS_ALIGNED(end, PMD_SIZE)) + unused_pmd_start = end; +} + +/* Returns true if the PMD is completely unused and can be freed. */ +static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) +{ + void *page = __va(ALIGN_DOWN(start, PMD_SIZE)); + + vmemmap_flush_unused_pmd(); + memset(__va(start), PAGE_UNUSED, end - start); + return !memchr_inv(page, PAGE_UNUSED, PMD_SIZE); +} + +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, + unsigned long end, bool add, bool direct) +{ + unsigned long prot, pages = 0; + int ret = -ENOMEM; + pte_t *pte; + + prot = pgprot_val(PAGE_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_PAGE_NOEXEC; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (!add) { + if (pte_none(*pte)) + continue; + if (!direct) + vmem_free_pages(pfn_to_phys(pte_pfn(*pte)), 0); + pte_clear(&init_mm, addr, pte); + } else if (pte_none(*pte)) { + if (!direct) { + void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + + if (!new_page) + goto out; + pte_val(*pte) = __pa(new_page) | prot; + } else { + pte_val(*pte) = addr | prot; + } + } else { continue; } - if (pmd_none(*pm_dir)) { - pt_dir = vmem_pte_alloc(); - if (!pt_dir) - goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } - - pt_dir = pte_offset_kernel(pm_dir, address); - pte_val(*pt_dir) = address | pgt_prot; - address += PAGE_SIZE; - pages4k++; + pages++; } ret = 0; out: - update_page_count(PG_DIRECT_MAP_4K, pages4k); - update_page_count(PG_DIRECT_MAP_1M, pages1m); - update_page_count(PG_DIRECT_MAP_2G, pages2g); + if (direct) + update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages); return ret; } -/* - * Remove a physical memory range from the 1:1 mapping. - * Currently only invalidates page table entries. - */ -static void vmem_remove_range(unsigned long start, unsigned long size) +static void try_free_pte_table(pmd_t *pmd, unsigned long start) { - unsigned long pages4k, pages1m, pages2g; - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pages4k = pages1m = pages2g = 0; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - address += PGDIR_SIZE; - continue; - } - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - address += P4D_SIZE; - continue; - } - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - address += PUD_SIZE; - continue; - } - if (pud_large(*pu_dir)) { - pud_clear(pu_dir); - address += PUD_SIZE; - pages2g++; - continue; - } - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - address += PMD_SIZE; - continue; - } - if (pmd_large(*pm_dir)) { - pmd_clear(pm_dir); - address += PMD_SIZE; - pages1m++; - continue; - } - pt_dir = pte_offset_kernel(pm_dir, address); - pte_clear(&init_mm, address, pt_dir); - address += PAGE_SIZE; - pages4k++; + pte_t *pte; + int i; + + /* We can safely assume this is fully in 1:1 mapping & vmemmap area */ + pte = pte_offset_kernel(pmd, start); + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(*pte)) + return; } - flush_tlb_kernel_range(start, end); - update_page_count(PG_DIRECT_MAP_4K, -pages4k); - update_page_count(PG_DIRECT_MAP_1M, -pages1m); - update_page_count(PG_DIRECT_MAP_2G, -pages2g); + vmem_pte_free(__va(pmd_deref(*pmd))); + pmd_clear(pmd); } -/* - * Add a backed mem_map array to the virtual mem_map array. - */ -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, + unsigned long end, bool add, bool direct) { - unsigned long pgt_prot, sgt_prot; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; + unsigned long next, prot, pages = 0; int ret = -ENOMEM; + pmd_t *pmd; + pte_t *pte; - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - if (!MACHINE_HAS_NX) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - } - for (address = start; address < end;) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); - if (!p4_dir) - goto out; - pgd_populate(&init_mm, pg_dir, p4_dir); - } + prot = pgprot_val(SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_SEGMENT_ENTRY_NOEXEC; - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); - if (!pu_dir) - goto out; - p4d_populate(&init_mm, p4_dir, pu_dir); - } + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (!add) { + if (pmd_none(*pmd)) + continue; + if (pmd_large(*pmd) && !add) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + pmd_clear(pmd); + pages++; + } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + pmd_clear(pmd); + } + continue; + } + } else if (pmd_none(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE) && + MACHINE_HAS_EDAT1 && addr && direct && + !debug_pagealloc_enabled()) { + pmd_val(*pmd) = addr | prot; + pages++; + continue; + } else if (!direct && MACHINE_HAS_EDAT1) { + void *new_page; - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); - if (!pm_dir) + /* + * Use 1MB frames for vmemmap if available. We + * always use large frames even if they are only + * partially used. Otherwise we would have also + * page tables since vmemmap_populate gets + * called for each section separately. + */ + new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + if (new_page) { + pmd_val(*pmd) = __pa(new_page) | prot; + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) { + vmemmap_use_new_sub_pmd(addr, next); + } + continue; + } + } + pte = vmem_pte_alloc(); + if (!pte) goto out; - pud_populate(&init_mm, pu_dir, pm_dir); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + if (!direct) + vmemmap_use_sub_pmd(addr, next); + continue; } + ret = modify_pte_table(pmd, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pte_table(pmd, addr & PMD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages); + return ret; +} - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - /* Use 1MB frames for vmemmap if available. We always - * use large frames even if they are only partially - * used. - * Otherwise we would have also page tables since - * vmemmap_populate gets called for each section - * separately. */ - if (MACHINE_HAS_EDAT1) { - void *new_page; +static void try_free_pmd_table(pud_t *pud, unsigned long start) +{ + const unsigned long end = start + PUD_SIZE; + pmd_t *pmd; + int i; + + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (end > VMALLOC_START) + return; +#ifdef CONFIG_KASAN + if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) + return; +#endif + pmd = pmd_offset(pud, start); + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) + if (!pmd_none(*pmd)) + return; + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + pud_clear(pud); +} - new_page = vmemmap_alloc_block(PMD_SIZE, node); - if (!new_page) - goto out; - pmd_val(*pm_dir) = __pa(new_page) | sgt_prot; - address = (address + PMD_SIZE) & PMD_MASK; +static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, + bool add, bool direct) +{ + unsigned long next, prot, pages = 0; + int ret = -ENOMEM; + pud_t *pud; + pmd_t *pmd; + + prot = pgprot_val(REGION3_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_REGION_ENTRY_NOEXEC; + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (!add) { + if (pud_none(*pud)) + continue; + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + pud_clear(pud); + pages++; + } + continue; + } + } else if (pud_none(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE) && + MACHINE_HAS_EDAT2 && addr && direct && + !debug_pagealloc_enabled()) { + pud_val(*pud) = addr | prot; + pages++; continue; } - pt_dir = vmem_pte_alloc(); - if (!pt_dir) + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { continue; } + ret = modify_pmd_table(pud, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pmd_table(pud, addr & PUD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages); + return ret; +} - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *new_page; +static void try_free_pud_table(p4d_t *p4d, unsigned long start) +{ + const unsigned long end = start + P4D_SIZE; + pud_t *pud; + int i; + + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (end > VMALLOC_START) + return; +#ifdef CONFIG_KASAN + if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) + return; +#endif + + pud = pud_offset(p4d, start); + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + if (!pud_none(*pud)) + return; + } + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + p4d_clear(p4d); +} - new_page = vmemmap_alloc_block(PAGE_SIZE, node); - if (!new_page) +static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, + bool add, bool direct) +{ + unsigned long next; + int ret = -ENOMEM; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (!add) { + if (p4d_none(*p4d)) + continue; + } else if (p4d_none(*p4d)) { + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) goto out; - pte_val(*pt_dir) = __pa(new_page) | pgt_prot; } - address += PAGE_SIZE; + ret = modify_pud_table(p4d, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pud_table(p4d, addr & P4D_MASK); } ret = 0; out: return ret; } -void vmemmap_free(unsigned long start, unsigned long end, - struct vmem_altmap *altmap) +static void try_free_p4d_table(pgd_t *pgd, unsigned long start) { + const unsigned long end = start + PGDIR_SIZE; + p4d_t *p4d; + int i; + + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (end > VMALLOC_START) + return; +#ifdef CONFIG_KASAN + if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) + return; +#endif + + p4d = p4d_offset(pgd, start); + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + if (!p4d_none(*p4d)) + return; + } + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + pgd_clear(pgd); } -/* - * Add memory segment to the segment list if it doesn't overlap with - * an already present segment. - */ -static int insert_memory_segment(struct memory_segment *seg) +static int modify_pagetable(unsigned long start, unsigned long end, bool add, + bool direct) { - struct memory_segment *tmp; + unsigned long addr, next; + int ret = -ENOMEM; + pgd_t *pgd; + p4d_t *p4d; - if (seg->start + seg->size > VMEM_MAX_PHYS || - seg->start + seg->size < seg->start) - return -ERANGE; + if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) + return -EINVAL; + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + pgd = pgd_offset_k(addr); - list_for_each_entry(tmp, &mem_segs, list) { - if (seg->start >= tmp->start + tmp->size) - continue; - if (seg->start + seg->size <= tmp->start) - continue; - return -ENOSPC; + if (!add) { + if (pgd_none(*pgd)) + continue; + } else if (pgd_none(*pgd)) { + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + ret = modify_p4d_table(pgd, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_p4d_table(pgd, addr & PGDIR_MASK); } - list_add(&seg->list, &mem_segs); - return 0; + ret = 0; +out: + if (!add) + flush_tlb_kernel_range(start, end); + return ret; +} + +static int add_pagetable(unsigned long start, unsigned long end, bool direct) +{ + return modify_pagetable(start, end, true, direct); +} + +static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ + return modify_pagetable(start, end, false, direct); } /* - * Remove memory segment from the segment list. + * Add a physical memory range to the 1:1 mapping. */ -static void remove_memory_segment(struct memory_segment *seg) +static int vmem_add_range(unsigned long start, unsigned long size) { - list_del(&seg->list); + return add_pagetable(start, start + size, true); } -static void __remove_shared_memory(struct memory_segment *seg) +/* + * Remove a physical memory range from the 1:1 mapping. + */ +static void vmem_remove_range(unsigned long start, unsigned long size) { - remove_memory_segment(seg); - vmem_remove_range(seg->start, seg->size); + remove_pagetable(start, start + size, true); } -int vmem_remove_mapping(unsigned long start, unsigned long size) +/* + * Add a backed mem_map array to the virtual mem_map array. + */ +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { - struct memory_segment *seg; int ret; mutex_lock(&vmem_mutex); + /* We don't care about the node, just use NUMA_NO_NODE on allocations */ + ret = add_pagetable(start, end, false); + if (ret) + remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); + return ret; +} - ret = -ENOENT; - list_for_each_entry(seg, &mem_segs, list) { - if (seg->start == start && seg->size == size) - break; - } - - if (seg->start != start || seg->size != size) - goto out; +void vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + mutex_lock(&vmem_mutex); + remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); +} - ret = 0; - __remove_shared_memory(seg); - kfree(seg); -out: +void vmem_remove_mapping(unsigned long start, unsigned long size) +{ + mutex_lock(&vmem_mutex); + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); - return ret; } int vmem_add_mapping(unsigned long start, unsigned long size) { - struct memory_segment *seg; int ret; - mutex_lock(&vmem_mutex); - ret = -ENOMEM; - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - goto out; - seg->start = start; - seg->size = size; - - ret = insert_memory_segment(seg); - if (ret) - goto out_free; + if (start + size > VMEM_MAX_PHYS || + start + size < start) + return -ERANGE; - ret = vmem_add_mem(start, size); + mutex_lock(&vmem_mutex); + ret = vmem_add_range(start, size); if (ret) - goto out_remove; - goto out; - -out_remove: - __remove_shared_memory(seg); -out_free: - kfree(seg); -out: + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); return ret; } @@ -402,7 +557,7 @@ void __init vmem_map_init(void) struct memblock_region *reg; for_each_memblock(memory, reg) - vmem_add_mem(reg->base, reg->size); + vmem_add_range(reg->base, reg->size); __set_memory((unsigned long)_stext, (unsigned long)(_etext - _stext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); @@ -421,27 +576,3 @@ void __init vmem_map_init(void) pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } - -/* - * Convert memblock.memory to a memory segment list so there is a single - * list that contains all memory segments. - */ -static int __init vmem_convert_memory_chunk(void) -{ - struct memblock_region *reg; - struct memory_segment *seg; - - mutex_lock(&vmem_mutex); - for_each_memblock(memory, reg) { - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - panic("Out of memory...\n"); - seg->start = reg->base; - seg->size = reg->size; - insert_memory_segment(seg); - } - mutex_unlock(&vmem_mutex); - return 0; -} - -core_initcall(vmem_convert_memory_chunk); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index f4242b894cf2..8fe7bdfc8d15 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -49,6 +49,7 @@ struct bpf_jit { int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ + int excnt; /* Number of exception table entries */ int labels[1]; /* Labels for local jumps */ }; @@ -588,6 +589,84 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) } } +static int get_probe_mem_regno(const u8 *insn) +{ + /* + * insn must point to llgc, llgh, llgf or lg, which have destination + * register at the same position. + */ + if (insn[0] != 0xe3) /* common llgc, llgh, llgf and lg prefix */ + return -1; + if (insn[5] != 0x90 && /* llgc */ + insn[5] != 0x91 && /* llgh */ + insn[5] != 0x16 && /* llgf */ + insn[5] != 0x04) /* lg */ + return -1; + return insn[1] >> 4; +} + +static bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs) +{ + int regno; + u8 *insn; + + regs->psw.addr = extable_fixup(x); + insn = (u8 *)__rewind_psw(regs->psw, regs->int_code >> 16); + regno = get_probe_mem_regno(insn); + if (WARN_ON_ONCE(regno < 0)) + /* JIT bug - unexpected instruction. */ + return false; + regs->gprs[regno] = 0; + return true; +} + +static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp, + int probe_prg, int nop_prg) +{ + struct exception_table_entry *ex; + s64 delta; + u8 *insn; + int prg; + int i; + + if (!fp->aux->extable) + /* Do nothing during early JIT passes. */ + return 0; + insn = jit->prg_buf + probe_prg; + if (WARN_ON_ONCE(get_probe_mem_regno(insn) < 0)) + /* JIT bug - unexpected probe instruction. */ + return -1; + if (WARN_ON_ONCE(probe_prg + insn_length(*insn) != nop_prg)) + /* JIT bug - gap between probe and nop instructions. */ + return -1; + for (i = 0; i < 2; i++) { + if (WARN_ON_ONCE(jit->excnt >= fp->aux->num_exentries)) + /* Verifier bug - not enough entries. */ + return -1; + ex = &fp->aux->extable[jit->excnt]; + /* Add extable entries for probe and nop instructions. */ + prg = i == 0 ? probe_prg : nop_prg; + delta = jit->prg_buf + prg - (u8 *)&ex->insn; + if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) + /* JIT bug - code and extable must be close. */ + return -1; + ex->insn = delta; + /* + * Always land on the nop. Note that extable infrastructure + * ignores fixup field, it is handled by ex_handler_bpf(). + */ + delta = jit->prg_buf + nop_prg - (u8 *)&ex->fixup; + if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX)) + /* JIT bug - landing pad and extable must be close. */ + return -1; + ex->fixup = delta; + ex->handler = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; + jit->excnt++; + } + return 0; +} + /* * Compile one eBPF instruction into s390x code * @@ -604,7 +683,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, u32 *addrs = jit->addrs; s32 imm = insn->imm; s16 off = insn->off; + int probe_prg = -1; unsigned int mask; + int nop_prg; + int err; + + if (BPF_CLASS(insn->code) == BPF_LDX && + BPF_MODE(insn->code) == BPF_PROBE_MEM) + probe_prg = jit->prg; switch (insn->code) { /* @@ -1119,6 +1205,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, * BPF_LDX */ case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_B: /* llgc %dst,0(off,%src) */ EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off); jit->seen |= SEEN_MEM; @@ -1126,6 +1213,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, insn_count = 2; break; case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_H: /* llgh %dst,0(off,%src) */ EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off); jit->seen |= SEEN_MEM; @@ -1133,6 +1221,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, insn_count = 2; break; case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_W: /* llgf %dst,off(%src) */ jit->seen |= SEEN_MEM; EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off); @@ -1140,6 +1229,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, insn_count = 2; break; case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: /* lg %dst,0(off,%src) */ jit->seen |= SEEN_MEM; EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, src_reg, REG_0, off); @@ -1485,6 +1575,23 @@ branch_oc: pr_err("Unknown opcode %02x\n", insn->code); return -1; } + + if (probe_prg != -1) { + /* + * Handlers of certain exceptions leave psw.addr pointing to + * the instruction directly after the failing one. Therefore, + * create two exception table entries and also add a nop in + * case two probing instructions come directly after each + * other. + */ + nop_prg = jit->prg; + /* bcr 0,%0 */ + _EMIT2(0x0700); + err = bpf_jit_probe_mem(jit, fp, probe_prg, nop_prg); + if (err < 0) + return err; + } + return insn_count; } @@ -1527,6 +1634,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, jit->lit32 = jit->lit32_start; jit->lit64 = jit->lit64_start; jit->prg = 0; + jit->excnt = 0; bpf_jit_prologue(jit, stack_depth); if (bpf_set_addr(jit, 0) < 0) @@ -1551,6 +1659,12 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp, jit->lit64_start = ALIGN(jit->lit64_start, 8); jit->size = jit->lit64_start + lit64_size; jit->size_prg = jit->prg; + + if (WARN_ON_ONCE(fp->aux->extable && + jit->excnt != fp->aux->num_exentries)) + /* Verifier bug - too many entries. */ + return -1; + return 0; } @@ -1565,6 +1679,29 @@ struct s390_jit_data { int pass; }; +static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit, + struct bpf_prog *fp) +{ + struct bpf_binary_header *header; + u32 extable_size; + u32 code_size; + + /* We need two entries per insn. */ + fp->aux->num_exentries *= 2; + + code_size = roundup(jit->size, + __alignof__(struct exception_table_entry)); + extable_size = fp->aux->num_exentries * + sizeof(struct exception_table_entry); + header = bpf_jit_binary_alloc(code_size + extable_size, &jit->prg_buf, + 8, jit_fill_hole); + if (!header) + return NULL; + fp->aux->extable = (struct exception_table_entry *) + (jit->prg_buf + code_size); + return header; +} + /* * Compile eBPF program "fp" */ @@ -1631,7 +1768,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* * Final pass: Allocate and generate program */ - header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 8, jit_fill_hole); + header = bpf_jit_alloc(&jit, fp); if (!header) { fp = orig_fp; goto free_addrs; diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 38efa3e852c4..401cf670a243 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -155,10 +155,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, return -EINVAL; /* - * Only support read access to MIO capable devices on a MIO enabled - * system. Otherwise we would have to check for every address if it is - * a special ZPCI_ADDR and we would have to do a get_pfn() which we - * don't need for MIO capable devices. + * We only support write access to MIO capable devices if we are on + * a MIO enabled system. Otherwise we would have to check for every + * address if it is a special ZPCI_ADDR and would have to do + * a get_pfn() which we don't need for MIO capable devices. Currently + * ISM devices are the only devices without MIO support and there is no + * known need for accessing these from userspace. */ if (static_branch_likely(&have_mio)) { ret = __memcpy_toio_inuser((void __iomem *) mmio_addr, @@ -282,10 +284,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, return -EINVAL; /* - * Only support write access to MIO capable devices on a MIO enabled - * system. Otherwise we would have to check for every address if it is - * a special ZPCI_ADDR and we would have to do a get_pfn() which we - * don't need for MIO capable devices. + * We only support read access to MIO capable devices if we are on + * a MIO enabled system. Otherwise we would have to check for every + * address if it is a special ZPCI_ADDR and would have to do + * a get_pfn() which we don't need for MIO capable devices. Currently + * ISM devices are the only devices without MIO support and there is no + * known need for accessing these from userspace. */ if (static_branch_likely(&have_mio)) { ret = __memcpy_fromio_inuser( |