diff options
Diffstat (limited to 'arch')
174 files changed, 4146 insertions, 2284 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index f76b214cf7ad..dc26b6d9175e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -425,7 +425,7 @@ config GCC_PLUGIN_STRUCTLEAK bool "Force initialization of variables containing userspace addresses" depends on GCC_PLUGINS help - This plugin zero-initializes any structures that containing a + This plugin zero-initializes any structures containing a __user attribute. This can prevent some classes of information exposures. @@ -443,6 +443,45 @@ config GCC_PLUGIN_STRUCTLEAK_VERBOSE initialized. Since not all existing initializers are detected by the plugin, this can produce false positive warnings. +config GCC_PLUGIN_RANDSTRUCT + bool "Randomize layout of sensitive kernel structures" + depends on GCC_PLUGINS + select MODVERSIONS if MODULES + help + If you say Y here, the layouts of structures explicitly + marked by __randomize_layout will be randomized at + compile-time. This can introduce the requirement of an + additional information exposure vulnerability for exploits + targeting these structure types. + + Enabling this feature will introduce some performance impact, + slightly increase memory usage, and prevent the use of forensic + tools like Volatility against the system (unless the kernel + source tree isn't cleaned after kernel installation). + + The seed used for compilation is located at + scripts/gcc-plgins/randomize_layout_seed.h. It remains after + a make clean to allow for external modules to be compiled with + the existing seed and will be removed by a make mrproper or + make distclean. + + Note that the implementation requires gcc 4.7 or newer. + + This plugin was ported from grsecurity/PaX. More information at: + * https://grsecurity.net/ + * https://pax.grsecurity.net/ + +config GCC_PLUGIN_RANDSTRUCT_PERFORMANCE + bool "Use cacheline-aware structure randomization" + depends on GCC_PLUGIN_RANDSTRUCT + depends on !COMPILE_TEST + help + If you say Y here, the RANDSTRUCT randomization will make a + best effort at restricting randomization to cacheline-sized + groups of elements. It will further not randomize bitfields + in structures. This reduces the performance hit of RANDSTRUCT + at the cost of weakened randomization. + config HAVE_CC_STACKPROTECTOR bool help diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 148d7a32754e..7b285dd4fe05 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -105,4 +105,8 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + +#define SO_PEERGROUPS 59 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index ce93124a850b..b23d6fbbb225 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1183,48 +1183,23 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) SYSCALL_DEFINE4(osf_wait4, pid_t, pid, int __user *, ustatus, int, options, struct rusage32 __user *, ur) { - struct rusage r; - long ret, err; unsigned int status = 0; - mm_segment_t old_fs; - + struct rusage r; + long err = kernel_wait4(pid, &status, options, &r); + if (err <= 0) + return err; + if (put_user(status, ustatus)) + return -EFAULT; if (!ur) - return sys_wait4(pid, ustatus, options, NULL); - - old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, (unsigned int __user *) &status, options, - (struct rusage __user *) &r); - set_fs (old_fs); - - if (!access_ok(VERIFY_WRITE, ur, sizeof(*ur))) + return err; + if (put_tv32(&ur->ru_utime, &r.ru_utime)) return -EFAULT; - - err = put_user(status, ustatus); - if (ret < 0) - return err ? err : ret; - - err |= __put_user(r.ru_utime.tv_sec, &ur->ru_utime.tv_sec); - err |= __put_user(r.ru_utime.tv_usec, &ur->ru_utime.tv_usec); - err |= __put_user(r.ru_stime.tv_sec, &ur->ru_stime.tv_sec); - err |= __put_user(r.ru_stime.tv_usec, &ur->ru_stime.tv_usec); - err |= __put_user(r.ru_maxrss, &ur->ru_maxrss); - err |= __put_user(r.ru_ixrss, &ur->ru_ixrss); - err |= __put_user(r.ru_idrss, &ur->ru_idrss); - err |= __put_user(r.ru_isrss, &ur->ru_isrss); - err |= __put_user(r.ru_minflt, &ur->ru_minflt); - err |= __put_user(r.ru_majflt, &ur->ru_majflt); - err |= __put_user(r.ru_nswap, &ur->ru_nswap); - err |= __put_user(r.ru_inblock, &ur->ru_inblock); - err |= __put_user(r.ru_oublock, &ur->ru_oublock); - err |= __put_user(r.ru_msgsnd, &ur->ru_msgsnd); - err |= __put_user(r.ru_msgrcv, &ur->ru_msgrcv); - err |= __put_user(r.ru_nsignals, &ur->ru_nsignals); - err |= __put_user(r.ru_nvcsw, &ur->ru_nvcsw); - err |= __put_user(r.ru_nivcsw, &ur->ru_nivcsw); - - return err ? err : ret; + if (put_tv32(&ur->ru_stime, &r.ru_stime)) + return -EFAULT; + if (copy_to_user(&ur->ru_maxrss, &r.ru_maxrss, + sizeof(struct rusage32) - offsetof(struct rusage32, ru_maxrss))) + return -EFAULT; + return err; } /* diff --git a/arch/arm/configs/lpc32xx_defconfig b/arch/arm/configs/lpc32xx_defconfig index 6ba430d2b5b2..e15fa5f168bb 100644 --- a/arch/arm/configs/lpc32xx_defconfig +++ b/arch/arm/configs/lpc32xx_defconfig @@ -112,7 +112,7 @@ CONFIG_GPIO_SX150X=y CONFIG_GPIO_74X164=y CONFIG_GPIO_MAX7301=y CONFIG_GPIO_MC33880=y -CONFIG_GPIO_MCP23S08=y +CONFIG_PINCTRL_MCP23S08=y CONFIG_SENSORS_DS620=y CONFIG_SENSORS_MAX6639=y CONFIG_WATCHDOG=y diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index 883b84d828c5..0f966a8ca1ce 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -14,6 +14,7 @@ #include <crypto/aes.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> +#include <linux/cpufeature.h> #include <linux/module.h> #include <crypto/xts.h> @@ -425,9 +426,6 @@ static int __init aes_init(void) int err; int i; - if (!(elf_hwcap2 & HWCAP2_AES)) - return -ENODEV; - err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); if (err) return err; @@ -451,5 +449,5 @@ unregister_simds: return err; } -module_init(aes_init); +module_cpu_feature_match(AES, aes_init); module_exit(aes_exit); diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c index e1566bec1016..1b0e0e86ee9c 100644 --- a/arch/arm/crypto/crc32-ce-glue.c +++ b/arch/arm/crypto/crc32-ce-glue.c @@ -8,6 +8,7 @@ * published by the Free Software Foundation. */ +#include <linux/cpufeature.h> #include <linux/crc32.h> #include <linux/init.h> #include <linux/kernel.h> @@ -233,6 +234,11 @@ static void __exit crc32_pmull_mod_exit(void) ARRAY_SIZE(crc32_pmull_algs)); } +static const struct cpu_feature crc32_cpu_feature[] = { + { cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { } +}; +MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature); + module_init(crc32_pmull_mod_init); module_exit(crc32_pmull_mod_exit); diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index 7546b3c02466..6bac8bea9f1e 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -15,6 +15,7 @@ #include <crypto/cryptd.h> #include <crypto/internal/hash.h> #include <crypto/gf128mul.h> +#include <linux/cpufeature.h> #include <linux/crypto.h> #include <linux/module.h> @@ -311,9 +312,6 @@ static int __init ghash_ce_mod_init(void) { int err; - if (!(elf_hwcap2 & HWCAP2_PMULL)) - return -ENODEV; - err = crypto_register_shash(&ghash_alg); if (err) return err; @@ -334,5 +332,5 @@ static void __exit ghash_ce_mod_exit(void) crypto_unregister_shash(&ghash_alg); } -module_init(ghash_ce_mod_init); +module_cpu_feature_match(PMULL, ghash_ce_mod_init); module_exit(ghash_ce_mod_exit); diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c index 80bc2fcd241a..555f72b5e659 100644 --- a/arch/arm/crypto/sha1-ce-glue.c +++ b/arch/arm/crypto/sha1-ce-glue.c @@ -11,6 +11,7 @@ #include <crypto/internal/hash.h> #include <crypto/sha.h> #include <crypto/sha1_base.h> +#include <linux/cpufeature.h> #include <linux/crypto.h> #include <linux/module.h> @@ -82,8 +83,6 @@ static struct shash_alg alg = { static int __init sha1_ce_mod_init(void) { - if (!(elf_hwcap2 & HWCAP2_SHA1)) - return -ENODEV; return crypto_register_shash(&alg); } @@ -92,5 +91,5 @@ static void __exit sha1_ce_mod_fini(void) crypto_unregister_shash(&alg); } -module_init(sha1_ce_mod_init); +module_cpu_feature_match(SHA1, sha1_ce_mod_init); module_exit(sha1_ce_mod_fini); diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c index 0755b2d657f3..df4dcef054ae 100644 --- a/arch/arm/crypto/sha2-ce-glue.c +++ b/arch/arm/crypto/sha2-ce-glue.c @@ -11,6 +11,7 @@ #include <crypto/internal/hash.h> #include <crypto/sha.h> #include <crypto/sha256_base.h> +#include <linux/cpufeature.h> #include <linux/crypto.h> #include <linux/module.h> @@ -100,8 +101,6 @@ static struct shash_alg algs[] = { { static int __init sha2_ce_mod_init(void) { - if (!(elf_hwcap2 & HWCAP2_SHA2)) - return -ENODEV; return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } @@ -110,5 +109,5 @@ static void __exit sha2_ce_mod_fini(void) crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } -module_init(sha2_ce_mod_init); +module_cpu_feature_match(SHA2, sha2_ce_mod_init); module_exit(sha2_ce_mod_fini); diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 68b06f9c65de..ad301f107dd2 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -87,6 +87,8 @@ #define CALGN(code...) #endif +#define IMM12_MASK 0xfff + /* * Enable and disable interrupts */ diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index a3f0b3d50089..ebf020b02bc8 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -187,6 +187,16 @@ #define FSC_FAULT (0x04) #define FSC_ACCESS (0x08) #define FSC_PERM (0x0c) +#define FSC_SEA (0x10) +#define FSC_SEA_TTW0 (0x14) +#define FSC_SEA_TTW1 (0x15) +#define FSC_SEA_TTW2 (0x16) +#define FSC_SEA_TTW3 (0x17) +#define FSC_SECC (0x18) +#define FSC_SECC_TTW0 (0x1c) +#define FSC_SECC_TTW1 (0x1d) +#define FSC_SECC_TTW2 (0x1e) +#define FSC_SECC_TTW3 (0x1f) /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~0xf) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index f0e66577ce05..127e2dd2e21c 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -44,7 +44,9 @@ #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS #endif -#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SLEEP \ + KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); @@ -233,8 +235,6 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); -void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu); -void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu); int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu); @@ -291,20 +291,12 @@ static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} -static inline int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - return -ENXIO; -} -static inline int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - return -ENXIO; -} -static inline int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - return -ENXIO; -} + +int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr); +int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr); +int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr); #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/system_misc.h b/arch/arm/include/asm/system_misc.h index a3d61ad984af..8c4a89f5ce7d 100644 --- a/arch/arm/include/asm/system_misc.h +++ b/arch/arm/include/asm/system_misc.h @@ -22,6 +22,11 @@ extern void (*arm_pm_idle)(void); extern unsigned int user_debug; +static inline int handle_guest_sea(phys_addr_t addr, unsigned int esr) +{ + return -1; +} + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_ARM_SYSTEM_MISC_H */ diff --git a/arch/arm/include/asm/xen/events.h b/arch/arm/include/asm/xen/events.h index 71e473d05fcc..620dc75362e5 100644 --- a/arch/arm/include/asm/xen/events.h +++ b/arch/arm/include/asm/xen/events.h @@ -16,7 +16,7 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) return raw_irqs_disabled_flags(regs->ARM_cpsr); } -#define xchg_xen_ulong(ptr, val) atomic64_xchg(container_of((ptr), \ +#define xchg_xen_ulong(ptr, val) atomic64_xchg(container_of((long long*)(ptr),\ atomic64_t, \ counter), (val)) diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 5e3c673fa3f4..5db2d4c6a55f 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -203,6 +203,14 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff #define VGIC_LEVEL_INFO_LINE_LEVEL 0 +/* Device Control API on vcpu fd */ +#define KVM_ARM_VCPU_PMU_V3_CTRL 0 +#define KVM_ARM_VCPU_PMU_V3_IRQ 0 +#define KVM_ARM_VCPU_PMU_V3_INIT 1 +#define KVM_ARM_VCPU_TIMER_CTRL 1 +#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 +#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 + #define KVM_DEV_ARM_VGIC_CTRL_INIT 0 #define KVM_DEV_ARM_ITS_SAVE_TABLES 1 #define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 9f157e7c51e7..c731f0d2b2af 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -797,7 +797,10 @@ ENTRY(__switch_to) #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP) ldr r7, [r2, #TI_TASK] ldr r8, =__stack_chk_guard - ldr r7, [r7, #TSK_STACK_CANARY] + .if (TSK_STACK_CANARY > IMM12_MASK) + add r7, r7, #TSK_STACK_CANARY & ~IMM12_MASK + .endif + ldr r7, [r7, #TSK_STACK_CANARY & IMM12_MASK] #endif #ifdef CONFIG_CPU_USE_DOMAINS mcr p15, 0, r6, c3, c0, 0 @ Set domain register diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c index 96b7a477a8db..8226d0b71fd3 100644 --- a/arch/arm/kernel/perf_event_v6.c +++ b/arch/arm/kernel/perf_event_v6.c @@ -552,7 +552,7 @@ static int armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu) return 0; } -static struct of_device_id armv6_pmu_of_device_ids[] = { +static const struct of_device_id armv6_pmu_of_device_ids[] = { {.compatible = "arm,arm11mpcore-pmu", .data = armv6mpcore_pmu_init}, {.compatible = "arm,arm1176-pmu", .data = armv6_1176_pmu_init}, {.compatible = "arm,arm1136-pmu", .data = armv6_1136_pmu_init}, diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index fa6182a40941..1e0784ebbfd6 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -301,3 +301,54 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, { return -EINVAL; } + +int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_set_attr(vcpu, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} + +int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_get_attr(vcpu, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} + +int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->group) { + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_has_attr(vcpu, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; +} diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index f86a9aaef462..54442e375354 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -72,6 +72,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) trace_kvm_wfx(*vcpu_pc(vcpu), false); vcpu->stat.wfi_exit_stat++; kvm_vcpu_block(vcpu); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); } kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index 624a510d31df..ebd2dd46adf7 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -237,8 +237,10 @@ void __hyp_text __noreturn __hyp_panic(int cause) vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR); host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); + __timer_save_state(vcpu); __deactivate_traps(vcpu); __deactivate_vm(vcpu); + __banked_restore_state(host_ctxt); __sysreg_restore_state(host_ctxt); } diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c index 1da8b2d14550..5ed0c3ee33d6 100644 --- a/arch/arm/kvm/reset.c +++ b/arch/arm/kvm/reset.c @@ -37,16 +37,6 @@ static struct kvm_regs cortexa_regs_reset = { .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT, }; -static const struct kvm_irq_level cortexa_ptimer_irq = { - { .irq = 30 }, - .level = 1, -}; - -static const struct kvm_irq_level cortexa_vtimer_irq = { - { .irq = 27 }, - .level = 1, -}; - /******************************************************************************* * Exported reset function @@ -62,16 +52,12 @@ static const struct kvm_irq_level cortexa_vtimer_irq = { int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { struct kvm_regs *reset_regs; - const struct kvm_irq_level *cpu_vtimer_irq; - const struct kvm_irq_level *cpu_ptimer_irq; switch (vcpu->arch.target) { case KVM_ARM_TARGET_CORTEX_A7: case KVM_ARM_TARGET_CORTEX_A15: reset_regs = &cortexa_regs_reset; vcpu->arch.midr = read_cpuid_id(); - cpu_vtimer_irq = &cortexa_vtimer_irq; - cpu_ptimer_irq = &cortexa_ptimer_irq; break; default: return -ENODEV; @@ -84,5 +70,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) kvm_reset_coprocs(vcpu); /* Reset arch_timer context */ - return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq); + return kvm_timer_vcpu_reset(vcpu); } diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S index 0d40c285bd86..f944836da8a2 100644 --- a/arch/arm/mm/proc-macros.S +++ b/arch/arm/mm/proc-macros.S @@ -25,11 +25,6 @@ ldr \rd, [\rn, #VMA_VM_FLAGS] .endm - .macro tsk_mm, rd, rn - ldr \rd, [\rn, #TI_TASK] - ldr \rd, [\rd, #TSK_ACTIVE_MM] - .endm - /* * act_mm - get current->active_mm */ @@ -37,7 +32,10 @@ bic \rd, sp, #8128 bic \rd, \rd, #63 ldr \rd, [\rd, #TI_TASK] - ldr \rd, [\rd, #TSK_ACTIVE_MM] + .if (TSK_ACTIVE_MM > IMM12_MASK) + add \rd, \rd, #TSK_ACTIVE_MM & ~IMM12_MASK + .endif + ldr \rd, [\rd, #TSK_ACTIVE_MM & IMM12_MASK] .endm /* diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c index 0ed01f2d5ee4..e71eefa2e427 100644 --- a/arch/arm/xen/p2m.c +++ b/arch/arm/xen/p2m.c @@ -144,17 +144,17 @@ bool __set_phys_to_machine_multi(unsigned long pfn, return true; } - p2m_entry = kzalloc(sizeof(struct xen_p2m_entry), GFP_NOWAIT); - if (!p2m_entry) { - pr_warn("cannot allocate xen_p2m_entry\n"); + p2m_entry = kzalloc(sizeof(*p2m_entry), GFP_NOWAIT); + if (!p2m_entry) return false; - } + p2m_entry->pfn = pfn; p2m_entry->nr_pages = nr_pages; p2m_entry->mfn = mfn; write_lock_irqsave(&p2m_lock, irqflags); - if ((rc = xen_add_phys_to_mach_entry(p2m_entry)) < 0) { + rc = xen_add_phys_to_mach_entry(p2m_entry); + if (rc < 0) { write_unlock_irqrestore(&p2m_lock, irqflags); return false; } diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 300146dc8433..192208ea2842 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -3,6 +3,7 @@ config ARM64 select ACPI_CCA_REQUIRED if ACPI select ACPI_GENERIC_GSI if ACPI select ACPI_GTDT if ACPI + select ACPI_IORT if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI select ACPI_MCFG if ACPI select ACPI_SPCR_TABLE if ACPI @@ -19,7 +20,9 @@ config ARM64 select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAVE_NMI_SAFE_CMPXCHG if ACPI_APEI_SEA select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_COMPAT_IPC_PARSE_VERSION @@ -93,6 +96,7 @@ config ARM64 select HAVE_IRQ_TIME_ACCOUNTING select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP if NUMA + select HAVE_NMI if ACPI_APEI_SEA select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -245,6 +249,9 @@ config PGTABLE_LEVELS config ARCH_SUPPORTS_UPROBES def_bool y +config ARCH_PROC_KCORE_TEXT + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -481,6 +488,17 @@ config CAVIUM_ERRATUM_27456 If unsure, say Y. +config CAVIUM_ERRATUM_30115 + bool "Cavium erratum 30115: Guest may disable interrupts in host" + default y + help + On ThunderX T88 pass 1.x through 2.2, T81 pass 1.0 through + 1.2, and T83 Pass 1.0, KVM guest execution may disable + interrupts in host. Trapping both GICv3 group-0 and group-1 + accesses sidesteps the issue. + + If unsure, say Y. + config QCOM_FALKOR_ERRATUM_1003 bool "Falkor E1003: Incorrect translation due to ASID change" default y @@ -983,7 +1001,7 @@ config RANDOMIZE_BASE config RANDOMIZE_MODULE_REGION_FULL bool "Randomize the module region independently from the core kernel" - depends on RANDOMIZE_BASE && !DYNAMIC_FTRACE + depends on RANDOMIZE_BASE default y help Randomizes the location of the module region without considering the diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index f839ecd919f9..9b41f1e3b1a0 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -52,17 +52,19 @@ KBUILD_AFLAGS += $(lseinstr) $(brokengasinst) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian +CHECKFLAGS += -D__AARCH64EB__ AS += -EB LD += -EB UTS_MACHINE := aarch64_be else KBUILD_CPPFLAGS += -mlittle-endian +CHECKFLAGS += -D__AARCH64EL__ AS += -EL LD += -EL UTS_MACHINE := aarch64 endif -CHECKFLAGS += -D__aarch64__ +CHECKFLAGS += -D__aarch64__ -m64 ifeq ($(CONFIG_ARM64_MODULE_CMODEL_LARGE), y) KBUILD_CFLAGS_MODULE += -mcmodel=large @@ -70,6 +72,9 @@ endif ifeq ($(CONFIG_ARM64_MODULE_PLTS),y) KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/arm64/kernel/module.lds +ifeq ($(CONFIG_DYNAMIC_FTRACE),y) +KBUILD_LDFLAGS_MODULE += $(objtree)/arch/arm64/kernel/ftrace-mod.o +endif endif # Default value diff --git a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts index afae4de6e53f..2b526304ed27 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts +++ b/arch/arm64/boot/dts/hisilicon/hi6220-hikey.dts @@ -141,6 +141,8 @@ bluetooth { compatible = "ti,wl1835-st"; enable-gpios = <&gpio1 7 GPIO_ACTIVE_HIGH>; + clocks = <&pmic>; + clock-names = "ext_clock"; }; }; diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index c98e7e849f06..8550408735a0 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -82,7 +82,8 @@ ENTRY(sha1_ce_transform) ldr dgb, [x0, #16] /* load sha1_ce_state::finalize */ - ldr w4, [x0, #:lo12:sha1_ce_offsetof_finalize] + ldr_l w4, sha1_ce_offsetof_finalize, x4 + ldr w4, [x0, x4] /* load input */ 0: ld1 {v8.4s-v11.4s}, [x1], #64 @@ -132,7 +133,8 @@ CPU_LE( rev32 v11.16b, v11.16b ) * the padding is handled by the C code in that case. */ cbz x4, 3f - ldr x4, [x0, #:lo12:sha1_ce_offsetof_count] + ldr_l w4, sha1_ce_offsetof_count, x4 + ldr x4, [x0, x4] movi v9.2d, #0 mov x8, #0x80000000 movi v10.2d, #0 diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index aefda9868627..ea319c055f5d 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -17,9 +17,6 @@ #include <linux/crypto.h> #include <linux/module.h> -#define ASM_EXPORT(sym, val) \ - asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val)); - MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); @@ -32,6 +29,9 @@ struct sha1_ce_state { asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, int blocks); +const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); +const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize); + static int sha1_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { @@ -52,11 +52,6 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, struct sha1_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); - ASM_EXPORT(sha1_ce_offsetof_count, - offsetof(struct sha1_ce_state, sst.count)); - ASM_EXPORT(sha1_ce_offsetof_finalize, - offsetof(struct sha1_ce_state, finalize)); - /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S index 01cfee066837..679c6c002f4f 100644 --- a/arch/arm64/crypto/sha2-ce-core.S +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -88,7 +88,8 @@ ENTRY(sha2_ce_transform) ld1 {dgav.4s, dgbv.4s}, [x0] /* load sha256_ce_state::finalize */ - ldr w4, [x0, #:lo12:sha256_ce_offsetof_finalize] + ldr_l w4, sha256_ce_offsetof_finalize, x4 + ldr w4, [x0, x4] /* load input */ 0: ld1 {v16.4s-v19.4s}, [x1], #64 @@ -136,7 +137,8 @@ CPU_LE( rev32 v19.16b, v19.16b ) * the padding is handled by the C code in that case. */ cbz x4, 3f - ldr x4, [x0, #:lo12:sha256_ce_offsetof_count] + ldr_l w4, sha256_ce_offsetof_count, x4 + ldr x4, [x0, x4] movi v17.2d, #0 mov x8, #0x80000000 movi v18.2d, #0 diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 7cd587564a41..0ed9486f75dd 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -17,9 +17,6 @@ #include <linux/crypto.h> #include <linux/module.h> -#define ASM_EXPORT(sym, val) \ - asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val)); - MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); @@ -32,6 +29,11 @@ struct sha256_ce_state { asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, int blocks); +const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, + sst.count); +const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state, + finalize); + static int sha256_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { @@ -52,11 +54,6 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, struct sha256_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); - ASM_EXPORT(sha256_ce_offsetof_count, - offsetof(struct sha256_ce_state, sst.count)); - ASM_EXPORT(sha256_ce_offsetof_finalize, - offsetof(struct sha256_ce_state, finalize)); - /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 1a98bc8602a2..8cef47fa2218 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -89,7 +89,7 @@ static inline void gic_write_ctlr(u32 val) static inline void gic_write_grpen1(u32 val) { - write_sysreg_s(val, SYS_ICC_GRPEN1_EL1); + write_sysreg_s(val, SYS_ICC_IGRPEN1_EL1); isb(); } diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h index 09f65339d66d..0b6f5a7d4027 100644 --- a/arch/arm64/include/asm/checksum.h +++ b/arch/arm64/include/asm/checksum.h @@ -42,7 +42,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) } while (--ihl); sum += ((sum >> 32) | (sum << 32)); - return csum_fold(sum >> 32); + return csum_fold((__force u32)(sum >> 32)); } #define ip_fast_csum ip_fast_csum diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index b3aab8a17868..8d2272c6822c 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -38,7 +38,8 @@ #define ARM64_WORKAROUND_REPEAT_TLBI 17 #define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18 #define ARM64_WORKAROUND_858921 19 +#define ARM64_WORKAROUND_CAVIUM_30115 20 -#define ARM64_NCAPS 20 +#define ARM64_NCAPS 21 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 0984d1b3a8f2..235e77d98261 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -86,6 +86,7 @@ #define CAVIUM_CPU_PART_THUNDERX 0x0A1 #define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 +#define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3 #define BRCM_CPU_PART_VULCAN 0x516 @@ -96,6 +97,7 @@ #define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) +#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index 5392dbeffa45..f72779aad276 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -48,8 +48,6 @@ void arch_teardown_dma_ops(struct device *dev); /* do not use this function in a driver */ static inline bool is_device_dma_coherent(struct device *dev) { - if (!dev) - return false; return dev->archdata.dma_coherent; } diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 5d1700425efe..ac3fb7441510 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -142,6 +142,7 @@ typedef struct user_fpsimd_state elf_fpregset_t; ({ \ clear_bit(TIF_32BIT, ¤t->mm->context.flags); \ clear_thread_flag(TIF_32BIT); \ + current->personality &= ~READ_IMPLIES_EXEC; \ }) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ @@ -187,6 +188,11 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; ((x)->e_flags & EF_ARM_EABI_MASK)) #define compat_start_thread compat_start_thread +/* + * Unlike the native SET_PERSONALITY macro, the compat version inherits + * READ_IMPLIES_EXEC across a fork() since this is the behaviour on + * arch/arm/. + */ #define COMPAT_SET_PERSONALITY(ex) \ ({ \ set_bit(TIF_32BIT, ¤t->mm->context.flags); \ diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 85997c0e5443..8cabd57b6348 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -19,6 +19,7 @@ #define __ASM_ESR_H #include <asm/memory.h> +#include <asm/sysreg.h> #define ESR_ELx_EC_UNKNOWN (0x00) #define ESR_ELx_EC_WFx (0x01) @@ -83,6 +84,7 @@ #define ESR_ELx_WNR (UL(1) << 6) /* Shared ISS field definitions for Data/Instruction aborts */ +#define ESR_ELx_FnV (UL(1) << 10) #define ESR_ELx_EA (UL(1) << 9) #define ESR_ELx_S1PTW (UL(1) << 7) @@ -181,6 +183,29 @@ #define ESR_ELx_SYS64_ISS_SYS_CNTFRQ (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \ ESR_ELx_SYS64_ISS_DIR_READ) +#define esr_sys64_to_sysreg(e) \ + sys_reg((((e) & ESR_ELx_SYS64_ISS_OP0_MASK) >> \ + ESR_ELx_SYS64_ISS_OP0_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \ + ESR_ELx_SYS64_ISS_OP1_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \ + ESR_ELx_SYS64_ISS_CRN_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \ + ESR_ELx_SYS64_ISS_CRM_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ + ESR_ELx_SYS64_ISS_OP2_SHIFT)) + +#define esr_cp15_to_sysreg(e) \ + sys_reg(3, \ + (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \ + ESR_ELx_SYS64_ISS_OP1_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \ + ESR_ELx_SYS64_ISS_CRN_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \ + ESR_ELx_SYS64_ISS_CRM_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ + ESR_ELx_SYS64_ISS_OP2_SHIFT)) + #ifndef __ASSEMBLY__ #include <asm/types.h> diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 85c4a8981d47..f32b42e8725d 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -48,16 +48,16 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; + int oparg = (int)(encoded_op << 8) >> 20; + int cmparg = (int)(encoded_op << 20) >> 20; int oldval = 0, ret, tmp; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; + oparg = 1U << (oparg & 0x1f); if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 6e99978e83bd..61d694c2eae5 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -204,6 +204,16 @@ #define FSC_FAULT ESR_ELx_FSC_FAULT #define FSC_ACCESS ESR_ELx_FSC_ACCESS #define FSC_PERM ESR_ELx_FSC_PERM +#define FSC_SEA ESR_ELx_FSC_EXTABT +#define FSC_SEA_TTW0 (0x14) +#define FSC_SEA_TTW1 (0x15) +#define FSC_SEA_TTW2 (0x16) +#define FSC_SEA_TTW3 (0x17) +#define FSC_SECC (0x18) +#define FSC_SECC_TTW0 (0x1c) +#define FSC_SECC_TTW1 (0x1d) +#define FSC_SECC_TTW2 (0x1e) +#define FSC_SECC_TTW3 (0x1f) /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 1f252a95bc02..d68630007b14 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -42,7 +42,9 @@ #define KVM_VCPU_MAX_FEATURES 4 -#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SLEEP \ + KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -334,8 +336,6 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); -void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu); -void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu); u64 __kvm_call_hyp(void *hypfn, ...); #define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index b18e852d27e8..4572a9b560fa 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -127,6 +127,7 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); void __vgic_v3_save_state(struct kvm_vcpu *vcpu); void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); +int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); void __timer_save_state(struct kvm_vcpu *vcpu); void __timer_restore_state(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index d57693f5d4ec..19bd97671bb8 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -30,6 +30,9 @@ struct mod_plt_sec { struct mod_arch_specific { struct mod_plt_sec core; struct mod_plt_sec init; + + /* for CONFIG_DYNAMIC_FTRACE */ + void *ftrace_trampoline; }; #endif diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c213fdbd056c..6eae342ced6b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -441,7 +441,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) -#define pud_present(pud) (pud_val(pud)) +#define pud_present(pud) pte_present(pud_pte(pud)) static inline void set_pud(pud_t *pudp, pud_t pud) { diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 9428b93fefb2..64c9e78f9882 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -104,6 +104,9 @@ struct thread_struct { #define task_user_tls(t) (&(t)->thread.tp_value) #endif +/* Sync TPIDR_EL0 back to thread_struct for current */ +void tls_preserve_current_state(void); + #define INIT_THREAD { } static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 801a16dbbdf6..5b6eafccc5d8 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -30,5 +30,6 @@ struct stackframe { extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); +extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk); #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b4d13d9267ff..16e44fa9b3b6 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -180,14 +180,31 @@ #define SYS_VBAR_EL1 sys_reg(3, 0, 12, 0, 0) +#define SYS_ICC_IAR0_EL1 sys_reg(3, 0, 12, 8, 0) +#define SYS_ICC_EOIR0_EL1 sys_reg(3, 0, 12, 8, 1) +#define SYS_ICC_HPPIR0_EL1 sys_reg(3, 0, 12, 8, 2) +#define SYS_ICC_BPR0_EL1 sys_reg(3, 0, 12, 8, 3) +#define SYS_ICC_AP0Rn_EL1(n) sys_reg(3, 0, 12, 8, 4 | n) +#define SYS_ICC_AP0R0_EL1 SYS_ICC_AP0Rn_EL1(0) +#define SYS_ICC_AP0R1_EL1 SYS_ICC_AP0Rn_EL1(1) +#define SYS_ICC_AP0R2_EL1 SYS_ICC_AP0Rn_EL1(2) +#define SYS_ICC_AP0R3_EL1 SYS_ICC_AP0Rn_EL1(3) +#define SYS_ICC_AP1Rn_EL1(n) sys_reg(3, 0, 12, 9, n) +#define SYS_ICC_AP1R0_EL1 SYS_ICC_AP1Rn_EL1(0) +#define SYS_ICC_AP1R1_EL1 SYS_ICC_AP1Rn_EL1(1) +#define SYS_ICC_AP1R2_EL1 SYS_ICC_AP1Rn_EL1(2) +#define SYS_ICC_AP1R3_EL1 SYS_ICC_AP1Rn_EL1(3) #define SYS_ICC_DIR_EL1 sys_reg(3, 0, 12, 11, 1) +#define SYS_ICC_RPR_EL1 sys_reg(3, 0, 12, 11, 3) #define SYS_ICC_SGI1R_EL1 sys_reg(3, 0, 12, 11, 5) #define SYS_ICC_IAR1_EL1 sys_reg(3, 0, 12, 12, 0) #define SYS_ICC_EOIR1_EL1 sys_reg(3, 0, 12, 12, 1) +#define SYS_ICC_HPPIR1_EL1 sys_reg(3, 0, 12, 12, 2) #define SYS_ICC_BPR1_EL1 sys_reg(3, 0, 12, 12, 3) #define SYS_ICC_CTLR_EL1 sys_reg(3, 0, 12, 12, 4) #define SYS_ICC_SRE_EL1 sys_reg(3, 0, 12, 12, 5) -#define SYS_ICC_GRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) +#define SYS_ICC_IGRPEN0_EL1 sys_reg(3, 0, 12, 12, 6) +#define SYS_ICC_IGRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) #define SYS_CONTEXTIDR_EL1 sys_reg(3, 0, 13, 0, 1) #define SYS_TPIDR_EL1 sys_reg(3, 0, 13, 0, 4) @@ -287,8 +304,8 @@ #define SCTLR_ELx_M 1 #define SCTLR_EL2_RES1 ((1 << 4) | (1 << 5) | (1 << 11) | (1 << 16) | \ - (1 << 16) | (1 << 18) | (1 << 22) | (1 << 23) | \ - (1 << 28) | (1 << 29)) + (1 << 18) | (1 << 22) | (1 << 23) | (1 << 28) | \ + (1 << 29)) #define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ SCTLR_ELx_SA | SCTLR_ELx_I) diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h index bc812435bc76..07aa8e3c5630 100644 --- a/arch/arm64/include/asm/system_misc.h +++ b/arch/arm64/include/asm/system_misc.h @@ -40,7 +40,7 @@ void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int, int sig, int code, const char *name); struct mm_struct; -extern void show_pte(struct mm_struct *mm, unsigned long addr); +extern void show_pte(unsigned long addr); extern void __show_regs(struct pt_regs *); extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); @@ -56,6 +56,8 @@ extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); __show_ratelimited; \ }) +int handle_guest_sea(phys_addr_t addr, unsigned int esr); + #endif /* __ASSEMBLY__ */ #endif /* __ASM_SYSTEM_MISC_H */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 70eea2ecc663..9f3ca24bbcc6 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -232,6 +232,9 @@ struct kvm_arch_memory_slot { #define KVM_ARM_VCPU_PMU_V3_CTRL 0 #define KVM_ARM_VCPU_PMU_V3_IRQ 0 #define KVM_ARM_VCPU_PMU_V3_INIT 1 +#define KVM_ARM_VCPU_TIMER_CTRL 1 +#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 +#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index ee469be1ae1d..f0a76b9fcd6e 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -34,6 +34,26 @@ struct sigcontext { }; /* + * Allocation of __reserved[]: + * (Note: records do not necessarily occur in the order shown here.) + * + * size description + * + * 0x210 fpsimd_context + * 0x10 esr_context + * 0x20 extra_context (optional) + * 0x10 terminator (null _aarch64_ctx) + * + * 0xdb0 (reserved for future allocation) + * + * New records that can exceed this space need to be opt-in for userspace, so + * that an expanded signal frame is not generated unexpectedly. The mechanism + * for opting in will depend on the extension that generates each new record. + * The above table documents the maximum set and sizes of records than can be + * generated when userspace does not opt in for any such extension. + */ + +/* * Header to be used at the beginning of structures extending the user * context. Such structures must be placed after the rt_sigframe on the stack * and be 16-byte aligned. The last structure must be a dummy one with the @@ -61,4 +81,39 @@ struct esr_context { __u64 esr; }; +/* + * extra_context: describes extra space in the signal frame for + * additional structures that don't fit in sigcontext.__reserved[]. + * + * Note: + * + * 1) fpsimd_context, esr_context and extra_context must be placed in + * sigcontext.__reserved[] if present. They cannot be placed in the + * extra space. Any other record can be placed either in the extra + * space or in sigcontext.__reserved[], unless otherwise specified in + * this file. + * + * 2) There must not be more than one extra_context. + * + * 3) If extra_context is present, it must be followed immediately in + * sigcontext.__reserved[] by the terminating null _aarch64_ctx. + * + * 4) The extra space to which datap points must start at the first + * 16-byte aligned address immediately after the terminating null + * _aarch64_ctx that follows the extra_context structure in + * __reserved[]. The extra space may overrun the end of __reserved[], + * as indicated by a sufficiently large value for the size field. + * + * 5) The extra space must itself be terminated with a null + * _aarch64_ctx. + */ +#define EXTRA_MAGIC 0x45585401 + +struct extra_context { + struct _aarch64_ctx head; + __u64 datap; /* 16-byte aligned pointer to extra space cast to __u64 */ + __u32 size; /* size in bytes of the extra space */ + __u32 __reserved[3]; +}; + #endif /* _UAPI__ASM_SIGCONTEXT_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 1dcb69d3d0e5..f2b4e816b6de 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -62,3 +62,6 @@ extra-y += $(head-y) vmlinux.lds ifeq ($(CONFIG_DEBUG_EFI),y) AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\"" endif + +# will be included by each individual module but not by the core kernel itself +extra-$(CONFIG_DYNAMIC_FTRACE) += ftrace-mod.o diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c index 1f5655cd9cc9..98a20e58758b 100644 --- a/arch/arm64/kernel/acpi_parking_protocol.c +++ b/arch/arm64/kernel/acpi_parking_protocol.c @@ -71,7 +71,7 @@ static int acpi_parking_protocol_cpu_boot(unsigned int cpu) { struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; struct parking_protocol_mailbox __iomem *mailbox; - __le32 cpu_id; + u32 cpu_id; /* * Map mailbox memory with attribute device nGnRE (ie ioremap - @@ -123,9 +123,9 @@ static void acpi_parking_protocol_cpu_postboot(void) int cpu = smp_processor_id(); struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; struct parking_protocol_mailbox __iomem *mailbox = cpu_entry->mailbox; - __le64 entry_point; + u64 entry_point; - entry_point = readl_relaxed(&mailbox->entry_point); + entry_point = readq_relaxed(&mailbox->entry_point); /* * Check if firmware has cleared the entry_point as expected * by the protocol specification. diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 8840c109c5d6..6dd0a3a3e5c9 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -28,7 +28,7 @@ #include <asm/sections.h> #include <linux/stop_machine.h> -#define __ALT_PTR(a,f) (u32 *)((void *)&(a)->f + (a)->f) +#define __ALT_PTR(a,f) ((void *)&(a)->f + (a)->f) #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) #define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) @@ -60,7 +60,7 @@ static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc) #define align_down(x, a) ((unsigned long)(x) & ~(((unsigned long)(a)) - 1)) -static u32 get_alt_insn(struct alt_instr *alt, u32 *insnptr, u32 *altinsnptr) +static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnptr) { u32 insn; @@ -109,7 +109,7 @@ static void __apply_alternatives(void *alt_region, bool use_linear_alias) { struct alt_instr *alt; struct alt_region *region = alt_region; - u32 *origptr, *replptr, *updptr; + __le32 *origptr, *replptr, *updptr; for (alt = region->begin; alt < region->end; alt++) { u32 insn; @@ -124,7 +124,7 @@ static void __apply_alternatives(void *alt_region, bool use_linear_alias) origptr = ALT_ORIG_PTR(alt); replptr = ALT_REPL_PTR(alt); - updptr = use_linear_alias ? (u32 *)lm_alias(origptr) : origptr; + updptr = use_linear_alias ? lm_alias(origptr) : origptr; nr_inst = alt->alt_len / sizeof(insn); for (i = 0; i < nr_inst; i++) { diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 2ed2a7657711..0e27f86ee709 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -133,6 +133,27 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x00), }, #endif +#ifdef CONFIG_CAVIUM_ERRATUM_30115 + { + /* Cavium ThunderX, T88 pass 1.x - 2.2 */ + .desc = "Cavium erratum 30115", + .capability = ARM64_WORKAROUND_CAVIUM_30115, + MIDR_RANGE(MIDR_THUNDERX, 0x00, + (1 << MIDR_VARIANT_SHIFT) | 2), + }, + { + /* Cavium ThunderX, T81 pass 1.0 - 1.2 */ + .desc = "Cavium erratum 30115", + .capability = ARM64_WORKAROUND_CAVIUM_30115, + MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x02), + }, + { + /* Cavium ThunderX, T83 pass 1.0 */ + .desc = "Cavium erratum 30115", + .capability = ARM64_WORKAROUND_CAVIUM_30115, + MIDR_RANGE(MIDR_THUNDERX_83XX, 0x00, 0x00), + }, +#endif { .desc = "Mismatched cache line size", .capability = ARM64_MISMATCHED_CACHE_LINE_SIZE, diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 817ce3365e20..9f9e0064c8c1 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -51,6 +51,25 @@ unsigned int compat_elf_hwcap2 __read_mostly; DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); EXPORT_SYMBOL(cpu_hwcaps); +static int dump_cpu_hwcaps(struct notifier_block *self, unsigned long v, void *p) +{ + /* file-wide pr_fmt adds "CPU features: " prefix */ + pr_emerg("0x%*pb\n", ARM64_NCAPS, &cpu_hwcaps); + return 0; +} + +static struct notifier_block cpu_hwcaps_notifier = { + .notifier_call = dump_cpu_hwcaps +}; + +static int __init register_cpu_hwcaps_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &cpu_hwcaps_notifier); + return 0; +} +__initcall(register_cpu_hwcaps_dumper); + DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS); EXPORT_SYMBOL(cpu_hwcap_keys); @@ -639,8 +658,10 @@ void update_cpu_features(int cpu, * Mismatched CPU features are a recipe for disaster. Don't even * pretend to support them. */ - WARN_TAINT_ONCE(taint, TAINT_CPU_OUT_OF_SPEC, - "Unsupported CPU feature variation.\n"); + if (taint) { + pr_warn_once("Unsupported CPU feature variation detected.\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } } u64 read_sanitised_ftr_reg(u32 id) diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 68b1f364c515..f495ee5049fd 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -227,7 +227,7 @@ static struct attribute *cpuregs_id_attrs[] = { NULL }; -static struct attribute_group cpuregs_attr_group = { +static const struct attribute_group cpuregs_attr_group = { .attrs = cpuregs_id_attrs, .name = "identification" }; diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index d618e25c3de1..c7ef99904934 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -341,20 +341,22 @@ int aarch32_break_handler(struct pt_regs *regs) if (compat_thumb_mode(regs)) { /* get 16-bit Thumb instruction */ - get_user(thumb_instr, (u16 __user *)pc); - thumb_instr = le16_to_cpu(thumb_instr); + __le16 instr; + get_user(instr, (__le16 __user *)pc); + thumb_instr = le16_to_cpu(instr); if (thumb_instr == AARCH32_BREAK_THUMB2_LO) { /* get second half of 32-bit Thumb-2 instruction */ - get_user(thumb_instr, (u16 __user *)(pc + 2)); - thumb_instr = le16_to_cpu(thumb_instr); + get_user(instr, (__le16 __user *)(pc + 2)); + thumb_instr = le16_to_cpu(instr); bp = thumb_instr == AARCH32_BREAK_THUMB2_HI; } else { bp = thumb_instr == AARCH32_BREAK_THUMB; } } else { /* 32-bit ARM instruction */ - get_user(arm_instr, (u32 __user *)pc); - arm_instr = le32_to_cpu(arm_instr); + __le32 instr; + get_user(instr, (__le32 __user *)pc); + arm_instr = le32_to_cpu(instr); bp = (arm_instr & ~0xf0000000) == AARCH32_BREAK_ARM; } diff --git a/arch/arm64/kernel/ftrace-mod.S b/arch/arm64/kernel/ftrace-mod.S new file mode 100644 index 000000000000..00c4025be4ff --- /dev/null +++ b/arch/arm64/kernel/ftrace-mod.S @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .section ".text.ftrace_trampoline", "ax" + .align 3 +0: .quad 0 +__ftrace_trampoline: + ldr x16, 0b + br x16 +ENDPROC(__ftrace_trampoline) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 40ad08ac569a..c13b1fca0e5b 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -10,10 +10,12 @@ */ #include <linux/ftrace.h> +#include <linux/module.h> #include <linux/swab.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> +#include <asm/debug-monitors.h> #include <asm/ftrace.h> #include <asm/insn.h> @@ -70,6 +72,58 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned long pc = rec->ip; u32 old, new; + long offset = (long)pc - (long)addr; + + if (offset < -SZ_128M || offset >= SZ_128M) { +#ifdef CONFIG_ARM64_MODULE_PLTS + unsigned long *trampoline; + struct module *mod; + + /* + * On kernels that support module PLTs, the offset between the + * branch instruction and its target may legally exceed the + * range of an ordinary relative 'bl' opcode. In this case, we + * need to branch via a trampoline in the module. + * + * NOTE: __module_text_address() must be called with preemption + * disabled, but we can rely on ftrace_lock to ensure that 'mod' + * retains its validity throughout the remainder of this code. + */ + preempt_disable(); + mod = __module_text_address(pc); + preempt_enable(); + + if (WARN_ON(!mod)) + return -EINVAL; + + /* + * There is only one ftrace trampoline per module. For now, + * this is not a problem since on arm64, all dynamic ftrace + * invocations are routed via ftrace_caller(). This will need + * to be revisited if support for multiple ftrace entry points + * is added in the future, but for now, the pr_err() below + * deals with a theoretical issue only. + */ + trampoline = (unsigned long *)mod->arch.ftrace_trampoline; + if (trampoline[0] != addr) { + if (trampoline[0] != 0) { + pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n"); + return -EINVAL; + } + + /* point the trampoline to our ftrace entry point */ + module_disable_ro(mod); + trampoline[0] = addr; + module_enable_ro(mod, true); + + /* update trampoline before patching in the branch */ + smp_wmb(); + } + addr = (unsigned long)&trampoline[1]; +#else /* CONFIG_ARM64_MODULE_PLTS */ + return -EINVAL; +#endif /* CONFIG_ARM64_MODULE_PLTS */ + } old = aarch64_insn_gen_nop(); new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK); @@ -84,12 +138,55 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { unsigned long pc = rec->ip; - u32 old, new; + bool validate = true; + u32 old = 0, new; + long offset = (long)pc - (long)addr; + + if (offset < -SZ_128M || offset >= SZ_128M) { +#ifdef CONFIG_ARM64_MODULE_PLTS + u32 replaced; + + /* + * 'mod' is only set at module load time, but if we end up + * dealing with an out-of-range condition, we can assume it + * is due to a module being loaded far away from the kernel. + */ + if (!mod) { + preempt_disable(); + mod = __module_text_address(pc); + preempt_enable(); + + if (WARN_ON(!mod)) + return -EINVAL; + } + + /* + * The instruction we are about to patch may be a branch and + * link instruction that was redirected via a PLT entry. In + * this case, the normal validation will fail, but we can at + * least check that we are dealing with a branch and link + * instruction that points into the right module. + */ + if (aarch64_insn_read((void *)pc, &replaced)) + return -EFAULT; + + if (!aarch64_insn_is_bl(replaced) || + !within_module(pc + aarch64_get_branch_offset(replaced), + mod)) + return -EINVAL; + + validate = false; +#else /* CONFIG_ARM64_MODULE_PLTS */ + return -EINVAL; +#endif /* CONFIG_ARM64_MODULE_PLTS */ + } else { + old = aarch64_insn_gen_branch_imm(pc, addr, + AARCH64_INSN_BRANCH_LINK); + } - old = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK); new = aarch64_insn_gen_nop(); - return ftrace_modify_code(pc, old, new, true); + return ftrace_modify_code(pc, old, new, validate); } void arch_ftrace_update_code(int command) diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index cd872133e88e..2718a77da165 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -117,7 +117,7 @@ static void __kprobes patch_unmap(int fixmap) int __kprobes aarch64_insn_read(void *addr, u32 *insnp) { int ret; - u32 val; + __le32 val; ret = probe_kernel_read(&val, addr, AARCH64_INSN_SIZE); if (!ret) @@ -126,7 +126,7 @@ int __kprobes aarch64_insn_read(void *addr, u32 *insnp) return ret; } -static int __kprobes __aarch64_insn_write(void *addr, u32 insn) +static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) { void *waddr = addr; unsigned long flags = 0; @@ -145,8 +145,7 @@ static int __kprobes __aarch64_insn_write(void *addr, u32 insn) int __kprobes aarch64_insn_write(void *addr, u32 insn) { - insn = cpu_to_le32(insn); - return __aarch64_insn_write(addr, insn); + return __aarch64_insn_write(addr, cpu_to_le32(insn)); } static bool __kprobes __aarch64_insn_hotpatch_safe(u32 insn) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index d7e90d97f5c4..a9710efb8c01 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -27,7 +27,7 @@ u16 __initdata memstart_offset_seed; static __init u64 get_kaslr_seed(void *fdt) { int node, len; - u64 *prop; + fdt64_t *prop; u64 ret; node = fdt_path_offset(fdt, "/chosen"); diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index f035ff6fb223..f469e0435903 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -74,7 +74,7 @@ enum aarch64_reloc_op { RELOC_OP_PAGE, }; -static u64 do_reloc(enum aarch64_reloc_op reloc_op, void *place, u64 val) +static u64 do_reloc(enum aarch64_reloc_op reloc_op, __le32 *place, u64 val) { switch (reloc_op) { case RELOC_OP_ABS: @@ -121,12 +121,12 @@ enum aarch64_insn_movw_imm_type { AARCH64_INSN_IMM_MOVKZ, }; -static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, +static int reloc_insn_movw(enum aarch64_reloc_op op, __le32 *place, u64 val, int lsb, enum aarch64_insn_movw_imm_type imm_type) { u64 imm; s64 sval; - u32 insn = le32_to_cpu(*(u32 *)place); + u32 insn = le32_to_cpu(*place); sval = do_reloc(op, place, val); imm = sval >> lsb; @@ -154,7 +154,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, /* Update the instruction with the new encoding. */ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); - *(u32 *)place = cpu_to_le32(insn); + *place = cpu_to_le32(insn); if (imm > U16_MAX) return -ERANGE; @@ -162,12 +162,12 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, return 0; } -static int reloc_insn_imm(enum aarch64_reloc_op op, void *place, u64 val, +static int reloc_insn_imm(enum aarch64_reloc_op op, __le32 *place, u64 val, int lsb, int len, enum aarch64_insn_imm_type imm_type) { u64 imm, imm_mask; s64 sval; - u32 insn = le32_to_cpu(*(u32 *)place); + u32 insn = le32_to_cpu(*place); /* Calculate the relocation value. */ sval = do_reloc(op, place, val); @@ -179,7 +179,7 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, void *place, u64 val, /* Update the instruction's immediate field. */ insn = aarch64_insn_encode_immediate(imm_type, insn, imm); - *(u32 *)place = cpu_to_le32(insn); + *place = cpu_to_le32(insn); /* * Extract the upper value bits (including the sign bit) and @@ -420,8 +420,12 @@ int module_finalize(const Elf_Ehdr *hdr, for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) { if (strcmp(".altinstructions", secstrs + s->sh_name) == 0) { apply_alternatives((void *)s->sh_addr, s->sh_size); - return 0; } +#ifdef CONFIG_ARM64_MODULE_PLTS + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE) && + !strcmp(".text.ftrace_trampoline", secstrs + s->sh_name)) + me->arch.ftrace_trampoline = (void *)s->sh_addr; +#endif } return 0; diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index c7e3e6387a49..a7f6c01c13b9 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -108,7 +108,10 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) if (!acpi_disabled) { struct pci_config_window *cfg = bridge->bus->sysdata; struct acpi_device *adev = to_acpi_device(cfg->parent); + struct device *bus_dev = &bridge->bus->dev; + ACPI_COMPANION_SET(&bridge->dev, adev); + set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); } return 0; diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 83a1b1ad189f..b5798ba21189 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -529,7 +529,7 @@ static struct attribute_group armv8_pmuv3_events_attr_group = { .is_visible = armv8pmu_event_attr_is_visible, }; -PMU_FORMAT_ATTR(event, "config:0-9"); +PMU_FORMAT_ATTR(event, "config:0-15"); static struct attribute *armv8_pmuv3_format_attrs[] = { &format_attr_event.attr, diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index c5c45942fb6e..d849d9804011 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -522,9 +522,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) pr_err("current sp %lx does not match saved sp %lx\n", orig_sp, stack_addr); pr_err("Saved registers for jprobe %p\n", jp); - show_regs(saved_regs); + __show_regs(saved_regs); pr_err("Current registers\n"); - show_regs(regs); + __show_regs(regs); BUG(); } unpause_graph_tracing(); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index ae2a835898d7..659ae8094ed5 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -210,6 +210,7 @@ void __show_regs(struct pt_regs *regs) void show_regs(struct pt_regs * regs) { __show_regs(regs); + dump_backtrace(regs, NULL); } static void tls_thread_flush(void) @@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, return 0; } +void tls_preserve_current_state(void) +{ + *task_user_tls(current) = read_sysreg(tpidr_el0); +} + static void tls_thread_switch(struct task_struct *next) { unsigned long tpidr, tpidrro; - tpidr = read_sysreg(tpidr_el0); - *task_user_tls(current) = tpidr; + tls_preserve_current_state(); tpidr = *task_user_tls(next); tpidrro = is_compat_thread(task_thread_info(next)) ? diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index c142459a88f3..1b38c0150aec 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -623,6 +623,10 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset, { struct user_fpsimd_state *uregs; uregs = &target->thread.fpsimd_state.user_fpsimd; + + if (target == current) + fpsimd_preserve_current_state(); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, 0, -1); } @@ -648,6 +652,10 @@ static int tls_get(struct task_struct *target, const struct user_regset *regset, void *kbuf, void __user *ubuf) { unsigned long *tls = &target->thread.tp_value; + + if (target == current) + tls_preserve_current_state(); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, tls, 0, -1); } @@ -894,21 +902,27 @@ static int compat_vfp_get(struct task_struct *target, { struct user_fpsimd_state *uregs; compat_ulong_t fpscr; - int ret; + int ret, vregs_end_pos; uregs = &target->thread.fpsimd_state.user_fpsimd; + if (target == current) + fpsimd_preserve_current_state(); + /* * The VFP registers are packed into the fpsimd_state, so they all sit * nicely together for us. We just need to create the fpscr separately. */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, 0, - VFP_STATE_SIZE - sizeof(compat_ulong_t)); + vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t); + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, uregs, + 0, vregs_end_pos); if (count && !ret) { fpscr = (uregs->fpsr & VFP_FPSCR_STAT_MASK) | (uregs->fpcr & VFP_FPSCR_CTRL_MASK); - ret = put_user(fpscr, (compat_ulong_t *)ubuf); + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpscr, + vregs_end_pos, VFP_STATE_SIZE); } return ret; @@ -921,20 +935,21 @@ static int compat_vfp_set(struct task_struct *target, { struct user_fpsimd_state *uregs; compat_ulong_t fpscr; - int ret; - - if (pos + count > VFP_STATE_SIZE) - return -EIO; + int ret, vregs_end_pos; uregs = &target->thread.fpsimd_state.user_fpsimd; + vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, uregs, 0, - VFP_STATE_SIZE - sizeof(compat_ulong_t)); + vregs_end_pos); if (count && !ret) { - ret = get_user(fpscr, (compat_ulong_t *)ubuf); - uregs->fpsr = fpscr & VFP_FPSCR_STAT_MASK; - uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpscr, + vregs_end_pos, VFP_STATE_SIZE); + if (!ret) { + uregs->fpsr = fpscr & VFP_FPSCR_STAT_MASK; + uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; + } } fpsimd_flush_task_state(target); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 2c822ef94f34..d4b740538ad5 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -194,6 +194,9 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys) } name = of_flat_dt_get_machine_name(); + if (!name) + return; + pr_info("Machine model: %s\n", name); dump_stack_set_arch_desc("%s (DT)", name); } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index c7b6de62f9d3..089c3747995d 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -19,10 +19,14 @@ #include <linux/compat.h> #include <linux/errno.h> +#include <linux/kernel.h> #include <linux/signal.h> #include <linux/personality.h> #include <linux/freezer.h> +#include <linux/stddef.h> #include <linux/uaccess.h> +#include <linux/sizes.h> +#include <linux/string.h> #include <linux/tracehook.h> #include <linux/ratelimit.h> @@ -41,10 +45,133 @@ struct rt_sigframe { struct siginfo info; struct ucontext uc; +}; + +struct frame_record { u64 fp; u64 lr; }; +struct rt_sigframe_user_layout { + struct rt_sigframe __user *sigframe; + struct frame_record __user *next_frame; + + unsigned long size; /* size of allocated sigframe data */ + unsigned long limit; /* largest allowed size */ + + unsigned long fpsimd_offset; + unsigned long esr_offset; + unsigned long extra_offset; + unsigned long end_offset; +}; + +#define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16) +#define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) +#define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) + +static void init_user_layout(struct rt_sigframe_user_layout *user) +{ + const size_t reserved_size = + sizeof(user->sigframe->uc.uc_mcontext.__reserved); + + memset(user, 0, sizeof(*user)); + user->size = offsetof(struct rt_sigframe, uc.uc_mcontext.__reserved); + + user->limit = user->size + reserved_size; + + user->limit -= TERMINATOR_SIZE; + user->limit -= EXTRA_CONTEXT_SIZE; + /* Reserve space for extension and terminator ^ */ +} + +static size_t sigframe_size(struct rt_sigframe_user_layout const *user) +{ + return round_up(max(user->size, sizeof(struct rt_sigframe)), 16); +} + +/* + * Sanity limit on the approximate maximum size of signal frame we'll + * try to generate. Stack alignment padding and the frame record are + * not taken into account. This limit is not a guarantee and is + * NOT ABI. + */ +#define SIGFRAME_MAXSZ SZ_64K + +static int __sigframe_alloc(struct rt_sigframe_user_layout *user, + unsigned long *offset, size_t size, bool extend) +{ + size_t padded_size = round_up(size, 16); + + if (padded_size > user->limit - user->size && + !user->extra_offset && + extend) { + int ret; + + user->limit += EXTRA_CONTEXT_SIZE; + ret = __sigframe_alloc(user, &user->extra_offset, + sizeof(struct extra_context), false); + if (ret) { + user->limit -= EXTRA_CONTEXT_SIZE; + return ret; + } + + /* Reserve space for the __reserved[] terminator */ + user->size += TERMINATOR_SIZE; + + /* + * Allow expansion up to SIGFRAME_MAXSZ, ensuring space for + * the terminator: + */ + user->limit = SIGFRAME_MAXSZ - TERMINATOR_SIZE; + } + + /* Still not enough space? Bad luck! */ + if (padded_size > user->limit - user->size) + return -ENOMEM; + + *offset = user->size; + user->size += padded_size; + + return 0; +} + +/* + * Allocate space for an optional record of <size> bytes in the user + * signal frame. The offset from the signal frame base address to the + * allocated block is assigned to *offset. + */ +static int sigframe_alloc(struct rt_sigframe_user_layout *user, + unsigned long *offset, size_t size) +{ + return __sigframe_alloc(user, offset, size, true); +} + +/* Allocate the null terminator record and prevent further allocations */ +static int sigframe_alloc_end(struct rt_sigframe_user_layout *user) +{ + int ret; + + /* Un-reserve the space reserved for the terminator: */ + user->limit += TERMINATOR_SIZE; + + ret = sigframe_alloc(user, &user->end_offset, + sizeof(struct _aarch64_ctx)); + if (ret) + return ret; + + /* Prevent further allocation: */ + user->limit = user->size; + return 0; +} + +static void __user *apply_user_offset( + struct rt_sigframe_user_layout const *user, unsigned long offset) +{ + char __user *base = (char __user *)user->sigframe; + + return base + offset; +} + static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) { struct fpsimd_state *fpsimd = ¤t->thread.fpsimd_state; @@ -92,12 +219,159 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) return err ? -EFAULT : 0; } +struct user_ctxs { + struct fpsimd_context __user *fpsimd; +}; + +static int parse_user_sigframe(struct user_ctxs *user, + struct rt_sigframe __user *sf) +{ + struct sigcontext __user *const sc = &sf->uc.uc_mcontext; + struct _aarch64_ctx __user *head; + char __user *base = (char __user *)&sc->__reserved; + size_t offset = 0; + size_t limit = sizeof(sc->__reserved); + bool have_extra_context = false; + char const __user *const sfp = (char const __user *)sf; + + user->fpsimd = NULL; + + if (!IS_ALIGNED((unsigned long)base, 16)) + goto invalid; + + while (1) { + int err = 0; + u32 magic, size; + char const __user *userp; + struct extra_context const __user *extra; + u64 extra_datap; + u32 extra_size; + struct _aarch64_ctx const __user *end; + u32 end_magic, end_size; + + if (limit - offset < sizeof(*head)) + goto invalid; + + if (!IS_ALIGNED(offset, 16)) + goto invalid; + + head = (struct _aarch64_ctx __user *)(base + offset); + __get_user_error(magic, &head->magic, err); + __get_user_error(size, &head->size, err); + if (err) + return err; + + if (limit - offset < size) + goto invalid; + + switch (magic) { + case 0: + if (size) + goto invalid; + + goto done; + + case FPSIMD_MAGIC: + if (user->fpsimd) + goto invalid; + + if (size < sizeof(*user->fpsimd)) + goto invalid; + + user->fpsimd = (struct fpsimd_context __user *)head; + break; + + case ESR_MAGIC: + /* ignore */ + break; + + case EXTRA_MAGIC: + if (have_extra_context) + goto invalid; + + if (size < sizeof(*extra)) + goto invalid; + + userp = (char const __user *)head; + + extra = (struct extra_context const __user *)userp; + userp += size; + + __get_user_error(extra_datap, &extra->datap, err); + __get_user_error(extra_size, &extra->size, err); + if (err) + return err; + + /* Check for the dummy terminator in __reserved[]: */ + + if (limit - offset - size < TERMINATOR_SIZE) + goto invalid; + + end = (struct _aarch64_ctx const __user *)userp; + userp += TERMINATOR_SIZE; + + __get_user_error(end_magic, &end->magic, err); + __get_user_error(end_size, &end->size, err); + if (err) + return err; + + if (end_magic || end_size) + goto invalid; + + /* Prevent looping/repeated parsing of extra_context */ + have_extra_context = true; + + base = (__force void __user *)extra_datap; + if (!IS_ALIGNED((unsigned long)base, 16)) + goto invalid; + + if (!IS_ALIGNED(extra_size, 16)) + goto invalid; + + if (base != userp) + goto invalid; + + /* Reject "unreasonably large" frames: */ + if (extra_size > sfp + SIGFRAME_MAXSZ - userp) + goto invalid; + + /* + * Ignore trailing terminator in __reserved[] + * and start parsing extra data: + */ + offset = 0; + limit = extra_size; + continue; + + default: + goto invalid; + } + + if (size < sizeof(*head)) + goto invalid; + + if (limit - offset < size) + goto invalid; + + offset += size; + } + +done: + if (!user->fpsimd) + goto invalid; + + return 0; + +invalid: + return -EINVAL; +} + static int restore_sigframe(struct pt_regs *regs, struct rt_sigframe __user *sf) { sigset_t set; int i, err; - void *aux = sf->uc.uc_mcontext.__reserved; + struct user_ctxs user; err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); if (err == 0) @@ -116,12 +390,11 @@ static int restore_sigframe(struct pt_regs *regs, regs->syscallno = ~0UL; err |= !valid_user_regs(®s->user_regs, current); + if (err == 0) + err = parse_user_sigframe(&user, sf); - if (err == 0) { - struct fpsimd_context *fpsimd_ctx = - container_of(aux, struct fpsimd_context, head); - err |= restore_fpsimd_context(fpsimd_ctx); - } + if (err == 0) + err = restore_fpsimd_context(user.fpsimd); return err; } @@ -162,16 +435,37 @@ badframe: return 0; } -static int setup_sigframe(struct rt_sigframe __user *sf, +/* Determine the layout of optional records in the signal frame */ +static int setup_sigframe_layout(struct rt_sigframe_user_layout *user) +{ + int err; + + err = sigframe_alloc(user, &user->fpsimd_offset, + sizeof(struct fpsimd_context)); + if (err) + return err; + + /* fault information, if valid */ + if (current->thread.fault_code) { + err = sigframe_alloc(user, &user->esr_offset, + sizeof(struct esr_context)); + if (err) + return err; + } + + return sigframe_alloc_end(user); +} + + +static int setup_sigframe(struct rt_sigframe_user_layout *user, struct pt_regs *regs, sigset_t *set) { int i, err = 0; - void *aux = sf->uc.uc_mcontext.__reserved; - struct _aarch64_ctx *end; + struct rt_sigframe __user *sf = user->sigframe; /* set up the stack frame for unwinding */ - __put_user_error(regs->regs[29], &sf->fp, err); - __put_user_error(regs->regs[30], &sf->lr, err); + __put_user_error(regs->regs[29], &user->next_frame->fp, err); + __put_user_error(regs->regs[30], &user->next_frame->lr, err); for (i = 0; i < 31; i++) __put_user_error(regs->regs[i], &sf->uc.uc_mcontext.regs[i], @@ -185,58 +479,103 @@ static int setup_sigframe(struct rt_sigframe __user *sf, err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); if (err == 0) { - struct fpsimd_context *fpsimd_ctx = - container_of(aux, struct fpsimd_context, head); + struct fpsimd_context __user *fpsimd_ctx = + apply_user_offset(user, user->fpsimd_offset); err |= preserve_fpsimd_context(fpsimd_ctx); - aux += sizeof(*fpsimd_ctx); } /* fault information, if valid */ - if (current->thread.fault_code) { - struct esr_context *esr_ctx = - container_of(aux, struct esr_context, head); + if (err == 0 && user->esr_offset) { + struct esr_context __user *esr_ctx = + apply_user_offset(user, user->esr_offset); + __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err); __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err); __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); - aux += sizeof(*esr_ctx); + } + + if (err == 0 && user->extra_offset) { + char __user *sfp = (char __user *)user->sigframe; + char __user *userp = + apply_user_offset(user, user->extra_offset); + + struct extra_context __user *extra; + struct _aarch64_ctx __user *end; + u64 extra_datap; + u32 extra_size; + + extra = (struct extra_context __user *)userp; + userp += EXTRA_CONTEXT_SIZE; + + end = (struct _aarch64_ctx __user *)userp; + userp += TERMINATOR_SIZE; + + /* + * extra_datap is just written to the signal frame. + * The value gets cast back to a void __user * + * during sigreturn. + */ + extra_datap = (__force u64)userp; + extra_size = sfp + round_up(user->size, 16) - userp; + + __put_user_error(EXTRA_MAGIC, &extra->head.magic, err); + __put_user_error(EXTRA_CONTEXT_SIZE, &extra->head.size, err); + __put_user_error(extra_datap, &extra->datap, err); + __put_user_error(extra_size, &extra->size, err); + + /* Add the terminator */ + __put_user_error(0, &end->magic, err); + __put_user_error(0, &end->size, err); } /* set the "end" magic */ - end = aux; - __put_user_error(0, &end->magic, err); - __put_user_error(0, &end->size, err); + if (err == 0) { + struct _aarch64_ctx __user *end = + apply_user_offset(user, user->end_offset); + + __put_user_error(0, &end->magic, err); + __put_user_error(0, &end->size, err); + } return err; } -static struct rt_sigframe __user *get_sigframe(struct ksignal *ksig, - struct pt_regs *regs) +static int get_sigframe(struct rt_sigframe_user_layout *user, + struct ksignal *ksig, struct pt_regs *regs) { unsigned long sp, sp_top; - struct rt_sigframe __user *frame; + int err; + + init_user_layout(user); + err = setup_sigframe_layout(user); + if (err) + return err; sp = sp_top = sigsp(regs->sp, ksig); - sp = (sp - sizeof(struct rt_sigframe)) & ~15; - frame = (struct rt_sigframe __user *)sp; + sp = round_down(sp - sizeof(struct frame_record), 16); + user->next_frame = (struct frame_record __user *)sp; + + sp = round_down(sp, 16) - sigframe_size(user); + user->sigframe = (struct rt_sigframe __user *)sp; /* * Check that we can actually write to the signal frame. */ - if (!access_ok(VERIFY_WRITE, frame, sp_top - sp)) - frame = NULL; + if (!access_ok(VERIFY_WRITE, user->sigframe, sp_top - sp)) + return -EFAULT; - return frame; + return 0; } static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, - void __user *frame, int usig) + struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; regs->regs[0] = usig; - regs->sp = (unsigned long)frame; - regs->regs[29] = regs->sp + offsetof(struct rt_sigframe, fp); + regs->sp = (unsigned long)user->sigframe; + regs->regs[29] = (unsigned long)&user->next_frame->fp; regs->pc = (unsigned long)ka->sa.sa_handler; if (ka->sa.sa_flags & SA_RESTORER) @@ -250,20 +589,22 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { + struct rt_sigframe_user_layout user; struct rt_sigframe __user *frame; int err = 0; - frame = get_sigframe(ksig, regs); - if (!frame) + if (get_sigframe(&user, ksig, regs)) return 1; + frame = user.sigframe; + __put_user_error(0, &frame->uc.uc_flags, err); __put_user_error(NULL, &frame->uc.uc_link, err); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); - err |= setup_sigframe(frame, regs, set); + err |= setup_sigframe(&user, regs, set); if (err == 0) { - setup_return(regs, &ksig->ka, frame, usig); + setup_return(regs, &ksig->ka, &user, usig); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, &ksig->info); regs->regs[1] = (unsigned long)&frame->info; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index feac80c22f61..09d37d66b630 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -210,6 +210,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) put_task_stack(tsk); } +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); void save_stack_trace(struct stack_trace *trace) { diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 0805b44f986a..c7c7088097be 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -140,7 +140,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) } } -static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; unsigned long irq_stack_ptr; @@ -344,22 +344,24 @@ static int call_undef_hook(struct pt_regs *regs) if (compat_thumb_mode(regs)) { /* 16-bit Thumb instruction */ - if (get_user(instr, (u16 __user *)pc)) + __le16 instr_le; + if (get_user(instr_le, (__le16 __user *)pc)) goto exit; - instr = le16_to_cpu(instr); + instr = le16_to_cpu(instr_le); if (aarch32_insn_is_wide(instr)) { u32 instr2; - if (get_user(instr2, (u16 __user *)(pc + 2))) + if (get_user(instr_le, (__le16 __user *)(pc + 2))) goto exit; - instr2 = le16_to_cpu(instr2); + instr2 = le16_to_cpu(instr_le); instr = (instr << 16) | instr2; } } else { /* 32-bit ARM instruction */ - if (get_user(instr, (u32 __user *)pc)) + __le32 instr_le; + if (get_user(instr_le, (__le32 __user *)pc)) goto exit; - instr = le32_to_cpu(instr); + instr = le32_to_cpu(instr_le); } raw_spin_lock_irqsave(&undef_lock, flags); @@ -728,8 +730,6 @@ static int bug_handler(struct pt_regs *regs, unsigned int esr) break; case BUG_TRAP_TYPE_WARN: - /* Ideally, report_bug() should backtrace for us... but no. */ - dump_backtrace(regs, NULL); break; default: diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 7492d9009610..e8f759f764f2 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -37,7 +37,7 @@ #include <asm/vdso.h> #include <asm/vdso_datapage.h> -extern char vdso_start, vdso_end; +extern char vdso_start[], vdso_end[]; static unsigned long vdso_pages __ro_after_init; /* @@ -125,14 +125,14 @@ static int __init vdso_init(void) struct page **vdso_pagelist; unsigned long pfn; - if (memcmp(&vdso_start, "\177ELF", 4)) { + if (memcmp(vdso_start, "\177ELF", 4)) { pr_err("vDSO is not a valid ELF object!\n"); return -EINVAL; } - vdso_pages = (&vdso_end - &vdso_start) >> PAGE_SHIFT; + vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; pr_info("vdso: %ld pages (%ld code @ %p, %ld data @ %p)\n", - vdso_pages + 1, vdso_pages, &vdso_start, 1L, vdso_data); + vdso_pages + 1, vdso_pages, vdso_start, 1L, vdso_data); /* Allocate the vDSO pagelist, plus a page for the data. */ vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *), @@ -145,7 +145,7 @@ static int __init vdso_init(void) /* Grab the vDSO code pages. */ - pfn = sym_to_pfn(&vdso_start); + pfn = sym_to_pfn(vdso_start); for (i = 0; i < vdso_pages; i++) vdso_pagelist[i + 1] = pfn_to_page(pfn + i); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index b37446a8ffdb..5c7f657dd207 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -390,6 +390,9 @@ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_set_attr(vcpu, attr); break; + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_set_attr(vcpu, attr); + break; default: ret = -ENXIO; break; @@ -407,6 +410,9 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_get_attr(vcpu, attr); break; + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_get_attr(vcpu, attr); + break; default: ret = -ENXIO; break; @@ -424,6 +430,9 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, case KVM_ARM_VCPU_PMU_V3_CTRL: ret = kvm_arm_pmu_v3_has_attr(vcpu, attr); break; + case KVM_ARM_VCPU_TIMER_CTRL: + ret = kvm_arm_timer_has_attr(vcpu, attr); + break; default: ret = -ENXIO; break; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index fa1b18e364fc..17d8a1677a0b 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -89,6 +89,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); vcpu->stat.wfi_exit_stat++; kvm_vcpu_block(vcpu); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); } kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index aede1658aeda..945e79c641c4 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -350,6 +350,20 @@ again: } } + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + exit_code == ARM_EXCEPTION_TRAP && + (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 || + kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { + int ret = __vgic_v3_perform_cpuif_access(vcpu); + + if (ret == 1) { + __skip_instr(vcpu); + goto again; + } + + /* 0 falls through to be handled out of EL2 */ + } + fp_enabled = __fpsimd_enabled(); __sysreg_save_guest_state(guest_ctxt); @@ -422,6 +436,7 @@ void __hyp_text __noreturn __hyp_panic(void) vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2); host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); + __timer_save_state(vcpu); __deactivate_traps(vcpu); __deactivate_vm(vcpu); __sysreg_restore_host_state(host_ctxt); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 561badf93de8..3256b9228e75 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -46,16 +46,6 @@ static const struct kvm_regs default_regs_reset32 = { COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT), }; -static const struct kvm_irq_level default_ptimer_irq = { - .irq = 30, - .level = 1, -}; - -static const struct kvm_irq_level default_vtimer_irq = { - .irq = 27, - .level = 1, -}; - static bool cpu_has_32bit_el1(void) { u64 pfr0; @@ -108,8 +98,6 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { - const struct kvm_irq_level *cpu_vtimer_irq; - const struct kvm_irq_level *cpu_ptimer_irq; const struct kvm_regs *cpu_reset; switch (vcpu->arch.target) { @@ -122,8 +110,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) cpu_reset = &default_regs_reset; } - cpu_vtimer_irq = &default_vtimer_irq; - cpu_ptimer_irq = &default_ptimer_irq; break; } @@ -137,5 +123,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) kvm_pmu_vcpu_reset(vcpu); /* Reset timer */ - return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq); + return kvm_timer_vcpu_reset(vcpu); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 0fe27024a2e1..77862881ae86 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -56,7 +56,8 @@ */ static bool read_from_write_only(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) + struct sys_reg_params *params, + const struct sys_reg_desc *r) { WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n"); print_sys_reg_instr(params); @@ -64,6 +65,16 @@ static bool read_from_write_only(struct kvm_vcpu *vcpu, return false; } +static bool write_to_read_only(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r) +{ + WARN_ONCE(1, "Unexpected sys_reg write to read-only register\n"); + print_sys_reg_instr(params); + kvm_inject_undefined(vcpu); + return false; +} + /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ static u32 cache_levels; @@ -93,7 +104,7 @@ static bool access_dcsw(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (!p->is_write) - return read_from_write_only(vcpu, p); + return read_from_write_only(vcpu, p, r); kvm_set_way_flush(vcpu); return true; @@ -135,7 +146,7 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (!p->is_write) - return read_from_write_only(vcpu, p); + return read_from_write_only(vcpu, p, r); vgic_v3_dispatch_sgi(vcpu, p->regval); @@ -773,7 +784,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return trap_raz_wi(vcpu, p, r); if (!p->is_write) - return read_from_write_only(vcpu, p); + return read_from_write_only(vcpu, p, r); if (pmu_write_swinc_el0_disabled(vcpu)) return false; @@ -953,7 +964,15 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 }, + { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only }, + { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only }, + { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, + { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only }, + { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 6260b69e5622..116786d2e8e8 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -268,36 +268,21 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; } static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { - /* ICC_PMR_EL1 */ - { Op0(3), Op1(0), CRn(4), CRm(6), Op2(0), access_gic_pmr }, - /* ICC_BPR0_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(8), Op2(3), access_gic_bpr0 }, - /* ICC_AP0R0_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(8), Op2(4), access_gic_ap0r }, - /* ICC_AP0R1_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(8), Op2(5), access_gic_ap0r }, - /* ICC_AP0R2_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(8), Op2(6), access_gic_ap0r }, - /* ICC_AP0R3_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(8), Op2(7), access_gic_ap0r }, - /* ICC_AP1R0_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(9), Op2(0), access_gic_ap1r }, - /* ICC_AP1R1_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(9), Op2(1), access_gic_ap1r }, - /* ICC_AP1R2_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(9), Op2(2), access_gic_ap1r }, - /* ICC_AP1R3_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(9), Op2(3), access_gic_ap1r }, - /* ICC_BPR1_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(12), Op2(3), access_gic_bpr1 }, - /* ICC_CTLR_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(12), Op2(4), access_gic_ctlr }, - /* ICC_SRE_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(12), Op2(5), access_gic_sre }, - /* ICC_IGRPEN0_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(12), Op2(6), access_gic_grpen0 }, - /* ICC_GRPEN1_EL1 */ - { Op0(3), Op1(0), CRn(12), CRm(12), Op2(7), access_gic_grpen1 }, + { SYS_DESC(SYS_ICC_PMR_EL1), access_gic_pmr }, + { SYS_DESC(SYS_ICC_BPR0_EL1), access_gic_bpr0 }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), access_gic_ap0r }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), access_gic_ap0r }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), access_gic_ap0r }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), access_gic_ap0r }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), access_gic_ap1r }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), access_gic_ap1r }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), access_gic_ap1r }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), access_gic_ap1r }, + { SYS_DESC(SYS_ICC_BPR1_EL1), access_gic_bpr1 }, + { SYS_DESC(SYS_ICC_CTLR_EL1), access_gic_ctlr }, + { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), access_gic_grpen0 }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), access_gic_grpen1 }, }; int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 3216e098c058..3e340b625436 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -95,11 +95,6 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) { - if (dev == NULL) { - WARN_ONCE(1, "Use an actual device structure for DMA allocation\n"); - return NULL; - } - if (IS_ENABLED(CONFIG_ZONE_DMA) && dev->coherent_dma_mask <= DMA_BIT_MASK(32)) flags |= GFP_DMA; @@ -128,10 +123,6 @@ static void __dma_free_coherent(struct device *dev, size_t size, bool freed; phys_addr_t paddr = dma_to_phys(dev, dma_handle); - if (dev == NULL) { - WARN_ONCE(1, "Use an actual device structure for DMA allocation\n"); - return; - } freed = dma_release_from_contiguous(dev, phys_to_page(paddr), diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 37b95dff0b07..c7861c9864e6 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -31,6 +31,7 @@ #include <linux/highmem.h> #include <linux/perf_event.h> #include <linux/preempt.h> +#include <linux/hugetlb.h> #include <asm/bug.h> #include <asm/cpufeature.h> @@ -42,6 +43,8 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> +#include <acpi/ghes.h> + struct fault_info { int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); @@ -80,18 +83,35 @@ static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) #endif /* - * Dump out the page tables associated with 'addr' in mm 'mm'. + * Dump out the page tables associated with 'addr' in the currently active mm. */ -void show_pte(struct mm_struct *mm, unsigned long addr) +void show_pte(unsigned long addr) { + struct mm_struct *mm; pgd_t *pgd; - if (!mm) + if (addr < TASK_SIZE) { + /* TTBR0 */ + mm = current->active_mm; + if (mm == &init_mm) { + pr_alert("[%016lx] user address but active_mm is swapper\n", + addr); + return; + } + } else if (addr >= VA_START) { + /* TTBR1 */ mm = &init_mm; + } else { + pr_alert("[%016lx] address between user and kernel address ranges\n", + addr); + return; + } - pr_alert("pgd = %p\n", mm->pgd); + pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgd = %p\n", + mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K, + VA_BITS, mm->pgd); pgd = pgd_offset(mm, addr); - pr_alert("[%08lx] *pgd=%016llx", addr, pgd_val(*pgd)); + pr_alert("[%016lx] *pgd=%016llx", addr, pgd_val(*pgd)); do { pud_t *pud; @@ -196,8 +216,8 @@ static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs, /* * The kernel tried to access some page that wasn't present. */ -static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, - unsigned int esr, struct pt_regs *regs) +static void __do_kernel_fault(unsigned long addr, unsigned int esr, + struct pt_regs *regs) { const char *msg; @@ -227,7 +247,7 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, pr_alert("Unable to handle kernel %s at virtual address %08lx\n", msg, addr); - show_pte(mm, addr); + show_pte(addr); die("Oops", regs, esr); bust_spinlocks(0); do_exit(SIGKILL); @@ -239,18 +259,20 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, */ static void __do_user_fault(struct task_struct *tsk, unsigned long addr, unsigned int esr, unsigned int sig, int code, - struct pt_regs *regs) + struct pt_regs *regs, int fault) { struct siginfo si; const struct fault_info *inf; + unsigned int lsb = 0; if (unhandled_signal(tsk, sig) && show_unhandled_signals_ratelimited()) { inf = esr_to_fault_info(esr); - pr_info("%s[%d]: unhandled %s (%d) at 0x%08lx, esr 0x%03x\n", + pr_info("%s[%d]: unhandled %s (%d) at 0x%08lx, esr 0x%03x", tsk->comm, task_pid_nr(tsk), inf->name, sig, addr, esr); - show_pte(tsk->mm, addr); - show_regs(regs); + print_vma_addr(KERN_CONT ", in ", regs->pc); + pr_cont("\n"); + __show_regs(regs); } tsk->thread.fault_address = addr; @@ -259,13 +281,23 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr, si.si_errno = 0; si.si_code = code; si.si_addr = (void __user *)addr; + /* + * Either small page or large page may be poisoned. + * In other words, VM_FAULT_HWPOISON_LARGE and + * VM_FAULT_HWPOISON are mutually exclusive. + */ + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + else if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + si.si_addr_lsb = lsb; + force_sig_info(sig, &si, tsk); } static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs) { struct task_struct *tsk = current; - struct mm_struct *mm = tsk->active_mm; const struct fault_info *inf; /* @@ -274,9 +306,9 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re */ if (user_mode(regs)) { inf = esr_to_fault_info(esr); - __do_user_fault(tsk, addr, esr, inf->sig, inf->code, regs); + __do_user_fault(tsk, addr, esr, inf->sig, inf->code, regs, 0); } else - __do_kernel_fault(mm, addr, esr, regs); + __do_kernel_fault(addr, esr, regs); } #define VM_FAULT_BADMAP 0x010000 @@ -329,7 +361,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, { struct task_struct *tsk; struct mm_struct *mm; - int fault, sig, code; + int fault, sig, code, major = 0; unsigned long vm_flags = VM_READ | VM_WRITE; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; @@ -368,6 +400,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, die("Accessing user space memory outside uaccess.h routines", regs, esr); } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, @@ -391,24 +425,42 @@ retry: } fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk); + major |= fault & VM_FAULT_MAJOR; - /* - * If we need to retry but a fatal signal is pending, handle the - * signal first. We do not need to release the mmap_sem because it - * would already be released in __lock_page_or_retry in mm/filemap.c. - */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) - return 0; + if (fault & VM_FAULT_RETRY) { + /* + * If we need to retry but a fatal signal is pending, + * handle the signal first. We do not need to release + * the mmap_sem because it would already be released + * in __lock_page_or_retry in mm/filemap.c. + */ + if (fatal_signal_pending(current)) + return 0; + + /* + * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of + * starvation. + */ + if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { + mm_flags &= ~FAULT_FLAG_ALLOW_RETRY; + mm_flags |= FAULT_FLAG_TRIED; + goto retry; + } + } + up_read(&mm->mmap_sem); /* - * Major/minor page fault accounting is only done on the initial - * attempt. If we go through a retry, it is extremely likely that the - * page will be found in page cache at that point. + * Handle the "normal" (no error) case first. */ - - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); - if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { + if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | + VM_FAULT_BADACCESS)))) { + /* + * Major/minor page fault accounting is only done + * once. If we go through a retry, it is extremely + * likely that the page will be found in page cache at + * that point. + */ + if (major) { tsk->maj_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, addr); @@ -417,25 +469,9 @@ retry: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, addr); } - if (fault & VM_FAULT_RETRY) { - /* - * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of - * starvation. - */ - mm_flags &= ~FAULT_FLAG_ALLOW_RETRY; - mm_flags |= FAULT_FLAG_TRIED; - goto retry; - } - } - - up_read(&mm->mmap_sem); - /* - * Handle the "normal" case first - VM_FAULT_MAJOR - */ - if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | - VM_FAULT_BADACCESS)))) return 0; + } /* * If we are in kernel mode at this point, we have no context to @@ -461,6 +497,9 @@ retry: */ sig = SIGBUS; code = BUS_ADRERR; + } else if (fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) { + sig = SIGBUS; + code = BUS_MCEERR_AR; } else { /* * Something tried to access memory that isn't in our memory @@ -471,11 +510,11 @@ retry: SEGV_ACCERR : SEGV_MAPERR; } - __do_user_fault(tsk, addr, esr, sig, code, regs); + __do_user_fault(tsk, addr, esr, sig, code, regs, fault); return 0; no_context: - __do_kernel_fault(mm, addr, esr, regs); + __do_kernel_fault(addr, esr, regs); return 0; } @@ -522,6 +561,47 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; } +/* + * This abort handler deals with Synchronous External Abort. + * It calls notifiers, and then returns "fault". + */ +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) +{ + struct siginfo info; + const struct fault_info *inf; + int ret = 0; + + inf = esr_to_fault_info(esr); + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", + inf->name, esr, addr); + + /* + * Synchronous aborts may interrupt code which had interrupts masked. + * Before calling out into the wider kernel tell the interested + * subsystems. + */ + if (IS_ENABLED(CONFIG_ACPI_APEI_SEA)) { + if (interrupts_enabled(regs)) + nmi_enter(); + + ret = ghes_notify_sea(); + + if (interrupts_enabled(regs)) + nmi_exit(); + } + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = 0; + if (esr & ESR_ELx_FnV) + info.si_addr = NULL; + else + info.si_addr = (void __user *)addr; + arm64_notify_die("", regs, &info, esr); + + return ret; +} + static const struct fault_info fault_info[] = { { do_bad, SIGBUS, 0, "ttbr address size fault" }, { do_bad, SIGBUS, 0, "level 1 address size fault" }, @@ -539,22 +619,22 @@ static const struct fault_info fault_info[] = { { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, - { do_bad, SIGBUS, 0, "synchronous external abort" }, + { do_sea, SIGBUS, 0, "synchronous external abort" }, { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error" }, + { do_sea, SIGBUS, 0, "level 0 (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 1 (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 2 (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 3 (translation table walk)" }, + { do_sea, SIGBUS, 0, "synchronous parity or ECC error" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, { do_bad, SIGBUS, 0, "unknown 27" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 0 synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 1 synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 2 synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 3 synchronous parity error (translation table walk)" }, { do_bad, SIGBUS, 0, "unknown 32" }, { do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" }, { do_bad, SIGBUS, 0, "unknown 34" }, @@ -590,6 +670,23 @@ static const struct fault_info fault_info[] = { }; /* + * Handle Synchronous External Aborts that occur in a guest kernel. + * + * The return value will be zero if the SEA was successfully handled + * and non-zero if there was an error processing the error or there was + * no error to process. + */ +int handle_guest_sea(phys_addr_t addr, unsigned int esr) +{ + int ret = -ENOENT; + + if (IS_ENABLED(CONFIG_ACPI_APEI_SEA)) + ret = ghes_notify_sea(); + + return ret; +} + +/* * Dispatch a data abort to the relevant handler. */ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr, diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 7514a000e361..69b8200b1cfd 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -136,36 +136,27 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd = NULL; - pte_t *pte = NULL; + pmd_t *pmd; pgd = pgd_offset(mm, addr); pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd); if (!pgd_present(*pgd)) return NULL; + pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) + if (pud_none(*pud)) return NULL; - - if (pud_huge(*pud)) + /* swap or huge page */ + if (!pud_present(*pud) || pud_huge(*pud)) return (pte_t *)pud; + /* table; check the next level */ + pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) + if (pmd_none(*pmd)) return NULL; - - if (pte_cont(pmd_pte(*pmd))) { - pmd = pmd_offset( - pud, (addr & CONT_PMD_MASK)); - return (pte_t *)pmd; - } - if (pmd_huge(*pmd)) + if (!pmd_present(*pmd) || pmd_huge(*pmd)) return (pte_t *)pmd; - pte = pte_offset_kernel(pmd, addr); - if (pte_present(*pte) && pte_cont(*pte)) { - pte = pte_offset_kernel( - pmd, (addr & CONT_PTE_MASK)); - return pte; - } + return NULL; } diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 7b0d55756eb1..adc208c2ae9c 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -18,6 +18,7 @@ #include <linux/elf.h> #include <linux/fs.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/export.h> @@ -103,12 +104,18 @@ void arch_pick_mmap_layout(struct mm_struct *mm) */ int valid_phys_addr_range(phys_addr_t addr, size_t size) { - if (addr < PHYS_OFFSET) - return 0; - if (addr + size > __pa(high_memory - 1) + 1) - return 0; - - return 1; + /* + * Check whether addr is covered by a memory region without the + * MEMBLOCK_NOMAP attribute, and whether that region covers the + * entire range. In theory, this could lead to false negatives + * if the range is covered by distinct but adjacent memory regions + * that only differ in other attributes. However, few of such + * attributes have been defined, and it is debatable whether it + * follows that /dev/mem read() calls should be able traverse + * such boundaries. + */ + return memblock_is_region_memory(addr, size) && + memblock_is_map_memory(addr); } /* diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 0c429ec6fde8..23c2d89a362e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -31,6 +31,7 @@ #include <linux/fs.h> #include <linux/io.h> #include <linux/mm.h> +#include <linux/vmalloc.h> #include <asm/barrier.h> #include <asm/cputype.h> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c870d6f01ac2..f32144b2e07f 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -70,7 +70,8 @@ struct jit_ctx { int idx; int epilogue_offset; int *offset; - u32 *image; + __le32 *image; + u32 stack_size; }; static inline void emit(const u32 insn, struct jit_ctx *ctx) @@ -130,7 +131,7 @@ static inline int bpf2a64_offset(int bpf_to, int bpf_from, static void jit_fill_hole(void *area, unsigned int size) { - u32 *ptr; + __le32 *ptr; /* We are guaranteed to have aligned memory. */ for (ptr = area; size >= sizeof(u32); size -= sizeof(u32)) *ptr++ = cpu_to_le32(AARCH64_BREAK_FAULT); @@ -147,16 +148,11 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) /* Stack must be multiples of 16B */ #define STACK_ALIGN(sz) (((sz) + 15) & ~15) -#define _STACK_SIZE \ - (MAX_BPF_STACK \ - + 4 /* extra for skb_copy_bits buffer */) - -#define STACK_SIZE STACK_ALIGN(_STACK_SIZE) - #define PROLOGUE_OFFSET 8 static int build_prologue(struct jit_ctx *ctx) { + const struct bpf_prog *prog = ctx->prog; const u8 r6 = bpf2a64[BPF_REG_6]; const u8 r7 = bpf2a64[BPF_REG_7]; const u8 r8 = bpf2a64[BPF_REG_8]; @@ -178,9 +174,9 @@ static int build_prologue(struct jit_ctx *ctx) * | | * | ... | BPF prog stack * | | - * +-----+ <= (BPF_FP - MAX_BPF_STACK) + * +-----+ <= (BPF_FP - prog->aux->stack_depth) * |RSVD | JIT scratchpad - * current A64_SP => +-----+ <= (BPF_FP - STACK_SIZE) + * current A64_SP => +-----+ <= (BPF_FP - ctx->stack_size) * | | * | ... | Function call stack * | | @@ -204,8 +200,12 @@ static int build_prologue(struct jit_ctx *ctx) /* Initialize tail_call_cnt */ emit(A64_MOVZ(1, tcc, 0, 0), ctx); + /* 4 byte extra for skb_copy_bits buffer */ + ctx->stack_size = prog->aux->stack_depth + 4; + ctx->stack_size = STACK_ALIGN(ctx->stack_size); + /* Set up function call stack */ - emit(A64_SUB_I(1, A64_SP, A64_SP, STACK_SIZE), ctx); + emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); cur_offset = ctx->idx - idx0; if (cur_offset != PROLOGUE_OFFSET) { @@ -290,7 +290,7 @@ static void build_epilogue(struct jit_ctx *ctx) const u8 fp = bpf2a64[BPF_REG_FP]; /* We're done with BPF stack */ - emit(A64_ADD_I(1, A64_SP, A64_SP, STACK_SIZE), ctx); + emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); /* Restore fs (x25) and x26 */ emit(A64_POP(fp, A64_R(26), A64_SP), ctx); @@ -589,7 +589,7 @@ emit_cond_jmp: break; } /* tail call */ - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: if (emit_bpf_tail_call(ctx)) return -EFAULT; break; @@ -735,7 +735,7 @@ emit_cond_jmp: return -EINVAL; } emit_a64_mov_i64(r3, size, ctx); - emit(A64_SUB_I(1, r4, fp, STACK_SIZE), ctx); + emit(A64_SUB_I(1, r4, fp, ctx->stack_size), ctx); emit_a64_mov_i64(r5, (unsigned long)bpf_load_pointer, ctx); emit(A64_BLR(r5), ctx); emit(A64_MOV(1, r0, A64_R(0)), ctx); @@ -874,7 +874,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* 2. Now, the actual pass. */ - ctx.image = (u32 *)image_ptr; + ctx.image = (__le32 *)image_ptr; ctx.idx = 0; build_prologue(&ctx); @@ -903,6 +903,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_jit_binary_lock_ro(header); prog->bpf_func = (void *)ctx.image; prog->jited = 1; + prog->jited_len = image_size; out_off: kfree(ctx.offset); diff --git a/arch/blackfin/configs/BF609-EZKIT_defconfig b/arch/blackfin/configs/BF609-EZKIT_defconfig index ba4267f658af..3ce77f07208a 100644 --- a/arch/blackfin/configs/BF609-EZKIT_defconfig +++ b/arch/blackfin/configs/BF609-EZKIT_defconfig @@ -105,7 +105,7 @@ CONFIG_SPI=y CONFIG_SPI_ADI_V3=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_MCP23S08=y +CONFIG_PINCTRL_MCP23S08=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_BFIN_WDT=y diff --git a/arch/blackfin/mach-bf527/boards/tll6527m.c b/arch/blackfin/mach-bf527/boards/tll6527m.c index c1acce4c2e45..ce5488e8226b 100644 --- a/arch/blackfin/mach-bf527/boards/tll6527m.c +++ b/arch/blackfin/mach-bf527/boards/tll6527m.c @@ -348,14 +348,14 @@ static struct platform_device bfin_i2s = { }; #endif -#if IS_ENABLED(CONFIG_GPIO_MCP23S08) +#if IS_ENABLED(CONFIG_PINCTRL_MCP23S08) #include <linux/spi/mcp23s08.h> static const struct mcp23s08_platform_data bfin_mcp23s08_sys_gpio_info = { - .chip[0].is_present = true, + .spi_present_mask = BIT(0), .base = 0x30, }; static const struct mcp23s08_platform_data bfin_mcp23s08_usr_gpio_info = { - .chip[2].is_present = true, + .spi_present_mask = BIT(2), .base = 0x38, }; #endif @@ -423,7 +423,7 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { .mode = SPI_CPHA | SPI_CPOL, }, #endif -#if IS_ENABLED(CONFIG_GPIO_MCP23S08) +#if IS_ENABLED(CONFIG_PINCTRL_MCP23S08) { .modalias = "mcp23s08", .platform_data = &bfin_mcp23s08_sys_gpio_info, diff --git a/arch/blackfin/mach-bf609/boards/ezkit.c b/arch/blackfin/mach-bf609/boards/ezkit.c index 9231e5a72b93..51157a255824 100644 --- a/arch/blackfin/mach-bf609/boards/ezkit.c +++ b/arch/blackfin/mach-bf609/boards/ezkit.c @@ -1887,7 +1887,7 @@ static struct platform_device i2c_bfin_twi1_device = { }; #endif -#if IS_ENABLED(CONFIG_GPIO_MCP23S08) +#if IS_ENABLED(CONFIG_PINCTRL_MCP23S08) #include <linux/spi/mcp23s08.h> static const struct mcp23s08_platform_data bfin_mcp23s08_soft_switch0 = { .base = 120, @@ -1929,7 +1929,7 @@ static struct i2c_board_info __initdata bfin_i2c_board_info0[] = { I2C_BOARD_INFO("ssm2602", 0x1b), }, #endif -#if IS_ENABLED(CONFIG_GPIO_MCP23S08) +#if IS_ENABLED(CONFIG_PINCTRL_MCP23S08) { I2C_BOARD_INFO("mcp23017", 0x21), .platform_data = (void *)&bfin_mcp23s08_soft_switch0 diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index 1ccf45657472..f1e3b20dce9f 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -98,5 +98,9 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + +#define SO_PEERGROUPS 59 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 2c3f4b48042a..5dd5c5d0d642 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -107,4 +107,8 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + +#define SO_PEERGROUPS 59 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index ae6548d29a18..f8f7b47e247f 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -98,4 +98,8 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + +#define SO_PEERGROUPS 59 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 2828ecde133d..45bcd1cfcec0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -364,6 +364,7 @@ config MACH_INGENIC select SYS_SUPPORTS_ZBOOT_UART16550 select DMA_NONCOHERENT select IRQ_MIPS_CPU + select PINCTRL select GPIOLIB select COMMON_CLK select GENERIC_IRQ_CHIP diff --git a/arch/mips/boot/dts/ingenic/ci20.dts b/arch/mips/boot/dts/ingenic/ci20.dts index 1652d8d60b1e..fd138d9978c1 100644 --- a/arch/mips/boot/dts/ingenic/ci20.dts +++ b/arch/mips/boot/dts/ingenic/ci20.dts @@ -29,18 +29,30 @@ &uart0 { status = "okay"; + + pinctrl-names = "default"; + pinctrl-0 = <&pins_uart0>; }; &uart1 { status = "okay"; + + pinctrl-names = "default"; + pinctrl-0 = <&pins_uart1>; }; &uart3 { status = "okay"; + + pinctrl-names = "default"; + pinctrl-0 = <&pins_uart2>; }; &uart4 { status = "okay"; + + pinctrl-names = "default"; + pinctrl-0 = <&pins_uart4>; }; &nemc { @@ -61,6 +73,13 @@ ingenic,nemc-tAW = <15>; ingenic,nemc-tSTRV = <100>; + /* + * Only CLE/ALE are needed for the devices that are connected, rather + * than the full address line set. + */ + pinctrl-names = "default"; + pinctrl-0 = <&pins_nemc>; + nand@1 { reg = <1>; @@ -69,6 +88,9 @@ nand-ecc-mode = "hw"; nand-on-flash-bbt; + pinctrl-names = "default"; + pinctrl-0 = <&pins_nemc_cs1>; + partitions { compatible = "fixed-partitions"; #address-cells = <2>; @@ -106,3 +128,41 @@ &bch { status = "okay"; }; + +&pinctrl { + pins_uart0: uart0 { + function = "uart0"; + groups = "uart0-data"; + bias-disable; + }; + + pins_uart1: uart1 { + function = "uart1"; + groups = "uart1-data"; + bias-disable; + }; + + pins_uart2: uart2 { + function = "uart2"; + groups = "uart2-data", "uart2-hwflow"; + bias-disable; + }; + + pins_uart4: uart4 { + function = "uart4"; + groups = "uart4-data"; + bias-disable; + }; + + pins_nemc: nemc { + function = "nemc"; + groups = "nemc-data", "nemc-cle-ale", "nemc-rd-we", "nemc-frd-fwe"; + bias-disable; + }; + + pins_nemc_cs1: nemc-cs1 { + function = "nemc-cs1"; + groups = "nemc-cs1"; + bias-disable; + }; +}; diff --git a/arch/mips/boot/dts/ingenic/jz4740.dtsi b/arch/mips/boot/dts/ingenic/jz4740.dtsi index 3e1587f1f77a..2ca7ce7481f1 100644 --- a/arch/mips/boot/dts/ingenic/jz4740.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4740.dtsi @@ -55,6 +55,74 @@ clock-names = "rtc"; }; + pinctrl: pin-controller@10010000 { + compatible = "ingenic,jz4740-pinctrl"; + reg = <0x10010000 0x400>; + + #address-cells = <1>; + #size-cells = <0>; + + gpa: gpio@0 { + compatible = "ingenic,jz4740-gpio"; + reg = <0>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 0 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <28>; + }; + + gpb: gpio@1 { + compatible = "ingenic,jz4740-gpio"; + reg = <1>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 32 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <27>; + }; + + gpc: gpio@2 { + compatible = "ingenic,jz4740-gpio"; + reg = <2>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 64 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <26>; + }; + + gpd: gpio@3 { + compatible = "ingenic,jz4740-gpio"; + reg = <3>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 96 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <25>; + }; + }; + uart0: serial@10030000 { compatible = "ingenic,jz4740-uart"; reg = <0x10030000 0x100>; diff --git a/arch/mips/boot/dts/ingenic/jz4780.dtsi b/arch/mips/boot/dts/ingenic/jz4780.dtsi index b868b429add2..4853ef67b3ab 100644 --- a/arch/mips/boot/dts/ingenic/jz4780.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4780.dtsi @@ -44,6 +44,104 @@ #clock-cells = <1>; }; + pinctrl: pin-controller@10010000 { + compatible = "ingenic,jz4780-pinctrl"; + reg = <0x10010000 0x600>; + + #address-cells = <1>; + #size-cells = <0>; + + gpa: gpio@0 { + compatible = "ingenic,jz4780-gpio"; + reg = <0>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 0 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <17>; + }; + + gpb: gpio@1 { + compatible = "ingenic,jz4780-gpio"; + reg = <1>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 32 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <16>; + }; + + gpc: gpio@2 { + compatible = "ingenic,jz4780-gpio"; + reg = <2>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 64 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <15>; + }; + + gpd: gpio@3 { + compatible = "ingenic,jz4780-gpio"; + reg = <3>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 96 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <14>; + }; + + gpe: gpio@4 { + compatible = "ingenic,jz4780-gpio"; + reg = <4>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 128 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <13>; + }; + + gpf: gpio@5 { + compatible = "ingenic,jz4780-gpio"; + reg = <5>; + + gpio-controller; + gpio-ranges = <&pinctrl 0 160 32>; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + interrupt-parent = <&intc>; + interrupts = <12>; + }; + }; + uart0: serial@10030000 { compatible = "ingenic,jz4780-uart"; reg = <0x10030000 0x100>; diff --git a/arch/mips/boot/dts/ingenic/qi_lb60.dts b/arch/mips/boot/dts/ingenic/qi_lb60.dts index be1a7d3a3e1b..b715ee2ac2ee 100644 --- a/arch/mips/boot/dts/ingenic/qi_lb60.dts +++ b/arch/mips/boot/dts/ingenic/qi_lb60.dts @@ -17,3 +17,16 @@ &rtc_dev { system-power-controller; }; + +&uart0 { + pinctrl-names = "default"; + pinctrl-0 = <&pins_uart0>; +}; + +&pinctrl { + pins_uart0: uart0 { + function = "uart0"; + groups = "uart0-data"; + bias-disable; + }; +}; diff --git a/arch/mips/include/asm/mach-jz4740/gpio.h b/arch/mips/include/asm/mach-jz4740/gpio.h index 7c7708a23baa..fd847c984701 100644 --- a/arch/mips/include/asm/mach-jz4740/gpio.h +++ b/arch/mips/include/asm/mach-jz4740/gpio.h @@ -16,380 +16,9 @@ #ifndef _JZ_GPIO_H #define _JZ_GPIO_H -#include <linux/types.h> - -enum jz_gpio_function { - JZ_GPIO_FUNC_NONE, - JZ_GPIO_FUNC1, - JZ_GPIO_FUNC2, - JZ_GPIO_FUNC3, -}; - -/* - Usually a driver for a SoC component has to request several gpio pins and - configure them as function pins. - jz_gpio_bulk_request can be used to ease this process. - Usually one would do something like: - - static const struct jz_gpio_bulk_request i2c_pins[] = { - JZ_GPIO_BULK_PIN(I2C_SDA), - JZ_GPIO_BULK_PIN(I2C_SCK), - }; - - inside the probe function: - - ret = jz_gpio_bulk_request(i2c_pins, ARRAY_SIZE(i2c_pins)); - if (ret) { - ... - - inside the remove function: - - jz_gpio_bulk_free(i2c_pins, ARRAY_SIZE(i2c_pins)); - -*/ - -struct jz_gpio_bulk_request { - int gpio; - const char *name; - enum jz_gpio_function function; -}; - -#define JZ_GPIO_BULK_PIN(pin) { \ - .gpio = JZ_GPIO_ ## pin, \ - .name = #pin, \ - .function = JZ_GPIO_FUNC_ ## pin \ -} - -int jz_gpio_bulk_request(const struct jz_gpio_bulk_request *request, size_t num); -void jz_gpio_bulk_free(const struct jz_gpio_bulk_request *request, size_t num); -void jz_gpio_bulk_suspend(const struct jz_gpio_bulk_request *request, size_t num); -void jz_gpio_bulk_resume(const struct jz_gpio_bulk_request *request, size_t num); -void jz_gpio_enable_pullup(unsigned gpio); -void jz_gpio_disable_pullup(unsigned gpio); -int jz_gpio_set_function(int gpio, enum jz_gpio_function function); - -int jz_gpio_port_direction_input(int port, uint32_t mask); -int jz_gpio_port_direction_output(int port, uint32_t mask); -void jz_gpio_port_set_value(int port, uint32_t value, uint32_t mask); -uint32_t jz_gpio_port_get_value(int port, uint32_t mask); - #define JZ_GPIO_PORTA(x) ((x) + 32 * 0) #define JZ_GPIO_PORTB(x) ((x) + 32 * 1) #define JZ_GPIO_PORTC(x) ((x) + 32 * 2) #define JZ_GPIO_PORTD(x) ((x) + 32 * 3) -/* Port A function pins */ -#define JZ_GPIO_MEM_DATA0 JZ_GPIO_PORTA(0) -#define JZ_GPIO_MEM_DATA1 JZ_GPIO_PORTA(1) -#define JZ_GPIO_MEM_DATA2 JZ_GPIO_PORTA(2) -#define JZ_GPIO_MEM_DATA3 JZ_GPIO_PORTA(3) -#define JZ_GPIO_MEM_DATA4 JZ_GPIO_PORTA(4) -#define JZ_GPIO_MEM_DATA5 JZ_GPIO_PORTA(5) -#define JZ_GPIO_MEM_DATA6 JZ_GPIO_PORTA(6) -#define JZ_GPIO_MEM_DATA7 JZ_GPIO_PORTA(7) -#define JZ_GPIO_MEM_DATA8 JZ_GPIO_PORTA(8) -#define JZ_GPIO_MEM_DATA9 JZ_GPIO_PORTA(9) -#define JZ_GPIO_MEM_DATA10 JZ_GPIO_PORTA(10) -#define JZ_GPIO_MEM_DATA11 JZ_GPIO_PORTA(11) -#define JZ_GPIO_MEM_DATA12 JZ_GPIO_PORTA(12) -#define JZ_GPIO_MEM_DATA13 JZ_GPIO_PORTA(13) -#define JZ_GPIO_MEM_DATA14 JZ_GPIO_PORTA(14) -#define JZ_GPIO_MEM_DATA15 JZ_GPIO_PORTA(15) -#define JZ_GPIO_MEM_DATA16 JZ_GPIO_PORTA(16) -#define JZ_GPIO_MEM_DATA17 JZ_GPIO_PORTA(17) -#define JZ_GPIO_MEM_DATA18 JZ_GPIO_PORTA(18) -#define JZ_GPIO_MEM_DATA19 JZ_GPIO_PORTA(19) -#define JZ_GPIO_MEM_DATA20 JZ_GPIO_PORTA(20) -#define JZ_GPIO_MEM_DATA21 JZ_GPIO_PORTA(21) -#define JZ_GPIO_MEM_DATA22 JZ_GPIO_PORTA(22) -#define JZ_GPIO_MEM_DATA23 JZ_GPIO_PORTA(23) -#define JZ_GPIO_MEM_DATA24 JZ_GPIO_PORTA(24) -#define JZ_GPIO_MEM_DATA25 JZ_GPIO_PORTA(25) -#define JZ_GPIO_MEM_DATA26 JZ_GPIO_PORTA(26) -#define JZ_GPIO_MEM_DATA27 JZ_GPIO_PORTA(27) -#define JZ_GPIO_MEM_DATA28 JZ_GPIO_PORTA(28) -#define JZ_GPIO_MEM_DATA29 JZ_GPIO_PORTA(29) -#define JZ_GPIO_MEM_DATA30 JZ_GPIO_PORTA(30) -#define JZ_GPIO_MEM_DATA31 JZ_GPIO_PORTA(31) - -#define JZ_GPIO_FUNC_MEM_DATA0 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA1 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA2 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA3 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA4 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA5 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA6 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA7 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA8 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA9 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA10 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA11 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA12 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA13 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA14 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA15 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA16 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA17 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA18 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA19 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA20 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA21 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA22 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA23 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA24 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA25 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA26 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA27 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA28 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA29 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA30 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DATA31 JZ_GPIO_FUNC1 - -/* Port B function pins */ -#define JZ_GPIO_MEM_ADDR0 JZ_GPIO_PORTB(0) -#define JZ_GPIO_MEM_ADDR1 JZ_GPIO_PORTB(1) -#define JZ_GPIO_MEM_ADDR2 JZ_GPIO_PORTB(2) -#define JZ_GPIO_MEM_ADDR3 JZ_GPIO_PORTB(3) -#define JZ_GPIO_MEM_ADDR4 JZ_GPIO_PORTB(4) -#define JZ_GPIO_MEM_ADDR5 JZ_GPIO_PORTB(5) -#define JZ_GPIO_MEM_ADDR6 JZ_GPIO_PORTB(6) -#define JZ_GPIO_MEM_ADDR7 JZ_GPIO_PORTB(7) -#define JZ_GPIO_MEM_ADDR8 JZ_GPIO_PORTB(8) -#define JZ_GPIO_MEM_ADDR9 JZ_GPIO_PORTB(9) -#define JZ_GPIO_MEM_ADDR10 JZ_GPIO_PORTB(10) -#define JZ_GPIO_MEM_ADDR11 JZ_GPIO_PORTB(11) -#define JZ_GPIO_MEM_ADDR12 JZ_GPIO_PORTB(12) -#define JZ_GPIO_MEM_ADDR13 JZ_GPIO_PORTB(13) -#define JZ_GPIO_MEM_ADDR14 JZ_GPIO_PORTB(14) -#define JZ_GPIO_MEM_ADDR15 JZ_GPIO_PORTB(15) -#define JZ_GPIO_MEM_ADDR16 JZ_GPIO_PORTB(16) -#define JZ_GPIO_LCD_CLS JZ_GPIO_PORTB(17) -#define JZ_GPIO_LCD_SPL JZ_GPIO_PORTB(18) -#define JZ_GPIO_MEM_DCS JZ_GPIO_PORTB(19) -#define JZ_GPIO_MEM_RAS JZ_GPIO_PORTB(20) -#define JZ_GPIO_MEM_CAS JZ_GPIO_PORTB(21) -#define JZ_GPIO_MEM_SDWE JZ_GPIO_PORTB(22) -#define JZ_GPIO_MEM_CKE JZ_GPIO_PORTB(23) -#define JZ_GPIO_MEM_CKO JZ_GPIO_PORTB(24) -#define JZ_GPIO_MEM_CS0 JZ_GPIO_PORTB(25) -#define JZ_GPIO_MEM_CS1 JZ_GPIO_PORTB(26) -#define JZ_GPIO_MEM_CS2 JZ_GPIO_PORTB(27) -#define JZ_GPIO_MEM_CS3 JZ_GPIO_PORTB(28) -#define JZ_GPIO_MEM_RD JZ_GPIO_PORTB(29) -#define JZ_GPIO_MEM_WR JZ_GPIO_PORTB(30) -#define JZ_GPIO_MEM_WE0 JZ_GPIO_PORTB(31) - -#define JZ_GPIO_FUNC_MEM_ADDR0 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR1 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR2 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR3 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR4 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR5 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR6 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR7 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR8 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR9 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR10 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR11 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR12 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR13 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR14 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR15 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_ADDR16 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_CLS JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_SPL JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_DCS JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_RAS JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_CAS JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_SDWE JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_CKE JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_CKO JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_CS0 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_CS1 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_CS2 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_CS3 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_RD JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_WR JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_WE0 JZ_GPIO_FUNC1 - - -#define JZ_GPIO_MEM_ADDR21 JZ_GPIO_PORTB(17) -#define JZ_GPIO_MEM_ADDR22 JZ_GPIO_PORTB(18) - -#define JZ_GPIO_FUNC_MEM_ADDR21 JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_MEM_ADDR22 JZ_GPIO_FUNC2 - -/* Port C function pins */ -#define JZ_GPIO_LCD_DATA0 JZ_GPIO_PORTC(0) -#define JZ_GPIO_LCD_DATA1 JZ_GPIO_PORTC(1) -#define JZ_GPIO_LCD_DATA2 JZ_GPIO_PORTC(2) -#define JZ_GPIO_LCD_DATA3 JZ_GPIO_PORTC(3) -#define JZ_GPIO_LCD_DATA4 JZ_GPIO_PORTC(4) -#define JZ_GPIO_LCD_DATA5 JZ_GPIO_PORTC(5) -#define JZ_GPIO_LCD_DATA6 JZ_GPIO_PORTC(6) -#define JZ_GPIO_LCD_DATA7 JZ_GPIO_PORTC(7) -#define JZ_GPIO_LCD_DATA8 JZ_GPIO_PORTC(8) -#define JZ_GPIO_LCD_DATA9 JZ_GPIO_PORTC(9) -#define JZ_GPIO_LCD_DATA10 JZ_GPIO_PORTC(10) -#define JZ_GPIO_LCD_DATA11 JZ_GPIO_PORTC(11) -#define JZ_GPIO_LCD_DATA12 JZ_GPIO_PORTC(12) -#define JZ_GPIO_LCD_DATA13 JZ_GPIO_PORTC(13) -#define JZ_GPIO_LCD_DATA14 JZ_GPIO_PORTC(14) -#define JZ_GPIO_LCD_DATA15 JZ_GPIO_PORTC(15) -#define JZ_GPIO_LCD_DATA16 JZ_GPIO_PORTC(16) -#define JZ_GPIO_LCD_DATA17 JZ_GPIO_PORTC(17) -#define JZ_GPIO_LCD_PCLK JZ_GPIO_PORTC(18) -#define JZ_GPIO_LCD_HSYNC JZ_GPIO_PORTC(19) -#define JZ_GPIO_LCD_VSYNC JZ_GPIO_PORTC(20) -#define JZ_GPIO_LCD_DE JZ_GPIO_PORTC(21) -#define JZ_GPIO_LCD_PS JZ_GPIO_PORTC(22) -#define JZ_GPIO_LCD_REV JZ_GPIO_PORTC(23) -#define JZ_GPIO_MEM_WE1 JZ_GPIO_PORTC(24) -#define JZ_GPIO_MEM_WE2 JZ_GPIO_PORTC(25) -#define JZ_GPIO_MEM_WE3 JZ_GPIO_PORTC(26) -#define JZ_GPIO_MEM_WAIT JZ_GPIO_PORTC(27) -#define JZ_GPIO_MEM_FRE JZ_GPIO_PORTC(28) -#define JZ_GPIO_MEM_FWE JZ_GPIO_PORTC(29) - -#define JZ_GPIO_FUNC_LCD_DATA0 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA1 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA2 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA3 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA4 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA5 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA6 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA7 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA8 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA9 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA10 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA11 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA12 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA13 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA14 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA15 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA16 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DATA17 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_PCLK JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_VSYNC JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_HSYNC JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_DE JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_PS JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_LCD_REV JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_WE1 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_WE2 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_WE3 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_WAIT JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_FRE JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MEM_FWE JZ_GPIO_FUNC1 - - -#define JZ_GPIO_MEM_ADDR19 JZ_GPIO_PORTB(22) -#define JZ_GPIO_MEM_ADDR20 JZ_GPIO_PORTB(23) - -#define JZ_GPIO_FUNC_MEM_ADDR19 JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_MEM_ADDR20 JZ_GPIO_FUNC2 - -/* Port D function pins */ -#define JZ_GPIO_CIM_DATA0 JZ_GPIO_PORTD(0) -#define JZ_GPIO_CIM_DATA1 JZ_GPIO_PORTD(1) -#define JZ_GPIO_CIM_DATA2 JZ_GPIO_PORTD(2) -#define JZ_GPIO_CIM_DATA3 JZ_GPIO_PORTD(3) -#define JZ_GPIO_CIM_DATA4 JZ_GPIO_PORTD(4) -#define JZ_GPIO_CIM_DATA5 JZ_GPIO_PORTD(5) -#define JZ_GPIO_CIM_DATA6 JZ_GPIO_PORTD(6) -#define JZ_GPIO_CIM_DATA7 JZ_GPIO_PORTD(7) -#define JZ_GPIO_MSC_CMD JZ_GPIO_PORTD(8) -#define JZ_GPIO_MSC_CLK JZ_GPIO_PORTD(9) -#define JZ_GPIO_MSC_DATA0 JZ_GPIO_PORTD(10) -#define JZ_GPIO_MSC_DATA1 JZ_GPIO_PORTD(11) -#define JZ_GPIO_MSC_DATA2 JZ_GPIO_PORTD(12) -#define JZ_GPIO_MSC_DATA3 JZ_GPIO_PORTD(13) -#define JZ_GPIO_CIM_MCLK JZ_GPIO_PORTD(14) -#define JZ_GPIO_CIM_PCLK JZ_GPIO_PORTD(15) -#define JZ_GPIO_CIM_VSYNC JZ_GPIO_PORTD(16) -#define JZ_GPIO_CIM_HSYNC JZ_GPIO_PORTD(17) -#define JZ_GPIO_SPI_CLK JZ_GPIO_PORTD(18) -#define JZ_GPIO_SPI_CE0 JZ_GPIO_PORTD(19) -#define JZ_GPIO_SPI_DT JZ_GPIO_PORTD(20) -#define JZ_GPIO_SPI_DR JZ_GPIO_PORTD(21) -#define JZ_GPIO_SPI_CE1 JZ_GPIO_PORTD(22) -#define JZ_GPIO_PWM0 JZ_GPIO_PORTD(23) -#define JZ_GPIO_PWM1 JZ_GPIO_PORTD(24) -#define JZ_GPIO_PWM2 JZ_GPIO_PORTD(25) -#define JZ_GPIO_PWM3 JZ_GPIO_PORTD(26) -#define JZ_GPIO_PWM4 JZ_GPIO_PORTD(27) -#define JZ_GPIO_PWM5 JZ_GPIO_PORTD(28) -#define JZ_GPIO_PWM6 JZ_GPIO_PORTD(30) -#define JZ_GPIO_PWM7 JZ_GPIO_PORTD(31) - -#define JZ_GPIO_FUNC_CIM_DATA JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_CIM_DATA0 JZ_GPIO_FUNC_CIM_DATA -#define JZ_GPIO_FUNC_CIM_DATA1 JZ_GPIO_FUNC_CIM_DATA -#define JZ_GPIO_FUNC_CIM_DATA2 JZ_GPIO_FUNC_CIM_DATA -#define JZ_GPIO_FUNC_CIM_DATA3 JZ_GPIO_FUNC_CIM_DATA -#define JZ_GPIO_FUNC_CIM_DATA4 JZ_GPIO_FUNC_CIM_DATA -#define JZ_GPIO_FUNC_CIM_DATA5 JZ_GPIO_FUNC_CIM_DATA -#define JZ_GPIO_FUNC_CIM_DATA6 JZ_GPIO_FUNC_CIM_DATA -#define JZ_GPIO_FUNC_CIM_DATA7 JZ_GPIO_FUNC_CIM_DATA -#define JZ_GPIO_FUNC_MSC_CMD JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MSC_CLK JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MSC_DATA JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_MSC_DATA0 JZ_GPIO_FUNC_MSC_DATA -#define JZ_GPIO_FUNC_MSC_DATA1 JZ_GPIO_FUNC_MSC_DATA -#define JZ_GPIO_FUNC_MSC_DATA2 JZ_GPIO_FUNC_MSC_DATA -#define JZ_GPIO_FUNC_MSC_DATA3 JZ_GPIO_FUNC_MSC_DATA -#define JZ_GPIO_FUNC_CIM_MCLK JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_CIM_PCLK JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_CIM_VSYNC JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_CIM_HSYNC JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_SPI_CLK JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_SPI_CE0 JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_SPI_DT JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_SPI_DR JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_SPI_CE1 JZ_GPIO_FUNC1 - -#define JZ_GPIO_FUNC_PWM JZ_GPIO_FUNC1 -#define JZ_GPIO_FUNC_PWM0 JZ_GPIO_FUNC_PWM -#define JZ_GPIO_FUNC_PWM1 JZ_GPIO_FUNC_PWM -#define JZ_GPIO_FUNC_PWM2 JZ_GPIO_FUNC_PWM -#define JZ_GPIO_FUNC_PWM3 JZ_GPIO_FUNC_PWM -#define JZ_GPIO_FUNC_PWM4 JZ_GPIO_FUNC_PWM -#define JZ_GPIO_FUNC_PWM5 JZ_GPIO_FUNC_PWM -#define JZ_GPIO_FUNC_PWM6 JZ_GPIO_FUNC_PWM -#define JZ_GPIO_FUNC_PWM7 JZ_GPIO_FUNC_PWM - -#define JZ_GPIO_MEM_SCLK_RSTN JZ_GPIO_PORTD(18) -#define JZ_GPIO_MEM_BCLK JZ_GPIO_PORTD(19) -#define JZ_GPIO_MEM_SDATO JZ_GPIO_PORTD(20) -#define JZ_GPIO_MEM_SDATI JZ_GPIO_PORTD(21) -#define JZ_GPIO_MEM_SYNC JZ_GPIO_PORTD(22) -#define JZ_GPIO_I2C_SDA JZ_GPIO_PORTD(23) -#define JZ_GPIO_I2C_SCK JZ_GPIO_PORTD(24) -#define JZ_GPIO_UART0_TXD JZ_GPIO_PORTD(25) -#define JZ_GPIO_UART0_RXD JZ_GPIO_PORTD(26) -#define JZ_GPIO_MEM_ADDR17 JZ_GPIO_PORTD(27) -#define JZ_GPIO_MEM_ADDR18 JZ_GPIO_PORTD(28) -#define JZ_GPIO_UART0_CTS JZ_GPIO_PORTD(30) -#define JZ_GPIO_UART0_RTS JZ_GPIO_PORTD(31) - -#define JZ_GPIO_FUNC_MEM_SCLK_RSTN JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_MEM_BCLK JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_MEM_SDATO JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_MEM_SDATI JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_MEM_SYNC JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_I2C_SDA JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_I2C_SCK JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_UART0_TXD JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_UART0_RXD JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_MEM_ADDR17 JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_MEM_ADDR18 JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_UART0_CTS JZ_GPIO_FUNC2 -#define JZ_GPIO_FUNC_UART0_RTS JZ_GPIO_FUNC2 - -#define JZ_GPIO_UART1_RXD JZ_GPIO_PORTD(30) -#define JZ_GPIO_UART1_TXD JZ_GPIO_PORTD(31) - -#define JZ_GPIO_FUNC_UART1_RXD JZ_GPIO_FUNC3 -#define JZ_GPIO_FUNC_UART1_TXD JZ_GPIO_FUNC3 - #endif diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 3418ec9c1c50..882823bec153 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -116,4 +116,8 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + +#define SO_PEERGROUPS 59 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mips/jz4740/Makefile b/arch/mips/jz4740/Makefile index 39d70bde8cfe..6b9c1f7c31c9 100644 --- a/arch/mips/jz4740/Makefile +++ b/arch/mips/jz4740/Makefile @@ -7,8 +7,6 @@ obj-y += prom.o time.o reset.o setup.o \ platform.o timer.o -obj-$(CONFIG_MACH_JZ4740) += gpio.o - CFLAGS_setup.o = -I$(src)/../../../scripts/dtc/libfdt # board specific support diff --git a/arch/mips/jz4740/board-qi_lb60.c b/arch/mips/jz4740/board-qi_lb60.c index a5bd94b95263..6d7f97552200 100644 --- a/arch/mips/jz4740/board-qi_lb60.c +++ b/arch/mips/jz4740/board-qi_lb60.c @@ -22,6 +22,8 @@ #include <linux/input/matrix_keypad.h> #include <linux/spi/spi.h> #include <linux/spi/spi_gpio.h> +#include <linux/pinctrl/machine.h> +#include <linux/pinctrl/pinconf-generic.h> #include <linux/power_supply.h> #include <linux/power/jz4740-battery.h> #include <linux/power/gpio-charger.h> @@ -159,7 +161,7 @@ static struct jz_nand_platform_data qi_lb60_nand_pdata = { static struct gpiod_lookup_table qi_lb60_nand_gpio_table = { .dev_id = "jz4740-nand.0", .table = { - GPIO_LOOKUP("Bank C", 30, "busy", 0), + GPIO_LOOKUP("GPIOC", 30, "busy", 0), { }, }, }; @@ -421,8 +423,8 @@ static struct platform_device qi_lb60_audio_device = { static struct gpiod_lookup_table qi_lb60_audio_gpio_table = { .dev_id = "qi-lb60-audio", .table = { - GPIO_LOOKUP("Bank B", 29, "snd", 0), - GPIO_LOOKUP("Bank D", 4, "amp", 0), + GPIO_LOOKUP("GPIOB", 29, "snd", 0), + GPIO_LOOKUP("GPIOD", 4, "amp", 0), { }, }, }; @@ -447,13 +449,36 @@ static struct platform_device *jz_platform_devices[] __initdata = { &qi_lb60_audio_device, }; -static void __init board_gpio_setup(void) -{ - /* We only need to enable/disable pullup here for pins used in generic - * drivers. Everything else is done by the drivers themselves. */ - jz_gpio_disable_pullup(QI_LB60_GPIO_SD_VCC_EN_N); - jz_gpio_disable_pullup(QI_LB60_GPIO_SD_CD); -} +static unsigned long pin_cfg_bias_disable[] = { + PIN_CONFIG_BIAS_DISABLE, +}; + +static struct pinctrl_map pin_map[] __initdata = { + /* NAND pin configuration */ + PIN_MAP_MUX_GROUP_DEFAULT("jz4740-nand", + "10010000.jz4740-pinctrl", "nand", "nand-cs1"), + + /* fbdev pin configuration */ + PIN_MAP_MUX_GROUP("jz4740-fb", PINCTRL_STATE_DEFAULT, + "10010000.jz4740-pinctrl", "lcd", "lcd-8bit"), + PIN_MAP_MUX_GROUP("jz4740-fb", PINCTRL_STATE_SLEEP, + "10010000.jz4740-pinctrl", "lcd", "lcd-no-pins"), + + /* MMC pin configuration */ + PIN_MAP_MUX_GROUP_DEFAULT("jz4740-mmc.0", + "10010000.jz4740-pinctrl", "mmc", "mmc-1bit"), + PIN_MAP_MUX_GROUP_DEFAULT("jz4740-mmc.0", + "10010000.jz4740-pinctrl", "mmc", "mmc-4bit"), + PIN_MAP_CONFIGS_PIN_DEFAULT("jz4740-mmc.0", + "10010000.jz4740-pinctrl", "PD0", pin_cfg_bias_disable), + PIN_MAP_CONFIGS_PIN_DEFAULT("jz4740-mmc.0", + "10010000.jz4740-pinctrl", "PD2", pin_cfg_bias_disable), + + /* PWM pin configuration */ + PIN_MAP_MUX_GROUP_DEFAULT("jz4740-pwm", + "10010000.jz4740-pinctrl", "pwm4", "pwm4"), +}; + static int __init qi_lb60_init_platform_devices(void) { @@ -469,6 +494,7 @@ static int __init qi_lb60_init_platform_devices(void) ARRAY_SIZE(qi_lb60_spi_board_info)); pwm_add_table(qi_lb60_pwm_lookup, ARRAY_SIZE(qi_lb60_pwm_lookup)); + pinctrl_register_mappings(pin_map, ARRAY_SIZE(pin_map)); return platform_add_devices(jz_platform_devices, ARRAY_SIZE(jz_platform_devices)); @@ -479,8 +505,6 @@ static int __init qi_lb60_board_setup(void) { printk(KERN_INFO "Qi Hardware JZ4740 QI LB60 setup\n"); - board_gpio_setup(); - if (qi_lb60_init_platform_devices()) panic("Failed to initialize platform devices"); diff --git a/arch/mips/jz4740/gpio.c b/arch/mips/jz4740/gpio.c deleted file mode 100644 index cac1ccde2214..000000000000 --- a/arch/mips/jz4740/gpio.c +++ /dev/null @@ -1,519 +0,0 @@ -/* - * Copyright (C) 2009-2010, Lars-Peter Clausen <lars@metafoo.de> - * JZ4740 platform GPIO support - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/init.h> - -#include <linux/io.h> -#include <linux/gpio/driver.h> -/* FIXME: needed for gpio_request(), try to remove consumer API from driver */ -#include <linux/gpio.h> -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/irqchip/ingenic.h> -#include <linux/bitops.h> - -#include <linux/debugfs.h> -#include <linux/seq_file.h> - -#include <asm/mach-jz4740/base.h> -#include <asm/mach-jz4740/gpio.h> - -#define JZ4740_GPIO_BASE_A (32*0) -#define JZ4740_GPIO_BASE_B (32*1) -#define JZ4740_GPIO_BASE_C (32*2) -#define JZ4740_GPIO_BASE_D (32*3) - -#define JZ4740_GPIO_NUM_A 32 -#define JZ4740_GPIO_NUM_B 32 -#define JZ4740_GPIO_NUM_C 31 -#define JZ4740_GPIO_NUM_D 32 - -#define JZ4740_IRQ_GPIO_BASE_A (JZ4740_IRQ_GPIO(0) + JZ4740_GPIO_BASE_A) -#define JZ4740_IRQ_GPIO_BASE_B (JZ4740_IRQ_GPIO(0) + JZ4740_GPIO_BASE_B) -#define JZ4740_IRQ_GPIO_BASE_C (JZ4740_IRQ_GPIO(0) + JZ4740_GPIO_BASE_C) -#define JZ4740_IRQ_GPIO_BASE_D (JZ4740_IRQ_GPIO(0) + JZ4740_GPIO_BASE_D) - -#define JZ_REG_GPIO_PIN 0x00 -#define JZ_REG_GPIO_DATA 0x10 -#define JZ_REG_GPIO_DATA_SET 0x14 -#define JZ_REG_GPIO_DATA_CLEAR 0x18 -#define JZ_REG_GPIO_MASK 0x20 -#define JZ_REG_GPIO_MASK_SET 0x24 -#define JZ_REG_GPIO_MASK_CLEAR 0x28 -#define JZ_REG_GPIO_PULL 0x30 -#define JZ_REG_GPIO_PULL_SET 0x34 -#define JZ_REG_GPIO_PULL_CLEAR 0x38 -#define JZ_REG_GPIO_FUNC 0x40 -#define JZ_REG_GPIO_FUNC_SET 0x44 -#define JZ_REG_GPIO_FUNC_CLEAR 0x48 -#define JZ_REG_GPIO_SELECT 0x50 -#define JZ_REG_GPIO_SELECT_SET 0x54 -#define JZ_REG_GPIO_SELECT_CLEAR 0x58 -#define JZ_REG_GPIO_DIRECTION 0x60 -#define JZ_REG_GPIO_DIRECTION_SET 0x64 -#define JZ_REG_GPIO_DIRECTION_CLEAR 0x68 -#define JZ_REG_GPIO_TRIGGER 0x70 -#define JZ_REG_GPIO_TRIGGER_SET 0x74 -#define JZ_REG_GPIO_TRIGGER_CLEAR 0x78 -#define JZ_REG_GPIO_FLAG 0x80 -#define JZ_REG_GPIO_FLAG_CLEAR 0x14 - -#define GPIO_TO_BIT(gpio) BIT(gpio & 0x1f) -#define GPIO_TO_REG(gpio, reg) (gpio_to_jz_gpio_chip(gpio)->base + (reg)) -#define CHIP_TO_REG(chip, reg) (gpio_chip_to_jz_gpio_chip(chip)->base + (reg)) - -struct jz_gpio_chip { - unsigned int irq; - unsigned int irq_base; - uint32_t edge_trigger_both; - - void __iomem *base; - - struct gpio_chip gpio_chip; -}; - -static struct jz_gpio_chip jz4740_gpio_chips[]; - -static inline struct jz_gpio_chip *gpio_to_jz_gpio_chip(unsigned int gpio) -{ - return &jz4740_gpio_chips[gpio >> 5]; -} - -static inline struct jz_gpio_chip *gpio_chip_to_jz_gpio_chip(struct gpio_chip *gc) -{ - return gpiochip_get_data(gc); -} - -static inline struct jz_gpio_chip *irq_to_jz_gpio_chip(struct irq_data *data) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data); - return gc->private; -} - -static inline void jz_gpio_write_bit(unsigned int gpio, unsigned int reg) -{ - writel(GPIO_TO_BIT(gpio), GPIO_TO_REG(gpio, reg)); -} - -int jz_gpio_set_function(int gpio, enum jz_gpio_function function) -{ - if (function == JZ_GPIO_FUNC_NONE) { - jz_gpio_write_bit(gpio, JZ_REG_GPIO_FUNC_CLEAR); - jz_gpio_write_bit(gpio, JZ_REG_GPIO_SELECT_CLEAR); - jz_gpio_write_bit(gpio, JZ_REG_GPIO_TRIGGER_CLEAR); - } else { - jz_gpio_write_bit(gpio, JZ_REG_GPIO_FUNC_SET); - jz_gpio_write_bit(gpio, JZ_REG_GPIO_TRIGGER_CLEAR); - switch (function) { - case JZ_GPIO_FUNC1: - jz_gpio_write_bit(gpio, JZ_REG_GPIO_SELECT_CLEAR); - break; - case JZ_GPIO_FUNC3: - jz_gpio_write_bit(gpio, JZ_REG_GPIO_TRIGGER_SET); - case JZ_GPIO_FUNC2: /* Falltrough */ - jz_gpio_write_bit(gpio, JZ_REG_GPIO_SELECT_SET); - break; - default: - BUG(); - break; - } - } - - return 0; -} -EXPORT_SYMBOL_GPL(jz_gpio_set_function); - -int jz_gpio_bulk_request(const struct jz_gpio_bulk_request *request, size_t num) -{ - size_t i; - int ret; - - for (i = 0; i < num; ++i, ++request) { - ret = gpio_request(request->gpio, request->name); - if (ret) - goto err; - jz_gpio_set_function(request->gpio, request->function); - } - - return 0; - -err: - for (--request; i > 0; --i, --request) { - gpio_free(request->gpio); - jz_gpio_set_function(request->gpio, JZ_GPIO_FUNC_NONE); - } - - return ret; -} -EXPORT_SYMBOL_GPL(jz_gpio_bulk_request); - -void jz_gpio_bulk_free(const struct jz_gpio_bulk_request *request, size_t num) -{ - size_t i; - - for (i = 0; i < num; ++i, ++request) { - gpio_free(request->gpio); - jz_gpio_set_function(request->gpio, JZ_GPIO_FUNC_NONE); - } - -} -EXPORT_SYMBOL_GPL(jz_gpio_bulk_free); - -void jz_gpio_bulk_suspend(const struct jz_gpio_bulk_request *request, size_t num) -{ - size_t i; - - for (i = 0; i < num; ++i, ++request) { - jz_gpio_set_function(request->gpio, JZ_GPIO_FUNC_NONE); - jz_gpio_write_bit(request->gpio, JZ_REG_GPIO_DIRECTION_CLEAR); - jz_gpio_write_bit(request->gpio, JZ_REG_GPIO_PULL_SET); - } -} -EXPORT_SYMBOL_GPL(jz_gpio_bulk_suspend); - -void jz_gpio_bulk_resume(const struct jz_gpio_bulk_request *request, size_t num) -{ - size_t i; - - for (i = 0; i < num; ++i, ++request) - jz_gpio_set_function(request->gpio, request->function); -} -EXPORT_SYMBOL_GPL(jz_gpio_bulk_resume); - -void jz_gpio_enable_pullup(unsigned gpio) -{ - jz_gpio_write_bit(gpio, JZ_REG_GPIO_PULL_CLEAR); -} -EXPORT_SYMBOL_GPL(jz_gpio_enable_pullup); - -void jz_gpio_disable_pullup(unsigned gpio) -{ - jz_gpio_write_bit(gpio, JZ_REG_GPIO_PULL_SET); -} -EXPORT_SYMBOL_GPL(jz_gpio_disable_pullup); - -static int jz_gpio_get_value(struct gpio_chip *chip, unsigned gpio) -{ - return !!(readl(CHIP_TO_REG(chip, JZ_REG_GPIO_PIN)) & BIT(gpio)); -} - -static void jz_gpio_set_value(struct gpio_chip *chip, unsigned gpio, int value) -{ - uint32_t __iomem *reg = CHIP_TO_REG(chip, JZ_REG_GPIO_DATA_SET); - reg += !value; - writel(BIT(gpio), reg); -} - -static int jz_gpio_direction_output(struct gpio_chip *chip, unsigned gpio, - int value) -{ - writel(BIT(gpio), CHIP_TO_REG(chip, JZ_REG_GPIO_DIRECTION_SET)); - jz_gpio_set_value(chip, gpio, value); - - return 0; -} - -static int jz_gpio_direction_input(struct gpio_chip *chip, unsigned gpio) -{ - writel(BIT(gpio), CHIP_TO_REG(chip, JZ_REG_GPIO_DIRECTION_CLEAR)); - - return 0; -} - -static int jz_gpio_to_irq(struct gpio_chip *chip, unsigned gpio) -{ - struct jz_gpio_chip *jz_gpio = gpiochip_get_data(chip); - - return jz_gpio->irq_base + gpio; -} - -int jz_gpio_port_direction_input(int port, uint32_t mask) -{ - writel(mask, GPIO_TO_REG(port, JZ_REG_GPIO_DIRECTION_CLEAR)); - - return 0; -} -EXPORT_SYMBOL(jz_gpio_port_direction_input); - -int jz_gpio_port_direction_output(int port, uint32_t mask) -{ - writel(mask, GPIO_TO_REG(port, JZ_REG_GPIO_DIRECTION_SET)); - - return 0; -} -EXPORT_SYMBOL(jz_gpio_port_direction_output); - -void jz_gpio_port_set_value(int port, uint32_t value, uint32_t mask) -{ - writel(~value & mask, GPIO_TO_REG(port, JZ_REG_GPIO_DATA_CLEAR)); - writel(value & mask, GPIO_TO_REG(port, JZ_REG_GPIO_DATA_SET)); -} -EXPORT_SYMBOL(jz_gpio_port_set_value); - -uint32_t jz_gpio_port_get_value(int port, uint32_t mask) -{ - uint32_t value = readl(GPIO_TO_REG(port, JZ_REG_GPIO_PIN)); - - return value & mask; -} -EXPORT_SYMBOL(jz_gpio_port_get_value); - -#define IRQ_TO_BIT(irq) BIT((irq - JZ4740_IRQ_GPIO(0)) & 0x1f) - -static void jz_gpio_check_trigger_both(struct jz_gpio_chip *chip, unsigned int irq) -{ - uint32_t value; - void __iomem *reg; - uint32_t mask = IRQ_TO_BIT(irq); - - if (!(chip->edge_trigger_both & mask)) - return; - - reg = chip->base; - - value = readl(chip->base + JZ_REG_GPIO_PIN); - if (value & mask) - reg += JZ_REG_GPIO_DIRECTION_CLEAR; - else - reg += JZ_REG_GPIO_DIRECTION_SET; - - writel(mask, reg); -} - -static void jz_gpio_irq_demux_handler(struct irq_desc *desc) -{ - uint32_t flag; - unsigned int gpio_irq; - struct jz_gpio_chip *chip = irq_desc_get_handler_data(desc); - - flag = readl(chip->base + JZ_REG_GPIO_FLAG); - if (!flag) - return; - - gpio_irq = chip->irq_base + __fls(flag); - - jz_gpio_check_trigger_both(chip, gpio_irq); - - generic_handle_irq(gpio_irq); -}; - -static inline void jz_gpio_set_irq_bit(struct irq_data *data, unsigned int reg) -{ - struct jz_gpio_chip *chip = irq_to_jz_gpio_chip(data); - writel(IRQ_TO_BIT(data->irq), chip->base + reg); -} - -static void jz_gpio_irq_unmask(struct irq_data *data) -{ - struct jz_gpio_chip *chip = irq_to_jz_gpio_chip(data); - - jz_gpio_check_trigger_both(chip, data->irq); - irq_gc_unmask_enable_reg(data); -}; - -/* TODO: Check if function is gpio */ -static unsigned int jz_gpio_irq_startup(struct irq_data *data) -{ - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_SELECT_SET); - jz_gpio_irq_unmask(data); - return 0; -} - -static void jz_gpio_irq_shutdown(struct irq_data *data) -{ - irq_gc_mask_disable_reg(data); - - /* Set direction to input */ - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_DIRECTION_CLEAR); - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_SELECT_CLEAR); -} - -static int jz_gpio_irq_set_type(struct irq_data *data, unsigned int flow_type) -{ - struct jz_gpio_chip *chip = irq_to_jz_gpio_chip(data); - unsigned int irq = data->irq; - - if (flow_type == IRQ_TYPE_EDGE_BOTH) { - uint32_t value = readl(chip->base + JZ_REG_GPIO_PIN); - if (value & IRQ_TO_BIT(irq)) - flow_type = IRQ_TYPE_EDGE_FALLING; - else - flow_type = IRQ_TYPE_EDGE_RISING; - chip->edge_trigger_both |= IRQ_TO_BIT(irq); - } else { - chip->edge_trigger_both &= ~IRQ_TO_BIT(irq); - } - - switch (flow_type) { - case IRQ_TYPE_EDGE_RISING: - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_DIRECTION_SET); - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_TRIGGER_SET); - break; - case IRQ_TYPE_EDGE_FALLING: - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_DIRECTION_CLEAR); - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_TRIGGER_SET); - break; - case IRQ_TYPE_LEVEL_HIGH: - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_DIRECTION_SET); - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_TRIGGER_CLEAR); - break; - case IRQ_TYPE_LEVEL_LOW: - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_DIRECTION_CLEAR); - jz_gpio_set_irq_bit(data, JZ_REG_GPIO_TRIGGER_CLEAR); - break; - default: - return -EINVAL; - } - - return 0; -} - -static int jz_gpio_irq_set_wake(struct irq_data *data, unsigned int on) -{ - struct jz_gpio_chip *chip = irq_to_jz_gpio_chip(data); - - irq_gc_set_wake(data, on); - irq_set_irq_wake(chip->irq, on); - - return 0; -} - -#define JZ4740_GPIO_CHIP(_bank) { \ - .irq_base = JZ4740_IRQ_GPIO_BASE_ ## _bank, \ - .gpio_chip = { \ - .label = "Bank " # _bank, \ - .owner = THIS_MODULE, \ - .set = jz_gpio_set_value, \ - .get = jz_gpio_get_value, \ - .direction_output = jz_gpio_direction_output, \ - .direction_input = jz_gpio_direction_input, \ - .to_irq = jz_gpio_to_irq, \ - .base = JZ4740_GPIO_BASE_ ## _bank, \ - .ngpio = JZ4740_GPIO_NUM_ ## _bank, \ - }, \ -} - -static struct jz_gpio_chip jz4740_gpio_chips[] = { - JZ4740_GPIO_CHIP(A), - JZ4740_GPIO_CHIP(B), - JZ4740_GPIO_CHIP(C), - JZ4740_GPIO_CHIP(D), -}; - -static void jz4740_gpio_chip_init(struct jz_gpio_chip *chip, unsigned int id) -{ - struct irq_chip_generic *gc; - struct irq_chip_type *ct; - - chip->base = ioremap(JZ4740_GPIO_BASE_ADDR + (id * 0x100), 0x100); - - chip->irq = JZ4740_IRQ_INTC_GPIO(id); - irq_set_chained_handler_and_data(chip->irq, - jz_gpio_irq_demux_handler, chip); - - gc = irq_alloc_generic_chip(chip->gpio_chip.label, 1, chip->irq_base, - chip->base, handle_level_irq); - - gc->wake_enabled = IRQ_MSK(chip->gpio_chip.ngpio); - gc->private = chip; - - ct = gc->chip_types; - ct->regs.enable = JZ_REG_GPIO_MASK_CLEAR; - ct->regs.disable = JZ_REG_GPIO_MASK_SET; - ct->regs.ack = JZ_REG_GPIO_FLAG_CLEAR; - - ct->chip.name = "GPIO"; - ct->chip.irq_mask = irq_gc_mask_disable_reg; - ct->chip.irq_unmask = jz_gpio_irq_unmask; - ct->chip.irq_ack = irq_gc_ack_set_bit; - ct->chip.irq_suspend = ingenic_intc_irq_suspend; - ct->chip.irq_resume = ingenic_intc_irq_resume; - ct->chip.irq_startup = jz_gpio_irq_startup; - ct->chip.irq_shutdown = jz_gpio_irq_shutdown; - ct->chip.irq_set_type = jz_gpio_irq_set_type; - ct->chip.irq_set_wake = jz_gpio_irq_set_wake; - ct->chip.flags = IRQCHIP_SET_TYPE_MASKED; - - irq_setup_generic_chip(gc, IRQ_MSK(chip->gpio_chip.ngpio), - IRQ_GC_INIT_NESTED_LOCK, 0, IRQ_NOPROBE | IRQ_LEVEL); - - gpiochip_add_data(&chip->gpio_chip, chip); -} - -static int __init jz4740_gpio_init(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(jz4740_gpio_chips); ++i) - jz4740_gpio_chip_init(&jz4740_gpio_chips[i], i); - - printk(KERN_INFO "JZ4740 GPIO initialized\n"); - - return 0; -} -arch_initcall(jz4740_gpio_init); - -#ifdef CONFIG_DEBUG_FS - -static inline void gpio_seq_reg(struct seq_file *s, struct jz_gpio_chip *chip, - const char *name, unsigned int reg) -{ - seq_printf(s, "\t%s: %08x\n", name, readl(chip->base + reg)); -} - -static int gpio_regs_show(struct seq_file *s, void *unused) -{ - struct jz_gpio_chip *chip = jz4740_gpio_chips; - int i; - - for (i = 0; i < ARRAY_SIZE(jz4740_gpio_chips); ++i, ++chip) { - seq_printf(s, "==GPIO %d==\n", i); - gpio_seq_reg(s, chip, "Pin", JZ_REG_GPIO_PIN); - gpio_seq_reg(s, chip, "Data", JZ_REG_GPIO_DATA); - gpio_seq_reg(s, chip, "Mask", JZ_REG_GPIO_MASK); - gpio_seq_reg(s, chip, "Pull", JZ_REG_GPIO_PULL); - gpio_seq_reg(s, chip, "Func", JZ_REG_GPIO_FUNC); - gpio_seq_reg(s, chip, "Select", JZ_REG_GPIO_SELECT); - gpio_seq_reg(s, chip, "Direction", JZ_REG_GPIO_DIRECTION); - gpio_seq_reg(s, chip, "Trigger", JZ_REG_GPIO_TRIGGER); - gpio_seq_reg(s, chip, "Flag", JZ_REG_GPIO_FLAG); - } - - return 0; -} - -static int gpio_regs_open(struct inode *inode, struct file *file) -{ - return single_open(file, gpio_regs_show, NULL); -} - -static const struct file_operations gpio_regs_operations = { - .open = gpio_regs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init gpio_debugfs_init(void) -{ - (void) debugfs_create_file("jz_regs_gpio", S_IFREG | S_IRUGO, - NULL, NULL, &gpio_regs_operations); - return 0; -} -subsys_initcall(gpio_debugfs_init); - -#endif diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index a563759fd142..6a0d7040d882 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -1094,7 +1094,7 @@ static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu, struct mm_struct *mm; int i; - if (likely(!vcpu->requests)) + if (likely(!kvm_request_pending(vcpu))) return; if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 71d8856ade64..74805035edc8 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -2337,7 +2337,7 @@ static int kvm_vz_check_requests(struct kvm_vcpu *vcpu, int cpu) int ret = 0; int i; - if (!vcpu->requests) + if (!kvm_request_pending(vcpu)) return 0; if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index 4526e92301a6..c710db354ff2 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -98,4 +98,8 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + +#define SO_PEERGROUPS 59 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h index 9a2a8956a695..2b16282add69 100644 --- a/arch/parisc/include/asm/dma-mapping.h +++ b/arch/parisc/include/asm/dma-mapping.h @@ -20,8 +20,6 @@ ** flush/purge and allocate "regular" cacheable pages for everything. */ -#define DMA_ERROR_CODE (~(dma_addr_t)0) - #ifdef CONFIG_PA11 extern const struct dma_map_ops pcxl_dma_ops; extern const struct dma_map_ops pcx_dma_ops; diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 514701840bd9..a0d4dc9f4eb2 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -97,4 +97,8 @@ #define SO_COOKIE 0x4032 +#define SCM_TIMESTAMPING_PKTINFO 0x4033 + +#define SO_PEERGROUPS 0x4034 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/powerpc/boot/dts/fsl/kmcent2.dts b/arch/powerpc/boot/dts/fsl/kmcent2.dts index 47afa438602e..5922c1ea0e96 100644 --- a/arch/powerpc/boot/dts/fsl/kmcent2.dts +++ b/arch/powerpc/boot/dts/fsl/kmcent2.dts @@ -293,9 +293,7 @@ compatible = "fsl,ucc-hdlc"; rx-clock-name = "clk9"; tx-clock-name = "clk9"; - fsl,tx-timeslot-mask = <0xfffffffe>; - fsl,rx-timeslot-mask = <0xfffffffe>; - fsl,siram-entry-id = <0>; + fsl,hdlc-bus; }; }; }; diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index d73755fafbb0..57d38b504ff7 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -295,6 +295,8 @@ #define H_DISABLE_ALL_VIO_INTS 0x0A #define H_DISABLE_VIO_INTERRUPT 0x0B #define H_ENABLE_VIO_INTERRUPT 0x0C +#define H_GET_SESSION_TOKEN 0x19 +#define H_SESSION_ERR_DETECTED 0x1A /* Platform specific hcalls, used by KVM */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 2bf35017ffc0..b8d5b8e35244 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -86,7 +86,6 @@ struct kvmppc_vcore { u16 last_cpu; u8 vcore_state; u8 in_guest; - struct kvmppc_vcore *master_vcore; struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS]; struct list_head preempt_list; spinlock_t lock; diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index b148496ffe36..7cea76f11c26 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -81,7 +81,7 @@ struct kvm_split_mode { u8 subcore_size; u8 do_nap; u8 napped[MAX_SMT_THREADS]; - struct kvmppc_vcore *master_vcs[MAX_SUBCORES]; + struct kvmppc_vcore *vc[MAX_SUBCORES]; }; /* diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9c51ac4b8f36..8b3f1238d07f 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -35,6 +35,7 @@ #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/hvcall.h> +#include <asm/mce.h> #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS @@ -52,8 +53,8 @@ #define KVM_IRQCHIP_NUM_PINS 256 /* PPC-specific vcpu->requests bit members */ -#define KVM_REQ_WATCHDOG 8 -#define KVM_REQ_EPR_EXIT 9 +#define KVM_REQ_WATCHDOG KVM_ARCH_REQ(0) +#define KVM_REQ_EPR_EXIT KVM_ARCH_REQ(1) #include <linux/mmu_notifier.h> @@ -267,6 +268,8 @@ struct kvm_resize_hpt; struct kvm_arch { unsigned int lpid; + unsigned int smt_mode; /* # vcpus per virtual core */ + unsigned int emul_smt_mode; /* emualted SMT mode, on P9 */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE unsigned int tlb_sets; struct kvm_hpt_info hpt; @@ -285,6 +288,7 @@ struct kvm_arch { cpumask_t need_tlb_flush; cpumask_t cpu_in_guest; u8 radix; + u8 fwnmi_enabled; pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; @@ -566,6 +570,7 @@ struct kvm_vcpu_arch { ulong wort; ulong tid; ulong psscr; + ulong hfscr; ulong shadow_srr1; #endif u32 vrsave; /* also USPRG0 */ @@ -579,7 +584,7 @@ struct kvm_vcpu_arch { ulong mcsrr0; ulong mcsrr1; ulong mcsr; - u32 dec; + ulong dec; #ifdef CONFIG_BOOKE u32 decar; #endif @@ -710,6 +715,7 @@ struct kvm_vcpu_arch { unsigned long pending_exceptions; u8 ceded; u8 prodded; + u8 doorbell_request; u32 last_inst; struct swait_queue_head *wqp; @@ -722,6 +728,7 @@ struct kvm_vcpu_arch { int prev_cpu; bool timer_running; wait_queue_head_t cpu_run; + struct machine_check_event mce_evt; /* Valid if trap == 0x200 */ struct kvm_vcpu_arch_shared *shared; #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e0d88c38602b..ba5fadd6f3c9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -315,6 +315,8 @@ struct kvmppc_ops { struct irq_bypass_producer *); int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg); int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); + int (*set_smt_mode)(struct kvm *kvm, unsigned long mode, + unsigned long flags); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 3a8d278e7421..1a9b45198c06 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -103,6 +103,8 @@ #define OP_31_XOP_STBUX 247 #define OP_31_XOP_LHZX 279 #define OP_31_XOP_LHZUX 311 +#define OP_31_XOP_MSGSNDP 142 +#define OP_31_XOP_MSGCLRP 174 #define OP_31_XOP_MFSPR 339 #define OP_31_XOP_LWAX 341 #define OP_31_XOP_LHAX 343 diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 07fbeb927834..8cf8f0c96906 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -60,6 +60,12 @@ struct kvm_regs { #define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */ +/* flags for kvm_run.flags */ +#define KVM_RUN_PPC_NMI_DISP_MASK (3 << 0) +#define KVM_RUN_PPC_NMI_DISP_FULLY_RECOV (1 << 0) +#define KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV (2 << 0) +#define KVM_RUN_PPC_NMI_DISP_NOT_RECOV (3 << 0) + /* * Feature bits indicate which sections of the sregs struct are valid, * both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index 58e2ec0310fc..3c590c7c42c0 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -8,28 +8,6 @@ * 2 of the License, or (at your option) any later version. */ -#include <asm/sockios.h> - -/* For setsockopt(2) */ -#define SOL_SOCKET 1 - -#define SO_DEBUG 1 -#define SO_REUSEADDR 2 -#define SO_TYPE 3 -#define SO_ERROR 4 -#define SO_DONTROUTE 5 -#define SO_BROADCAST 6 -#define SO_SNDBUF 7 -#define SO_RCVBUF 8 -#define SO_SNDBUFFORCE 32 -#define SO_RCVBUFFORCE 33 -#define SO_KEEPALIVE 9 -#define SO_OOBINLINE 10 -#define SO_NO_CHECK 11 -#define SO_PRIORITY 12 -#define SO_LINGER 13 -#define SO_BSDCOMPAT 14 -#define SO_REUSEPORT 15 #define SO_RCVLOWAT 16 #define SO_SNDLOWAT 17 #define SO_RCVTIMEO 18 @@ -37,72 +15,6 @@ #define SO_PASSCRED 20 #define SO_PEERCRED 21 -/* Security levels - as per NRL IPv6 - don't actually do anything */ -#define SO_SECURITY_AUTHENTICATION 22 -#define SO_SECURITY_ENCRYPTION_TRANSPORT 23 -#define SO_SECURITY_ENCRYPTION_NETWORK 24 - -#define SO_BINDTODEVICE 25 - -/* Socket filtering */ -#define SO_ATTACH_FILTER 26 -#define SO_DETACH_FILTER 27 -#define SO_GET_FILTER SO_ATTACH_FILTER - -#define SO_PEERNAME 28 -#define SO_TIMESTAMP 29 -#define SCM_TIMESTAMP SO_TIMESTAMP - -#define SO_ACCEPTCONN 30 - -#define SO_PEERSEC 31 -#define SO_PASSSEC 34 -#define SO_TIMESTAMPNS 35 -#define SCM_TIMESTAMPNS SO_TIMESTAMPNS - -#define SO_MARK 36 - -#define SO_TIMESTAMPING 37 -#define SCM_TIMESTAMPING SO_TIMESTAMPING - -#define SO_PROTOCOL 38 -#define SO_DOMAIN 39 - -#define SO_RXQ_OVFL 40 - -#define SO_WIFI_STATUS 41 -#define SCM_WIFI_STATUS SO_WIFI_STATUS -#define SO_PEEK_OFF 42 - -/* Instruct lower device to use last 4-bytes of skb data as FCS */ -#define SO_NOFCS 43 - -#define SO_LOCK_FILTER 44 - -#define SO_SELECT_ERR_QUEUE 45 - -#define SO_BUSY_POLL 46 - -#define SO_MAX_PACING_RATE 47 - -#define SO_BPF_EXTENSIONS 48 - -#define SO_INCOMING_CPU 49 - -#define SO_ATTACH_BPF 50 -#define SO_DETACH_BPF SO_DETACH_FILTER - -#define SO_ATTACH_REUSEPORT_CBPF 51 -#define SO_ATTACH_REUSEPORT_EBPF 52 - -#define SO_CNX_ADVICE 53 - -#define SCM_TIMESTAMPING_OPT_STATS 54 - -#define SO_MEMINFO 55 - -#define SO_INCOMING_NAPI_ID 56 - -#define SO_COOKIE 57 +#include <asm-generic/socket.h> #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 709e23425317..ae8e89e0d083 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -485,6 +485,7 @@ int main(void) OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls); OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); OFFSET(KVM_RADIX, kvm, arch.radix); + OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled); OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); @@ -513,6 +514,7 @@ int main(void) OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); + OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc); @@ -542,6 +544,7 @@ int main(void) OFFSET(VCPU_WORT, kvm_vcpu, arch.wort); OFFSET(VCPU_TID, kvm_vcpu, arch.tid); OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr); + OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr); OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map); OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest); OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 5f9eada3519b..a9bfa49f3698 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -405,6 +405,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; } } +EXPORT_SYMBOL_GPL(machine_check_print_event_info); uint64_t get_mce_fault_addr(struct machine_check_event *evt) { diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index eae61b044e9e..496d6393bd41 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -792,21 +792,17 @@ static ssize_t dev_nvram_write(struct file *file, const char __user *buf, count = min_t(size_t, count, size - *ppos); count = min(count, PAGE_SIZE); - ret = -ENOMEM; - tmp = kmalloc(count, GFP_KERNEL); - if (!tmp) - goto out; - - ret = -EFAULT; - if (copy_from_user(tmp, buf, count)) + tmp = memdup_user(buf, count); + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); goto out; + } ret = ppc_md.nvram_write(tmp, count, ppos); -out: kfree(tmp); +out: return ret; - } static long dev_nvram_ioctl(struct file *file, unsigned int cmd, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 773b35d16a0b..0b436df746fc 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -46,6 +46,8 @@ #include <linux/of.h> #include <asm/reg.h> +#include <asm/ppc-opcode.h> +#include <asm/disassemble.h> #include <asm/cputable.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -645,6 +647,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, unsigned long stolen; unsigned long core_stolen; u64 now; + unsigned long flags; dt = vcpu->arch.dtl_ptr; vpa = vcpu->arch.vpa.pinned_addr; @@ -652,10 +655,10 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, core_stolen = vcore_stolen_time(vc, now); stolen = core_stolen - vcpu->arch.stolen_logged; vcpu->arch.stolen_logged = core_stolen; - spin_lock_irq(&vcpu->arch.tbacct_lock); + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); stolen += vcpu->arch.busy_stolen; vcpu->arch.busy_stolen = 0; - spin_unlock_irq(&vcpu->arch.tbacct_lock); + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); if (!dt || !vpa) return; memset(dt, 0, sizeof(struct dtl_entry)); @@ -675,6 +678,26 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, vcpu->arch.dtl.dirty = true; } +/* See if there is a doorbell interrupt pending for a vcpu */ +static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) +{ + int thr; + struct kvmppc_vcore *vc; + + if (vcpu->arch.doorbell_request) + return true; + /* + * Ensure that the read of vcore->dpdes comes after the read + * of vcpu->doorbell_request. This barrier matches the + * lwsync in book3s_hv_rmhandlers.S just before the + * fast_guest_return label. + */ + smp_rmb(); + vc = vcpu->arch.vcore; + thr = vcpu->vcpu_id - vc->first_vcpuid; + return !!(vc->dpdes & (1 << thr)); +} + static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu) { if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207) @@ -926,6 +949,101 @@ static int kvmppc_emulate_debug_inst(struct kvm_run *run, } } +static void do_nothing(void *x) +{ +} + +static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu) +{ + int thr, cpu, pcpu, nthreads; + struct kvm_vcpu *v; + unsigned long dpdes; + + nthreads = vcpu->kvm->arch.emul_smt_mode; + dpdes = 0; + cpu = vcpu->vcpu_id & ~(nthreads - 1); + for (thr = 0; thr < nthreads; ++thr, ++cpu) { + v = kvmppc_find_vcpu(vcpu->kvm, cpu); + if (!v) + continue; + /* + * If the vcpu is currently running on a physical cpu thread, + * interrupt it in order to pull it out of the guest briefly, + * which will update its vcore->dpdes value. + */ + pcpu = READ_ONCE(v->cpu); + if (pcpu >= 0) + smp_call_function_single(pcpu, do_nothing, NULL, 1); + if (kvmppc_doorbell_pending(v)) + dpdes |= 1 << thr; + } + return dpdes; +} + +/* + * On POWER9, emulate doorbell-related instructions in order to + * give the guest the illusion of running on a multi-threaded core. + * The instructions emulated are msgsndp, msgclrp, mfspr TIR, + * and mfspr DPDES. + */ +static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu) +{ + u32 inst, rb, thr; + unsigned long arg; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tvcpu; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return EMULATE_FAIL; + if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE) + return RESUME_GUEST; + if (get_op(inst) != 31) + return EMULATE_FAIL; + rb = get_rb(inst); + thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1); + switch (get_xop(inst)) { + case OP_31_XOP_MSGSNDP: + arg = kvmppc_get_gpr(vcpu, rb); + if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER) + break; + arg &= 0x3f; + if (arg >= kvm->arch.emul_smt_mode) + break; + tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg); + if (!tvcpu) + break; + if (!tvcpu->arch.doorbell_request) { + tvcpu->arch.doorbell_request = 1; + kvmppc_fast_vcpu_kick_hv(tvcpu); + } + break; + case OP_31_XOP_MSGCLRP: + arg = kvmppc_get_gpr(vcpu, rb); + if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER) + break; + vcpu->arch.vcore->dpdes = 0; + vcpu->arch.doorbell_request = 0; + break; + case OP_31_XOP_MFSPR: + switch (get_sprn(inst)) { + case SPRN_TIR: + arg = thr; + break; + case SPRN_DPDES: + arg = kvmppc_read_dpdes(vcpu); + break; + default: + return EMULATE_FAIL; + } + kvmppc_set_gpr(vcpu, get_rt(inst), arg); + break; + default: + return EMULATE_FAIL; + } + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); + return RESUME_GUEST; +} + static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { @@ -971,15 +1089,20 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: - /* - * Deliver a machine check interrupt to the guest. - * We have to do this, even if the host has handled the - * machine check, because machine checks use SRR0/1 and - * the interrupt might have trashed guest state in them. - */ - kvmppc_book3s_queue_irqprio(vcpu, - BOOK3S_INTERRUPT_MACHINE_CHECK); - r = RESUME_GUEST; + /* Exit to guest with KVM_EXIT_NMI as exit reason */ + run->exit_reason = KVM_EXIT_NMI; + run->hw.hardware_exit_reason = vcpu->arch.trap; + /* Clear out the old NMI status from run->flags */ + run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK; + /* Now set the NMI status */ + if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED) + run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; + else + run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV; + + r = RESUME_HOST; + /* Print the MCE event to host console. */ + machine_check_print_event_info(&vcpu->arch.mce_evt, false); break; case BOOK3S_INTERRUPT_PROGRAM: { @@ -1048,12 +1171,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* * This occurs if the guest (kernel or userspace), does something that - * is prohibited by HFSCR. We just generate a program interrupt to - * the guest. + * is prohibited by HFSCR. + * On POWER9, this could be a doorbell instruction that we need + * to emulate. + * Otherwise, we just generate a program interrupt to the guest. */ case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - r = RESUME_GUEST; + r = EMULATE_FAIL; + if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) + r = kvmppc_emulate_doorbell_instr(vcpu); + if (r == EMULATE_FAIL) { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + r = RESUME_GUEST; + } break; case BOOK3S_INTERRUPT_HV_RM_HARD: r = RESUME_PASSTHROUGH; @@ -1143,6 +1273,12 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; if (cpu_has_feature(CPU_FTR_ARCH_207S)) mask |= LPCR_AIL; + /* + * On POWER9, allow userspace to enable large decrementer for the + * guest, whether or not the host has it enabled. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + mask |= LPCR_LD; /* Broken 32-bit version of LPCR must not clear top bits */ if (preserve_top32) @@ -1611,7 +1747,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) init_swait_queue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; - vcore->first_vcpuid = core * threads_per_vcore(); + vcore->first_vcpuid = core * kvm->arch.smt_mode; vcore->kvm = kvm; INIT_LIST_HEAD(&vcore->preempt_list); @@ -1770,14 +1906,10 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; - int err = -EINVAL; + int err; int core; struct kvmppc_vcore *vcore; - core = id / threads_per_vcore(); - if (core >= KVM_MAX_VCORES) - goto out; - err = -ENOMEM; vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) @@ -1808,6 +1940,20 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, vcpu->arch.busy_preempt = TB_NIL; vcpu->arch.intr_msr = MSR_SF | MSR_ME; + /* + * Set the default HFSCR for the guest from the host value. + * This value is only used on POWER9. + * On POWER9 DD1, TM doesn't work, so we make sure to + * prevent the guest from using it. + * On POWER9, we want to virtualize the doorbell facility, so we + * turn off the HFSCR bit, which causes those instructions to trap. + */ + vcpu->arch.hfscr = mfspr(SPRN_HFSCR); + if (!cpu_has_feature(CPU_FTR_TM)) + vcpu->arch.hfscr &= ~HFSCR_TM; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + vcpu->arch.hfscr &= ~HFSCR_MSGP; + kvmppc_mmu_book3s_hv_init(vcpu); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; @@ -1815,11 +1961,17 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, init_waitqueue_head(&vcpu->arch.cpu_run); mutex_lock(&kvm->lock); - vcore = kvm->arch.vcores[core]; - if (!vcore) { - vcore = kvmppc_vcore_create(kvm, core); - kvm->arch.vcores[core] = vcore; - kvm->arch.online_vcores++; + vcore = NULL; + err = -EINVAL; + core = id / kvm->arch.smt_mode; + if (core < KVM_MAX_VCORES) { + vcore = kvm->arch.vcores[core]; + if (!vcore) { + err = -ENOMEM; + vcore = kvmppc_vcore_create(kvm, core); + kvm->arch.vcores[core] = vcore; + kvm->arch.online_vcores++; + } } mutex_unlock(&kvm->lock); @@ -1847,6 +1999,43 @@ out: return ERR_PTR(err); } +static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode, + unsigned long flags) +{ + int err; + int esmt = 0; + + if (flags) + return -EINVAL; + if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode)) + return -EINVAL; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + /* + * On POWER8 (or POWER7), the threading mode is "strict", + * so we pack smt_mode vcpus per vcore. + */ + if (smt_mode > threads_per_subcore) + return -EINVAL; + } else { + /* + * On POWER9, the threading mode is "loose", + * so each vcpu gets its own vcore. + */ + esmt = smt_mode; + smt_mode = 1; + } + mutex_lock(&kvm->lock); + err = -EBUSY; + if (!kvm->arch.online_vcores) { + kvm->arch.smt_mode = smt_mode; + kvm->arch.emul_smt_mode = esmt; + err = 0; + } + mutex_unlock(&kvm->lock); + + return err; +} + static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) { if (vpa->pinned_addr) @@ -1897,7 +2086,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu) } } -extern void __kvmppc_vcore_entry(void); +extern int __kvmppc_vcore_entry(void); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu) @@ -1962,10 +2151,6 @@ static void kvmppc_release_hwthread(int cpu) tpaca->kvm_hstate.kvm_split_mode = NULL; } -static void do_nothing(void *x) -{ -} - static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) { int i; @@ -1983,11 +2168,35 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) smp_call_function_single(cpu + i, do_nothing, NULL, 1); } +static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) +{ + struct kvm *kvm = vcpu->kvm; + + /* + * With radix, the guest can do TLB invalidations itself, + * and it could choose to use the local form (tlbiel) if + * it is invalidating a translation that has only ever been + * used on one vcpu. However, that doesn't mean it has + * only ever been used on one physical cpu, since vcpus + * can move around between pcpus. To cope with this, when + * a vcpu moves from one pcpu to another, we need to tell + * any vcpus running on the same core as this vcpu previously + * ran to flush the TLB. The TLB is shared between threads, + * so we use a single bit in .need_tlb_flush for all 4 threads. + */ + if (vcpu->arch.prev_cpu != pcpu) { + if (vcpu->arch.prev_cpu >= 0 && + cpu_first_thread_sibling(vcpu->arch.prev_cpu) != + cpu_first_thread_sibling(pcpu)) + radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); + vcpu->arch.prev_cpu = pcpu; + } +} + static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { int cpu; struct paca_struct *tpaca; - struct kvmppc_vcore *mvc = vc->master_vcore; struct kvm *kvm = vc->kvm; cpu = vc->pcpu; @@ -1997,36 +2206,16 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) vcpu->arch.timer_running = 0; } cpu += vcpu->arch.ptid; - vcpu->cpu = mvc->pcpu; + vcpu->cpu = vc->pcpu; vcpu->arch.thread_cpu = cpu; - - /* - * With radix, the guest can do TLB invalidations itself, - * and it could choose to use the local form (tlbiel) if - * it is invalidating a translation that has only ever been - * used on one vcpu. However, that doesn't mean it has - * only ever been used on one physical cpu, since vcpus - * can move around between pcpus. To cope with this, when - * a vcpu moves from one pcpu to another, we need to tell - * any vcpus running on the same core as this vcpu previously - * ran to flush the TLB. The TLB is shared between threads, - * so we use a single bit in .need_tlb_flush for all 4 threads. - */ - if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) { - if (vcpu->arch.prev_cpu >= 0 && - cpu_first_thread_sibling(vcpu->arch.prev_cpu) != - cpu_first_thread_sibling(cpu)) - radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); - vcpu->arch.prev_cpu = cpu; - } cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); } tpaca = &paca[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; - tpaca->kvm_hstate.ptid = cpu - mvc->pcpu; + tpaca->kvm_hstate.ptid = cpu - vc->pcpu; /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */ smp_wmb(); - tpaca->kvm_hstate.kvm_vcore = mvc; + tpaca->kvm_hstate.kvm_vcore = vc; if (cpu != smp_processor_id()) kvmppc_ipi_thread(cpu); } @@ -2155,8 +2344,7 @@ struct core_info { int max_subcore_threads; int total_threads; int subcore_threads[MAX_SUBCORES]; - struct kvm *subcore_vm[MAX_SUBCORES]; - struct list_head vcs[MAX_SUBCORES]; + struct kvmppc_vcore *vc[MAX_SUBCORES]; }; /* @@ -2167,17 +2355,12 @@ static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 }; static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) { - int sub; - memset(cip, 0, sizeof(*cip)); cip->n_subcores = 1; cip->max_subcore_threads = vc->num_threads; cip->total_threads = vc->num_threads; cip->subcore_threads[0] = vc->num_threads; - cip->subcore_vm[0] = vc->kvm; - for (sub = 0; sub < MAX_SUBCORES; ++sub) - INIT_LIST_HEAD(&cip->vcs[sub]); - list_add_tail(&vc->preempt_list, &cip->vcs[0]); + cip->vc[0] = vc; } static bool subcore_config_ok(int n_subcores, int n_threads) @@ -2197,9 +2380,8 @@ static bool subcore_config_ok(int n_subcores, int n_threads) return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS; } -static void init_master_vcore(struct kvmppc_vcore *vc) +static void init_vcore_to_run(struct kvmppc_vcore *vc) { - vc->master_vcore = vc; vc->entry_exit_map = 0; vc->in_guest = 0; vc->napping_threads = 0; @@ -2224,9 +2406,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) ++cip->n_subcores; cip->total_threads += vc->num_threads; cip->subcore_threads[sub] = vc->num_threads; - cip->subcore_vm[sub] = vc->kvm; - init_master_vcore(vc); - list_move_tail(&vc->preempt_list, &cip->vcs[sub]); + cip->vc[sub] = vc; + init_vcore_to_run(vc); + list_del_init(&vc->preempt_list); return true; } @@ -2294,6 +2476,18 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) spin_unlock(&lp->lock); } +static bool recheck_signals(struct core_info *cip) +{ + int sub, i; + struct kvm_vcpu *vcpu; + + for (sub = 0; sub < cip->n_subcores; ++sub) + for_each_runnable_thread(i, vcpu, cip->vc[sub]) + if (signal_pending(vcpu->arch.run_task)) + return true; + return false; +} + static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) { int still_running = 0, i; @@ -2331,7 +2525,6 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) wake_up(&vcpu->arch.cpu_run); } } - list_del_init(&vc->preempt_list); if (!is_master) { if (still_running > 0) { kvmppc_vcore_preempt(vc); @@ -2393,6 +2586,21 @@ static inline int kvmppc_set_host_core(unsigned int cpu) return 0; } +static void set_irq_happened(int trap) +{ + switch (trap) { + case BOOK3S_INTERRUPT_EXTERNAL: + local_paca->irq_happened |= PACA_IRQ_EE; + break; + case BOOK3S_INTERRUPT_H_DOORBELL: + local_paca->irq_happened |= PACA_IRQ_DBELL; + break; + case BOOK3S_INTERRUPT_HMI: + local_paca->irq_happened |= PACA_IRQ_HMI; + break; + } +} + /* * Run a set of guest threads on a physical core. * Called with vc->lock held. @@ -2403,7 +2611,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) int i; int srcu_idx; struct core_info core_info; - struct kvmppc_vcore *pvc, *vcnext; + struct kvmppc_vcore *pvc; struct kvm_split_mode split_info, *sip; int split, subcore_size, active; int sub; @@ -2412,6 +2620,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) int pcpu, thr; int target_threads; int controlled_threads; + int trap; /* * Remove from the list any threads that have a signal pending @@ -2426,7 +2635,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* * Initialize *vc. */ - init_master_vcore(vc); + init_vcore_to_run(vc); vc->preempt_tb = TB_NIL; /* @@ -2463,6 +2672,43 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) if (vc->num_threads < target_threads) collect_piggybacks(&core_info, target_threads); + /* + * On radix, arrange for TLB flushing if necessary. + * This has to be done before disabling interrupts since + * it uses smp_call_function(). + */ + pcpu = smp_processor_id(); + if (kvm_is_radix(vc->kvm)) { + for (sub = 0; sub < core_info.n_subcores; ++sub) + for_each_runnable_thread(i, vcpu, core_info.vc[sub]) + kvmppc_prepare_radix_vcpu(vcpu, pcpu); + } + + /* + * Hard-disable interrupts, and check resched flag and signals. + * If we need to reschedule or deliver a signal, clean up + * and return without going into the guest(s). + */ + local_irq_disable(); + hard_irq_disable(); + if (lazy_irq_pending() || need_resched() || + recheck_signals(&core_info)) { + local_irq_enable(); + vc->vcore_state = VCORE_INACTIVE; + /* Unlock all except the primary vcore */ + for (sub = 1; sub < core_info.n_subcores; ++sub) { + pvc = core_info.vc[sub]; + /* Put back on to the preempted vcores list */ + kvmppc_vcore_preempt(pvc); + spin_unlock(&pvc->lock); + } + for (i = 0; i < controlled_threads; ++i) + kvmppc_release_hwthread(pcpu + i); + return; + } + + kvmppc_clear_host_core(pcpu); + /* Decide on micro-threading (split-core) mode */ subcore_size = threads_per_subcore; cmd_bit = stat_bit = 0; @@ -2486,13 +2732,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) split_info.ldbar = mfspr(SPRN_LDBAR); split_info.subcore_size = subcore_size; for (sub = 0; sub < core_info.n_subcores; ++sub) - split_info.master_vcs[sub] = - list_first_entry(&core_info.vcs[sub], - struct kvmppc_vcore, preempt_list); + split_info.vc[sub] = core_info.vc[sub]; /* order writes to split_info before kvm_split_mode pointer */ smp_wmb(); } - pcpu = smp_processor_id(); for (thr = 0; thr < controlled_threads; ++thr) paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; @@ -2512,32 +2755,29 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } } - kvmppc_clear_host_core(pcpu); - /* Start all the threads */ active = 0; for (sub = 0; sub < core_info.n_subcores; ++sub) { thr = subcore_thread_map[sub]; thr0_done = false; active |= 1 << thr; - list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) { - pvc->pcpu = pcpu + thr; - for_each_runnable_thread(i, vcpu, pvc) { - kvmppc_start_thread(vcpu, pvc); - kvmppc_create_dtl_entry(vcpu, pvc); - trace_kvm_guest_enter(vcpu); - if (!vcpu->arch.ptid) - thr0_done = true; - active |= 1 << (thr + vcpu->arch.ptid); - } - /* - * We need to start the first thread of each subcore - * even if it doesn't have a vcpu. - */ - if (pvc->master_vcore == pvc && !thr0_done) - kvmppc_start_thread(NULL, pvc); - thr += pvc->num_threads; + pvc = core_info.vc[sub]; + pvc->pcpu = pcpu + thr; + for_each_runnable_thread(i, vcpu, pvc) { + kvmppc_start_thread(vcpu, pvc); + kvmppc_create_dtl_entry(vcpu, pvc); + trace_kvm_guest_enter(vcpu); + if (!vcpu->arch.ptid) + thr0_done = true; + active |= 1 << (thr + vcpu->arch.ptid); } + /* + * We need to start the first thread of each subcore + * even if it doesn't have a vcpu. + */ + if (!thr0_done) + kvmppc_start_thread(NULL, pvc); + thr += pvc->num_threads; } /* @@ -2564,17 +2804,27 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) trace_kvmppc_run_core(vc, 0); for (sub = 0; sub < core_info.n_subcores; ++sub) - list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) - spin_unlock(&pvc->lock); + spin_unlock(&core_info.vc[sub]->lock); + + /* + * Interrupts will be enabled once we get into the guest, + * so tell lockdep that we're about to enable interrupts. + */ + trace_hardirqs_on(); guest_enter(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); - __kvmppc_vcore_entry(); + trap = __kvmppc_vcore_entry(); srcu_read_unlock(&vc->kvm->srcu, srcu_idx); + guest_exit(); + + trace_hardirqs_off(); + set_irq_happened(trap); + spin_lock(&vc->lock); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ vc->vcore_state = VCORE_EXITING; @@ -2602,6 +2852,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) split_info.do_nap = 0; } + kvmppc_set_host_core(pcpu); + + local_irq_enable(); + /* Let secondaries go back to the offline loop */ for (i = 0; i < controlled_threads; ++i) { kvmppc_release_hwthread(pcpu + i); @@ -2610,18 +2864,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); } - kvmppc_set_host_core(pcpu); - spin_unlock(&vc->lock); /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); - guest_exit(); - for (sub = 0; sub < core_info.n_subcores; ++sub) - list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], - preempt_list) - post_guest_process(pvc, pvc == vc); + for (sub = 0; sub < core_info.n_subcores; ++sub) { + pvc = core_info.vc[sub]; + post_guest_process(pvc, pvc == vc); + } spin_lock(&vc->lock); preempt_enable(); @@ -2666,6 +2917,30 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) vc->halt_poll_ns /= halt_poll_ns_shrink; } +#ifdef CONFIG_KVM_XICS +static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) +{ + if (!xive_enabled()) + return false; + return vcpu->arch.xive_saved_state.pipr < + vcpu->arch.xive_saved_state.cppr; +} +#else +static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) +{ + return false; +} +#endif /* CONFIG_KVM_XICS */ + +static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.pending_exceptions || vcpu->arch.prodded || + kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu)) + return true; + + return false; +} + /* * Check to see if any of the runnable vcpus on the vcore have pending * exceptions or are no longer ceded @@ -2676,8 +2951,7 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) int i; for_each_runnable_thread(i, vcpu, vc) { - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded || - vcpu->arch.prodded) + if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu)) return 1; } @@ -2819,15 +3093,14 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) */ if (!signal_pending(current)) { if (vc->vcore_state == VCORE_PIGGYBACK) { - struct kvmppc_vcore *mvc = vc->master_vcore; - if (spin_trylock(&mvc->lock)) { - if (mvc->vcore_state == VCORE_RUNNING && - !VCORE_IS_EXITING(mvc)) { + if (spin_trylock(&vc->lock)) { + if (vc->vcore_state == VCORE_RUNNING && + !VCORE_IS_EXITING(vc)) { kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } - spin_unlock(&mvc->lock); + spin_unlock(&vc->lock); } } else if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) { @@ -2863,7 +3136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) break; n_ceded = 0; for_each_runnable_thread(i, v, vc) { - if (!v->arch.pending_exceptions && !v->arch.prodded) + if (!kvmppc_vcpu_woken(v)) n_ceded += v->arch.ceded; else v->arch.ceded = 0; @@ -3519,6 +3792,19 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvm_hv_vm_activated(); /* + * Initialize smt_mode depending on processor. + * POWER8 and earlier have to use "strict" threading, where + * all vCPUs in a vcore have to run on the same (sub)core, + * whereas on POWER9 the threads can each run a different + * guest. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm->arch.smt_mode = threads_per_subcore; + else + kvm->arch.smt_mode = 1; + kvm->arch.emul_smt_mode = 1; + + /* * Create a debugfs directory for the VM */ snprintf(buf, sizeof(buf), "vm%d", current->pid); @@ -3947,6 +4233,7 @@ static struct kvmppc_ops kvm_ops_hv = { #endif .configure_mmu = kvmhv_configure_mmu, .get_rmmu_info = kvmhv_get_rmmu_info, + .set_smt_mode = kvmhv_set_smt_mode, }; static int kvm_init_subcore_bitmap(void) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index ee4c2558c305..90644db9d38e 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -307,7 +307,7 @@ void kvmhv_commence_exit(int trap) return; for (i = 0; i < MAX_SUBCORES; ++i) { - vc = sip->master_vcs[i]; + vc = sip->vc[i]; if (!vc) break; do { diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 404deb512844..dc54373c8780 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -61,13 +61,6 @@ BEGIN_FTR_SECTION std r3, HSTATE_DABR(r13) END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) - /* Hard-disable interrupts */ - mfmsr r10 - std r10, HSTATE_HOST_MSR(r13) - rldicl r10,r10,48,1 - rotldi r10,r10,16 - mtmsrd r10,1 - /* Save host PMU registers */ BEGIN_FTR_SECTION /* Work around P8 PMAE bug */ @@ -153,6 +146,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) * * R1 = host R1 * R2 = host R2 + * R3 = trap number on this thread * R12 = exit handler id * R13 = PACA */ diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 7ef0993214f3..c356f9a40b24 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -130,12 +130,28 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) out: /* + * For guest that supports FWNMI capability, hook the MCE event into + * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI + * exit reason. On our way to exit we will pull this event from vcpu + * structure and print it from thread 0 of the core/subcore. + * + * For guest that does not support FWNMI capability (old QEMU): * We are now going enter guest either through machine check * interrupt (for unhandled errors) or will continue from * current HSRR0 (for handled errors) in guest. Hence * queue up the event so that we can log it from host console later. */ - machine_check_queue_event(); + if (vcpu->kvm->arch.fwnmi_enabled) { + /* + * Hook up the mce event on to vcpu structure. + * First clear the old event. + */ + memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt)); + if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) { + vcpu->arch.mce_evt = mce_evt; + } + } else + machine_check_queue_event(); return handled; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 4888dd494604..6ea4b53f4b16 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -45,7 +45,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define NAPPING_NOVCPU 2 /* Stack frame offsets for kvmppc_hv_entry */ -#define SFS 144 +#define SFS 160 #define STACK_SLOT_TRAP (SFS-4) #define STACK_SLOT_TID (SFS-16) #define STACK_SLOT_PSSCR (SFS-24) @@ -54,6 +54,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define STACK_SLOT_CIABR (SFS-48) #define STACK_SLOT_DAWR (SFS-56) #define STACK_SLOT_DAWRX (SFS-64) +#define STACK_SLOT_HFSCR (SFS-72) /* * Call kvmppc_hv_entry in real mode. @@ -68,6 +69,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline) std r0, PPC_LR_STKOFF(r1) stdu r1, -112(r1) mfmsr r10 + std r10, HSTATE_HOST_MSR(r13) LOAD_REG_ADDR(r5, kvmppc_call_hv_entry) li r0,MSR_RI andc r0,r10,r0 @@ -152,20 +154,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) stb r0, HSTATE_HWTHREAD_REQ(r13) /* - * For external and machine check interrupts, we need - * to call the Linux handler to process the interrupt. - * We do that by jumping to absolute address 0x500 for - * external interrupts, or the machine_check_fwnmi label - * for machine checks (since firmware might have patched - * the vector area at 0x200). The [h]rfid at the end of the - * handler will return to the book3s_hv_interrupts.S code. - * For other interrupts we do the rfid to get back - * to the book3s_hv_interrupts.S code here. + * For external interrupts we need to call the Linux + * handler to process the interrupt. We do that by jumping + * to absolute address 0x500 for external interrupts. + * The [h]rfid at the end of the handler will return to + * the book3s_hv_interrupts.S code. For other interrupts + * we do the rfid to get back to the book3s_hv_interrupts.S + * code here. */ ld r8, 112+PPC_LR_STKOFF(r1) addi r1, r1, 112 ld r7, HSTATE_HOST_MSR(r13) + /* Return the trap number on this thread as the return value */ + mr r3, r12 + /* * If we came back from the guest via a relocation-on interrupt, * we will be in virtual mode at this point, which makes it a @@ -175,59 +178,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) andi. r0, r0, MSR_IR /* in real mode? */ bne .Lvirt_return - cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - beq 11f - cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL - beq 15f /* Invoke the H_DOORBELL handler */ - cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI - beq cr2, 14f /* HMI check */ - - /* RFI into the highmem handler, or branch to interrupt handler */ + /* RFI into the highmem handler */ mfmsr r6 li r0, MSR_RI andc r6, r6, r0 mtmsrd r6, 1 /* Clear RI in MSR */ mtsrr0 r8 mtsrr1 r7 - beq cr1, 13f /* machine check */ RFI - /* On POWER7, we have external interrupts set to use HSRR0/1 */ -11: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - ba 0x500 - -13: b machine_check_fwnmi - -14: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - b hmi_exception_after_realmode - -15: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - ba 0xe80 - - /* Virtual-mode return - can't get here for HMI or machine check */ + /* Virtual-mode return */ .Lvirt_return: - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - beq 16f - cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL - beq 17f - andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */ - beq 18f - mtmsrd r7, 1 /* if so then re-enable them */ -18: mtlr r8 + mtlr r8 blr -16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */ - mtspr SPRN_HSRR1, r7 - b exc_virt_0x4500_hardware_interrupt - -17: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - b exc_virt_0x4e80_h_doorbell - kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ @@ -769,6 +733,8 @@ BEGIN_FTR_SECTION std r6, STACK_SLOT_PSSCR(r1) std r7, STACK_SLOT_PID(r1) std r8, STACK_SLOT_IAMR(r1) + mfspr r5, SPRN_HFSCR + std r5, STACK_SLOT_HFSCR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION mfspr r5, SPRN_CIABR @@ -920,8 +886,10 @@ FTR_SECTION_ELSE ld r5, VCPU_TID(r4) ld r6, VCPU_PSSCR(r4) oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */ + ld r7, VCPU_HFSCR(r4) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 + mtspr SPRN_HFSCR, r7 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 8: @@ -936,7 +904,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) mftb r7 subf r3,r7,r8 mtspr SPRN_DEC,r3 - stw r3,VCPU_DEC(r4) + std r3,VCPU_DEC(r4) ld r5, VCPU_SPRG0(r4) ld r6, VCPU_SPRG1(r4) @@ -1048,7 +1016,13 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ li r0, BOOK3S_INTERRUPT_EXTERNAL bne cr1, 12f mfspr r0, SPRN_DEC - cmpwi r0, 0 +BEGIN_FTR_SECTION + /* On POWER9 check whether the guest has large decrementer enabled */ + andis. r8, r8, LPCR_LD@h + bne 15f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + extsw r0, r0 +15: cmpdi r0, 0 li r0, BOOK3S_INTERRUPT_DECREMENTER bge 5f @@ -1058,6 +1032,23 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ mr r9, r4 bl kvmppc_msr_interrupt 5: +BEGIN_FTR_SECTION + b fast_guest_return +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + /* On POWER9, check for pending doorbell requests */ + lbz r0, VCPU_DBELL_REQ(r4) + cmpwi r0, 0 + beq fast_guest_return + ld r5, HSTATE_KVM_VCORE(r13) + /* Set DPDES register so the CPU will take a doorbell interrupt */ + li r0, 1 + mtspr SPRN_DPDES, r0 + std r0, VCORE_DPDES(r5) + /* Make sure other cpus see vcore->dpdes set before dbell req clear */ + lwsync + /* Clear the pending doorbell request */ + li r0, 0 + stb r0, VCPU_DBELL_REQ(r4) /* * Required state: @@ -1232,6 +1223,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) stw r12,VCPU_TRAP(r9) + /* + * Now that we have saved away SRR0/1 and HSRR0/1, + * interrupts are recoverable in principle, so set MSR_RI. + * This becomes important for relocation-on interrupts from + * the guest, which we can get in radix mode on POWER9. + */ + li r0, MSR_RI + mtmsrd r0, 1 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING addi r3, r9, VCPU_TB_RMINTR mr r4, r9 @@ -1288,6 +1288,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) beq 4f b guest_exit_cont 3: + /* If it's a hypervisor facility unavailable interrupt, save HFSCR */ + cmpwi r12, BOOK3S_INTERRUPT_H_FAC_UNAVAIL + bne 14f + mfspr r3, SPRN_HFSCR + std r3, VCPU_HFSCR(r9) + b guest_exit_cont +14: /* External interrupt ? */ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL bne+ guest_exit_cont @@ -1475,12 +1482,18 @@ mc_cont: mtspr SPRN_SPURR,r4 /* Save DEC */ + ld r3, HSTATE_KVM_VCORE(r13) mfspr r5,SPRN_DEC mftb r6 + /* On P9, if the guest has large decr enabled, don't sign extend */ +BEGIN_FTR_SECTION + ld r4, VCORE_LPCR(r3) + andis. r4, r4, LPCR_LD@h + bne 16f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) extsw r5,r5 - add r5,r5,r6 +16: add r5,r5,r6 /* r5 is a guest timebase value here, convert to host TB */ - ld r3,HSTATE_KVM_VCORE(r13) ld r4,VCORE_TB_OFFSET(r3) subf r5,r4,r5 std r5,VCPU_DEC_EXPIRES(r9) @@ -1525,6 +1538,9 @@ FTR_SECTION_ELSE rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */ rotldi r6, r6, 60 std r6, VCPU_PSSCR(r9) + /* Restore host HFSCR value */ + ld r7, STACK_SLOT_HFSCR(r1) + mtspr SPRN_HFSCR, r7 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) /* * Restore various registers to 0, where non-zero values @@ -2402,8 +2418,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) mfspr r3, SPRN_DEC mfspr r4, SPRN_HDEC mftb r5 +BEGIN_FTR_SECTION + /* On P9 check whether the guest has large decrementer mode enabled */ + ld r6, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_LPCR(r6) + andis. r6, r6, LPCR_LD@h + bne 68f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) extsw r3, r3 - EXTEND_HDEC(r4) +68: EXTEND_HDEC(r4) cmpd r3, r4 ble 67f mtspr SPRN_DEC, r4 @@ -2589,22 +2612,32 @@ machine_check_realmode: ld r9, HSTATE_KVM_VCPU(r13) li r12, BOOK3S_INTERRUPT_MACHINE_CHECK /* - * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through - * machine check interrupt (set HSRR0 to 0x200). And for handled - * errors (no-fatal), just go back to guest execution with current - * HSRR0 instead of exiting guest. This new approach will inject - * machine check to guest for fatal error causing guest to crash. - * - * The old code used to return to host for unhandled errors which - * was causing guest to hang with soft lockups inside guest and - * makes it difficult to recover guest instance. + * For the guest that is FWNMI capable, deliver all the MCE errors + * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit + * reason. This new approach injects machine check errors in guest + * address space to guest with additional information in the form + * of RTAS event, thus enabling guest kernel to suitably handle + * such errors. * + * For the guest that is not FWNMI capable (old QEMU) fallback + * to old behaviour for backward compatibility: + * Deliver unhandled/fatal (e.g. UE) MCE errors to guest either + * through machine check interrupt (set HSRR0 to 0x200). + * For handled errors (no-fatal), just go back to guest execution + * with current HSRR0. * if we receive machine check with MSR(RI=0) then deliver it to * guest as machine check causing guest to crash. */ ld r11, VCPU_MSR(r9) rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ bne mc_cont /* if so, exit to host */ + /* Check if guest is capable of handling NMI exit */ + ld r10, VCPU_KVM(r9) + lbz r10, KVM_FWNMI(r10) + cmpdi r10, 1 /* FWNMI capable? */ + beq mc_cont /* if so, exit with KVM_EXIT_NMI. */ + + /* if not, fall through for backward compatibility. */ andi. r10, r11, MSR_RI /* check for unrecoverable exception */ beq 1f /* Deliver a machine check to guest */ ld r10, VCPU_PC(r9) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index ffe1da95033a..08b200a0bbce 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1257,8 +1257,8 @@ static void xive_pre_save_scan(struct kvmppc_xive *xive) if (!xc) continue; for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) { - if (xc->queues[i].qpage) - xive_pre_save_queue(xive, &xc->queues[i]); + if (xc->queues[j].qpage) + xive_pre_save_queue(xive, &xc->queues[j]); } } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3eaac3809977..071b87ee682f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -687,7 +687,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) kvmppc_core_check_exceptions(vcpu); - if (vcpu->requests) { + if (kvm_request_pending(vcpu)) { /* Exception delivery raised request; start over */ return 1; } diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index c873ffe55362..4d8b4d6cebff 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -39,7 +39,7 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) unsigned long dec_nsec; unsigned long long dec_time; - pr_debug("mtDEC: %x\n", vcpu->arch.dec); + pr_debug("mtDEC: %lx\n", vcpu->arch.dec); hrtimer_try_to_cancel(&vcpu->arch.dec_timer); #ifdef CONFIG_PPC_BOOK3S @@ -109,7 +109,7 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_TBWU: break; case SPRN_DEC: - vcpu->arch.dec = spr_val; + vcpu->arch.dec = (u32) spr_val; kvmppc_emulate_dec(vcpu); break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 7f71ab5fcad1..1a75c0b5f4ca 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -55,8 +55,7 @@ EXPORT_SYMBOL_GPL(kvmppc_pr_ops); int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !!(v->arch.pending_exceptions) || - v->requests; + return !!(v->arch.pending_exceptions) || kvm_request_pending(v); } int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) @@ -108,7 +107,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) */ smp_mb(); - if (vcpu->requests) { + if (kvm_request_pending(vcpu)) { /* Make sure we process requests preemptable */ local_irq_enable(); trace_kvm_check_requests(vcpu); @@ -554,13 +553,28 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_SMT: r = 0; - if (hv_enabled) { + if (kvm) { + if (kvm->arch.emul_smt_mode > 1) + r = kvm->arch.emul_smt_mode; + else + r = kvm->arch.smt_mode; + } else if (hv_enabled) { if (cpu_has_feature(CPU_FTR_ARCH_300)) r = 1; else r = threads_per_subcore; } break; + case KVM_CAP_PPC_SMT_POSSIBLE: + r = 1; + if (hv_enabled) { + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + r = ((threads_per_subcore << 1) - 1); + else + /* P9 can emulate dbells, so allow any mode */ + r = 8 | 4 | 2 | 1; + } + break; case KVM_CAP_PPC_RMA: r = 0; break; @@ -619,6 +633,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300); break; #endif +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + case KVM_CAP_PPC_FWNMI: + r = hv_enabled; + break; +#endif case KVM_CAP_PPC_HTM: r = cpu_has_feature(CPU_FTR_TM_COMP) && is_kvmppc_hv_enabled(kvm); @@ -1538,6 +1557,15 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, break; } #endif /* CONFIG_KVM_XICS */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + case KVM_CAP_PPC_FWNMI: + r = -EINVAL; + if (!is_kvmppc_hv_enabled(vcpu->kvm)) + break; + r = 0; + vcpu->kvm->arch.fwnmi_enabled = true; + break; +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ default: r = -EINVAL; break; @@ -1712,6 +1740,15 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = 0; break; } + case KVM_CAP_PPC_SMT: { + unsigned long mode = cap->args[0]; + unsigned long flags = cap->args[1]; + + r = -EINVAL; + if (kvm->arch.kvm_ops->set_smt_mode) + r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags); + break; + } #endif default: r = -EINVAL; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index aee2bb817ac6..861c5af1c9c4 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -938,7 +938,7 @@ common_load: /* * Tail call */ - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: ctx->seen |= SEEN_TAILCALL; bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); break; @@ -1052,6 +1052,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) fp->bpf_func = (void *)image; fp->jited = 1; + fp->jited_len = alloclen; bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE)); diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index d0441ad2a990..e508dff92535 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -59,7 +59,9 @@ union ctlreg0 { unsigned long lap : 1; /* Low-address-protection control */ unsigned long : 4; unsigned long edat : 1; /* Enhanced-DAT-enablement control */ - unsigned long : 4; + unsigned long : 2; + unsigned long iep : 1; /* Instruction-Execution-Protection */ + unsigned long : 1; unsigned long afp : 1; /* AFP-register control */ unsigned long vx : 1; /* Vector enablement control */ unsigned long : 7; diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 8acf482162ed..88162bb5c190 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -8,6 +8,7 @@ #ifndef _ASM_S390_DIAG_H #define _ASM_S390_DIAG_H +#include <linux/if_ether.h> #include <linux/percpu.h> enum diag_stat_enum { @@ -24,6 +25,7 @@ enum diag_stat_enum { DIAG_STAT_X224, DIAG_STAT_X250, DIAG_STAT_X258, + DIAG_STAT_X26C, DIAG_STAT_X288, DIAG_STAT_X2C4, DIAG_STAT_X2FC, @@ -225,6 +227,30 @@ struct diag204_x_phys_block { struct diag204_x_phys_cpu cpus[]; } __packed; +enum diag26c_sc { + DIAG26C_MAC_SERVICES = 0x00000030 +}; + +enum diag26c_version { + DIAG26C_VERSION2 = 0x00000002 /* z/VM 5.4.0 */ +}; + +#define DIAG26C_GET_MAC 0x0000 +struct diag26c_mac_req { + u32 resp_buf_len; + u32 resp_version; + u16 op_code; + u16 devno; + u8 res[4]; +}; + +struct diag26c_mac_resp { + u32 version; + u8 mac[ETH_ALEN]; + u8 res[2]; +} __aligned(8); + int diag204(unsigned long subcode, unsigned long size, void *addr); int diag224(void *ptr); +int diag26c(void *req, void *resp, enum diag26c_sc subcode); #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 6baae236f461..a409d5991934 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -42,9 +42,11 @@ #define KVM_HALT_POLL_NS_DEFAULT 80000 /* s390-specific vcpu->requests bit members */ -#define KVM_REQ_ENABLE_IBS 8 -#define KVM_REQ_DISABLE_IBS 9 -#define KVM_REQ_ICPT_OPEREXC 10 +#define KVM_REQ_ENABLE_IBS KVM_ARCH_REQ(0) +#define KVM_REQ_DISABLE_IBS KVM_ARCH_REQ(1) +#define KVM_REQ_ICPT_OPEREXC KVM_ARCH_REQ(2) +#define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3) +#define KVM_REQ_STOP_MIGRATION KVM_ARCH_REQ(4) #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -56,7 +58,7 @@ union bsca_sigp_ctrl { __u8 r : 1; __u8 scn : 6; }; -} __packed; +}; union esca_sigp_ctrl { __u16 value; @@ -65,14 +67,14 @@ union esca_sigp_ctrl { __u8 reserved: 7; __u8 scn; }; -} __packed; +}; struct esca_entry { union esca_sigp_ctrl sigp_ctrl; __u16 reserved1[3]; __u64 sda; __u64 reserved2[6]; -} __packed; +}; struct bsca_entry { __u8 reserved0; @@ -80,7 +82,7 @@ struct bsca_entry { __u16 reserved[3]; __u64 sda; __u64 reserved2[2]; -} __attribute__((packed)); +}; union ipte_control { unsigned long val; @@ -97,7 +99,7 @@ struct bsca_block { __u64 mcn; __u64 reserved2; struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; -} __attribute__((packed)); +}; struct esca_block { union ipte_control ipte_control; @@ -105,7 +107,7 @@ struct esca_block { __u64 mcn[4]; __u64 reserved2[20]; struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; -} __packed; +}; /* * This struct is used to store some machine check info from lowcore @@ -274,7 +276,7 @@ struct kvm_s390_sie_block { struct kvm_s390_itdb { __u8 data[256]; -} __packed; +}; struct sie_page { struct kvm_s390_sie_block sie_block; @@ -282,7 +284,7 @@ struct sie_page { __u8 reserved218[1000]; /* 0x0218 */ struct kvm_s390_itdb itdb; /* 0x0600 */ __u8 reserved700[2304]; /* 0x0700 */ -} __packed; +}; struct kvm_vcpu_stat { u64 exit_userspace; @@ -695,7 +697,7 @@ struct sie_page2 { __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */ struct kvm_s390_crypto_cb crycb; /* 0x0800 */ u8 reserved900[0x1000 - 0x900]; /* 0x0900 */ -} __packed; +}; struct kvm_s390_vsie { struct mutex mutex; @@ -705,6 +707,12 @@ struct kvm_s390_vsie { struct page *pages[KVM_MAX_VCPUS]; }; +struct kvm_s390_migration_state { + unsigned long bitmap_size; /* in bits (number of guest pages) */ + atomic64_t dirty_pages; /* number of dirty pages */ + unsigned long *pgste_bitmap; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -732,6 +740,7 @@ struct kvm_arch{ struct kvm_s390_crypto crypto; struct kvm_s390_vsie vsie; u64 epoch; + struct kvm_s390_migration_state *migration_state; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); }; diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index 13623b9991d4..9d91cf3e427f 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -26,6 +26,12 @@ #define MCCK_CODE_PSW_MWP_VALID _BITUL(63 - 20) #define MCCK_CODE_PSW_IA_VALID _BITUL(63 - 23) +#define MCCK_CR14_CR_PENDING_SUB_MASK (1 << 28) +#define MCCK_CR14_RECOVERY_SUB_MASK (1 << 27) +#define MCCK_CR14_DEGRAD_SUB_MASK (1 << 26) +#define MCCK_CR14_EXT_DAMAGE_SUB_MASK (1 << 25) +#define MCCK_CR14_WARN_SUB_MASK (1 << 24) + #ifndef __ASSEMBLY__ union mci { diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 3dd2a1d308dd..69d09c39bbcd 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -28,6 +28,7 @@ #define KVM_DEV_FLIC_CLEAR_IO_IRQ 8 #define KVM_DEV_FLIC_AISM 9 #define KVM_DEV_FLIC_AIRQ_INJECT 10 +#define KVM_DEV_FLIC_AISM_ALL 11 /* * We can have up to 4*64k pending subchannels + 8 adapter interrupts, * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. @@ -53,6 +54,11 @@ struct kvm_s390_ais_req { __u16 mode; }; +struct kvm_s390_ais_all { + __u8 simm; + __u8 nimm; +}; + #define KVM_S390_IO_ADAPTER_MASK 1 #define KVM_S390_IO_ADAPTER_MAP 2 #define KVM_S390_IO_ADAPTER_UNMAP 3 @@ -70,6 +76,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_TOD 1 #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 +#define KVM_S390_VM_MIGRATION 4 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 @@ -151,6 +158,11 @@ struct kvm_s390_vm_cpu_subfunc { #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 +/* kvm attributes for migration mode */ +#define KVM_S390_VM_MIGRATION_STOP 0 +#define KVM_S390_VM_MIGRATION_START 1 +#define KVM_S390_VM_MIGRATION_STATUS 2 + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index e8e5ecf673fd..52a63f4175cb 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -104,4 +104,8 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + +#define SO_PEERGROUPS 59 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index ac6abcd3fe6a..349914571772 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -38,6 +38,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X224] = { .code = 0x224, .name = "EBCDIC-Name Table" }, [DIAG_STAT_X250] = { .code = 0x250, .name = "Block I/O" }, [DIAG_STAT_X258] = { .code = 0x258, .name = "Page-Reference Services" }, + [DIAG_STAT_X26C] = { .code = 0x26c, .name = "Certain System Information" }, [DIAG_STAT_X288] = { .code = 0x288, .name = "Time Bomb" }, [DIAG_STAT_X2C4] = { .code = 0x2c4, .name = "FTP Services" }, [DIAG_STAT_X2FC] = { .code = 0x2fc, .name = "Guest Performance Data" }, @@ -236,3 +237,31 @@ int diag224(void *ptr) return rc; } EXPORT_SYMBOL(diag224); + +/* + * Diagnose 26C: Access Certain System Information + */ +static inline int __diag26c(void *req, void *resp, enum diag26c_sc subcode) +{ + register unsigned long _req asm("2") = (addr_t) req; + register unsigned long _resp asm("3") = (addr_t) resp; + register unsigned long _subcode asm("4") = subcode; + register unsigned long _rc asm("5") = -EOPNOTSUPP; + + asm volatile( + " sam31\n" + " diag %[rx],%[ry],0x26c\n" + "0: sam64\n" + EX_TABLE(0b,0b) + : "+d" (_rc) + : [rx] "d" (_req), "d" (_resp), [ry] "d" (_subcode) + : "cc", "memory"); + return _rc; +} + +int diag26c(void *req, void *resp, enum diag26c_sc subcode) +{ + diag_stat_inc(DIAG_STAT_X26C); + return __diag26c(req, resp, subcode); +} +EXPORT_SYMBOL(diag26c); diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 875f8bea8c67..653cae5e1ee1 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -89,7 +89,7 @@ struct region3_table_entry_fc1 { unsigned long f : 1; /* Fetch-Protection Bit */ unsigned long fc : 1; /* Format-Control */ unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long co : 1; /* Change-Recording Override */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ unsigned long : 2; unsigned long i : 1; /* Region-Invalid Bit */ unsigned long cr : 1; /* Common-Region Bit */ @@ -131,7 +131,7 @@ struct segment_entry_fc1 { unsigned long f : 1; /* Fetch-Protection Bit */ unsigned long fc : 1; /* Format-Control */ unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long co : 1; /* Change-Recording Override */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ unsigned long : 2; unsigned long i : 1; /* Segment-Invalid Bit */ unsigned long cs : 1; /* Common-Segment Bit */ @@ -168,7 +168,8 @@ union page_table_entry { unsigned long z : 1; /* Zero Bit */ unsigned long i : 1; /* Page-Invalid Bit */ unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long : 9; + unsigned long iep: 1; /* Instruction-Execution-Protection */ + unsigned long : 8; }; }; @@ -241,7 +242,7 @@ struct ale { unsigned long asteo : 25; /* ASN-Second-Table-Entry Origin */ unsigned long : 6; unsigned long astesn : 32; /* ASTE Sequence Number */ -} __packed; +}; struct aste { unsigned long i : 1; /* ASX-Invalid Bit */ @@ -257,7 +258,7 @@ struct aste { unsigned long ald : 32; unsigned long astesn : 32; /* .. more fields there */ -} __packed; +}; int ipte_lock_held(struct kvm_vcpu *vcpu) { @@ -485,6 +486,7 @@ enum prot_type { PROT_TYPE_KEYC = 1, PROT_TYPE_ALC = 2, PROT_TYPE_DAT = 3, + PROT_TYPE_IEP = 4, }; static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, @@ -500,6 +502,9 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { + case PROT_TYPE_IEP: + tec->b61 = 1; + /* FALL THROUGH */ case PROT_TYPE_LA: tec->b56 = 1; break; @@ -591,6 +596,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * @gpa: points to where guest physical (absolute) address should be stored * @asce: effective asce * @mode: indicates the access mode to be used + * @prot: returns the type for protection exceptions * * Translate a guest virtual address into a guest absolute address by means * of dynamic address translation as specified by the architecture. @@ -606,19 +612,21 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) */ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, unsigned long *gpa, const union asce asce, - enum gacc_mode mode) + enum gacc_mode mode, enum prot_type *prot) { union vaddress vaddr = {.addr = gva}; union raddress raddr = {.addr = gva}; union page_table_entry pte; int dat_protection = 0; + int iep_protection = 0; union ctlreg0 ctlreg0; unsigned long ptr; - int edat1, edat2; + int edat1, edat2, iep; ctlreg0.val = vcpu->arch.sie_block->gcr[0]; edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8); edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78); + iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130); if (asce.r) goto real_address; ptr = asce.origin * 4096; @@ -702,6 +710,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_TRANSLATION_SPEC; if (rtte.fc && edat2) { dat_protection |= rtte.fc1.p; + iep_protection = rtte.fc1.iep; raddr.rfaa = rtte.fc1.rfaa; goto absolute_address; } @@ -729,6 +738,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_TRANSLATION_SPEC; if (ste.fc && edat1) { dat_protection |= ste.fc1.p; + iep_protection = ste.fc1.iep; raddr.sfaa = ste.fc1.sfaa; goto absolute_address; } @@ -745,12 +755,19 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, if (pte.z) return PGM_TRANSLATION_SPEC; dat_protection |= pte.p; + iep_protection = pte.iep; raddr.pfra = pte.pfra; real_address: raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr); absolute_address: - if (mode == GACC_STORE && dat_protection) + if (mode == GACC_STORE && dat_protection) { + *prot = PROT_TYPE_DAT; return PGM_PROTECTION; + } + if (mode == GACC_IFETCH && iep_protection && iep) { + *prot = PROT_TYPE_IEP; + return PGM_PROTECTION; + } if (kvm_is_error_gpa(vcpu->kvm, raddr.addr)) return PGM_ADDRESSING; *gpa = raddr.addr; @@ -782,6 +799,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, { psw_t *psw = &vcpu->arch.sie_block->gpsw; int lap_enabled, rc = 0; + enum prot_type prot; lap_enabled = low_address_protection_enabled(vcpu, asce); while (nr_pages) { @@ -791,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, PROT_TYPE_LA); ga &= PAGE_MASK; if (psw_bits(*psw).dat) { - rc = guest_translate(vcpu, ga, pages, asce, mode); + rc = guest_translate(vcpu, ga, pages, asce, mode, &prot); if (rc < 0) return rc; } else { @@ -800,7 +818,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, rc = PGM_ADDRESSING; } if (rc) - return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT); + return trans_exc(vcpu, rc, ga, ar, mode, prot); ga += PAGE_SIZE; pages++; nr_pages--; @@ -886,6 +904,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long *gpa, enum gacc_mode mode) { psw_t *psw = &vcpu->arch.sie_block->gpsw; + enum prot_type prot; union asce asce; int rc; @@ -900,9 +919,9 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, } if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */ - rc = guest_translate(vcpu, gva, gpa, asce, mode); + rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot); if (rc > 0) - return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT); + return trans_exc(vcpu, rc, gva, 0, mode, prot); } else { *gpa = kvm_s390_real_to_abs(vcpu, gva); if (kvm_is_error_gpa(vcpu->kvm, *gpa)) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 2d120fef7d90..a619ddae610d 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -251,8 +251,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask); if (psw_mchk_disabled(vcpu)) active_mask &= ~IRQ_PEND_MCHK_MASK; + /* + * Check both floating and local interrupt's cr14 because + * bit IRQ_PEND_MCHK_REP could be set in both cases. + */ if (!(vcpu->arch.sie_block->gcr[14] & - vcpu->kvm->arch.float_int.mchk.cr14)) + (vcpu->kvm->arch.float_int.mchk.cr14 | + vcpu->arch.local_int.irq.mchk.cr14))) __clear_bit(IRQ_PEND_MCHK_REP, &active_mask); /* @@ -1876,6 +1881,28 @@ out: return ret < 0 ? ret : n; } +static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_all ais; + + if (attr->attr < sizeof(ais)) + return -EINVAL; + + if (!test_kvm_facility(kvm, 72)) + return -ENOTSUPP; + + mutex_lock(&fi->ais_lock); + ais.simm = fi->simm; + ais.nimm = fi->nimm; + mutex_unlock(&fi->ais_lock); + + if (copy_to_user((void __user *)attr->addr, &ais, sizeof(ais))) + return -EFAULT; + + return 0; +} + static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r; @@ -1885,6 +1912,9 @@ static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) r = get_all_floating_irqs(dev->kvm, (u8 __user *) attr->addr, attr->attr); break; + case KVM_DEV_FLIC_AISM_ALL: + r = flic_ais_mode_get_all(dev->kvm, attr); + break; default: r = -EINVAL; } @@ -2235,6 +2265,25 @@ static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr) return kvm_s390_inject_airq(kvm, adapter); } +static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_all ais; + + if (!test_kvm_facility(kvm, 72)) + return -ENOTSUPP; + + if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais))) + return -EFAULT; + + mutex_lock(&fi->ais_lock); + fi->simm = ais.simm; + fi->nimm = ais.nimm; + mutex_unlock(&fi->ais_lock); + + return 0; +} + static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; @@ -2277,6 +2326,9 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_AIRQ_INJECT: r = flic_inject_airq(dev->kvm, attr); break; + case KVM_DEV_FLIC_AISM_ALL: + r = flic_ais_mode_set_all(dev->kvm, attr); + break; default: r = -EINVAL; } @@ -2298,6 +2350,7 @@ static int flic_has_attr(struct kvm_device *dev, case KVM_DEV_FLIC_CLEAR_IO_IRQ: case KVM_DEV_FLIC_AISM: case KVM_DEV_FLIC_AIRQ_INJECT: + case KVM_DEV_FLIC_AISM_ALL: return 0; } return -ENXIO; @@ -2415,6 +2468,42 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, return ret; } +/* + * Inject the machine check to the guest. + */ +void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, + struct mcck_volatile_info *mcck_info) +{ + struct kvm_s390_interrupt_info inti; + struct kvm_s390_irq irq; + struct kvm_s390_mchk_info *mchk; + union mci mci; + __u64 cr14 = 0; /* upper bits are not used */ + + mci.val = mcck_info->mcic; + if (mci.sr) + cr14 |= MCCK_CR14_RECOVERY_SUB_MASK; + if (mci.dg) + cr14 |= MCCK_CR14_DEGRAD_SUB_MASK; + if (mci.w) + cr14 |= MCCK_CR14_WARN_SUB_MASK; + + mchk = mci.ck ? &inti.mchk : &irq.u.mchk; + mchk->cr14 = cr14; + mchk->mcic = mcck_info->mcic; + mchk->ext_damage_code = mcck_info->ext_damage_code; + mchk->failing_storage_address = mcck_info->failing_storage_address; + if (mci.ck) { + /* Inject the floating machine check */ + inti.type = KVM_S390_MCHK; + WARN_ON_ONCE(__inject_vm(vcpu->kvm, &inti)); + } else { + /* Inject the machine check to specified vcpu */ + irq.type = KVM_S390_MCHK; + WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &irq)); + } +} + int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b0d7de5a533d..3f2884e99ed4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -30,6 +30,7 @@ #include <linux/vmalloc.h> #include <linux/bitmap.h> #include <linux/sched/signal.h> +#include <linux/string.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> @@ -386,6 +387,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: case KVM_CAP_S390_USER_INSTR0: + case KVM_CAP_S390_CMMA_MIGRATION: case KVM_CAP_S390_AIS: r = 1; break; @@ -749,6 +751,129 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) +{ + int cx; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(cx, vcpu, kvm) + kvm_s390_sync_request(req, vcpu); +} + +/* + * Must be called with kvm->srcu held to avoid races on memslots, and with + * kvm->lock to avoid races with ourselves and kvm_s390_vm_stop_migration. + */ +static int kvm_s390_vm_start_migration(struct kvm *kvm) +{ + struct kvm_s390_migration_state *mgs; + struct kvm_memory_slot *ms; + /* should be the only one */ + struct kvm_memslots *slots; + unsigned long ram_pages; + int slotnr; + + /* migration mode already enabled */ + if (kvm->arch.migration_state) + return 0; + + slots = kvm_memslots(kvm); + if (!slots || !slots->used_slots) + return -EINVAL; + + mgs = kzalloc(sizeof(*mgs), GFP_KERNEL); + if (!mgs) + return -ENOMEM; + kvm->arch.migration_state = mgs; + + if (kvm->arch.use_cmma) { + /* + * Get the last slot. They should be sorted by base_gfn, so the + * last slot is also the one at the end of the address space. + * We have verified above that at least one slot is present. + */ + ms = slots->memslots + slots->used_slots - 1; + /* round up so we only use full longs */ + ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG); + /* allocate enough bytes to store all the bits */ + mgs->pgste_bitmap = vmalloc(ram_pages / 8); + if (!mgs->pgste_bitmap) { + kfree(mgs); + kvm->arch.migration_state = NULL; + return -ENOMEM; + } + + mgs->bitmap_size = ram_pages; + atomic64_set(&mgs->dirty_pages, ram_pages); + /* mark all the pages in active slots as dirty */ + for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { + ms = slots->memslots + slotnr; + bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages); + } + + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); + } + return 0; +} + +/* + * Must be called with kvm->lock to avoid races with ourselves and + * kvm_s390_vm_start_migration. + */ +static int kvm_s390_vm_stop_migration(struct kvm *kvm) +{ + struct kvm_s390_migration_state *mgs; + + /* migration mode already disabled */ + if (!kvm->arch.migration_state) + return 0; + mgs = kvm->arch.migration_state; + kvm->arch.migration_state = NULL; + + if (kvm->arch.use_cmma) { + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); + vfree(mgs->pgste_bitmap); + } + kfree(mgs); + return 0; +} + +static int kvm_s390_vm_set_migration(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + int idx, res = -ENXIO; + + mutex_lock(&kvm->lock); + switch (attr->attr) { + case KVM_S390_VM_MIGRATION_START: + idx = srcu_read_lock(&kvm->srcu); + res = kvm_s390_vm_start_migration(kvm); + srcu_read_unlock(&kvm->srcu, idx); + break; + case KVM_S390_VM_MIGRATION_STOP: + res = kvm_s390_vm_stop_migration(kvm); + break; + default: + break; + } + mutex_unlock(&kvm->lock); + + return res; +} + +static int kvm_s390_vm_get_migration(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u64 mig = (kvm->arch.migration_state != NULL); + + if (attr->attr != KVM_S390_VM_MIGRATION_STATUS) + return -ENXIO; + + if (copy_to_user((void __user *)attr->addr, &mig, sizeof(mig))) + return -EFAULT; + return 0; +} + static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) { u8 gtod_high; @@ -1089,6 +1214,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CRYPTO: ret = kvm_s390_vm_set_crypto(kvm, attr); break; + case KVM_S390_VM_MIGRATION: + ret = kvm_s390_vm_set_migration(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1111,6 +1239,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MODEL: ret = kvm_s390_get_cpu_model(kvm, attr); break; + case KVM_S390_VM_MIGRATION: + ret = kvm_s390_vm_get_migration(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1178,6 +1309,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) break; } break; + case KVM_S390_VM_MIGRATION: + ret = 0; + break; default: ret = -ENXIO; break; @@ -1285,6 +1419,182 @@ out: return r; } +/* + * Base address and length must be sent at the start of each block, therefore + * it's cheaper to send some clean data, as long as it's less than the size of + * two longs. + */ +#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) +/* for consistency */ +#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) + +/* + * This function searches for the next page with dirty CMMA attributes, and + * saves the attributes in the buffer up to either the end of the buffer or + * until a block of at least KVM_S390_MAX_BIT_DISTANCE clean bits is found; + * no trailing clean bytes are saved. + * In case no dirty bits were found, or if CMMA was not enabled or used, the + * output buffer will indicate 0 as length. + */ +static int kvm_s390_get_cmma_bits(struct kvm *kvm, + struct kvm_s390_cmma_log *args) +{ + struct kvm_s390_migration_state *s = kvm->arch.migration_state; + unsigned long bufsize, hva, pgstev, i, next, cur; + int srcu_idx, peek, r = 0, rr; + u8 *res; + + cur = args->start_gfn; + i = next = pgstev = 0; + + if (unlikely(!kvm->arch.use_cmma)) + return -ENXIO; + /* Invalid/unsupported flags were specified */ + if (args->flags & ~KVM_S390_CMMA_PEEK) + return -EINVAL; + /* Migration mode query, and we are not doing a migration */ + peek = !!(args->flags & KVM_S390_CMMA_PEEK); + if (!peek && !s) + return -EINVAL; + /* CMMA is disabled or was not used, or the buffer has length zero */ + bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); + if (!bufsize || !kvm->mm->context.use_cmma) { + memset(args, 0, sizeof(*args)); + return 0; + } + + if (!peek) { + /* We are not peeking, and there are no dirty pages */ + if (!atomic64_read(&s->dirty_pages)) { + memset(args, 0, sizeof(*args)); + return 0; + } + cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, + args->start_gfn); + if (cur >= s->bitmap_size) /* nothing found, loop back */ + cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0); + if (cur >= s->bitmap_size) { /* again! (very unlikely) */ + memset(args, 0, sizeof(*args)); + return 0; + } + next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1); + } + + res = vmalloc(bufsize); + if (!res) + return -ENOMEM; + + args->start_gfn = cur; + + down_read(&kvm->mm->mmap_sem); + srcu_idx = srcu_read_lock(&kvm->srcu); + while (i < bufsize) { + hva = gfn_to_hva(kvm, cur); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + /* decrement only if we actually flipped the bit to 0 */ + if (!peek && test_and_clear_bit(cur, s->pgste_bitmap)) + atomic64_dec(&s->dirty_pages); + r = get_pgste(kvm->mm, hva, &pgstev); + if (r < 0) + pgstev = 0; + /* save the value */ + res[i++] = (pgstev >> 24) & 0x3; + /* + * if the next bit is too far away, stop. + * if we reached the previous "next", find the next one + */ + if (!peek) { + if (next > cur + KVM_S390_MAX_BIT_DISTANCE) + break; + if (cur == next) + next = find_next_bit(s->pgste_bitmap, + s->bitmap_size, cur + 1); + /* reached the end of the bitmap or of the buffer, stop */ + if ((next >= s->bitmap_size) || + (next >= args->start_gfn + bufsize)) + break; + } + cur++; + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + up_read(&kvm->mm->mmap_sem); + args->count = i; + args->remaining = s ? atomic64_read(&s->dirty_pages) : 0; + + rr = copy_to_user((void __user *)args->values, res, args->count); + if (rr) + r = -EFAULT; + + vfree(res); + return r; +} + +/* + * This function sets the CMMA attributes for the given pages. If the input + * buffer has zero length, no action is taken, otherwise the attributes are + * set and the mm->context.use_cmma flag is set. + */ +static int kvm_s390_set_cmma_bits(struct kvm *kvm, + const struct kvm_s390_cmma_log *args) +{ + unsigned long hva, mask, pgstev, i; + uint8_t *bits; + int srcu_idx, r = 0; + + mask = args->mask; + + if (!kvm->arch.use_cmma) + return -ENXIO; + /* invalid/unsupported flags */ + if (args->flags != 0) + return -EINVAL; + /* Enforce sane limit on memory allocation */ + if (args->count > KVM_S390_CMMA_SIZE_MAX) + return -EINVAL; + /* Nothing to do */ + if (args->count == 0) + return 0; + + bits = vmalloc(sizeof(*bits) * args->count); + if (!bits) + return -ENOMEM; + + r = copy_from_user(bits, (void __user *)args->values, args->count); + if (r) { + r = -EFAULT; + goto out; + } + + down_read(&kvm->mm->mmap_sem); + srcu_idx = srcu_read_lock(&kvm->srcu); + for (i = 0; i < args->count; i++) { + hva = gfn_to_hva(kvm, args->start_gfn + i); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + + pgstev = bits[i]; + pgstev = pgstev << 24; + mask &= _PGSTE_GPS_USAGE_MASK; + set_pgste_bits(kvm->mm, hva, mask, pgstev); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + up_read(&kvm->mm->mmap_sem); + + if (!kvm->mm->context.use_cmma) { + down_write(&kvm->mm->mmap_sem); + kvm->mm->context.use_cmma = 1; + up_write(&kvm->mm->mmap_sem); + } +out: + vfree(bits); + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1363,6 +1673,29 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_s390_set_skeys(kvm, &args); break; } + case KVM_S390_GET_CMMA_BITS: { + struct kvm_s390_cmma_log args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + r = kvm_s390_get_cmma_bits(kvm, &args); + if (!r) { + r = copy_to_user(argp, &args, sizeof(args)); + if (r) + r = -EFAULT; + } + break; + } + case KVM_S390_SET_CMMA_BITS: { + struct kvm_s390_cmma_log args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + r = kvm_s390_set_cmma_bits(kvm, &args); + break; + } default: r = -ENOTTY; } @@ -1631,6 +1964,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); + if (kvm->arch.migration_state) { + vfree(kvm->arch.migration_state->pgste_bitmap); + kfree(kvm->arch.migration_state); + } KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } @@ -1975,7 +2312,6 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) if (!vcpu->arch.sie_block->cbrlo) return -ENOMEM; - vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI; return 0; } @@ -2439,7 +2775,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) { retry: kvm_s390_vcpu_request_handled(vcpu); - if (!vcpu->requests) + if (!kvm_request_pending(vcpu)) return 0; /* * We use MMU_RELOAD just to re-arm the ipte notifier for the @@ -2488,6 +2824,27 @@ retry: goto retry; } + if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) { + /* + * Disable CMMA virtualization; we will emulate the ESSA + * instruction manually, in order to provide additional + * functionalities needed for live migration. + */ + vcpu->arch.sie_block->ecb2 &= ~ECB2_CMMA; + goto retry; + } + + if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) { + /* + * Re-enable CMMA virtualization if CMMA is available and + * was used. + */ + if ((vcpu->kvm->arch.use_cmma) && + (vcpu->kvm->mm->context.use_cmma)) + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + goto retry; + } + /* nothing to do, just clear the request */ kvm_clear_request(KVM_REQ_UNHALT, vcpu); @@ -2682,6 +3039,9 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu) static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) { + struct mcck_volatile_info *mcck_info; + struct sie_page *sie_page; + VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); @@ -2692,6 +3052,15 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14; vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15; + if (exit_reason == -EINTR) { + VCPU_EVENT(vcpu, 3, "%s", "machine check"); + sie_page = container_of(vcpu->arch.sie_block, + struct sie_page, sie_block); + mcck_info = &sie_page->mcck_info; + kvm_s390_reinject_machine_check(vcpu, mcck_info); + return 0; + } + if (vcpu->arch.sie_block->icptcode > 0) { int rc = kvm_handle_sie_intercept(vcpu); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 55f5c8457d6d..6fedc8bc7a37 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -397,4 +397,6 @@ static inline int kvm_s390_use_sca_entries(void) */ return sclp.has_sigpif; } +void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, + struct mcck_volatile_info *mcck_info); #endif diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index e53292a89257..8a1dac793d6b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -24,6 +24,7 @@ #include <asm/ebcdic.h> #include <asm/sysinfo.h> #include <asm/pgtable.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> #include <asm/gmap.h> #include <asm/io.h> @@ -949,13 +950,72 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return 0; } +static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) +{ + struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state; + int r1, r2, nappended, entries; + unsigned long gfn, hva, res, pgstev, ptev; + unsigned long *cbrlo; + + /* + * We don't need to set SD.FPF.SK to 1 here, because if we have a + * machine check here we either handle it or crash + */ + + kvm_s390_get_regs_rre(vcpu, &r1, &r2); + gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT; + hva = gfn_to_hva(vcpu->kvm, gfn); + entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; + + if (kvm_is_error_hva(hva)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); + if (nappended < 0) { + res = orc ? 0x10 : 0; + vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ + return 0; + } + res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; + /* + * Set the block-content state part of the result. 0 means resident, so + * nothing to do if the page is valid. 2 is for preserved pages + * (non-present and non-zero), and 3 for zero pages (non-present and + * zero). + */ + if (ptev & _PAGE_INVALID) { + res |= 2; + if (pgstev & _PGSTE_GPS_ZERO) + res |= 1; + } + vcpu->run->s.regs.gprs[r1] = res; + /* + * It is possible that all the normal 511 slots were full, in which case + * we will now write in the 512th slot, which is reserved for host use. + * In both cases we let the normal essa handling code process all the + * slots, including the reserved one, if needed. + */ + if (nappended > 0) { + cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo & PAGE_MASK); + cbrlo[entries] = gfn << PAGE_SHIFT; + } + + if (orc) { + /* increment only if we are really flipping the bit to 1 */ + if (!test_and_set_bit(gfn, ms->pgste_bitmap)) + atomic64_inc(&ms->dirty_pages); + } + + return nappended; +} + static int handle_essa(struct kvm_vcpu *vcpu) { /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; unsigned long *cbrlo; struct gmap *gmap; - int i; + int i, orc; VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); gmap = vcpu->arch.gmap; @@ -965,12 +1025,45 @@ static int handle_essa(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - - if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6) + /* Check for invalid operation request code */ + orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; + if (orc > ESSA_MAX) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - /* Retry the ESSA instruction */ - kvm_s390_retry_instr(vcpu); + if (likely(!vcpu->kvm->arch.migration_state)) { + /* + * CMMA is enabled in the KVM settings, but is disabled in + * the SIE block and in the mm_context, and we are not doing + * a migration. Enable CMMA in the mm_context. + * Since we need to take a write lock to write to the context + * to avoid races with storage keys handling, we check if the + * value really needs to be written to; if the value is + * already correct, we do nothing and avoid the lock. + */ + if (vcpu->kvm->mm->context.use_cmma == 0) { + down_write(&vcpu->kvm->mm->mmap_sem); + vcpu->kvm->mm->context.use_cmma = 1; + up_write(&vcpu->kvm->mm->mmap_sem); + } + /* + * If we are here, we are supposed to have CMMA enabled in + * the SIE block. Enabling CMMA works on a per-CPU basis, + * while the context use_cmma flag is per process. + * It's possible that the context flag is enabled and the + * SIE flag is not, so we set the flag always; if it was + * already set, nothing changes, otherwise we enable it + * on this CPU too. + */ + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + /* Retry the ESSA instruction */ + kvm_s390_retry_instr(vcpu); + } else { + /* Account for the possible extra cbrl entry */ + i = do_essa(vcpu, orc); + if (i < 0) + return i; + entries += i; + } vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); down_read(&gmap->mm->mmap_sem); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 4719ecb9ab42..715c19c45d9a 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -26,16 +26,21 @@ struct vsie_page { struct kvm_s390_sie_block scb_s; /* 0x0000 */ + /* + * the backup info for machine check. ensure it's at + * the same offset as that in struct sie_page! + */ + struct mcck_volatile_info mcck_info; /* 0x0200 */ /* the pinned originial scb */ - struct kvm_s390_sie_block *scb_o; /* 0x0200 */ + struct kvm_s390_sie_block *scb_o; /* 0x0218 */ /* the shadow gmap in use by the vsie_page */ - struct gmap *gmap; /* 0x0208 */ + struct gmap *gmap; /* 0x0220 */ /* address of the last reported fault to guest2 */ - unsigned long fault_addr; /* 0x0210 */ - __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */ + unsigned long fault_addr; /* 0x0228 */ + __u8 reserved[0x0700 - 0x0230]; /* 0x0230 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ -} __packed; +}; /* trigger a validity icpt for the given scb */ static int set_validity_icpt(struct kvm_s390_sie_block *scb, @@ -801,6 +806,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct mcck_volatile_info *mcck_info; + struct sie_page *sie_page; int rc; handle_last_fault(vcpu, vsie_page); @@ -822,6 +829,14 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + if (rc == -EINTR) { + VCPU_EVENT(vcpu, 3, "%s", "machine check"); + sie_page = container_of(scb_s, struct sie_page, sie_block); + mcck_info = &sie_page->mcck_info; + kvm_s390_reinject_machine_check(vcpu, mcck_info); + return 0; + } + if (rc > 0) rc = 0; /* we could still have an icpt */ else if (rc == -EFAULT) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 6e97a2e3fd8d..01c6fbc3e85b 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -991,7 +991,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i } break; } - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: /* * Implicit input: * B1: pointer to ctx @@ -1329,6 +1329,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) bpf_jit_binary_lock_ro(header); fp->bpf_func = (void *) jit.prg_buf; fp->jited = 1; + fp->jited_len = jit.size; free_addrs: kfree(jit.addrs); out: diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 3f4ad19d9ec7..186fd8199f54 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -94,6 +94,10 @@ #define SO_COOKIE 0x003b +#define SCM_TIMESTAMPING_PKTINFO 0x003c + +#define SO_PEERGROUPS 0x003d + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 21de77419f48..8799ae9a8788 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -802,8 +802,13 @@ static void build_prologue(struct jit_ctx *ctx) { s32 stack_needed = BASE_STACKFRAME; - if (ctx->saw_frame_pointer || ctx->saw_tail_call) - stack_needed += MAX_BPF_STACK; + if (ctx->saw_frame_pointer || ctx->saw_tail_call) { + struct bpf_prog *prog = ctx->prog; + u32 stack_depth; + + stack_depth = prog->aux->stack_depth; + stack_needed += round_up(stack_depth, 16); + } if (ctx->saw_tail_call) stack_needed += 8; @@ -1217,7 +1222,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) } /* tail call */ - case BPF_JMP | BPF_CALL |BPF_X: + case BPF_JMP | BPF_TAIL_CALL: emit_tail_call(ctx); break; @@ -1555,6 +1560,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog->bpf_func = (void *)ctx.image; prog->jited = 1; + prog->jited_len = image_size; out_off: kfree(ctx.offset); diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index 910565547163..8739cf7795de 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -42,17 +42,15 @@ #define R5E %esi #define R6 %rdi #define R6E %edi -#define R7 %rbp -#define R7E %ebp +#define R7 %r9 /* don't use %rbp; it breaks stack traces */ +#define R7E %r9d #define R8 %r8 -#define R9 %r9 #define R10 %r10 #define R11 %r11 -#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ +#define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ ENTRY(FUNC); \ movq r1,r2; \ - movq r3,r4; \ leaq KEY+48(r8),r9; \ movq r10,r11; \ movl (r7),r5 ## E; \ @@ -70,9 +68,8 @@ je B192; \ leaq 32(r9),r9; -#define epilogue(FUNC,r1,r2,r3,r4,r5,r6,r7,r8,r9) \ +#define epilogue(FUNC,r1,r2,r5,r6,r7,r8,r9) \ movq r1,r2; \ - movq r3,r4; \ movl r5 ## E,(r9); \ movl r6 ## E,4(r9); \ movl r7 ## E,8(r9); \ @@ -88,12 +85,12 @@ movl TAB(,r6,4),r6 ## E; \ roll $16,r2 ## E; \ shrl $16,r4 ## E; \ - movzbl r4 ## H,r7 ## E; \ - movzbl r4 ## L,r4 ## E; \ + movzbl r4 ## L,r7 ## E; \ + movzbl r4 ## H,r4 ## E; \ xorl OFFSET(r8),ra ## E; \ xorl OFFSET+4(r8),rb ## E; \ - xorl TAB+3072(,r7,4),r5 ## E;\ - xorl TAB+2048(,r4,4),r6 ## E;\ + xorl TAB+3072(,r4,4),r5 ## E;\ + xorl TAB+2048(,r7,4),r6 ## E;\ movzbl r1 ## L,r7 ## E; \ movzbl r1 ## H,r4 ## E; \ movl TAB+1024(,r4,4),r4 ## E;\ @@ -101,19 +98,19 @@ roll $16,r1 ## E; \ shrl $16,r3 ## E; \ xorl TAB(,r7,4),r5 ## E; \ - movzbl r3 ## H,r7 ## E; \ - movzbl r3 ## L,r3 ## E; \ - xorl TAB+3072(,r7,4),r4 ## E;\ - xorl TAB+2048(,r3,4),r5 ## E;\ - movzbl r1 ## H,r7 ## E; \ - movzbl r1 ## L,r3 ## E; \ + movzbl r3 ## L,r7 ## E; \ + movzbl r3 ## H,r3 ## E; \ + xorl TAB+3072(,r3,4),r4 ## E;\ + xorl TAB+2048(,r7,4),r5 ## E;\ + movzbl r1 ## L,r7 ## E; \ + movzbl r1 ## H,r3 ## E; \ shrl $16,r1 ## E; \ - xorl TAB+3072(,r7,4),r6 ## E;\ - movl TAB+2048(,r3,4),r3 ## E;\ - movzbl r1 ## H,r7 ## E; \ - movzbl r1 ## L,r1 ## E; \ - xorl TAB+1024(,r7,4),r6 ## E;\ - xorl TAB(,r1,4),r3 ## E; \ + xorl TAB+3072(,r3,4),r6 ## E;\ + movl TAB+2048(,r7,4),r3 ## E;\ + movzbl r1 ## L,r7 ## E; \ + movzbl r1 ## H,r1 ## E; \ + xorl TAB+1024(,r1,4),r6 ## E;\ + xorl TAB(,r7,4),r3 ## E; \ movzbl r2 ## H,r1 ## E; \ movzbl r2 ## L,r7 ## E; \ shrl $16,r2 ## E; \ @@ -131,9 +128,9 @@ movl r4 ## E,r2 ## E; #define entry(FUNC,KEY,B128,B192) \ - prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) + prologue(FUNC,KEY,B128,B192,R2,R8,R1,R3,R4,R6,R10,R5,R11) -#define return(FUNC) epilogue(FUNC,R8,R2,R9,R7,R5,R6,R3,R4,R11) +#define return(FUNC) epilogue(FUNC,R8,R2,R5,R6,R3,R4,R11) #define encrypt_round(TAB,OFFSET) \ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 3c465184ff8a..16627fec80b2 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 +.section .rodata +.align 16 +.type aad_shift_arr, @object +.size aad_shift_arr, 272 +aad_shift_arr: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0xffffffffffffffffffffffffffffff0C + .octa 0xffffffffffffffffffffffffffff0D0C + .octa 0xffffffffffffffffffffffffff0E0D0C + .octa 0xffffffffffffffffffffffff0F0E0D0C + .octa 0xffffffffffffffffffffff0C0B0A0908 + .octa 0xffffffffffffffffffff0D0C0B0A0908 + .octa 0xffffffffffffffffff0E0D0C0B0A0908 + .octa 0xffffffffffffffff0F0E0D0C0B0A0908 + .octa 0xffffffffffffff0C0B0A090807060504 + .octa 0xffffffffffff0D0C0B0A090807060504 + .octa 0xffffffffff0E0D0C0B0A090807060504 + .octa 0xffffffff0F0E0D0C0B0A090807060504 + .octa 0xffffff0C0B0A09080706050403020100 + .octa 0xffff0D0C0B0A09080706050403020100 + .octa 0xff0E0D0C0B0A09080706050403020100 + .octa 0x0F0E0D0C0B0A09080706050403020100 + .text @@ -252,32 +275,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i + pxor \XMM2, \XMM2 -_get_AAD_loop\num_initial_blocks\operation: - movd (%r10), \TMP1 - pslldq $12, \TMP1 - psrldq $4, %xmm\i + cmp $16, %r11 + jl _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_blocks\num_initial_blocks\operation: + movdqu (%r10), %xmm\i + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor %xmm\i, \XMM2 + GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\num_initial_blocks\operation + + movdqu \XMM2, %xmm\i + cmp $0, %r11 + je _get_AAD_done\num_initial_blocks\operation + + pxor %xmm\i,%xmm\i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\num_initial_blocks\operation: + cmp $4, %r11 + jle _get_AAD_rest4\num_initial_blocks\operation + movq (%r10), \TMP1 + add $8, %r10 + sub $8, %r11 + pslldq $8, \TMP1 + psrldq $8, %xmm\i pxor \TMP1, %xmm\i + jmp _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_rest4\num_initial_blocks\operation: + cmp $0, %r11 + jle _get_AAD_rest0\num_initial_blocks\operation + mov (%r10), %eax + movq %rax, \TMP1 add $4, %r10 - sub $4, %r12 - jne _get_AAD_loop\num_initial_blocks\operation - - cmp $16, %r11 - je _get_AAD_loop2_done\num_initial_blocks\operation - - mov $16, %r12 -_get_AAD_loop2\num_initial_blocks\operation: + sub $4, %r10 + pslldq $12, \TMP1 psrldq $4, %xmm\i - sub $4, %r12 - cmp %r11, %r12 - jne _get_AAD_loop2\num_initial_blocks\operation - -_get_AAD_loop2_done\num_initial_blocks\operation: + pxor \TMP1, %xmm\i +_get_AAD_rest0\num_initial_blocks\operation: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \TMP1 + PSHUFB_XMM \TMP1, %xmm\i +_get_AAD_rest_final\num_initial_blocks\operation: PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor \XMM2, %xmm\i + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +_get_AAD_done\num_initial_blocks\operation: xor %r11, %r11 # initialise the data pointer offset as zero - - # start AES for num_initial_blocks blocks + # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 @@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks: # prepare plaintext/ciphertext for GHASH computation .endr .endif - GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks .if \i == 5 @@ -477,28 +534,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i -_get_AAD_loop\num_initial_blocks\operation: - movd (%r10), \TMP1 - pslldq $12, \TMP1 - psrldq $4, %xmm\i + pxor \XMM2, \XMM2 + + cmp $16, %r11 + jl _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_blocks\num_initial_blocks\operation: + movdqu (%r10), %xmm\i + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor %xmm\i, \XMM2 + GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\num_initial_blocks\operation + + movdqu \XMM2, %xmm\i + cmp $0, %r11 + je _get_AAD_done\num_initial_blocks\operation + + pxor %xmm\i,%xmm\i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some PT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\num_initial_blocks\operation: + cmp $4, %r11 + jle _get_AAD_rest4\num_initial_blocks\operation + movq (%r10), \TMP1 + add $8, %r10 + sub $8, %r11 + pslldq $8, \TMP1 + psrldq $8, %xmm\i pxor \TMP1, %xmm\i + jmp _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_rest4\num_initial_blocks\operation: + cmp $0, %r11 + jle _get_AAD_rest0\num_initial_blocks\operation + mov (%r10), %eax + movq %rax, \TMP1 add $4, %r10 - sub $4, %r12 - jne _get_AAD_loop\num_initial_blocks\operation - cmp $16, %r11 - je _get_AAD_loop2_done\num_initial_blocks\operation - mov $16, %r12 -_get_AAD_loop2\num_initial_blocks\operation: + sub $4, %r10 + pslldq $12, \TMP1 psrldq $4, %xmm\i - sub $4, %r12 - cmp %r11, %r12 - jne _get_AAD_loop2\num_initial_blocks\operation -_get_AAD_loop2_done\num_initial_blocks\operation: + pxor \TMP1, %xmm\i +_get_AAD_rest0\num_initial_blocks\operation: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \TMP1 + PSHUFB_XMM \TMP1, %xmm\i +_get_AAD_rest_final\num_initial_blocks\operation: PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor \XMM2, %xmm\i + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +_get_AAD_done\num_initial_blocks\operation: xor %r11, %r11 # initialise the data pointer offset as zero - - # start AES for num_initial_blocks blocks + # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 @@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks: # prepare plaintext/ciphertext for GHASH computation .endr .endif - GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks .if \i == 5 @@ -1454,18 +1549,35 @@ _return_T_decrypt: mov arg10, %r11 # %r11 = auth_tag_len cmp $16, %r11 je _T_16_decrypt - cmp $12, %r11 - je _T_12_decrypt + cmp $8, %r11 + jl _T_4_decrypt _T_8_decrypt: MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) - jmp _return_T_done_decrypt -_T_12_decrypt: - MOVQ_R64_XMM %xmm0, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 psrldq $8, %xmm0 + cmp $0, %r11 + je _return_T_done_decrypt +_T_4_decrypt: + movd %xmm0, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + psrldq $4, %xmm0 + cmp $0, %r11 + je _return_T_done_decrypt +_T_123_decrypt: movd %xmm0, %eax - mov %eax, 8(%r10) + cmp $2, %r11 + jl _T_1_decrypt + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done_decrypt + add $2, %r10 + sar $16, %eax +_T_1_decrypt: + mov %al, (%r10) jmp _return_T_done_decrypt _T_16_decrypt: movdqu %xmm0, (%r10) @@ -1718,18 +1830,35 @@ _return_T_encrypt: mov arg10, %r11 # %r11 = auth_tag_len cmp $16, %r11 je _T_16_encrypt - cmp $12, %r11 - je _T_12_encrypt + cmp $8, %r11 + jl _T_4_encrypt _T_8_encrypt: MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) - jmp _return_T_done_encrypt -_T_12_encrypt: - MOVQ_R64_XMM %xmm0, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 psrldq $8, %xmm0 + cmp $0, %r11 + je _return_T_done_encrypt +_T_4_encrypt: + movd %xmm0, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + psrldq $4, %xmm0 + cmp $0, %r11 + je _return_T_done_encrypt +_T_123_encrypt: movd %xmm0, %eax - mov %eax, 8(%r10) + cmp $2, %r11 + jl _T_1_encrypt + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done_encrypt + add $2, %r10 + sar $16, %eax +_T_1_encrypt: + mov %al, (%r10) jmp _return_T_done_encrypt _T_16_encrypt: movdqu %xmm0, (%r10) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index d664382c6e56..faecb1518bf8 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -155,6 +155,30 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 +.section .rodata +.align 16 +.type aad_shift_arr, @object +.size aad_shift_arr, 272 +aad_shift_arr: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0xffffffffffffffffffffffffffffff0C + .octa 0xffffffffffffffffffffffffffff0D0C + .octa 0xffffffffffffffffffffffffff0E0D0C + .octa 0xffffffffffffffffffffffff0F0E0D0C + .octa 0xffffffffffffffffffffff0C0B0A0908 + .octa 0xffffffffffffffffffff0D0C0B0A0908 + .octa 0xffffffffffffffffff0E0D0C0B0A0908 + .octa 0xffffffffffffffff0F0E0D0C0B0A0908 + .octa 0xffffffffffffff0C0B0A090807060504 + .octa 0xffffffffffff0D0C0B0A090807060504 + .octa 0xffffffffff0E0D0C0B0A090807060504 + .octa 0xffffffff0F0E0D0C0B0A090807060504 + .octa 0xffffff0C0B0A09080706050403020100 + .octa 0xffff0D0C0B0A09080706050403020100 + .octa 0xff0E0D0C0B0A09080706050403020100 + .octa 0x0F0E0D0C0B0A09080706050403020100 + + .text @@ -372,41 +396,72 @@ VARIABLE_OFFSET = 16*8 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) + j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_i, reg_i, reg_i -_get_AAD_loop\@: - vmovd (%r10), \T1 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i - - add $4, %r10 - sub $4, %r12 - jg _get_AAD_loop\@ - - - cmp $16, %r11 - je _get_AAD_loop2_done\@ - mov $16, %r12 - -_get_AAD_loop2\@: - vpsrldq $4, reg_i, reg_i - sub $4, %r12 - cmp %r11, %r12 - jg _get_AAD_loop2\@ - -_get_AAD_loop2_done\@: - - #byte-reflect the AAD data - vpshufb SHUF_MASK(%rip), reg_i, reg_i - + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_j, reg_j, reg_j + vpxor reg_i, reg_i, reg_i + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu reg_j, reg_i + cmp $0, %r11 + je _get_AAD_done\@ + + vpxor reg_i, reg_i, reg_i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, reg_i, reg_i + vpxor \T1, reg_i, reg_i + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, reg_i, reg_i +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_j, reg_i, reg_i + GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11, %r11 @@ -480,7 +535,6 @@ _get_AAD_loop2_done\@: i = (8-\num_initial_blocks) j = (9-\num_initial_blocks) setreg - GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 .rep \num_initial_blocks vpxor reg_i, reg_j, reg_j @@ -1427,19 +1481,36 @@ _return_T\@: cmp $16, %r11 je _T_16\@ - cmp $12, %r11 - je _T_12\@ + cmp $8, %r11 + jl _T_4\@ _T_8\@: vmovq %xmm9, %rax mov %rax, (%r10) - jmp _return_T_done\@ -_T_12\@: - vmovq %xmm9, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: vmovd %xmm9, %eax - mov %eax, 8(%r10) + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) jmp _return_T_done\@ _T_16\@: @@ -1631,41 +1702,73 @@ ENDPROC(aesni_gcm_dec_avx_gen2) .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) + j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_i, reg_i, reg_i -_get_AAD_loop\@: - vmovd (%r10), \T1 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i - - add $4, %r10 - sub $4, %r12 - jg _get_AAD_loop\@ - - - cmp $16, %r11 - je _get_AAD_loop2_done\@ - mov $16, %r12 - -_get_AAD_loop2\@: - vpsrldq $4, reg_i, reg_i - sub $4, %r12 - cmp %r11, %r12 - jg _get_AAD_loop2\@ - -_get_AAD_loop2_done\@: - - #byte-reflect the AAD data - vpshufb SHUF_MASK(%rip), reg_i, reg_i - + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_j, reg_j, reg_j + vpxor reg_i, reg_i, reg_i + + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu reg_j, reg_i + cmp $0, %r11 + je _get_AAD_done\@ + + vpxor reg_i, reg_i, reg_i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, reg_i, reg_i + vpxor \T1, reg_i, reg_i + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, reg_i, reg_i +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_j, reg_i, reg_i + GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11, %r11 @@ -1740,7 +1843,6 @@ _get_AAD_loop2_done\@: i = (8-\num_initial_blocks) j = (9-\num_initial_blocks) setreg - GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 .rep \num_initial_blocks vpxor reg_i, reg_j, reg_j @@ -2702,19 +2804,36 @@ _return_T\@: cmp $16, %r11 je _T_16\@ - cmp $12, %r11 - je _T_12\@ + cmp $8, %r11 + jl _T_4\@ _T_8\@: vmovq %xmm9, %rax mov %rax, (%r10) - jmp _return_T_done\@ -_T_12\@: - vmovq %xmm9, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: vmovd %xmm9, %eax - mov %eax, 8(%r10) + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) jmp _return_T_done\@ _T_16\@: diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 93de8ea51548..4a55cdcdc008 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -61,6 +61,11 @@ struct aesni_rfc4106_gcm_ctx { u8 nonce[4]; }; +struct generic_gcmaes_ctx { + u8 hash_subkey[16] AESNI_ALIGN_ATTR; + struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; +}; + struct aesni_xts_ctx { u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; @@ -102,13 +107,11 @@ asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, * u8 *out, Ciphertext output. Encrypt in-place is allowed. * const u8 *in, Plaintext input * unsigned long plaintext_len, Length of data in bytes for encryption. - * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) - * concatenated with 8 byte Initialisation Vector (from IPSec ESP - * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. + * 16-byte aligned pointer. * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. * const u8 *aad, Additional Authentication Data (AAD) - * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this - * is going to be 8 or 12 bytes + * unsigned long aad_len, Length of AAD in bytes. * u8 *auth_tag, Authenticated Tag output. * unsigned long auth_tag_len), Authenticated Tag Length in bytes. * Valid values are 16 (most likely), 12 or 8. @@ -123,9 +126,8 @@ asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, * u8 *out, Plaintext output. Decrypt in-place is allowed. * const u8 *in, Ciphertext input * unsigned long ciphertext_len, Length of data in bytes for decryption. - * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) - * concatenated with 8 byte Initialisation Vector (from IPSec ESP - * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. + * 16-byte aligned pointer. * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. * const u8 *aad, Additional Authentication Data (AAD) * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going @@ -275,6 +277,16 @@ aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) align = 1; return PTR_ALIGN(crypto_aead_ctx(tfm), align); } + +static inline struct +generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) +{ + unsigned long align = AESNI_ALIGN; + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + return PTR_ALIGN(crypto_aead_ctx(tfm), align); +} #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) @@ -712,32 +724,34 @@ static int rfc4106_set_authsize(struct crypto_aead *parent, return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); } -static int helper_rfc4106_encrypt(struct aead_request *req) +static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, + u8 *hash_subkey, u8 *iv, void *aes_ctx) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; - __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); struct scatter_walk src_sg_walk; struct scatter_walk dst_sg_walk = {}; - unsigned int i; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length equal */ - /* to 16 or 20 bytes */ - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && (!PageHighMem(sg_page(req->src)) || @@ -768,7 +782,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req) kernel_fpu_begin(); aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv, - ctx->hash_subkey, assoc, req->assoclen - 8, + hash_subkey, assoc, assoclen, dst + req->cryptlen, auth_tag_len); kernel_fpu_end(); @@ -791,37 +805,20 @@ static int helper_rfc4106_encrypt(struct aead_request *req) return 0; } -static int helper_rfc4106_decrypt(struct aead_request *req) +static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, + u8 *hash_subkey, u8 *iv, void *aes_ctx) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; unsigned long tempCipherLen = 0; - __be32 counter = cpu_to_be32(1); - int retval = 0; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); u8 authTag[16]; struct scatter_walk src_sg_walk; struct scatter_walk dst_sg_walk = {}; - unsigned int i; - - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length */ - /* equal to 16 or 20 bytes */ + int retval = 0; tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && (!PageHighMem(sg_page(req->src)) || @@ -838,7 +835,6 @@ static int helper_rfc4106_decrypt(struct aead_request *req) scatterwalk_start(&dst_sg_walk, req->dst); dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; } - } else { /* Allocate memory for src, dst, assoc */ assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); @@ -850,9 +846,10 @@ static int helper_rfc4106_decrypt(struct aead_request *req) dst = src; } + kernel_fpu_begin(); aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, - ctx->hash_subkey, assoc, req->assoclen - 8, + hash_subkey, assoc, assoclen, authTag, auth_tag_len); kernel_fpu_end(); @@ -875,6 +872,60 @@ static int helper_rfc4106_decrypt(struct aead_request *req) kfree(assoc); } return retval; + +} + +static int helper_rfc4106_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + unsigned int i; + __be32 counter = cpu_to_be32(1); + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length equal */ + /* to 16 or 20 bytes */ + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) + return -EINVAL; + + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, + aes_ctx); +} + +static int helper_rfc4106_decrypt(struct aead_request *req) +{ + __be32 counter = cpu_to_be32(1); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + unsigned int i; + + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) + return -EINVAL; + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length */ + /* equal to 16 or 20 bytes */ + + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, + aes_ctx); } static int rfc4106_encrypt(struct aead_request *req) @@ -1035,6 +1086,46 @@ struct { }; #ifdef CONFIG_X86_64 +static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, + unsigned int key_len) +{ + struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); + + return aes_set_key_common(crypto_aead_tfm(aead), + &ctx->aes_key_expanded, key, key_len) ?: + rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); +} + +static int generic_gcmaes_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + __be32 counter = cpu_to_be32(1); + + memcpy(iv, req->iv, 12); + *((__be32 *)(iv+12)) = counter; + + return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv, + aes_ctx); +} + +static int generic_gcmaes_decrypt(struct aead_request *req) +{ + __be32 counter = cpu_to_be32(1); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + + memcpy(iv, req->iv, 12); + *((__be32 *)(iv+12)) = counter; + + return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv, + aes_ctx); +} + static struct aead_alg aesni_aead_algs[] = { { .setkey = common_rfc4106_set_key, .setauthsize = common_rfc4106_set_authsize, @@ -1069,6 +1160,23 @@ static struct aead_alg aesni_aead_algs[] = { { .cra_ctxsize = sizeof(struct cryptd_aead *), .cra_module = THIS_MODULE, }, +}, { + .setkey = generic_gcmaes_set_key, + .setauthsize = generic_gcmaes_set_authsize, + .encrypt = generic_gcmaes_encrypt, + .decrypt = generic_gcmaes_decrypt, + .ivsize = 12, + .maxauthsize = 16, + .base = { + .cra_name = "gcm(aes)", + .cra_driver_name = "generic-gcm-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), + .cra_alignmask = AESNI_ALIGN - 1, + .cra_module = THIS_MODULE, + }, } }; #else static struct aead_alg aesni_aead_algs[0]; diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 24ac9fad832d..d61e57960fe0 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -176,9 +176,6 @@ __glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, src -= 1; dst -= 1; } while (nbytes >= func_bytes); - - if (nbytes < bsize) - goto done; } } diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index 2dd3674b5a1e..458409b7568d 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -269,19 +269,19 @@ static struct sha512_hash_ctx * LAST */ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; - return ctx; + goto unlock; } if (ctx->status & HASH_CTX_STS_PROCESSING) { /* Cannot submit to a currently processing job. */ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; - return ctx; + goto unlock; } if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { /* Cannot update a finished job. */ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; - return ctx; + goto unlock; } @@ -363,6 +363,7 @@ static struct sha512_hash_ctx } ctx = sha512_ctx_mgr_resubmit(mgr, ctx); +unlock: spin_unlock_irqrestore(&cstate->work_lock, irqflags); return ctx; } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 695605eb1dfb..1588e9e3dc01 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,28 +48,31 @@ #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS /* x86-specific vcpu->requests bit members */ -#define KVM_REQ_MIGRATE_TIMER 8 -#define KVM_REQ_REPORT_TPR_ACCESS 9 -#define KVM_REQ_TRIPLE_FAULT 10 -#define KVM_REQ_MMU_SYNC 11 -#define KVM_REQ_CLOCK_UPDATE 12 -#define KVM_REQ_EVENT 14 -#define KVM_REQ_APF_HALT 15 -#define KVM_REQ_STEAL_UPDATE 16 -#define KVM_REQ_NMI 17 -#define KVM_REQ_PMU 18 -#define KVM_REQ_PMI 19 -#define KVM_REQ_SMI 20 -#define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_HV_CRASH 26 -#define KVM_REQ_IOAPIC_EOI_EXIT 27 -#define KVM_REQ_HV_RESET 28 -#define KVM_REQ_HV_EXIT 29 -#define KVM_REQ_HV_STIMER 30 +#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) +#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) +#define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) +#define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) +#define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) +#define KVM_REQ_EVENT KVM_ARCH_REQ(6) +#define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) +#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) +#define KVM_REQ_NMI KVM_ARCH_REQ(9) +#define KVM_REQ_PMU KVM_ARCH_REQ(10) +#define KVM_REQ_PMI KVM_ARCH_REQ(11) +#define KVM_REQ_SMI KVM_ARCH_REQ(12) +#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) +#define KVM_REQ_MCLOCK_INPROGRESS \ + KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC \ + KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_GLOBAL_CLOCK_UPDATE KVM_ARCH_REQ(16) +#define KVM_REQ_APIC_PAGE_RELOAD \ + KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_CRASH KVM_ARCH_REQ(18) +#define KVM_REQ_IOAPIC_EOI_EXIT KVM_ARCH_REQ(19) +#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20) +#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) +#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -254,7 +257,8 @@ union kvm_mmu_page_role { unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; - unsigned :8; + unsigned ad_disabled:1; + unsigned :7; /* * This is left at the top of the word so that diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d406894cd9a2..5573c75f8e4c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -426,6 +426,8 @@ #define MSR_IA32_TSC_ADJUST 0x0000003b #define MSR_IA32_BNDCFGS 0x00000d90 +#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc + #define MSR_IA32_XSS 0x00000da0 #define FEATURE_CONTROL_LOCKED (1<<0) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index f6d20f6cca12..11071fcd630e 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -43,6 +43,7 @@ #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/smap.h> #include <xen/interface/xen.h> #include <xen/interface/sched.h> @@ -50,6 +51,8 @@ #include <xen/interface/platform.h> #include <xen/interface/xen-mca.h> +struct xen_dm_op_buf; + /* * The hypercall asms have to meet several constraints: * - Work on 32- and 64-bit. @@ -214,10 +217,12 @@ privcmd_call(unsigned call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); + stac(); asm volatile("call *%[call]" : __HYPERCALL_5PARAM : [call] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); + clac(); return (long)__res; } @@ -474,9 +479,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg) static inline int HYPERVISOR_dm_op( - domid_t dom, unsigned int nr_bufs, void *bufs) + domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs) { - return _hypercall3(int, dm_op, dom, nr_bufs, bufs); + int ret; + stac(); + ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs); + clac(); + return ret; } static inline void diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a6fd40aade7c..da6728383052 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} + static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 80890dee66ce..fb0055953fbc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) \ goto done; \ ctxt->_eip += sizeof(_type); \ - _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ + memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \ ctxt->fetch.ptr += sizeof(_type); \ _x; \ }) @@ -3942,6 +3942,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) } /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save + * and restore MXCSR. + */ +static size_t __fxstate_size(int nregs) +{ + return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16; +} + +static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt) +{ + bool cr4_osfxsr; + if (ctxt->mode == X86EMUL_MODE_PROT64) + return __fxstate_size(16); + + cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR; + return __fxstate_size(cr4_osfxsr ? 8 : 0); +} + +/* * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, * 1) 16 bit mode * 2) 32 bit mode @@ -3962,7 +3981,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) static int em_fxsave(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; - size_t size; int rc; rc = check_fxsr(ctxt); @@ -3978,68 +3996,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) - size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); - else - size = offsetof(struct fxregs_state, xmm_space[0]); - - return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); -} - -static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, - struct fxregs_state *new) -{ - int rc = X86EMUL_CONTINUE; - struct fxregs_state old; - - rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); - if (rc != X86EMUL_CONTINUE) - return rc; - - /* - * 64 bit host will restore XMM 8-15, which is not correct on non-64 - * bit guests. Load the current values in order to preserve 64 bit - * XMMs after fxrstor. - */ -#ifdef CONFIG_X86_64 - /* XXX: accessing XMM 8-15 very awkwardly */ - memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); -#endif - - /* - * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but - * does save and restore MXCSR. - */ - if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) - memcpy(new->xmm_space, old.xmm_space, 8 * 16); - - return rc; + return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, + fxstate_size(ctxt)); } static int em_fxrstor(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; int rc; + size_t size; rc = check_fxsr(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); - if (rc != X86EMUL_CONTINUE) - return rc; + ctxt->ops->get_fpu(ctxt); - if (fx_state.mxcsr >> 16) - return emulate_gp(ctxt, 0); + size = fxstate_size(ctxt); + if (size < __fxstate_size(16)) { + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + if (rc != X86EMUL_CONTINUE) + goto out; + } - ctxt->ops->get_fpu(ctxt); + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + if (rc != X86EMUL_CONTINUE) + goto out; - if (ctxt->mode < X86EMUL_MODE_PROT64) - rc = fxrstor_fixup(ctxt, &fx_state); + if (fx_state.mxcsr >> 16) { + rc = emulate_gp(ctxt, 0); + goto out; + } if (rc == X86EMUL_CONTINUE) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); +out: ctxt->ops->put_fpu(ctxt); return rc; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d24c8742d9b0..2819d4c123eb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1495,6 +1495,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { + WARN_ON(!apic->lapic_timer.hv_timer_in_use); preempt_disable(); kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; @@ -1503,25 +1504,56 @@ static void cancel_hv_timer(struct kvm_lapic *apic) static bool start_hv_timer(struct kvm_lapic *apic) { - u64 tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + int r; - if ((atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) || - kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - } else { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); + if (!kvm_x86_ops->set_hv_timer) + return false; + + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return false; - /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) - cancel_hv_timer(apic); + r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); + if (r < 0) + return false; + + ktimer->hv_timer_in_use = true; + hrtimer_cancel(&ktimer->timer); + + /* + * Also recheck ktimer->pending, in case the sw timer triggered in + * the window. For periodic timer, leave the hv timer running for + * simplicity, and the deadline will be recomputed on the next vmexit. + */ + if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) { + if (r) + apic_timer_expired(apic); + return false; } - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - return apic->lapic_timer.hv_timer_in_use; + + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true); + return true; +} + +static void start_sw_timer(struct kvm_lapic *apic) +{ + struct kvm_timer *ktimer = &apic->lapic_timer; + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false); +} + +static void restart_apic_timer(struct kvm_lapic *apic) +{ + if (!start_hv_timer(apic)) + start_sw_timer(apic); } void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) @@ -1535,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); - if (!start_hv_timer(apic)) - start_sw_period(apic); + restart_apic_timer(apic); } } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(apic->lapic_timer.hv_timer_in_use); - - start_hv_timer(apic); + restart_apic_timer(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1556,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; /* Possibly the TSC deadline timer is not enabled yet */ - if (!apic->lapic_timer.hv_timer_in_use) - return; - - cancel_hv_timer(apic); + if (apic->lapic_timer.hv_timer_in_use) + start_sw_timer(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); - if (atomic_read(&apic->lapic_timer.pending)) - return; +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) - start_sw_period(apic); - else if (apic_lvtt_tscdeadline(apic)) - start_sw_tscdeadline(apic); + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + restart_apic_timer(apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { atomic_set(&apic->lapic_timer.pending, 0); - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - if (set_target_expiration(apic) && - !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_period(apic); - } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_tscdeadline(apic); - } + if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + && !set_target_expiration(apic)) + return; + + restart_apic_timer(apic); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -1813,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu)) - return 0; - - return apic->lapic_timer.tscdeadline; -} - u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index bcbe811f3b97..29caa2c3dff9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); @@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb8225969255..aafd399cf8c6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -183,13 +183,13 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; +static u64 __read_mostly shadow_mmio_value; static u64 __read_mostly shadow_present_mask; /* - * The mask/value to distinguish a PTE that has been marked not-present for - * access tracking purposes. - * The mask would be either 0 if access tracking is disabled, or - * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. + * Non-present SPTEs with shadow_acc_track_value set are in place for access + * tracking. */ static u64 __read_mostly shadow_acc_track_mask; static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; @@ -207,16 +207,40 @@ static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIF static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { + BUG_ON((mmio_mask & mmio_value) != mmio_value); + shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) +{ + return sp->role.ad_disabled; +} + +static inline bool spte_ad_enabled(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return !(spte & shadow_acc_track_value); +} + +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline u64 spte_shadow_dirty_mask(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; +} + static inline bool is_access_track_spte(u64 spte) { - /* Always false if shadow_acc_track_mask is zero. */ - return (spte & shadow_acc_track_mask) == shadow_acc_track_value; + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } /* @@ -270,7 +294,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; - mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; + mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -278,7 +302,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, static bool is_mmio_spte(u64 spte) { - return (spte & shadow_mmio_mask) == shadow_mmio_mask; + return (spte & shadow_mmio_mask) == shadow_mmio_value; } static gfn_t get_mmio_spte_gfn(u64 spte) @@ -315,12 +339,20 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } +/* + * Sets the shadow PTE masks used by the MMU. + * + * Assumptions: + * - Setting either @accessed_mask or @dirty_mask requires setting both + * - At least one of @accessed_mask or @acc_track_mask must be set + */ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, u64 acc_track_mask) { - if (acc_track_mask != 0) - acc_track_mask |= SPTE_SPECIAL_MASK; + BUG_ON(!dirty_mask != !accessed_mask); + BUG_ON(!accessed_mask && !acc_track_mask); + BUG_ON(acc_track_mask & shadow_acc_track_value); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -329,7 +361,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_x_mask = x_mask; shadow_present_mask = p_mask; shadow_acc_track_mask = acc_track_mask; - WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -549,7 +580,7 @@ static bool spte_has_volatile_bits(u64 spte) is_access_track_spte(spte)) return true; - if (shadow_accessed_mask) { + if (spte_ad_enabled(spte)) { if ((spte & shadow_accessed_mask) == 0 || (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) return true; @@ -560,14 +591,17 @@ static bool spte_has_volatile_bits(u64 spte) static bool is_accessed_spte(u64 spte) { - return shadow_accessed_mask ? spte & shadow_accessed_mask - : !is_access_track_spte(spte); + u64 accessed_mask = spte_shadow_accessed_mask(spte); + + return accessed_mask ? spte & accessed_mask + : !is_access_track_spte(spte); } static bool is_dirty_spte(u64 spte) { - return shadow_dirty_mask ? spte & shadow_dirty_mask - : spte & PT_WRITABLE_MASK; + u64 dirty_mask = spte_shadow_dirty_mask(spte); + + return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -707,10 +741,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep) static u64 mark_spte_for_access_track(u64 spte) { - if (shadow_accessed_mask != 0) + if (spte_ad_enabled(spte)) return spte & ~shadow_accessed_mask; - if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + if (is_access_track_spte(spte)) return spte; /* @@ -729,7 +763,6 @@ static u64 mark_spte_for_access_track(u64 spte) spte |= (spte & shadow_acc_track_saved_bits_mask) << shadow_acc_track_saved_bits_shift; spte &= ~shadow_acc_track_mask; - spte |= shadow_acc_track_value; return spte; } @@ -741,6 +774,7 @@ static u64 restore_acc_track_spte(u64 spte) u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) & shadow_acc_track_saved_bits_mask; + WARN_ON_ONCE(spte_ad_enabled(spte)); WARN_ON_ONCE(!is_access_track_spte(spte)); new_spte &= ~shadow_acc_track_mask; @@ -759,7 +793,7 @@ static bool mmu_spte_age(u64 *sptep) if (!is_accessed_spte(spte)) return false; - if (shadow_accessed_mask) { + if (spte_ad_enabled(spte)) { clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } else { @@ -1390,6 +1424,22 @@ static bool spte_clear_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } +static bool wrprot_ad_disabled_spte(u64 *sptep) +{ + bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, + (unsigned long *)sptep); + if (was_writable) + kvm_set_pfn_dirty(spte_to_pfn(*sptep)); + + return was_writable; +} + +/* + * Gets the GFN ready for another round of dirty logging by clearing the + * - D bit on ad-enabled SPTEs, and + * - W bit on ad-disabled SPTEs. + * Returns true iff any D or W bits were cleared. + */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; @@ -1397,7 +1447,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_clear_dirty(sptep); + if (spte_ad_enabled(*sptep)) + flush |= spte_clear_dirty(sptep); + else + flush |= wrprot_ad_disabled_spte(sptep); return flush; } @@ -1420,7 +1473,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_set_dirty(sptep); + if (spte_ad_enabled(*sptep)) + flush |= spte_set_dirty(sptep); return flush; } @@ -1452,7 +1506,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } /** - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write + * protect the page if the D-bit isn't supported. * @kvm: kvm instance * @slot: slot to clear D-bit * @gfn_offset: start of the BITS_PER_LONG pages we care about @@ -1766,18 +1821,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep; struct rmap_iterator iter; - /* - * If there's no access bit in the secondary pte set by the hardware and - * fast access tracking is also not enabled, it's up to gup-fast/gup to - * set the access bit in the primary pte or in the page structure. - */ - if (!shadow_accessed_mask && !shadow_acc_track_mask) - goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) if (is_accessed_spte(*sptep)) return 1; -out: return 0; } @@ -1798,18 +1844,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - /* - * In case of absence of EPT Access and Dirty Bits supports, - * emulate the accessed bit for EPT, by checking if this page has - * an EPT mapping, and clearing it if it does. On the next access, - * a new EPT mapping will be established. - * This has some overhead, but not as much as the cost of swapping - * out actively used pages or breaking up actively used hugepages. - */ - if (!shadow_accessed_mask && !shadow_acc_track_mask) - return kvm_handle_hva_range(kvm, start, end, 0, - kvm_unmap_rmapp); - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } @@ -2398,7 +2432,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + shadow_user_mask | shadow_x_mask; + + if (sp_ad_disabled(sp)) + spte |= shadow_acc_track_value; + else + spte |= shadow_accessed_mask; mmu_spte_set(sptep, spte); @@ -2666,10 +2705,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte = 0; int ret = 0; + struct kvm_mmu_page *sp; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; + sp = page_header(__pa(sptep)); + if (sp_ad_disabled(sp)) + spte |= shadow_acc_track_value; + /* * For the EPT case, shadow_present_mask is 0 if hardware * supports exec-only page table entries. In that case, @@ -2678,7 +2722,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, */ spte |= shadow_present_mask; if (!speculative) - spte |= shadow_accessed_mask; + spte |= spte_shadow_accessed_mask(spte); if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; @@ -2735,7 +2779,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access & ACC_WRITE_MASK) { kvm_vcpu_mark_page_dirty(vcpu, gfn); - spte |= shadow_dirty_mask; + spte |= spte_shadow_dirty_mask(spte); } if (speculative) @@ -2877,16 +2921,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; + sp = page_header(__pa(sptep)); + /* - * Since it's no accessed bit on EPT, it's no way to - * distinguish between actually accessed translations - * and prefetched, so disable pte prefetch if EPT is - * enabled. + * Without accessed bits, there's no way to distinguish between + * actually accessed translations and prefetched, so disable pte + * prefetch if accessed bits aren't available. */ - if (!shadow_accessed_mask) + if (sp_ad_disabled(sp)) return; - sp = page_header(__pa(sptep)); if (sp->role.level > PT_PAGE_TABLE_LEVEL) return; @@ -4290,6 +4334,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->base_role.word = 0; context->base_role.smm = is_smm(vcpu); + context->base_role.ad_disabled = (shadow_accessed_mask == 0); context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -4377,6 +4422,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->root_level = context->shadow_root_level; context->root_hpa = INVALID_PAGE; context->direct_map = false; + context->base_role.ad_disabled = !accessed_dirty; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); @@ -4636,6 +4682,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.smep_andnot_wp = 1; mask.smap_andnot_wp = 1; mask.smm = 1; + mask.ad_disabled = 1; /* * If we don't have indirect shadow pages, it means no page is diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 330bf3a811fb..a276834950c1 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 5a24b846a1cb..8b97a6cba8d1 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -30,8 +30,9 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", __entry->mmu_valid_gen, \ + trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \ + " %snxe %sad root %u %s%c", \ + __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? " pae" : "", \ role.quadrant, \ @@ -39,6 +40,7 @@ access_str[role.access], \ role.invalid ? " invalid" : "", \ role.nxe ? "" : "!", \ + role.ad_disabled ? "!" : "", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ saved_ptr; \ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 33460fcdeef9..905ea6052517 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -190,6 +190,7 @@ struct vcpu_svm { struct nested_state nested; bool nmi_singlestep; + u64 nmi_singlestep_guest_rflags; unsigned int3_injected; unsigned long int3_rip; @@ -964,6 +965,18 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +static void disable_nmi_singlestep(struct vcpu_svm *svm) +{ + svm->nmi_singlestep = false; + if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { + /* Clear our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_RF; + } +} + /* Note: * This hash table is used to map VM_ID to a struct kvm_arch, * when handling AMD IOMMU GALOG notification to schedule in @@ -1713,11 +1726,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { - return to_svm(vcpu)->vmcb->save.rflags; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long rflags = svm->vmcb->save.rflags; + + if (svm->nmi_singlestep) { + /* Hide our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + rflags &= ~X86_EFLAGS_RF; + } + return rflags; } static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + if (to_svm(vcpu)->nmi_singlestep) + rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); + /* * Any change of EFLAGS.VM is accompanied by a reload of SS * (caused by either a task switch or an inter-privilege IRET), @@ -2112,10 +2138,7 @@ static int db_interception(struct vcpu_svm *svm) } if (svm->nmi_singlestep) { - svm->nmi_singlestep = false; - if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) - svm->vmcb->save.rflags &= - ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + disable_nmi_singlestep(svm); } if (svm->vcpu.guest_debug & @@ -2370,8 +2393,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.efer & EFER_SVME) - || !is_paging(&svm->vcpu)) { + if (!(svm->vcpu.arch.efer & EFER_SVME) || + !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -2381,7 +2404,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) return 1; } - return 0; + return 0; } static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, @@ -2534,6 +2557,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } +/* DB exceptions for our internal use must not cause vmexit */ +static int nested_svm_intercept_db(struct vcpu_svm *svm) +{ + unsigned long dr6; + + /* if we're not singlestepping, it's not ours */ + if (!svm->nmi_singlestep) + return NESTED_EXIT_DONE; + + /* if it's not a singlestep exception, it's not ours */ + if (kvm_get_dr(&svm->vcpu, 6, &dr6)) + return NESTED_EXIT_DONE; + if (!(dr6 & DR6_BS)) + return NESTED_EXIT_DONE; + + /* if the guest is singlestepping, it should get the vmexit */ + if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { + disable_nmi_singlestep(svm); + return NESTED_EXIT_DONE; + } + + /* it's ours, the nested hypervisor must not see this one */ + return NESTED_EXIT_HOST; +} + static int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; @@ -2589,8 +2637,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) - vmexit = NESTED_EXIT_DONE; + if (svm->nested.intercept_exceptions & excp_bits) { + if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) + vmexit = nested_svm_intercept_db(svm); + else + vmexit = NESTED_EXIT_DONE; + } /* async page fault always cause vmexit */ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && svm->apf_reason != 0) @@ -4627,10 +4679,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ + if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0) + return; /* STGI will cause a vm exit */ + + if (svm->nested.exit_required) + return; /* we're not going to run the guest yet */ + /* * Something prevents NMI from been injected. Single step over possible * problem (IRET or exception injection or interrupt shadow) */ + svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } @@ -4771,6 +4830,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->nested.exit_required)) return; + /* + * Disable singlestep if we're injecting an interrupt/exception. + * We don't want our modified rflags to be pushed on the stack where + * we might not be able to easily reset them if we disabled NMI + * singlestep later. + */ + if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { + /* + * Event injection happens before external interrupts cause a + * vmexit and interrupts are disabled here, so smp_send_reschedule + * is enough to force an immediate vmexit. + */ + disable_nmi_singlestep(svm); + smp_send_reschedule(vcpu->cpu); + } + pre_svm_run(svm); sync_lapic_to_cr8(vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6dcc4873e435..f76efad248ab 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -913,8 +913,9 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static u64 construct_eptp(unsigned long root_hpa); +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -2772,7 +2773,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (enable_ept_ad_bits) { vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_PML; - vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; } } else vmx->nested.nested_vmx_ept_caps = 0; @@ -3198,7 +3199,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -3280,7 +3282,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) + return 1; + if (is_noncanonical_address(data & PAGE_MASK) || + (data & MSR_IA32_BNDCFGS_RSVD)) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -4013,7 +4019,7 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); } else { vpid_sync_context(vpid); } @@ -4188,14 +4194,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = emulation_required(vcpu); } -static u64 construct_eptp(unsigned long root_hpa) +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) { u64 eptp; /* TODO write the value reading from MSR */ eptp = VMX_EPT_DEFAULT_MT | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; - if (enable_ept_ad_bits) + if (enable_ept_ad_bits && + (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) eptp |= VMX_EPT_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); @@ -4209,7 +4216,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) guest_cr3 = cr3; if (enable_ept) { - eptp = construct_eptp(cr3); + eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); if (is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); @@ -5170,7 +5177,8 @@ static void ept_set_mmio_spte_mask(void) * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, + VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -6220,17 +6228,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - if (is_guest_mode(vcpu) - && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) { - /* - * Fix up exit_qualification according to whether guest - * page table accesses are reads or writes. - */ - u64 eptp = nested_ept_get_cr3(vcpu); - if (!(eptp & VMX_EPT_AD_ENABLE_BIT)) - exit_qualification &= ~EPT_VIOLATION_ACC_WRITE; - } - /* * EPT violation happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. @@ -6453,7 +6450,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); + VMX_EPT_RWX_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); @@ -6557,7 +6554,6 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); @@ -7661,7 +7657,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) unsigned long type, types; gva_t gva; struct x86_exception e; - int vpid; + struct { + u64 vpid; + u64 gla; + } operand; if (!(vmx->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -7691,17 +7690,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), vmx_instruction_info, false, &gva)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, - sizeof(u32), &e)) { + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } + if (operand.vpid >> 16) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + if (is_noncanonical_address(operand.gla)) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + /* fall through */ case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: - if (!vpid) { + if (!operand.vpid) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); return kvm_skip_emulated_instruction(vcpu); @@ -9394,6 +9404,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vmcs12->guest_physical_address = fault->address; } +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) +{ + return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT; +} + /* Callbacks for nested_ept_init_mmu_context: */ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) @@ -9404,18 +9419,18 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - u64 eptp; + bool wants_ad; WARN_ON(mmu_is_nested(vcpu)); - eptp = nested_ept_get_cr3(vcpu); - if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits) + wants_ad = nested_ept_ad_enabled(vcpu); + if (wants_ad && !enable_ept_ad_bits) return 1; kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, - eptp & VMX_EPT_AD_ENABLE_BIT); + wants_ad); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -10728,8 +10743,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } - if (nested_cpu_has_ept(vmcs12)) - vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); @@ -10754,8 +10768,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); - if (nested_cpu_has_xsaves(vmcs12)) - vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); } /* @@ -11152,7 +11164,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) vmx->hv_deadline_tsc = tscl + delta_tsc; vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); - return 0; + + return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e846f0cb83b..6c7266f7766d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2841,10 +2841,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_target_expiration_tsc(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); + + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_restart_hv_timer(vcpu); + /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration @@ -6011,7 +6011,7 @@ static void kvm_set_mmio_spte_mask(void) mask &= ~1ull; #endif - kvm_mmu_set_mmio_spte_mask(mask); + kvm_mmu_set_mmio_spte_mask(mask, mask); } #ifdef CONFIG_X86_64 @@ -6733,7 +6733,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; - if (vcpu->requests) { + if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -6897,7 +6897,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->sync_pir_to_irr(vcpu); } - if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched() || signal_pending(current)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index f2a7faf4706e..b33093f84528 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -19,9 +19,6 @@ */ #define SKBDATA %r10 #define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */ -#define MAX_BPF_STACK (512 /* from filter.h */ + \ - 32 /* space for rbx,r13,r14,r15 */ + \ - 8 /* space for skb_copy_bits */) #define FUNC(name) \ .globl name; \ @@ -66,7 +63,7 @@ FUNC(sk_load_byte_positive_offset) /* rsi contains offset and can be scratched */ #define bpf_slow_path_common(LEN) \ - lea -MAX_BPF_STACK + 32(%rbp), %rdx;\ + lea 32(%rbp), %rdx;\ FRAME_BEGIN; \ mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ @@ -83,14 +80,14 @@ FUNC(sk_load_byte_positive_offset) bpf_slow_path_word: bpf_slow_path_common(4) js bpf_error - mov - MAX_BPF_STACK + 32(%rbp),%eax + mov 32(%rbp),%eax bswap %eax ret bpf_slow_path_half: bpf_slow_path_common(2) js bpf_error - mov - MAX_BPF_STACK + 32(%rbp),%ax + mov 32(%rbp),%ax rol $8,%ax movzwl %ax,%eax ret @@ -98,7 +95,7 @@ bpf_slow_path_half: bpf_slow_path_byte: bpf_slow_path_common(1) js bpf_error - movzbl - MAX_BPF_STACK + 32(%rbp),%eax + movzbl 32(%rbp),%eax ret #define sk_negative_common(SIZE) \ @@ -148,9 +145,10 @@ FUNC(sk_load_byte_negative_offset) bpf_error: # force a return 0 from jit handler xor %eax,%eax - mov - MAX_BPF_STACK(%rbp),%rbx - mov - MAX_BPF_STACK + 8(%rbp),%r13 - mov - MAX_BPF_STACK + 16(%rbp),%r14 - mov - MAX_BPF_STACK + 24(%rbp),%r15 + mov (%rbp),%rbx + mov 8(%rbp),%r13 + mov 16(%rbp),%r14 + mov 24(%rbp),%r15 + add $40, %rbp leaveq ret diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f58939393eef..e1324f280e06 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -197,17 +197,16 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -#define STACKSIZE \ - (MAX_BPF_STACK + \ - 32 /* space for rbx, r13, r14, r15 */ + \ +#define AUX_STACK_SPACE \ + (32 /* space for rbx, r13, r14, r15 */ + \ 8 /* space for skb_copy_bits() buffer */) -#define PROLOGUE_SIZE 48 +#define PROLOGUE_SIZE 37 /* emit x64 prologue code for BPF program and check it's size. * bpf_tail_call helper will skip it while jumping into another program */ -static void emit_prologue(u8 **pprog) +static void emit_prologue(u8 **pprog, u32 stack_depth) { u8 *prog = *pprog; int cnt = 0; @@ -215,13 +214,17 @@ static void emit_prologue(u8 **pprog) EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ - /* sub rsp, STACKSIZE */ - EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE); + /* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */ + EMIT3_off32(0x48, 0x81, 0xEC, + round_up(stack_depth, 8) + AUX_STACK_SPACE); + + /* sub rbp, AUX_STACK_SPACE */ + EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE); /* all classic BPF filters use R6(rbx) save it */ - /* mov qword ptr [rbp-X],rbx */ - EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE); + /* mov qword ptr [rbp+0],rbx */ + EMIT4(0x48, 0x89, 0x5D, 0); /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and @@ -231,12 +234,12 @@ static void emit_prologue(u8 **pprog) * than synthetic ones. Therefore not worth adding complexity. */ - /* mov qword ptr [rbp-X],r13 */ - EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8); - /* mov qword ptr [rbp-X],r14 */ - EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16); - /* mov qword ptr [rbp-X],r15 */ - EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24); + /* mov qword ptr [rbp+8],r13 */ + EMIT4(0x4C, 0x89, 0x6D, 8); + /* mov qword ptr [rbp+16],r14 */ + EMIT4(0x4C, 0x89, 0x75, 16); + /* mov qword ptr [rbp+24],r15 */ + EMIT4(0x4C, 0x89, 0x7D, 24); /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls * we need to reset the counter to 0. It's done in two instructions, @@ -246,8 +249,8 @@ static void emit_prologue(u8 **pprog) /* xor eax, eax */ EMIT2(0x31, 0xc0); - /* mov qword ptr [rbp-X], rax */ - EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32); + /* mov qword ptr [rbp+32], rax */ + EMIT4(0x48, 0x89, 0x45, 32); BUILD_BUG_ON(cnt != PROLOGUE_SIZE); *pprog = prog; @@ -289,13 +292,13 @@ static void emit_bpf_tail_call(u8 **pprog) /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ + EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ #define OFFSET2 36 EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ + EMIT2_off32(0x89, 0x85, 36); /* mov dword ptr [rbp + 36], eax */ /* prog = array->ptrs[index]; */ EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ @@ -361,7 +364,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int proglen = 0; u8 *prog = temp; - emit_prologue(&prog); + emit_prologue(&prog, bpf_prog->aux->stack_depth); if (seen_ld_abs) emit_load_skb_data_hlen(&prog); @@ -877,7 +880,7 @@ xadd: if (is_imm8(insn->off)) } break; - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: emit_bpf_tail_call(&prog); break; @@ -1036,15 +1039,17 @@ common_load: seen_exit = true; /* update cleanup_addr */ ctx->cleanup_addr = proglen; - /* mov rbx, qword ptr [rbp-X] */ - EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE); - /* mov r13, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8); - /* mov r14, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16); - /* mov r15, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24); - + /* mov rbx, qword ptr [rbp+0] */ + EMIT4(0x48, 0x8B, 0x5D, 0); + /* mov r13, qword ptr [rbp+8] */ + EMIT4(0x4C, 0x8B, 0x6D, 8); + /* mov r14, qword ptr [rbp+16] */ + EMIT4(0x4C, 0x8B, 0x75, 16); + /* mov r15, qword ptr [rbp+24] */ + EMIT4(0x4C, 0x8B, 0x7D, 24); + + /* add rbp, AUX_STACK_SPACE */ + EMIT4(0x48, 0x83, 0xC5, AUX_STACK_SPACE); EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ break; @@ -1162,6 +1167,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_jit_binary_lock_ro(header); prog->bpf_func = (void *)image; prog->jited = 1; + prog->jited_len = proglen; } else { prog = orig_prog; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a5ffcbb20cc0..0e7ef69e8531 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -106,15 +106,83 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), return rc >= 0 ? 0 : rc; } -static void clamp_max_cpus(void) +static int xen_vcpu_setup_restore(int cpu) { -#ifdef CONFIG_SMP - if (setup_max_cpus > MAX_VIRT_CPUS) - setup_max_cpus = MAX_VIRT_CPUS; -#endif + int rc = 0; + + /* Any per_cpu(xen_vcpu) is stale, so reset it */ + xen_vcpu_info_reset(cpu); + + /* + * For PVH and PVHVM, setup online VCPUs only. The rest will + * be handled by hotplug. + */ + if (xen_pv_domain() || + (xen_hvm_domain() && cpu_online(cpu))) { + rc = xen_vcpu_setup(cpu); + } + + return rc; +} + +/* + * On restore, set the vcpu placement up again. + * If it fails, then we're in a bad state, since + * we can't back out from using it... + */ +void xen_vcpu_restore(void) +{ + int cpu, rc; + + for_each_possible_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); + bool is_up; + + if (xen_vcpu_nr(cpu) == XEN_VCPU_ID_INVALID) + continue; + + /* Only Xen 4.5 and higher support this. */ + is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, + xen_vcpu_nr(cpu), NULL) > 0; + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) + BUG(); + + if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_setup_runstate_info(cpu); + + rc = xen_vcpu_setup_restore(cpu); + if (rc) + pr_emerg_once("vcpu restore failed for cpu=%d err=%d. " + "System will hang.\n", cpu, rc); + /* + * In case xen_vcpu_setup_restore() fails, do not bring up the + * VCPU. This helps us avoid the resulting OOPS when the VCPU + * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.) + * Note that this does not improve the situation much -- now the + * VM hangs instead of OOPSing -- with the VCPUs that did not + * fail, spinning in stop_machine(), waiting for the failed + * VCPUs to come up. + */ + if (other_cpu && is_up && (rc == 0) && + HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) + BUG(); + } } -void xen_vcpu_setup(int cpu) +void xen_vcpu_info_reset(int cpu) +{ + if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) { + per_cpu(xen_vcpu, cpu) = + &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; + } else { + /* Set to NULL so that if somebody accesses it we get an OOPS */ + per_cpu(xen_vcpu, cpu) = NULL; + } +} + +int xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; int err; @@ -123,11 +191,11 @@ void xen_vcpu_setup(int cpu) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); /* - * This path is called twice on PVHVM - first during bootup via - * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being - * hotplugged: cpu_up -> xen_hvm_cpu_notify. - * As we can only do the VCPUOP_register_vcpu_info once lets - * not over-write its result. + * This path is called on PVHVM at bootup (xen_hvm_smp_prepare_boot_cpu) + * and at restore (xen_vcpu_restore). Also called for hotplugged + * VCPUs (cpu_init -> xen_hvm_cpu_prepare_hvm). + * However, the hypercall can only be done once (see below) so if a VCPU + * is offlined and comes back online then let's not redo the hypercall. * * For PV it is called during restore (xen_vcpu_restore) and bootup * (xen_setup_vcpu_info_placement). The hotplug mechanism does not @@ -135,42 +203,44 @@ void xen_vcpu_setup(int cpu) */ if (xen_hvm_domain()) { if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu)) - return; + return 0; } - if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) - per_cpu(xen_vcpu, cpu) = - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - if (!xen_have_vcpu_info_placement) { - if (cpu >= MAX_VIRT_CPUS) - clamp_max_cpus(); - return; + if (xen_have_vcpu_info_placement) { + vcpup = &per_cpu(xen_vcpu_info, cpu); + info.mfn = arbitrary_virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + + /* + * Check to see if the hypervisor will put the vcpu_info + * structure where we want it, which allows direct access via + * a percpu-variable. + * N.B. This hypercall can _only_ be called once per CPU. + * Subsequent calls will error out with -EINVAL. This is due to + * the fact that hypervisor has no unregister variant and this + * hypercall does not allow to over-write info.mfn and + * info.offset. + */ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, + xen_vcpu_nr(cpu), &info); + + if (err) { + pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n", + cpu, err); + xen_have_vcpu_info_placement = 0; + } else { + /* + * This cpu is using the registered vcpu info, even if + * later ones fail to. + */ + per_cpu(xen_vcpu, cpu) = vcpup; + } } - vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = arbitrary_virt_to_mfn(vcpup); - info.offset = offset_in_page(vcpup); - - /* Check to see if the hypervisor will put the vcpu_info - structure where we want it, which allows direct access via - a percpu-variable. - N.B. This hypercall can _only_ be called once per CPU. Subsequent - calls will error out with -EINVAL. This is due to the fact that - hypervisor has no unregister variant and this hypercall does not - allow to over-write info.mfn and info.offset. - */ - err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu), - &info); + if (!xen_have_vcpu_info_placement) + xen_vcpu_info_reset(cpu); - if (err) { - printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); - xen_have_vcpu_info_placement = 0; - clamp_max_cpus(); - } else { - /* This cpu is using the registered vcpu info, even if - later ones fail to. */ - per_cpu(xen_vcpu, cpu) = vcpup; - } + return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0); } void xen_reboot(int reason) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index a6d014f47e52..87d791356ea9 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -1,5 +1,6 @@ #include <linux/cpu.h> #include <linux/kexec.h> +#include <linux/memblock.h> #include <xen/features.h> #include <xen/events.h> @@ -10,9 +11,11 @@ #include <asm/reboot.h> #include <asm/setup.h> #include <asm/hypervisor.h> +#include <asm/e820/api.h> #include <asm/xen/cpuid.h> #include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> #include "xen-ops.h" #include "mmu.h" @@ -20,37 +23,34 @@ void __ref xen_hvm_init_shared_info(void) { - int cpu; struct xen_add_to_physmap xatp; - static struct shared_info *shared_info_page; + u64 pa; + + if (HYPERVISOR_shared_info == &xen_dummy_shared_info) { + /* + * Search for a free page starting at 4kB physical address. + * Low memory is preferred to avoid an EPT large page split up + * by the mapping. + * Starting below X86_RESERVE_LOW (usually 64kB) is fine as + * the BIOS used for HVM guests is well behaved and won't + * clobber memory other than the first 4kB. + */ + for (pa = PAGE_SIZE; + !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) || + memblock_is_reserved(pa); + pa += PAGE_SIZE) + ; + + memblock_reserve(pa, PAGE_SIZE); + HYPERVISOR_shared_info = __va(pa); + } - if (!shared_info_page) - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + xatp.gpfn = virt_to_pfn(HYPERVISOR_shared_info); if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); - - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; - - /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info - * page, we use it in the event channel upcall and in some pvclock - * related functions. We don't need the vcpu_info placement - * optimizations because we don't use any pv_mmu or pv_irq op on - * HVM. - * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_init_shared_info is run at resume time too and - * in that case multiple vcpus might be online. */ - for_each_online_cpu(cpu) { - /* Leave it to be NULL. */ - if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS) - continue; - per_cpu(xen_vcpu, cpu) = - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - } } static void __init init_hvm_pv_info(void) @@ -106,7 +106,7 @@ static void xen_hvm_crash_shutdown(struct pt_regs *regs) static int xen_cpu_up_prepare_hvm(unsigned int cpu) { - int rc; + int rc = 0; /* * This can happen if CPU was offlined earlier and @@ -121,7 +121,9 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); else per_cpu(xen_vcpu_id, cpu) = cpu; - xen_vcpu_setup(cpu); + rc = xen_vcpu_setup(cpu); + if (rc) + return rc; if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); @@ -130,9 +132,8 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) if (rc) { WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", cpu, rc); - return rc; } - return 0; + return rc; } static int xen_cpu_dead_hvm(unsigned int cpu) @@ -154,6 +155,13 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_shared_info(); + /* + * xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. + */ + xen_vcpu_info_reset(0); + xen_panic_handler_init(); if (xen_feature(XENFEAT_hvm_callback_vector)) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f33eef4ebd12..811e4ddb3f37 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -89,8 +89,6 @@ void *xen_initial_gdt; -RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); - static int xen_cpu_up_prepare_pv(unsigned int cpu); static int xen_cpu_dead_pv(unsigned int cpu); @@ -107,35 +105,6 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); -/* - * On restore, set the vcpu placement up again. - * If it fails, then we're in a bad state, since - * we can't back out from using it... - */ -void xen_vcpu_restore(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); - bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), - NULL); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) - BUG(); - - xen_setup_runstate_info(cpu); - - if (xen_have_vcpu_info_placement) - xen_vcpu_setup(cpu); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) - BUG(); - } -} - static void __init xen_banner(void) { unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); @@ -960,30 +929,43 @@ void xen_setup_shared_info(void) HYPERVISOR_shared_info = (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); -#ifndef CONFIG_SMP - /* In UP this is as good a place as any to set up shared info */ - xen_setup_vcpu_info_placement(); -#endif - xen_setup_mfn_list_list(); - /* - * Now that shared info is set up we can start using routines that - * point to pvclock area. - */ - if (system_state == SYSTEM_BOOTING) + if (system_state == SYSTEM_BOOTING) { +#ifndef CONFIG_SMP + /* + * In UP this is as good a place as any to set up shared info. + * Limit this to boot only, at restore vcpu setup is done via + * xen_vcpu_restore(). + */ + xen_setup_vcpu_info_placement(); +#endif + /* + * Now that shared info is set up we can start using routines + * that point to pvclock area. + */ xen_init_time_ops(); + } } /* This is called once we have the cpu_possible_mask */ -void xen_setup_vcpu_info_placement(void) +void __ref xen_setup_vcpu_info_placement(void) { int cpu; for_each_possible_cpu(cpu) { /* Set up direct vCPU id mapping for PV guests. */ per_cpu(xen_vcpu_id, cpu) = cpu; - xen_vcpu_setup(cpu); + + /* + * xen_vcpu_setup(cpu) can fail -- in which case it + * falls back to the shared_info version for cpus + * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS. + * + * xen_cpu_up_prepare_pv() handles the rest by failing + * them in hotplug. + */ + (void) xen_vcpu_setup(cpu); } /* @@ -1332,9 +1314,17 @@ asmlinkage __visible void __init xen_start_kernel(void) */ acpi_numa = -1; #endif - /* Don't do the full vcpu_info placement stuff until we have a - possible map and a non-dummy shared_info. */ - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + /* Let's presume PV guests always boot on vCPU with id 0. */ + per_cpu(xen_vcpu_id, 0) = 0; + + /* + * Setup xen_vcpu early because start_kernel needs it for + * local_irq_disable(), irqs_disabled(). + * + * Don't do the full vcpu_info placement stuff until we have + * the cpu_possible_mask and a non-dummy shared_info. + */ + xen_vcpu_info_reset(0); WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); @@ -1431,9 +1421,7 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif xen_raw_console_write("about to get started...\n"); - /* Let's presume PV guests always boot on vCPU with id 0. */ - per_cpu(xen_vcpu_id, 0) = 0; - + /* We need this for printk timestamps */ xen_setup_runstate_info(0); xen_efi_init(); @@ -1451,6 +1439,9 @@ static int xen_cpu_up_prepare_pv(unsigned int cpu) { int rc; + if (per_cpu(xen_vcpu, cpu) == NULL) + return -ENODEV; + xen_setup_timer(cpu); rc = xen_smp_intr_init(cpu); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a5bf7c451435..c81046323ebc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -499,7 +499,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, void __init xen_remap_memory(void) { unsigned long buf = (unsigned long)&xen_remap_buf; - unsigned long mfn_save, mfn, pfn; + unsigned long mfn_save, pfn; unsigned long remapped = 0; unsigned int i; unsigned long pfn_s = ~0UL; @@ -515,8 +515,7 @@ void __init xen_remap_memory(void) pfn = xen_remap_buf.target_pfn; for (i = 0; i < xen_remap_buf.size; i++) { - mfn = xen_remap_buf.mfns[i]; - xen_update_mem_tables(pfn, mfn); + xen_update_mem_tables(pfn, xen_remap_buf.mfns[i]); remapped++; pfn++; } @@ -530,8 +529,6 @@ void __init xen_remap_memory(void) pfn_s = xen_remap_buf.target_pfn; len = xen_remap_buf.size; } - - mfn = xen_remap_mfn; xen_remap_mfn = xen_remap_buf.next_area_mfn; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 82ac611f2fc1..e7f02eb73727 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -1,4 +1,5 @@ #include <linux/smp.h> +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/cpumask.h> #include <linux/percpu.h> @@ -114,6 +115,36 @@ int xen_smp_intr_init(unsigned int cpu) return rc; } +void __init xen_smp_cpus_done(unsigned int max_cpus) +{ + int cpu, rc, count = 0; + + if (xen_hvm_domain()) + native_smp_cpus_done(max_cpus); + + if (xen_have_vcpu_info_placement) + return; + + for_each_online_cpu(cpu) { + if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) + continue; + + rc = cpu_down(cpu); + + if (rc == 0) { + /* + * Reset vcpu_info so this cpu cannot be onlined again. + */ + xen_vcpu_info_reset(cpu); + count++; + } else { + pr_warn("%s: failed to bring CPU %d down, error %d\n", + __func__, cpu, rc); + } + } + WARN(count, "%s: brought %d CPUs offline\n", __func__, count); +} + void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index 8ebb6acca64a..87d3c76cba37 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -14,6 +14,8 @@ extern void xen_smp_intr_free(unsigned int cpu); int xen_smp_intr_init_pv(unsigned int cpu); void xen_smp_intr_free_pv(unsigned int cpu); +void xen_smp_cpus_done(unsigned int max_cpus); + void xen_smp_send_reschedule(int cpu); void xen_smp_send_call_function_ipi(const struct cpumask *mask); void xen_smp_send_call_function_single_ipi(int cpu); diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index f18561bbf5c9..fd60abedf658 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -12,7 +12,8 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void) native_smp_prepare_boot_cpu(); /* - * Setup vcpu_info for boot CPU. + * Setup vcpu_info for boot CPU. Secondary CPUs get their vcpu_info + * in xen_cpu_up_prepare_hvm(). */ xen_vcpu_setup(0); @@ -27,10 +28,20 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void) static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) { + int cpu; + native_smp_prepare_cpus(max_cpus); WARN_ON(xen_smp_intr_init(0)); xen_init_lock_cpu(0); + + for_each_possible_cpu(cpu) { + if (cpu == 0) + continue; + + /* Set default vcpu_id to make sure that we don't use cpu-0's */ + per_cpu(xen_vcpu_id, cpu) = XEN_VCPU_ID_INVALID; + } } #ifdef CONFIG_HOTPLUG_CPU @@ -60,4 +71,5 @@ void __init xen_hvm_smp_init(void) smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; + smp_ops.smp_cpus_done = xen_smp_cpus_done; } diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index aae32535f4ec..1ea598e5f030 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -371,10 +371,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) return 0; } -static void xen_pv_smp_cpus_done(unsigned int max_cpus) -{ -} - #ifdef CONFIG_HOTPLUG_CPU static int xen_pv_cpu_disable(void) { @@ -469,7 +465,7 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_pv_smp_prepare_cpus, - .smp_cpus_done = xen_pv_smp_cpus_done, + .smp_cpus_done = xen_smp_cpus_done, .cpu_up = xen_pv_cpu_up, .cpu_die = xen_pv_cpu_die, diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c index 01afcadde50a..484999416d8b 100644 --- a/arch/x86/xen/suspend_hvm.c +++ b/arch/x86/xen/suspend_hvm.c @@ -8,15 +8,10 @@ void xen_hvm_post_suspend(int suspend_cancelled) { - int cpu; - - if (!suspend_cancelled) + if (!suspend_cancelled) { xen_hvm_init_shared_info(); + xen_vcpu_restore(); + } xen_callback_vector(); xen_unplug_emulated_devices(); - if (xen_feature(XENFEAT_hvm_safe_pvclock)) { - for_each_online_cpu(cpu) { - xen_setup_runstate_info(cpu); - } - } } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a440a42c618..0d5004477db6 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -78,7 +78,8 @@ bool xen_vcpu_stolen(int vcpu); extern int xen_have_vcpu_info_placement; -void xen_vcpu_setup(int cpu); +int xen_vcpu_setup(int cpu); +void xen_vcpu_info_reset(int cpu); void xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 1eb6d2fe70d3..3eed2761c149 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -109,4 +109,8 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + +#define SO_PEERGROUPS 59 + #endif /* _XTENSA_SOCKET_H */ |