diff options
Diffstat (limited to 'arch')
82 files changed, 2733 insertions, 1520 deletions
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3dd691c85ca0..7e7e19ef6993 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -16,6 +16,7 @@ #include <linux/types.h> #include <linux/jump_label.h> #include <linux/kvm_types.h> +#include <linux/maple_tree.h> #include <linux/percpu.h> #include <linux/psci.h> #include <asm/arch_gicv3.h> @@ -199,6 +200,9 @@ struct kvm_arch { /* Mandated version of PSCI */ u32 psci_version; + /* Protects VM-scoped configuration data */ + struct mutex config_lock; + /* * If we encounter a data abort without valid instruction syndrome * information, report this to user space. User space can (and @@ -221,7 +225,12 @@ struct kvm_arch { #define KVM_ARCH_FLAG_EL1_32BIT 4 /* PSCI SYSTEM_SUSPEND enabled for the guest */ #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5 - + /* VM counter offset */ +#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 6 + /* Timer PPIs made immutable */ +#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 7 + /* SMCCC filter initialized for the VM */ +#define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 8 unsigned long flags; /* @@ -242,6 +251,7 @@ struct kvm_arch { /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; + struct maple_tree smccc_filter; /* * For an untrusted host VM, 'pkvm.handle' is used to lookup @@ -365,6 +375,10 @@ enum vcpu_sysreg { TPIDR_EL2, /* EL2 Software Thread ID Register */ CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ SP_EL2, /* EL2 Stack Pointer */ + CNTHP_CTL_EL2, + CNTHP_CVAL_EL2, + CNTHV_CTL_EL2, + CNTHV_CVAL_EL2, NR_SYS_REGS /* Nothing after this line! */ }; @@ -522,6 +536,7 @@ struct kvm_vcpu_arch { /* vcpu power state */ struct kvm_mp_state mp_state; + spinlock_t mp_state_lock; /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; @@ -939,6 +954,9 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); int __init kvm_sys_reg_table_init(void); +bool lock_all_vcpus(struct kvm *kvm); +void unlock_all_vcpus(struct kvm *kvm); + /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); @@ -1022,8 +1040,10 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, - struct kvm_arm_copy_mte_tags *copy_tags); +int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, + struct kvm_arm_copy_mte_tags *copy_tags); +int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, + struct kvm_arm_counter_offset *offset); /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); @@ -1078,6 +1098,9 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); (system_supports_32bit_el0() && \ !static_branch_unlikely(&arm64_mismatched_32bit_el0)) +#define kvm_vm_has_ran_once(kvm) \ + (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &(kvm)->arch.flags)) + int kvm_trng_call(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM extern phys_addr_t hyp_mem_base; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 083cc47dca08..27e63c111f78 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -63,6 +63,7 @@ * specific registers encoded in the instructions). */ .macro kern_hyp_va reg +#ifndef __KVM_VHE_HYPERVISOR__ alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask and \reg, \reg, #1 /* mask with va_mask */ ror \reg, \reg, #1 /* rotate to the first tag bit */ @@ -70,6 +71,7 @@ alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask add \reg, \reg, #0, lsl 12 /* insert the top 12 bits of the tag */ ror \reg, \reg, #63 /* rotate back */ alternative_cb_end +#endif .endm /* @@ -127,6 +129,7 @@ void kvm_apply_hyp_relocations(void); static __always_inline unsigned long __kern_hyp_va(unsigned long v) { +#ifndef __KVM_VHE_HYPERVISOR__ asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n" "ror %0, %0, #1\n" "add %0, %0, #0\n" @@ -135,6 +138,7 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v) ARM64_ALWAYS_SYSTEM, kvm_update_va_mask) : "+r" (v)); +#endif return v; } diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index c48b41c9b0cc..e72d9aaab6b1 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -388,6 +388,7 @@ #define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0) +#define SYS_CNTPCT_EL0 sys_reg(3, 3, 14, 0, 1) #define SYS_CNTPCTSS_EL0 sys_reg(3, 3, 14, 0, 5) #define SYS_CNTVCTSS_EL0 sys_reg(3, 3, 14, 0, 6) @@ -400,7 +401,9 @@ #define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0) #define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1) +#define SYS_AARCH32_CNTPCT sys_reg(0, 0, 0, 14, 0) #define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0) +#define SYS_AARCH32_CNTPCTSS sys_reg(0, 8, 0, 14, 0) #define __PMEV_op2(n) ((n) & 0x7) #define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3)) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index f8129c624b07..f7ddd73a8c0f 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -198,6 +198,15 @@ struct kvm_arm_copy_mte_tags { __u64 reserved[2]; }; +/* + * Counter/Timer offset structure. Describe the virtual/physical offset. + * To be used with KVM_ARM_SET_COUNTER_OFFSET. + */ +struct kvm_arm_counter_offset { + __u64 counter_offset; + __u64 reserved; +}; + #define KVM_ARM_TAGS_TO_GUEST 0 #define KVM_ARM_TAGS_FROM_GUEST 1 @@ -372,6 +381,10 @@ enum { #endif }; +/* Device Control API on vm fd */ +#define KVM_ARM_VM_SMCCC_CTRL 0 +#define KVM_ARM_VM_SMCCC_FILTER 0 + /* Device Control API: ARM VGIC */ #define KVM_DEV_ARM_VGIC_GRP_ADDR 0 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 @@ -411,6 +424,8 @@ enum { #define KVM_ARM_VCPU_TIMER_CTRL 1 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 +#define KVM_ARM_VCPU_TIMER_IRQ_HVTIMER 2 +#define KVM_ARM_VCPU_TIMER_IRQ_HPTIMER 3 #define KVM_ARM_VCPU_PVTIME_CTRL 2 #define KVM_ARM_VCPU_PVTIME_IPA 0 @@ -469,6 +484,27 @@ enum { /* run->fail_entry.hardware_entry_failure_reason codes. */ #define KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED (1ULL << 0) +enum kvm_smccc_filter_action { + KVM_SMCCC_FILTER_HANDLE = 0, + KVM_SMCCC_FILTER_DENY, + KVM_SMCCC_FILTER_FWD_TO_USER, + +#ifdef __KERNEL__ + NR_SMCCC_FILTER_ACTIONS +#endif +}; + +struct kvm_smccc_filter { + __u32 base; + __u32 nr_functions; + __u8 action; + __u8 pad[15]; +}; + +/* arm64-specific KVM_EXIT_HYPERCALL flags */ +#define KVM_HYPERCALL_EXIT_SMC (1U << 0) +#define KVM_HYPERCALL_EXIT_16BIT (1U << 1) + #endif #endif /* __ARM_KVM_H__ */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 2f9911256503..bf38d065396b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2230,6 +2230,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, ECV, IMP) }, + { + .desc = "Enhanced Counter Virtualization (CNTPOFF)", + .capability = ARM64_HAS_ECV_CNTPOFF, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64MMFR0_EL1, + .field_pos = ID_AA64MMFR0_EL1_ECV_SHIFT, + .field_width = 4, + .sign = FTR_UNSIGNED, + .min_field_value = ID_AA64MMFR0_EL1_ECV_CNTPOFF, + }, #ifdef CONFIG_ARM64_PAN { .desc = "Privileged Access Never", diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index e1af4301b913..05b022be885b 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -16,6 +16,7 @@ #include <asm/arch_timer.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_nested.h> #include <kvm/arm_vgic.h> #include <kvm/arm_arch_timer.h> @@ -30,14 +31,11 @@ static u32 host_ptimer_irq_flags; static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); -static const struct kvm_irq_level default_ptimer_irq = { - .irq = 30, - .level = 1, -}; - -static const struct kvm_irq_level default_vtimer_irq = { - .irq = 27, - .level = 1, +static const u8 default_ppi[] = { + [TIMER_PTIMER] = 30, + [TIMER_VTIMER] = 27, + [TIMER_HPTIMER] = 26, + [TIMER_HVTIMER] = 28, }; static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); @@ -51,6 +49,24 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, struct arch_timer_context *timer, enum kvm_arch_timer_regs treg); +static bool kvm_arch_timer_get_input_level(int vintid); + +static struct irq_ops arch_timer_irq_ops = { + .get_input_level = kvm_arch_timer_get_input_level, +}; + +static bool has_cntpoff(void) +{ + return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); +} + +static int nr_timers(struct kvm_vcpu *vcpu) +{ + if (!vcpu_has_nv(vcpu)) + return NR_KVM_EL0_TIMERS; + + return NR_KVM_TIMERS; +} u32 timer_get_ctl(struct arch_timer_context *ctxt) { @@ -61,6 +77,10 @@ u32 timer_get_ctl(struct arch_timer_context *ctxt) return __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); case TIMER_PTIMER: return __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); + case TIMER_HVTIMER: + return __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2); + case TIMER_HPTIMER: + return __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2); default: WARN_ON(1); return 0; @@ -76,6 +96,10 @@ u64 timer_get_cval(struct arch_timer_context *ctxt) return __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); case TIMER_PTIMER: return __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + case TIMER_HVTIMER: + return __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2); + case TIMER_HPTIMER: + return __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2); default: WARN_ON(1); return 0; @@ -84,10 +108,17 @@ u64 timer_get_cval(struct arch_timer_context *ctxt) static u64 timer_get_offset(struct arch_timer_context *ctxt) { + u64 offset = 0; + + if (!ctxt) + return 0; + if (ctxt->offset.vm_offset) - return *ctxt->offset.vm_offset; + offset += *ctxt->offset.vm_offset; + if (ctxt->offset.vcpu_offset) + offset += *ctxt->offset.vcpu_offset; - return 0; + return offset; } static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) @@ -101,6 +132,12 @@ static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) case TIMER_PTIMER: __vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = ctl; break; + case TIMER_HVTIMER: + __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2) = ctl; + break; + case TIMER_HPTIMER: + __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2) = ctl; + break; default: WARN_ON(1); } @@ -117,6 +154,12 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) case TIMER_PTIMER: __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = cval; break; + case TIMER_HVTIMER: + __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2) = cval; + break; + case TIMER_HPTIMER: + __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = cval; + break; default: WARN_ON(1); } @@ -139,13 +182,27 @@ u64 kvm_phys_timer_read(void) static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) { - if (has_vhe()) { + if (vcpu_has_nv(vcpu)) { + if (is_hyp_ctxt(vcpu)) { + map->direct_vtimer = vcpu_hvtimer(vcpu); + map->direct_ptimer = vcpu_hptimer(vcpu); + map->emul_vtimer = vcpu_vtimer(vcpu); + map->emul_ptimer = vcpu_ptimer(vcpu); + } else { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_vtimer = vcpu_hvtimer(vcpu); + map->emul_ptimer = vcpu_hptimer(vcpu); + } + } else if (has_vhe()) { map->direct_vtimer = vcpu_vtimer(vcpu); map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_vtimer = NULL; map->emul_ptimer = NULL; } else { map->direct_vtimer = vcpu_vtimer(vcpu); map->direct_ptimer = NULL; + map->emul_vtimer = NULL; map->emul_ptimer = vcpu_ptimer(vcpu); } @@ -212,7 +269,7 @@ static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx, ns = cyclecounter_cyc2ns(timecounter->cc, val - now, timecounter->mask, - &timecounter->frac); + &timer_ctx->ns_frac); return ns; } @@ -240,8 +297,11 @@ static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu) static u64 wfit_delay_ns(struct kvm_vcpu *vcpu) { - struct arch_timer_context *ctx = vcpu_vtimer(vcpu); u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); + struct arch_timer_context *ctx; + + ctx = (vcpu_has_nv(vcpu) && is_hyp_ctxt(vcpu)) ? vcpu_hvtimer(vcpu) + : vcpu_vtimer(vcpu); return kvm_counter_compute_delta(ctx, val); } @@ -255,7 +315,7 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) u64 min_delta = ULLONG_MAX; int i; - for (i = 0; i < NR_KVM_TIMERS; i++) { + for (i = 0; i < nr_timers(vcpu); i++) { struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; WARN(ctx->loaded, "timer %d loaded\n", i); @@ -338,9 +398,11 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) switch (index) { case TIMER_VTIMER: + case TIMER_HVTIMER: cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL); break; case TIMER_PTIMER: + case TIMER_HPTIMER: cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL); break; case NR_KVM_TIMERS: @@ -392,12 +454,12 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, int ret; timer_ctx->irq.level = new_level; - trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, + trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), timer_ctx->irq.level); if (!userspace_irqchip(vcpu->kvm)) { ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - timer_ctx->irq.irq, + timer_irq(timer_ctx), timer_ctx->irq.level, timer_ctx); WARN_ON(ret); @@ -432,6 +494,12 @@ static void set_cntvoff(u64 cntvoff) kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff); } +static void set_cntpoff(u64 cntpoff) +{ + if (has_cntpoff()) + write_sysreg_s(cntpoff, SYS_CNTPOFF_EL2); +} + static void timer_save_state(struct arch_timer_context *ctx) { struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); @@ -447,7 +515,10 @@ static void timer_save_state(struct arch_timer_context *ctx) goto out; switch (index) { + u64 cval; + case TIMER_VTIMER: + case TIMER_HVTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL)); timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL)); @@ -473,13 +544,20 @@ static void timer_save_state(struct arch_timer_context *ctx) set_cntvoff(0); break; case TIMER_PTIMER: + case TIMER_HPTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL)); - timer_set_cval(ctx, read_sysreg_el0(SYS_CNTP_CVAL)); + cval = read_sysreg_el0(SYS_CNTP_CVAL); + + if (!has_cntpoff()) + cval -= timer_get_offset(ctx); + + timer_set_cval(ctx, cval); /* Disable the timer */ write_sysreg_el0(0, SYS_CNTP_CTL); isb(); + set_cntpoff(0); break; case NR_KVM_TIMERS: BUG(); @@ -510,6 +588,7 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) */ if (!kvm_timer_irq_can_fire(map.direct_vtimer) && !kvm_timer_irq_can_fire(map.direct_ptimer) && + !kvm_timer_irq_can_fire(map.emul_vtimer) && !kvm_timer_irq_can_fire(map.emul_ptimer) && !vcpu_has_wfit_active(vcpu)) return; @@ -543,14 +622,23 @@ static void timer_restore_state(struct arch_timer_context *ctx) goto out; switch (index) { + u64 cval, offset; + case TIMER_VTIMER: + case TIMER_HVTIMER: set_cntvoff(timer_get_offset(ctx)); write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL); break; case TIMER_PTIMER: - write_sysreg_el0(timer_get_cval(ctx), SYS_CNTP_CVAL); + case TIMER_HPTIMER: + cval = timer_get_cval(ctx); + offset = timer_get_offset(ctx); + set_cntpoff(offset); + if (!has_cntpoff()) + cval += offset; + write_sysreg_el0(cval, SYS_CNTP_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTP_CTL); break; @@ -586,7 +674,7 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx); if (irqchip_in_kernel(vcpu->kvm)) - phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq); + phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx)); phys_active |= ctx->irq.level; @@ -621,6 +709,128 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); } +/* If _pred is true, set bit in _set, otherwise set it in _clr */ +#define assign_clear_set_bit(_pred, _bit, _clr, _set) \ + do { \ + if (_pred) \ + (_set) |= (_bit); \ + else \ + (_clr) |= (_bit); \ + } while (0) + +static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, + struct timer_map *map) +{ + int hw, ret; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + /* + * We only ever unmap the vtimer irq on a VHE system that runs nested + * virtualization, in which case we have both a valid emul_vtimer, + * emul_ptimer, direct_vtimer, and direct_ptimer. + * + * Since this is called from kvm_timer_vcpu_load(), a change between + * vEL2 and vEL1/0 will have just happened, and the timer_map will + * represent this, and therefore we switch the emul/direct mappings + * below. + */ + hw = kvm_vgic_get_map(vcpu, timer_irq(map->direct_vtimer)); + if (hw < 0) { + kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_vtimer)); + kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_ptimer)); + + ret = kvm_vgic_map_phys_irq(vcpu, + map->direct_vtimer->host_timer_irq, + timer_irq(map->direct_vtimer), + &arch_timer_irq_ops); + WARN_ON_ONCE(ret); + ret = kvm_vgic_map_phys_irq(vcpu, + map->direct_ptimer->host_timer_irq, + timer_irq(map->direct_ptimer), + &arch_timer_irq_ops); + WARN_ON_ONCE(ret); + + /* + * The virtual offset behaviour is "interresting", as it + * always applies when HCR_EL2.E2H==0, but only when + * accessed from EL1 when HCR_EL2.E2H==1. So make sure we + * track E2H when putting the HV timer in "direct" mode. + */ + if (map->direct_vtimer == vcpu_hvtimer(vcpu)) { + struct arch_timer_offset *offs = &map->direct_vtimer->offset; + + if (vcpu_el2_e2h_is_set(vcpu)) + offs->vcpu_offset = NULL; + else + offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); + } + } +} + +static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) +{ + bool tpt, tpc; + u64 clr, set; + + /* + * No trapping gets configured here with nVHE. See + * __timer_enable_traps(), which is where the stuff happens. + */ + if (!has_vhe()) + return; + + /* + * Our default policy is not to trap anything. As we progress + * within this function, reality kicks in and we start adding + * traps based on emulation requirements. + */ + tpt = tpc = false; + + /* + * We have two possibility to deal with a physical offset: + * + * - Either we have CNTPOFF (yay!) or the offset is 0: + * we let the guest freely access the HW + * + * - or neither of these condition apply: + * we trap accesses to the HW, but still use it + * after correcting the physical offset + */ + if (!has_cntpoff() && timer_get_offset(map->direct_ptimer)) + tpt = tpc = true; + + /* + * Apply the enable bits that the guest hypervisor has requested for + * its own guest. We can only add traps that wouldn't have been set + * above. + */ + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { + u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + /* Use the VHE format for mental sanity */ + if (!vcpu_el2_e2h_is_set(vcpu)) + val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10; + + tpt |= !(val & (CNTHCTL_EL1PCEN << 10)); + tpc |= !(val & (CNTHCTL_EL1PCTEN << 10)); + } + + /* + * Now that we have collected our requirements, compute the + * trap and enable bits. + */ + set = 0; + clr = 0; + + assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr); + assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr); + + /* This only happens on VHE, so use the CNTKCTL_EL1 accessor */ + sysreg_clear_set(cntkctl_el1, clr, set); +} + void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); @@ -632,6 +842,9 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) get_timer_map(vcpu, &map); if (static_branch_likely(&has_gic_active_state)) { + if (vcpu_has_nv(vcpu)) + kvm_timer_vcpu_load_nested_switch(vcpu, &map); + kvm_timer_vcpu_load_gic(map.direct_vtimer); if (map.direct_ptimer) kvm_timer_vcpu_load_gic(map.direct_ptimer); @@ -644,9 +857,12 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) timer_restore_state(map.direct_vtimer); if (map.direct_ptimer) timer_restore_state(map.direct_ptimer); - + if (map.emul_vtimer) + timer_emulate(map.emul_vtimer); if (map.emul_ptimer) timer_emulate(map.emul_ptimer); + + timer_set_traps(vcpu, &map); } bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) @@ -689,6 +905,8 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). */ + if (map.emul_vtimer) + soft_timer_cancel(&map.emul_vtimer->hrtimer); if (map.emul_ptimer) soft_timer_cancel(&map.emul_ptimer->hrtimer); @@ -738,56 +956,89 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) * resets the timer to be disabled and unmasked and is compliant with * the ARMv7 architecture. */ - timer_set_ctl(vcpu_vtimer(vcpu), 0); - timer_set_ctl(vcpu_ptimer(vcpu), 0); + for (int i = 0; i < nr_timers(vcpu); i++) + timer_set_ctl(vcpu_get_timer(vcpu, i), 0); + + /* + * A vcpu running at EL2 is in charge of the offset applied to + * the virtual timer, so use the physical VM offset, and point + * the vcpu offset to CNTVOFF_EL2. + */ + if (vcpu_has_nv(vcpu)) { + struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset; + + offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); + offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset; + } if (timer->enabled) { - kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu)); - kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu)); + for (int i = 0; i < nr_timers(vcpu); i++) + kvm_timer_update_irq(vcpu, false, + vcpu_get_timer(vcpu, i)); if (irqchip_in_kernel(vcpu->kvm)) { - kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq); + kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_vtimer)); if (map.direct_ptimer) - kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq); + kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_ptimer)); } } + if (map.emul_vtimer) + soft_timer_cancel(&map.emul_vtimer->hrtimer); if (map.emul_ptimer) soft_timer_cancel(&map.emul_ptimer->hrtimer); return 0; } +static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) +{ + struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid); + struct kvm *kvm = vcpu->kvm; + + ctxt->vcpu = vcpu; + + if (timerid == TIMER_VTIMER) + ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; + else + ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; + + hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + ctxt->hrtimer.function = kvm_hrtimer_expire; + + switch (timerid) { + case TIMER_PTIMER: + case TIMER_HPTIMER: + ctxt->host_timer_irq = host_ptimer_irq; + break; + case TIMER_VTIMER: + case TIMER_HVTIMER: + ctxt->host_timer_irq = host_vtimer_irq; + break; + } +} + void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - vtimer->vcpu = vcpu; - vtimer->offset.vm_offset = &vcpu->kvm->arch.timer_data.voffset; - ptimer->vcpu = vcpu; + for (int i = 0; i < NR_KVM_TIMERS; i++) + timer_context_init(vcpu, i); - /* Synchronize cntvoff across all vtimers of a VM. */ - timer_set_offset(vtimer, kvm_phys_timer_read()); - timer_set_offset(ptimer, 0); + /* Synchronize offsets across timers of a VM if not already provided */ + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { + timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read()); + timer_set_offset(vcpu_ptimer(vcpu), 0); + } hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); timer->bg_timer.function = kvm_bg_timer_expire; +} - hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - vtimer->hrtimer.function = kvm_hrtimer_expire; - ptimer->hrtimer.function = kvm_hrtimer_expire; - - vtimer->irq.irq = default_vtimer_irq.irq; - ptimer->irq.irq = default_ptimer_irq.irq; - - vtimer->host_timer_irq = host_vtimer_irq; - ptimer->host_timer_irq = host_ptimer_irq; - - vtimer->host_timer_irq_flags = host_vtimer_irq_flags; - ptimer->host_timer_irq_flags = host_ptimer_irq_flags; +void kvm_timer_init_vm(struct kvm *kvm) +{ + for (int i = 0; i < NR_KVM_TIMERS; i++) + kvm->arch.timer_data.ppi[i] = default_ppi[i]; } void kvm_timer_cpu_up(void) @@ -814,8 +1065,11 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_TIMER_CNT: - timer = vcpu_vtimer(vcpu); - timer_set_offset(timer, kvm_phys_timer_read() - value); + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, + &vcpu->kvm->arch.flags)) { + timer = vcpu_vtimer(vcpu); + timer_set_offset(timer, kvm_phys_timer_read() - value); + } break; case KVM_REG_ARM_TIMER_CVAL: timer = vcpu_vtimer(vcpu); @@ -825,6 +1079,13 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) timer = vcpu_ptimer(vcpu); kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; + case KVM_REG_ARM_PTIMER_CNT: + if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, + &vcpu->kvm->arch.flags)) { + timer = vcpu_ptimer(vcpu); + timer_set_offset(timer, kvm_phys_timer_read() - value); + } + break; case KVM_REG_ARM_PTIMER_CVAL: timer = vcpu_ptimer(vcpu); kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); @@ -902,6 +1163,10 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, val = kvm_phys_timer_read() - timer_get_offset(timer); break; + case TIMER_REG_VOFF: + val = *timer->offset.vcpu_offset; + break; + default: BUG(); } @@ -920,7 +1185,7 @@ u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, get_timer_map(vcpu, &map); timer = vcpu_get_timer(vcpu, tmr); - if (timer == map.emul_ptimer) + if (timer == map.emul_vtimer || timer == map.emul_ptimer) return kvm_arm_timer_read(vcpu, timer, treg); preempt_disable(); @@ -952,6 +1217,10 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, timer_set_cval(timer, val); break; + case TIMER_REG_VOFF: + *timer->offset.vcpu_offset = val; + break; + default: BUG(); } @@ -967,7 +1236,7 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, get_timer_map(vcpu, &map); timer = vcpu_get_timer(vcpu, tmr); - if (timer == map.emul_ptimer) { + if (timer == map.emul_vtimer || timer == map.emul_ptimer) { soft_timer_cancel(&timer->hrtimer); kvm_arm_timer_write(vcpu, timer, treg, val); timer_emulate(timer); @@ -1047,10 +1316,6 @@ static const struct irq_domain_ops timer_domain_ops = { .free = timer_irq_domain_free, }; -static struct irq_ops arch_timer_irq_ops = { - .get_input_level = kvm_arch_timer_get_input_level, -}; - static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags) { *flags = irq_get_trigger_type(virq); @@ -1192,44 +1457,56 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) { - int vtimer_irq, ptimer_irq, ret; - unsigned long i; + u32 ppis = 0; + bool valid; - vtimer_irq = vcpu_vtimer(vcpu)->irq.irq; - ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu)); - if (ret) - return false; + mutex_lock(&vcpu->kvm->arch.config_lock); - ptimer_irq = vcpu_ptimer(vcpu)->irq.irq; - ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu)); - if (ret) - return false; + for (int i = 0; i < nr_timers(vcpu); i++) { + struct arch_timer_context *ctx; + int irq; + + ctx = vcpu_get_timer(vcpu, i); + irq = timer_irq(ctx); + if (kvm_vgic_set_owner(vcpu, irq, ctx)) + break; - kvm_for_each_vcpu(i, vcpu, vcpu->kvm) { - if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq || - vcpu_ptimer(vcpu)->irq.irq != ptimer_irq) - return false; + /* + * We know by construction that we only have PPIs, so + * all values are less than 32. + */ + ppis |= BIT(irq); } - return true; + valid = hweight32(ppis) == nr_timers(vcpu); + + if (valid) + set_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, &vcpu->kvm->arch.flags); + + mutex_unlock(&vcpu->kvm->arch.config_lock); + + return valid; } -bool kvm_arch_timer_get_input_level(int vintid) +static bool kvm_arch_timer_get_input_level(int vintid) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - struct arch_timer_context *timer; if (WARN(!vcpu, "No vcpu context!\n")) return false; - if (vintid == vcpu_vtimer(vcpu)->irq.irq) - timer = vcpu_vtimer(vcpu); - else if (vintid == vcpu_ptimer(vcpu)->irq.irq) - timer = vcpu_ptimer(vcpu); - else - BUG(); + for (int i = 0; i < nr_timers(vcpu); i++) { + struct arch_timer_context *ctx; + + ctx = vcpu_get_timer(vcpu, i); + if (timer_irq(ctx) == vintid) + return kvm_timer_should_fire(ctx); + } + + /* A timer IRQ has fired, but no matching timer was found? */ + WARN_RATELIMIT(1, "timer INTID%d unknown\n", vintid); - return kvm_timer_should_fire(timer); + return false; } int kvm_timer_enable(struct kvm_vcpu *vcpu) @@ -1258,7 +1535,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) ret = kvm_vgic_map_phys_irq(vcpu, map.direct_vtimer->host_timer_irq, - map.direct_vtimer->irq.irq, + timer_irq(map.direct_vtimer), &arch_timer_irq_ops); if (ret) return ret; @@ -1266,7 +1543,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (map.direct_ptimer) { ret = kvm_vgic_map_phys_irq(vcpu, map.direct_ptimer->host_timer_irq, - map.direct_ptimer->irq.irq, + timer_irq(map.direct_ptimer), &arch_timer_irq_ops); } @@ -1278,45 +1555,17 @@ no_vgic: return 0; } -/* - * On VHE system, we only need to configure the EL2 timer trap register once, - * not for every world switch. - * The host kernel runs at EL2 with HCR_EL2.TGE == 1, - * and this makes those bits have no effect for the host kernel execution. - */ +/* If we have CNTPOFF, permanently set ECV to enable it */ void kvm_timer_init_vhe(void) { - /* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */ - u32 cnthctl_shift = 10; - u64 val; - - /* - * VHE systems allow the guest direct access to the EL1 physical - * timer/counter. - */ - val = read_sysreg(cnthctl_el2); - val |= (CNTHCTL_EL1PCEN << cnthctl_shift); - val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); - write_sysreg(val, cnthctl_el2); -} - -static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq) -{ - struct kvm_vcpu *vcpu; - unsigned long i; - - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu_vtimer(vcpu)->irq.irq = vtimer_irq; - vcpu_ptimer(vcpu)->irq.irq = ptimer_irq; - } + if (cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)) + sysreg_clear_set(cntkctl_el1, 0, CNTHCTL_ECV); } int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int __user *uaddr = (int __user *)(long)attr->addr; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - int irq; + int irq, idx, ret = 0; if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; @@ -1327,21 +1576,42 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!(irq_is_ppi(irq))) return -EINVAL; - if (vcpu->arch.timer_cpu.enabled) - return -EBUSY; + mutex_lock(&vcpu->kvm->arch.config_lock); + + if (test_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, + &vcpu->kvm->arch.flags)) { + ret = -EBUSY; + goto out; + } switch (attr->attr) { case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: - set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq); + idx = TIMER_VTIMER; break; case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: - set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq); + idx = TIMER_PTIMER; + break; + case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: + idx = TIMER_HVTIMER; + break; + case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: + idx = TIMER_HPTIMER; break; default: - return -ENXIO; + ret = -ENXIO; + goto out; } - return 0; + /* + * We cannot validate the IRQ unicity before we run, so take it at + * face value. The verdict will be given on first vcpu run, for each + * vcpu. Yes this is late. Blame it on the stupid API. + */ + vcpu->kvm->arch.timer_data.ppi[idx] = irq; + +out: + mutex_unlock(&vcpu->kvm->arch.config_lock); + return ret; } int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) @@ -1357,11 +1627,17 @@ int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: timer = vcpu_ptimer(vcpu); break; + case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: + timer = vcpu_hvtimer(vcpu); + break; + case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: + timer = vcpu_hptimer(vcpu); + break; default: return -ENXIO; } - irq = timer->irq.irq; + irq = timer_irq(timer); return put_user(irq, uaddr); } @@ -1370,8 +1646,42 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: + case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: + case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: return 0; } return -ENXIO; } + +int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, + struct kvm_arm_counter_offset *offset) +{ + int ret = 0; + + if (offset->reserved) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (lock_all_vcpus(kvm)) { + set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags); + + /* + * If userspace decides to set the offset using this + * API rather than merely restoring the counter + * values, the offset applies to both the virtual and + * physical views. + */ + kvm->arch.timer_data.voffset = offset->counter_offset; + kvm->arch.timer_data.poffset = offset->counter_offset; + + unlock_all_vcpus(kvm); + } else { + ret = -EBUSY; + } + + mutex_unlock(&kvm->lock); + + return ret; +} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 6673c7b4f1a8..14391826241c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -126,6 +126,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; + mutex_init(&kvm->arch.config_lock); + +#ifdef CONFIG_LOCKDEP + /* Clue in lockdep that the config_lock must be taken inside kvm->lock */ + mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); + mutex_unlock(&kvm->arch.config_lock); + mutex_unlock(&kvm->lock); +#endif + ret = kvm_share_hyp(kvm, kvm + 1); if (ret) return ret; @@ -146,6 +156,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_vgic_early_init(kvm); + kvm_timer_init_vm(kvm); + /* The maximum number of VCPUs is limited by the host's GIC model */ kvm->max_vcpus = kvm_arm_default_max_vcpus(); @@ -190,6 +202,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_destroy_vcpus(kvm); kvm_unshare_hyp(kvm, kvm + 1); + + kvm_arm_teardown_hypercalls(kvm); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) @@ -219,6 +233,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PTP_KVM: case KVM_CAP_ARM_SYSTEM_SUSPEND: case KVM_CAP_IRQFD_RESAMPLE: + case KVM_CAP_COUNTER_OFFSET: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -325,6 +340,16 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { int err; + spin_lock_init(&vcpu->arch.mp_state_lock); + +#ifdef CONFIG_LOCKDEP + /* Inform lockdep that the config_lock is acquired after vcpu->mutex */ + mutex_lock(&vcpu->mutex); + mutex_lock(&vcpu->kvm->arch.config_lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); + mutex_unlock(&vcpu->mutex); +#endif + /* Force users to call KVM_ARM_VCPU_INIT */ vcpu->arch.target = -1; bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); @@ -442,34 +467,41 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->cpu = -1; } -void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) +static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) { - vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); kvm_make_request(KVM_REQ_SLEEP, vcpu); kvm_vcpu_kick(vcpu); } +void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->arch.mp_state_lock); + __kvm_arm_vcpu_power_off(vcpu); + spin_unlock(&vcpu->arch.mp_state_lock); +} + bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu) { - return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED; + return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED; } static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu) { - vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED); kvm_make_request(KVM_REQ_SUSPEND, vcpu); kvm_vcpu_kick(vcpu); } static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu) { - return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED; + return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED; } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - *mp_state = vcpu->arch.mp_state; + *mp_state = READ_ONCE(vcpu->arch.mp_state); return 0; } @@ -479,12 +511,14 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { int ret = 0; + spin_lock(&vcpu->arch.mp_state_lock); + switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: - vcpu->arch.mp_state = *mp_state; + WRITE_ONCE(vcpu->arch.mp_state, *mp_state); break; case KVM_MP_STATE_STOPPED: - kvm_arm_vcpu_power_off(vcpu); + __kvm_arm_vcpu_power_off(vcpu); break; case KVM_MP_STATE_SUSPENDED: kvm_arm_vcpu_suspend(vcpu); @@ -493,6 +527,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, ret = -EINVAL; } + spin_unlock(&vcpu->arch.mp_state_lock); + return ret; } @@ -592,9 +628,9 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (kvm_vm_is_protected(kvm)) kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); return ret; } @@ -1209,10 +1245,14 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, /* * Handle the "start in power-off" case. */ + spin_lock(&vcpu->arch.mp_state_lock); + if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) - kvm_arm_vcpu_power_off(vcpu); + __kvm_arm_vcpu_power_off(vcpu); else - vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); + + spin_unlock(&vcpu->arch.mp_state_lock); return 0; } @@ -1438,11 +1478,31 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, } } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_ARM_VM_SMCCC_CTRL: + return kvm_vm_smccc_has_attr(kvm, attr); + default: + return -ENXIO; + } +} + +static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_ARM_VM_SMCCC_CTRL: + return kvm_vm_smccc_set_attr(kvm, attr); + default: + return -ENXIO; + } +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; + struct kvm_device_attr attr; switch (ioctl) { case KVM_CREATE_IRQCHIP: { @@ -1478,11 +1538,73 @@ long kvm_arch_vm_ioctl(struct file *filp, return -EFAULT; return kvm_vm_ioctl_mte_copy_tags(kvm, ©_tags); } + case KVM_ARM_SET_COUNTER_OFFSET: { + struct kvm_arm_counter_offset offset; + + if (copy_from_user(&offset, argp, sizeof(offset))) + return -EFAULT; + return kvm_vm_ioctl_set_counter_offset(kvm, &offset); + } + case KVM_HAS_DEVICE_ATTR: { + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + return kvm_vm_has_attr(kvm, &attr); + } + case KVM_SET_DEVICE_ATTR: { + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + return kvm_vm_set_attr(kvm, &attr); + } default: return -EINVAL; } } +/* unlocks vcpus from @vcpu_lock_idx and smaller */ +static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) +{ + struct kvm_vcpu *tmp_vcpu; + + for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { + tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); + mutex_unlock(&tmp_vcpu->mutex); + } +} + +void unlock_all_vcpus(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + + unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); +} + +/* Returns true if all vcpus were locked, false otherwise */ +bool lock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *tmp_vcpu; + unsigned long c; + + lockdep_assert_held(&kvm->lock); + + /* + * Any time a vcpu is in an ioctl (including running), the + * core KVM code tries to grab the vcpu->mutex. + * + * By grabbing the vcpu->mutex of all VCPUs we ensure that no + * other VCPUs can fiddle with the state while we access it. + */ + kvm_for_each_vcpu(c, tmp_vcpu, kvm) { + if (!mutex_trylock(&tmp_vcpu->mutex)) { + unlock_vcpus(kvm, c - 1); + return false; + } + } + + return true; +} + static unsigned long nvhe_percpu_size(void) { return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) - diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 07444fa22888..20280a5233f6 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -590,11 +590,16 @@ static unsigned long num_core_regs(const struct kvm_vcpu *vcpu) return copy_core_reg_indices(vcpu, NULL); } -/** - * ARM64 versions of the TIMER registers, always available on arm64 - */ +static const u64 timer_reg_list[] = { + KVM_REG_ARM_TIMER_CTL, + KVM_REG_ARM_TIMER_CNT, + KVM_REG_ARM_TIMER_CVAL, + KVM_REG_ARM_PTIMER_CTL, + KVM_REG_ARM_PTIMER_CNT, + KVM_REG_ARM_PTIMER_CVAL, +}; -#define NUM_TIMER_REGS 3 +#define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list) static bool is_timer_reg(u64 index) { @@ -602,6 +607,9 @@ static bool is_timer_reg(u64 index) case KVM_REG_ARM_TIMER_CTL: case KVM_REG_ARM_TIMER_CNT: case KVM_REG_ARM_TIMER_CVAL: + case KVM_REG_ARM_PTIMER_CTL: + case KVM_REG_ARM_PTIMER_CNT: + case KVM_REG_ARM_PTIMER_CVAL: return true; } return false; @@ -609,14 +617,11 @@ static bool is_timer_reg(u64 index) static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - if (put_user(KVM_REG_ARM_TIMER_CTL, uindices)) - return -EFAULT; - uindices++; - if (put_user(KVM_REG_ARM_TIMER_CNT, uindices)) - return -EFAULT; - uindices++; - if (put_user(KVM_REG_ARM_TIMER_CVAL, uindices)) - return -EFAULT; + for (int i = 0; i < NUM_TIMER_REGS; i++) { + if (put_user(timer_reg_list[i], uindices)) + return -EFAULT; + uindices++; + } return 0; } @@ -957,7 +962,9 @@ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, switch (attr->group) { case KVM_ARM_VCPU_PMU_V3_CTRL: + mutex_lock(&vcpu->kvm->arch.config_lock); ret = kvm_arm_pmu_v3_set_attr(vcpu, attr); + mutex_unlock(&vcpu->kvm->arch.config_lock); break; case KVM_ARM_VCPU_TIMER_CTRL: ret = kvm_arm_timer_set_attr(vcpu, attr); @@ -1019,8 +1026,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, return ret; } -long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, - struct kvm_arm_copy_mte_tags *copy_tags) +int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, + struct kvm_arm_copy_mte_tags *copy_tags) { gpa_t guest_ipa = copy_tags->guest_ipa; size_t length = copy_tags->length; @@ -1041,6 +1048,10 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK) return -EINVAL; + /* Lengths above INT_MAX cannot be represented in the return value */ + if (length > INT_MAX) + return -EINVAL; + gfn = gpa_to_gfn(guest_ipa); mutex_lock(&kvm->slots_lock); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a798c0b4d717..6dcd6604b6bc 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -36,8 +36,6 @@ static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr) static int handle_hvc(struct kvm_vcpu *vcpu) { - int ret; - trace_kvm_hvc_arm64(*vcpu_pc(vcpu), vcpu_get_reg(vcpu, 0), kvm_vcpu_hvc_get_imm(vcpu)); vcpu->stat.hvc_exit_stat++; @@ -52,33 +50,29 @@ static int handle_hvc(struct kvm_vcpu *vcpu) return 1; } - ret = kvm_hvc_call_handler(vcpu); - if (ret < 0) { - vcpu_set_reg(vcpu, 0, ~0UL); - return 1; - } - - return ret; + return kvm_smccc_call_handler(vcpu); } static int handle_smc(struct kvm_vcpu *vcpu) { - int ret; - /* * "If an SMC instruction executed at Non-secure EL1 is * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a * Trap exception, not a Secure Monitor Call exception [...]" * * We need to advance the PC after the trap, as it would - * otherwise return to the same address... - * - * Only handle SMCs from the virtual EL2 with an immediate of zero and - * skip it otherwise. + * otherwise return to the same address. Furthermore, pre-incrementing + * the PC before potentially exiting to userspace maintains the same + * abstraction for both SMCs and HVCs. + */ + kvm_incr_pc(vcpu); + + /* + * SMCs with a nonzero immediate are reserved according to DEN0028E 2.9 + * "SMC and HVC immediate value". */ - if (!vcpu_is_el2(vcpu) || kvm_vcpu_hvc_get_imm(vcpu)) { + if (kvm_vcpu_hvc_get_imm(vcpu)) { vcpu_set_reg(vcpu, 0, ~0UL); - kvm_incr_pc(vcpu); return 1; } @@ -89,13 +83,7 @@ static int handle_smc(struct kvm_vcpu *vcpu) * at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than * being treated as UNDEFINED. */ - ret = kvm_hvc_call_handler(vcpu); - if (ret < 0) - vcpu_set_reg(vcpu, 0, ~0UL); - - kvm_incr_pc(vcpu); - - return ret; + return kvm_smccc_call_handler(vcpu); } /* diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 07d37ff88a3f..c41166f1a1dd 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -26,6 +26,7 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #include <asm/fpsimd.h> #include <asm/debug-monitors.h> #include <asm/processor.h> @@ -326,6 +327,55 @@ static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code) return true; } +static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *ctxt; + u32 sysreg; + u64 val; + + /* + * We only get here for 64bit guests, 32bit guests will hit + * the long and winding road all the way to the standard + * handling. Yes, it sucks to be irrelevant. + */ + sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + + switch (sysreg) { + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + if (vcpu_has_nv(vcpu)) { + if (is_hyp_ctxt(vcpu)) { + ctxt = vcpu_hptimer(vcpu); + break; + } + + /* Check for guest hypervisor trapping */ + val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + if (!vcpu_el2_e2h_is_set(vcpu)) + val = (val & CNTHCTL_EL1PCTEN) << 10; + + if (!(val & (CNTHCTL_EL1PCTEN << 10))) + return false; + } + + ctxt = vcpu_ptimer(vcpu); + break; + default: + return false; + } + + val = arch_timer_read_cntpct_el0(); + + if (ctxt->offset.vm_offset) + val -= *kern_hyp_va(ctxt->offset.vm_offset); + if (ctxt->offset.vcpu_offset) + val -= *kern_hyp_va(ctxt->offset.vcpu_offset); + + vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); + __kvm_skip_instr(vcpu); + return true; +} + static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) { if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && @@ -339,6 +389,9 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) return kvm_hyp_handle_ptrauth(vcpu, exit_code); + if (kvm_hyp_handle_cntpct(vcpu)) + return true; + return false; } diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 2673bde62fad..d756b939f296 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -37,7 +37,6 @@ static void __debug_save_spe(u64 *pmscr_el1) /* Now drain all buffered data to memory */ psb_csync(); - dsb(nsh); } static void __debug_restore_spe(u64 pmscr_el1) @@ -69,7 +68,6 @@ static void __debug_save_trace(u64 *trfcr_el1) isb(); /* Drain the trace buffer to memory */ tsb_csync(); - dsb(nsh); } static void __debug_restore_trace(u64 trfcr_el1) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 552653fa18be..2e9ec4a2a4a3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -297,6 +297,13 @@ int __pkvm_prot_finalize(void) params->vttbr = kvm_get_vttbr(mmu); params->vtcr = host_mmu.arch.vtcr; params->hcr_el2 |= HCR_VM; + + /* + * The CMO below not only cleans the updated params to the + * PoC, but also provides the DSB that ensures ongoing + * page-table walks that have started before we trapped to EL2 + * have completed. + */ kvm_flush_dcache_to_poc(params, sizeof(*params)); write_sysreg(params->hcr_el2, hcr_el2); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index c2cb46ca4fb6..71fa16a0dc77 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -272,6 +272,17 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ __debug_save_host_buffers_nvhe(vcpu); + /* + * We're about to restore some new MMU state. Make sure + * ongoing page-table walks that have started before we + * trapped to EL2 have completed. This also synchronises the + * above disabling of SPE and TRBE. + * + * See DDI0487I.a D8.1.5 "Out-of-context translation regimes", + * rule R_LFHQG and subsequent information statements. + */ + dsb(nsh); + __kvm_adjust_pc(vcpu); /* @@ -306,6 +317,13 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __timer_disable_traps(vcpu); __hyp_vgic_save_state(vcpu); + /* + * Same thing as before the guest run: we're about to switch + * the MMU context, so let's make sure we don't have any + * ongoing EL1&0 translations. + */ + dsb(nsh); + __deactivate_traps(vcpu); __load_host_stage2(); diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c index 9072e71693ba..b185ac0dbd47 100644 --- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c @@ -9,6 +9,7 @@ #include <linux/kvm_host.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> void __kvm_timer_set_cntvoff(u64 cntvoff) { @@ -35,14 +36,19 @@ void __timer_disable_traps(struct kvm_vcpu *vcpu) */ void __timer_enable_traps(struct kvm_vcpu *vcpu) { - u64 val; + u64 clr = 0, set = 0; /* * Disallow physical timer access for the guest - * Physical counter access is allowed + * Physical counter access is allowed if no offset is enforced + * or running protected (we don't offset anything in this case). */ - val = read_sysreg(cnthctl_el2); - val &= ~CNTHCTL_EL1PCEN; - val |= CNTHCTL_EL1PCTEN; - write_sysreg(val, cnthctl_el2); + clr = CNTHCTL_EL1PCEN; + if (is_protected_kvm_enabled() || + !kern_hyp_va(vcpu->kvm)->arch.timer_data.poffset) + set |= CNTHCTL_EL1PCTEN; + else + clr |= CNTHCTL_EL1PCTEN; + + sysreg_clear_set(cnthctl_el2, clr, set); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index d296d617f589..978179133f4b 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -15,8 +15,31 @@ struct tlb_inv_context { }; static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, - struct tlb_inv_context *cxt) + struct tlb_inv_context *cxt, + bool nsh) { + /* + * We have two requirements: + * + * - ensure that the page table updates are visible to all + * CPUs, for which a dsb(DOMAIN-st) is what we need, DOMAIN + * being either ish or nsh, depending on the invalidation + * type. + * + * - complete any speculative page table walk started before + * we trapped to EL2 so that we can mess with the MM + * registers out of context, for which dsb(nsh) is enough + * + * The composition of these two barriers is a dsb(DOMAIN), and + * the 'nsh' parameter tracks the distinction between + * Inner-Shareable and Non-Shareable, as specified by the + * callers. + */ + if (nsh) + dsb(nsh); + else + dsb(ish); + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; @@ -60,10 +83,8 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, { struct tlb_inv_context cxt; - dsb(ishst); - /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + __tlb_switch_to_guest(mmu, &cxt, false); /* * We could do so much better if we had the VA as well. @@ -113,10 +134,8 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; - dsb(ishst); - /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + __tlb_switch_to_guest(mmu, &cxt, false); __tlbi(vmalls12e1is); dsb(ish); @@ -130,7 +149,7 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + __tlb_switch_to_guest(mmu, &cxt, false); __tlbi(vmalle1); asm volatile("ic iallu"); @@ -142,7 +161,8 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) void __kvm_flush_vm_context(void) { - dsb(ishst); + /* Same remark as in __tlb_switch_to_guest() */ + dsb(ish); __tlbi(alle1is); /* diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index cd3f3117bf16..3d868e84c7a0 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -227,11 +227,10 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) /* * When we exit from the guest we change a number of CPU configuration - * parameters, such as traps. Make sure these changes take effect - * before running the host or additional guests. + * parameters, such as traps. We rely on the isb() in kvm_call_hyp*() + * to make sure these changes take effect before running the host or + * additional guests. */ - isb(); - return ret; } diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 7b44f6b3b547..b35a178e7e0d 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -13,6 +13,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_nested.h> /* * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and @@ -70,6 +71,17 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) __sysreg_save_user_state(host_ctxt); /* + * When running a normal EL1 guest, we only load a new vcpu + * after a context switch, which imvolves a DSB, so all + * speculative EL1&0 walks will have already completed. + * If running NV, the vcpu may transition between vEL1 and + * vEL2 without a context switch, so make sure we complete + * those walks before loading a new context. + */ + if (vcpu_has_nv(vcpu)) + dsb(nsh); + + /* * Load guest EL1 and user state * * We must restore the 32-bit state before the sysregs, thanks diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index c4b4678bc4a4..7fb4df0456de 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -47,7 +47,7 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.voffset; break; case KVM_PTP_PHYS_COUNTER: - cycles = systime_snapshot.cycles; + cycles = systime_snapshot.cycles - vcpu->kvm->arch.timer_data.poffset; break; default: return; @@ -65,7 +65,7 @@ static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) val[3] = lower_32_bits(cycles); } -static bool kvm_hvc_call_default_allowed(u32 func_id) +static bool kvm_smccc_default_allowed(u32 func_id) { switch (func_id) { /* @@ -93,7 +93,7 @@ static bool kvm_hvc_call_default_allowed(u32 func_id) } } -static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id) +static bool kvm_smccc_test_fw_bmap(struct kvm_vcpu *vcpu, u32 func_id) { struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; @@ -117,20 +117,161 @@ static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id) return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_PTP, &smccc_feat->vendor_hyp_bmap); default: - return kvm_hvc_call_default_allowed(func_id); + return false; + } +} + +#define SMC32_ARCH_RANGE_BEGIN ARM_SMCCC_VERSION_FUNC_ID +#define SMC32_ARCH_RANGE_END ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + 0, ARM_SMCCC_FUNC_MASK) + +#define SMC64_ARCH_RANGE_BEGIN ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + 0, 0) +#define SMC64_ARCH_RANGE_END ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + 0, ARM_SMCCC_FUNC_MASK) + +static void init_smccc_filter(struct kvm *kvm) +{ + int r; + + mt_init(&kvm->arch.smccc_filter); + + /* + * Prevent userspace from handling any SMCCC calls in the architecture + * range, avoiding the risk of misrepresenting Spectre mitigation status + * to the guest. + */ + r = mtree_insert_range(&kvm->arch.smccc_filter, + SMC32_ARCH_RANGE_BEGIN, SMC32_ARCH_RANGE_END, + xa_mk_value(KVM_SMCCC_FILTER_HANDLE), + GFP_KERNEL_ACCOUNT); + WARN_ON_ONCE(r); + + r = mtree_insert_range(&kvm->arch.smccc_filter, + SMC64_ARCH_RANGE_BEGIN, SMC64_ARCH_RANGE_END, + xa_mk_value(KVM_SMCCC_FILTER_HANDLE), + GFP_KERNEL_ACCOUNT); + WARN_ON_ONCE(r); + +} + +static int kvm_smccc_set_filter(struct kvm *kvm, struct kvm_smccc_filter __user *uaddr) +{ + const void *zero_page = page_to_virt(ZERO_PAGE(0)); + struct kvm_smccc_filter filter; + u32 start, end; + int r; + + if (copy_from_user(&filter, uaddr, sizeof(filter))) + return -EFAULT; + + if (memcmp(filter.pad, zero_page, sizeof(filter.pad))) + return -EINVAL; + + start = filter.base; + end = start + filter.nr_functions - 1; + + if (end < start || filter.action >= NR_SMCCC_FILTER_ACTIONS) + return -EINVAL; + + mutex_lock(&kvm->arch.config_lock); + + if (kvm_vm_has_ran_once(kvm)) { + r = -EBUSY; + goto out_unlock; } + + r = mtree_insert_range(&kvm->arch.smccc_filter, start, end, + xa_mk_value(filter.action), GFP_KERNEL_ACCOUNT); + if (r) + goto out_unlock; + + set_bit(KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED, &kvm->arch.flags); + +out_unlock: + mutex_unlock(&kvm->arch.config_lock); + return r; +} + +static u8 kvm_smccc_filter_get_action(struct kvm *kvm, u32 func_id) +{ + unsigned long idx = func_id; + void *val; + + if (!test_bit(KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED, &kvm->arch.flags)) + return KVM_SMCCC_FILTER_HANDLE; + + /* + * But where's the error handling, you say? + * + * mt_find() returns NULL if no entry was found, which just so happens + * to match KVM_SMCCC_FILTER_HANDLE. + */ + val = mt_find(&kvm->arch.smccc_filter, &idx, idx); + return xa_to_value(val); } -int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) +static u8 kvm_smccc_get_action(struct kvm_vcpu *vcpu, u32 func_id) +{ + /* + * Intervening actions in the SMCCC filter take precedence over the + * pseudo-firmware register bitmaps. + */ + u8 action = kvm_smccc_filter_get_action(vcpu->kvm, func_id); + if (action != KVM_SMCCC_FILTER_HANDLE) + return action; + + if (kvm_smccc_test_fw_bmap(vcpu, func_id) || + kvm_smccc_default_allowed(func_id)) + return KVM_SMCCC_FILTER_HANDLE; + + return KVM_SMCCC_FILTER_DENY; +} + +static void kvm_prepare_hypercall_exit(struct kvm_vcpu *vcpu, u32 func_id) +{ + u8 ec = ESR_ELx_EC(kvm_vcpu_get_esr(vcpu)); + struct kvm_run *run = vcpu->run; + u64 flags = 0; + + if (ec == ESR_ELx_EC_SMC32 || ec == ESR_ELx_EC_SMC64) + flags |= KVM_HYPERCALL_EXIT_SMC; + + if (!kvm_vcpu_trap_il_is32bit(vcpu)) + flags |= KVM_HYPERCALL_EXIT_16BIT; + + run->exit_reason = KVM_EXIT_HYPERCALL; + run->hypercall = (typeof(run->hypercall)) { + .nr = func_id, + .flags = flags, + }; +} + +int kvm_smccc_call_handler(struct kvm_vcpu *vcpu) { struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat; u32 func_id = smccc_get_function(vcpu); u64 val[4] = {SMCCC_RET_NOT_SUPPORTED}; u32 feature; + u8 action; gpa_t gpa; - if (!kvm_hvc_call_allowed(vcpu, func_id)) + action = kvm_smccc_get_action(vcpu, func_id); + switch (action) { + case KVM_SMCCC_FILTER_HANDLE: + break; + case KVM_SMCCC_FILTER_DENY: + goto out; + case KVM_SMCCC_FILTER_FWD_TO_USER: + kvm_prepare_hypercall_exit(vcpu, func_id); + return 0; + default: + WARN_RATELIMIT(1, "Unhandled SMCCC filter action: %d\n", action); goto out; + } switch (func_id) { case ARM_SMCCC_VERSION_FUNC_ID: @@ -245,6 +386,13 @@ void kvm_arm_init_hypercalls(struct kvm *kvm) smccc_feat->std_bmap = KVM_ARM_SMCCC_STD_FEATURES; smccc_feat->std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES; smccc_feat->vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES; + + init_smccc_filter(kvm); +} + +void kvm_arm_teardown_hypercalls(struct kvm *kvm) +{ + mtree_destroy(&kvm->arch.smccc_filter); } int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) @@ -377,17 +525,16 @@ static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val) if (val & ~fw_reg_features) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); - if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) && - val != *fw_reg_bmap) { + if (kvm_vm_has_ran_once(kvm) && val != *fw_reg_bmap) { ret = -EBUSY; goto out; } WRITE_ONCE(*fw_reg_bmap, val); out: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); return ret; } @@ -481,3 +628,25 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) return -EINVAL; } + +int kvm_vm_smccc_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VM_SMCCC_FILTER: + return 0; + default: + return -ENXIO; + } +} + +int kvm_vm_smccc_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + void __user *uaddr = (void __user *)attr->addr; + + switch (attr->attr) { + case KVM_ARM_VM_SMCCC_FILTER: + return kvm_smccc_set_filter(kvm, uaddr); + default: + return -ENXIO; + } +} diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 5eca0cdd961d..45727d50d18d 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -876,13 +876,13 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) struct arm_pmu *arm_pmu; int ret = -ENXIO; - mutex_lock(&kvm->lock); + lockdep_assert_held(&kvm->arch.config_lock); mutex_lock(&arm_pmus_lock); list_for_each_entry(entry, &arm_pmus, entry) { arm_pmu = entry->arm_pmu; if (arm_pmu->pmu.type == pmu_id) { - if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags) || + if (kvm_vm_has_ran_once(kvm) || (kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) { ret = -EBUSY; break; @@ -896,7 +896,6 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) } mutex_unlock(&arm_pmus_lock); - mutex_unlock(&kvm->lock); return ret; } @@ -904,22 +903,20 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { struct kvm *kvm = vcpu->kvm; + lockdep_assert_held(&kvm->arch.config_lock); + if (!kvm_vcpu_has_pmu(vcpu)) return -ENODEV; if (vcpu->arch.pmu.created) return -EBUSY; - mutex_lock(&kvm->lock); if (!kvm->arch.arm_pmu) { /* No PMU set, get the default one */ kvm->arch.arm_pmu = kvm_pmu_probe_armpmu(); - if (!kvm->arch.arm_pmu) { - mutex_unlock(&kvm->lock); + if (!kvm->arch.arm_pmu) return -ENODEV; - } } - mutex_unlock(&kvm->lock); switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { @@ -963,19 +960,13 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) filter.action != KVM_PMU_EVENT_DENY)) return -EINVAL; - mutex_lock(&kvm->lock); - - if (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags)) { - mutex_unlock(&kvm->lock); + if (kvm_vm_has_ran_once(kvm)) return -EBUSY; - } if (!kvm->arch.pmu_filter) { kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); - if (!kvm->arch.pmu_filter) { - mutex_unlock(&kvm->lock); + if (!kvm->arch.pmu_filter) return -ENOMEM; - } /* * The default depends on the first applied filter. @@ -994,8 +985,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) else bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents); - mutex_unlock(&kvm->lock); - return 0; } case KVM_ARM_VCPU_PMU_V3_SET_PMU: { diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 7fbc4c1b9df0..1f69b667332b 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -62,6 +62,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) struct vcpu_reset_state *reset_state; struct kvm *kvm = source_vcpu->kvm; struct kvm_vcpu *vcpu = NULL; + int ret = PSCI_RET_SUCCESS; unsigned long cpu_id; cpu_id = smccc_get_arg1(source_vcpu); @@ -76,11 +77,15 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ if (!vcpu) return PSCI_RET_INVALID_PARAMS; + + spin_lock(&vcpu->arch.mp_state_lock); if (!kvm_arm_vcpu_stopped(vcpu)) { if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) - return PSCI_RET_ALREADY_ON; + ret = PSCI_RET_ALREADY_ON; else - return PSCI_RET_INVALID_PARAMS; + ret = PSCI_RET_INVALID_PARAMS; + + goto out_unlock; } reset_state = &vcpu->arch.reset_state; @@ -96,7 +101,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ reset_state->r0 = smccc_get_arg3(source_vcpu); - WRITE_ONCE(reset_state->reset, true); + reset_state->reset = true; kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); /* @@ -105,10 +110,12 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ smp_wmb(); - vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE; + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); kvm_vcpu_wake_up(vcpu); - return PSCI_RET_SUCCESS; +out_unlock: + spin_unlock(&vcpu->arch.mp_state_lock); + return ret; } static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) @@ -168,8 +175,11 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags) * after this call is handled and before the VCPUs have been * re-initialized. */ - kvm_for_each_vcpu(i, tmp, vcpu->kvm) - tmp->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + kvm_for_each_vcpu(i, tmp, vcpu->kvm) { + spin_lock(&tmp->arch.mp_state_lock); + WRITE_ONCE(tmp->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); + spin_unlock(&tmp->arch.mp_state_lock); + } kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); @@ -229,7 +239,6 @@ static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; u32 psci_fn = smccc_get_function(vcpu); unsigned long val; int ret = 1; @@ -254,9 +263,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) kvm_psci_narrow_to_32bit(vcpu); fallthrough; case PSCI_0_2_FN64_CPU_ON: - mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); - mutex_unlock(&kvm->lock); break; case PSCI_0_2_FN_AFFINITY_INFO: kvm_psci_narrow_to_32bit(vcpu); @@ -395,7 +402,6 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; u32 psci_fn = smccc_get_function(vcpu); unsigned long val; @@ -405,9 +411,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) val = PSCI_RET_SUCCESS; break; case KVM_PSCI_FN_CPU_ON: - mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); - mutex_unlock(&kvm->lock); break; default: val = PSCI_RET_NOT_SUPPORTED; @@ -435,6 +439,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) int kvm_psci_call(struct kvm_vcpu *vcpu) { u32 psci_fn = smccc_get_function(vcpu); + int version = kvm_psci_version(vcpu); unsigned long val; val = kvm_psci_check_allowed_function(vcpu, psci_fn); @@ -443,7 +448,7 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) return 1; } - switch (kvm_psci_version(vcpu)) { + switch (version) { case KVM_ARM_PSCI_1_1: return kvm_psci_1_x_call(vcpu, 1); case KVM_ARM_PSCI_1_0: @@ -453,6 +458,8 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) case KVM_ARM_PSCI_0_1: return kvm_psci_0_1_call(vcpu); default: - return -EINVAL; + WARN_ONCE(1, "Unknown PSCI version %d", version); + smccc_set_retval(vcpu, SMCCC_RET_NOT_SUPPORTED, 0, 0, 0); + return 1; } } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 49a3257dec46..b5dee8e57e77 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -205,7 +205,7 @@ static int kvm_set_vm_width(struct kvm_vcpu *vcpu) is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); - lockdep_assert_held(&kvm->lock); + lockdep_assert_held(&kvm->arch.config_lock); if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) { /* @@ -262,17 +262,18 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) bool loaded; u32 pstate; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.config_lock); ret = kvm_set_vm_width(vcpu); - if (!ret) { - reset_state = vcpu->arch.reset_state; - WRITE_ONCE(vcpu->arch.reset_state.reset, false); - } - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); if (ret) return ret; + spin_lock(&vcpu->arch.mp_state_lock); + reset_state = vcpu->arch.reset_state; + vcpu->arch.reset_state.reset = false; + spin_unlock(&vcpu->arch.mp_state_lock); + /* Reset PMU outside of the non-preemptible section */ kvm_pmu_vcpu_reset(vcpu); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 34688918c811..71b12094d613 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1154,6 +1154,12 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, tmr = TIMER_PTIMER; treg = TIMER_REG_CVAL; break; + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + case SYS_AARCH32_CNTPCT: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CNT; + break; default: print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); kvm_inject_undefined(vcpu); @@ -2091,6 +2097,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVTYPER1_EL0(14), AMU_AMEVTYPER1_EL0(15), + { SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, @@ -2541,10 +2549,12 @@ static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, { CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr }, { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */ + { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer }, { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer }, }; static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index f3e46a976125..6ce5c025218d 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -206,6 +206,7 @@ TRACE_EVENT(kvm_get_timer_map, __field( unsigned long, vcpu_id ) __field( int, direct_vtimer ) __field( int, direct_ptimer ) + __field( int, emul_vtimer ) __field( int, emul_ptimer ) ), @@ -214,14 +215,17 @@ TRACE_EVENT(kvm_get_timer_map, __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer); __entry->direct_ptimer = (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1; + __entry->emul_vtimer = + (map->emul_vtimer) ? arch_timer_ctx_index(map->emul_vtimer) : -1; __entry->emul_ptimer = (map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1; ), - TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d", + TP_printk("VCPU: %ld, dv: %d, dp: %d, ev: %d, ep: %d", __entry->vcpu_id, __entry->direct_vtimer, __entry->direct_ptimer, + __entry->emul_vtimer, __entry->emul_ptimer) ); diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index 78cde687383c..07aa0437125a 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -85,7 +85,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos) struct kvm *kvm = s->private; struct vgic_state_iter *iter; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; if (iter) { iter = ERR_PTR(-EBUSY); @@ -104,7 +104,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos) if (end_of_vgic(iter)) iter = NULL; out: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); return iter; } @@ -132,12 +132,12 @@ static void vgic_debug_stop(struct seq_file *s, void *v) if (IS_ERR(v)) return; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; kfree(iter->lpi_array); kfree(iter); kvm->arch.vgic.iter = NULL; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); } static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index cd134db41a57..9d42c7cb2b58 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -74,9 +74,6 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) unsigned long i; int ret; - if (irqchip_in_kernel(kvm)) - return -EEXIST; - /* * This function is also called by the KVM_CREATE_IRQCHIP handler, * which had no chance yet to check the availability of the GICv2 @@ -87,10 +84,20 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) !kvm_vgic_global_state.can_emulate_gicv2) return -ENODEV; + /* Must be held to avoid race with vCPU creation */ + lockdep_assert_held(&kvm->lock); + ret = -EBUSY; if (!lock_all_vcpus(kvm)) return ret; + mutex_lock(&kvm->arch.config_lock); + + if (irqchip_in_kernel(kvm)) { + ret = -EEXIST; + goto out_unlock; + } + kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu_has_run_once(vcpu)) goto out_unlock; @@ -118,6 +125,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); out_unlock: + mutex_unlock(&kvm->arch.config_lock); unlock_all_vcpus(kvm); return ret; } @@ -227,9 +235,9 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) * KVM io device for the redistributor that belongs to this VCPU. */ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.config_lock); ret = vgic_register_redist_iodev(vcpu); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); } return ret; } @@ -250,7 +258,6 @@ static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu) * The function is generally called when nr_spis has been explicitly set * by the guest through the KVM DEVICE API. If not nr_spis is set to 256. * vgic_initialized() returns true when this function has succeeded. - * Must be called with kvm->lock held! */ int vgic_init(struct kvm *kvm) { @@ -259,6 +266,8 @@ int vgic_init(struct kvm *kvm) int ret = 0, i; unsigned long idx; + lockdep_assert_held(&kvm->arch.config_lock); + if (vgic_initialized(kvm)) return 0; @@ -373,12 +382,13 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; } -/* To be called with kvm->lock held */ static void __kvm_vgic_destroy(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; + lockdep_assert_held(&kvm->arch.config_lock); + vgic_debug_destroy(kvm); kvm_for_each_vcpu(i, vcpu, kvm) @@ -389,9 +399,9 @@ static void __kvm_vgic_destroy(struct kvm *kvm) void kvm_vgic_destroy(struct kvm *kvm) { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); __kvm_vgic_destroy(kvm); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); } /** @@ -414,9 +424,9 @@ int vgic_lazy_init(struct kvm *kvm) if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) return -EBUSY; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); ret = vgic_init(kvm); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); } return ret; @@ -441,7 +451,7 @@ int kvm_vgic_map_resources(struct kvm *kvm) if (likely(vgic_ready(kvm))) return 0; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); if (vgic_ready(kvm)) goto out; @@ -459,7 +469,7 @@ int kvm_vgic_map_resources(struct kvm *kvm) dist->ready = true; out: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); return ret; } diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 2642e9ce2819..750e51e3779a 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -1958,6 +1958,16 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) mutex_init(&its->its_lock); mutex_init(&its->cmd_lock); + /* Yep, even more trickery for lock ordering... */ +#ifdef CONFIG_LOCKDEP + mutex_lock(&dev->kvm->arch.config_lock); + mutex_lock(&its->cmd_lock); + mutex_lock(&its->its_lock); + mutex_unlock(&its->its_lock); + mutex_unlock(&its->cmd_lock); + mutex_unlock(&dev->kvm->arch.config_lock); +#endif + its->vgic_its_base = VGIC_ADDR_UNDEF; INIT_LIST_HEAD(&its->device_list); @@ -2045,6 +2055,13 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev, mutex_lock(&dev->kvm->lock); + if (!lock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + + mutex_lock(&dev->kvm->arch.config_lock); + if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) { ret = -ENXIO; goto out; @@ -2058,11 +2075,6 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev, goto out; } - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - addr = its->vgic_its_base + offset; len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4; @@ -2076,8 +2088,9 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev, } else { *reg = region->its_read(dev->kvm, its, addr, len); } - unlock_all_vcpus(dev->kvm); out: + mutex_unlock(&dev->kvm->arch.config_lock); + unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); return ret; } @@ -2749,14 +2762,15 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) return 0; mutex_lock(&kvm->lock); - mutex_lock(&its->its_lock); if (!lock_all_vcpus(kvm)) { - mutex_unlock(&its->its_lock); mutex_unlock(&kvm->lock); return -EBUSY; } + mutex_lock(&kvm->arch.config_lock); + mutex_lock(&its->its_lock); + switch (attr) { case KVM_DEV_ARM_ITS_CTRL_RESET: vgic_its_reset(kvm, its); @@ -2769,8 +2783,9 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) break; } - unlock_all_vcpus(kvm); mutex_unlock(&its->its_lock); + mutex_unlock(&kvm->arch.config_lock); + unlock_all_vcpus(kvm); mutex_unlock(&kvm->lock); return ret; } diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index edeac2380591..35cfa268fd5d 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -46,7 +46,7 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev struct vgic_dist *vgic = &kvm->arch.vgic; int r; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); @@ -68,7 +68,7 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev r = -ENODEV; } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); return r; } @@ -102,7 +102,7 @@ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool wri if (get_user(addr, uaddr)) return -EFAULT; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.config_lock); switch (attr->attr) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); @@ -191,7 +191,7 @@ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool wri } out: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.config_lock); if (!r && !write) r = put_user(addr, uaddr); @@ -227,7 +227,7 @@ static int vgic_set_common_attr(struct kvm_device *dev, (val & 31)) return -EINVAL; - mutex_lock(&dev->kvm->lock); + mutex_lock(&dev->kvm->arch.config_lock); if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) ret = -EBUSY; @@ -235,16 +235,16 @@ static int vgic_set_common_attr(struct kvm_device *dev, dev->kvm->arch.vgic.nr_spis = val - VGIC_NR_PRIVATE_IRQS; - mutex_unlock(&dev->kvm->lock); + mutex_unlock(&dev->kvm->arch.config_lock); return ret; } case KVM_DEV_ARM_VGIC_GRP_CTRL: { switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: - mutex_lock(&dev->kvm->lock); + mutex_lock(&dev->kvm->arch.config_lock); r = vgic_init(dev->kvm); - mutex_unlock(&dev->kvm->lock); + mutex_unlock(&dev->kvm->arch.config_lock); return r; case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: /* @@ -260,7 +260,10 @@ static int vgic_set_common_attr(struct kvm_device *dev, mutex_unlock(&dev->kvm->lock); return -EBUSY; } + + mutex_lock(&dev->kvm->arch.config_lock); r = vgic_v3_save_pending_tables(dev->kvm); + mutex_unlock(&dev->kvm->arch.config_lock); unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); return r; @@ -342,44 +345,6 @@ int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, return 0; } -/* unlocks vcpus from @vcpu_lock_idx and smaller */ -static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) -{ - struct kvm_vcpu *tmp_vcpu; - - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { - tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); - mutex_unlock(&tmp_vcpu->mutex); - } -} - -void unlock_all_vcpus(struct kvm *kvm) -{ - unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); -} - -/* Returns true if all vcpus were locked, false otherwise */ -bool lock_all_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *tmp_vcpu; - unsigned long c; - - /* - * Any time a vcpu is run, vcpu_load is called which tries to grab the - * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure - * that no other VCPUs are run and fiddle with the vgic state while we - * access it. - */ - kvm_for_each_vcpu(c, tmp_vcpu, kvm) { - if (!mutex_trylock(&tmp_vcpu->mutex)) { - unlock_vcpus(kvm, c - 1); - return false; - } - } - - return true; -} - /** * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state * @@ -411,15 +376,17 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, mutex_lock(&dev->kvm->lock); + if (!lock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + + mutex_lock(&dev->kvm->arch.config_lock); + ret = vgic_init(dev->kvm); if (ret) goto out; - if (!lock_all_vcpus(dev->kvm)) { - ret = -EBUSY; - goto out; - } - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val); @@ -432,8 +399,9 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, break; } - unlock_all_vcpus(dev->kvm); out: + mutex_unlock(&dev->kvm->arch.config_lock); + unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); if (!ret && !is_write) @@ -569,12 +537,14 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, mutex_lock(&dev->kvm->lock); - if (unlikely(!vgic_initialized(dev->kvm))) { - ret = -EBUSY; - goto out; + if (!lock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; } - if (!lock_all_vcpus(dev->kvm)) { + mutex_lock(&dev->kvm->arch.config_lock); + + if (unlikely(!vgic_initialized(dev->kvm))) { ret = -EBUSY; goto out; } @@ -609,8 +579,9 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, break; } - unlock_all_vcpus(dev->kvm); out: + mutex_unlock(&dev->kvm->arch.config_lock); + unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); if (!ret && uaccess && !is_write) { diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 91201f743033..472b18ac92a2 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -111,7 +111,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, case GICD_CTLR: { bool was_enabled, is_hwsgi; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.config_lock); was_enabled = dist->enabled; is_hwsgi = dist->nassgireq; @@ -139,7 +139,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, else if (!was_enabled && dist->enabled) vgic_kick_vcpus(vcpu->kvm); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); break; } case GICD_TYPER: diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index e67b3b2c8044..1939c94e0b24 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -530,13 +530,13 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 val; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); val = __vgic_mmio_read_active(vcpu, addr, len); vgic_access_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); return val; } @@ -625,13 +625,13 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); __vgic_mmio_write_cactive(vcpu, addr, len, val); vgic_access_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); } int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, @@ -662,13 +662,13 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); __vgic_mmio_write_sactive(vcpu, addr, len, val); vgic_access_active_finish(vcpu, intid); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.config_lock); } int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index a413718be92b..3bb003478060 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -232,9 +232,8 @@ int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq) * @kvm: Pointer to the VM being initialized * * We may be called each time a vITS is created, or when the - * vgic is initialized. This relies on kvm->lock to be - * held. In both cases, the number of vcpus should now be - * fixed. + * vgic is initialized. In both cases, the number of vcpus + * should now be fixed. */ int vgic_v4_init(struct kvm *kvm) { @@ -243,6 +242,8 @@ int vgic_v4_init(struct kvm *kvm) int nr_vcpus, ret; unsigned long i; + lockdep_assert_held(&kvm->arch.config_lock); + if (!kvm_vgic_global_state.has_gicv4) return 0; /* Nothing to see here... move along. */ @@ -309,14 +310,14 @@ int vgic_v4_init(struct kvm *kvm) /** * vgic_v4_teardown - Free the GICv4 data structures * @kvm: Pointer to the VM being destroyed - * - * Relies on kvm->lock to be held. */ void vgic_v4_teardown(struct kvm *kvm) { struct its_vm *its_vm = &kvm->arch.vgic.its_vm; int i; + lockdep_assert_held(&kvm->arch.config_lock); + if (!its_vm->vpes) return; diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index d97e6080b421..8be4c1ebdec2 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -24,11 +24,13 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { /* * Locking order is always: * kvm->lock (mutex) - * its->cmd_lock (mutex) - * its->its_lock (mutex) - * vgic_cpu->ap_list_lock must be taken with IRQs disabled - * kvm->lpi_list_lock must be taken with IRQs disabled - * vgic_irq->irq_lock must be taken with IRQs disabled + * vcpu->mutex (mutex) + * kvm->arch.config_lock (mutex) + * its->cmd_lock (mutex) + * its->its_lock (mutex) + * vgic_cpu->ap_list_lock must be taken with IRQs disabled + * kvm->lpi_list_lock must be taken with IRQs disabled + * vgic_irq->irq_lock must be taken with IRQs disabled * * As the ap_list_lock might be taken from the timer interrupt handler, * we have to disable IRQs before taking this lock and everything lower @@ -573,6 +575,21 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) return 0; } +int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid) +{ + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + unsigned long flags; + int ret = -1; + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw) + ret = irq->hwintid; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + return ret; +} + /** * kvm_vgic_set_owner - Set the owner of an interrupt for a VM * diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 7f7f3c5ed85a..f9923beedd27 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -273,9 +273,6 @@ int vgic_init(struct kvm *kvm); void vgic_debug_init(struct kvm *kvm); void vgic_debug_destroy(struct kvm *kvm); -bool lock_all_vcpus(struct kvm *kvm); -void unlock_all_vcpus(struct kvm *kvm); - static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) { struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 37b1340e9646..40ba95472594 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -23,6 +23,7 @@ HAS_DCPOP HAS_DIT HAS_E0PD HAS_ECV +HAS_ECV_CNTPOFF HAS_EPAN HAS_GENERIC_AUTH HAS_GENERIC_AUTH_ARCH_QARMA3 diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 77edce16f4f9..c9a0d1fa3209 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2115,6 +2115,10 @@ Sysreg CONTEXTIDR_EL2 3 4 13 0 1 Fields CONTEXTIDR_ELx EndSysreg +Sysreg CNTPOFF_EL2 3 4 14 0 6 +Field 63:0 PhysicalOffset +EndSysreg + Sysreg CPACR_EL12 3 5 1 0 2 Fields CPACR_ELx EndSysreg diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 2803c9c21ef9..957121a495f0 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -757,7 +757,7 @@ struct kvm_mips_callbacks { int (*vcpu_run)(struct kvm_vcpu *vcpu); void (*vcpu_reenter)(struct kvm_vcpu *vcpu); }; -extern struct kvm_mips_callbacks *kvm_mips_callbacks; +extern const struct kvm_mips_callbacks * const kvm_mips_callbacks; int kvm_mips_emulation_init(void); /* Debug: dump vcpu state */ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 36c8991b5d39..884be4ef99dc 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -993,9 +993,9 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); } -long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { - long r; + int r; switch (ioctl) { default: diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index dafab003ea0d..3d21cbfa7443 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -3305,7 +3305,7 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = { }; /* FIXME: Get rid of the callbacks now that trap-and-emulate is gone. */ -struct kvm_mips_callbacks *kvm_mips_callbacks = &kvm_vz_callbacks; +const struct kvm_mips_callbacks * const kvm_mips_callbacks = &kvm_vz_callbacks; int kvm_mips_emulation_init(void) { diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index bc57d058ad5b..79a9c0bb8bba 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -167,7 +167,7 @@ extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order); extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); -extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order); +extern int kvmppc_alloc_reset_hpt(struct kvm *kvm, int order); extern void kvmppc_free_hpt(struct kvm_hpt_info *info); extern void kvmppc_rmap_reset(struct kvm *kvm); extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, @@ -181,7 +181,7 @@ extern int kvmppc_switch_mmu_to_hpt(struct kvm *kvm); extern int kvmppc_switch_mmu_to_radix(struct kvm *kvm); extern void kvmppc_setup_partition_table(struct kvm *kvm); -extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, +extern int kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); #define kvmppc_ioba_validate(stt, ioba, npages) \ (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ @@ -222,10 +222,10 @@ extern void kvmppc_bookehv_exit(void); extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); -extern long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, - struct kvm_ppc_resize_hpt *rhpt); -extern long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, +extern int kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, struct kvm_ppc_resize_hpt *rhpt); +extern int kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt); int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); @@ -297,8 +297,8 @@ struct kvmppc_ops { int (*emulate_mtspr)(struct kvm_vcpu *vcpu, int sprn, ulong spr_val); int (*emulate_mfspr)(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); void (*fast_vcpu_kick)(struct kvm_vcpu *vcpu); - long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl, - unsigned long arg); + int (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl, + unsigned long arg); int (*hcall_implemented)(unsigned long hcall); int (*irq_bypass_add_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index af1f060533f2..7f765d5ad436 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -124,9 +124,9 @@ void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) info->virt, (long)info->order, kvm->arch.lpid); } -long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) +int kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) { - long err = -EBUSY; + int err = -EBUSY; struct kvm_hpt_info info; mutex_lock(&kvm->arch.mmu_setup_lock); @@ -1482,8 +1482,8 @@ static void resize_hpt_prepare_work(struct work_struct *work) mutex_unlock(&kvm->arch.mmu_setup_lock); } -long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, - struct kvm_ppc_resize_hpt *rhpt) +int kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt) { unsigned long flags = rhpt->flags; unsigned long shift = rhpt->shift; @@ -1548,13 +1548,13 @@ static void resize_hpt_boot_vcpu(void *opaque) /* Nothing to do, just force a KVM exit */ } -long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, - struct kvm_ppc_resize_hpt *rhpt) +int kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt) { unsigned long flags = rhpt->flags; unsigned long shift = rhpt->shift; struct kvm_resize_hpt *resize; - long ret; + int ret; if (flags != 0 || kvm_is_radix(kvm)) return -EINVAL; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 95e738ef9062..93b695b289e9 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -288,8 +288,8 @@ static const struct file_operations kvm_spapr_tce_fops = { .release = kvm_spapr_tce_release, }; -long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce_64 *args) +int kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, + struct kvm_create_spapr_tce_64 *args) { struct kvmppc_spapr_tce_table *stt = NULL; struct kvmppc_spapr_tce_table *siter; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 36b295e908ca..130bafdb1430 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5799,12 +5799,12 @@ static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons, } #endif -static long kvm_arch_vm_ioctl_hv(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int kvm_arch_vm_ioctl_hv(struct file *filp, + unsigned int ioctl, unsigned long arg) { struct kvm *kvm __maybe_unused = filp->private_data; void __user *argp = (void __user *)arg; - long r; + int r; switch (ioctl) { diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index da0e888e2521..9118242063fb 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -2044,8 +2044,8 @@ static int kvmppc_core_check_processor_compat_pr(void) return 0; } -static long kvm_arch_vm_ioctl_pr(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int kvm_arch_vm_ioctl_pr(struct file *filp, + unsigned int ioctl, unsigned long arg) { return -ENOTTY; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2a956b5bab98..7197c8256668 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2379,12 +2379,11 @@ static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp) } #endif -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm __maybe_unused = filp->private_data; void __user *argp = (void __user *)arg; - long r; + int r; switch (ioctl) { case KVM_PPC_GET_PVINFO: { diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 65a964d7e70d..c13130ab459a 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -87,8 +87,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) return r; } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { return -EINVAL; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 9250fde1f97d..da6dac36e959 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -305,7 +305,7 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa) { - return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa; + return READ_ONCE(gisa->next_alert) != (u32)virt_to_phys(gisa); } static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) @@ -3168,7 +3168,7 @@ void kvm_s390_gisa_init(struct kvm *kvm) hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); gi->timer.function = gisa_vcpu_kicker; memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); - gi->origin->next_alert = (u32)(u64)gi->origin; + gi->origin->next_alert = (u32)virt_to_phys(gi->origin); VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1eeb9ae57879..17b81659cdb2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1990,7 +1990,7 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } -static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; @@ -2038,7 +2038,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) return r; } -static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; @@ -2899,8 +2899,7 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) } } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index b124d586db55..7dab00f1e833 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -112,7 +112,7 @@ static int zpci_reset_aipb(u8 nisc) return -EINVAL; aift->sbv = zpci_aif_sbv; - aift->gait = (struct zpci_gaite *)zpci_aipb->aipb.gait; + aift->gait = phys_to_virt(zpci_aipb->aipb.gait); return 0; } diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index b6a0219e470a..8d6b765abf29 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -138,11 +138,15 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* Copy to APCB FORMAT1 from APCB FORMAT0 */ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, - unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h) + unsigned long crycb_gpa, struct kvm_s390_apcb1 *apcb_h) { struct kvm_s390_apcb0 tmp; + unsigned long apcb_gpa; - if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0))) + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, &tmp, + sizeof(struct kvm_s390_apcb0))) return -EFAULT; apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0]; @@ -157,15 +161,19 @@ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0 * @vcpu: pointer to the virtual CPU * @apcb_s: pointer to start of apcb in the shadow crycb - * @apcb_o: pointer to start of original apcb in the guest2 + * @crycb_gpa: guest physical address to start of original guest crycb * @apcb_h: pointer to start of apcb in the guest1 * * Returns 0 and -EFAULT on error reading guest apcb */ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, - unsigned long apcb_o, unsigned long *apcb_h) + unsigned long crycb_gpa, unsigned long *apcb_h) { - if (read_guest_real(vcpu, apcb_o, apcb_s, + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, sizeof(struct kvm_s390_apcb0))) return -EFAULT; @@ -178,16 +186,20 @@ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB * @vcpu: pointer to the virtual CPU * @apcb_s: pointer to start of apcb in the shadow crycb - * @apcb_o: pointer to start of original guest apcb + * @crycb_gpa: guest physical address to start of original guest crycb * @apcb_h: pointer to start of apcb in the host * * Returns 0 and -EFAULT on error reading guest apcb */ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, - unsigned long apcb_o, + unsigned long crycb_gpa, unsigned long *apcb_h) { - if (read_guest_real(vcpu, apcb_o, apcb_s, + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb1); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, sizeof(struct kvm_s390_apcb1))) return -EFAULT; @@ -200,7 +212,7 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * setup_apcb - Create a shadow copy of the apcb. * @vcpu: pointer to the virtual CPU * @crycb_s: pointer to shadow crycb - * @crycb_o: pointer to original guest crycb + * @crycb_gpa: guest physical address of original guest crycb * @crycb_h: pointer to the host crycb * @fmt_o: format of the original guest crycb. * @fmt_h: format of the host crycb. @@ -211,50 +223,46 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * Return 0 or an error number if the guest and host crycb are incompatible. */ static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s, - const u32 crycb_o, + const u32 crycb_gpa, struct kvm_s390_crypto_cb *crycb_h, int fmt_o, int fmt_h) { - struct kvm_s390_crypto_cb *crycb; - - crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o; - switch (fmt_o) { case CRYCB_FORMAT2: - if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK)) + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 256) & PAGE_MASK)) return -EACCES; if (fmt_h != CRYCB_FORMAT2) return -EINVAL; return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1, - (unsigned long) &crycb->apcb1, + crycb_gpa, (unsigned long *)&crycb_h->apcb1); case CRYCB_FORMAT1: switch (fmt_h) { case CRYCB_FORMAT2: return setup_apcb10(vcpu, &crycb_s->apcb1, - (unsigned long) &crycb->apcb0, + crycb_gpa, &crycb_h->apcb1); case CRYCB_FORMAT1: return setup_apcb00(vcpu, (unsigned long *) &crycb_s->apcb0, - (unsigned long) &crycb->apcb0, + crycb_gpa, (unsigned long *) &crycb_h->apcb0); } break; case CRYCB_FORMAT0: - if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK)) + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 32) & PAGE_MASK)) return -EACCES; switch (fmt_h) { case CRYCB_FORMAT2: return setup_apcb10(vcpu, &crycb_s->apcb1, - (unsigned long) &crycb->apcb0, + crycb_gpa, &crycb_h->apcb1); case CRYCB_FORMAT1: case CRYCB_FORMAT0: return setup_apcb00(vcpu, (unsigned long *) &crycb_s->apcb0, - (unsigned long) &crycb->apcb0, + crycb_gpa, (unsigned long *) &crycb_h->apcb0); } } diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 353b054812de..cb8ca46213be 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -226,10 +226,9 @@ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ @@ -338,6 +337,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */ #define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ #define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ @@ -370,6 +370,7 @@ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ #define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ +#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 8dc345cc6318..13bc212cd4bc 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -54,8 +54,8 @@ KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) KVM_X86_OP(flush_tlb_all) KVM_X86_OP(flush_tlb_current) -KVM_X86_OP_OPTIONAL(tlb_remote_flush) -KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range) +KVM_X86_OP_OPTIONAL(flush_remote_tlbs) +KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range) KVM_X86_OP(flush_tlb_gva) KVM_X86_OP(flush_tlb_guest) KVM_X86_OP(vcpu_pre_run) @@ -68,6 +68,8 @@ KVM_X86_OP(get_interrupt_shadow) KVM_X86_OP(patch_hypercall) KVM_X86_OP(inject_irq) KVM_X86_OP(inject_nmi) +KVM_X86_OP_OPTIONAL_RET0(is_vnmi_pending) +KVM_X86_OP_OPTIONAL_RET0(set_vnmi_pending) KVM_X86_OP(inject_exception) KVM_X86_OP(cancel_injection) KVM_X86_OP(interrupt_allowed) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 808c292ad3f4..fb9d1f2d6136 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -420,6 +420,10 @@ struct kvm_mmu_root_info { #define KVM_MMU_NUM_PREV_ROOTS 3 +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) +#define KVM_MMU_ROOTS_ALL (BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1) + #define KVM_HAVE_MMU_RWLOCK struct kvm_mmu_page; @@ -439,9 +443,8 @@ struct kvm_mmu { gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t gva_or_gpa, u64 access, struct x86_exception *exception); - int (*sync_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); + int (*sync_spte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, int i); struct kvm_mmu_root_info root; union kvm_cpu_role cpu_role; union kvm_mmu_page_role root_role; @@ -479,11 +482,6 @@ struct kvm_mmu { u64 pdptrs[4]; /* pae */ }; -struct kvm_tlb_range { - u64 start_gfn; - u64 pages; -}; - enum pmc_type { KVM_PMC_GP = 0, KVM_PMC_FIXED, @@ -515,6 +513,7 @@ struct kvm_pmc { #define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) #define KVM_AMD_PMC_MAX_GENERIC 6 struct kvm_pmu { + u8 version; unsigned nr_arch_gp_counters; unsigned nr_arch_fixed_counters; unsigned available_event_types; @@ -527,7 +526,6 @@ struct kvm_pmu { u64 global_ovf_ctrl_mask; u64 reserved_bits; u64 raw_event_mask; - u8 version; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; struct irq_work irq_work; @@ -876,7 +874,8 @@ struct kvm_vcpu_arch { u64 tsc_scaling_ratio; /* current scaling ratio */ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ - unsigned nmi_pending; /* NMI queued after currently running handler */ + /* Number of NMIs pending injection, not including hardware vNMIs. */ + unsigned int nmi_pending; bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ u8 handling_intr_from_guest; @@ -947,23 +946,6 @@ struct kvm_vcpu_arch { u64 msr_kvm_poll_control; - /* - * Indicates the guest is trying to write a gfn that contains one or - * more of the PTEs used to translate the write itself, i.e. the access - * is changing its own translation in the guest page tables. KVM exits - * to userspace if emulation of the faulting instruction fails and this - * flag is set, as KVM cannot make forward progress. - * - * If emulation fails for a write to guest page tables, KVM unprotects - * (zaps) the shadow page for the target gfn and resumes the guest to - * retry the non-emulatable instruction (on hardware). Unprotecting the - * gfn doesn't allow forward progress for a self-changing access because - * doing so also zaps the translation for the gfn, i.e. retrying the - * instruction will hit a !PRESENT fault, which results in a new shadow - * page and sends KVM back to square one. - */ - bool write_fault_to_shadow_pgtable; - /* set at EPT violation at this point */ unsigned long exit_qualification; @@ -1602,9 +1584,9 @@ struct kvm_x86_ops { void (*flush_tlb_all)(struct kvm_vcpu *vcpu); void (*flush_tlb_current)(struct kvm_vcpu *vcpu); - int (*tlb_remote_flush)(struct kvm *kvm); - int (*tlb_remote_flush_with_range)(struct kvm *kvm, - struct kvm_tlb_range *range); + int (*flush_remote_tlbs)(struct kvm *kvm); + int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn, + gfn_t nr_pages); /* * Flush any TLB entries associated with the given GVA. @@ -1638,6 +1620,13 @@ struct kvm_x86_ops { int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); + /* Whether or not a virtual NMI is pending in hardware. */ + bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu); + /* + * Attempt to pend a virtual NMI in harware. Returns %true on success + * to allow using static_call_ret0 as the fallback. + */ + bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); @@ -1808,8 +1797,8 @@ void kvm_arch_free_vm(struct kvm *kvm); #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) { - if (kvm_x86_ops.tlb_remote_flush && - !static_call(kvm_x86_tlb_remote_flush)(kvm)) + if (kvm_x86_ops.flush_remote_tlbs && + !static_call(kvm_x86_flush_remote_tlbs)(kvm)) return 0; else return -ENOTSUPP; @@ -1907,6 +1896,25 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility * state and inject single-step #DBs after skipping * an instruction (after completing userspace I/O). + * + * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that + * is attempting to write a gfn that contains one or + * more of the PTEs used to translate the write itself, + * and the owning page table is being shadowed by KVM. + * If emulation of the faulting instruction fails and + * this flag is set, KVM will exit to userspace instead + * of retrying emulation as KVM cannot make forward + * progress. + * + * If emulation fails for a write to guest page tables, + * KVM unprotects (zaps) the shadow page for the target + * gfn and resumes the guest to retry the non-emulatable + * instruction (on hardware). Unprotecting the gfn + * doesn't allow forward progress for a self-changing + * access because doing so also zaps the translation for + * the gfn, i.e. retrying the instruction will hit a + * !PRESENT fault, which results in a new shadow page + * and sends KVM back to square one. */ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) @@ -1916,6 +1924,7 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); #define EMULTYPE_VMWARE_GP (1 << 5) #define EMULTYPE_PF (1 << 6) #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) +#define EMULTYPE_WRITE_PF_TO_SP (1 << 8) int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, @@ -1994,14 +2003,11 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } -#define KVM_MMU_ROOT_CURRENT BIT(0) -#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) -#define KVM_MMU_ROOTS_ALL (~0UL) - int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); +int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); @@ -2041,8 +2047,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); -void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t gva, hpa_t root_hpa); +void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + u64 addr, unsigned long roots); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); @@ -2204,4 +2210,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) +/* + * KVM previously used a u32 field in kvm_run to indicate the hypercall was + * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the + * remaining 31 lower bits must be 0 to preserve ABI. + */ +#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 770dcf75eaa9..e7c7379d6ac7 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -183,6 +183,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_GIF_SHIFT 9 #define V_GIF_MASK (1 << V_GIF_SHIFT) +#define V_NMI_PENDING_SHIFT 11 +#define V_NMI_PENDING_MASK (1 << V_NMI_PENDING_SHIFT) + +#define V_NMI_BLOCKING_SHIFT 12 +#define V_NMI_BLOCKING_MASK (1 << V_NMI_BLOCKING_SHIFT) + #define V_INTR_PRIO_SHIFT 16 #define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) @@ -197,6 +203,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_GIF_ENABLE_SHIFT 25 #define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT) +#define V_NMI_ENABLE_SHIFT 26 +#define V_NMI_ENABLE_MASK (1 << V_NMI_ENABLE_SHIFT) + #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) @@ -278,7 +287,6 @@ static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID); #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL struct vmcb_seg { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7f467fe05d42..1a6a1f987949 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -559,4 +559,7 @@ struct kvm_pmu_event_filter { #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ +/* x86-specific KVM_EXIT_HYPERCALL flags. */ +#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 599aebec2d52..123bf8b97a4b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -60,12 +60,6 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } -/* - * This one is tied to SSB in the user API, and not - * visible in /proc/cpuinfo. - */ -#define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ - #define F feature_bit /* Scattered Flag - For features that are scattered by cpufeatures.h. */ @@ -266,7 +260,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) cpuid_entry_change(best, X86_FEATURE_OSXSAVE, - kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)); + kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)); cpuid_entry_change(best, X86_FEATURE_APIC, vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); @@ -275,7 +269,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e best = cpuid_entry2_find(entries, nent, 7, 0); if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) cpuid_entry_change(best, X86_FEATURE_OSPKE, - kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); + kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)); best = cpuid_entry2_find(entries, nent, 0xD, 0); if (best) @@ -420,7 +414,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, * KVM_SET_CPUID{,2} again. To support this legacy behavior, check * whether the supplied CPUID data is equal to what's already set. */ - if (vcpu->arch.last_vmentry_cpu != -1) { + if (kvm_vcpu_has_run(vcpu)) { r = kvm_cpuid_check_equal(vcpu, e2, nent); if (r) return r; @@ -653,7 +647,7 @@ void kvm_set_cpu_caps(void) F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ @@ -715,7 +709,7 @@ void kvm_set_cpu_caps(void) F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | - __feature_bit(KVM_X86_FEATURE_AMD_PSFD) + F(AMD_PSFD) ); /* @@ -1002,7 +996,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = 0; break; case 0xd: { - u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); + u64 permitted_xcr0 = kvm_get_filtered_xcr0(); u64 permitted_xss = kvm_caps.supported_xss; entry->eax &= permitted_xcr0; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a20bec931764..936a397a08cd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1640,6 +1640,14 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; break; case VCPU_SREG_CS: + /* + * KVM uses "none" when loading CS as part of emulating Real + * Mode exceptions and IRET (handled above). In all other + * cases, loading CS without a control transfer is a KVM bug. + */ + if (WARN_ON_ONCE(transfer == X86_TRANSFER_NONE)) + goto exception; + if (!(seg_desc.type & 8)) goto exception; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 4c91f626c058..75eae9c4998a 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -4,7 +4,7 @@ #include <linux/kvm_host.h> -#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP) #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) @@ -157,6 +157,14 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) return vcpu->arch.cr0 & mask; } +static __always_inline bool kvm_is_cr0_bit_set(struct kvm_vcpu *vcpu, + unsigned long cr0_bit) +{ + BUILD_BUG_ON(!is_power_of_2(cr0_bit)); + + return !!kvm_read_cr0_bits(vcpu, cr0_bit); +} + static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) { return kvm_read_cr0_bits(vcpu, ~0UL); @@ -171,6 +179,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) return vcpu->arch.cr4 & mask; } +static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu, + unsigned long cr4_bit) +{ + BUILD_BUG_ON(!is_power_of_2(cr4_bit)); + + return !!kvm_read_cr4_bits(vcpu, cr4_bit); +} + static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index 482d6639ef88..ded0bd688c65 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -10,17 +10,22 @@ #include "hyperv.h" #include "kvm_onhyperv.h" +struct kvm_hv_tlb_range { + u64 start_gfn; + u64 pages; +}; + static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, void *data) { - struct kvm_tlb_range *range = data; + struct kvm_hv_tlb_range *range = data; return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn, range->pages); } static inline int hv_remote_flush_root_tdp(hpa_t root_tdp, - struct kvm_tlb_range *range) + struct kvm_hv_tlb_range *range) { if (range) return hyperv_flush_guest_mapping_range(root_tdp, @@ -29,8 +34,8 @@ static inline int hv_remote_flush_root_tdp(hpa_t root_tdp, return hyperv_flush_guest_mapping(root_tdp); } -int hv_remote_flush_tlb_with_range(struct kvm *kvm, - struct kvm_tlb_range *range) +static int __hv_flush_remote_tlbs_range(struct kvm *kvm, + struct kvm_hv_tlb_range *range) { struct kvm_arch *kvm_arch = &kvm->arch; struct kvm_vcpu *vcpu; @@ -86,19 +91,29 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm, spin_unlock(&kvm_arch->hv_root_tdp_lock); return ret; } -EXPORT_SYMBOL_GPL(hv_remote_flush_tlb_with_range); -int hv_remote_flush_tlb(struct kvm *kvm) +int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages) +{ + struct kvm_hv_tlb_range range = { + .start_gfn = start_gfn, + .pages = nr_pages, + }; + + return __hv_flush_remote_tlbs_range(kvm, &range); +} +EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs_range); + +int hv_flush_remote_tlbs(struct kvm *kvm) { - return hv_remote_flush_tlb_with_range(kvm, NULL); + return __hv_flush_remote_tlbs_range(kvm, NULL); } -EXPORT_SYMBOL_GPL(hv_remote_flush_tlb); +EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) { struct kvm_arch *kvm_arch = &vcpu->kvm->arch; - if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { + if (kvm_x86_ops.flush_remote_tlbs == hv_flush_remote_tlbs) { spin_lock(&kvm_arch->hv_root_tdp_lock); vcpu->arch.hv_root_tdp = root_tdp; if (root_tdp != kvm_arch->hv_root_tdp) diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index 6272dabec02d..f9ca3e7432b2 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -7,12 +7,11 @@ #define __ARCH_X86_KVM_KVM_ONHYPERV_H__ #if IS_ENABLED(CONFIG_HYPERV) -int hv_remote_flush_tlb_with_range(struct kvm *kvm, - struct kvm_tlb_range *range); -int hv_remote_flush_tlb(struct kvm *kvm); +int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); +int hv_flush_remote_tlbs(struct kvm *kvm); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); #else /* !CONFIG_HYPERV */ -static inline int hv_remote_flush_tlb(struct kvm *kvm) +static inline int hv_flush_remote_tlbs(struct kvm *kvm) { return -EOPNOTSUPP; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 168c46fd8dd1..92d5a1924fc1 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -113,6 +113,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); +void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); @@ -132,7 +134,7 @@ static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3) { BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0); - return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE) + return kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE) ? cr3 & X86_CR3_PCID_MASK : 0; } @@ -153,6 +155,24 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) vcpu->arch.mmu->root_role.level); } +static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) +{ + /* + * When EPT is enabled, KVM may passthrough CR0.WP to the guest, i.e. + * @mmu's snapshot of CR0.WP and thus all related paging metadata may + * be stale. Refresh CR0.WP and the metadata on-demand when checking + * for permission faults. Exempt nested MMUs, i.e. MMUs for shadowing + * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM does + * need to refresh nested_mmu, a.k.a. the walker used to translate L2 + * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP. + */ + if (!tdp_enabled || mmu == &vcpu->arch.guest_mmu) + return; + + __kvm_mmu_refresh_passthrough_bits(vcpu, mmu); +} + /* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE @@ -184,8 +204,12 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; - bool fault = (mmu->permissions[index] >> pte_access) & 1; u32 errcode = PFERR_PRESENT_MASK; + bool fault; + + kvm_mmu_refresh_passthrough_bits(vcpu, mmu); + + fault = (mmu->permissions[index] >> pte_access) & 1; WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); if (unlikely(mmu->pkru_mask)) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8ebe542c565..c8961f45e3b1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -125,17 +125,31 @@ module_param(dbg, bool, 0644); #define PTE_LIST_EXT 14 /* - * Slight optimization of cacheline layout, by putting `more' and `spte_count' - * at the start; then accessing it will only use one single cacheline for - * either full (entries==PTE_LIST_EXT) case or entries<=6. + * struct pte_list_desc is the core data structure used to implement a custom + * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a + * given GFN when used in the context of rmaps. Using a custom list allows KVM + * to optimize for the common case where many GFNs will have at most a handful + * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small + * memory footprint, which in turn improves runtime performance by exploiting + * cache locality. + * + * A list is comprised of one or more pte_list_desc objects (descriptors). + * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor + * is full and a new SPTEs needs to be added, a new descriptor is allocated and + * becomes the head of the list. This means that by definitions, all tail + * descriptors are full. + * + * Note, the meta data fields are deliberately placed at the start of the + * structure to optimize the cacheline layout; accessing the descriptor will + * touch only a single cacheline so long as @spte_count<=6 (or if only the + * descriptors metadata is accessed). */ struct pte_list_desc { struct pte_list_desc *more; - /* - * Stores number of entries stored in the pte_list_desc. No need to be - * u64 but just for easier alignment. When PTE_LIST_EXT, means full. - */ - u64 spte_count; + /* The number of PTEs stored in _this_ descriptor. */ + u32 spte_count; + /* The number of PTEs stored in all tails of this descriptor. */ + u32 tail_count; u64 *sptes[PTE_LIST_EXT]; }; @@ -242,32 +256,35 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) return regs; } -static inline bool kvm_available_flush_tlb_with_range(void) +static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.tlb_remote_flush_with_range; + return kvm_read_cr3(vcpu); } -static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, - struct kvm_tlb_range *range) +static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) { - int ret = -ENOTSUPP; - - if (range && kvm_x86_ops.tlb_remote_flush_with_range) - ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range); + if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3) + return kvm_read_cr3(vcpu); - if (ret) - kvm_flush_remote_tlbs(kvm); + return mmu->get_guest_pgd(vcpu); } -void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, - u64 start_gfn, u64 pages) +static inline bool kvm_available_flush_remote_tlbs_range(void) { - struct kvm_tlb_range range; + return kvm_x86_ops.flush_remote_tlbs_range; +} - range.start_gfn = start_gfn; - range.pages = pages; +void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, + gfn_t nr_pages) +{ + int ret = -EOPNOTSUPP; - kvm_flush_remote_tlbs_with_range(kvm, &range); + if (kvm_x86_ops.flush_remote_tlbs_range) + ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn, + nr_pages); + if (ret) + kvm_flush_remote_tlbs(kvm); } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); @@ -888,9 +905,9 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) untrack_possible_nx_huge_page(kvm, sp); } -static struct kvm_memory_slot * -gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) +static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, + gfn_t gfn, + bool no_dirty_log) { struct kvm_memory_slot *slot; @@ -929,53 +946,69 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; + desc->tail_count = 0; rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->spte_count == PTE_LIST_EXT) { - count += PTE_LIST_EXT; - if (!desc->more) { - desc->more = kvm_mmu_memory_cache_alloc(cache); - desc = desc->more; - desc->spte_count = 0; - break; - } - desc = desc->more; + count = desc->tail_count + desc->spte_count; + + /* + * If the previous head is full, allocate a new head descriptor + * as tail descriptors are always kept full. + */ + if (desc->spte_count == PTE_LIST_EXT) { + desc = kvm_mmu_memory_cache_alloc(cache); + desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc->spte_count = 0; + desc->tail_count = count; + rmap_head->val = (unsigned long)desc | 1; } - count += desc->spte_count; desc->sptes[desc->spte_count++] = spte; } return count; } -static void -pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, - struct pte_list_desc *desc, int i, - struct pte_list_desc *prev_desc) +static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, + struct pte_list_desc *desc, int i) { - int j = desc->spte_count - 1; + struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + int j = head_desc->spte_count - 1; + + /* + * The head descriptor should never be empty. A new head is added only + * when adding an entry and the previous head is full, and heads are + * removed (this flow) when they become empty. + */ + BUG_ON(j < 0); - desc->sptes[i] = desc->sptes[j]; - desc->sptes[j] = NULL; - desc->spte_count--; - if (desc->spte_count) + /* + * Replace the to-be-freed SPTE with the last valid entry from the head + * descriptor to ensure that tail descriptors are full at all times. + * Note, this also means that tail_count is stable for each descriptor. + */ + desc->sptes[i] = head_desc->sptes[j]; + head_desc->sptes[j] = NULL; + head_desc->spte_count--; + if (head_desc->spte_count) return; - if (!prev_desc && !desc->more) + + /* + * The head descriptor is empty. If there are no tail descriptors, + * nullify the rmap head to mark the list as emtpy, else point the rmap + * head at the next descriptor, i.e. the new head. + */ + if (!head_desc->more) rmap_head->val = 0; else - if (prev_desc) - prev_desc->more = desc->more; - else - rmap_head->val = (unsigned long)desc->more | 1; - mmu_free_pte_list_desc(desc); + rmap_head->val = (unsigned long)head_desc->more | 1; + mmu_free_pte_list_desc(head_desc); } static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - struct pte_list_desc *prev_desc; int i; if (!rmap_head->val) { @@ -991,16 +1024,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } else { rmap_printk("%p many->many\n", spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - prev_desc = NULL; while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(rmap_head, - desc, i, prev_desc); + pte_list_desc_remove_entry(rmap_head, desc, i); return; } } - prev_desc = desc; desc = desc->more; } pr_err("%s: %p many->many\n", __func__, spte); @@ -1047,7 +1077,6 @@ out: unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - unsigned int count = 0; if (!rmap_head->val) return 0; @@ -1055,13 +1084,7 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) return 1; desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - - while (desc) { - count += desc->spte_count; - desc = desc->more; - } - - return count; + return desc->tail_count + desc->spte_count; } static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, @@ -1073,14 +1096,6 @@ static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } -static bool rmap_can_add(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_memory_cache *mc; - - mc = &vcpu->arch.mmu_pte_list_desc_cache; - return kvm_mmu_memory_cache_nr_free_objects(mc); -} - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_memslots *slots; @@ -1479,7 +1494,7 @@ restart: } } - if (need_flush && kvm_available_flush_tlb_with_range()) { + if (need_flush && kvm_available_flush_remote_tlbs_range()) { kvm_flush_remote_tlbs_gfn(kvm, gfn, level); return false; } @@ -1504,8 +1519,8 @@ struct slot_rmap_walk_iterator { struct kvm_rmap_head *end_rmap; }; -static void -rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) +static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, + int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; @@ -1513,10 +1528,10 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } -static void -slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, - const struct kvm_memory_slot *slot, int start_level, - int end_level, gfn_t start_gfn, gfn_t end_gfn) +static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, + const struct kvm_memory_slot *slot, + int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; iterator->start_level = start_level; @@ -1789,12 +1804,6 @@ static void mark_unsync(u64 *spte) kvm_mmu_mark_parents_unsync(sp); } -static int nonpaging_sync_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - return -1; -} - #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { @@ -1914,10 +1923,79 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp) &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else +static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; + + /* + * Ignore various flags when verifying that it's safe to sync a shadow + * page using the current MMU context. + * + * - level: not part of the overall MMU role and will never match as the MMU's + * level tracks the root level + * - access: updated based on the new guest PTE + * - quadrant: not part of the overall MMU role (similar to level) + */ + const union kvm_mmu_page_role sync_role_ign = { + .level = 0xf, + .access = 0x7, + .quadrant = 0x3, + .passthrough = 0x1, + }; + + /* + * Direct pages can never be unsync, and KVM should never attempt to + * sync a shadow page for a different MMU context, e.g. if the role + * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the + * reserved bits checks will be wrong, etc... + */ + if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte || + (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) + return false; + + return true; +} + +static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) +{ + if (!sp->spt[i]) + return 0; + + return vcpu->arch.mmu->sync_spte(vcpu, sp, i); +} + +static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + int flush = 0; + int i; + + if (!kvm_sync_page_check(vcpu, sp)) + return -1; + + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { + int ret = kvm_sync_spte(vcpu, sp, i); + + if (ret < -1) + return -1; + flush |= ret; + } + + /* + * Note, any flush is purely for KVM's correctness, e.g. when dropping + * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier + * unmap or dirty logging event doesn't fail to flush. The guest is + * responsible for flushing the TLB to ensure any changes in protection + * bits are recognized, i.e. until the guest flushes or page faults on + * a relevant address, KVM is architecturally allowed to let vCPUs use + * cached translations with the old protection bits. + */ + return flush; +} + static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - int ret = vcpu->arch.mmu->sync_page(vcpu, sp); + int ret = __kvm_sync_page(vcpu, sp); if (ret < 0) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); @@ -3304,9 +3382,9 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) * Returns true if the SPTE was fixed successfully. Otherwise, * someone else modified the SPTE from its original value. */ -static bool -fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - u64 *sptep, u64 old_spte, u64 new_spte) +static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + u64 *sptep, u64 old_spte, u64 new_spte) { /* * Theoretically we could also set dirty bit (and flush TLB) here in @@ -3513,6 +3591,8 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, LIST_HEAD(invalid_list); bool free_active_root; + WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL); + BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); /* Before acquiring the MMU lock, see if we need to do any real work. */ @@ -3731,7 +3811,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) int quadrant, i, r; hpa_t root; - root_pgd = mmu->get_guest_pgd(vcpu); + root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); root_gfn = root_pgd >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) @@ -4181,7 +4261,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, arch.token = alloc_apf_token(vcpu); arch.gfn = gfn; arch.direct_map = vcpu->arch.mmu->root_role.direct; - arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); + arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); return kvm_setup_async_pf(vcpu, cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); @@ -4200,10 +4280,10 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) return; if (!vcpu->arch.mmu->root_role.direct && - work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) + work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); + kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); } static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -4469,8 +4549,7 @@ static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; - context->sync_page = nonpaging_sync_page; - context->invlpg = NULL; + context->sync_spte = NULL; } static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, @@ -4604,11 +4683,6 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); -static unsigned long get_cr3(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr3(vcpu); -} - static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) { @@ -4638,10 +4712,9 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, #include "paging_tmpl.h" #undef PTTYPE -static void -__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, - u64 pa_bits_rsvd, int level, bool nx, bool gbpages, - bool pse, bool amd) +static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, + u64 pa_bits_rsvd, int level, bool nx, + bool gbpages, bool pse, bool amd) { u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; @@ -4754,9 +4827,9 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, guest_cpuid_is_amd_or_hygon(vcpu)); } -static void -__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, - u64 pa_bits_rsvd, bool execonly, int huge_page_level) +static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, + u64 pa_bits_rsvd, bool execonly, + int huge_page_level) { u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); u64 large_1g_rsvd = 0, large_2m_rsvd = 0; @@ -4856,8 +4929,7 @@ static inline bool boot_cpu_is_amd(void) * the direct page table on host, use as much mmu features as * possible, however, kvm currently does not do execution-protection. */ -static void -reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) +static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) { struct rsvd_bits_validate *shadow_zero_check; int i; @@ -5060,20 +5132,18 @@ static void paging64_init_context(struct kvm_mmu *context) { context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; - context->sync_page = paging64_sync_page; - context->invlpg = paging64_invlpg; + context->sync_spte = paging64_sync_spte; } static void paging32_init_context(struct kvm_mmu *context) { context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; - context->sync_page = paging32_sync_page; - context->invlpg = paging32_invlpg; + context->sync_spte = paging32_sync_spte; } -static union kvm_cpu_role -kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs) +static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, + const struct kvm_mmu_role_regs *regs) { union kvm_cpu_role role = {0}; @@ -5112,6 +5182,21 @@ kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs) return role; } +void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) +{ + const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP); + + BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP); + BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS)); + + if (is_cr0_wp(mmu) == cr0_wp) + return; + + mmu->cpu_role.base.cr0_wp = cr0_wp; + reset_guest_paging_metadata(vcpu, mmu); +} + static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { /* tdp_root_level is architecture forced level, use it if nonzero */ @@ -5157,9 +5242,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; context->page_fault = kvm_tdp_page_fault; - context->sync_page = nonpaging_sync_page; - context->invlpg = NULL; - context->get_guest_pgd = get_cr3; + context->sync_spte = NULL; + context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; @@ -5289,8 +5373,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; - context->sync_page = ept_sync_page; - context->invlpg = ept_invlpg; + context->sync_spte = ept_sync_spte; update_permission_bitmask(context, true); context->pkru_mask = 0; @@ -5309,7 +5392,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu, kvm_init_shadow_mmu(vcpu, cpu_role); - context->get_guest_pgd = get_cr3; + context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; } @@ -5323,7 +5406,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, return; g_context->cpu_role.as_u64 = new_mode.as_u64; - g_context->get_guest_pgd = get_cr3; + g_context->get_guest_pgd = get_guest_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; @@ -5331,7 +5414,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, * L2 page tables are never shadowed, so there is no need to sync * SPTEs. */ - g_context->invlpg = NULL; + g_context->sync_spte = NULL; /* * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using @@ -5393,7 +5476,7 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * Changing guest CPUID after KVM_RUN is forbidden, see the comment in * kvm_arch_vcpu_ioctl(). */ - KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm); + KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -5664,7 +5747,8 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, - lower_32_bits(error_code), false); + lower_32_bits(error_code), false, + &emulation_type); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } @@ -5706,48 +5790,77 @@ emulate: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); -void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t gva, hpa_t root_hpa) +static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + u64 addr, hpa_t root_hpa) +{ + struct kvm_shadow_walk_iterator iterator; + + vcpu_clear_mmio_info(vcpu, addr); + + if (!VALID_PAGE(root_hpa)) + return; + + write_lock(&vcpu->kvm->mmu_lock); + for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) { + struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep); + + if (sp->unsync) { + int ret = kvm_sync_spte(vcpu, sp, iterator.index); + + if (ret < 0) + mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL); + if (ret) + kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep); + } + + if (!sp->unsync_children) + break; + } + write_unlock(&vcpu->kvm->mmu_lock); +} + +void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + u64 addr, unsigned long roots) { int i; + WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL); + /* It's actually a GPA for vcpu->arch.guest_mmu. */ if (mmu != &vcpu->arch.guest_mmu) { /* INVLPG on a non-canonical address is a NOP according to the SDM. */ - if (is_noncanonical_address(gva, vcpu)) + if (is_noncanonical_address(addr, vcpu)) return; - static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); + static_call(kvm_x86_flush_tlb_gva)(vcpu, addr); } - if (!mmu->invlpg) + if (!mmu->sync_spte) return; - if (root_hpa == INVALID_PAGE) { - mmu->invlpg(vcpu, gva, mmu->root.hpa); + if (roots & KVM_MMU_ROOT_CURRENT) + __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa); - /* - * INVLPG is required to invalidate any global mappings for the VA, - * irrespective of PCID. Since it would take us roughly similar amount - * of work to determine whether any of the prev_root mappings of the VA - * is marked global, or to just sync it blindly, so we might as well - * just always sync it. - * - * Mappings not reachable via the current cr3 or the prev_roots will be - * synced when switching to that cr3, so nothing needs to be done here - * for them. - */ - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (VALID_PAGE(mmu->prev_roots[i].hpa)) - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); - } else { - mmu->invlpg(vcpu, gva, root_hpa); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (roots & KVM_MMU_ROOT_PREVIOUS(i)) + __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa); } } +EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. Blindly sync all roots as it would take + * roughly the same amount of work/time to determine whether any of the + * previous roots have a global mapping. + * + * Mappings not reachable via the current or previous cached roots will + * be synced when switching to that new cr3, so nothing needs to be + * done here for them. + */ + kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -5756,27 +5869,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; - bool tlb_flush = false; + unsigned long roots = 0; uint i; - if (pcid == kvm_get_active_pcid(vcpu)) { - if (mmu->invlpg) - mmu->invlpg(vcpu, gva, mmu->root.hpa); - tlb_flush = true; - } + if (pcid == kvm_get_active_pcid(vcpu)) + roots |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { - if (mmu->invlpg) - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); - tlb_flush = true; - } + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) + roots |= KVM_MMU_ROOT_PREVIOUS(i); } - if (tlb_flush) - static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); - + if (roots) + kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots); ++vcpu->stat.invlpg; /* @@ -5813,29 +5919,30 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, +typedef bool (*slot_rmaps_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot); -/* The caller should hold mmu-lock before calling this function. */ -static __always_inline bool -slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, - bool flush) +static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, + bool flush_on_yield, bool flush) { struct slot_rmap_walk_iterator iterator; - for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + lockdep_assert_held_write(&kvm->mmu_lock); + + for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, end_gfn, &iterator) { if (iterator.rmap) - flush |= fn(kvm, iterator.rmap, memslot); + flush |= fn(kvm, iterator.rmap, slot); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && flush_on_yield) { - kvm_flush_remote_tlbs_with_address(kvm, - start_gfn, - iterator.gfn - start_gfn + 1); + kvm_flush_remote_tlbs_range(kvm, start_gfn, + iterator.gfn - start_gfn + 1); flush = false; } cond_resched_rwlock_write(&kvm->mmu_lock); @@ -5845,23 +5952,23 @@ slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, return flush; } -static __always_inline bool -slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - bool flush_on_yield) +static __always_inline bool walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + bool flush_on_yield) { - return slot_handle_level_range(kvm, memslot, fn, start_level, - end_level, memslot->base_gfn, - memslot->base_gfn + memslot->npages - 1, - flush_on_yield, false); + return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, + slot->base_gfn, slot->base_gfn + slot->npages - 1, + flush_on_yield, false); } -static __always_inline bool -slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, bool flush_on_yield) +static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + bool flush_on_yield) { - return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, - PG_LEVEL_4K, flush_on_yield); + return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); } static void free_mmu_pages(struct kvm_mmu *mmu) @@ -6156,9 +6263,9 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e if (WARN_ON_ONCE(start >= end)) continue; - flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap, - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, true, flush); } } @@ -6190,8 +6297,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start); + kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); kvm_mmu_invalidate_end(kvm, 0, -1ul); @@ -6211,8 +6317,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, false); + walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, false); write_unlock(&kvm->mmu_lock); } @@ -6447,10 +6553,9 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, * all the way to the target level. There's no need to split pages * already at the target level. */ - for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) { - slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages, - level, level, start, end - 1, true, false); - } + for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) + __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, + level, level, start, end - 1, true, false); } /* Must be called with the mmu_lock held in write-mode. */ @@ -6529,7 +6634,7 @@ restart: PG_LEVEL_NUM)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); - if (kvm_available_flush_tlb_with_range()) + if (kvm_available_flush_remote_tlbs_range()) kvm_flush_remote_tlbs_sptep(kvm, sptep); else need_tlb_flush = 1; @@ -6548,8 +6653,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap * pages that are already mapped at the maximum hugepage level. */ - if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte, - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) + if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); } @@ -6580,8 +6685,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, * is observed by any other operation on the same memslot. */ lockdep_assert_held(&kvm->slots_lock); - kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, - memslot->npages); + kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages); } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, @@ -6593,7 +6697,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, * Clear dirty bits only on 4k SPTEs since the legacy MMU only * support dirty logging at a 4k granularity. */ - slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); + walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } @@ -6663,8 +6767,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) } } -static unsigned long -mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long mmu_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct kvm *kvm; int nr_to_scan = sc->nr_to_scan; @@ -6722,8 +6826,8 @@ unlock: return freed; } -static unsigned long -mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long mmu_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) { return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index cc58631e2336..d39af5639ce9 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -170,14 +170,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); -void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, - u64 start_gfn, u64 pages); +void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, + gfn_t nr_pages); /* Flush the given page (huge or not) of guest memory. */ static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level) { - kvm_flush_remote_tlbs_with_address(kvm, gfn_round_for_level(gfn, level), - KVM_PAGES_PER_HPAGE(level)); + kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level), + KVM_PAGES_PER_HPAGE(level)); } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); @@ -240,6 +240,13 @@ struct kvm_page_fault { kvm_pfn_t pfn; hva_t hva; bool map_writable; + + /* + * Indicates the guest is trying to write a gfn that contains one or + * more of the PTEs used to translate the write itself, i.e. the access + * is changing its own translation in the guest page tables. + */ + bool write_fault_to_shadow_pgtable; }; int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); @@ -273,7 +280,7 @@ enum { }; static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefetch) + u32 err, bool prefetch, int *emulation_type) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, @@ -312,6 +319,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, else r = vcpu->arch.mmu->page_fault(vcpu, &fault); + if (fault.write_fault_to_shadow_pgtable && emulation_type) + *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; + /* * Similar to above, prefetch faults aren't truly spurious, and the * async #PF path doesn't do emulation. Do count faults that are fixed diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 57f0b75c80f9..0662e0278e70 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -324,7 +324,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: walker->level = mmu->cpu_role.base.level; - pte = mmu->get_guest_pgd(vcpu); + pte = kvm_mmu_get_guest_pgd(vcpu, mmu); have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 @@ -519,7 +519,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, pt_element_t gpte, bool no_dirty_log) + u64 *spte, pt_element_t gpte) { struct kvm_memory_slot *slot; unsigned pte_access; @@ -535,8 +535,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, - no_dirty_log && (pte_access & ACC_WRITE_MASK)); + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK); if (!slot) return false; @@ -605,7 +604,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (is_shadow_present_pte(*spte)) continue; - if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i])) break; } } @@ -685,8 +684,17 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); + + if (fault->write && table_gfn == fault->gfn) + fault->write_fault_to_shadow_pgtable = true; } + /* + * Adjust the hugepage size _after_ resolving indirect shadow pages. + * KVM doesn't support mapping hugepages into the guest for gfns that + * are being shadowed by KVM, i.e. allocating a new shadow page may + * affect the allowed hugepage size. + */ kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); @@ -731,46 +739,6 @@ out_gpte_changed: return RET_PF_RETRY; } - /* - * To see whether the mapped gfn can write its page table in the current - * mapping. - * - * It is the helper function of FNAME(page_fault). When guest uses large page - * size to map the writable gfn which is used as current page table, we should - * force kvm to use small page size to map it because new shadow page will be - * created when kvm establishes shadow page table that stop kvm using large - * page size. Do it early can avoid unnecessary #PF and emulation. - * - * @write_fault_to_shadow_pgtable will return true if the fault gfn is - * currently used as its page table. - * - * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok - * since the PDPT is always shadowed, that means, we can not use large page - * size to map the gfn which is used as PDPT. - */ -static bool -FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, bool user_fault, - bool *write_fault_to_shadow_pgtable) -{ - int level; - gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); - bool self_changed = false; - - if (!(walker->pte_access & ACC_WRITE_MASK || - (!is_cr0_wp(vcpu->arch.mmu) && !user_fault))) - return false; - - for (level = walker->level; level <= walker->max_level; level++) { - gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; - - self_changed |= !(gfn & mask); - *write_fault_to_shadow_pgtable |= !gfn; - } - - return self_changed; -} - /* * Page fault handler. There are several causes for a page fault: * - there is no shadow pte for the guest pte @@ -789,7 +757,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { struct guest_walker walker; int r; - bool is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); WARN_ON_ONCE(fault->is_tdp); @@ -814,6 +781,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault } fault->gfn = walker.gfn; + fault->max_level = walker.level; fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); if (page_fault_handle_page_track(vcpu, fault)) { @@ -825,16 +793,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - vcpu->arch.write_fault_to_shadow_pgtable = false; - - is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, - &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable); - - if (is_self_change_mapping) - fault->max_level = PG_LEVEL_4K; - else - fault->max_level = walker.level; - r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; @@ -887,64 +845,6 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) -{ - struct kvm_shadow_walk_iterator iterator; - struct kvm_mmu_page *sp; - u64 old_spte; - int level; - u64 *sptep; - - vcpu_clear_mmio_info(vcpu, gva); - - /* - * No need to check return value here, rmap_can_add() can - * help us to skip pte prefetch later. - */ - mmu_topup_memory_caches(vcpu, true); - - if (!VALID_PAGE(root_hpa)) { - WARN_ON(1); - return; - } - - write_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { - level = iterator.level; - sptep = iterator.sptep; - - sp = sptep_to_sp(sptep); - old_spte = *sptep; - if (is_last_spte(old_spte, level)) { - pt_element_t gpte; - gpa_t pte_gpa; - - if (!sp->unsync) - break; - - pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += spte_index(sptep) * sizeof(pt_element_t); - - mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); - if (is_shadow_present_pte(old_spte)) - kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); - - if (!rmap_can_add(vcpu)) - break; - - if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, - sizeof(pt_element_t))) - break; - - FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false); - } - - if (!sp->unsync_children) - break; - } - write_unlock(&vcpu->kvm->mmu_lock); -} - /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t addr, u64 access, @@ -977,114 +877,75 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, * can't change unless all sptes pointing to it are nuked first. * * Returns - * < 0: the sp should be zapped - * 0: the sp is synced and no tlb flushing is required - * > 0: the sp is synced and tlb flushing is required + * < 0: failed to sync spte + * 0: the spte is synced and no tlb flushing is required + * > 0: the spte is synced and tlb flushing is required */ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { - union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; - int i; bool host_writable; gpa_t first_pte_gpa; - bool flush = false; - - /* - * Ignore various flags when verifying that it's safe to sync a shadow - * page using the current MMU context. - * - * - level: not part of the overall MMU role and will never match as the MMU's - * level tracks the root level - * - access: updated based on the new guest PTE - * - quadrant: not part of the overall MMU role (similar to level) - */ - const union kvm_mmu_page_role sync_role_ign = { - .level = 0xf, - .access = 0x7, - .quadrant = 0x3, - .passthrough = 0x1, - }; + u64 *sptep, spte; + struct kvm_memory_slot *slot; + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn; - /* - * Direct pages can never be unsync, and KVM should never attempt to - * sync a shadow page for a different MMU context, e.g. if the role - * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the - * reserved bits checks will be wrong, etc... - */ - if (WARN_ON_ONCE(sp->role.direct || - (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) - return -1; + if (WARN_ON_ONCE(!sp->spt[i])) + return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); + pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { - u64 *sptep, spte; - struct kvm_memory_slot *slot; - unsigned pte_access; - pt_element_t gpte; - gpa_t pte_gpa; - gfn_t gfn; - - if (!sp->spt[i]) - continue; - - pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - - if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, - sizeof(pt_element_t))) - return -1; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - flush = true; - continue; - } - - gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - - if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) - continue; + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -1; - /* - * Drop the SPTE if the new protections would result in a RWX=0 - * SPTE or if the gfn is changing. The RWX=0 case only affects - * EPT with execute-only support, i.e. EPT without an effective - * "present" bit, as all other paging modes will create a - * read-only SPTE if pte_access is zero. - */ - if ((!pte_access && !shadow_present_mask) || - gfn != kvm_mmu_page_get_gfn(sp, i)) { - drop_spte(vcpu->kvm, &sp->spt[i]); - flush = true; - continue; - } + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) + return 1; - /* Update the shadowed access bits in case they changed. */ - kvm_mmu_page_set_access(sp, i, pte_access); + gfn = gpte_to_gfn(gpte); + pte_access = sp->role.access; + pte_access &= FNAME(gpte_access)(gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - sptep = &sp->spt[i]; - spte = *sptep; - host_writable = spte & shadow_host_writable_mask; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - make_spte(vcpu, sp, slot, pte_access, gfn, - spte_to_pfn(spte), spte, true, false, - host_writable, &spte); + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) + return 0; - flush |= mmu_spte_update(sptep, spte); + /* + * Drop the SPTE if the new protections would result in a RWX=0 + * SPTE or if the gfn is changing. The RWX=0 case only affects + * EPT with execute-only support, i.e. EPT without an effective + * "present" bit, as all other paging modes will create a + * read-only SPTE if pte_access is zero. + */ + if ((!pte_access && !shadow_present_mask) || + gfn != kvm_mmu_page_get_gfn(sp, i)) { + drop_spte(vcpu->kvm, &sp->spt[i]); + return 1; } - /* - * Note, any flush is purely for KVM's correctness, e.g. when dropping - * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier - * unmap or dirty logging event doesn't fail to flush. The guest is - * responsible for flushing the TLB to ensure any changes in protection - * bits are recognized, i.e. until the guest flushes or page faults on - * a relevant address, KVM is architecturally allowed to let vCPUs use - * cached translations with the old protection bits. + * Do nothing if the permissions are unchanged. The existing SPTE is + * still, and prefetch_invalid_gpte() has verified that the A/D bits + * are set in the "new" gPTE, i.e. there is no danger of missing an A/D + * update due to A/D bits being set in the SPTE but not the gPTE. */ - return flush; + if (kvm_mmu_page_get_access(sp, i) == pte_access) + return 0; + + /* Update the shadowed access bits in case they changed. */ + kvm_mmu_page_set_access(sp, i, pte_access); + + sptep = &sp->spt[i]; + spte = *sptep; + host_writable = spte & shadow_host_writable_mask; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + make_spte(vcpu, sp, slot, pte_access, gfn, + spte_to_pfn(spte), spte, true, false, + host_writable, &spte); + + return mmu_spte_update(sptep, spte); } #undef pt_element_t diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c15bfca3ed15..cf2c6426a6fc 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -164,7 +164,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, /* * For simplicity, enforce the NX huge page mitigation even if not * strictly necessary. KVM could ignore the mitigation if paging is - * disabled in the guest, as the guest doesn't have an page tables to + * disabled in the guest, as the guest doesn't have any page tables to * abuse. But to safely ignore the mitigation, KVM would have to * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG * is toggled on, and that's a net negative for performance when TDP is diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index f0af385c56e0..fae559559a80 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -29,29 +29,49 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) WRITE_ONCE(*rcu_dereference(sptep), new_spte); } +/* + * SPTEs must be modified atomically if they are shadow-present, leaf + * SPTEs, and have volatile bits, i.e. has bits that can be set outside + * of mmu_lock. The Writable bit can be set by KVM's fast page fault + * handler, and Accessed and Dirty bits can be set by the CPU. + * + * Note, non-leaf SPTEs do have Accessed bits and those bits are + * technically volatile, but KVM doesn't consume the Accessed bit of + * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This + * logic needs to be reassessed if KVM were to use non-leaf Accessed + * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. + */ +static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level) +{ + return is_shadow_present_pte(old_spte) && + is_last_spte(old_spte, level) && + spte_has_volatile_bits(old_spte); +} + static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, u64 new_spte, int level) { - /* - * Atomically write the SPTE if it is a shadow-present, leaf SPTE with - * volatile bits, i.e. has bits that can be set outside of mmu_lock. - * The Writable bit can be set by KVM's fast page fault handler, and - * Accessed and Dirty bits can be set by the CPU. - * - * Note, non-leaf SPTEs do have Accessed bits and those bits are - * technically volatile, but KVM doesn't consume the Accessed bit of - * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This - * logic needs to be reassessed if KVM were to use non-leaf Accessed - * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. - */ - if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && - spte_has_volatile_bits(old_spte)) + if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); __kvm_tdp_mmu_write_spte(sptep, new_spte); return old_spte; } +static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, + u64 mask, int level) +{ + atomic64_t *sptep_atomic; + + if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) { + sptep_atomic = (atomic64_t *)rcu_dereference(sptep); + return (u64)atomic64_fetch_and(~mask, sptep_atomic); + } + + __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask); + return old_spte; +} + /* * A TDP iterator performs a pre-order walk over a TDP paging structure. */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7c25dbf32ecc..b2fca11b91ff 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -334,35 +334,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); -static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) -{ - if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) - return; - - if (is_accessed_spte(old_spte) && - (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || - spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); -} - -static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) -{ - bool pfn_changed; - struct kvm_memory_slot *slot; - - if (level > PG_LEVEL_4K) - return; - - pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - - if ((!is_writable_pte(old_spte) || pfn_changed) && - is_writable_pte(new_spte)) { - slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); - mark_page_dirty_in_slot(kvm, slot, gfn); - } -} - static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); @@ -505,7 +476,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) } /** - * __handle_changed_spte - handle bookkeeping associated with an SPTE change + * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance * @as_id: the address space of the paging structure the SPTE was a part of * @gfn: the base GFN that was mapped by the SPTE @@ -516,12 +487,13 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * the MMU lock and the operation must synchronize with other * threads that might be modifying SPTEs. * - * Handle bookkeeping that might result from the modification of a SPTE. - * This function must be called for all TDP SPTE modifications. + * Handle bookkeeping that might result from the modification of a SPTE. Note, + * dirty logging updates are handled in common code, not here (see make_spte() + * and fast_pf_fix_direct_spte()). */ -static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level, - bool shared) +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level, + bool shared) { bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); @@ -605,17 +577,10 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (was_present && !was_leaf && (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); -} -static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level, - bool shared) -{ - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, - shared); - handle_changed_spte_acc_track(old_spte, new_spte, level); - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); + if (was_leaf && is_accessed_spte(old_spte) && + (!is_present || !is_accessed_spte(new_spte) || pfn_changed)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } /* @@ -658,9 +623,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; - __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - new_spte, iter->level, true); - handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); return 0; } @@ -696,7 +660,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, /* - * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping + * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance * @as_id: Address space ID, i.e. regular vs. SMM * @sptep: Pointer to the SPTE @@ -704,23 +668,12 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * @new_spte: The new value that will be set for the SPTE * @gfn: The base GFN that was (or will be) mapped by the SPTE * @level: The level _containing_ the SPTE (its parent PT's level) - * @record_acc_track: Notify the MM subsystem of changes to the accessed state - * of the page. Should be set unless handling an MMU - * notifier for access tracking. Leaving record_acc_track - * unset in that case prevents page accesses from being - * double counted. - * @record_dirty_log: Record the page as dirty in the dirty bitmap if - * appropriate for the change being made. Should be set - * unless performing certain dirty logging operations. - * Leaving record_dirty_log unset in that case prevents page - * writes from being double counted. * * Returns the old SPTE value, which _may_ be different than @old_spte if the * SPTE had voldatile bits. */ -static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, - u64 old_spte, u64 new_spte, gfn_t gfn, int level, - bool record_acc_track, bool record_dirty_log) +static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, + u64 old_spte, u64 new_spte, gfn_t gfn, int level) { lockdep_assert_held_write(&kvm->mmu_lock); @@ -735,46 +688,17 @@ static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); - - if (record_acc_track) - handle_changed_spte_acc_track(old_spte, new_spte, level); - if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); + handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); return old_spte; } -static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte, bool record_acc_track, - bool record_dirty_log) +static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte) { WARN_ON_ONCE(iter->yielded); - - iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, - iter->old_spte, new_spte, - iter->gfn, iter->level, - record_acc_track, record_dirty_log); -} - -static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, true, true); -} - -static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, false, true); -} - -static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, true, false); + iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, + iter->old_spte, new_spte, + iter->gfn, iter->level); } #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ @@ -866,7 +790,7 @@ retry: continue; if (!shared) - tdp_mmu_set_spte(kvm, &iter, 0); + tdp_mmu_iter_set_spte(kvm, &iter, 0); else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) goto retry; } @@ -923,8 +847,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) return false; - __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, - sp->gfn, sp->role.level + 1, true, true); + tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, + sp->gfn, sp->role.level + 1); return true; } @@ -958,7 +882,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - tdp_mmu_set_spte(kvm, &iter, 0); + tdp_mmu_iter_set_spte(kvm, &iter, 0); flush = true; } @@ -1128,7 +1052,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, if (ret) return ret; } else { - tdp_mmu_set_spte(kvm, iter, spte); + tdp_mmu_iter_set_spte(kvm, iter, spte); } tdp_account_mmu_page(kvm, sp); @@ -1262,33 +1186,42 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. + * + * No need to mark the corresponding PFN as accessed as this call is coming + * from the clear_young() or clear_flush_young() notifier, which uses the + * return value to determine if the page has been accessed. */ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, struct kvm_gfn_range *range) { - u64 new_spte = 0; + u64 new_spte; /* If we have a non-accessed entry we don't need to change the pte. */ if (!is_accessed_spte(iter->old_spte)) return false; - new_spte = iter->old_spte; - - if (spte_ad_enabled(new_spte)) { - new_spte &= ~shadow_accessed_mask; + if (spte_ad_enabled(iter->old_spte)) { + iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, + iter->old_spte, + shadow_accessed_mask, + iter->level); + new_spte = iter->old_spte & ~shadow_accessed_mask; } else { /* * Capture the dirty status of the page, so that it doesn't get * lost when the SPTE is marked for access tracking. */ - if (is_writable_pte(new_spte)) - kvm_set_pfn_dirty(spte_to_pfn(new_spte)); + if (is_writable_pte(iter->old_spte)) + kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte)); - new_spte = mark_spte_for_access_track(new_spte); + new_spte = mark_spte_for_access_track(iter->old_spte); + iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, + iter->old_spte, new_spte, + iter->level); } - tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); - + trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, + iter->old_spte, new_spte); return true; } @@ -1324,15 +1257,15 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, * Note, when changing a read-only SPTE, it's not strictly necessary to * zero the SPTE before setting the new PFN, but doing so preserves the * invariant that the PFN of a present * leaf SPTE can never change. - * See __handle_changed_spte(). + * See handle_changed_spte(). */ - tdp_mmu_set_spte(kvm, iter, 0); + tdp_mmu_iter_set_spte(kvm, iter, 0); if (!pte_write(range->pte)) { new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, pte_pfn(range->pte)); - tdp_mmu_set_spte(kvm, iter, new_spte); + tdp_mmu_iter_set_spte(kvm, iter, new_spte); } return true; @@ -1349,7 +1282,7 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) /* * No need to handle the remote TLB flush under RCU protection, the * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a - * shadow page. See the WARN on pfn_changed in __handle_changed_spte(). + * shadow page. See the WARN on pfn_changed in handle_changed_spte(). */ return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); } @@ -1607,8 +1540,8 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { + u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; struct tdp_iter iter; - u64 new_spte; bool spte_set = false; rcu_read_lock(); @@ -1621,19 +1554,13 @@ retry: if (!is_shadow_present_pte(iter.old_spte)) continue; - if (spte_ad_need_write_protect(iter.old_spte)) { - if (is_writable_pte(iter.old_spte)) - new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - else - continue; - } else { - if (iter.old_spte & shadow_dirty_mask) - new_spte = iter.old_spte & ~shadow_dirty_mask; - else - continue; - } + MMU_WARN_ON(kvm_ad_enabled() && + spte_ad_need_write_protect(iter.old_spte)); - if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) + if (!(iter.old_spte & dbit)) + continue; + + if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) goto retry; spte_set = true; @@ -1675,8 +1602,9 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { + u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK : + shadow_dirty_mask; struct tdp_iter iter; - u64 new_spte; rcu_read_lock(); @@ -1685,25 +1613,26 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, if (!mask) break; + MMU_WARN_ON(kvm_ad_enabled() && + spte_ad_need_write_protect(iter.old_spte)); + if (iter.level > PG_LEVEL_4K || !(mask & (1UL << (iter.gfn - gfn)))) continue; mask &= ~(1UL << (iter.gfn - gfn)); - if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { - if (is_writable_pte(iter.old_spte)) - new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - else - continue; - } else { - if (iter.old_spte & shadow_dirty_mask) - new_spte = iter.old_spte & ~shadow_dirty_mask; - else - continue; - } + if (!(iter.old_spte & dbit)) + continue; + + iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep, + iter.old_spte, dbit, + iter.level); - tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, + iter.old_spte, + iter.old_spte & ~dbit); + kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte)); } rcu_read_unlock(); @@ -1821,7 +1750,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, if (new_spte == iter.old_spte) break; - tdp_mmu_set_spte(kvm, &iter, new_spte); + tdp_mmu_iter_set_spte(kvm, &iter, new_spte); spte_set = true; } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 612e6c70ce2e..1690d41c1830 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -93,7 +93,7 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static inline bool pmc_is_enabled(struct kvm_pmc *pmc) +static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) { return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); } @@ -400,6 +400,12 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return is_fixed_event_allowed(filter, pmc->idx); } +static bool pmc_event_is_allowed(struct kvm_pmc *pmc) +{ + return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && + check_pmu_event_filter(pmc); +} + static void reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -409,10 +415,7 @@ static void reprogram_counter(struct kvm_pmc *pmc) pmc_pause_counter(pmc); - if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) - goto reprogram_complete; - - if (!check_pmu_event_filter(pmc)) + if (!pmc_event_is_allowed(pmc)) goto reprogram_complete; if (pmc->counter < pmc->prev_counter) @@ -540,9 +543,9 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (!pmc) return 1; - if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) && + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) && (static_call(kvm_x86_get_cpl)(vcpu) != 0) && - (kvm_read_cr0(vcpu) & X86_CR0_PE)) + kvm_is_cr0_bit_set(vcpu, X86_CR0_PE)) return 1; *data = pmc_read_counter(pmc) & mask; @@ -589,6 +592,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) + return; + + bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); static_call(kvm_x86_pmu_refresh)(vcpu); } @@ -646,7 +653,7 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { pmc->prev_counter = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - kvm_pmu_request_counter_reprogam(pmc); + kvm_pmu_request_counter_reprogram(pmc); } static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, @@ -684,7 +691,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); - if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc)) + if (!pmc || !pmc_event_is_allowed(pmc)) continue; /* Ignore checks for edge detect, pin control, invert and CMASK bits */ diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index be62c16f2265..5c7bbf03b599 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -195,7 +195,7 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) KVM_PMC_MAX_FIXED); } -static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc) +static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 05d38944a6c0..96936ddf1b3c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -139,13 +139,18 @@ void recalc_intercepts(struct vcpu_svm *svm) if (g->int_ctl & V_INTR_MASKING_MASK) { /* - * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8 - * does not affect any interrupt we may want to inject; - * therefore, writes to CR8 are irrelevant to L0, as are - * interrupt window vmexits. + * If L2 is active and V_INTR_MASKING is enabled in vmcb12, + * disable intercept of CR8 writes as L2's CR8 does not affect + * any interrupt KVM may want to inject. + * + * Similarly, disable intercept of virtual interrupts (used to + * detect interrupt windows) if the saved RFLAGS.IF is '0', as + * the effective RFLAGS.IF for L1 interrupts will never be set + * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs). */ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); - vmcb_clr_intercept(c, INTERCEPT_VINTR); + if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)) + vmcb_clr_intercept(c, INTERCEPT_VINTR); } /* @@ -276,6 +281,11 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl))) return false; + if (CC((control->int_ctl & V_NMI_ENABLE_MASK) && + !vmcb12_is_intercept(control, INTERCEPT_NMI))) { + return false; + } + return true; } @@ -416,22 +426,24 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) /* Only a few fields of int_ctl are written by the processor. */ mask = V_IRQ_MASK | V_TPR_MASK; - if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) && - svm_is_intercept(svm, INTERCEPT_VINTR)) { - /* - * In order to request an interrupt window, L0 is usurping - * svm->vmcb->control.int_ctl and possibly setting V_IRQ - * even if it was clear in L1's VMCB. Restoring it would be - * wrong. However, in this case V_IRQ will remain true until - * interrupt_window_interception calls svm_clear_vintr and - * restores int_ctl. We can just leave it aside. - */ + /* + * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting + * virtual interrupts in order to request an interrupt window, as KVM + * has usurped vmcb02's int_ctl. If an interrupt window opens before + * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl. + * If no window opens, V_IRQ will be correctly preserved in vmcb12's + * int_ctl (because it was never recognized while L2 was running). + */ + if (svm_is_intercept(svm, INTERCEPT_VINTR) && + !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts)) mask &= ~V_IRQ_MASK; - } if (nested_vgif_enabled(svm)) mask |= V_GIF_MASK; + if (nested_vnmi_enabled(svm)) + mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK; + svm->nested.ctl.int_ctl &= ~mask; svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask; } @@ -651,6 +663,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, else int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); + if (vnmi) { + if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) { + svm->vcpu.arch.nmi_pending++; + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + } + if (nested_vnmi_enabled(svm)) + int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK | + V_NMI_ENABLE_MASK | + V_NMI_BLOCKING_MASK); + } + /* Copied from vmcb01. msrpm_base can be overwritten later. */ vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; @@ -1021,6 +1044,28 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm_switch_vmcb(svm, &svm->vmcb01); + /* + * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01: + * + * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR: If L1 doesn't + * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related + * flags) to detect interrupt windows for L1 IRQs (even if L1 uses + * virtual interrupt masking). Raise KVM_REQ_EVENT to ensure that + * KVM re-requests an interrupt window if necessary, which implicitly + * copies this bits from vmcb02 to vmcb01. + * + * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR + * is stored in vmcb02, but its value doesn't need to be copied from/to + * vmcb01 because it is copied from/to the virtual APIC's TPR register + * on each VM entry/exit. + * + * V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's + * V_GIF. However, GIF is architecturally clear on each VM exit, thus + * there is no need to copy V_GIF from vmcb02 to vmcb01. + */ + if (!nested_exit_on_intr(svm)) + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { svm_copy_lbrs(vmcb12, vmcb02); svm_update_lbrv(vcpu); @@ -1029,6 +1074,20 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm_update_lbrv(vcpu); } + if (vnmi) { + if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK) + vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK; + else + vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK; + + if (vcpu->arch.nmi_pending) { + vcpu->arch.nmi_pending--; + vmcb01->control.int_ctl |= V_NMI_PENDING_MASK; + } else { + vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK; + } + } + /* * On vmexit the GIF is set to false and * no event can be injected in L1. diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index cc77a0681800..5fa939e411d8 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -161,7 +161,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - kvm_pmu_request_counter_reprogam(pmc); + kvm_pmu_request_counter_reprogram(pmc); } return 0; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a1b08359769b..ca32389f3c36 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -99,6 +99,7 @@ static const struct svm_direct_access_msrs { #endif { .index = MSR_IA32_SPEC_CTRL, .always = false }, { .index = MSR_IA32_PRED_CMD, .always = false }, + { .index = MSR_IA32_FLUSH_CMD, .always = false }, { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, @@ -234,6 +235,8 @@ module_param(dump_invalid_vmcb, bool, 0644); bool intercept_smi = true; module_param(intercept_smi, bool, 0444); +bool vnmi = true; +module_param(vnmi, bool, 0444); static bool svm_gp_erratum_intercept = true; @@ -1315,6 +1318,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) if (kvm_vcpu_apicv_active(vcpu)) avic_init_vmcb(svm, vmcb); + if (vnmi) + svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK; + if (vgif) { svm_clr_intercept(svm, INTERCEPT_STGI); svm_clr_intercept(svm, INTERCEPT_CLGI); @@ -1588,6 +1594,16 @@ static void svm_set_vintr(struct vcpu_svm *svm) svm_set_intercept(svm, INTERCEPT_VINTR); /* + * Recalculating intercepts may have cleared the VINTR intercept. If + * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF + * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN. + * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as + * interrupts will never be unblocked while L2 is running. + */ + if (!svm_is_intercept(svm, INTERCEPT_VINTR)) + return; + + /* * This is just a dummy VINTR to actually cause a vmexit to happen. * Actual injection of virtual interrupts happens through EVENTINJ. */ @@ -2484,16 +2500,29 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) has_error_code, error_code); } +static void svm_clr_iret_intercept(struct vcpu_svm *svm) +{ + if (!sev_es_guest(svm->vcpu.kvm)) + svm_clr_intercept(svm, INTERCEPT_IRET); +} + +static void svm_set_iret_intercept(struct vcpu_svm *svm) +{ + if (!sev_es_guest(svm->vcpu.kvm)) + svm_set_intercept(svm, INTERCEPT_IRET); +} + static int iret_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); ++vcpu->stat.nmi_window_exits; svm->awaiting_iret_completion = true; - if (!sev_es_guest(vcpu->kvm)) { - svm_clr_intercept(svm, INTERCEPT_IRET); + + svm_clr_iret_intercept(svm); + if (!sev_es_guest(vcpu->kvm)) svm->nmi_iret_rip = kvm_rip_read(vcpu); - } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -2876,7 +2905,7 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct vcpu_svm *svm = to_svm(vcpu); - int r; + int ret = 0; u32 ecx = msr->index; u64 data = msr->data; @@ -2946,21 +2975,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) */ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); break; - case MSR_IA32_PRED_CMD: - if (!msr->host_initiated && - !guest_has_pred_cmd_msr(vcpu)) - return 1; - - if (data & ~PRED_CMD_IBPB) - return 1; - if (!boot_cpu_has(X86_FEATURE_IBPB)) - return 1; - if (!data) - break; - - wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); - break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) @@ -3013,10 +3027,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * guest via direct_access_msrs, and switch it via user return. */ preempt_disable(); - r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); + ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); preempt_enable(); - if (r) - return 1; + if (ret) + break; svm->tsc_aux = data; break; @@ -3074,7 +3088,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) default: return kvm_set_msr_common(vcpu, msr); } - return 0; + return ret; } static int msr_interception(struct kvm_vcpu *vcpu) @@ -3485,11 +3499,43 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) return; svm->nmi_masked = true; - if (!sev_es_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_IRET); + svm_set_iret_intercept(svm); ++vcpu->stat.nmi_injections; } +static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!is_vnmi_enabled(svm)) + return false; + + return !!(svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK); +} + +static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!is_vnmi_enabled(svm)) + return false; + + if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK) + return false; + + svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK; + vmcb_mark_dirty(svm->vmcb, VMCB_INTR); + + /* + * Because the pending NMI is serviced by hardware, KVM can't know when + * the NMI is "injected", but for all intents and purposes, passing the + * NMI off to hardware counts as injection. + */ + ++vcpu->stat.nmi_injections; + + return true; +} + static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3585,6 +3631,35 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) svm_set_intercept(svm, INTERCEPT_CR8_WRITE); } +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_vnmi_enabled(svm)) + return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK; + else + return svm->nmi_masked; +} + +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_vnmi_enabled(svm)) { + if (masked) + svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK; + else + svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK; + + } else { + svm->nmi_masked = masked; + if (masked) + svm_set_iret_intercept(svm); + else + svm_clr_iret_intercept(svm); + } +} + bool svm_nmi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3596,8 +3671,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return false; - return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - svm->nmi_masked; + if (svm_get_nmi_mask(vcpu)) + return true; + + return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK; } static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) @@ -3615,26 +3692,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return 1; } -static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) -{ - return to_svm(vcpu)->nmi_masked; -} - -static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (masked) { - svm->nmi_masked = true; - if (!sev_es_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_IRET); - } else { - svm->nmi_masked = false; - if (!sev_es_guest(vcpu->kvm)) - svm_clr_intercept(svm, INTERCEPT_IRET); - } -} - bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3715,7 +3772,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm->nmi_masked && !svm->awaiting_iret_completion) + /* + * KVM should never request an NMI window when vNMI is enabled, as KVM + * allows at most one to-be-injected NMI and one pending NMI, i.e. if + * two NMIs arrive simultaneously, KVM will inject one and set + * V_NMI_PENDING for the other. WARN, but continue with the standard + * single-step approach to try and salvage the pending NMI. + */ + WARN_ON_ONCE(is_vnmi_enabled(svm)); + + if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion) return; /* IRET will cause a vm exit */ if (!gif_set(svm)) { @@ -3777,13 +3843,13 @@ static void svm_flush_tlb_all(struct kvm_vcpu *vcpu) { /* * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB - * flushes should be routed to hv_remote_flush_tlb() without requesting + * flushes should be routed to hv_flush_remote_tlbs() without requesting * a "regular" remote flush. Reaching this point means either there's - * a KVM bug or a prior hv_remote_flush_tlb() call failed, both of + * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of * which might be fatal to the guest. Yell, but try to recover. */ if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu))) - hv_remote_flush_tlb(vcpu->kvm); + hv_flush_remote_tlbs(vcpu->kvm); svm_flush_tlb_asid(vcpu); } @@ -4142,7 +4208,7 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_MCG_EXT_CTL: - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: return false; case MSR_IA32_SMBASE: if (!IS_ENABLED(CONFIG_KVM_SMM)) @@ -4184,8 +4250,18 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); + svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI); + svm_recalc_instruction_intercepts(vcpu, svm); + if (boot_cpu_has(X86_FEATURE_IBPB)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, + !!guest_has_pred_cmd_msr(vcpu)); + + if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, + !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { best = kvm_find_cpuid_entry(vcpu, 0x8000001F); @@ -4563,7 +4639,6 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) { bool smep, smap, is_user; - unsigned long cr4; u64 error_code; /* Emulation is always possible when KVM has access to all guest state. */ @@ -4655,9 +4730,8 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) goto resume_guest; - cr4 = kvm_read_cr4(vcpu); - smep = cr4 & X86_CR4_SMEP; - smap = cr4 & X86_CR4_SMAP; + smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP); + smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP); is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n"); @@ -4795,6 +4869,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .patch_hypercall = svm_patch_hypercall, .inject_irq = svm_inject_irq, .inject_nmi = svm_inject_nmi, + .is_vnmi_pending = svm_is_vnmi_pending, + .set_vnmi_pending = svm_set_vnmi_pending, .inject_exception = svm_inject_exception, .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, @@ -4937,6 +5013,9 @@ static __init void svm_set_cpu_caps(void) if (vgif) kvm_cpu_cap_set(X86_FEATURE_VGIF); + if (vnmi) + kvm_cpu_cap_set(X86_FEATURE_VNMI); + /* Nested VM can receive #VMEXIT instead of triggering #GP */ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } @@ -5088,6 +5167,16 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI); + if (vnmi) + pr_info("Virtual NMI enabled\n"); + + if (!vnmi) { + svm_x86_ops.is_vnmi_pending = NULL; + svm_x86_ops.set_vnmi_pending = NULL; + } + + if (lbrv) { if (!boot_cpu_has(X86_FEATURE_LBRV)) lbrv = false; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 839809972da1..f44751dd8d5d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -36,6 +36,7 @@ extern bool npt_enabled; extern int vgif; extern bool intercept_smi; extern bool x2avic_enabled; +extern bool vnmi; /* * Clean bits in VMCB. @@ -265,6 +266,7 @@ struct vcpu_svm { bool pause_filter_enabled : 1; bool pause_threshold_enabled : 1; bool vgif_enabled : 1; + bool vnmi_enabled : 1; u32 ldr_reg; u32 dfr_reg; @@ -539,6 +541,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; } +static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) +{ + return svm->vnmi_enabled && + (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK); +} + static inline bool is_x2apic_msrpm_offset(u32 offset) { /* 4 msrs per u8, and 4 u8 in u32 */ @@ -548,6 +556,27 @@ static inline bool is_x2apic_msrpm_offset(u32 offset) (msr < (APIC_BASE_MSR + 0x100)); } +static inline struct vmcb *get_vnmi_vmcb_l1(struct vcpu_svm *svm) +{ + if (!vnmi) + return NULL; + + if (is_guest_mode(&svm->vcpu)) + return NULL; + else + return svm->vmcb01.ptr; +} + +static inline bool is_vnmi_enabled(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = get_vnmi_vmcb_l1(svm); + + if (vmcb) + return !!(vmcb->control.int_ctl & V_NMI_ENABLE_MASK); + else + return false; +} + /* svm.c */ #define MSR_INVALID 0xffffffffU diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 786d46d73a8e..f85bc617ffe4 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -45,9 +45,8 @@ static inline __init void svm_hv_hardware_setup(void) if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); - svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; - svm_x86_ops.tlb_remote_flush_with_range = - hv_remote_flush_tlb_with_range; + svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; + svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 22daca752797..79450e1ed7cf 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -13,7 +13,110 @@ #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK -DEFINE_STATIC_KEY_FALSE(enable_evmcs); +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_SUPPORTED_PINCTRL \ + (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING | \ + PIN_BASED_VIRTUAL_NMIS) + +#define EVMCS1_SUPPORTED_EXEC_CTRL \ + (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING | \ + CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) + +#define EVMCS1_SUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) + +#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE | \ + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_IA32E_MODE | \ + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMFUNC (0) #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ @@ -506,6 +609,8 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) } #if IS_ENABLED(CONFIG_HYPERV) +DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); + /* * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption * is: in case a feature has corresponding fields in eVMCS described and it was diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 78d17667e7ec..9623fe1651c4 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -16,117 +16,10 @@ struct vmcs_config; -DECLARE_STATIC_KEY_FALSE(enable_evmcs); - #define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) #define KVM_EVMCS_VERSION 1 -/* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - * VM_FUNCTION_CONTROL = 0x00002018, - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - * - * TSC_MULTIPLIER = 0x00002032, - * PLE_GAP = 0x00004020, - * PLE_WINDOW = 0x00004022, - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - * - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ -#define EVMCS1_SUPPORTED_PINCTRL \ - (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - PIN_BASED_EXT_INTR_MASK | \ - PIN_BASED_NMI_EXITING | \ - PIN_BASED_VIRTUAL_NMIS) - -#define EVMCS1_SUPPORTED_EXEC_CTRL \ - (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - CPU_BASED_HLT_EXITING | \ - CPU_BASED_CR3_LOAD_EXITING | \ - CPU_BASED_CR3_STORE_EXITING | \ - CPU_BASED_UNCOND_IO_EXITING | \ - CPU_BASED_MOV_DR_EXITING | \ - CPU_BASED_USE_TSC_OFFSETTING | \ - CPU_BASED_MWAIT_EXITING | \ - CPU_BASED_MONITOR_EXITING | \ - CPU_BASED_INVLPG_EXITING | \ - CPU_BASED_RDPMC_EXITING | \ - CPU_BASED_INTR_WINDOW_EXITING | \ - CPU_BASED_CR8_LOAD_EXITING | \ - CPU_BASED_CR8_STORE_EXITING | \ - CPU_BASED_RDTSC_EXITING | \ - CPU_BASED_TPR_SHADOW | \ - CPU_BASED_USE_IO_BITMAPS | \ - CPU_BASED_MONITOR_TRAP_FLAG | \ - CPU_BASED_USE_MSR_BITMAPS | \ - CPU_BASED_NMI_WINDOW_EXITING | \ - CPU_BASED_PAUSE_EXITING | \ - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) - -#define EVMCS1_SUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ - SECONDARY_EXEC_WBINVD_EXITING | \ - SECONDARY_EXEC_ENABLE_VPID | \ - SECONDARY_EXEC_ENABLE_EPT | \ - SECONDARY_EXEC_UNRESTRICTED_GUEST | \ - SECONDARY_EXEC_DESC | \ - SECONDARY_EXEC_ENABLE_RDTSCP | \ - SECONDARY_EXEC_ENABLE_INVPCID | \ - SECONDARY_EXEC_XSAVES | \ - SECONDARY_EXEC_RDSEED_EXITING | \ - SECONDARY_EXEC_RDRAND_EXITING | \ - SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ - SECONDARY_EXEC_PT_USE_GPA | \ - SECONDARY_EXEC_PT_CONCEAL_VMX | \ - SECONDARY_EXEC_BUS_LOCK_DETECTION | \ - SECONDARY_EXEC_NOTIFY_VM_EXITING | \ - SECONDARY_EXEC_ENCLS_EXITING) - -#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) - -#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_EXIT_SAVE_DEBUG_CONTROLS | \ - VM_EXIT_ACK_INTR_ON_EXIT | \ - VM_EXIT_HOST_ADDR_SPACE_SIZE | \ - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_EXIT_SAVE_IA32_PAT | \ - VM_EXIT_LOAD_IA32_PAT | \ - VM_EXIT_SAVE_IA32_EFER | \ - VM_EXIT_LOAD_IA32_EFER | \ - VM_EXIT_CLEAR_BNDCFGS | \ - VM_EXIT_PT_CONCEAL_PIP | \ - VM_EXIT_CLEAR_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ - (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_ENTRY_LOAD_DEBUG_CONTROLS | \ - VM_ENTRY_IA32E_MODE | \ - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_ENTRY_LOAD_IA32_PAT | \ - VM_ENTRY_LOAD_IA32_EFER | \ - VM_ENTRY_LOAD_BNDCFGS | \ - VM_ENTRY_PT_CONCEAL_PIP | \ - VM_ENTRY_LOAD_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMFUNC (0) - struct evmcs_field { u16 offset; u16 clean_field; @@ -174,6 +67,13 @@ static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, #if IS_ENABLED(CONFIG_HYPERV) +DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); + +static __always_inline bool kvm_is_using_evmcs(void) +{ + return static_branch_unlikely(&__kvm_is_using_evmcs); +} + static __always_inline int get_evmcs_offset(unsigned long field, u16 *clean_field) { @@ -263,6 +163,7 @@ static inline void evmcs_load(u64 phys_addr) void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ +static __always_inline bool kvm_is_using_evmcs(void) { return false; } static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static __always_inline void evmcs_write32(unsigned long field, u32 value) {} static __always_inline void evmcs_write16(unsigned long field, u16 value) {} diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 768487611db7..e35cf0bd0df9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -358,6 +358,7 @@ static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, gpa_t addr) { + unsigned long roots = 0; uint i; struct kvm_mmu_root_info *cached_root; @@ -368,8 +369,10 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, eptp)) - vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); + roots |= KVM_MMU_ROOT_PREVIOUS(i); } + if (roots) + kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); } static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, @@ -654,6 +657,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PRED_CMD, MSR_TYPE_W); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_FLUSH_CMD, MSR_TYPE_W); + kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); vmx->nested.force_msr_bitmap_recalc = false; @@ -4483,7 +4489,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * CR0_GUEST_HOST_MASK is already set in the original vmcs01 * (KVM doesn't change it); */ - vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmx_set_cr0(vcpu, vmcs12->host_cr0); /* Same as above - no reason to call set_cr4_guest_host_mask(). */ @@ -4634,7 +4640,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) */ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); - vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); @@ -5156,7 +5162,7 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) * does force CR0.PE=1, but only to also force VM86 in order to emulate * Real Mode, and so there's no need to check CR0.PE manually. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -6755,36 +6761,9 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void) return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; } -/* - * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be - * returned for the various VMX controls MSRs when nested VMX is enabled. - * The same values should also be used to verify that vmcs12 control fields are - * valid during nested entry from L1 to L2. - * Each of these control msrs has a low and high 32-bit half: A low bit is on - * if the corresponding bit in the (32-bit) control field *must* be on, and a - * bit in the high half is on if the corresponding bit in the control field - * may be on. See also vmx_control_verify(). - */ -void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) +static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) { - struct nested_vmx_msrs *msrs = &vmcs_conf->nested; - - /* - * Note that as a general rule, the high half of the MSRs (bits in - * the control fields which may be 1) should be initialized by the - * intersection of the underlying hardware's MSR (i.e., features which - * can be supported) and the list of features we want to expose - - * because they are known to be properly supported in our code. - * Also, usually, the low half of the MSRs (bits which must be 1) can - * be set to 0, meaning that L1 may turn off any of these bits. The - * reason is that if one of these bits is necessary, it will appear - * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control - * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_l1_wants_exit() will not pass related exits to L1. - * These rules have exceptions below. - */ - - /* pin-based controls */ msrs->pinbased_ctls_low = PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6797,8 +6776,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) msrs->pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; +} - /* exit controls */ +static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6817,8 +6799,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; +} - /* entry controls */ +static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6834,8 +6819,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) /* We support free control of debug control loading. */ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; +} - /* cpu-based controls */ +static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6867,12 +6855,12 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) /* We support free control of CR3 access interception. */ msrs->procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); +} - /* - * secondary cpu-based controls. Do not include those that - * depend on CPUID bits, they are added later by - * vmx_vcpu_after_set_cpuid. - */ +static void nested_vmx_setup_secondary_ctls(u32 ept_caps, + struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; @@ -6950,8 +6938,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) if (enable_sgx) msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; +} - /* miscellaneous data */ +static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; msrs->misc_low |= MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | @@ -6959,7 +6950,10 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_WAIT_SIPI; msrs->misc_high = 0; +} +static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) +{ /* * This MSR reports some information about VMX support. We * should return information about the VMX we emulate for the @@ -6974,7 +6968,10 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; +} +static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) +{ /* * These MSRs specify bits which the guest must keep fixed on * while L1 is in VMXON mode (in L1's root mode, or running an L2). @@ -6991,6 +6988,51 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) if (vmx_umip_emulated()) msrs->cr4_fixed1 |= X86_CR4_UMIP; +} + +/* + * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be + * returned for the various VMX controls MSRs when nested VMX is enabled. + * The same values should also be used to verify that vmcs12 control fields are + * valid during nested entry from L1 to L2. + * Each of these control msrs has a low and high 32-bit half: A low bit is on + * if the corresponding bit in the (32-bit) control field *must* be on, and a + * bit in the high half is on if the corresponding bit in the control field + * may be on. See also vmx_control_verify(). + */ +void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) +{ + struct nested_vmx_msrs *msrs = &vmcs_conf->nested; + + /* + * Note that as a general rule, the high half of the MSRs (bits in + * the control fields which may be 1) should be initialized by the + * intersection of the underlying hardware's MSR (i.e., features which + * can be supported) and the list of features we want to expose - + * because they are known to be properly supported in our code. + * Also, usually, the low half of the MSRs (bits which must be 1) can + * be set to 0, meaning that L1 may turn off any of these bits. The + * reason is that if one of these bits is necessary, it will appear + * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control + * fields of vmcs01 and vmcs02, will turn these bits off - and + * nested_vmx_l1_wants_exit() will not pass related exits to L1. + * These rules have exceptions below. + */ + nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); + + nested_vmx_setup_exit_ctls(vmcs_conf, msrs); + + nested_vmx_setup_entry_ctls(vmcs_conf, msrs); + + nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); + + nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); + + nested_vmx_setup_misc_data(vmcs_conf, msrs); + + nested_vmx_setup_basic(msrs); + + nested_vmx_setup_cr_fixed(msrs); msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index e8a3be0b9df9..741efe2c497b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -57,7 +57,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - kvm_pmu_request_counter_reprogam(pmc); + kvm_pmu_request_counter_reprogram(pmc); } } @@ -76,13 +76,13 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) { int bit; - struct kvm_pmc *pmc; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { - pmc = intel_pmc_idx_to_pmc(pmu, bit); - if (pmc) - kvm_pmu_request_counter_reprogam(pmc); - } + if (!diff) + return; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + set_bit(bit, pmu->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); } static bool intel_hw_event_available(struct kvm_pmc *pmc) @@ -351,45 +351,47 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: msr_info->data = pmu->fixed_ctr_ctrl; - return 0; + break; case MSR_CORE_PERF_GLOBAL_STATUS: msr_info->data = pmu->global_status; - return 0; + break; case MSR_CORE_PERF_GLOBAL_CTRL: msr_info->data = pmu->global_ctrl; - return 0; + break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; - return 0; + break; case MSR_IA32_PEBS_ENABLE: msr_info->data = pmu->pebs_enable; - return 0; + break; case MSR_IA32_DS_AREA: msr_info->data = pmu->ds_area; - return 0; + break; case MSR_PEBS_DATA_CFG: msr_info->data = pmu->pebs_data_cfg; - return 0; + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { u64 val = pmc_read_counter(pmc); msr_info->data = val & pmu->counter_bitmask[KVM_PMC_GP]; - return 0; + break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { u64 val = pmc_read_counter(pmc); msr_info->data = val & pmu->counter_bitmask[KVM_PMC_FIXED]; - return 0; + break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { msr_info->data = pmc->eventsel; - return 0; - } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) - return 0; + break; + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) { + break; + } + return 1; } - return 1; + return 0; } static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -402,44 +404,43 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (pmu->fixed_ctr_ctrl == data) - return 0; - if (!(data & pmu->fixed_ctr_ctrl_mask)) { + if (data & pmu->fixed_ctr_ctrl_mask) + return 1; + + if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); - return 0; - } break; case MSR_CORE_PERF_GLOBAL_STATUS: - if (msr_info->host_initiated) { - pmu->global_status = data; - return 0; - } - break; /* RO MSR */ + if (!msr_info->host_initiated) + return 1; /* RO MSR */ + + pmu->global_status = data; + break; case MSR_CORE_PERF_GLOBAL_CTRL: - if (pmu->global_ctrl == data) - return 0; - if (kvm_valid_perf_global_ctrl(pmu, data)) { + if (!kvm_valid_perf_global_ctrl(pmu, data)) + return 1; + + if (pmu->global_ctrl != data) { diff = pmu->global_ctrl ^ data; pmu->global_ctrl = data; reprogram_counters(pmu, diff); - return 0; } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & pmu->global_ovf_ctrl_mask)) { - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - return 0; - } + if (data & pmu->global_ovf_ctrl_mask) + return 1; + + if (!msr_info->host_initiated) + pmu->global_status &= ~data; break; case MSR_IA32_PEBS_ENABLE: - if (pmu->pebs_enable == data) - return 0; - if (!(data & pmu->pebs_enable_mask)) { + if (data & pmu->pebs_enable_mask) + return 1; + + if (pmu->pebs_enable != data) { diff = pmu->pebs_enable ^ data; pmu->pebs_enable = data; reprogram_counters(pmu, diff); - return 0; } break; case MSR_IA32_DS_AREA: @@ -447,15 +448,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (is_noncanonical_address(data, vcpu)) return 1; + pmu->ds_area = data; - return 0; + break; case MSR_PEBS_DATA_CFG: - if (pmu->pebs_data_cfg == data) - return 0; - if (!(data & pmu->pebs_data_cfg_mask)) { - pmu->pebs_data_cfg = data; - return 0; - } + if (data & pmu->pebs_data_cfg_mask) + return 1; + + pmu->pebs_data_cfg = data; break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || @@ -463,33 +463,38 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((msr & MSR_PMC_FULL_WIDTH_BIT) && (data & ~pmu->counter_bitmask[KVM_PMC_GP])) return 1; + if (!msr_info->host_initiated && !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); pmc_update_sample_period(pmc); - return 0; + break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); pmc_update_sample_period(pmc); - return 0; + break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { - if (data == pmc->eventsel) - return 0; reserved_bits = pmu->reserved_bits; if ((pmc->idx == 2) && (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; - if (!(data & reserved_bits)) { + if (data & reserved_bits) + return 1; + + if (data != pmc->eventsel) { pmc->eventsel = data; - kvm_pmu_request_counter_reprogam(pmc); - return 0; + kvm_pmu_request_counter_reprogram(pmc); } - } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) - return 0; + break; + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) { + break; + } + /* Not a known PMU MSR. */ + return 1; } - return 1; + return 0; } static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) @@ -531,6 +536,16 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; + memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); + + /* + * Setting passthrough of LBR MSRs is done only in the VM-Entry loop, + * and PMU refresh is disallowed after the vCPU has run, i.e. this code + * should never be reached while KVM is passing through MSRs. + */ + if (KVM_BUG_ON(lbr_desc->msr_passthrough, vcpu->kvm)) + return; + entry = kvm_find_cpuid_entry(vcpu, 0xa); if (!entry || !vcpu->kvm->arch.enable_pmu) return; diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index aa53c98034bf..0574030b071f 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -29,14 +29,14 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */ *gva = offset; - if (!is_long_mode(vcpu)) { + if (!is_64_bit_mode(vcpu)) { vmx_get_segment(vcpu, &s, VCPU_SREG_DS); *gva += s.base; } if (!IS_ALIGNED(*gva, alignment)) { fault = true; - } else if (likely(is_long_mode(vcpu))) { + } else if (likely(is_64_bit_mode(vcpu))) { fault = is_noncanonical_address(*gva, vcpu); } else { *gva &= 0xffffffff; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d2d6e1b6c788..44fb619803b8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -164,6 +164,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_IA32_SPEC_CTRL, MSR_IA32_PRED_CMD, + MSR_IA32_FLUSH_CMD, MSR_IA32_TSC, #ifdef CONFIG_X86_64 MSR_FS_BASE, @@ -579,7 +580,7 @@ static __init void hv_init_evmcs(void) if (enlightened_vmcs) { pr_info("Using Hyper-V Enlightened VMCS\n"); - static_branch_enable(&enable_evmcs); + static_branch_enable(&__kvm_is_using_evmcs); } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) @@ -595,7 +596,7 @@ static void hv_reset_evmcs(void) { struct hv_vp_assist_page *vp_ap; - if (!static_branch_unlikely(&enable_evmcs)) + if (!kvm_is_using_evmcs()) return; /* @@ -1945,7 +1946,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, static int vmx_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); @@ -2030,7 +2031,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!nested_vmx_allowed(vcpu)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, @@ -2285,33 +2286,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) return 1; goto find_uret_msr; - case MSR_IA32_PRED_CMD: - if (!msr_info->host_initiated && - !guest_has_pred_cmd_msr(vcpu)) - return 1; - - if (data & ~PRED_CMD_IBPB) - return 1; - if (!boot_cpu_has(X86_FEATURE_IBPB)) - return 1; - if (!data) - break; - - wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - - /* - * For non-nested: - * When it's written (to non-zero) for the first time, pass - * it through. - * - * For nested: - * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_prepare_msr_bitmap. We should not touch the - * vmcs02.msr_bitmap here since it gets completely overwritten - * in the merging. - */ - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); - break; case MSR_IA32_CR_PAT: if (!kvm_pat_valid(data)) return 1; @@ -2366,7 +2340,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->msr_ia32_sgxlepubkeyhash [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!msr_info->host_initiated) return 1; /* they are read-only */ if (!nested_vmx_allowed(vcpu)) @@ -2816,8 +2790,7 @@ static int vmx_hardware_enable(void) * This can happen if we hot-added a CPU but failed to allocate * VP assist page for it. */ - if (static_branch_unlikely(&enable_evmcs) && - !hv_get_vp_assist_page(cpu)) + if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) return -EFAULT; intel_pt_handle_vmx(1); @@ -2869,7 +2842,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) memset(vmcs, 0, vmcs_config.size); /* KVM supports Enlightened VMCS v1 only */ - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else vmcs->hdr.revision_id = vmcs_config.revision_id; @@ -2964,7 +2937,7 @@ static __init int alloc_kvm_area(void) * still be marked with revision_id reported by * physical CPU. */ - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) vmcs->hdr.revision_id = vmcs_config.revision_id; per_cpu(vmxarea, cpu) = vmcs; @@ -3931,7 +3904,7 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR * bitmap has changed. */ - if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) { + if (kvm_is_using_evmcs()) { struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; if (evmcs->hv_enlightenments_control.msr_bitmap) @@ -4773,7 +4746,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) /* 22.2.1, 20.8.1 */ vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); - vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); set_cr4_guest_host_mask(vmx); @@ -5163,7 +5136,7 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) return true; - return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) && + return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); } @@ -5500,7 +5473,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) break; case 3: /* lmsw */ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); kvm_lmsw(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -6957,7 +6930,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) * real mode. */ return enable_unrestricted_guest || emulate_invalid_guest_state; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: return nested; case MSR_AMD64_VIRT_SPEC_CTRL: case MSR_AMD64_TSC_RATIO: @@ -7310,7 +7283,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); /* All fields are clean at this point */ - if (static_branch_unlikely(&enable_evmcs)) { + if (kvm_is_using_evmcs()) { current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; @@ -7440,7 +7413,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) * feature only for vmcs01, KVM currently isn't equipped to realize any * performance benefits from enabling it for vmcs02. */ - if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) && + if (kvm_is_using_evmcs() && (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; @@ -7558,7 +7531,7 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; - if (kvm_read_cr0(vcpu) & X86_CR0_CD) { + if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) { if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cache = MTRR_TYPE_WRBACK; else @@ -7744,6 +7717,13 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + if (boot_cpu_has(X86_FEATURE_IBPB)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, + !guest_has_pred_cmd_msr(vcpu)); + + if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, + !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); set_cr4_guest_host_mask(vmx); @@ -7776,9 +7756,11 @@ static u64 vmx_get_perf_capabilities(void) if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - x86_perf_get_lbr(&lbr); - if (lbr.nr) - perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { + x86_perf_get_lbr(&lbr); + if (lbr.nr) + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + } if (vmx_pebs_supported()) { perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; @@ -7918,6 +7900,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ break; + case x86_intercept_pause: + /* + * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides + * with vanilla NOPs in the emulator. Apply the interception + * check only to actual PAUSE instructions. Don't check + * PAUSE-loop-exiting, software can't expect a given PAUSE to + * exit, i.e. KVM is within its rights to allow L2 to execute + * the PAUSE. + */ + if ((info->rep_prefix != REPE_PREFIX) || + !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) + return X86EMUL_CONTINUE; + + break; + /* TODO: check more intercepts... */ default: break; @@ -8415,9 +8412,8 @@ static __init int hardware_setup(void) #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH && enable_ept) { - vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; - vmx_x86_ops.tlb_remote_flush_with_range = - hv_remote_flush_tlb_with_range; + vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; + vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; } #endif diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 2acdc54bc34b..9e66531861cf 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -369,7 +369,7 @@ struct vcpu_vmx { struct lbr_desc lbr_desc; /* Save desired MSR intercept (read: pass-through) state */ -#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15 +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16 struct { DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); @@ -640,6 +640,24 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) (1 << VCPU_EXREG_EXIT_INFO_1) | \ (1 << VCPU_EXREG_EXIT_INFO_2)) +static inline unsigned long vmx_l1_guest_owned_cr0_bits(void) +{ + unsigned long bits = KVM_POSSIBLE_CR0_GUEST_BITS; + + /* + * CR0.WP needs to be intercepted when KVM is shadowing legacy paging + * in order to construct shadow PTEs with the correct protections. + * Note! CR0.WP technically can be passed through to the guest if + * paging is disabled, but checking CR0.PG would generate a cyclical + * dependency of sorts due to forcing the caller to ensure CR0 holds + * the correct value prior to determining which CR0 bits can be owned + * by L1. Keep it simple and limit the optimization to EPT. + */ + if (!enable_ept) + bits &= ~X86_CR0_WP; + return bits; +} + static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index db95bde52998..ce47dc265f89 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -147,7 +147,7 @@ do_exception: static __always_inline u16 vmcs_read16(unsigned long field) { vmcs_check16(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_read16(field); return __vmcs_readl(field); } @@ -155,7 +155,7 @@ static __always_inline u16 vmcs_read16(unsigned long field) static __always_inline u32 vmcs_read32(unsigned long field) { vmcs_check32(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_read32(field); return __vmcs_readl(field); } @@ -163,7 +163,7 @@ static __always_inline u32 vmcs_read32(unsigned long field) static __always_inline u64 vmcs_read64(unsigned long field) { vmcs_check64(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_read64(field); #ifdef CONFIG_X86_64 return __vmcs_readl(field); @@ -175,7 +175,7 @@ static __always_inline u64 vmcs_read64(unsigned long field) static __always_inline unsigned long vmcs_readl(unsigned long field) { vmcs_checkl(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_read64(field); return __vmcs_readl(field); } @@ -222,7 +222,7 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val static __always_inline void vmcs_write16(unsigned long field, u16 value) { vmcs_check16(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write16(field, value); __vmcs_writel(field, value); @@ -231,7 +231,7 @@ static __always_inline void vmcs_write16(unsigned long field, u16 value) static __always_inline void vmcs_write32(unsigned long field, u32 value) { vmcs_check32(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write32(field, value); __vmcs_writel(field, value); @@ -240,7 +240,7 @@ static __always_inline void vmcs_write32(unsigned long field, u32 value) static __always_inline void vmcs_write64(unsigned long field, u64 value) { vmcs_check64(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write64(field, value); __vmcs_writel(field, value); @@ -252,7 +252,7 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value) static __always_inline void vmcs_writel(unsigned long field, unsigned long value) { vmcs_checkl(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write64(field, value); __vmcs_writel(field, value); @@ -262,7 +262,7 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, "vmcs_clear_bits does not support 64-bit fields"); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write32(field, evmcs_read32(field) & ~mask); __vmcs_writel(field, __vmcs_readl(field) & ~mask); @@ -272,7 +272,7 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, "vmcs_set_bits does not support 64-bit fields"); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write32(field, evmcs_read32(field) | mask); __vmcs_writel(field, __vmcs_readl(field) | mask); @@ -289,7 +289,7 @@ static inline void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_load(phys_addr); vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b1d82647195..ceb7c5e9cf9e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -196,7 +196,7 @@ bool __read_mostly eager_page_split = true; module_param(eager_page_split, bool, 0644); /* Enable/disable SMT_RSB bug mitigation */ -bool __read_mostly mitigate_smt_rsb; +static bool __read_mostly mitigate_smt_rsb; module_param(mitigate_smt_rsb, bool, 0444); /* @@ -804,8 +804,8 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, */ if ((fault->error_code & PFERR_PRESENT_MASK) && !(fault->error_code & PFERR_RSVD_MASK)) - kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, - fault_mmu->root.hpa); + kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address, + KVM_MMU_ROOT_CURRENT); fault_mmu->inject_page_fault(vcpu, fault); } @@ -843,7 +843,7 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) { - if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE)) return true; kvm_queue_exception(vcpu, UD_VECTOR); @@ -908,6 +908,24 @@ EXPORT_SYMBOL_GPL(load_pdptrs); void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) { + /* + * CR0.WP is incorporated into the MMU role, but only for non-nested, + * indirect shadow MMUs. If paging is disabled, no updates are needed + * as there are no permission bits to emulate. If TDP is enabled, the + * MMU's metadata needs to be updated, e.g. so that emulating guest + * translations does the right thing, but there's no need to unload the + * root as CR0.WP doesn't affect SPTEs. + */ + if ((cr0 ^ old_cr0) == X86_CR0_WP) { + if (!(cr0 & X86_CR0_PG)) + return; + + if (tdp_enabled) { + kvm_init_mmu(vcpu); + return; + } + } + if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); @@ -967,7 +985,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; if (!(cr0 & X86_CR0_PG) && - (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))) + (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) return 1; static_call(kvm_x86_set_cr0)(vcpu, cr0); @@ -989,7 +1007,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return; - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); @@ -1003,7 +1021,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (static_cpu_has(X86_FEATURE_PKU) && vcpu->arch.pkru != vcpu->arch.host_pkru && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || - kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) + kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) write_pkru(vcpu->arch.pkru); #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ } @@ -1017,14 +1035,14 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (static_cpu_has(X86_FEATURE_PKU) && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || - kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) { + kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.host_pkru); } #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); @@ -1180,9 +1198,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) - return 1; - /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) return 1; @@ -1229,7 +1244,7 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB * with PCIDE=0. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) return; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) @@ -1244,9 +1259,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) bool skip_tlb_flush = false; unsigned long pcid = 0; #ifdef CONFIG_X86_64 - bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - - if (pcid_enabled) { + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) { skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; cr3 &= ~X86_CR3_PCID_NOFLUSH; pcid = cr3 & X86_CR3_PCID_MASK; @@ -1545,39 +1558,41 @@ static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; static unsigned num_emulated_msrs; /* - * List of msr numbers which are used to expose MSR-based features that - * can be used by a hypervisor to validate requested CPU features. + * List of MSRs that control the existence of MSR-based features, i.e. MSRs + * that are effectively CPUID leafs. VMX MSRs are also included in the set of + * feature MSRs, but are handled separately to allow expedited lookups. */ -static const u32 msr_based_features_all[] = { - MSR_IA32_VMX_BASIC, - MSR_IA32_VMX_TRUE_PINBASED_CTLS, - MSR_IA32_VMX_PINBASED_CTLS, - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, - MSR_IA32_VMX_PROCBASED_CTLS, - MSR_IA32_VMX_TRUE_EXIT_CTLS, - MSR_IA32_VMX_EXIT_CTLS, - MSR_IA32_VMX_TRUE_ENTRY_CTLS, - MSR_IA32_VMX_ENTRY_CTLS, - MSR_IA32_VMX_MISC, - MSR_IA32_VMX_CR0_FIXED0, - MSR_IA32_VMX_CR0_FIXED1, - MSR_IA32_VMX_CR4_FIXED0, - MSR_IA32_VMX_CR4_FIXED1, - MSR_IA32_VMX_VMCS_ENUM, - MSR_IA32_VMX_PROCBASED_CTLS2, - MSR_IA32_VMX_EPT_VPID_CAP, - MSR_IA32_VMX_VMFUNC, - +static const u32 msr_based_features_all_except_vmx[] = { MSR_AMD64_DE_CFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, }; -static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + + (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; static unsigned int num_msr_based_features; /* + * All feature MSRs except uCode revID, which tracks the currently loaded uCode + * patch, are immutable once the vCPU model is defined. + */ +static bool kvm_is_immutable_feature_msr(u32 msr) +{ + int i; + + if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) + return true; + + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { + if (msr == msr_based_features_all_except_vmx[i]) + return msr != MSR_IA32_UCODE_REV; + } + + return false; +} + +/* * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM * does not yet virtualize. These include: * 10 - MISC_PACKAGE_CTRLS @@ -2194,6 +2209,22 @@ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { + u64 val; + + /* + * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does + * not support modifying the guest vCPU model on the fly, e.g. changing + * the nVMX capabilities while L2 is running is nonsensical. Ignore + * writes of the same value, e.g. to allow userspace to blindly stuff + * all MSRs when emulating RESET. + */ + if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) { + if (do_get_msr(vcpu, index, &val) || *data != val) + return -EINVAL; + + return 0; + } + return kvm_set_msr_ignored_check(vcpu, index, *data, true); } @@ -3616,9 +3647,40 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~kvm_caps.supported_perf_cap) return 1; + /* + * Note, this is not just a performance optimization! KVM + * disallows changing feature MSRs after the vCPU has run; PMU + * refresh will bug the VM if called after the vCPU has run. + */ + if (vcpu->arch.perf_capabilities == data) + break; + vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); - return 0; + break; + case MSR_IA32_PRED_CMD: + if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu)) + return 1; + + if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB)) + return 1; + if (!data) + break; + + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); + break; + case MSR_IA32_FLUSH_CMD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)) + return 1; + + if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH)) + return 1; + if (!data) + break; + + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + break; case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -4534,9 +4596,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 0; break; case KVM_CAP_XSAVE2: { - u64 guest_perm = xstate_get_guest_group_perm(); - - r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false); + r = xstate_required_size(kvm_get_filtered_xcr0(), false); if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; @@ -5036,7 +5096,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { + !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } @@ -5128,7 +5188,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; - events->nmi.pending = vcpu->arch.nmi_pending != 0; + events->nmi.pending = kvm_get_nr_pending_nmis(vcpu); events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); /* events->sipi_vector is never valid when reporting to user space */ @@ -5215,8 +5275,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; - if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) - vcpu->arch.nmi_pending = events->nmi.pending; + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) { + vcpu->arch.nmi_pending = 0; + atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending); + kvm_make_request(KVM_REQ_NMI, vcpu); + } static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && @@ -6024,11 +6087,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) -{ - return kvm->arch.n_max_mmu_pages; -} - static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { struct kvm_pic *pic = kvm->arch.vpic; @@ -6675,8 +6733,7 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; @@ -6714,9 +6771,6 @@ set_identity_unlock: case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); break; - case KVM_GET_NR_MMU_PAGES: - r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); - break; case KVM_CREATE_IRQCHIP: { mutex_lock(&kvm->lock); @@ -7021,6 +7075,18 @@ out: return r; } +static void kvm_probe_feature_msr(u32 msr_index) +{ + struct kvm_msr_entry msr = { + .index = msr_index, + }; + + if (kvm_get_msr_feature(&msr)) + return; + + msr_based_features[num_msr_based_features++] = msr_index; +} + static void kvm_probe_msr_to_save(u32 msr_index) { u32 dummy[2]; @@ -7096,7 +7162,7 @@ static void kvm_probe_msr_to_save(u32 msr_index) msrs_to_save[num_msrs_to_save++] = msr_index; } -static void kvm_init_msr_list(void) +static void kvm_init_msr_lists(void) { unsigned i; @@ -7122,15 +7188,11 @@ static void kvm_init_msr_list(void) emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; } - for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { - struct kvm_msr_entry msr; + for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++) + kvm_probe_feature_msr(i); - msr.index = msr_based_features_all[i]; - if (kvm_get_msr_feature(&msr)) - continue; - - msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; - } + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) + kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]); } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -8466,7 +8528,6 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - bool write_fault_to_shadow_pgtable, int emulation_type) { gpa_t gpa = cr2_or_gpa; @@ -8537,7 +8598,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * be fixed by unprotecting shadow page and it should * be reported to userspace. */ - return !write_fault_to_shadow_pgtable; + return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, @@ -8785,20 +8846,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int r; struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt; if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len))) return 1; vcpu->arch.l1tf_flush_l1d = true; - /* - * Clear write_fault_to_shadow_pgtable here to ensure it is - * never reused. - */ - write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; - vcpu->arch.write_fault_to_shadow_pgtable = false; - if (!(emulation_type & EMULTYPE_NO_DECODE)) { kvm_clear_exception_queue(vcpu); @@ -8819,7 +8872,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return 1; } if (reexecute_instruction(vcpu, cr2_or_gpa, - write_fault_to_spt, emulation_type)) return 1; @@ -8898,8 +8950,7 @@ restart: return 1; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, - emulation_type)) + if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type)) return 1; return handle_emulation_failure(vcpu, emulation_type); @@ -9477,7 +9528,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_caps.max_guest_tsc_khz = max; } kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; - kvm_init_msr_list(); + kvm_init_msr_lists(); return 0; out_unwind_ops: @@ -9808,7 +9859,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) vcpu->run->hypercall.args[0] = gpa; vcpu->run->hypercall.args[1] = npages; vcpu->run->hypercall.args[2] = attrs; - vcpu->run->hypercall.longmode = op_64_bit; + vcpu->run->hypercall.flags = 0; + if (op_64_bit) + vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE; + + WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ); vcpu->arch.complete_userspace_io = complete_hypercall_exit; return 0; } @@ -10170,19 +10225,46 @@ out: static void process_nmi(struct kvm_vcpu *vcpu) { - unsigned limit = 2; + unsigned int limit; /* - * x86 is limited to one NMI running, and one NMI pending after it. - * If an NMI is already in progress, limit further NMIs to just one. - * Otherwise, allow two (and we'll inject the first one immediately). + * x86 is limited to one NMI pending, but because KVM can't react to + * incoming NMIs as quickly as bare metal, e.g. if the vCPU is + * scheduled out, KVM needs to play nice with two queued NMIs showing + * up at the same time. To handle this scenario, allow two NMIs to be + * (temporarily) pending so long as NMIs are not blocked and KVM is not + * waiting for a previous NMI injection to complete (which effectively + * blocks NMIs). KVM will immediately inject one of the two NMIs, and + * will request an NMI window to handle the second NMI. */ if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) limit = 1; + else + limit = 2; + + /* + * Adjust the limit to account for pending virtual NMIs, which aren't + * tracked in vcpu->arch.nmi_pending. + */ + if (static_call(kvm_x86_is_vnmi_pending)(vcpu)) + limit--; vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); - kvm_make_request(KVM_REQ_EVENT, vcpu); + + if (vcpu->arch.nmi_pending && + (static_call(kvm_x86_set_vnmi_pending)(vcpu))) + vcpu->arch.nmi_pending--; + + if (vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + +/* Return total number of NMIs pending injection to the VM */ +int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.nmi_pending + + static_call(kvm_x86_is_vnmi_pending)(vcpu); } void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, @@ -13268,7 +13350,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return 1; } - pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE); switch (type) { case INVPCID_TYPE_INDIV_ADDR: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a8167b47b8c8..c544602d07a3 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -3,6 +3,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include <asm/fpu/xstate.h> #include <asm/mce.h> #include <asm/pvclock.h> #include "kvm_cache_regs.h" @@ -40,6 +41,14 @@ void kvm_spurious_fault(void); failed; \ }) +/* + * The first...last VMX feature MSRs that are emulated by KVM. This may or may + * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an + * associated feature that KVM supports for nested virtualization. + */ +#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC +#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC + #define KVM_DEFAULT_PLE_GAP 128 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 #define KVM_DEFAULT_PLE_WINDOW_GROW 2 @@ -83,6 +92,11 @@ static inline unsigned int __shrink_ple_window(unsigned int val, void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); +static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.last_vmentry_cpu != -1; +} + static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu) { return vcpu->arch.exception.pending || @@ -123,15 +137,15 @@ static inline bool kvm_exception_is_soft(unsigned int nr) static inline bool is_protmode(struct kvm_vcpu *vcpu) { - return kvm_read_cr0_bits(vcpu, X86_CR0_PE); + return kvm_is_cr0_bit_set(vcpu, X86_CR0_PE); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) +static inline bool is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - return vcpu->arch.efer & EFER_LMA; + return !!(vcpu->arch.efer & EFER_LMA); #else - return 0; + return false; #endif } @@ -171,19 +185,19 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; } -static inline int is_pae(struct kvm_vcpu *vcpu) +static inline bool is_pae(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); + return kvm_is_cr4_bit_set(vcpu, X86_CR4_PAE); } -static inline int is_pse(struct kvm_vcpu *vcpu) +static inline bool is_pse(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); + return kvm_is_cr4_bit_set(vcpu, X86_CR4_PSE); } -static inline int is_paging(struct kvm_vcpu *vcpu) +static inline bool is_paging(struct kvm_vcpu *vcpu) { - return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); + return likely(kvm_is_cr0_bit_set(vcpu, X86_CR0_PG)); } static inline bool is_pae_paging(struct kvm_vcpu *vcpu) @@ -193,7 +207,7 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu) static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48; + return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48; } static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu) @@ -315,6 +329,34 @@ extern struct kvm_caps kvm_caps; extern bool enable_pmu; +/* + * Get a filtered version of KVM's supported XCR0 that strips out dynamic + * features for which the current process doesn't (yet) have permission to use. + * This is intended to be used only when enumerating support to userspace, + * e.g. in KVM_GET_SUPPORTED_CPUID and KVM_CAP_XSAVE2, it does NOT need to be + * used to check/restrict guest behavior as KVM rejects KVM_SET_CPUID{2} if + * userspace attempts to enable unpermitted features. + */ +static inline u64 kvm_get_filtered_xcr0(void) +{ + u64 permitted_xcr0 = kvm_caps.supported_xcr0; + + BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); + + if (permitted_xcr0 & XFEATURE_MASK_USER_DYNAMIC) { + permitted_xcr0 &= xstate_get_guest_group_perm(); + + /* + * Treat XTILE_CFG as unsupported if the current process isn't + * allowed to use XTILE_DATA, as attempting to set XTILE_CFG in + * XCR0 without setting XTILE_DATA is architecturally illegal. + */ + if (!(permitted_xcr0 & XFEATURE_MASK_XTILE_DATA)) + permitted_xcr0 &= ~XFEATURE_MASK_XTILE_CFG; + } + return permitted_xcr0; +} + static inline bool kvm_mpx_supported(void) { return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) |