diff options
Diffstat (limited to 'arch/x86')
44 files changed, 1316 insertions, 1070 deletions
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2061ed1c398f..58cb9495e40f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -439,6 +439,7 @@ #define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 3be6a98751f0..c9f6a6c5de3c 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -205,8 +205,6 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image); #endif #endif -typedef void crash_vmclear_fn(void); -extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; extern void kdump_nmi_shootdown_cpus(void); #ifdef CONFIG_CRASH_HOTPLUG diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3bc146dfd38d..1a4def36d5bb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -288,13 +288,13 @@ struct kvm_kernel_irq_routing_entry; * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in * the given MMU context. This is a subset of the overall kvm_cpu_role to - * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating - * 2 bytes per gfn instead of 4 bytes per gfn. + * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows + * allocating 2 bytes per gfn instead of 4 bytes per gfn. * * Upper-level shadow pages having gptes are tracked for write-protection via - * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create - * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise - * gfn_track will overflow and explosions will ensure. + * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must + * not create more than 2^16-1 upper-level shadow pages at a single gfn, + * otherwise gfn_write_track will overflow and explosions will ensue. * * A unique shadow page (SP) for a gfn is created if and only if an existing SP * cannot be reused. The ability to reuse a SP is tracked by its role, which @@ -746,7 +746,6 @@ struct kvm_vcpu_arch { u64 smi_count; bool at_instruction_boundary; bool tpr_access_reporting; - bool xsaves_enabled; bool xfd_no_write_intercept; u64 ia32_xss; u64 microcode_version; @@ -831,6 +830,25 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 *cpuid_entries; struct kvm_hypervisor_cpuid kvm_cpuid; + /* + * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly + * when "struct kvm_vcpu_arch" is no longer defined in an + * arch/x86/include/asm header. The max is mostly arbitrary, i.e. + * can be increased as necessary. + */ +#define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG + + /* + * Track whether or not the guest is allowed to use features that are + * governed by KVM, where "governed" means KVM needs to manage state + * and/or explicitly enable the feature in hardware. Typically, but + * not always, governed features can be used by the guest if and only + * if both KVM and userspace want to expose the feature to the guest. + */ + struct { + DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES); + } governed_features; + u64 reserved_gpa_bits; int maxphyaddr; @@ -1005,7 +1023,7 @@ struct kvm_lpage_info { struct kvm_arch_memory_slot { struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; - unsigned short *gfn_track[KVM_PAGE_TRACK_MAX]; + unsigned short *gfn_write_track; }; /* @@ -1247,8 +1265,9 @@ struct kvm_arch { * create an NX huge page (without hanging the guest). */ struct list_head possible_nx_huge_pages; - struct kvm_page_track_notifier_node mmu_sp_tracker; +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING struct kvm_page_track_notifier_head track_notifier_head; +#endif /* * Protects marking pages unsync during page faults, as TDP MMU page * faults only take mmu_lock for read. For simplicity, the unsync @@ -1655,8 +1674,8 @@ struct kvm_x86_ops { u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu); /* * Retrieve somewhat arbitrary exit information. Intended to @@ -1795,8 +1814,8 @@ static inline struct kvm *kvm_arch_alloc_vm(void) #define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); -#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB -static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS +static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { if (kvm_x86_ops.flush_remote_tlbs && !static_call(kvm_x86_flush_remote_tlbs)(kvm)) @@ -1805,6 +1824,8 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) return -ENOTSUPP; } +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE + #define kvm_arch_pmi_in_guest(vcpu) \ ((vcpu) && (vcpu)->arch.handling_intr_from_guest) @@ -1833,7 +1854,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); -void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index eb186bc57f6a..3d040741044b 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -2,11 +2,9 @@ #ifndef _ASM_X86_KVM_PAGE_TRACK_H #define _ASM_X86_KVM_PAGE_TRACK_H -enum kvm_page_track_mode { - KVM_PAGE_TRACK_WRITE, - KVM_PAGE_TRACK_MAX, -}; +#include <linux/kvm_types.h> +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING /* * The notifier represented by @kvm_page_track_notifier_node is linked into * the head which will be notified when guest is triggering the track event. @@ -26,54 +24,39 @@ struct kvm_page_track_notifier_node { * It is called when guest is writing the write-tracked page * and write emulation is finished at that time. * - * @vcpu: the vcpu where the write access happened. * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. * @node: this node */ - void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes, struct kvm_page_track_notifier_node *node); + void (*track_write)(gpa_t gpa, const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node); + /* - * It is called when memory slot is being moved or removed - * users can drop write-protection for the pages in that memory slot + * Invoked when a memory region is removed from the guest. Or in KVM + * terms, when a memslot is deleted. * - * @kvm: the kvm where memory slot being moved or removed - * @slot: the memory slot being moved or removed - * @node: this node + * @gfn: base gfn of the region being removed + * @nr_pages: number of pages in the to-be-removed region + * @node: this node */ - void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_page_track_notifier_node *node); + void (*track_remove_region)(gfn_t gfn, unsigned long nr_pages, + struct kvm_page_track_notifier_node *node); }; -int kvm_page_track_init(struct kvm *kvm); -void kvm_page_track_cleanup(struct kvm *kvm); +int kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); +void kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); -bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); -int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); - -void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); -int kvm_page_track_create_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages); - -void kvm_slot_page_track_add_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -void kvm_slot_page_track_remove_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -bool kvm_slot_page_track_is_active(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, enum kvm_page_track_mode mode); +int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn); +int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn); +#else +/* + * Allow defining a node in a structure even if page tracking is disabled, e.g. + * to play nice with testing headers via direct inclusion from the command line. + */ +struct kvm_page_track_notifier_node {}; +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */ -void -kvm_page_track_register_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void -kvm_page_track_unregister_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); -void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 9177b4354c3f..6536873f8fc0 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,7 +25,14 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +typedef void (cpu_emergency_virt_cb)(void); +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); +#else +static inline void cpu_emergency_disable_virtualization(void) {} +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index e7c7379d6ac7..19bf955b67e0 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -288,6 +288,7 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) +#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) struct vmcb_seg { u16 selector; @@ -345,7 +346,7 @@ struct vmcb_save_area { u64 last_excp_from; u64 last_excp_to; u8 reserved_0x298[72]; - u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ + u64 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ } __packed; /* Save area definition for SEV-ES and SEV-SNP guests */ @@ -512,7 +513,7 @@ struct ghcb { } __packed; -#define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_VMCB_SAVE_AREA_SIZE 744 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 #define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h deleted file mode 100644 index 3b12e6b99412..000000000000 --- a/arch/x86/include/asm/virtext.h +++ /dev/null @@ -1,154 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* CPU virtualization extensions handling - * - * This should carry the code for handling CPU virtualization extensions - * that needs to live in the kernel core. - * - * Author: Eduardo Habkost <ehabkost@redhat.com> - * - * Copyright (C) 2008, Red Hat Inc. - * - * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. - */ -#ifndef _ASM_X86_VIRTEX_H -#define _ASM_X86_VIRTEX_H - -#include <asm/processor.h> - -#include <asm/vmx.h> -#include <asm/svm.h> -#include <asm/tlbflush.h> - -/* - * VMX functions: - */ - -static inline int cpu_has_vmx(void) -{ - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ -} - - -/** - * cpu_vmxoff() - Disable VMX on the current CPU - * - * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) - * - * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to - * atomically track post-VMXON state, e.g. this may be called in NMI context. - * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. - * faults are guaranteed to be due to the !post-VMXON check unless the CPU is - * magically in RM, VM86, compat mode, or at CPL>0. - */ -static inline int cpu_vmxoff(void) -{ - asm_volatile_goto("1: vmxoff\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "cc", "memory" : fault); - - cr4_clear_bits(X86_CR4_VMXE); - return 0; - -fault: - cr4_clear_bits(X86_CR4_VMXE); - return -EIO; -} - -static inline int cpu_vmx_enabled(void) -{ - return __read_cr4() & X86_CR4_VMXE; -} - -/** Disable VMX if it is enabled on the current CPU - * - * You shouldn't call this if cpu_has_vmx() returns 0. - */ -static inline void __cpu_emergency_vmxoff(void) -{ - if (cpu_vmx_enabled()) - cpu_vmxoff(); -} - -/** Disable VMX if it is supported and enabled on the current CPU - */ -static inline void cpu_emergency_vmxoff(void) -{ - if (cpu_has_vmx()) - __cpu_emergency_vmxoff(); -} - - - - -/* - * SVM functions: - */ - -/** Check if the CPU has SVM support - * - * You can use the 'msg' arg to get a message describing the problem, - * if the function returns zero. Simply pass NULL if you are not interested - * on the messages; gcc should take care of not generating code for - * the messages on this case. - */ -static inline int cpu_has_svm(const char **msg) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { - if (msg) - *msg = "not amd or hygon"; - return 0; - } - - if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { - if (msg) - *msg = "can't execute cpuid_8000000a"; - return 0; - } - - if (!boot_cpu_has(X86_FEATURE_SVM)) { - if (msg) - *msg = "svm not available"; - return 0; - } - return 1; -} - - -/** Disable SVM on the current CPU - * - * You should call this only if cpu_has_svm() returned true. - */ -static inline void cpu_svm_disable(void) -{ - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - if (efer & EFER_SVME) { - /* - * Force GIF=1 prior to disabling SVM to ensure INIT and NMI - * aren't blocked, e.g. if a fatal error occurred between CLGI - * and STGI. Note, STGI may #UD if SVM is disabled from NMI - * context between reading EFER and executing STGI. In that - * case, GIF must already be set, otherwise the NMI would have - * been blocked, so just eat the fault. - */ - asm_volatile_goto("1: stgi\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "memory" : fault); -fault: - wrmsrl(MSR_EFER, efer & ~EFER_SVME); - } -} - -/** Makes sure SVM is disabled, if it is supported on the CPU - */ -static inline void cpu_emergency_svm_disable(void) -{ - if (cpu_has_svm(NULL)) - cpu_svm_disable(); -} - -#endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0d02c4aafa6f..0e73616b82f3 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -71,7 +71,7 @@ #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) #define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX) -#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES) +#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES) #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 587c7743fd21..c92d88680dbf 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -48,27 +48,6 @@ struct crash_memmap_data { unsigned int type; }; -/* - * This is used to VMCLEAR all VMCSs loaded on the - * processor. And when loading kvm_intel module, the - * callback function pointer will be assigned. - * - * protected by rcu. - */ -crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; -EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); - -static inline void cpu_crash_vmclear_loaded_vmcss(void) -{ - crash_vmclear_fn *do_vmclear_operation = NULL; - - rcu_read_lock(); - do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); - if (do_vmclear_operation) - do_vmclear_operation(); - rcu_read_unlock(); -} - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -76,11 +55,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) crash_save_cpu(regs, cpu); /* - * VMCLEAR VMCSs loaded on all cpus if needed. - */ - cpu_crash_vmclear_loaded_vmcss(); - - /* * Disable Intel PT to stop its logging */ cpu_emergency_stop_pt(); @@ -133,11 +107,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_smp_send_stop(); - /* - * VMCLEAR VMCSs loaded on this cpu if needed. - */ - cpu_crash_vmclear_loaded_vmcss(); - cpu_emergency_disable_virtualization(); /* diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3adbe97015c1..830425e6d38e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -22,7 +22,6 @@ #include <asm/reboot_fixups.h> #include <asm/reboot.h> #include <asm/pci_x86.h> -#include <asm/virtext.h> #include <asm/cpu.h> #include <asm/nmi.h> #include <asm/smp.h> @@ -530,9 +529,54 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +/* RCU-protected callback to disable virtualization prior to reboot. */ +static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; + +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, callback); +} +EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); + +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); + +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_virt_cb *callback; + + /* + * IRQs must be disabled as KVM enables virtualization in hardware via + * function call IPIs, i.e. IRQs need to be disabled to guarantee + * virtualization stays disabled. + */ + lockdep_assert_irqs_disabled(); + + rcu_read_lock(); + callback = rcu_dereference(cpu_emergency_virt_callback); + if (callback) + callback(); + rcu_read_unlock(); +} + static void emergency_reboot_disable_virtualization(void) { - /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); /* @@ -545,7 +589,7 @@ static void emergency_reboot_disable_virtualization(void) * Do the NMI shootdown even if virtualization is off on _this_ CPU, as * other CPUs may have virtualization enabled. */ - if (cpu_has_vmx() || cpu_has_svm(NULL)) { + if (rcu_access_pointer(cpu_emergency_virt_callback)) { /* Safely force _this_ CPU out of VMX/SVM operation. */ cpu_emergency_disable_virtualization(); @@ -553,7 +597,9 @@ static void emergency_reboot_disable_virtualization(void) nmi_shootdown_cpus_on_restart(); } } - +#else +static void emergency_reboot_disable_virtualization(void) { } +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ void __attribute__((weak)) mach_reboot_fixups(void) { @@ -787,21 +833,9 @@ void machine_crash_shutdown(struct pt_regs *regs) } #endif - /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; -/* - * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during - * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if - * GIF=0, i.e. if the crash occurred between CLGI and STGI. - */ -void cpu_emergency_disable_virtualization(void) -{ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); -} - #if defined(CONFIG_SMP) static nmi_shootdown_cb shootdown_callback; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 89ca7f4c1464..ed90f148140d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -101,7 +101,7 @@ config X86_SGX_KVM config KVM_AMD tristate "KVM for AMD processors support" - depends on KVM + depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) help Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. @@ -138,6 +138,19 @@ config KVM_XEN If in doubt, say "N". +config KVM_PROVE_MMU + bool "Prove KVM MMU correctness" + depends on DEBUG_KERNEL + depends on KVM + depends on EXPERT + help + Enables runtime assertions in KVM's MMU that are too costly to enable + in anything remotely resembling a production environment, e.g. this + gates code that verifies a to-be-freed page table doesn't have any + present SPTEs. + + If in doubt, say "N". + config KVM_EXTERNAL_WRITE_TRACKING bool diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d3432687c9e6..0544e30b4946 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> +#include "linux/lockdep.h" #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> @@ -84,6 +85,18 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *e; int i; + /* + * KVM has a semi-arbitrary rule that querying the guest's CPUID model + * with IRQs disabled is disallowed. The CPUID model can legitimately + * have over one hundred entries, i.e. the lookup is slow, and IRQs are + * typically disabled in KVM only when KVM is in a performance critical + * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break + * if this rule is violated, this assertion is purely to flag potential + * performance issues. If this fires, consider moving the lookup out + * of the hotpath, e.g. by caching information during CPUID updates. + */ + lockdep_assert_irqs_enabled(); + for (i = 0; i < nent; i++) { e = &entries[i]; @@ -312,6 +325,27 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; + bool allow_gbpages; + + BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES); + bitmap_zero(vcpu->arch.governed_features.enabled, + KVM_MAX_NR_GOVERNED_FEATURES); + + /* + * If TDP is enabled, let the guest use GBPAGES if they're supported in + * hardware. The hardware page walker doesn't let KVM disable GBPAGES, + * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA + * walk for performance and complexity reasons. Not to mention KVM + * _can't_ solve the problem because GVA->GPA walks aren't visible to + * KVM once a TDP translation is installed. Mimic hardware behavior so + * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. + * If TDP is disabled, honor *only* guest CPUID as KVM has full control + * and can install smaller shadow pages if the host lacks 1GiB support. + */ + allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : + guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); + if (allow_gbpages) + kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES); best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { @@ -647,7 +681,8 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, - F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) + F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | + F(AMX_COMPLEX) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, @@ -1154,6 +1189,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; + case 0x80000005: + /* Pass host L1 cache and TLB info. */ + break; case 0x80000006: /* Drop reserved bits, pass host L2 cache and TLB info. */ entry->edx &= ~GENMASK(17, 16); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b1658c0de847..284fa4704553 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -232,4 +232,50 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); } +enum kvm_governed_features { +#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x, +#include "governed_features.h" + KVM_NR_GOVERNED_FEATURES +}; + +static __always_inline int kvm_governed_feature_index(unsigned int x86_feature) +{ + switch (x86_feature) { +#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x; +#include "governed_features.h" + default: + return -1; + } +} + +static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature) +{ + return kvm_governed_feature_index(x86_feature) >= 0; +} + +static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + + __set_bit(kvm_governed_feature_index(x86_feature), + vcpu->arch.governed_features.enabled); +} + +static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature)) + kvm_governed_feature_set(vcpu, x86_feature); +} + +static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + + return test_bit(kvm_governed_feature_index(x86_feature), + vcpu->arch.governed_features.enabled); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 936a397a08cd..2673cd5c46cb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1799,13 +1799,11 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->addr.mem, &op->val, op->bytes); - break; case OP_MEM_STR: return segmented_write(ctxt, op->addr.mem, op->data, op->bytes * op->count); - break; case OP_XMM: kvm_write_sse_reg(op->addr.xmm, &op->vec_val); break; diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h new file mode 100644 index 000000000000..423a73395c10 --- /dev/null +++ b/arch/x86/kvm/governed_features.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE) +BUILD_BUG() +#endif + +#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x) + +KVM_GOVERNED_X86_FEATURE(GBPAGES) +KVM_GOVERNED_X86_FEATURE(XSAVES) +KVM_GOVERNED_X86_FEATURE(VMX) +KVM_GOVERNED_X86_FEATURE(NRIPS) +KVM_GOVERNED_X86_FEATURE(TSCRATEMSR) +KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD) +KVM_GOVERNED_X86_FEATURE(LBRV) +KVM_GOVERNED_X86_FEATURE(PAUSEFILTER) +KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD) +KVM_GOVERNED_X86_FEATURE(VGIF) +KVM_GOVERNED_X86_FEATURE(VNMI) + +#undef KVM_GOVERNED_X86_FEATURE +#undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b28fd020066f..7c2dac6824e2 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1293,7 +1293,6 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_VP_ASSIST_PAGE: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_APIC_ACCESS_AVAILABLE; - break; case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: return hv_vcpu->cpuid_cache.features_eax & diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index ab65f3a47dfd..be7aeb9b8ea3 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -213,7 +213,6 @@ struct x86_emulate_ops { bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); - bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a983a16163b1..dcd60b39e794 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -376,7 +376,8 @@ void kvm_recalculate_apic_map(struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ - bool xapic_id_mismatch = false; + bool xapic_id_mismatch; + int r; /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) @@ -386,9 +387,14 @@ void kvm_recalculate_apic_map(struct kvm *kvm) "Dirty APIC map without an in-kernel local APIC"); mutex_lock(&kvm->arch.apic_map_lock); + +retry: /* - * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map - * (if clean) or the APIC registers (if dirty). + * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean) + * or the APIC registers (if dirty). Note, on retry the map may have + * not yet been marked dirty by whatever task changed a vCPU's x2APIC + * ID, i.e. the map may still show up as in-progress. In that case + * this task still needs to retry and complete its calculation. */ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty, DIRTY, UPDATE_IN_PROGRESS) == CLEAN) { @@ -397,6 +403,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm) return; } + /* + * Reset the mismatch flag between attempts so that KVM does the right + * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e. + * keep max_id strictly increasing. Disallowing max_id from shrinking + * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU + * with the highest x2APIC ID is toggling its APIC on and off. + */ + xapic_id_mismatch = false; + kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); @@ -415,9 +430,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (!kvm_apic_present(vcpu)) continue; - if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) { + r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch); + if (r) { kvfree(new); new = NULL; + if (r == -E2BIG) { + cond_resched(); + goto retry; + } + goto out; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 92d5a1924fc1..253fb2093d5d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -121,6 +121,8 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes); static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ec169f5c7dce..e1d011c67cc6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -25,6 +25,7 @@ #include "kvm_cache_regs.h" #include "smm.h" #include "kvm_emulate.h" +#include "page_track.h" #include "cpuid.h" #include "spte.h" @@ -53,7 +54,7 @@ #include <asm/io.h> #include <asm/set_memory.h> #include <asm/vmx.h> -#include <asm/kvm_page_track.h> + #include "trace.h" extern bool itlb_multihit_kvm_mitigation; @@ -115,11 +116,6 @@ static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; -#ifdef MMU_DEBUG -bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - #define PTE_PREFETCH_NUM 8 #include <trace/events/kvm.h> @@ -278,16 +274,12 @@ static inline bool kvm_available_flush_remote_tlbs_range(void) return kvm_x86_ops.flush_remote_tlbs_range; } -void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, - gfn_t nr_pages) +int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { - int ret = -EOPNOTSUPP; + if (!kvm_x86_ops.flush_remote_tlbs_range) + return -EOPNOTSUPP; - if (kvm_x86_ops.flush_remote_tlbs_range) - ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn, - nr_pages); - if (ret) - kvm_flush_remote_tlbs(kvm); + return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages); } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); @@ -490,7 +482,7 @@ retry: */ static void mmu_spte_set(u64 *sptep, u64 new_spte) { - WARN_ON(is_shadow_present_pte(*sptep)); + WARN_ON_ONCE(is_shadow_present_pte(*sptep)); __set_spte(sptep, new_spte); } @@ -502,7 +494,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - WARN_ON(!is_shadow_present_pte(new_spte)); + WARN_ON_ONCE(!is_shadow_present_pte(new_spte)); check_spte_writable_invariants(new_spte); if (!is_shadow_present_pte(old_spte)) { @@ -515,7 +507,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); - WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); return old_spte; } @@ -597,7 +589,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) * by a refcounted page, the refcount is elevated. */ page = kvm_pfn_to_refcounted_page(pfn); - WARN_ON(page && !page_count(page)); + WARN_ON_ONCE(page && !page_count(page)); if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); @@ -812,7 +804,7 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->disallow_lpage += count; - WARN_ON(linfo->disallow_lpage < 0); + WARN_ON_ONCE(linfo->disallow_lpage < 0); } } @@ -839,8 +831,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) /* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PG_LEVEL_4K) - return kvm_slot_page_track_add_page(kvm, slot, gfn, - KVM_PAGE_TRACK_WRITE); + return __kvm_write_track_add_gfn(kvm, slot, gfn); kvm_mmu_gfn_disallow_lpage(slot, gfn); @@ -886,8 +877,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); if (sp->role.level > PG_LEVEL_4K) - return kvm_slot_page_track_remove_page(kvm, slot, gfn, - KVM_PAGE_TRACK_WRITE); + return __kvm_write_track_remove_gfn(kvm, slot, gfn); kvm_mmu_gfn_allow_lpage(slot, gfn); } @@ -941,10 +931,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, int count = 0; if (!rmap_head->val) { - rmap_printk("%p %llx 0->1\n", spte, *spte); rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { - rmap_printk("%p %llx 1->many\n", spte, *spte); desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; @@ -953,7 +941,6 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, rmap_head->val = (unsigned long)desc | 1; ++count; } else { - rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); count = desc->tail_count + desc->spte_count; @@ -973,7 +960,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, return count; } -static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, +static void pte_list_desc_remove_entry(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i) { struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); @@ -984,7 +972,7 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, * when adding an entry and the previous head is full, and heads are * removed (this flow) when they become empty. */ - BUG_ON(j < 0); + KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm); /* * Replace the to-be-freed SPTE with the last valid entry from the head @@ -1009,35 +997,34 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(head_desc); } -static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void pte_list_remove(struct kvm *kvm, u64 *spte, + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int i; - if (!rmap_head->val) { - pr_err("%s: %p 0->BUG\n", __func__, spte); - BUG(); - } else if (!(rmap_head->val & 1)) { - rmap_printk("%p 1->0\n", spte); - if ((u64 *)rmap_head->val != spte) { - pr_err("%s: %p 1->BUG\n", __func__, spte); - BUG(); - } + if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) + return; + + if (!(rmap_head->val & 1)) { + if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) + return; + rmap_head->val = 0; } else { - rmap_printk("%p many->many\n", spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(rmap_head, desc, i); + pte_list_desc_remove_entry(kvm, rmap_head, + desc, i); return; } } desc = desc->more; } - pr_err("%s: %p many->many\n", __func__, spte); - BUG(); + + KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } } @@ -1045,7 +1032,7 @@ static void kvm_zap_one_rmap_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); - pte_list_remove(sptep, rmap_head); + pte_list_remove(kvm, sptep, rmap_head); } /* Return true if at least one SPTE was zapped, false otherwise */ @@ -1120,7 +1107,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - pte_list_remove(spte, rmap_head); + pte_list_remove(kvm, spte, rmap_head); } /* @@ -1212,7 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) struct kvm_mmu_page *sp; sp = sptep_to_sp(sptep); - WARN_ON(sp->role.level == PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); @@ -1241,8 +1228,6 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) !(pt_protect && is_mmu_writable_spte(spte))) return false; - rmap_printk("spte %p %llx\n", sptep, *sptep); - if (pt_protect) spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; @@ -1267,9 +1252,7 @@ static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; - rmap_printk("spte %p %llx\n", sptep, *sptep); - - MMU_WARN_ON(!spte_ad_enabled(spte)); + KVM_MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; return mmu_spte_update(sptep, spte); } @@ -1475,14 +1458,11 @@ static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 new_spte; kvm_pfn_t new_pfn; - WARN_ON(pte_huge(pte)); + WARN_ON_ONCE(pte_huge(pte)); new_pfn = pte_pfn(pte); restart: for_each_rmap_spte(rmap_head, &iter, sptep) { - rmap_printk("spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); - need_flush = true; if (pte_write(pte)) { @@ -1588,7 +1568,7 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, range->start, range->end - 1, &iterator) ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, - iterator.level, range->pte); + iterator.level, range->arg.pte); return ret; } @@ -1710,21 +1690,19 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return young; } -#ifdef MMU_DEBUG -static int is_empty_shadow_page(u64 *spt) +static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) { - u64 *pos; - u64 *end; +#ifdef CONFIG_KVM_PROVE_MMU + int i; - for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++) - if (is_shadow_present_pte(*pos)) { - printk(KERN_ERR "%s: %p %llx\n", __func__, - pos, *pos); - return 0; - } - return 1; -} + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { + if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i]))) + pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free", + sp->spt[i], &sp->spt[i], + kvm_mmu_page_get_gfn(sp, i)); + } #endif +} /* * This value is the sum of all of the kvm instances's @@ -1752,7 +1730,8 @@ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { - MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); + kvm_mmu_check_sptes_at_free(sp); + hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); @@ -1775,16 +1754,16 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, pte_list_add(cache, parent_pte, &sp->parent_ptes); } -static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, +static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { - pte_list_remove(parent_pte, &sp->parent_ptes); + pte_list_remove(kvm, parent_pte, &sp->parent_ptes); } -static void drop_parent_pte(struct kvm_mmu_page *sp, +static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { - mmu_page_remove_parent_pte(sp, parent_pte); + mmu_page_remove_parent_pte(kvm, sp, parent_pte); mmu_spte_clear_no_track(parent_pte); } @@ -1840,7 +1819,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) { --sp->unsync_children; - WARN_ON((int)sp->unsync_children < 0); + WARN_ON_ONCE((int)sp->unsync_children < 0); __clear_bit(idx, sp->unsync_child_bitmap); } @@ -1898,7 +1877,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - WARN_ON(!sp->unsync); + WARN_ON_ONCE(!sp->unsync); trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; @@ -2073,11 +2052,11 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec, if (pvec->nr == 0) return 0; - WARN_ON(pvec->page[0].idx != INVALID_INDEX); + WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX); sp = pvec->page[0].sp; level = sp->role.level; - WARN_ON(level == PG_LEVEL_4K); + WARN_ON_ONCE(level == PG_LEVEL_4K); parents->parent[level-2] = sp; @@ -2099,7 +2078,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) if (!sp) return; - WARN_ON(idx == INVALID_INDEX); + WARN_ON_ONCE(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; } while (!sp->unsync_children); @@ -2220,7 +2199,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, if (ret < 0) break; - WARN_ON(!list_empty(&invalid_list)); + WARN_ON_ONCE(!list_empty(&invalid_list)); if (ret > 0) kvm_flush_remote_tlbs(kvm); } @@ -2499,7 +2478,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (child->role.access == direct_access) return; - drop_parent_pte(child, sptep); + drop_parent_pte(vcpu->kvm, child, sptep); kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); } } @@ -2517,7 +2496,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, drop_spte(kvm, spte); } else { child = spte_to_child_sp(pte); - drop_parent_pte(child, spte); + drop_parent_pte(kvm, child, spte); /* * Recursively zap nested TDP SPs, parentless SPs are @@ -2548,13 +2527,13 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, return zapped; } -static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) - drop_parent_pte(sp, sptep); + drop_parent_pte(kvm, sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, @@ -2593,7 +2572,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); - kvm_mmu_unlink_parents(sp); + kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. */ list_unstable = *nr_zapped; @@ -2675,7 +2654,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); list_for_each_entry_safe(sp, nsp, invalid_list, link) { - WARN_ON(!sp->role.invalid || sp->root_count); + WARN_ON_ONCE(!sp->role.invalid || sp->root_count); kvm_mmu_free_shadow_page(sp); } } @@ -2775,12 +2754,9 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); int r; - pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; write_lock(&kvm->mmu_lock); for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - pgprintk("%s: gfn %llx role %x\n", __func__, gfn, - sp->role.word); r = 1; kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } @@ -2831,7 +2807,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_gfn_is_write_tracked(kvm, slot, gfn)) return -EPERM; /* @@ -2873,7 +2849,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, continue; } - WARN_ON(sp->role.level != PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(kvm, sp); } if (locked) @@ -2938,9 +2914,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, bool prefetch = !fault || fault->prefetch; bool write_fault = fault && fault->write; - pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, - *sptep, write_fault, gfn); - if (unlikely(is_noslot_pfn(pfn))) { vcpu->stat.pf_mmio_spte_created++; mark_mmio_spte(vcpu, sptep, gfn, pte_access); @@ -2957,11 +2930,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, u64 pte = *sptep; child = spte_to_child_sp(pte); - drop_parent_pte(child, sptep); + drop_parent_pte(vcpu->kvm, child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { - pgprintk("hfn old %llx new %llx\n", - spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep); flush = true; } else @@ -2986,8 +2957,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (flush) kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); - pgprintk("%s: setting spte %llx\n", __func__, *sptep); - if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); rmap_add(vcpu, slot, sptep, gfn, pte_access); @@ -3033,7 +3002,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *spte, *start = NULL; int i; - WARN_ON(!sp->role.direct); + WARN_ON_ONCE(!sp->role.direct); i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; @@ -3574,12 +3543,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - /* - * The "root" may be a special root, e.g. a PAE entry, treat it as a - * SPTE to ensure any non-PA bits are dropped. - */ - sp = spte_to_child_sp(*root_hpa); - if (WARN_ON(!sp)) + sp = root_to_sp(*root_hpa); + if (WARN_ON_ONCE(!sp)) return; if (is_tdp_mmu_page(sp)) @@ -3624,7 +3589,9 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, &invalid_list); if (free_active_root) { - if (to_shadow_page(mmu->root.hpa)) { + if (kvm_mmu_is_dummy_root(mmu->root.hpa)) { + /* Nothing to cleanup for dummy roots. */ + } else if (root_to_sp(mmu->root.hpa)) { mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list); } else if (mmu->pae_root) { for (i = 0; i < 4; ++i) { @@ -3648,6 +3615,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; + struct kvm_mmu_page *sp; hpa_t root_hpa; int i; @@ -3662,8 +3630,8 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) if (!VALID_PAGE(root_hpa)) continue; - if (!to_shadow_page(root_hpa) || - to_shadow_page(root_hpa)->role.guest_mode) + sp = root_to_sp(root_hpa); + if (!sp || sp->role.guest_mode) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } @@ -3671,19 +3639,6 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) } EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); - -static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) -{ - int ret = 0; - - if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - ret = 1; - } - - return ret; -} - static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, u8 level) { @@ -3821,8 +3776,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); root_gfn = root_pgd >> PAGE_SHIFT; - if (mmu_check_root(vcpu, root_gfn)) - return 1; + if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { + mmu->root.hpa = kvm_mmu_get_dummy_root(); + return 0; + } /* * On SVM, reading PDPTRs might access guest memory, which might fault @@ -3834,8 +3791,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (!(pdptrs[i] & PT_PRESENT_MASK)) continue; - if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) - return 1; + if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT)) + pdptrs[i] = 0; } } @@ -4002,7 +3959,7 @@ static bool is_unsync_root(hpa_t root) { struct kvm_mmu_page *sp; - if (!VALID_PAGE(root)) + if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root)) return false; /* @@ -4018,7 +3975,7 @@ static bool is_unsync_root(hpa_t root) * requirement isn't satisfied. */ smp_rmb(); - sp = to_shadow_page(root); + sp = root_to_sp(root); /* * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the @@ -4048,11 +4005,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root.hpa; - sp = to_shadow_page(root); if (!is_unsync_root(root)) return; + sp = root_to_sp(root); + write_lock(&vcpu->kvm->mmu_lock); mmu_sync_children(vcpu, sp, true); write_unlock(&vcpu->kvm->mmu_lock); @@ -4194,7 +4152,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) return RET_PF_EMULATE; reserved = get_mmio_spte(vcpu, addr, &spte); - if (WARN_ON(reserved)) + if (WARN_ON_ONCE(reserved)) return -EINVAL; if (is_mmio_spte(spte)) { @@ -4232,7 +4190,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ - if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn)) return true; return false; @@ -4382,7 +4340,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, static bool is_page_fault_stale(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa); + struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); /* Special roots, e.g. pae_root, are not backed by shadow pages. */ if (sp && is_obsolete_sp(vcpu->kvm, sp)) @@ -4407,6 +4365,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { int r; + /* Dummy roots are used only for shadowing bad guest roots. */ + if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) + return RET_PF_RETRY; + if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; @@ -4443,8 +4405,6 @@ out_unlock: static int nonpaging_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code); - /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ fault->max_level = PG_LEVEL_2M; return direct_page_fault(vcpu, fault); @@ -4562,9 +4522,19 @@ static void nonpaging_init_context(struct kvm_mmu *context) static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { - return (role.direct || pgd == root->pgd) && - VALID_PAGE(root->hpa) && - role.word == to_shadow_page(root->hpa)->role.word; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(root->hpa)) + return false; + + if (!role.direct && pgd != root->pgd) + return false; + + sp = root_to_sp(root->hpa); + if (WARN_ON_ONCE(!sp)) + return false; + + return role.word == sp->role.word; } /* @@ -4634,11 +4604,10 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { /* - * For now, limit the caching to 64-bit hosts+VMs in order to avoid - * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs - * later if necessary. + * Limit reuse to 64-bit hosts+VMs without "special" roots in order to + * avoid having to deal with PDPTEs and other complexities. */ - if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa)) + if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa)) kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); if (VALID_PAGE(mmu->root.hpa)) @@ -4684,9 +4653,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) * If this is a direct root page, it doesn't have a write flooding * count. Otherwise, clear the write flooding count. */ - if (!new_role.direct) - __clear_sp_write_flooding_count( - to_shadow_page(vcpu->arch.mmu->root.hpa)); + if (!new_role.direct) { + struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); + + if (!WARN_ON_ONCE(!sp)) + __clear_sp_write_flooding_count(sp); + } } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); @@ -4808,28 +4780,13 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, } } -static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) -{ - /* - * If TDP is enabled, let the guest use GBPAGES if they're supported in - * hardware. The hardware page walker doesn't let KVM disable GBPAGES, - * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA - * walk for performance and complexity reasons. Not to mention KVM - * _can't_ solve the problem because GVA->GPA walks aren't visible to - * KVM once a TDP translation is installed. Mimic hardware behavior so - * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. - */ - return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); -} - static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->cpu_role.base.level, is_efer_nx(context), - guest_can_use_gbpages(vcpu), + guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), guest_cpuid_is_amd_or_hygon(vcpu)); } @@ -4906,7 +4863,8 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, - guest_can_use_gbpages(vcpu), is_pse, is_amd); + guest_can_use(vcpu, X86_FEATURE_GBPAGES), + is_pse, is_amd); if (!shadow_me_mask) return; @@ -5467,8 +5425,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * physical address properties) in a single VM would require tracking * all relevant CPUID information in kvm_mmu_page_role. That is very * undesirable as it would increase the memory requirements for - * gfn_track (see struct kvm_mmu_page_role comments). For now that - * problem is swept under the rug; KVM's CPUID API is horrific and + * gfn_write_track (see struct kvm_mmu_page_role comments). For now + * that problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ vcpu->arch.root_mmu.root_role.word = 0; @@ -5531,9 +5489,9 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); - WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa)); + WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa)); kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa)); + WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa)); vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); } @@ -5546,16 +5504,21 @@ static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa) /* * When freeing obsolete roots, treat roots as obsolete if they don't - * have an associated shadow page. This does mean KVM will get false + * have an associated shadow page, as it's impossible to determine if + * such roots are fresh or stale. This does mean KVM will get false * positives and free roots that don't strictly need to be freed, but * such false positives are relatively rare: * - * (a) only PAE paging and nested NPT has roots without shadow pages + * (a) only PAE paging and nested NPT have roots without shadow pages + * (or any shadow paging flavor with a dummy root, see note below) * (b) remote reloads due to a memslot update obsoletes _all_ roots * (c) KVM doesn't track previous roots for PAE paging, and the guest * is unlikely to zap an in-use PGD. + * + * Note! Dummy roots are unique in that they are obsoleted by memslot + * _creation_! See also FNAME(fetch). */ - sp = to_shadow_page(root_hpa); + sp = root_to_sp(root_hpa); return !sp || is_obsolete_sp(kvm, sp); } @@ -5634,9 +5597,6 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, { unsigned offset, pte_size, misaligned; - pgprintk("misaligned: gpa %llx bytes %d role %x\n", - gpa, bytes, sp->role.word); - offset = offset_in_page(gpa); pte_size = sp->role.has_4_byte_gpte ? 4 : 8; @@ -5684,9 +5644,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) return spte; } -static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes, - struct kvm_page_track_notifier_node *node) +void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -5702,8 +5661,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; - pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5742,7 +5699,18 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) + /* + * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP + * checks when emulating instructions that triggers implicit access. + * WARN if hardware generates a fault with an error code that collides + * with the KVM-defined value. Clear the flag and continue on, i.e. + * don't terminate the VM, as KVM can't possibly be relying on a flag + * that KVM doesn't know about. + */ + if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS)) + error_code &= ~PFERR_IMPLICIT_ACCESS; + + if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; r = RET_PF_INVALID; @@ -6099,7 +6067,7 @@ restart: * pages. Skip the bogus page, otherwise we'll get stuck in an * infinite loop if the page gets put back on the list (again). */ - if (WARN_ON(sp->role.invalid)) + if (WARN_ON_ONCE(sp->role.invalid)) continue; /* @@ -6199,16 +6167,8 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - struct kvm_page_track_notifier_node *node) -{ - kvm_mmu_zap_all_fast(kvm); -} - int kvm_mmu_init_vm(struct kvm *kvm) { - struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; int r; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); @@ -6222,10 +6182,6 @@ int kvm_mmu_init_vm(struct kvm *kvm) return r; } - node->track_write = kvm_mmu_pte_write; - node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; - kvm_page_track_register_notifier(kvm, node); - kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; @@ -6246,10 +6202,6 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm) void kvm_mmu_uninit_vm(struct kvm *kvm) { - struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; - - kvm_page_track_unregister_notifier(kvm, node); - if (tdp_mmu_enabled) kvm_mmu_uninit_tdp_mmu(kvm); @@ -6670,7 +6622,7 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, */ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_flush_remote_tlbs_memslot(kvm, slot); } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, @@ -6689,20 +6641,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, } } -void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - const struct kvm_memory_slot *memslot) -{ - /* - * All current use cases for flushing the TLBs for a specific memslot - * related to dirty logging, and many do the TLB flush out of mmu_lock. - * The interaction between the various operations on memslot must be - * serialized by slots_locks to ensure the TLB flush from one operation - * is observed by any other operation on the same memslot. - */ - lockdep_assert_held(&kvm->slots_lock); - kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages); -} - void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot) { @@ -6732,7 +6670,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, */ } -void kvm_mmu_zap_all(struct kvm *kvm) +static void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -6741,7 +6679,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) write_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (WARN_ON(sp->role.invalid)) + if (WARN_ON_ONCE(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; @@ -6757,9 +6695,20 @@ restart: write_unlock(&kvm->mmu_lock); } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + kvm_mmu_zap_all(kvm); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_zap_all_fast(kvm); +} + void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { - WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); + WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); gen &= MMIO_SPTE_GEN_MASK; @@ -6862,7 +6811,7 @@ static void mmu_destroy_caches(void) static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) - return sprintf(buffer, "never\n"); + return sysfs_emit(buffer, "never\n"); return param_get_bool(buffer, kp); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index d39af5639ce9..b102014e2c60 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -6,18 +6,10 @@ #include <linux/kvm_host.h> #include <asm/kvm_host.h> -#undef MMU_DEBUG - -#ifdef MMU_DEBUG -extern bool dbg; - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0) -#define MMU_WARN_ON(x) WARN_ON(x) +#ifdef CONFIG_KVM_PROVE_MMU +#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x) #else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) -#define MMU_WARN_ON(x) do { } while (0) +#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x) #endif /* Page table builder macros common to shadow (host) PTEs and guest PTEs. */ @@ -44,6 +36,16 @@ extern bool dbg; #define INVALID_PAE_ROOT 0 #define IS_VALID_PAE_ROOT(x) (!!(x)) +static inline hpa_t kvm_mmu_get_dummy_root(void) +{ + return my_zero_pfn(0) << PAGE_SHIFT; +} + +static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page) +{ + return is_zero_pfn(shadow_page >> PAGE_SHIFT); +} + typedef u64 __rcu *tdp_ptep_t; struct kvm_mmu_page { @@ -170,9 +172,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); -void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, - gfn_t nr_pages); - /* Flush the given page (huge or not) of guest memory. */ static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level) { diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 0a2ac438d647..c87da11f3a04 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -12,13 +12,13 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/lockdep.h> #include <linux/kvm_host.h> #include <linux/rculist.h> -#include <asm/kvm_page_track.h> - #include "mmu.h" #include "mmu_internal.h" +#include "page_track.h" bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) { @@ -28,103 +28,64 @@ bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { - int i; - - for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - kvfree(slot->arch.gfn_track[i]); - slot->arch.gfn_track[i] = NULL; - } + kvfree(slot->arch.gfn_write_track); + slot->arch.gfn_write_track = NULL; } -int kvm_page_track_create_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages) +static int __kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot, + unsigned long npages) { - int i; - - for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - if (i == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm)) - continue; - - slot->arch.gfn_track[i] = - __vcalloc(npages, sizeof(*slot->arch.gfn_track[i]), - GFP_KERNEL_ACCOUNT); - if (!slot->arch.gfn_track[i]) - goto track_free; - } + const size_t size = sizeof(*slot->arch.gfn_write_track); - return 0; + if (!slot->arch.gfn_write_track) + slot->arch.gfn_write_track = __vcalloc(npages, size, + GFP_KERNEL_ACCOUNT); -track_free: - kvm_page_track_free_memslot(slot); - return -ENOMEM; + return slot->arch.gfn_write_track ? 0 : -ENOMEM; } -static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned long npages) { - if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX) - return false; + if (!kvm_page_track_write_tracking_enabled(kvm)) + return 0; - return true; + return __kvm_page_track_write_tracking_alloc(slot, npages); } int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot) { - unsigned short *gfn_track; - - if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE]) - return 0; - - gfn_track = __vcalloc(slot->npages, sizeof(*gfn_track), - GFP_KERNEL_ACCOUNT); - if (gfn_track == NULL) - return -ENOMEM; - - slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track; - return 0; + return __kvm_page_track_write_tracking_alloc(slot, slot->npages); } -static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode, short count) +static void update_gfn_write_track(struct kvm_memory_slot *slot, gfn_t gfn, + short count) { int index, val; index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); - val = slot->arch.gfn_track[mode][index]; + val = slot->arch.gfn_write_track[index]; - if (WARN_ON(val + count < 0 || val + count > USHRT_MAX)) + if (WARN_ON_ONCE(val + count < 0 || val + count > USHRT_MAX)) return; - slot->arch.gfn_track[mode][index] += count; + slot->arch.gfn_write_track[index] += count; } -/* - * add guest page to the tracking pool so that corresponding access on that - * page will be intercepted. - * - * It should be called under the protection both of mmu-lock and kvm->srcu - * or kvm->slots_lock. - * - * @kvm: the guest instance we are interested in. - * @slot: the @gfn belongs to. - * @gfn: the guest page. - * @mode: tracking mode, currently only write track is supported. - */ -void kvm_slot_page_track_add_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode) +void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn) { + lockdep_assert_held_write(&kvm->mmu_lock); - if (WARN_ON(!page_track_mode_is_valid(mode))) - return; + lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) || + srcu_read_lock_held(&kvm->srcu)); - if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm))) + if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm)) return; - update_gfn_track(slot, gfn, mode, 1); + update_gfn_write_track(slot, gfn, 1); /* * new track stops large page mapping for the @@ -132,37 +93,22 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, */ kvm_mmu_gfn_disallow_lpage(slot, gfn); - if (mode == KVM_PAGE_TRACK_WRITE) - if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) - kvm_flush_remote_tlbs(kvm); + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) + kvm_flush_remote_tlbs(kvm); } -EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); -/* - * remove the guest page from the tracking pool which stops the interception - * of corresponding access on that page. It is the opposed operation of - * kvm_slot_page_track_add_page(). - * - * It should be called under the protection both of mmu-lock and kvm->srcu - * or kvm->slots_lock. - * - * @kvm: the guest instance we are interested in. - * @slot: the @gfn belongs to. - * @gfn: the guest page. - * @mode: tracking mode, currently only write track is supported. - */ -void kvm_slot_page_track_remove_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode) +void __kvm_write_track_remove_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) { - if (WARN_ON(!page_track_mode_is_valid(mode))) - return; + lockdep_assert_held_write(&kvm->mmu_lock); - if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm))) + lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) || + srcu_read_lock_held(&kvm->srcu)); + + if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm)) return; - update_gfn_track(slot, gfn, mode, -1); + update_gfn_write_track(slot, gfn, -1); /* * allow large page mapping for the tracked page @@ -170,31 +116,26 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, */ kvm_mmu_gfn_allow_lpage(slot, gfn); } -EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. */ -bool kvm_slot_page_track_is_active(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, enum kvm_page_track_mode mode) +bool kvm_gfn_is_write_tracked(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn) { int index; - if (WARN_ON(!page_track_mode_is_valid(mode))) - return false; - if (!slot) return false; - if (mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm)) + if (!kvm_page_track_write_tracking_enabled(kvm)) return false; index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); - return !!READ_ONCE(slot->arch.gfn_track[mode][index]); + return !!READ_ONCE(slot->arch.gfn_write_track[index]); } +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING void kvm_page_track_cleanup(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; @@ -216,17 +157,22 @@ int kvm_page_track_init(struct kvm *kvm) * register the notifier so that event interception for the tracked guest * pages can be received. */ -void -kvm_page_track_register_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n) +int kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; + if (!kvm || kvm->mm != current->mm) + return -ESRCH; + + kvm_get_kvm(kvm); + head = &kvm->arch.track_notifier_head; write_lock(&kvm->mmu_lock); hlist_add_head_rcu(&n->node, &head->track_notifier_list); write_unlock(&kvm->mmu_lock); + return 0; } EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); @@ -234,9 +180,8 @@ EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); * stop receiving the event interception. It is the opposed operation of * kvm_page_track_register_notifier(). */ -void -kvm_page_track_unregister_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n) +void kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; @@ -246,6 +191,8 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, hlist_del_rcu(&n->node); write_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); + + kvm_put_kvm(kvm); } EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); @@ -256,34 +203,30 @@ EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); * The node should figure out if the written page is the one that node is * interested in by itself. */ -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes) +void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes) { struct kvm_page_track_notifier_head *head; struct kvm_page_track_notifier_node *n; int idx; - head = &vcpu->kvm->arch.track_notifier_head; + head = &kvm->arch.track_notifier_head; if (hlist_empty(&head->track_notifier_list)) return; idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, - srcu_read_lock_held(&head->track_srcu)) + srcu_read_lock_held(&head->track_srcu)) if (n->track_write) - n->track_write(vcpu, gpa, new, bytes, n); + n->track_write(gpa, new, bytes, n); srcu_read_unlock(&head->track_srcu, idx); } /* - * Notify the node that memory slot is being removed or moved so that it can - * drop write-protection for the pages in the memory slot. - * - * The node should figure out it has any write-protected pages in this slot - * by itself. + * Notify external page track nodes that a memory region is being removed from + * the VM, e.g. so that users can free any associated metadata. */ -void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_page_track_notifier_head *head; struct kvm_page_track_notifier_node *n; @@ -296,8 +239,69 @@ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, - srcu_read_lock_held(&head->track_srcu)) - if (n->track_flush_slot) - n->track_flush_slot(kvm, slot, n); + srcu_read_lock_held(&head->track_srcu)) + if (n->track_remove_region) + n->track_remove_region(slot->base_gfn, slot->npages, n); srcu_read_unlock(&head->track_srcu, idx); } + +/* + * add guest page to the tracking pool so that corresponding access on that + * page will be intercepted. + * + * @kvm: the guest instance we are interested in. + * @gfn: the guest page. + */ +int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + + slot = gfn_to_memslot(kvm, gfn); + if (!slot) { + srcu_read_unlock(&kvm->srcu, idx); + return -EINVAL; + } + + write_lock(&kvm->mmu_lock); + __kvm_write_track_add_gfn(kvm, slot, gfn); + write_unlock(&kvm->mmu_lock); + + srcu_read_unlock(&kvm->srcu, idx); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_track_add_gfn); + +/* + * remove the guest page from the tracking pool which stops the interception + * of corresponding access on that page. + * + * @kvm: the guest instance we are interested in. + * @gfn: the guest page. + */ +int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + + slot = gfn_to_memslot(kvm, gfn); + if (!slot) { + srcu_read_unlock(&kvm->srcu, idx); + return -EINVAL; + } + + write_lock(&kvm->mmu_lock); + __kvm_write_track_remove_gfn(kvm, slot, gfn); + write_unlock(&kvm->mmu_lock); + + srcu_read_unlock(&kvm->srcu, idx); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_track_remove_gfn); +#endif diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h new file mode 100644 index 000000000000..d4d72ed999b1 --- /dev/null +++ b/arch/x86/kvm/mmu/page_track.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_PAGE_TRACK_H +#define __KVM_X86_PAGE_TRACK_H + +#include <linux/kvm_host.h> + +#include <asm/kvm_page_track.h> + + +bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); +int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); + +void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned long npages); + +void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn); +void __kvm_write_track_remove_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); + +bool kvm_gfn_is_write_tracked(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn); + +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING +int kvm_page_track_init(struct kvm *kvm); +void kvm_page_track_cleanup(struct kvm *kvm); + +void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes); +void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot); + +static inline bool kvm_page_track_has_external_user(struct kvm *kvm) +{ + return !hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list); +} +#else +static inline int kvm_page_track_init(struct kvm *kvm) { return 0; } +static inline void kvm_page_track_cleanup(struct kvm *kvm) { } + +static inline void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, + const u8 *new, int bytes) { } +static inline void kvm_page_track_delete_slot(struct kvm *kvm, + struct kvm_memory_slot *slot) { } + +static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; } + +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */ + +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + __kvm_page_track_write(vcpu->kvm, gpa, new, bytes); + + kvm_mmu_track_write(vcpu, gpa, new, bytes); +} + +#endif /* __KVM_X86_PAGE_TRACK_H */ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0662e0278e70..c85255073f67 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -338,7 +338,6 @@ retry_walk: } #endif walker->max_level = walker->level; - ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); /* * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging @@ -348,9 +347,21 @@ retry_walk: nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; pte_access = ~0; + + /* + * Queue a page fault for injection if this assertion fails, as callers + * assume that walker.fault contains sane info on a walk failure. I.e. + * avoid making the situation worse by inducing even worse badness + * between when the assertion fails and when KVM kicks the vCPU out to + * userspace (because the VM is bugged). + */ + if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm)) + goto error; + ++walker->level; do { + struct kvm_memory_slot *slot; unsigned long host_addr; pt_access = pte_access; @@ -381,7 +392,11 @@ retry_walk: if (unlikely(real_gpa == INVALID_GPA)) return 0; - host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa), + slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa)); + if (!kvm_is_visible_memslot(slot)) + goto error; + + host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa), &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -456,9 +471,6 @@ retry_walk: goto retry_walk; } - pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, walker->pte_access, - walker->pt_access[walker->level - 1]); return 1; error: @@ -529,8 +541,6 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; - pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); @@ -638,8 +648,19 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) + if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) + goto out_gpte_changed; + + /* + * Load a new root and retry the faulting instruction in the extremely + * unlikely scenario that the guest root gfn became visible between + * loading a dummy root and handling the resulting page fault, e.g. if + * userspace create a memslot in the interim. + */ + if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { + kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); goto out_gpte_changed; + } for_each_shadow_entry(vcpu, fault->addr, it) { gfn_t table_gfn; @@ -758,7 +779,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault struct guest_walker walker; int r; - pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); WARN_ON_ONCE(fault->is_tdp); /* @@ -773,7 +793,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { - pgprintk("%s: guest page fault\n", __func__); if (!fault->prefetch) kvm_inject_emulated_page_fault(vcpu, &walker.fault); @@ -837,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) offset = sp->role.quadrant << SPTE_LEVEL_BITS; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index cf2c6426a6fc..4a599130e9c9 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -61,7 +61,7 @@ static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; - WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; @@ -221,8 +221,6 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * shadow pages and unsync'ing pages is not allowed. */ if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", - __func__, gfn); wrprot = true; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); @@ -242,7 +240,7 @@ out: if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { /* Enforced by kvm_mmu_hugepage_adjust. */ - WARN_ON(level > PG_LEVEL_4K); + WARN_ON_ONCE(level > PG_LEVEL_4K); mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 1279db2eab44..a129951c9a88 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -3,6 +3,7 @@ #ifndef KVM_X86_MMU_SPTE_H #define KVM_X86_MMU_SPTE_H +#include "mmu.h" #include "mmu_internal.h" /* @@ -236,6 +237,18 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline struct kvm_mmu_page *root_to_sp(hpa_t root) +{ + if (kvm_mmu_is_dummy_root(root)) + return NULL; + + /* + * The "root" may be a special root, e.g. a PAE entry, treat it as a + * SPTE to ensure any non-PA bits are dropped. + */ + return spte_to_child_sp(root); +} + static inline bool is_mmio_spte(u64 spte) { return (spte & shadow_mmio_mask) == shadow_mmio_value && @@ -265,13 +278,13 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON(!is_shadow_present_pte(spte)); + KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED; } static inline bool spte_ad_need_write_protect(u64 spte) { - MMU_WARN_ON(!is_shadow_present_pte(spte)); + KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); /* * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0', * and non-TDP SPTEs will never set these bits. Optimize for 64-bit @@ -282,13 +295,13 @@ static inline bool spte_ad_need_write_protect(u64 spte) static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON(!is_shadow_present_pte(spte)); + KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON(!is_shadow_present_pte(spte)); + KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index d2eb0d4f8710..bd30ebfb2f2c 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -39,13 +39,14 @@ void tdp_iter_restart(struct tdp_iter *iter) void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn) { - int root_level = root->role.level; - - WARN_ON(root_level < 1); - WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); + if (WARN_ON_ONCE(!root || (root->role.level < 1) || + (root->role.level > PT64_ROOT_MAX_LEVEL))) { + iter->valid = false; + return; + } iter->next_last_level_gfn = next_last_level_gfn; - iter->root_level = root_level; + iter->root_level = root->role.level; iter->min_level = min_level; iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt; iter->as_id = kvm_mmu_page_as_id(root); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 512163d52194..6c63f2d1675f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -475,9 +475,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - WARN_ON(level > PT64_ROOT_MAX_LEVEL); - WARN_ON(level < PG_LEVEL_4K); - WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); + WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL); + WARN_ON_ONCE(level < PG_LEVEL_4K); + WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); /* * If this warning were to trigger it would indicate that there was a @@ -522,9 +522,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * impact the guest since both the former and current SPTEs * are nonpresent. */ - if (WARN_ON(!is_mmio_spte(old_spte) && - !is_mmio_spte(new_spte) && - !is_removed_spte(new_spte))) + if (WARN_ON_ONCE(!is_mmio_spte(old_spte) && + !is_mmio_spte(new_spte) && + !is_removed_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" @@ -661,7 +661,7 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, * should be used. If operating under the MMU lock in write mode, the * use of the removed SPTE should not be necessary. */ - WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); + WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte)); old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); @@ -689,7 +689,7 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, else #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ - for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end) + for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end) /* * Yield if the MMU lock is contended or this thread needs to return control @@ -709,7 +709,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush, bool shared) { - WARN_ON(iter->yielded); + WARN_ON_ONCE(iter->yielded); /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) @@ -728,7 +728,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, rcu_read_lock(); - WARN_ON(iter->gfn > iter->next_last_level_gfn); + WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); iter->yielded = true; } @@ -1241,7 +1241,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte; /* Huge pages aren't expected to be modified without first being zapped. */ - WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); + WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end); if (iter->level != PG_LEVEL_4K || !is_shadow_present_pte(iter->old_spte)) @@ -1255,9 +1255,9 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, */ tdp_mmu_iter_set_spte(kvm, iter, 0); - if (!pte_write(range->pte)) { + if (!pte_write(range->arg.pte)) { new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, - pte_pfn(range->pte)); + pte_pfn(range->arg.pte)); tdp_mmu_iter_set_spte(kvm, iter, new_spte); } @@ -1548,8 +1548,8 @@ retry: if (!is_shadow_present_pte(iter.old_spte)) continue; - MMU_WARN_ON(kvm_ad_enabled() && - spte_ad_need_write_protect(iter.old_spte)); + KVM_MMU_WARN_ON(kvm_ad_enabled() && + spte_ad_need_write_protect(iter.old_spte)); if (!(iter.old_spte & dbit)) continue; @@ -1600,6 +1600,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, shadow_dirty_mask; struct tdp_iter iter; + lockdep_assert_held_write(&kvm->mmu_lock); + rcu_read_lock(); tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), @@ -1607,8 +1609,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, if (!mask) break; - MMU_WARN_ON(kvm_ad_enabled() && - spte_ad_need_write_protect(iter.old_spte)); + KVM_MMU_WARN_ON(kvm_ad_enabled() && + spte_ad_need_write_protect(iter.old_spte)); if (iter.level > PG_LEVEL_4K || !(mask & (1UL << (iter.gfn - gfn)))) @@ -1646,7 +1648,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_mmu_page *root; - lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root, slot->as_id) clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index bf653df86112..edb89b51b383 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -382,9 +382,6 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; - if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) - return false; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (!filter) return true; @@ -398,6 +395,7 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) static bool pmc_event_is_allowed(struct kvm_pmc *pmc) { return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && + static_call(kvm_x86_pmu_hw_event_available)(pmc) && check_pmu_event_filter(pmc); } diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 56cbdb24400a..b81650678375 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -43,6 +43,7 @@ enum kvm_only_cpuid_leafs { /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) +#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) /* CPUID level 0x80000007 (EDX). */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index cfc8ab773025..2092db892d7d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -791,6 +791,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) int ret = 0; unsigned long flags; struct amd_svm_iommu_ir *ir; + u64 entry; /** * In some cases, the existing irte is updated and re-set, @@ -824,6 +825,18 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) ir->data = pi->ir_data; spin_lock_irqsave(&svm->ir_list_lock, flags); + + /* + * Update the target pCPU for IOMMU doorbells if the vCPU is running. + * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM + * will update the pCPU info when the vCPU awkened and/or scheduled in. + * See also avic_vcpu_load(). + */ + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, + true, pi->ir_data); + list_add(&ir->node, &svm->ir_list); spin_unlock_irqrestore(&svm->ir_list_lock, flags); out: @@ -986,10 +999,11 @@ static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { int ret = 0; - unsigned long flags; struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_held(&svm->ir_list_lock); + if (!kvm_arch_has_assigned_device(vcpu->kvm)) return 0; @@ -997,19 +1011,15 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) * Here, we go through the per-vcpu ir_list to update all existing * interrupt remapping table entry targeting this vcpu. */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - if (list_empty(&svm->ir_list)) - goto out; + return 0; list_for_each_entry(ir, &svm->ir_list, node) { ret = amd_iommu_update_ga(cpu, r, ir->data); if (ret) - break; + return ret; } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; + return 0; } void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1017,6 +1027,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 entry; int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); + unsigned long flags; lockdep_assert_preemption_disabled(); @@ -1033,6 +1044,15 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_vcpu_is_blocking(vcpu)) return; + /* + * Grab the per-vCPU interrupt remapping lock even if the VM doesn't + * _currently_ have assigned devices, as that can change. Holding + * ir_list_lock ensures that either svm_ir_list_add() will consume + * up-to-date entry information, or that this task will wait until + * svm_ir_list_add() completes to set the new target pCPU. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); + entry = READ_ONCE(*(svm->avic_physical_id_cache)); WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); @@ -1042,25 +1062,48 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); + + spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_put(struct kvm_vcpu *vcpu) { u64 entry; struct vcpu_svm *svm = to_svm(vcpu); + unsigned long flags; lockdep_assert_preemption_disabled(); + /* + * Note, reading the Physical ID entry outside of ir_list_lock is safe + * as only the pCPU that has loaded (or is loading) the vCPU is allowed + * to modify the entry, and preemption is disabled. I.e. the vCPU + * can't be scheduled out and thus avic_vcpu_{put,load}() can't run + * recursively. + */ entry = READ_ONCE(*(svm->avic_physical_id_cache)); /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) return; + /* + * Take and hold the per-vCPU interrupt remapping lock while updating + * the Physical ID entry even though the lock doesn't protect against + * multiple writers (see above). Holding ir_list_lock ensures that + * either svm_ir_list_add() will consume up-to-date entry information, + * or that this task will wait until svm_ir_list_add() completes to + * mark the vCPU as not running. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + } void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 96936ddf1b3c..dd496c9e5f91 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -107,7 +107,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) { - if (!svm->v_vmload_vmsave_enabled) + if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) return true; if (!nested_npt_enabled(svm)) @@ -552,6 +552,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 bool new_vmcb12 = false; struct vmcb *vmcb01 = svm->vmcb01.ptr; struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; + struct kvm_vcpu *vcpu = &svm->vcpu; nested_vmcb02_compute_g_pat(svm); @@ -577,18 +578,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DT); } - kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); + kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - svm_set_efer(&svm->vcpu, svm->nested.save.efer); + svm_set_efer(vcpu, svm->nested.save.efer); - svm_set_cr0(&svm->vcpu, svm->nested.save.cr0); - svm_set_cr4(&svm->vcpu, svm->nested.save.cr4); + svm_set_cr0(vcpu, svm->nested.save.cr0); + svm_set_cr4(vcpu, svm->nested.save.cr4); svm->vcpu.arch.cr2 = vmcb12->save.cr2; - kvm_rax_write(&svm->vcpu, vmcb12->save.rax); - kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp); - kvm_rip_write(&svm->vcpu, vmcb12->save.rip); + kvm_rax_write(vcpu, vmcb12->save.rax); + kvm_rsp_write(vcpu, vmcb12->save.rsp); + kvm_rip_write(vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ vmcb02->save.rax = vmcb12->save.rax; @@ -602,7 +603,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DR); } - if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { /* * Reserved bits of DEBUGCTL are ignored. Be consistent with * svm_set_msr's definition of reserved bits. @@ -658,7 +660,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) + if (guest_can_use(vcpu, X86_FEATURE_VGIF) && + (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); else int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); @@ -695,10 +698,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { - WARN_ON(!svm->tsc_scaling_enabled); + if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu); - } vmcb02->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | @@ -717,7 +719,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * what a nrips=0 CPU would do (L1 is responsible for advancing RIP * prior to injecting the event). */ - if (svm->nrips_enabled) + if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) vmcb02->control.next_rip = svm->nested.ctl.next_rip; else if (boot_cpu_has(X86_FEATURE_NRIPS)) vmcb02->control.next_rip = vmcb12_rip; @@ -727,7 +729,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, svm->soft_int_injected = true; svm->soft_int_csbase = vmcb12_csbase; svm->soft_int_old_rip = vmcb12_rip; - if (svm->nrips_enabled) + if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) svm->soft_int_next_rip = svm->nested.ctl.next_rip; else svm->soft_int_next_rip = vmcb12_rip; @@ -735,15 +737,21 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; - if (svm->lbrv_enabled) + if (guest_can_use(vcpu, X86_FEATURE_LBRV)) vmcb02->control.virt_ext |= (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0; - pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0; + if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER)) + pause_count12 = svm->nested.ctl.pause_filter_count; + else + pause_count12 = 0; + if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD)) + pause_thresh12 = svm->nested.ctl.pause_filter_thresh; + else + pause_thresh12 = 0; if (kvm_pause_in_guest(svm->vcpu.kvm)) { /* use guest values since host doesn't intercept PAUSE */ vmcb02->control.pause_filter_count = pause_count12; @@ -1027,7 +1035,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); - if (svm->nrips_enabled) + if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; @@ -1066,7 +1074,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (!nested_exit_on_intr(svm)) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { svm_copy_lbrs(vmcb12, vmcb02); svm_update_lbrv(vcpu); } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { @@ -1101,10 +1110,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } - if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { - WARN_ON(!svm->tsc_scaling_enabled); + if (kvm_caps.has_tsc_control && + vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) { vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; - __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + svm_write_tsc_multiplier(vcpu); } svm->nested.ctl.nested_cr3 = 0; @@ -1537,7 +1546,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, svm->tsc_ratio_msr); - __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + svm_write_tsc_multiplier(vcpu); } /* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d3aec1f2cad2..b9a0a939d59f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -23,6 +23,7 @@ #include <asm/pkru.h> #include <asm/trapnr.h> #include <asm/fpu/xcr.h> +#include <asm/debugreg.h> #include "mmu.h" #include "x86.h" @@ -54,9 +55,14 @@ module_param_named(sev, sev_enabled, bool, 0444); /* enable/disable SEV-ES support */ static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); + +/* enable/disable SEV-ES DebugSwap support */ +static bool sev_es_debug_swap_enabled = true; +module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); #else #define sev_enabled false #define sev_es_enabled false +#define sev_es_debug_swap_enabled false #endif /* CONFIG_KVM_AMD_SEV */ static u8 sev_enc_bit; @@ -606,6 +612,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; + if (sev_es_debug_swap_enabled) + save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP; + pr_debug("Virtual Machine Save Area (VMSA):\n"); print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); @@ -619,6 +628,11 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, struct vcpu_svm *svm = to_svm(vcpu); int ret; + if (vcpu->guest_debug) { + pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported"); + return -EINVAL; + } + /* Perform some pre-encryption checks against the VMSA */ ret = sev_es_sync_vmsa(svm); if (ret) @@ -1725,7 +1739,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) * Note, the source is not required to have the same number of * vCPUs as the destination when migrating a vanilla SEV VM. */ - src_vcpu = kvm_get_vcpu(dst_kvm, i); + src_vcpu = kvm_get_vcpu(src_kvm, i); src_svm = to_svm(src_vcpu); /* @@ -2171,7 +2185,7 @@ void __init sev_hardware_setup(void) bool sev_es_supported = false; bool sev_supported = false; - if (!sev_enabled || !npt_enabled) + if (!sev_enabled || !npt_enabled || !nrips) goto out; /* @@ -2256,6 +2270,9 @@ out: sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; + if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || + !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) + sev_es_debug_swap_enabled = false; #endif } @@ -2881,7 +2898,10 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE: - ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); + ++vcpu->stat.nmi_window_exits; + svm->nmi_masked = false; + kvm_make_request(KVM_REQ_EVENT, vcpu); + ret = 1; break; case SVM_VMGEXIT_AP_HLT_LOOP: ret = kvm_emulate_ap_reset_hold(vcpu); @@ -2944,6 +2964,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct vmcb *vmcb = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; @@ -2952,9 +2973,12 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* * An SEV-ES guest requires a VMSA area that is a separate from the * VMCB page. Do not include the encryption mask on the VMSA physical - * address since hardware will access it using the guest key. + * address since hardware will access it using the guest key. Note, + * the VMSA will be NULL if this vCPU is the destination for intrahost + * migration, and will be copied later. */ - svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (svm->sev_es.vmsa) + svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); @@ -2972,8 +2996,23 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, TRAP_CR4_WRITE); svm_set_intercept(svm, TRAP_CR8_WRITE); - /* No support for enable_vmware_backdoor */ - clr_exception_intercept(svm, GP_VECTOR); + vmcb->control.intercepts[INTERCEPT_DR] = 0; + if (!sev_es_debug_swap_enabled) { + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + recalc_intercepts(svm); + } else { + /* + * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't + * allow debugging SEV-ES guests, and enables DebugSwap iff + * NO_NESTED_DATA_BP is supported, so there's no reason to + * intercept #DB when DebugSwap is enabled. For simplicity + * with respect to guest debug, intercept #DB for other VMs + * even if NO_NESTED_DATA_BP is supported, i.e. even if the + * guest can't DoS the CPU with infinite #DB vectoring. + */ + clr_exception_intercept(svm, DB_VECTOR); + } /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); @@ -3000,6 +3039,12 @@ void sev_init_vmcb(struct vcpu_svm *svm) svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); + /* + * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as + * KVM can't decrypt guest memory to decode the faulting instruction. + */ + clr_exception_intercept(svm, GP_VECTOR); + if (sev_es_guest(svm->vcpu.kvm)) sev_es_init_vmcb(svm); } @@ -3018,20 +3063,41 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) { /* - * As an SEV-ES guest, hardware will restore the host state on VMEXIT, - * of which one step is to perform a VMLOAD. KVM performs the - * corresponding VMSAVE in svm_prepare_guest_switch for both - * traditional and SEV-ES guests. + * All host state for SEV-ES guests is categorized into three swap types + * based on how it is handled by hardware during a world switch: + * + * A: VMRUN: Host state saved in host save area + * VMEXIT: Host state loaded from host save area + * + * B: VMRUN: Host state _NOT_ saved in host save area + * VMEXIT: Host state loaded from host save area + * + * C: VMRUN: Host state _NOT_ saved in host save area + * VMEXIT: Host state initialized to default(reset) values + * + * Manually save type-B state, i.e. state that is loaded by VMEXIT but + * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed + * by common SVM code). */ - - /* XCR0 is restored on VMEXIT, save the current host value */ hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - - /* PKRU is restored on VMEXIT, save the current host value */ hostsa->pkru = read_pkru(); - - /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */ hostsa->xss = host_xss; + + /* + * If DebugSwap is enabled, debug registers are loaded but NOT saved by + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both + * saves and loads debug registers (Type-A). + */ + if (sev_es_debug_swap_enabled) { + hostsa->dr0 = native_get_debugreg(0); + hostsa->dr1 = native_get_debugreg(1); + hostsa->dr2 = native_get_debugreg(2); + hostsa->dr3 = native_get_debugreg(3); + hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); + hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); + hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); + hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3); + } } void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d4bfdc607fe7..f283eb47f6ac 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -39,10 +39,9 @@ #include <asm/spec-ctrl.h> #include <asm/cpu_device_id.h> #include <asm/traps.h> +#include <asm/reboot.h> #include <asm/fpu/api.h> -#include <asm/virtext.h> - #include <trace/events/ipi.h> #include "trace.h" @@ -203,7 +202,7 @@ static int nested = true; module_param(nested, int, S_IRUGO); /* enable/disable Next RIP Save */ -static int nrips = true; +int nrips = true; module_param(nrips, int, 0444); /* enable/disable Virtual VMLOAD VMSAVE */ @@ -365,6 +364,8 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; } +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, bool commit_side_effects) @@ -385,6 +386,14 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, } if (!svm->next_rip) { + /* + * FIXME: Drop this when kvm_emulate_instruction() does the + * right thing and treats "can't emulate" as outright failure + * for EMULTYPE_SKIP. + */ + if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0)) + return 0; + if (unlikely(!commit_side_effects)) old_rflags = svm->vmcb->save.rflags; @@ -517,14 +526,21 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) vcpu->arch.osvw.status |= 1; } -static bool kvm_is_svm_supported(void) +static bool __kvm_is_svm_supported(void) { - int cpu = raw_smp_processor_id(); - const char *msg; + int cpu = smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu); + u64 vm_cr; - if (!cpu_has_svm(&msg)) { - pr_err("SVM not supported by CPU %d, %s\n", cpu, msg); + if (c->x86_vendor != X86_VENDOR_AMD && + c->x86_vendor != X86_VENDOR_HYGON) { + pr_err("CPU %d isn't AMD or Hygon\n", cpu); + return false; + } + + if (!cpu_has(c, X86_FEATURE_SVM)) { + pr_err("SVM not supported by CPU %d\n", cpu); return false; } @@ -542,25 +558,55 @@ static bool kvm_is_svm_supported(void) return true; } +static bool kvm_is_svm_supported(void) +{ + bool supported; + + migrate_disable(); + supported = __kvm_is_svm_supported(); + migrate_enable(); + + return supported; +} + static int svm_check_processor_compat(void) { - if (!kvm_is_svm_supported()) + if (!__kvm_is_svm_supported()) return -EIO; return 0; } -void __svm_write_tsc_multiplier(u64 multiplier) +static void __svm_write_tsc_multiplier(u64 multiplier) { - preempt_disable(); - if (multiplier == __this_cpu_read(current_tsc_ratio)) - goto out; + return; wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); -out: - preempt_enable(); +} + +static inline void kvm_cpu_svm_disable(void) +{ + uint64_t efer; + + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + if (efer & EFER_SVME) { + /* + * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and + * NMI aren't blocked. + */ + stgi(); + wrmsrl(MSR_EFER, efer & ~EFER_SVME); + } +} + +static void svm_emergency_disable(void) +{ + kvm_rebooting = true; + + kvm_cpu_svm_disable(); } static void svm_hardware_disable(void) @@ -569,7 +615,7 @@ static void svm_hardware_disable(void) if (tsc_scaling) __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); - cpu_svm_disable(); + kvm_cpu_svm_disable(); amd_pmu_disable_virt(); } @@ -677,6 +723,39 @@ free_save_area: } +static void set_dr_intercepts(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + + recalc_intercepts(svm); +} + +static void clr_dr_intercepts(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.intercepts[INTERCEPT_DR] = 0; + + recalc_intercepts(svm); +} + static int direct_access_msr_slot(u32 msr) { u32 i; @@ -947,50 +1026,24 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); } -static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index) +static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) { /* - * If the LBR virtualization is disabled, the LBR msrs are always - * kept in the vmcb01 to avoid copying them on nested guest entries. - * - * If nested, and the LBR virtualization is enabled/disabled, the msrs - * are moved between the vmcb01 and vmcb02 as needed. + * If LBR virtualization is disabled, the LBR MSRs are always kept in + * vmcb01. If LBR virtualization is enabled and L1 is running VMs of + * its own, the MSRs are moved between vmcb01 and vmcb02 as needed. */ - struct vmcb *vmcb = - (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ? - svm->vmcb : svm->vmcb01.ptr; - - switch (index) { - case MSR_IA32_DEBUGCTLMSR: - return vmcb->save.dbgctl; - case MSR_IA32_LASTBRANCHFROMIP: - return vmcb->save.br_from; - case MSR_IA32_LASTBRANCHTOIP: - return vmcb->save.br_to; - case MSR_IA32_LASTINTFROMIP: - return vmcb->save.last_excp_from; - case MSR_IA32_LASTINTTOIP: - return vmcb->save.last_excp_to; - default: - KVM_BUG(false, svm->vcpu.kvm, - "%s: Unknown MSR 0x%x", __func__, index); - return 0; - } + return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb : + svm->vmcb01.ptr; } void svm_update_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - - bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) & - DEBUGCTLMSR_LBR; - - bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext & - LBR_CTL_ENABLE_MASK); - - if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled)) - if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)) - enable_lbrv = true; + bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; + bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || + (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); if (enable_lbrv == current_enable_lbrv) return; @@ -1101,21 +1154,23 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return svm->tsc_ratio_msr; } -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset; - svm->vmcb->control.tsc_offset = offset; + svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) { - __svm_write_tsc_multiplier(multiplier); + preempt_disable(); + if (to_svm(vcpu)->guest_state_loaded) + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + preempt_enable(); } - /* Evaluate instruction intercepts that depend on guest CPUID features. */ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, struct vcpu_svm *svm) @@ -1156,8 +1211,6 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); - - svm->v_vmload_vmsave_enabled = false; } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -1201,10 +1254,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway - * as VMware does. Don't intercept #GP for SEV guests as KVM can't - * decrypt guest memory to decode the faulting instruction. + * as VMware does. */ - if (enable_vmware_backdoor && !sev_guest(vcpu->kvm)) + if (enable_vmware_backdoor) set_exception_intercept(svm, GP_VECTOR); svm_set_intercept(svm, INTERCEPT_INTR); @@ -1949,7 +2001,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (vcpu->arch.guest_state_protected) + if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm))) return; get_debugreg(vcpu->arch.db[0], 0); @@ -2510,12 +2562,13 @@ static int iret_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON_ONCE(sev_es_guest(vcpu->kvm)); + ++vcpu->stat.nmi_window_exits; svm->awaiting_iret_completion = true; svm_clr_iret_intercept(svm); - if (!sev_es_guest(vcpu->kvm)) - svm->nmi_iret_rip = kvm_rip_read(vcpu); + svm->nmi_iret_rip = kvm_rip_read(vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; @@ -2680,6 +2733,13 @@ static int dr_interception(struct kvm_vcpu *vcpu) unsigned long val; int err = 0; + /* + * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT + * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early. + */ + if (sev_es_guest(vcpu->kvm)) + return 1; + if (vcpu->guest_debug == 0) { /* * No more DR vmexits; force a reload of the debug registers @@ -2764,7 +2824,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: - if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) + if (!msr_info->host_initiated && + !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) return 1; msr_info->data = svm->tsc_ratio_msr; break; @@ -2802,11 +2863,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = svm->tsc_aux; break; case MSR_IA32_DEBUGCTLMSR: + msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; + break; case MSR_IA32_LASTBRANCHFROMIP: + msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; + break; case MSR_IA32_LASTBRANCHTOIP: + msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; + break; case MSR_IA32_LASTINTFROMIP: + msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; + break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm_get_lbr_msr(svm, msr_info->index); + msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; @@ -2906,7 +2975,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!svm->tsc_scaling_enabled) { + if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) { if (!msr->host_initiated) return 1; @@ -2928,7 +2997,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_ratio_msr = data; - if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) + if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + is_guest_mode(vcpu)) nested_svm_update_tsc_ratio_msr(vcpu); break; @@ -3037,13 +3107,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & DEBUGCTL_RESERVED_BITS) return 1; - if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) - svm->vmcb->save.dbgctl = data; - else - svm->vmcb01.ptr->save.dbgctl = data; - + svm_get_lbr_vmcb(svm)->save.dbgctl = data; svm_update_lbrv(vcpu); - break; case MSR_VM_HSAVE_PA: /* @@ -3769,6 +3834,19 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion) return; /* IRET will cause a vm exit */ + /* + * SEV-ES guests are responsible for signaling when a vCPU is ready to + * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e. + * KVM can't intercept and single-step IRET to detect when NMIs are + * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE. + * + * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware + * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not + * supported NAEs in the GHCB protocol. + */ + if (sev_es_guest(vcpu->kvm)) + return; + if (!gif_set(svm)) { if (vgif) svm_set_intercept(svm, INTERCEPT_STGI); @@ -3918,12 +3996,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) svm->soft_int_injected = false; /* - * If we've made progress since setting HF_IRET_MASK, we've + * If we've made progress since setting awaiting_iret_completion, we've * executed an IRET and can allow NMI injection. */ if (svm->awaiting_iret_completion && - (sev_es_guest(vcpu->kvm) || - kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { + kvm_rip_read(vcpu) != svm->nmi_iret_rip) { svm->awaiting_iret_completion = false; svm->nmi_masked = false; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -4209,28 +4286,37 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct kvm_cpuid_entry2 *best; - vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVES); - - /* Update nrips enabled cache */ - svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && - guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); - - svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); - svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); - - svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - - svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) && - guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER); + /* + * SVM doesn't provide a way to disable just XSAVES in the guest, KVM + * can only disable all variants of by disallowing CR4.OSXSAVE from + * being set. As a result, if the host has XSAVE and XSAVES, and the + * guest has XSAVE enabled, the guest can execute XSAVES without + * faulting. Treat XSAVES as enabled in this case regardless of + * whether it's advertised to the guest so that KVM context switches + * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give + * the guest read/write access to the host's XSS. + */ + if (boot_cpu_has(X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVES) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) + kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); - svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) && - guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV); - svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); + /* + * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that + * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing + * SVM on Intel is bonkers and extremely unlikely to work). + */ + if (!guest_cpuid_is_intel(vcpu)) + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI); svm_recalc_instruction_intercepts(vcpu, svm); @@ -4651,16 +4737,25 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and * decode garbage. * - * Inject #UD if KVM reached this point without an instruction buffer. - * In practice, this path should never be hit by a well-behaved guest, - * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path - * is still theoretically reachable, e.g. via unaccelerated fault-like - * AVIC access, and needs to be handled by KVM to avoid putting the - * guest into an infinite loop. Injecting #UD is somewhat arbitrary, - * but its the least awful option given lack of insight into the guest. + * If KVM is NOT trying to simply skip an instruction, inject #UD if + * KVM reached this point without an instruction buffer. In practice, + * this path should never be hit by a well-behaved guest, e.g. KVM + * doesn't intercept #UD or #GP for SEV guests, but this path is still + * theoretically reachable, e.g. via unaccelerated fault-like AVIC + * access, and needs to be handled by KVM to avoid putting the guest + * into an infinite loop. Injecting #UD is somewhat arbitrary, but + * its the least awful option given lack of insight into the guest. + * + * If KVM is trying to skip an instruction, simply resume the guest. + * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM + * will attempt to re-inject the INT3/INTO and skip the instruction. + * In that scenario, retrying the INT3/INTO and hoping the guest will + * make forward progress is the only option that has a chance of + * success (and in practice it will work the vast majority of the time). */ if (unlikely(!insn)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (!(emul_type & EMULTYPE_SKIP)) + kvm_queue_exception(vcpu, UD_VECTOR); return false; } @@ -5112,9 +5207,11 @@ static __init int svm_hardware_setup(void) svm_adjust_mmio_mask(); + nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); + /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which - * may be modified by svm_adjust_mmio_mask()). + * may be modified by svm_adjust_mmio_mask()), as well as nrips. */ sev_hardware_setup(); @@ -5126,11 +5223,6 @@ static __init int svm_hardware_setup(void) goto err; } - if (nrips) { - if (!boot_cpu_has(X86_FEATURE_NRIPS)) - nrips = false; - } - enable_apicv = avic = avic && avic_hardware_setup(); if (!enable_apicv) { @@ -5213,6 +5305,13 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { .pmu_ops = &amd_pmu_ops, }; +static void __svm_exit(void) +{ + kvm_x86_vendor_exit(); + + cpu_emergency_unregister_virt_callback(svm_emergency_disable); +} + static int __init svm_init(void) { int r; @@ -5226,6 +5325,8 @@ static int __init svm_init(void) if (r) return r; + cpu_emergency_register_virt_callback(svm_emergency_disable); + /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! @@ -5238,14 +5339,14 @@ static int __init svm_init(void) return 0; err_kvm_init: - kvm_x86_vendor_exit(); + __svm_exit(); return r; } static void __exit svm_exit(void) { kvm_exit(); - kvm_x86_vendor_exit(); + __svm_exit(); } module_init(svm_init) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 8239c8de45ac..f41253958357 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -22,6 +22,7 @@ #include <asm/svm.h> #include <asm/sev-common.h> +#include "cpuid.h" #include "kvm_cache_regs.h" #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) @@ -33,6 +34,7 @@ #define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; +extern int nrips; extern int vgif; extern bool intercept_smi; extern bool x2avic_enabled; @@ -260,16 +262,6 @@ struct vcpu_svm { unsigned long soft_int_next_rip; bool soft_int_injected; - /* optional nested SVM features that are enabled for this guest */ - bool nrips_enabled : 1; - bool tsc_scaling_enabled : 1; - bool v_vmload_vmsave_enabled : 1; - bool lbrv_enabled : 1; - bool pause_filter_enabled : 1; - bool pause_threshold_enabled : 1; - bool vgif_enabled : 1; - bool vnmi_enabled : 1; - u32 ldr_reg; u32 dfr_reg; struct page *avic_backing_page; @@ -406,48 +398,6 @@ static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u3 return test_bit(bit, (unsigned long *)&control->intercepts); } -static inline void set_dr_intercepts(struct vcpu_svm *svm) -{ - struct vmcb *vmcb = svm->vmcb01.ptr; - - if (!sev_es_guest(svm->vcpu.kvm)) { - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); - } - - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - - recalc_intercepts(svm); -} - -static inline void clr_dr_intercepts(struct vcpu_svm *svm) -{ - struct vmcb *vmcb = svm->vmcb01.ptr; - - vmcb->control.intercepts[INTERCEPT_DR] = 0; - - /* DR7 access must remain intercepted for an SEV-ES guest */ - if (sev_es_guest(svm->vcpu.kvm)) { - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); - vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - } - - recalc_intercepts(svm); -} - static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -493,7 +443,8 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) static inline bool nested_vgif_enabled(struct vcpu_svm *svm) { - return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); + return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) && + (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); } static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm) @@ -544,7 +495,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) { - return svm->vnmi_enabled && + return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) && (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK); } @@ -660,7 +611,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); -void __svm_write_tsc_multiplier(u64 multiplier); +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu); void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control); void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index d0abee35d7ba..41a4533f9989 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -252,7 +252,7 @@ static inline bool cpu_has_vmx_pml(void) static inline bool cpu_has_vmx_xsaves(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_ENABLE_XSAVES; } static inline bool cpu_has_vmx_waitpkg(void) diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 79450e1ed7cf..313b8bb5b8a7 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -78,7 +78,7 @@ SECONDARY_EXEC_DESC | \ SECONDARY_EXEC_ENABLE_RDTSCP | \ SECONDARY_EXEC_ENABLE_INVPCID | \ - SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_ENABLE_XSAVES | \ SECONDARY_EXEC_RDSEED_EXITING | \ SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_TSC_SCALING | \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 516391cc0d64..c5ec0ef51ff7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2307,7 +2307,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | - SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -6331,7 +6331,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, * If if it were, XSS would have to be checked against * the XSS exit bitmap in vmcs12. */ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, @@ -6426,7 +6426,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_allowed(vcpu) && + if (guest_can_use(vcpu, X86_FEATURE_VMX) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; @@ -6567,7 +6567,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL; } else { - if (!nested_vmx_allowed(vcpu)) + if (!guest_can_use(vcpu, X86_FEATURE_VMX)) return -EINVAL; if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) @@ -6601,7 +6601,8 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && - (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) + (!guest_can_use(vcpu, X86_FEATURE_VMX) || + !vmx->nested.enlightened_vmcs_enabled)) return -EINVAL; vmx_leave_nested(vcpu); @@ -6874,7 +6875,7 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps, SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 96952263b029..b4b9d51438c6 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -168,7 +168,7 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) { - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); } static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 80c769c58a87..f2efa0bf7ae8 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -22,23 +22,51 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) +enum intel_pmu_architectural_events { + /* + * The order of the architectural events matters as support for each + * event is enumerated via CPUID using the index of the event. + */ + INTEL_ARCH_CPU_CYCLES, + INTEL_ARCH_INSTRUCTIONS_RETIRED, + INTEL_ARCH_REFERENCE_CYCLES, + INTEL_ARCH_LLC_REFERENCES, + INTEL_ARCH_LLC_MISSES, + INTEL_ARCH_BRANCHES_RETIRED, + INTEL_ARCH_BRANCHES_MISPREDICTED, + + NR_REAL_INTEL_ARCH_EVENTS, + + /* + * Pseudo-architectural event used to implement IA32_FIXED_CTR2, a.k.a. + * TSC reference cycles. The architectural reference cycles event may + * or may not actually use the TSC as the reference, e.g. might use the + * core crystal clock or the bus clock (yeah, "architectural"). + */ + PSEUDO_ARCH_REFERENCE_CYCLES = NR_REAL_INTEL_ARCH_EVENTS, + NR_INTEL_ARCH_EVENTS, +}; + static struct { u8 eventsel; u8 unit_mask; } const intel_arch_events[] = { - [0] = { 0x3c, 0x00 }, - [1] = { 0xc0, 0x00 }, - [2] = { 0x3c, 0x01 }, - [3] = { 0x2e, 0x4f }, - [4] = { 0x2e, 0x41 }, - [5] = { 0xc4, 0x00 }, - [6] = { 0xc5, 0x00 }, - /* The above index must match CPUID 0x0A.EBX bit vector */ - [7] = { 0x00, 0x03 }, + [INTEL_ARCH_CPU_CYCLES] = { 0x3c, 0x00 }, + [INTEL_ARCH_INSTRUCTIONS_RETIRED] = { 0xc0, 0x00 }, + [INTEL_ARCH_REFERENCE_CYCLES] = { 0x3c, 0x01 }, + [INTEL_ARCH_LLC_REFERENCES] = { 0x2e, 0x4f }, + [INTEL_ARCH_LLC_MISSES] = { 0x2e, 0x41 }, + [INTEL_ARCH_BRANCHES_RETIRED] = { 0xc4, 0x00 }, + [INTEL_ARCH_BRANCHES_MISPREDICTED] = { 0xc5, 0x00 }, + [PSEUDO_ARCH_REFERENCE_CYCLES] = { 0x00, 0x03 }, }; /* mapping between fixed pmc index and intel_arch_events array */ -static int fixed_pmc_events[] = {1, 0, 7}; +static int fixed_pmc_events[] = { + [0] = INTEL_ARCH_INSTRUCTIONS_RETIRED, + [1] = INTEL_ARCH_CPU_CYCLES, + [2] = PSEUDO_ARCH_REFERENCE_CYCLES, +}; static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { @@ -80,16 +108,18 @@ static bool intel_hw_event_available(struct kvm_pmc *pmc) u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; - for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) { + BUILD_BUG_ON(ARRAY_SIZE(intel_arch_events) != NR_INTEL_ARCH_EVENTS); + + /* + * Disallow events reported as unavailable in guest CPUID. Note, this + * doesn't apply to pseudo-architectural events. + */ + for (i = 0; i < NR_REAL_INTEL_ARCH_EVENTS; i++) { if (intel_arch_events[i].eventsel != event_select || intel_arch_events[i].unit_mask != unit_mask) continue; - /* disable event that reported as not present by cpuid */ - if ((i < 7) && !(pmu->available_event_types & (1 << i))) - return false; - - break; + return pmu->available_event_types & BIT(i); } return true; @@ -438,16 +468,17 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) { - size_t size = ARRAY_SIZE(fixed_pmc_events); - struct kvm_pmc *pmc; - u32 event; int i; + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_events) != KVM_PMC_MAX_FIXED); + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - pmc = &pmu->fixed_counters[i]; - event = fixed_pmc_events[array_index_nospec(i, size)]; + int index = array_index_nospec(i, KVM_PMC_MAX_FIXED); + struct kvm_pmc *pmc = &pmu->fixed_counters[index]; + u32 event = fixed_pmc_events[index]; + pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | - intel_arch_events[event].eventsel; + intel_arch_events[event].eventsel; } } @@ -508,10 +539,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (pmu->version == 1) { pmu->nr_arch_fixed_counters = 0; } else { - pmu->nr_arch_fixed_counters = - min3(ARRAY_SIZE(fixed_pmc_events), - (size_t) edx.split.num_counters_fixed, - (size_t)kvm_pmu_cap.num_counters_fixed); + pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, + kvm_pmu_cap.num_counters_fixed); edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b483a8baaacf..72e3943f3693 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -41,13 +41,12 @@ #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> -#include <asm/kexec.h> +#include <asm/reboot.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> #include <asm/mwait.h> #include <asm/spec-ctrl.h> -#include <asm/virtext.h> #include <asm/vmx.h> #include "capabilities.h" @@ -237,9 +236,6 @@ static const struct { #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; -/* Control for disabling CPU Fill buffer clear */ -static bool __read_mostly vmx_fb_clear_ctrl_available; - static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; @@ -255,14 +251,9 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - u64 msr; - - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; - return 0; - } + if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; } /* If set to auto use the default l1tf mitigation method */ @@ -366,22 +357,9 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) { if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) - return sprintf(s, "???\n"); + return sysfs_emit(s, "???\n"); - return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); -} - -static void vmx_setup_fb_clear_ctrl(void) -{ - u64 msr; - - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) && - !boot_cpu_has_bug(X86_BUG_MDS) && - !boot_cpu_has_bug(X86_BUG_TAA)) { - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_FB_CLEAR_CTRL) - vmx_fb_clear_ctrl_available = true; - } + return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) @@ -409,7 +387,9 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - vmx->disable_fb_clear = vmx_fb_clear_ctrl_available; + vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && + !boot_cpu_has_bug(X86_BUG_MDS) && + !boot_cpu_has_bug(X86_BUG_TAA); /* * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS @@ -754,17 +734,51 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, return ret; } -#ifdef CONFIG_KEXEC_CORE -static void crash_vmclear_local_loaded_vmcss(void) +/* + * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) + * + * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to + * atomically track post-VMXON state, e.g. this may be called in NMI context. + * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. + * faults are guaranteed to be due to the !post-VMXON check unless the CPU is + * magically in RM, VM86, compat mode, or at CPL>0. + */ +static int kvm_cpu_vmxoff(void) +{ + asm_volatile_goto("1: vmxoff\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "cc", "memory" : fault); + + cr4_clear_bits(X86_CR4_VMXE); + return 0; + +fault: + cr4_clear_bits(X86_CR4_VMXE); + return -EIO; +} + +static void vmx_emergency_disable(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; + kvm_rebooting = true; + + /* + * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be + * set in task context. If this races with VMX is disabled by an NMI, + * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to + * kvm_rebooting set. + */ + if (!(__read_cr4() & X86_CR4_VMXE)) + return; + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); + + kvm_cpu_vmxoff(); } -#endif /* CONFIG_KEXEC_CORE */ static void __loaded_vmcs_clear(void *arg) { @@ -1899,25 +1913,14 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_caps.default_tsc_scaling_ratio; } -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) { - vmcs_write64(TSC_OFFSET, offset); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); } -static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) { - vmcs_write64(TSC_MULTIPLIER, multiplier); -} - -/* - * nested_vmx_allowed() checks whether a guest should be allowed to use VMX - * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for - * all guests if the "nested" module option is off, and can also be disabled - * for a single guest by disabling its VMX cpuid bit. - */ -bool nested_vmx_allowed(struct kvm_vcpu *vcpu) -{ - return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); } /* @@ -2047,7 +2050,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: - if (!nested_vmx_allowed(vcpu)) + if (!guest_can_use(vcpu, X86_FEATURE_VMX)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) @@ -2355,7 +2358,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!msr_info->host_initiated) return 1; /* they are read-only */ - if (!nested_vmx_allowed(vcpu)) + if (!guest_can_use(vcpu, X86_FEATURE_VMX)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: @@ -2729,11 +2732,11 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } -static bool kvm_is_vmx_supported(void) +static bool __kvm_is_vmx_supported(void) { - int cpu = raw_smp_processor_id(); + int cpu = smp_processor_id(); - if (!cpu_has_vmx()) { + if (!(cpuid_ecx(1) & feature_bit(VMX))) { pr_err("VMX not supported by CPU %d\n", cpu); return false; } @@ -2747,13 +2750,24 @@ static bool kvm_is_vmx_supported(void) return true; } +static bool kvm_is_vmx_supported(void) +{ + bool supported; + + migrate_disable(); + supported = __kvm_is_vmx_supported(); + migrate_enable(); + + return supported; +} + static int vmx_check_processor_compat(void) { int cpu = raw_smp_processor_id(); struct vmcs_config vmcs_conf; struct vmx_capability vmx_cap; - if (!kvm_is_vmx_supported()) + if (!__kvm_is_vmx_supported()) return -EIO; if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { @@ -2833,7 +2847,7 @@ static void vmx_hardware_disable(void) { vmclear_local_loaded_vmcss(); - if (cpu_vmxoff()) + if (kvm_cpu_vmxoff()) kvm_spurious_fault(); hv_reset_evmcs(); @@ -3071,13 +3085,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 1; - /* - * Very old userspace does not call KVM_SET_TSS_ADDR before entering - * vcpu. Warn the user that an update is overdue. - */ - if (!kvm_vmx->tss_addr) - pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n"); - vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); @@ -3350,7 +3357,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = vmx_emulation_required(vcpu); } -static int vmx_get_max_tdp_level(void) +static int vmx_get_max_ept_level(void) { if (cpu_has_vmx_ept_5levels()) return 5; @@ -4553,16 +4560,19 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, * based on a single guest CPUID bit, with a dedicated feature bit. This also * verifies that the control is actually supported by KVM and hardware. */ -#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ -({ \ - bool __enabled; \ - \ - if (cpu_has_vmx_##name()) { \ - __enabled = guest_cpuid_has(&(vmx)->vcpu, \ - X86_FEATURE_##feat_name); \ - vmx_adjust_secondary_exec_control(vmx, exec_control, \ - SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \ - } \ +#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ +({ \ + struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ + bool __enabled; \ + \ + if (cpu_has_vmx_##name()) { \ + if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \ + __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \ + else \ + __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \ + vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ + __enabled, exiting); \ + } \ }) /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ @@ -4622,19 +4632,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - if (cpu_has_vmx_xsaves()) { - /* Exposing XSAVES only when XSAVE is exposed */ - bool xsaves_enabled = - boot_cpu_has(X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); - - vcpu->arch.xsaves_enabled = xsaves_enabled; - - vmx_adjust_secondary_exec_control(vmx, &exec_control, - SECONDARY_EXEC_XSAVES, - xsaves_enabled, false); - } + vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); /* * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either @@ -4653,6 +4651,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_RDTSCP, rdpid_or_rdtscp_enabled, false); } + vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); @@ -6796,8 +6795,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); read_unlock(&vcpu->kvm->mmu_lock); - vmx_flush_tlb_current(vcpu); - + /* + * No need for a manual TLB flush at this point, KVM has already done a + * flush if there were SPTEs pointing at the previous page. + */ out: /* * Do not pin apic access page in memory, the MMU notifier @@ -7243,13 +7244,20 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, flags); vcpu->arch.cr2 = native_read_cr2(); + vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; + + vmx->idt_vectoring_info = 0; vmx_enable_fb_clear(vmx); - if (unlikely(vmx->fail)) + if (unlikely(vmx->fail)) { vmx->exit_reason.full = 0xdead; - else - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + goto out; + } + + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && is_nmi(vmx_get_intr_info(vcpu))) { @@ -7258,6 +7266,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, kvm_after_interrupt(vcpu); } +out: guest_state_exit_irqoff(); } @@ -7379,8 +7388,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(es, __USER_DS); #endif - vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; - pt_guest_exit(vmx); kvm_load_host_xsave_state(vcpu); @@ -7397,17 +7404,12 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->nested.nested_run_pending = 0; } - vmx->idt_vectoring_info = 0; - if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); - if (likely(!vmx->exit_reason.failed_vmentry)) - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - trace_kvm_exit(vcpu, KVM_ISA_VMX); if (unlikely(vmx->exit_reason.failed_vmentry)) @@ -7751,8 +7753,16 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ - vcpu->arch.xsaves_enabled = false; + /* + * XSAVES is effectively enabled if and only if XSAVE is also exposed + * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be + * set if and only if XSAVE is supported. + */ + if (boot_cpu_has(X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); + + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); vmx_setup_uret_msrs(vmx); @@ -7760,7 +7770,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmcs_set_secondary_exec_control(vmx, vmx_secondary_exec_control(vmx)); - if (nested_vmx_allowed(vcpu)) + if (guest_can_use(vcpu, X86_FEATURE_VMX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; @@ -7769,7 +7779,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (nested_vmx_allowed(vcpu)) + if (guest_can_use(vcpu, X86_FEATURE_VMX)) nested_vmx_cr_fixed1_bits_update(vcpu); if (boot_cpu_has(X86_FEATURE_INTEL_PT) && @@ -8526,7 +8536,7 @@ static __init int hardware_setup(void) */ vmx_setup_me_spte_mask(); - kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), + kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), ept_caps_to_lpage_level(vmx_capability.ept)); /* @@ -8622,10 +8632,8 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; -#ifdef CONFIG_KEXEC_CORE - RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); - synchronize_rcu(); -#endif + cpu_emergency_unregister_virt_callback(vmx_emergency_disable); + vmx_cleanup_l1d_flush(); } @@ -8666,18 +8674,14 @@ static int __init vmx_init(void) if (r) goto err_l1d_flush; - vmx_setup_fb_clear_ctrl(); - for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); pi_init_cpu(cpu); } -#ifdef CONFIG_KEXEC_CORE - rcu_assign_pointer(crash_vmclear_loaded_vmcss, - crash_vmclear_local_loaded_vmcss); -#endif + cpu_emergency_register_virt_callback(vmx_emergency_disable); + vmx_check_vmcs12_offsets(); /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 32384ba38499..c2130d2c8e24 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -374,7 +374,6 @@ struct kvm_vmx { u64 *pid_table; }; -bool nested_vmx_allowed(struct kvm_vcpu *vcpu); void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy); int allocate_vpid(void); @@ -562,7 +561,7 @@ static inline u8 vmx_get_rvi(void) SECONDARY_EXEC_APIC_REGISTER_VIRT | \ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_SHADOW_VMCS | \ - SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_ENABLE_XSAVES | \ SECONDARY_EXEC_RDSEED_EXITING | \ SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_ENABLE_PML | \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c381770bcbf1..6c9c81e82e65 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -25,6 +25,7 @@ #include "tss.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" +#include "mmu/page_track.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" @@ -237,6 +238,9 @@ EXPORT_SYMBOL_GPL(enable_apicv); u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); +u64 __read_mostly host_arch_capabilities; +EXPORT_SYMBOL_GPL(host_arch_capabilities); + const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, mmu_shadow_zapped), @@ -1021,7 +1025,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - if (vcpu->arch.xsaves_enabled && + if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } @@ -1052,7 +1056,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - if (vcpu->arch.xsaves_enabled && + if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, host_xss); } @@ -1620,12 +1624,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr) static u64 kvm_get_arch_capabilities(void) { - u64 data = 0; - - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); - data &= KVM_SUPPORTED_ARCH_CAP; - } + u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP; /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -2631,7 +2630,7 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) else vcpu->arch.tsc_offset = l1_offset; - static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset); + static_call(kvm_x86_write_tsc_offset)(vcpu); } static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) @@ -2647,8 +2646,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli vcpu->arch.tsc_scaling_ratio = l1_multiplier; if (kvm_caps.has_tsc_control) - static_call(kvm_x86_write_tsc_multiplier)( - vcpu, vcpu->arch.tsc_scaling_ratio); + static_call(kvm_x86_write_tsc_multiplier)(vcpu); } static inline bool kvm_check_tsc_unstable(void) @@ -4665,7 +4663,6 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) return 0; default: return -ENXIO; - break; } } @@ -6532,7 +6529,7 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, struct kvm_msr_filter_range *user_range) { - unsigned long *bitmap = NULL; + unsigned long *bitmap; size_t bitmap_size; if (!user_range->nmsrs) @@ -8245,11 +8242,6 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); } -static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) -{ - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM); -} - static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) { return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); @@ -8351,7 +8343,6 @@ static const struct x86_emulate_ops emulate_ops = { .fix_hypercall = emulator_fix_hypercall, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, - .guest_has_long_mode = emulator_guest_has_long_mode, .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, .guest_has_rdpid = emulator_guest_has_rdpid, @@ -9172,7 +9163,7 @@ static int kvmclock_cpu_down_prep(unsigned int cpu) static void tsc_khz_changed(void *data) { struct cpufreq_freqs *freq = data; - unsigned long khz = 0; + unsigned long khz; WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)); @@ -9512,6 +9503,9 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_init_pmu_capability(ops->pmu_ops); + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities); + r = ops->hardware_setup(); if (r != 0) goto out_mmu_exit; @@ -11111,12 +11105,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = -EINTR; goto out; } + /* - * It should be impossible for the hypervisor timer to be in - * use before KVM has ever run the vCPU. + * Don't bother switching APIC timer emulation from the + * hypervisor timer to the software timer, the only way for the + * APIC timer to be active is if userspace stuffed vCPU state, + * i.e. put the vCPU into a nonsensical state. Only an INIT + * will transition the vCPU out of UNINITIALIZED (without more + * state stuffing from userspace), which will reset the local + * APIC and thus cancel the timer or drop the IRQ (if the timer + * already expired). */ - WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); - kvm_vcpu_srcu_read_unlock(vcpu); kvm_vcpu_block(vcpu); kvm_vcpu_srcu_read_lock(vcpu); @@ -11798,15 +11797,22 @@ static int sync_regs(struct kvm_vcpu *vcpu) __set_regs(vcpu, &vcpu->run->s.regs.regs); vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; } + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) { - if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs)) + struct kvm_sregs sregs = vcpu->run->s.regs.sregs; + + if (__set_sregs(vcpu, &sregs)) return -EINVAL; + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS; } + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) { - if (kvm_vcpu_ioctl_x86_set_vcpu_events( - vcpu, &vcpu->run->s.regs.events)) + struct kvm_vcpu_events events = vcpu->run->s.regs.events; + + if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events)) return -EINVAL; + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS; } @@ -12627,6 +12633,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *new, enum kvm_mr_change change) { + /* + * KVM doesn't support moving memslots when there are external page + * trackers attached to the VM, i.e. if KVMGT is in use. + */ + if (change == KVM_MR_MOVE && kvm_page_track_has_external_user(kvm)) + return -EINVAL; + if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) { if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn()) return -EINVAL; @@ -12772,7 +12785,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * See is_writable_pte() for more details (the case involving * access-tracked SPTEs is particularly relevant). */ - kvm_arch_flush_remote_tlbs_memslot(kvm, new); + kvm_flush_remote_tlbs_memslot(kvm, new); } } @@ -12781,6 +12794,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + if (change == KVM_MR_DELETE) + kvm_page_track_delete_slot(kvm, old); + if (!kvm->arch.n_requested_mmu_pages && (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) { unsigned long nr_mmu_pages; @@ -12797,17 +12813,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_arch_free_memslot(kvm, old); } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ - kvm_mmu_zap_all(kvm); -} - -void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - kvm_page_track_flush_slot(kvm, slot); -} - static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 82e3dafc5453..1e7be1f6ab29 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -323,6 +323,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; extern u64 host_xss; +extern u64 host_arch_capabilities; extern struct kvm_caps kvm_caps; |