summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r--arch/x86/kvm/vmx/vmx.c275
1 files changed, 121 insertions, 154 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cbe0cdade38a..4bceb5ca3a89 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -36,6 +36,7 @@
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
+#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
@@ -454,21 +455,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;
-/*
- * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
- * will emulate SYSCALL in legacy mode if the vendor string in guest
- * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
- * support this emulation, IA32_STAR must always be included in
- * vmx_uret_msrs_list[], even in i386 builds.
- */
-static const u32 vmx_uret_msrs_list[] = {
-#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
-#endif
- MSR_EFER, MSR_TSC_AUX, MSR_STAR,
- MSR_IA32_TSX_CTRL,
-};
-
#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -696,21 +682,11 @@ static bool is_valid_passthrough_msr(u32 msr)
return r;
}
-static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- for (i = 0; i < vmx->nr_uret_msrs; ++i)
- if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
- return i;
- return -1;
-}
-
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- i = __vmx_find_uret_msr(vmx, msr);
+ i = kvm_find_user_return_msr(msr);
if (i >= 0)
return &vmx->guest_uret_msrs[i];
return NULL;
@@ -719,13 +695,14 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
struct vmx_uret_msr *msr, u64 data)
{
+ unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0;
u64 old_msr_data = msr->data;
msr->data = data;
- if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
+ if (msr->load_into_hardware) {
preempt_disable();
- ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
+ ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
preempt_enable();
if (ret)
msr->data = old_msr_data;
@@ -1077,7 +1054,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
return false;
}
- i = __vmx_find_uret_msr(vmx, MSR_EFER);
+ i = kvm_find_user_return_msr(MSR_EFER);
if (i < 0)
return false;
@@ -1239,11 +1216,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
*/
if (!vmx->guest_uret_msrs_loaded) {
vmx->guest_uret_msrs_loaded = true;
- for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
- kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (!vmx->guest_uret_msrs[i].load_into_hardware)
+ continue;
+
+ kvm_set_user_return_msr(i,
vmx->guest_uret_msrs[i].data,
vmx->guest_uret_msrs[i].mask);
-
+ }
}
if (vmx->nested.need_vmcs12_to_shadow_sync)
@@ -1750,19 +1730,16 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu);
}
-static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
+ bool load_into_hardware)
{
- struct vmx_uret_msr tmp;
- int from, to;
+ struct vmx_uret_msr *uret_msr;
- from = __vmx_find_uret_msr(vmx, msr);
- if (from < 0)
+ uret_msr = vmx_find_uret_msr(vmx, msr);
+ if (!uret_msr)
return;
- to = vmx->nr_active_uret_msrs++;
- tmp = vmx->guest_uret_msrs[to];
- vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
- vmx->guest_uret_msrs[from] = tmp;
+ uret_msr->load_into_hardware = load_into_hardware;
}
/*
@@ -1772,29 +1749,42 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
- vmx->guest_uret_msrs_loaded = false;
- vmx->nr_active_uret_msrs = 0;
#ifdef CONFIG_X86_64
+ bool load_syscall_msrs;
+
/*
* The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set.
*/
- if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
- vmx_setup_uret_msr(vmx, MSR_STAR);
- vmx_setup_uret_msr(vmx, MSR_LSTAR);
- vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
- }
+ load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
+ (vmx->vcpu.arch.efer & EFER_SCE);
+
+ vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
- if (update_transition_efer(vmx))
- vmx_setup_uret_msr(vmx, MSR_EFER);
+ vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
- if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
- vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+ vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
+ guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
- vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ /*
+ * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
+ * kernel and old userspace. If those guests run on a tsx=off host, do
+ * allow guests to use TSX_CTRL, but don't change the value in hardware
+ * so that TSX remains always disabled.
+ */
+ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
+
+ /*
+ * The set of MSRs to load may have changed, reload MSRs before the
+ * next VM-Enter.
+ */
+ vmx->guest_uret_msrs_loaded = false;
}
static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -1992,11 +1982,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
- case MSR_TSC_AUX:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
- goto find_uret_msr;
case MSR_IA32_DEBUGCTLMSR:
msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
break;
@@ -2030,6 +2015,9 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
if (!intel_pmu_lbr_is_enabled(vcpu))
debugctl &= ~DEBUGCTLMSR_LBR_MASK;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+
return debugctl;
}
@@ -2312,14 +2300,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
- case MSR_TSC_AUX:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
- /* Check reserved bit, higher 32 bits should be zero */
- if ((data >> 32) != 0)
- return 1;
- goto find_uret_msr;
case MSR_IA32_PERF_CAPABILITIES:
if (data && !vcpu_to_pmu(vcpu)->version)
return 1;
@@ -4368,7 +4348,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
xsaves_enabled, false);
}
- vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
+ /*
+ * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
+ * feature is exposed to the guest. This creates a virtualization hole
+ * if both are supported in hardware but only one is exposed to the
+ * guest, but letting the guest execute RDTSCP or RDPID when either one
+ * is advertised is preferable to emulating the advertised instruction
+ * in KVM on #UD, and obviously better than incorrectly injecting #UD.
+ */
+ if (cpu_has_vmx_rdtscp()) {
+ bool rdpid_or_rdtscp_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+
+ vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ SECONDARY_EXEC_ENABLE_RDTSCP,
+ rdpid_or_rdtscp_enabled, false);
+ }
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@ -6415,18 +6411,17 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
+ unsigned long entry)
{
- unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
- gate_desc *desc = (gate_desc *)host_idt_base + vector;
-
kvm_before_interrupt(vcpu);
- vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+ vmx_do_interrupt_nmi_irqoff(entry);
kvm_after_interrupt(vcpu);
}
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
+ const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
@@ -6437,18 +6432,20 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
else if (is_nmi(intr_info))
- handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
+ handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
u32 intr_info = vmx_get_intr_info(vcpu);
+ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+ gate_desc *desc = (gate_desc *)host_idt_base + vector;
if (WARN_ONCE(!is_external_intr(intr_info),
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- handle_interrupt_nmi_irqoff(vcpu, intr_info);
+ handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
}
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@ -6662,25 +6659,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx)
{
- /*
- * VMENTER enables interrupts (host state), but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about
- * it. This is the same logic as for exit_to_user_mode().
- *
- * This ensures that e.g. latency analysis on the host observes
- * guest mode as interrupt enabled.
- *
- * guest_enter_irqoff() informs context tracking about the
- * transition to guest mode and if enabled adjusts RCU state
- * accordingly.
- */
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- instrumentation_end();
-
- guest_enter_irqoff();
- lockdep_hardirqs_on(CALLER_ADDR0);
+ kvm_guest_enter_irqoff();
/* L1D Flush includes CPU buffer clear to mitigate MDS */
if (static_branch_unlikely(&vmx_l1d_should_flush))
@@ -6696,24 +6675,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = native_read_cr2();
- /*
- * VMEXIT disables interrupts (host state), but tracing and lockdep
- * have them in state 'on' as recorded before entering guest mode.
- * Same as enter_from_user_mode().
- *
- * guest_exit_irqoff() restores host context and reinstates RCU if
- * enabled and required.
- *
- * This needs to be done before the below as native_read_msr()
- * contains a tracepoint and x86_spec_ctrl_restore_host() calls
- * into world and some more.
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- guest_exit_irqoff();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- instrumentation_end();
+ kvm_guest_exit_irqoff();
}
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@ -6888,6 +6850,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
+ struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
int i, cpu, err;
@@ -6910,43 +6873,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vpid;
}
- BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
-
- for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
- u32 index = vmx_uret_msrs_list[i];
- u32 data_low, data_high;
- int j = vmx->nr_uret_msrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
-
- vmx->guest_uret_msrs[j].slot = i;
- vmx->guest_uret_msrs[j].data = 0;
- switch (index) {
- case MSR_IA32_TSX_CTRL:
- /*
- * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
- * interception. Keep the host value unchanged to avoid
- * changing CPUID bits under the host kernel's feet.
- *
- * hle=0, rtm=0, tsx_ctrl=1 can be found with some
- * combinations of new kernel and old userspace. If
- * those guests run on a tsx=off host, do allow guests
- * to use TSX_CTRL, but do not change the value on the
- * host so that TSX remains always disabled.
- */
- if (boot_cpu_has(X86_FEATURE_RTM))
- vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
- else
- vmx->guest_uret_msrs[j].mask = 0;
- break;
- default:
- vmx->guest_uret_msrs[j].mask = -1ull;
- break;
- }
- ++vmx->nr_uret_msrs;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ vmx->guest_uret_msrs[i].data = 0;
+ vmx->guest_uret_msrs[i].mask = -1ull;
+ }
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
+ * Keep the host value unchanged to avoid changing CPUID bits
+ * under the host kernel's feet.
+ */
+ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (tsx_ctrl)
+ vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
}
err = alloc_loaded_vmcs(&vmx->vmcs01);
@@ -7377,9 +7316,11 @@ static __init void vmx_set_cpu_caps(void)
if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
- /* CPUID 0x80000001 */
- if (!cpu_has_vmx_rdtscp())
+ /* CPUID 0x80000001 and 0x7 (RDPID) */
+ if (!cpu_has_vmx_rdtscp()) {
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
@@ -7435,8 +7376,9 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
/*
* RDPID causes #UD if disabled through secondary execution controls.
* Because it is marked as EmulateOnUD, we need to intercept it here.
+ * Note, RDPID is hidden behind ENABLE_RDTSCP.
*/
- case x86_intercept_rdtscp:
+ case x86_intercept_rdpid:
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false;
@@ -7802,17 +7744,42 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
+static __init void vmx_setup_user_return_msrs(void)
+{
+
+ /*
+ * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
+ * will emulate SYSCALL in legacy mode if the vendor string in guest
+ * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
+ * support this emulation, MSR_STAR is included in the list for i386,
+ * but is never loaded into hardware. MSR_CSTAR is also never loaded
+ * into hardware and is here purely for emulation purposes.
+ */
+ const u32 vmx_uret_msrs_list[] = {
+ #ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+ #endif
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_IA32_TSX_CTRL,
+ };
+ int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
+}
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
- int r, i, ept_lpage_level;
+ int r, ept_lpage_level;
store_idt(&dt);
host_idt_base = dt.address;
- for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
- kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
+ vmx_setup_user_return_msrs();
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;