summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--arch/x86/kvm/vmx.c814
1 files changed, 637 insertions, 177 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a8bc64566ab..af823a388c19 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -35,6 +35,7 @@
#include "kvm_cache_regs.h"
#include "x86.h"
+#include <asm/cpu.h>
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
@@ -45,6 +46,7 @@
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>
+#include <asm/irq_remapping.h>
#include "trace.h"
#include "pmu.h"
@@ -105,6 +107,8 @@ static u64 __read_mostly host_xss;
static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
+#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -424,6 +428,9 @@ struct nested_vmx {
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+ u16 vpid02;
+ u16 last_vpid;
+
u32 nested_vmx_procbased_ctls_low;
u32 nested_vmx_procbased_ctls_high;
u32 nested_vmx_true_procbased_ctls_low;
@@ -440,14 +447,33 @@ struct nested_vmx {
u32 nested_vmx_misc_low;
u32 nested_vmx_misc_high;
u32 nested_vmx_ept_caps;
+ u32 nested_vmx_vpid_caps;
};
#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
- u32 control; /* bit 0 of control is outstanding notification bit */
- u32 rsvd[7];
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
} __aligned(64);
static bool pi_test_and_set_on(struct pi_desc *pi_desc)
@@ -467,6 +493,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ return clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ return set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@@ -532,8 +582,6 @@ struct vcpu_vmx {
s64 vnmi_blocked_time;
u32 exit_reason;
- bool rdtscp_enabled;
-
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
@@ -563,6 +611,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
@@ -809,7 +862,7 @@ static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
static bool vmx_xsaves_supported(void);
-static int vmx_vm_has_apicv(struct kvm *kvm);
+static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -831,6 +884,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
+/*
+ * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
+ * can find which vCPU should be waken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
@@ -946,9 +1006,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
-static inline bool vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
{
- return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
+ return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
}
static inline bool cpu_has_secondary_exec_ctrls(void)
@@ -983,7 +1043,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
static inline bool cpu_has_vmx_posted_intr(void)
{
- return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+ return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
+ vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}
static inline bool cpu_has_vmx_apicv(void)
@@ -1062,9 +1123,9 @@ static inline bool cpu_has_vmx_ple(void)
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
-static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
- return flexpriority_enabled && irqchip_in_kernel(kvm);
+ return flexpriority_enabled && lapic_in_kernel(vcpu);
}
static inline bool cpu_has_vmx_vpid(void)
@@ -1113,6 +1174,12 @@ static inline bool cpu_has_vmx_pml(void)
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}
+static inline bool cpu_has_vmx_tsc_scaling(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_TSC_SCALING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -1157,6 +1224,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
}
+static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
+}
+
static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
{
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
@@ -1337,13 +1409,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
__loaded_vmcs_clear, loaded_vmcs, 1);
}
-static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_single(int vpid)
{
- if (vmx->vpid == 0)
+ if (vpid == 0)
return;
if (cpu_has_vmx_invvpid_single())
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
}
static inline void vpid_sync_vcpu_global(void)
@@ -1352,10 +1424,10 @@ static inline void vpid_sync_vcpu_global(void)
__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}
-static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+static inline void vpid_sync_context(int vpid)
{
if (cpu_has_vmx_invvpid_single())
- vpid_sync_vcpu_single(vmx);
+ vpid_sync_vcpu_single(vpid);
else
vpid_sync_vcpu_global();
}
@@ -1567,7 +1639,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
- (1u << NM_VECTOR) | (1u << DB_VECTOR);
+ (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1895,6 +1967,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
preempt_enable();
}
+static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ /*
+ * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
+ * are two possible cases:
+ * 1. After running 'pre_block', context switch
+ * happened. For this case, 'sn' was set in
+ * vmx_vcpu_put(), so we need to clear it here.
+ * 2. After running 'pre_block', we were blocked,
+ * and woken up by some other guy. For this case,
+ * we don't need to do anything, 'pi_post_block'
+ * will do everything for us. However, we cannot
+ * check whether it is case #1 or case #2 here
+ * (maybe, not needed), so we also clear sn here,
+ * I think it is not a big deal.
+ */
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
+ if (vcpu->cpu != cpu) {
+ dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+ }
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ }
+
+ /* Allow posting non-urgent interrupts */
+ new.sn = 0;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+}
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -1943,12 +2061,35 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+ /* Setup TSC multiplier */
+ if (cpu_has_vmx_tsc_scaling())
+ vmcs_write64(TSC_MULTIPLIER,
+ vcpu->arch.tsc_scaling_ratio);
+
vmx->loaded_vmcs->cpu = cpu;
}
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+}
+
+static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ /* Set SN when the vCPU is preempted */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
}
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
+ vmx_vcpu_pi_put(vcpu);
+
__vmx_load_host_state(to_vmx(vcpu));
if (!vmm_exclusive) {
__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
@@ -2207,7 +2348,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_TSC_AUX);
- if (index >= 0 && vmx->rdtscp_enabled)
+ if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_STAR is only needed on long mode guests, and only
@@ -2230,15 +2371,16 @@ static void setup_msrs(struct vcpu_vmx *vmx)
/*
* reads and returns guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset -- 21.3
+ * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
+ * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
*/
-static u64 guest_read_tsc(void)
+static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
{
u64 host_tsc, tsc_offset;
host_tsc = rdtsc();
tsc_offset = vmcs_read64(TSC_OFFSET);
- return host_tsc + tsc_offset;
+ return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
}
/*
@@ -2255,22 +2397,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
return host_tsc + tsc_offset;
}
-/*
- * Engage any workarounds for mis-matched TSC rates. Currently limited to
- * software catchup for faster rates on slower CPUs.
- */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
-{
- if (!scale)
- return;
-
- if (user_tsc_khz > tsc_khz) {
- vcpu->arch.tsc_catchup = 1;
- vcpu->arch.tsc_always_catchup = 1;
- } else
- WARN(1, "user requested TSC rate below hardware speed\n");
-}
-
static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
{
return vmcs_read64(TSC_OFFSET);
@@ -2302,7 +2428,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
}
}
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
+static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
{
u64 offset = vmcs_read64(TSC_OFFSET);
@@ -2315,11 +2441,6 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
offset + adjustment);
}
-static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
-{
- return target_tsc - rdtsc();
-}
-
static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
@@ -2377,7 +2498,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (vmx_cpu_uses_apicv(&vmx->vcpu))
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_POSTED_INTR;
@@ -2471,10 +2592,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_PCOMMIT;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
@@ -2493,6 +2616,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
} else
vmx->nested.nested_vmx_ept_caps = 0;
+ if (enable_vpid)
+ vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+ else
+ vmx->nested.nested_vmx_vpid_caps = 0;
+
if (enable_unrestricted_guest)
vmx->nested.nested_vmx_secondary_ctls_high |=
SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -2608,7 +2737,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
/* Currently, no nested vpid support */
- *pdata = vmx->nested.nested_vmx_ept_caps;
+ *pdata = vmx->nested.nested_vmx_ept_caps |
+ ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
break;
default:
return 1;
@@ -2642,7 +2772,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_TSC:
- msr_info->data = guest_read_tsc();
+ msr_info->data = guest_read_tsc(vcpu);
break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
@@ -2673,7 +2803,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
- if (!to_vmx(vcpu)->rdtscp_enabled)
+ if (!guest_cpuid_has_rdtscp(vcpu))
return 1;
/* Otherwise falls through */
default:
@@ -2779,7 +2909,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_TSC_AUX:
- if (!vmx->rdtscp_enabled)
+ if (!guest_cpuid_has_rdtscp(vcpu))
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
@@ -2874,6 +3004,8 @@ static int hardware_enable(void)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
/*
* Now we can enable the vmclear operation in kdump
@@ -3015,7 +3147,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_ENABLE_PML;
+ SECONDARY_EXEC_ENABLE_PML |
+ SECONDARY_EXEC_PCOMMIT |
+ SECONDARY_EXEC_TSC_SCALING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -3441,9 +3575,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
{
- vpid_sync_context(to_vmx(vcpu));
+ vpid_sync_context(vpid);
if (enable_ept) {
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
@@ -3451,6 +3585,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
}
}
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
+}
+
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -3644,20 +3783,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
hw_cr4 |= X86_CR4_PSE;
- /*
- * SMEP/SMAP is disabled if CPU is in non-paging mode
- * in hardware. However KVM always uses paging mode to
- * emulate guest non-paging mode with TDP.
- * To emulate this behavior, SMEP/SMAP needs to be
- * manually disabled when guest switches to non-paging
- * mode.
- */
- hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
} else if (!(cr4 & X86_CR4_PAE)) {
hw_cr4 &= ~X86_CR4_PAE;
}
}
+ if (!enable_unrestricted_guest && !is_paging(vcpu))
+ /*
+ * SMEP/SMAP is disabled if CPU is in non-paging mode in
+ * hardware. However KVM always uses paging mode without
+ * unrestricted guest.
+ * To emulate this behavior, SMEP/SMAP needs to be manually
+ * disabled when guest switches to non-paging mode.
+ */
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
return 0;
@@ -4146,29 +4286,28 @@ static int alloc_identity_pagetable(struct kvm *kvm)
return r;
}
-static void allocate_vpid(struct vcpu_vmx *vmx)
+static int allocate_vpid(void)
{
int vpid;
- vmx->vpid = 0;
if (!enable_vpid)
- return;
+ return 0;
spin_lock(&vmx_vpid_lock);
vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
- if (vpid < VMX_NR_VPIDS) {
- vmx->vpid = vpid;
+ if (vpid < VMX_NR_VPIDS)
__set_bit(vpid, vmx_vpid_bitmap);
- }
+ else
+ vpid = 0;
spin_unlock(&vmx_vpid_lock);
+ return vpid;
}
-static void free_vpid(struct vcpu_vmx *vmx)
+static void free_vpid(int vpid)
{
- if (!enable_vpid)
+ if (!enable_vpid || vpid == 0)
return;
spin_lock(&vmx_vpid_lock);
- if (vmx->vpid != 0)
- __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+ __clear_bit(vpid, vmx_vpid_bitmap);
spin_unlock(&vmx_vpid_lock);
}
@@ -4323,9 +4462,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
msr, MSR_TYPE_W);
}
-static int vmx_vm_has_apicv(struct kvm *kvm)
+static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu)
{
- return enable_apicv && irqchip_in_kernel(kvm);
+ return enable_apicv && lapic_in_kernel(vcpu);
}
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -4369,6 +4508,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_SMP
if (vcpu->mode == IN_GUEST_MODE) {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Currently, we don't support urgent interrupt,
+ * all interrupts are recognized as non-urgent
+ * interrupt, so we cannot post interrupts when
+ * 'SN' is set.
+ *
+ * If the vcpu is in guest mode, it means it is
+ * running instead of being scheduled out and
+ * waiting in the run queue, and that's the only
+ * case when 'SN' is set currently, warning if
+ * 'SN' is set.
+ */
+ WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
+
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
POSTED_INTR_VECTOR);
return true;
@@ -4505,7 +4660,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
- if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (!vmx_cpu_uses_apicv(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
return pin_based_exec_ctrl;
}
@@ -4517,7 +4672,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+ if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
exec_control |= CPU_BASED_CR8_STORE_EXITING |
@@ -4534,7 +4689,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu))
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (vmx->vpid == 0)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
@@ -4548,7 +4703,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (!ple_gap)
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
- if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (!vmx_cpu_uses_apicv(&vmx->vcpu))
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -4558,8 +4713,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
a current VMCS12
*/
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- /* PML is enabled/disabled in creating/destorying vcpu */
- exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ if (!enable_pml)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* Currently, we allow L1 guest to directly run pcommit instruction. */
+ exec_control &= ~SECONDARY_EXEC_PCOMMIT;
return exec_control;
}
@@ -4604,12 +4763,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
+ if (cpu_has_secondary_exec_ctrls())
vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
vmx_secondary_exec_control(vmx));
- }
- if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
+ if (vmx_cpu_uses_apicv(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4753,7 +4911,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (cpu_has_vmx_tpr_shadow() && !init_event) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vcpu->kvm))
+ if (cpu_need_tpr_shadow(vcpu))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
__pa(vcpu->arch.apic->regs));
vmcs_write32(TPR_THRESHOLD, 0);
@@ -4761,7 +4919,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (vmx_vm_has_apicv(vcpu->kvm))
+ if (vmx_cpu_uses_apicv(vcpu))
memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
if (vmx->vpid != 0)
@@ -4771,12 +4929,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx->vcpu.arch.cr0 = cr0;
vmx_set_cr4(vcpu, 0);
- if (!init_event)
- vmx_set_efer(vcpu, 0);
+ vmx_set_efer(vcpu, 0);
vmx_fpu_activate(vcpu);
update_exception_bitmap(vcpu);
- vpid_sync_context(vmx);
+ vpid_sync_context(vmx->vpid);
}
/*
@@ -5104,6 +5261,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return handle_rmode_exception(vcpu, ex_no, error_code);
switch (ex_no) {
+ case AC_VECTOR:
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
case DB_VECTOR:
dr6 = vmcs_readl(EXIT_QUALIFICATION);
if (!(vcpu->guest_debug &
@@ -5296,7 +5456,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
u8 cr8 = (u8)val;
err = kvm_set_cr8(vcpu, cr8);
kvm_complete_insn_gp(vcpu, err);
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
return 1;
if (cr8_prev <= cr8)
return 1;
@@ -5510,17 +5670,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
++vcpu->stat.irq_window_exits;
-
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (!irqchip_in_kernel(vcpu->kvm) &&
- vcpu->run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
return 1;
}
@@ -5753,10 +5902,11 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
skip_emulated_instruction(vcpu);
+ trace_kvm_fast_mmio(gpa);
return 1;
}
- ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+ ret = handle_mmio_page_fault(vcpu, gpa, true);
if (likely(ret == RET_MMIO_PF_EMULATE))
return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
EMULATE_DONE;
@@ -5910,6 +6060,25 @@ static void update_ple_window_actual_max(void)
ple_window_grow, INT_MIN);
}
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+static void wakeup_handler(void)
+{
+ struct kvm_vcpu *vcpu;
+ int cpu = smp_processor_id();
+
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+ blocked_vcpu_list) {
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (pi_test_on(pi_desc) == 1)
+ kvm_vcpu_kick(vcpu);
+ }
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
static __init int hardware_setup(void)
{
int r = -ENOMEM, i, msr;
@@ -6028,6 +6197,12 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_apicv())
enable_apicv = 0;
+ if (cpu_has_vmx_tsc_scaling()) {
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 48;
+ }
+
if (enable_apicv)
kvm_x86_ops->update_cr8_intercept = NULL;
else {
@@ -6096,6 +6271,8 @@ static __init int hardware_setup(void)
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+
return alloc_kvm_area();
out8:
@@ -6627,7 +6804,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
- u32 exec_control;
if (vmx->nested.current_vmptr == -1ull)
return;
@@ -6640,9 +6816,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
they were modified */
copy_shadow_to_vmcs12(vmx);
vmx->nested.sync_shadow_vmcs = false;
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
vmx->nested.posted_intr_nv = -1;
@@ -6662,6 +6837,7 @@ static void free_nested(struct vcpu_vmx *vmx)
return;
vmx->nested.vmxon = false;
+ free_vpid(vmx->nested.vpid02);
nested_release_vmcs12(vmx);
if (enable_shadow_vmcs)
free_vmcs(vmx->nested.current_shadow_vmcs);
@@ -7038,7 +7214,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
gpa_t vmptr;
- u32 exec_control;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -7070,9 +7245,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
if (enable_shadow_vmcs) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER,
__pa(vmx->nested.current_shadow_vmcs));
vmx->nested.sync_shadow_vmcs = true;
@@ -7178,7 +7352,58 @@ static int handle_invept(struct kvm_vcpu *vcpu)
static int handle_invvpid(struct kvm_vcpu *vcpu)
{
- kvm_queue_exception(vcpu, UD_VECTOR);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info;
+ unsigned long type, types;
+ gva_t gva;
+ struct x86_exception e;
+ int vpid;
+
+ if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_VPID) ||
+ !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7;
+
+ if (!(types & (1UL << type))) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return 1;
+ }
+
+ /* according to the intel vmx instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global)
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, false, &gva))
+ return 1;
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
+ sizeof(u32), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ switch (type) {
+ case VMX_VPID_EXTENT_ALL_CONTEXT:
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+ nested_vmx_succeed(vcpu);
+ break;
+ default:
+ /* Trap single context invalidation invvpid calls */
+ BUG_ON(1);
+ break;
+ }
+
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7207,6 +7432,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_pcommit(struct kvm_vcpu *vcpu)
+{
+ /* we never catch pcommit instruct for L1 guest. */
+ WARN_ON(1);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7257,6 +7489,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_PCOMMIT] = handle_pcommit,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7558,6 +7791,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* the XSS exit bitmap in vmcs12.
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ case EXIT_REASON_PCOMMIT:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
default:
return true;
}
@@ -7569,10 +7804,9 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
}
-static int vmx_enable_pml(struct vcpu_vmx *vmx)
+static int vmx_create_pml_buffer(struct vcpu_vmx *vmx)
{
struct page *pml_pg;
- u32 exec_control;
pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!pml_pg)
@@ -7583,24 +7817,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_ENABLE_PML;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
-
return 0;
}
-static void vmx_disable_pml(struct vcpu_vmx *vmx)
+static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
- u32 exec_control;
-
- ASSERT(vmx->pml_pg);
- __free_page(vmx->pml_pg);
- vmx->pml_pg = NULL;
-
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ if (vmx->pml_pg) {
+ __free_page(vmx->pml_pg);
+ vmx->pml_pg = NULL;
+ }
}
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
@@ -7782,6 +8007,9 @@ static void dump_vmcs(void)
vmcs_read32(IDT_VECTORING_INFO_FIELD),
vmcs_read32(IDT_VECTORING_ERROR_CODE));
pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+ if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+ pr_err("TSC Multiplier = 0x%016lx\n",
+ vmcs_readl(TSC_MULTIPLIER));
if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
@@ -7924,10 +8152,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
* apicv
*/
if (!cpu_has_vmx_virtualize_x2apic_mode() ||
- !vmx_vm_has_apicv(vcpu->kvm))
+ !vmx_cpu_uses_apicv(vcpu))
return;
- if (!vm_need_tpr_shadow(vcpu->kvm))
+ if (!cpu_need_tpr_shadow(vcpu))
return;
sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -8029,9 +8257,10 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
}
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu)
{
- if (!vmx_vm_has_apicv(vcpu->kvm))
+ u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap;
+ if (!vmx_cpu_uses_apicv(vcpu))
return;
vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
@@ -8477,8 +8706,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (enable_pml)
- vmx_disable_pml(vmx);
- free_vpid(vmx);
+ vmx_destroy_pml_buffer(vmx);
+ free_vpid(vmx->vpid);
leave_guest_mode(vcpu);
vmx_load_vmcs01(vcpu);
free_nested(vmx);
@@ -8497,7 +8726,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx)
return ERR_PTR(-ENOMEM);
- allocate_vpid(vmx);
+ vmx->vpid = allocate_vpid();
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
if (err)
@@ -8530,7 +8759,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
put_cpu();
if (err)
goto free_vmcs;
- if (vm_need_virtualize_apic_accesses(kvm)) {
+ if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
err = alloc_apic_access_page(kvm);
if (err)
goto free_vmcs;
@@ -8545,8 +8774,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vmcs;
}
- if (nested)
+ if (nested) {
nested_vmx_setup_ctls_msrs(vmx);
+ vmx->nested.vpid02 = allocate_vpid();
+ }
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
@@ -8559,7 +8790,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
* for the guest, etc.
*/
if (enable_pml) {
- err = vmx_enable_pml(vmx);
+ err = vmx_create_pml_buffer(vmx);
if (err)
goto free_vmcs;
}
@@ -8567,13 +8798,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
return &vmx->vcpu;
free_vmcs:
+ free_vpid(vmx->nested.vpid02);
free_loaded_vmcs(vmx->loaded_vmcs);
free_msrs:
kfree(vmx->guest_msrs);
uninit_vcpu:
kvm_vcpu_uninit(&vmx->vcpu);
free_vcpu:
- free_vpid(vmx);
+ free_vpid(vmx->vpid);
kmem_cache_free(kvm_vcpu_cache, vmx);
return ERR_PTR(err);
}
@@ -8648,49 +8880,67 @@ static int vmx_get_lpage_level(void)
return PT_PDPE_LEVEL;
}
+static void vmcs_set_secondary_exec_control(u32 new_ctl)
+{
+ /*
+ * These bits in the secondary execution controls field
+ * are dynamic, the others are mostly based on the hypervisor
+ * architecture and the guest's CPUID. Do not touch the
+ * dynamic bits.
+ */
+ u32 mask =
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+ u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ (new_ctl & ~mask) | (cur_ctl & mask));
+}
+
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exec_control;
+ u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
- vmx->rdtscp_enabled = false;
if (vmx_rdtscp_supported()) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- if (exec_control & SECONDARY_EXEC_RDTSCP) {
- best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
- vmx->rdtscp_enabled = true;
- else {
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- }
+ bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
+ if (!rdtscp_enabled)
+ secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
+
+ if (nested) {
+ if (rdtscp_enabled)
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_RDTSCP;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDTSCP;
}
- if (nested && !vmx->rdtscp_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDTSCP;
}
/* Exposing INVPCID only when PCID is exposed */
best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
if (vmx_invpcid_supported() &&
- best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
- guest_cpuid_has_pcid(vcpu)) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- } else {
- if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- }
+ (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
+ !guest_cpuid_has_pcid(vcpu))) {
+ secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+
if (best)
best->ebx &= ~bit(X86_FEATURE_INVPCID);
}
+
+ vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+ if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
+ if (guest_cpuid_has_pcommit(vcpu))
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_PCOMMIT;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_PCOMMIT;
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9298,13 +9548,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (cpu_has_secondary_exec_ctrls()) {
exec_control = vmx_secondary_exec_control(vmx);
- if (!vmx->rdtscp_enabled)
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_APIC_REGISTER_VIRT);
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_PCOMMIT);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9323,7 +9573,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->nested.apic_access_page));
} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
- (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
+ cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
kvm_vcpu_reload_apic_access_page(vcpu);
@@ -9433,12 +9683,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (enable_vpid) {
/*
- * Trivially support vpid by letting L2s share their parent
- * L1's vpid. TODO: move to a more elaborate solution, giving
- * each L2 its own vpid and exposing the vpid feature to L1.
+ * There is no direct mapping between vpid02 and vpid12, the
+ * vpid02 is per-vCPU for L0 and reused while the value of
+ * vpid12 is changed w/ one invvpid during nested vmentry.
+ * The vpid12 is allocated by L1 for L2, so it will not
+ * influence global bitmap(for vpid01 and vpid02 allocation)
+ * even if spawn a lot of nested vCPUs.
*/
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- vmx_flush_tlb(vcpu);
+ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+ if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+ }
+ } else {
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+ vmx_flush_tlb(vcpu);
+ }
+
}
if (nested_cpu_has_ept(vmcs12)) {
@@ -10278,6 +10540,201 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}
+/*
+ * This routine does the following things for vCPU which is going
+ * to be blocked if VT-d PI is enabled.
+ * - Store the vCPU to the wakeup list, so when interrupts happen
+ * we can find the right vCPU to wake up.
+ * - Change the Posted-interrupt descriptor as below:
+ * 'NDST' <-- vcpu->pre_pcpu
+ * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' is set during this process, which means at least one
+ * interrupt is posted for this vCPU, we cannot block it, in
+ * this case, return 1, otherwise, return 0.
+ *
+ */
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ unsigned int dest;
+ struct pi_desc old, new;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ /*
+ * We should not block the vCPU if
+ * an interrupt is posted for it.
+ */
+ if (pi_test_on(pi_desc) == 1) {
+ spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock_irqrestore(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ vcpu->pre_pcpu = -1;
+
+ return 1;
+ }
+
+ WARN((pi_desc->sn == 1),
+ "Warning: SN field of posted-interrupts "
+ "is set before blocking\n");
+
+ /*
+ * Since vCPU can be preempted during this process,
+ * vcpu->cpu could be different with pre_pcpu, we
+ * need to set pre_pcpu as the destination of wakeup
+ * notification event, then we can find the right vCPU
+ * to wakeup in wakeup handler if interrupts happen
+ * when the vCPU is in blocked state.
+ */
+ dest = cpu_physical_id(vcpu->pre_pcpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'wakeup vector' */
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ return 0;
+}
+
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+ unsigned long flags;
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* Allow posting non-urgent interrupts */
+ new.sn = 0;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if(vcpu->pre_pcpu != -1) {
+ spin_lock_irqsave(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock_irqrestore(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ vcpu->pre_pcpu = -1;
+ }
+}
+
+/*
+ * vmx_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+ int idx, ret = -EINVAL;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d PI cannot support posting multicast/broadcast
+ * interrupts to a vCPU, we still use interrupt remapping
+ * for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ *
+ * We will support full lowest-priority interrupt later.
+ */
+
+ kvm_set_msi_irq(e, &irq);
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+ continue;
+
+ vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
+ vcpu_info.vector = irq.vector;
+
+ trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+ if (set)
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+ else {
+ /* suppress notification event before unposting */
+ pi_set_sn(vcpu_to_pi_desc(vcpu));
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ pi_clear_sn(vcpu_to_pi_desc(vcpu));
+ }
+
+ if (ret < 0) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -10297,7 +10754,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .update_db_bp_intercept = update_exception_bitmap,
+ .update_bp_intercept = update_exception_bitmap,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.get_segment_base = vmx_get_segment_base,
@@ -10347,7 +10804,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .vm_has_apicv = vmx_vm_has_apicv,
+ .cpu_uses_apicv = vmx_cpu_uses_apicv,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
@@ -10371,11 +10828,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .set_tsc_khz = vmx_set_tsc_khz,
.read_tsc_offset = vmx_read_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
- .adjust_tsc_offset = vmx_adjust_tsc_offset,
- .compute_tsc_offset = vmx_compute_tsc_offset,
+ .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
.read_l1_tsc = vmx_read_l1_tsc,
.set_tdp_cr3 = vmx_set_cr3,
@@ -10394,7 +10849,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+ .pre_block = vmx_pre_block,
+ .post_block = vmx_post_block,
+
.pmu_ops = &intel_pmu_ops,
+
+ .update_pi_irte = vmx_update_pi_irte,
};
static int __init vmx_init(void)