diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-31 09:30:41 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-31 09:30:41 -0800 |
commit | e813e65038389b66d2f8dd87588694caf8dc2923 (patch) | |
tree | 4595d8ebaf672b79b412bd663a13907fd785478d /arch/x86/kvm | |
parent | ccaaaf6fe5a5e1fffca5cca0f3fc4ec84d7ae752 (diff) | |
parent | 4cbc418a44d5067133271bb6eeac2382f2bf94f7 (diff) |
Merge tag 'kvm-5.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
"This is the first batch of KVM changes.
ARM:
- cleanups and corner case fixes.
PPC:
- Bugfixes
x86:
- Support for mapping DAX areas with large nested page table entries.
- Cleanups and bugfixes here too. A particularly important one is a
fix for FPU load when the thread has TIF_NEED_FPU_LOAD. There is
also a race condition which could be used in guest userspace to
exploit the guest kernel, for which the embargo expired today.
- Fast path for IPI delivery vmexits, shaving about 200 clock cycles
from IPI latency.
- Protect against "Spectre-v1/L1TF" (bring data in the cache via
speculative out of bound accesses, use L1TF on the sibling
hyperthread to read it), which unfortunately is an even bigger
whack-a-mole game than SpectreV1.
Sean continues his mission to rewrite KVM. In addition to a sizable
number of x86 patches, this time he contributed a pretty large
refactoring of vCPU creation that affects all architectures but should
not have any visible effect.
s390 will come next week together with some more x86 patches"
* tag 'kvm-5.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits)
x86/KVM: Clean up host's steal time structure
x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed
x86/kvm: Cache gfn to pfn translation
x86/kvm: Introduce kvm_(un)map_gfn()
x86/kvm: Be careful not to clear KVM_VCPU_FLUSH_TLB bit
KVM: PPC: Book3S PR: Fix -Werror=return-type build failure
KVM: PPC: Book3S HV: Release lock on page-out failure path
KVM: arm64: Treat emulated TVAL TimerValue as a signed 32-bit integer
KVM: arm64: pmu: Only handle supported event counters
KVM: arm64: pmu: Fix chained SW_INCR counters
KVM: arm64: pmu: Don't mark a counter as chained if the odd one is disabled
KVM: arm64: pmu: Don't increment SW_INCR if PMCR.E is unset
KVM: x86: Use a typedef for fastop functions
KVM: X86: Add 'else' to unify fastop and execute call path
KVM: x86: inline memslot_valid_for_gpte
KVM: x86/mmu: Use huge pages for DAX-backed files
KVM: x86/mmu: Remove lpage_is_disallowed() check from set_spte()
KVM: x86/mmu: Fold max_mapping_level() into kvm_mmu_hugepage_adjust()
KVM: x86/mmu: Zap any compound page when collapsing sptes
KVM: x86/mmu: Remove obsolete gfn restoration in FNAME(fetch)
...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 9 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 45 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 133 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 17 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.c | 41 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/irq_comm.c | 18 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 37 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 9 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 605 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 88 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 12 | ||||
-rw-r--r-- | arch/x86/kvm/mtrr.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.h | 18 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 134 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/capabilities.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/evmcs.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 189 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 24 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs_shadow_fields.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 294 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 569 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 23 |
25 files changed, 1208 insertions, 1094 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cf55629ff0ff..b1c469446b07 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -62,7 +62,7 @@ u64 kvm_supported_xcr0(void) return xcr0; } -#define F(x) bit(X86_FEATURE_##x) +#define F feature_bit int kvm_update_cpuid(struct kvm_vcpu *vcpu) { @@ -281,8 +281,9 @@ out: return r; } -static void cpuid_mask(u32 *word, int wordnum) +static __always_inline void cpuid_mask(u32 *word, int wordnum) { + reverse_cpuid_check(wordnum); *word &= boot_cpu_data.x86_capability[wordnum]; } @@ -352,6 +353,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; unsigned f_la57; + unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0; /* cpuid 7.0.ebx */ const u32 kvm_cpuid_7_0_ebx_x86_features = @@ -363,7 +365,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | + F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; @@ -392,6 +394,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* Set LA57 based on hardware capability. */ entry->ecx |= f_la57; entry->ecx |= f_umip; + entry->ecx |= f_pku; /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d78a61408243..7366c618aa04 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -53,15 +53,46 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, + [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, }; -static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) +/* + * Reverse CPUID and its derivatives can only be used for hardware-defined + * feature words, i.e. words whose bits directly correspond to a CPUID leaf. + * Retrieving a feature bit or masking guest CPUID from a Linux-defined word + * is nonsensical as the bit number/mask is an arbitrary software-defined value + * and can't be used by KVM to query/control guest capabilities. And obviously + * the leaf being queried must have an entry in the lookup table. + */ +static __always_inline void reverse_cpuid_check(unsigned x86_leaf) { - unsigned x86_leaf = x86_feature / 32; - + BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); +} + +/* + * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain + * the hardware defined bit number (stored in bits 4:0) and a software defined + * "word" (stored in bits 31:5). The word is used to index into arrays of + * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has(). + */ +static __always_inline u32 __feature_bit(int x86_feature) +{ + reverse_cpuid_check(x86_feature / 32); + return 1 << (x86_feature & 31); +} +#define feature_bit(name) __feature_bit(X86_FEATURE_##name) + +static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) +{ + unsigned x86_leaf = x86_feature / 32; + + reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; } @@ -93,15 +124,11 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_ { int *reg; - if (x86_feature == X86_FEATURE_XSAVE && - !static_cpu_has(X86_FEATURE_XSAVE)) - return false; - reg = guest_cpuid_get_register(vcpu, x86_feature); if (!reg) return false; - return *reg & bit(x86_feature); + return *reg & __feature_bit(x86_feature); } static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) @@ -110,7 +137,7 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x8 reg = guest_cpuid_get_register(vcpu, x86_feature); if (reg) - *reg &= ~bit(x86_feature); + *reg &= ~__feature_bit(x86_feature); } static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 952d1a4f4d7e..ddbc61984227 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -22,6 +22,7 @@ #include "kvm_cache_regs.h" #include <asm/kvm_emulate.h> #include <linux/stringify.h> +#include <asm/fpu/api.h> #include <asm/debugreg.h> #include <asm/nospec-branch.h> @@ -310,7 +311,9 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) #define ON64(x) #endif -static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); +typedef void (*fastop_t)(struct fastop *); + +static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); #define __FOP_FUNC(name) \ ".align " __stringify(FASTOP_SIZE) " \n\t" \ @@ -1075,8 +1078,23 @@ static void fetch_register_operand(struct operand *op) } } -static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) +static void emulator_get_fpu(void) +{ + fpregs_lock(); + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); +} + +static void emulator_put_fpu(void) { + fpregs_unlock(); +} + +static void read_sse_reg(sse128_t *data, int reg) +{ + emulator_get_fpu(); switch (reg) { case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; @@ -1098,11 +1116,12 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) #endif default: BUG(); } + emulator_put_fpu(); } -static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, - int reg) +static void write_sse_reg(sse128_t *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; @@ -1124,10 +1143,12 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, #endif default: BUG(); } + emulator_put_fpu(); } -static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +static void read_mmx_reg(u64 *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; @@ -1139,10 +1160,12 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; default: BUG(); } + emulator_put_fpu(); } -static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +static void write_mmx_reg(u64 *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; @@ -1154,6 +1177,7 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; default: BUG(); } + emulator_put_fpu(); } static int em_fninit(struct x86_emulate_ctxt *ctxt) @@ -1161,7 +1185,9 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); + emulator_get_fpu(); asm volatile("fninit"); + emulator_put_fpu(); return X86EMUL_CONTINUE; } @@ -1172,7 +1198,9 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); + emulator_get_fpu(); asm volatile("fnstcw %0": "+m"(fcw)); + emulator_put_fpu(); ctxt->dst.val = fcw; @@ -1186,7 +1214,9 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); + emulator_get_fpu(); asm volatile("fnstsw %0": "+m"(fsw)); + emulator_put_fpu(); ctxt->dst.val = fsw; @@ -1205,7 +1235,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->type = OP_XMM; op->bytes = 16; op->addr.xmm = reg; - read_sse_reg(ctxt, &op->vec_val, reg); + read_sse_reg(&op->vec_val, reg); return; } if (ctxt->d & Mmx) { @@ -1256,7 +1286,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->type = OP_XMM; op->bytes = 16; op->addr.xmm = ctxt->modrm_rm; - read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); + read_sse_reg(&op->vec_val, ctxt->modrm_rm); return rc; } if (ctxt->d & Mmx) { @@ -1833,10 +1863,10 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->bytes * op->count); break; case OP_XMM: - write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); + write_sse_reg(&op->vec_val, op->addr.xmm); break; case OP_MM: - write_mmx_reg(ctxt, &op->mm_val, op->addr.mm); + write_mmx_reg(&op->mm_val, op->addr.mm); break; case OP_NONE: /* no writeback */ @@ -2348,12 +2378,7 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) { #ifdef CONFIG_X86_64 - u32 eax, ebx, ecx, edx; - - eax = 0x80000001; - ecx = 0; - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - return edx & bit(X86_FEATURE_LM); + return ctxt->ops->guest_has_long_mode(ctxt); #else return false; #endif @@ -3618,18 +3643,11 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -#define FFL(x) bit(X86_FEATURE_##x) - static int em_movbe(struct x86_emulate_ctxt *ctxt) { - u32 ebx, ecx, edx, eax = 1; u16 tmp; - /* - * Check MOVBE is set in the guest-visible CPUID leaf. - */ - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - if (!(ecx & FFL(MOVBE))) + if (!ctxt->ops->guest_has_movbe(ctxt)) return emulate_ud(ctxt); switch (ctxt->op_bytes) { @@ -4027,10 +4045,7 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt) static int check_fxsr(struct x86_emulate_ctxt *ctxt) { - u32 eax = 1, ebx, ecx = 0, edx; - - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - if (!(edx & FFL(FXSR))) + if (!ctxt->ops->guest_has_fxsr(ctxt)) return emulate_ud(ctxt); if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) @@ -4092,8 +4107,12 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; + emulator_get_fpu(); + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + emulator_put_fpu(); + if (rc != X86EMUL_CONTINUE) return rc; @@ -4136,6 +4155,8 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; + emulator_get_fpu(); + if (size < __fxstate_size(16)) { rc = fxregs_fixup(&fx_state, size); if (rc != X86EMUL_CONTINUE) @@ -4151,6 +4172,8 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); out: + emulator_put_fpu(); + return rc; } @@ -5210,16 +5233,28 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->ad_bytes = def_ad_bytes ^ 6; break; case 0x26: /* ES override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_ES; + break; case 0x2e: /* CS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_CS; + break; case 0x36: /* SS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_SS; + break; case 0x3e: /* DS override */ has_seg_override = true; - ctxt->seg_override = (ctxt->b >> 3) & 3; + ctxt->seg_override = VCPU_SREG_DS; break; case 0x64: /* FS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_FS; + break; case 0x65: /* GS override */ has_seg_override = true; - ctxt->seg_override = ctxt->b & 7; + ctxt->seg_override = VCPU_SREG_GS; break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) @@ -5303,10 +5338,15 @@ done_prefixes: } break; case Escape: - if (ctxt->modrm > 0xbf) - opcode = opcode.u.esc->high[ctxt->modrm - 0xc0]; - else + if (ctxt->modrm > 0xbf) { + size_t size = ARRAY_SIZE(opcode.u.esc->high); + u32 index = array_index_nospec( + ctxt->modrm - 0xc0, size); + + opcode = opcode.u.esc->high[index]; + } else { opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; + } break; case InstrDual: if ((ctxt->modrm >> 6) == 3) @@ -5448,7 +5488,9 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { int rc; + emulator_get_fpu(); rc = asm_safe("fwait"); + emulator_put_fpu(); if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); @@ -5456,14 +5498,13 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, - struct operand *op) +static void fetch_possible_mmx_operand(struct operand *op) { if (op->type == OP_MM) - read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); + read_mmx_reg(&op->mm_val, op->addr.mm); } -static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) +static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) { ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; @@ -5539,10 +5580,10 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) * Now that we know the fpu is exception safe, we can fetch * operands from it. */ - fetch_possible_mmx_operand(ctxt, &ctxt->src); - fetch_possible_mmx_operand(ctxt, &ctxt->src2); + fetch_possible_mmx_operand(&ctxt->src); + fetch_possible_mmx_operand(&ctxt->src2); if (!(ctxt->d & Mov)) - fetch_possible_mmx_operand(ctxt, &ctxt->dst); + fetch_possible_mmx_operand(&ctxt->dst); } if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { @@ -5641,14 +5682,10 @@ special_insn: ctxt->eflags &= ~X86_EFLAGS_RF; if (ctxt->execute) { - if (ctxt->d & Fastop) { - void (*fop)(struct fastop *) = (void *)ctxt->execute; - rc = fastop(ctxt, fop); - if (rc != X86EMUL_CONTINUE) - goto done; - goto writeback; - } - rc = ctxt->execute(ctxt); + if (ctxt->d & Fastop) + rc = fastop(ctxt, (fastop_t)ctxt->execute); + else + rc = ctxt->execute(ctxt); if (rc != X86EMUL_CONTINUE) goto done; goto writeback; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 23ff65504d7e..4df1c965bf1a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -33,6 +33,7 @@ #include <trace/events/kvm.h> #include "trace.h" +#include "irq.h" #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) @@ -809,11 +810,12 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, u32 index, u64 *pdata) { struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + size_t size = ARRAY_SIZE(hv->hv_crash_param); - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + if (WARN_ON_ONCE(index >= size)) return -EINVAL; - *pdata = hv->hv_crash_param[index]; + *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; return 0; } @@ -852,11 +854,12 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, u32 index, u64 data) { struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + size_t size = ARRAY_SIZE(hv->hv_crash_param); - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + if (WARN_ON_ONCE(index >= size)) return -EINVAL; - hv->hv_crash_param[index] = data; + hv->hv_crash_param[array_index_nospec(index, size)] = data; return 0; } @@ -1058,7 +1061,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } @@ -1121,7 +1124,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; /* - * Clear apic_assist portion of f(struct hv_vp_assist_page + * Clear apic_assist portion of struct hv_vp_assist_page * only, there can be valuable data in the rest which needs * to be preserved e.g. on migration. */ @@ -1178,7 +1181,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 8b38bb4868a6..629a09ca9860 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -460,10 +460,14 @@ static int picdev_write(struct kvm_pic *s, switch (addr) { case 0x20: case 0x21: + pic_lock(s); + pic_ioport_write(&s->pics[0], addr, data); + pic_unlock(s); + break; case 0xa0: case 0xa1: pic_lock(s); - pic_ioport_write(&s->pics[addr >> 7], addr, data); + pic_ioport_write(&s->pics[1], addr, data); pic_unlock(s); break; case 0x4d0: diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 9fd2dd89a1c5..26aa22cb9b29 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -36,6 +36,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/nospec.h> #include <asm/processor.h> #include <asm/page.h> #include <asm/current.h> @@ -68,13 +69,14 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, default: { u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; - u64 redir_content; + u64 redir_content = ~0ULL; - if (redir_index < IOAPIC_NUM_PINS) - redir_content = - ioapic->redirtbl[redir_index].bits; - else - redir_content = ~0ULL; + if (redir_index < IOAPIC_NUM_PINS) { + u32 index = array_index_nospec( + redir_index, IOAPIC_NUM_PINS); + + redir_content = ioapic->redirtbl[index].bits; + } result = (ioapic->ioregsel & 0x1) ? (redir_content >> 32) & 0xffffffff : @@ -108,8 +110,9 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) union kvm_ioapic_redirect_entry *e; e = &ioapic->redirtbl[RTC_GSI]; - if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, - e->fields.dest_mode)) + if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, + e->fields.dest_id, + kvm_lapic_irq_dest_mode(!!e->fields.dest_mode))) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); @@ -188,7 +191,7 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, /* * Return 0 for coalesced interrupts; for edge-triggered interrupts, * this only happens if a previous edge has not been delivered due - * do masking. For level interrupts, the remote_irr field tells + * to masking. For level interrupts, the remote_irr field tells * us if the interrupt is waiting for an EOI. * * RTC is special: it is edge-triggered, but userspace likes to know @@ -250,8 +253,10 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { - if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode) || + u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); + + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, + e->fields.dest_id, dm) || kvm_apic_pending_eoi(vcpu, e->fields.vector)) __set_bit(e->fields.vector, ioapic_handled_vectors); @@ -292,6 +297,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (index >= IOAPIC_NUM_PINS) return; + index = array_index_nospec(index, IOAPIC_NUM_PINS); e = &ioapic->redirtbl[index]; mask_before = e->fields.mask; /* Preserve read-only fields */ @@ -327,11 +333,12 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.delivery_mode == APIC_DM_FIXED) { struct kvm_lapic_irq irq; - irq.shorthand = 0; + irq.shorthand = APIC_DEST_NOSHORT; irq.vector = e->fields.vector; irq.delivery_mode = e->fields.delivery_mode << 8; irq.dest_id = e->fields.dest_id; - irq.dest_mode = e->fields.dest_mode; + irq.dest_mode = + kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); bitmap_zero(&vcpu_bitmap, 16); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, &vcpu_bitmap); @@ -343,7 +350,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) * keep ioapic_handled_vectors synchronized. */ irq.dest_id = old_dest_id; - irq.dest_mode = old_dest_mode; + irq.dest_mode = + kvm_lapic_irq_dest_mode( + !!e->fields.dest_mode); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, &vcpu_bitmap); } @@ -369,11 +378,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.dest_id = entry->fields.dest_id; irqe.vector = entry->fields.vector; - irqe.dest_mode = entry->fields.dest_mode; + irqe.dest_mode = kvm_lapic_irq_dest_mode(!!entry->fields.dest_mode); irqe.trig_mode = entry->fields.trig_mode; irqe.delivery_mode = entry->fields.delivery_mode << 8; irqe.level = 1; - irqe.shorthand = 0; + irqe.shorthand = APIC_DEST_NOSHORT; irqe.msi_redir_hint = false; if (irqe.trig_mode == IOAPIC_EDGE_TRIG) diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ea1a4e0297da..2fb2e3c80724 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -116,9 +116,6 @@ static inline int ioapic_in_kernel(struct kvm *kvm) } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); -bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, unsigned int dest, int dest_mode); -int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); @@ -126,9 +123,6 @@ void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, int level, bool line_status); void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, - struct dest_map *dest_map); void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7c6233d37c64..f173ab6b407e 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -113,5 +113,8 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu); int kvm_setup_default_irq_routing(struct kvm *kvm); int kvm_setup_empty_irq_routing(struct kvm *kvm); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, + struct dest_map *dest_map); #endif diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8ecd48d31800..79afa0bb5f41 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -52,15 +52,15 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; - if (irq->dest_mode == 0 && irq->dest_id == 0xff && - kvm_lowest_prio_delivery(irq)) { + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + + if (irq->dest_mode == APIC_DEST_PHYSICAL && + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) - return r; - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); kvm_for_each_vcpu(i, vcpu, kvm) { @@ -114,13 +114,14 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); irq->vector = (e->msi.data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; - irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; + irq->dest_mode = kvm_lapic_irq_dest_mode( + !!((1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo)); irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; irq->delivery_mode = e->msi.data & 0x700; irq->msi_redir_hint = ((e->msi.address_lo & MSI_ADDR_REDIRECTION_LOWPRI) > 0); irq->level = 1; - irq->shorthand = 0; + irq->shorthand = APIC_DEST_NOSHORT; } EXPORT_SYMBOL_GPL(kvm_set_msi_irq); @@ -416,7 +417,8 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm_set_msi_irq(vcpu->kvm, entry, &irq); - if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, + if (irq.level && + kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, irq.dest_id, irq.dest_mode)) __set_bit(irq.vector, ioapic_handled_vectors); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cf9177b4a07f..cce1e6b204c8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -56,9 +56,6 @@ #define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ -#define APIC_SHORT_MASK 0xc0000 -#define APIC_DEST_NOSHORT 0x0 -#define APIC_DEST_MASK 0x800 #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 @@ -792,13 +789,13 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, unsigned int dest, int dest_mode) + int shorthand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; u32 mda = kvm_apic_mda(vcpu, dest, source, target); ASSERT(target); - switch (short_hand) { + switch (shorthand) { case APIC_DEST_NOSHORT: if (dest_mode == APIC_DEST_PHYSICAL) return kvm_apic_match_physical_addr(target, mda); @@ -967,12 +964,12 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, } /* - * This routine tries to handler interrupts in posted mode, here is how + * This routine tries to handle interrupts in posted mode, here is how * it deals with different cases: * - For single-destination interrupts, handle it in posted mode * - Else if vector hashing is enabled and it is a lowest-priority * interrupt, handle it in posted mode and use the following mechanism - * to find the destinaiton vCPU. + * to find the destination vCPU. * 1. For lowest-priority interrupts, store all the possible * destination vCPUs in an array. * 2. Use "guest vector % max number of destination vCPUs" to find @@ -1151,7 +1148,7 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, if (!kvm_apic_present(vcpu)) continue; if (!kvm_apic_match_dest(vcpu, NULL, - irq->delivery_mode, + irq->shorthand, irq->dest_id, irq->dest_mode)) continue; @@ -1574,9 +1571,9 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) struct kvm_timer *ktimer = &apic->lapic_timer; kvm_apic_local_deliver(apic, APIC_LVTT); - if (apic_lvtt_tscdeadline(apic)) + if (apic_lvtt_tscdeadline(apic)) { ktimer->tscdeadline = 0; - if (apic_lvtt_oneshot(apic)) { + } else if (apic_lvtt_oneshot(apic)) { ktimer->tscdeadline = 0; ktimer->target_expiration = 0; } @@ -1963,15 +1960,20 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: - case APIC_LVTERR: + case APIC_LVTERR: { /* TODO: Check vector */ + size_t size; + u32 index; + if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - - val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; + size = ARRAY_SIZE(apic_lvt_mask); + index = array_index_nospec( + (reg - APIC_LVTT) >> 4, size); + val &= apic_lvt_mask[index]; kvm_lapic_set_reg(apic, reg, val); - break; + } case APIC_LVTT: if (!kvm_apic_sw_enabled(apic)) @@ -2373,14 +2375,13 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0); - int r = 0; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) - r = 1; + return 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; - return r; + return 1; + return 0; } void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 39925afdfcdc..ec730ce7a344 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -10,8 +10,9 @@ #define KVM_APIC_SIPI 1 #define KVM_APIC_LVT_NUM 6 -#define KVM_APIC_SHORT_MASK 0xc0000 -#define KVM_APIC_DEST_MASK 0x800 +#define APIC_SHORT_MASK 0xc0000 +#define APIC_DEST_NOSHORT 0x0 +#define APIC_DEST_MASK 0x800 #define APIC_BUS_CYCLE_NS 1 #define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) @@ -82,8 +83,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val); int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, unsigned int dest, int dest_mode); - + int shorthand, unsigned int dest, int dest_mode); +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a32b847a8089..adc84f0f16ba 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -418,22 +418,24 @@ static inline bool is_access_track_spte(u64 spte) * requires a full MMU zap). The flag is instead explicitly queried when * checking for MMIO spte cache hits. */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0) +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) #define MMIO_SPTE_GEN_LOW_START 3 #define MMIO_SPTE_GEN_LOW_END 11 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ MMIO_SPTE_GEN_LOW_START) -#define MMIO_SPTE_GEN_HIGH_START 52 -#define MMIO_SPTE_GEN_HIGH_END 61 +#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_END 62 #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ MMIO_SPTE_GEN_HIGH_START) + static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; @@ -444,8 +446,6 @@ static u64 get_mmio_spte_generation(u64 spte) { u64 gen; - spte &= ~shadow_mmio_mask; - gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; return gen; @@ -538,16 +538,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static u8 kvm_get_shadow_phys_bits(void) { /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected - * in CPU detection code, but MKTME treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore for MKTME - * we should still return physical address bits reported by CPUID. + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID. */ - if (!boot_cpu_has(X86_FEATURE_TME) || - WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008)) - return boot_cpu_data.x86_phys_bits; + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; - return cpuid_eax(0x80000008) & 0xff; + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; } static void kvm_mmu_reset_all_pte_masks(void) @@ -1260,56 +1264,6 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del(&sp->lpage_disallowed_link); } -static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, - struct kvm_memory_slot *slot) -{ - struct kvm_lpage_info *linfo; - - if (slot) { - linfo = lpage_info_slot(gfn, slot, level); - return !!linfo->disallow_lpage; - } - - return true; -} - -static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, - int level) -{ - struct kvm_memory_slot *slot; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); -} - -static int host_mapping_level(struct kvm *kvm, gfn_t gfn) -{ - unsigned long page_size; - int i, ret = 0; - - page_size = kvm_host_page_size(kvm, gfn); - - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - if (page_size >= KVM_HPAGE_SIZE(i)) - ret = i; - else - break; - } - - return ret; -} - -static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, - bool no_dirty_log) -{ - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) - return false; - if (no_dirty_log && slot->dirty_bitmap) - return false; - - return true; -} - static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) @@ -1317,40 +1271,14 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!memslot_valid_for_gpte(slot, no_dirty_log)) - slot = NULL; + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return NULL; + if (no_dirty_log && slot->dirty_bitmap) + return NULL; return slot; } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, - bool *force_pt_level) -{ - int host_level, level, max_level; - struct kvm_memory_slot *slot; - - if (unlikely(*force_pt_level)) - return PT_PAGE_TABLE_LEVEL; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); - *force_pt_level = !memslot_valid_for_gpte(slot, true); - if (unlikely(*force_pt_level)) - return PT_PAGE_TABLE_LEVEL; - - host_level = host_mapping_level(vcpu->kvm, large_gfn); - - if (host_level == PT_PAGE_TABLE_LEVEL) - return host_level; - - max_level = min(kvm_x86_ops->get_lpage_level(), host_level); - - for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot)) - break; - - return level - 1; -} - /* * About rmap_head encoding: * @@ -1410,7 +1338,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, if (j != 0) return; if (!prev_desc && !desc->more) - rmap_head->val = (unsigned long)desc->sptes[0]; + rmap_head->val = 0; else if (prev_desc) prev_desc->more = desc->more; @@ -1525,7 +1453,7 @@ struct rmap_iterator { /* * Iteration must be started by this function. This should also be used after * removing/dropping sptes from the rmap link because in such cases the - * information in the itererator may not be valid. + * information in the iterator may not be valid. * * Returns sptep if found, NULL otherwise. */ @@ -2899,6 +2827,26 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); } +static int make_mmu_pages_available(struct kvm_vcpu *vcpu) +{ + LIST_HEAD(invalid_list); + + if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + return 0; + + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { + if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) + break; + + ++vcpu->kvm->stat.mmu_recycled; + } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + + if (!kvm_mmu_available_pages(vcpu->kvm)) + return -ENOSPC; + return 0; +} + /* * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get dead lock @@ -3099,17 +3047,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - - /* - * Other vcpu creates new sp in the window between - * mapping_level() and acquiring mmu-lock. We can - * allow guest to retry the access, the mapping can - * be fixed if guest refault. - */ - if (level > PT_PAGE_TABLE_LEVEL && - mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) - goto done; - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; /* @@ -3141,7 +3078,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: if (mmu_spte_update(sptep, spte)) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; -done: return ret; } @@ -3294,6 +3230,83 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, + kvm_pfn_t pfn, struct kvm_memory_slot *slot) +{ + unsigned long hva; + pte_t *pte; + int level; + + BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K || + PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M || + PT_PDPE_LEVEL != (int)PG_LEVEL_1G); + + if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) + return PT_PAGE_TABLE_LEVEL; + + /* + * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() + * is not solely for performance, it's also necessary to avoid the + * "writable" check in __gfn_to_hva_many(), which will always fail on + * read-only memslots due to gfn_to_hva() assuming writes. Earlier + * page fault steps have already verified the guest isn't writing a + * read-only memslot. + */ + hva = __gfn_to_hva_memslot(slot, gfn); + + pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); + if (unlikely(!pte)) + return PT_PAGE_TABLE_LEVEL; + + return level; +} + +static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp) +{ + struct kvm_memory_slot *slot; + struct kvm_lpage_info *linfo; + kvm_pfn_t pfn = *pfnp; + kvm_pfn_t mask; + int level; + + if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) + return PT_PAGE_TABLE_LEVEL; + + if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) + return PT_PAGE_TABLE_LEVEL; + + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); + if (!slot) + return PT_PAGE_TABLE_LEVEL; + + max_level = min(max_level, kvm_x86_ops->get_lpage_level()); + for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { + linfo = lpage_info_slot(gfn, slot, max_level); + if (!linfo->disallow_lpage) + break; + } + + if (max_level == PT_PAGE_TABLE_LEVEL) + return PT_PAGE_TABLE_LEVEL; + + level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); + if (level == PT_PAGE_TABLE_LEVEL) + return level; + + level = min(level, max_level); + + /* + * mmu_notifier_retry() was successful and mmu_lock is held, so + * the pmd can't be split from under us. + */ + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + *pfnp = pfn & ~mask; + + return level; +} + static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { @@ -3318,18 +3331,20 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, - int map_writable, int level, kvm_pfn_t pfn, - bool prefault, bool lpage_disallowed) + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault, bool account_disallowed_nx_lpage) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int ret; + int level, ret; gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn); + trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { /* @@ -3348,7 +3363,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (lpage_disallowed) + if (account_disallowed_nx_lpage) account_huge_nx_page(vcpu->kvm, sp); } } @@ -3384,45 +3399,6 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) return -EFAULT; } -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t gfn, kvm_pfn_t *pfnp, - int *levelp) -{ - kvm_pfn_t pfn = *pfnp; - int level = *levelp; - - /* - * Check if it's a transparent hugepage. If this would be an - * hugetlbfs page, level wouldn't be set to - * PT_PAGE_TABLE_LEVEL and there would be no adjustment done - * here. - */ - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - PageTransCompoundMap(pfn_to_page(pfn)) && - !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { - unsigned long mask; - /* - * mmu_notifier_retry was successful and we hold the - * mmu_lock here, so the pmd can't become splitting - * from under us, and in turn - * __split_huge_page_refcount() can't run from under - * us and we can safely transfer the refcount from - * PG_tail to PG_head as we switch the pfn to tail to - * head. - */ - *levelp = level = PT_DIRECTORY_LEVEL; - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - kvm_get_pfn(pfn); - *pfnp = pfn; - } - } -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, kvm_pfn_t pfn, unsigned access, int *ret_val) { @@ -3528,7 +3504,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) * - true: let the vcpu to access on the same address again. * - false: let the real page fault path to fix it. */ -static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, +static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code) { struct kvm_shadow_walk_iterator iterator; @@ -3537,9 +3513,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, u64 spte = 0ull; uint retry_count = 0; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return false; - if (!page_fault_can_be_fast(error_code)) return false; @@ -3548,9 +3521,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) - if (!is_shadow_present_pte(spte) || - iterator.level < level) + for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) + if (!is_shadow_present_pte(spte)) break; sp = page_header(__pa(iterator.sptep)); @@ -3626,71 +3598,13 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, } while (true); - trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, + trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, spte, fault_handled); walk_shadow_page_lockless_end(vcpu); return fault_handled; } -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); -static int make_mmu_pages_available(struct kvm_vcpu *vcpu); - -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, - gfn_t gfn, bool prefault) -{ - int r; - int level; - bool force_pt_level; - kvm_pfn_t pfn; - unsigned long mmu_seq; - bool map_writable, write = error_code & PFERR_WRITE_MASK; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); - - force_pt_level = lpage_disallowed; - level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - /* - * This path builds a PAE pagetable - so we can map - * 2mb pages at maximum. Therefore check if the level - * is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } - - if (fast_page_fault(vcpu, v, level, error_code)) - return RET_PF_RETRY; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) - return RET_PF_RETRY; - - if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) - return r; - - r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) - goto out_unlock; - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, pfn, - prefault, false); -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return r; -} - static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, struct list_head *invalid_list) { @@ -3981,7 +3895,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { if (exception) @@ -3989,7 +3903,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, return vaddr; } -static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, +static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { @@ -4001,20 +3915,14 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, static bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { - int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; + int bit7 = (pte >> 7) & 1; - return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | - ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); + return pte & rsvd_check->rsvd_bits_mask[bit7][level-1]; } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte) { - return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); -} - -static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) -{ - return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); + return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -4038,11 +3946,11 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { struct kvm_shadow_walk_iterator iterator; u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; + struct rsvd_bits_validate *rsvd_check; int root, leaf; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - goto exit; + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; walk_shadow_page_lockless_begin(vcpu); @@ -4058,8 +3966,13 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) if (!is_shadow_present_pte(spte)) break; - reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte, - iterator.level); + /* + * Use a bitwise-OR instead of a logical-OR to aggregate the + * reserved bit and EPT's invalid memtype/XWR checks to avoid + * adding a Jcc in the loop. + */ + reserved |= __is_bad_mt_xwr(rsvd_check, spte) | + __is_rsvd_bits_set(rsvd_check, spte, iterator.level); } walk_shadow_page_lockless_end(vcpu); @@ -4073,7 +3986,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) root--; } } -exit: + *sptep = spte; return reserved; } @@ -4137,9 +4050,6 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) struct kvm_shadow_walk_iterator iterator; u64 spte; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { clear_sp_write_flooding_count(iterator.sptep); @@ -4149,29 +4059,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code, bool prefault) -{ - gfn_t gfn = gva >> PAGE_SHIFT; - int r; - - pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); - - if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return RET_PF_EMULATE; - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - - - return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code, gfn, prefault); -} - -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) +static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + gfn_t gfn) { struct kvm_arch_async_pf arch; @@ -4180,11 +4069,13 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu->direct_map; arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); + return kvm_setup_async_pf(vcpu, cr2_or_gpa, + kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, + bool *writable) { struct kvm_memory_slot *slot; bool async; @@ -4204,12 +4095,12 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; /* *pfn has correct page already */ if (!prefault && kvm_can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(gva, gfn); + trace_kvm_try_async_get_page(cr2_or_gpa, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(gva, gfn); + trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return true; - } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) + } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) return true; } @@ -4217,11 +4108,77 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; } +static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + bool prefault, int max_level, bool is_tdp) +{ + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool lpage_disallowed = exec && is_nx_huge_page_enabled(); + bool map_writable; + + gfn_t gfn = gpa >> PAGE_SHIFT; + unsigned long mmu_seq; + kvm_pfn_t pfn; + int r; + + if (page_fault_handle_page_track(vcpu, error_code, gfn)) + return RET_PF_EMULATE; + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + if (lpage_disallowed) + max_level = PT_PAGE_TABLE_LEVEL; + + if (fast_page_fault(vcpu, gpa, error_code)) + return RET_PF_RETRY; + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + return RET_PF_RETRY; + + if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) + return r; + + r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; + r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, + prefault, is_tdp && lpage_disallowed); + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return r; +} + +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, + u32 error_code, bool prefault) +{ + pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + + /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ + return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, + PT_DIRECTORY_LEVEL, false); +} + int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { int r = 1; +#ifndef CONFIG_X86_64 + /* A 64-bit CR2 should be impossible on 32-bit KVM. */ + if (WARN_ON_ONCE(fault_address >> 32)) + return -EFAULT; +#endif + vcpu->arch.l1tf_flush_l1d = true; switch (vcpu->arch.apf.host_apf_reason) { default: @@ -4249,76 +4206,23 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); -static bool -check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) -{ - int page_num = KVM_PAGES_PER_HPAGE(level); - - gfn &= ~(page_num - 1); - - return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); -} - -static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, +static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { - kvm_pfn_t pfn; - int r; - int level; - bool force_pt_level; - gfn_t gfn = gpa >> PAGE_SHIFT; - unsigned long mmu_seq; - int write = error_code & PFERR_WRITE_MASK; - bool map_writable; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); - - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); + int max_level; - if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return RET_PF_EMULATE; + for (max_level = PT_MAX_HUGEPAGE_LEVEL; + max_level > PT_PAGE_TABLE_LEVEL; + max_level--) { + int page_num = KVM_PAGES_PER_HPAGE(max_level); + gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - force_pt_level = - lpage_disallowed || - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); - level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - if (level > PT_DIRECTORY_LEVEL && - !check_hugepage_cache_consistency(vcpu, gfn, level)) - level = PT_DIRECTORY_LEVEL; - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) + break; } - if (fast_page_fault(vcpu, gpa, level, error_code)) - return RET_PF_RETRY; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return RET_PF_RETRY; - - if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) - return r; - - r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) - goto out_unlock; - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, - prefault, lpage_disallowed); -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return r; + return direct_page_fault(vcpu, gpa, error_code, prefault, + max_level, true); } static void nonpaging_init_context(struct kvm_vcpu *vcpu, @@ -5496,47 +5400,30 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); -static int make_mmu_pages_available(struct kvm_vcpu *vcpu) -{ - LIST_HEAD(invalid_list); - - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) - return 0; - - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { - if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) - break; - - ++vcpu->kvm->stat.mmu_recycled; - } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - - if (!kvm_mmu_available_pages(vcpu->kvm)) - return -ENOSPC; - return 0; -} - -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { int r, emulation_type = 0; bool direct = vcpu->arch.mmu->direct_map; + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + /* With shadow page tables, fault_address contains a GVA or nGPA. */ if (vcpu->arch.mmu->direct_map) { vcpu->arch.gpa_available = true; - vcpu->arch.gpa_val = cr2; + vcpu->arch.gpa_val = cr2_or_gpa; } r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, cr2, direct); + r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { - r = vcpu->arch.mmu->page_fault(vcpu, cr2, + r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); WARN_ON(r == RET_PF_INVALID); @@ -5556,7 +5443,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, */ if (vcpu->arch.mmu->direct_map && (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); return 1; } @@ -5571,7 +5458,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, * explicitly shadowing L1's page tables, i.e. unprotecting something * for L1 isn't going to magically fix whatever issue cause L2 to fail. */ - if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu)) + if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type = EMULTYPE_ALLOW_RETRY; emulate: /* @@ -5586,7 +5473,7 @@ emulate: return 1; } - return x86_emulate_instruction(vcpu, cr2, emulation_type, insn, + return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); @@ -6015,8 +5902,8 @@ restart: * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - !kvm_is_zone_device_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + (kvm_is_zone_device_pfn(pfn) || + PageCompound(pfn_to_page(pfn)))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) @@ -6249,7 +6136,7 @@ static void kvm_set_mmio_spte_mask(void) * If reserved bit is not supported, clear the present bit to disable * mmio page fault. */ - if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) + if (shadow_phys_bits == 52) mask &= ~1ull; kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 97b21e7fd013..4e1ef0473663 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -128,6 +128,21 @@ static inline int FNAME(is_present_gpte)(unsigned long pte) #endif } +static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte) +{ +#if PTTYPE != PTTYPE_EPT + return false; +#else + return __is_bad_mt_xwr(rsvd_check, gpte); +#endif +} + +static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +{ + return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) || + FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte); +} + static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -175,9 +190,6 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - if (!FNAME(is_present_gpte)(gpte)) goto no_present; @@ -186,6 +198,9 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; + if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + return false; no_present: @@ -291,11 +306,11 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) } /* - * Fetch a guest pte for a guest virtual address + * Fetch a guest pte for a guest virtual address, or for an L2's GPA. */ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t addr, u32 access) + gpa_t addr, u32 access) { int ret; pt_element_t pte; @@ -400,7 +415,7 @@ retry_walk: if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { + if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) { errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } @@ -496,7 +511,7 @@ error: } static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, u32 access) + struct kvm_vcpu *vcpu, gpa_t addr, u32 access) { return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, access); @@ -611,17 +626,17 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If the guest tries to write a write-protected page, we need to * emulate this operation, return 1 to indicate this case. */ -static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct guest_walker *gw, - int write_fault, int hlevel, + int write_fault, int max_level, kvm_pfn_t pfn, bool map_writable, bool prefault, bool lpage_disallowed) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, ret; - gfn_t gfn, base_gfn; + int top_level, hlevel, ret; + gfn_t base_gfn = gw->gfn; direct_access = gw->pte_access; @@ -637,7 +652,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) goto out_gpte_changed; for (shadow_walk_init(&it, vcpu, addr); @@ -666,12 +681,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - /* - * FNAME(page_fault) might have clobbered the bottom bits of - * gw->gfn, restore them from the virtual address. - */ - gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); - base_gfn = gfn; + hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -682,9 +692,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel); - base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == hlevel) break; @@ -765,7 +775,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool prefault) { int write_fault = error_code & PFERR_WRITE_MASK; @@ -773,12 +783,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, struct guest_walker walker; int r; kvm_pfn_t pfn; - int level = PT_PAGE_TABLE_LEVEL; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && is_nx_huge_page_enabled(); - bool force_pt_level = lpage_disallowed; + int max_level; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -818,14 +827,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - level = mapping_level(vcpu, walker.gfn, &force_pt_level); - if (likely(!force_pt_level)) { - level = min(walker.level, level); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } - } else - force_pt_level = true; + if (lpage_disallowed || is_self_change_mapping) + max_level = PT_PAGE_TABLE_LEVEL; + else + max_level = walker.level; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -865,10 +870,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (!force_pt_level) - transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault, lpage_disallowed); + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, + map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: @@ -945,18 +948,19 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, +/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access, struct x86_exception *exception) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); + r = FNAME(walk_addr)(&walker, vcpu, addr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; + gpa |= addr & ~PAGE_MASK; } else if (exception) *exception = walker.fault; @@ -964,7 +968,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, } #if PTTYPE != PTTYPE_EPT -static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, +/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */ +static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { @@ -972,6 +977,11 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, gpa_t gpa = UNMAPPED_GVA; int r; +#ifndef CONFIG_X86_64 + /* A 64-bit GVA should be impossible on 32-bit KVM. */ + WARN_ON_ONCE(vaddr >> 32); +#endif + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); if (r) { diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 7ca8831c7d1a..3c6522b84ff1 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -249,13 +249,13 @@ TRACE_EVENT( TRACE_EVENT( fast_page_fault, - TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, + TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, u64 *sptep, u64 old_spte, bool retry), - TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), + TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), TP_STRUCT__entry( __field(int, vcpu_id) - __field(gva_t, gva) + __field(gpa_t, cr2_or_gpa) __field(u32, error_code) __field(u64 *, sptep) __field(u64, old_spte) @@ -265,7 +265,7 @@ TRACE_EVENT( TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->gva = gva; + __entry->cr2_or_gpa = cr2_or_gpa; __entry->error_code = error_code; __entry->sptep = sptep; __entry->old_spte = old_spte; @@ -273,9 +273,9 @@ TRACE_EVENT( __entry->retry = retry; ), - TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" + TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" " new %llx spurious %d fixed %d", __entry->vcpu_id, - __entry->gva, __print_flags(__entry->error_code, "|", + __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", kvm_mmu_trace_pferr_flags), __entry->sptep, __entry->old_spte, __entry->new_spte, __spte_satisfied(old_spte), __spte_satisfied(new_spte) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 25ce3edd1872..7f0059aa30e1 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -192,11 +192,15 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) break; case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: *seg = 1; - *unit = msr - MSR_MTRRfix16K_80000; + *unit = array_index_nospec( + msr - MSR_MTRRfix16K_80000, + MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1); break; case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: *seg = 2; - *unit = msr - MSR_MTRRfix4K_C0000; + *unit = array_index_nospec( + msr - MSR_MTRRfix4K_C0000, + MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1); break; default: return false; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7ebb62326c14..13332984b6d5 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -2,6 +2,8 @@ #ifndef __KVM_X86_PMU_H #define __KVM_X86_PMU_H +#include <linux/nospec.h> + #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) @@ -102,8 +104,12 @@ static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, u32 base) { - if (msr >= base && msr < base + pmu->nr_arch_gp_counters) - return &pmu->gp_counters[msr - base]; + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) { + u32 index = array_index_nospec(msr - base, + pmu->nr_arch_gp_counters); + + return &pmu->gp_counters[index]; + } return NULL; } @@ -113,8 +119,12 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) { int base = MSR_CORE_PERF_FIXED_CTR0; - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) - return &pmu->fixed_counters[msr - base]; + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) { + u32 index = array_index_nospec(msr - base, + pmu->nr_arch_fixed_counters); + + return &pmu->fixed_counters[index]; + } return NULL; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 122d4ce3b1ab..9dbb990c319a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1307,6 +1307,47 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } +/* + * The default MMIO mask is a single bit (excluding the present bit), + * which could conflict with the memory encryption bit. Check for + * memory encryption support and override the default MMIO mask if + * memory encryption is enabled. + */ +static __init void svm_adjust_mmio_mask(void) +{ + unsigned int enc_bit, mask_bit; + u64 msr, mask; + + /* If there is no memory encryption support, use existing mask */ + if (cpuid_eax(0x80000000) < 0x8000001f) + return; + + /* If memory encryption is not enabled, use existing mask */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + + enc_bit = cpuid_ebx(0x8000001f) & 0x3f; + mask_bit = boot_cpu_data.x86_phys_bits; + + /* Increment the mask bit if it is the same as the encryption bit */ + if (enc_bit == mask_bit) + mask_bit++; + + /* + * If the mask bit location is below 52, then some bits above the + * physical addressing limit will always be reserved, so use the + * rsvd_bits() function to generate the mask. This mask, along with + * the present bit, will be used to generate a page fault with + * PFER.RSV = 1. + * + * If the mask bit location is 52 (or above), then clear the mask. + */ + mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1361,6 +1402,8 @@ static __init int svm_hardware_setup(void) } } + svm_adjust_mmio_mask(); + for_each_possible_cpu(cpu) { r = svm_cpu_init(cpu); if (r) @@ -2144,7 +2187,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) return ret; } -static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) +static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *page; @@ -2153,39 +2196,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; - BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, - "struct kvm_vcpu must be at offset 0 for arch usercopy region"); - - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); - if (!svm) { - err = -ENOMEM; - goto out; - } - - svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!svm->vcpu.arch.user_fpu) { - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); - err = -ENOMEM; - goto free_partial_svm; - } - - svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!svm->vcpu.arch.guest_fpu) { - printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); - err = -ENOMEM; - goto free_user_fpu; - } - - err = kvm_vcpu_init(&svm->vcpu, kvm, id); - if (err) - goto free_svm; + BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); + svm = to_svm(vcpu); err = -ENOMEM; page = alloc_page(GFP_KERNEL_ACCOUNT); if (!page) - goto uninit; + goto out; msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!msrpm_pages) @@ -2222,9 +2239,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - svm_init_osvw(&svm->vcpu); + svm_init_osvw(vcpu); - return &svm->vcpu; + return 0; free_page4: __free_page(hsave_page); @@ -2234,16 +2251,8 @@ free_page2: __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); free_page1: __free_page(page); -uninit: - kvm_vcpu_uninit(&svm->vcpu); -free_svm: - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); -free_partial_svm: - kmem_cache_free(kvm_vcpu_cache, svm); out: - return ERR_PTR(err); + return err; } static void svm_clear_current_vmcb(struct vmcb *vmcb) @@ -2269,10 +2278,6 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); - kmem_cache_free(kvm_vcpu_cache, svm); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -4281,12 +4286,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; - /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) return 1; svm->spec_ctrl = data; - if (!data) break; @@ -4310,13 +4313,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & ~PRED_CMD_IBPB) return 1; - + if (!boot_cpu_has(X86_FEATURE_AMD_IBPB)) + return 1; if (!data) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - if (is_guest_mode(vcpu)) - break; set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: @@ -4519,9 +4521,9 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) */ kvm_for_each_vcpu(i, vcpu, kvm) { bool m = kvm_apic_match_dest(vcpu, apic, - icrl & KVM_APIC_SHORT_MASK, + icrl & APIC_SHORT_MASK, GET_APIC_DEST_FIELD(icrh), - icrl & KVM_APIC_DEST_MASK); + icrl & APIC_DEST_MASK); if (m && !avic_vcpu_is_running(vcpu)) kvm_vcpu_wake_up(vcpu); @@ -4935,7 +4937,8 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = control->exit_info_2; } -static int handle_exit(struct kvm_vcpu *vcpu) +static int handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -4993,7 +4996,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) __func__, svm->vmcb->control.exit_int_info, exit_code); - if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { + kvm_skip_emulated_instruction(vcpu); + return 1; + } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); dump_vmcb(vcpu); @@ -5913,6 +5919,7 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVE) && boot_cpu_has(X86_FEATURE_XSAVES); /* Update nrips enabled cache */ @@ -5924,14 +5931,14 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } -#define F(x) bit(X86_FEATURE_##x) +#define F feature_bit static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { case 0x1: if (avic) - entry->ecx &= ~bit(X86_FEATURE_X2APIC); + entry->ecx &= ~F(X2APIC); break; case 0x80000001: if (nested) @@ -6001,6 +6008,11 @@ static bool svm_has_wbinvd_exit(void) return true; } +static bool svm_pku_supported(void) +{ + return false; +} + #define PRE_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -6186,9 +6198,12 @@ out: return ret; } -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion *exit_fastpath) { - + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == EXIT_REASON_MSR_WRITE) + *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) @@ -7341,6 +7356,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, .pt_supported = svm_pt_supported, + .pku_supported = svm_pku_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 7aa69716d516..283bdb7071af 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -145,6 +145,11 @@ static inline bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } +static inline bool vmx_pku_supported(void) +{ + return boot_cpu_has(X86_FEATURE_PKU); +} + static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 72359709cdc1..89c3e0caf39f 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -350,17 +350,12 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { struct vcpu_vmx *vmx = to_vmx(vcpu); - bool evmcs_already_enabled = vmx->nested.enlightened_vmcs_enabled; vmx->nested.enlightened_vmcs_enabled = true; if (vmcs_version) *vmcs_version = nested_get_evmcs_version(vcpu); - /* We don't support disabling the feature for simplicity. */ - if (evmcs_already_enabled) - return 0; - vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6879966b7648..2db21d59eaf5 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -28,16 +28,6 @@ module_param(nested_early_check, bool, S_IRUGO); failed; \ }) -#define SET_MSR_OR_WARN(vcpu, idx, data) \ -({ \ - bool failed = kvm_set_msr(vcpu, idx, data); \ - if (failed) \ - pr_warn_ratelimited( \ - "%s cannot write MSR (0x%x, 0x%llx)\n", \ - __func__, idx, data); \ - failed; \ -}) - /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context. @@ -2172,8 +2162,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * EXEC CONTROLS */ exec_control = vmx_exec_control(vmx); /* L0's desires */ - exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; + exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; @@ -2550,8 +2540,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->guest_ia32_perf_global_ctrl)) + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl))) return -EINVAL; kvm_rsp_write(vcpu, vmcs12->guest_rsp); @@ -2566,7 +2556,7 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) return -EINVAL; if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && - nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))) + nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) return -EINVAL; return 0; @@ -2823,7 +2813,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(vmcs12->host_ss_selector == 0 && !ia32e)) return -EINVAL; -#ifdef CONFIG_X86_64 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || @@ -2831,7 +2820,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) return -EINVAL; -#endif /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the @@ -2899,6 +2887,10 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && + CC(!kvm_dr7_valid(vmcs12->guest_dr7))) + return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) return -EINVAL; @@ -3048,9 +3040,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12); - static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -3183,7 +3172,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, u32 exit_qual; evaluate_pending_interrupts = exec_controls_get(vmx) & - (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); + (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); @@ -3230,7 +3219,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, } enter_guest_mode(vcpu); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset += vmcs12->tsc_offset; if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) @@ -3294,7 +3283,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * 26.7 "VM-entry failures during or after loading guest state". */ vmentry_fail_vmexit_guest_mode: - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); @@ -3407,8 +3396,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && - !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) && - !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) && + !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) && + !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; return kvm_vcpu_halt(vcpu); @@ -3427,7 +3416,7 @@ vmentry_failed: /* * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date - * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). + * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). * This function returns the new value we should put in vmcs12.guest_cr0. * It's not enough to just return the vmcs02 GUEST_CR0. Rather, * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now @@ -3999,8 +3988,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl); + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl)); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ @@ -4209,7 +4198,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (nested_cpu_has_preemption_timer(vmcs12)) hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; if (likely(!vmx->fail)) { @@ -4751,36 +4740,32 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) static int handle_vmread(struct kvm_vcpu *vcpu) { - unsigned long field; - u64 field_value; + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) + : get_vmcs12(vcpu); unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - int len; - gva_t gva = 0; - struct vmcs12 *vmcs12; + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; + unsigned long field; + u64 value; + gva_t gva = 0; short offset; + int len; if (!nested_vmx_check_permission(vcpu)) return 1; - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) + /* + * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * any VMREAD sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == -1ull || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) return nested_vmx_failInvalid(vcpu); - if (!is_guest_mode(vcpu)) - vmcs12 = get_vmcs12(vcpu); - else { - /* - * When vmcs->vmcs_link_pointer is -1ull, any VMREAD - * to shadowed-field sets the ALU flags for VMfailInvalid. - */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) - return nested_vmx_failInvalid(vcpu); - vmcs12 = get_shadow_vmcs12(vcpu); - } - /* Decode instruction info and find the field to read */ - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); offset = vmcs_field_to_offset(field); if (offset < 0) @@ -4790,25 +4775,26 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - /* Read the field, zero-extended to a u64 field_value */ - field_value = vmcs12_read_any(vmcs12, field, offset); + /* Read the field, zero-extended to a u64 value */ + value = vmcs12_read_any(vmcs12, field, offset); /* * Now copy part of this value to register or memory, as requested. * Note that the number of bits actually copied is 32 or 64 depending * on the guest's mode (32 or 64 bit), not on the given field's length. */ - if (vmx_instruction_info & (1u << 10)) { - kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), - field_value); + if (instr_info & BIT(10)) { + kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value); } else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, true, len, &gva)) + instr_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) + if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) { kvm_inject_page_fault(vcpu, &e); + return 1; + } } return nested_vmx_succeed(vcpu); @@ -4840,46 +4826,58 @@ static bool is_shadow_field_ro(unsigned long field) static int handle_vmwrite(struct kvm_vcpu *vcpu) { + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) + : get_vmcs12(vcpu); + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct x86_exception e; unsigned long field; - int len; + short offset; gva_t gva; - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + int len; - /* The value to write might be 32 or 64 bits, depending on L1's long + /* + * The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 - * bit (field_value), and then copies only the appropriate number of + * bit (value), and then copies only the appropriate number of * bits into the vmcs12 field. */ - u64 field_value = 0; - struct x86_exception e; - struct vmcs12 *vmcs12; - short offset; + u64 value = 0; if (!nested_vmx_check_permission(vcpu)) return 1; - if (vmx->nested.current_vmptr == -1ull) + /* + * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * any VMWRITE sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == -1ull || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) return nested_vmx_failInvalid(vcpu); - if (vmx_instruction_info & (1u << 10)) - field_value = kvm_register_readl(vcpu, - (((vmx_instruction_info) >> 3) & 0xf)); + if (instr_info & BIT(10)) + value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf)); else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, false, len, &gva)) + instr_info, false, len, &gva)) return 1; - if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { + if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } } + field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); + + offset = vmcs_field_to_offset(field); + if (offset < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* * If the vCPU supports "VMWRITE to any supported field in the * VMCS," then the "read-only" fields are actually read/write. @@ -4889,29 +4887,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - if (!is_guest_mode(vcpu)) { - vmcs12 = get_vmcs12(vcpu); - - /* - * Ensure vmcs12 is up-to-date before any VMWRITE that dirties - * vmcs12, else we may crush a field or consume a stale value. - */ - if (!is_shadow_field_rw(field)) - copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - } else { - /* - * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE - * to shadowed-field sets the ALU flags for VMfailInvalid. - */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) - return nested_vmx_failInvalid(vcpu); - vmcs12 = get_shadow_vmcs12(vcpu); - } - - offset = vmcs_field_to_offset(field); - if (offset < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); + /* + * Ensure vmcs12 is up-to-date before any VMWRITE that dirties + * vmcs12, else we may crush a field or consume a stale value. + */ + if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); /* * Some Intel CPUs intentionally drop the reserved bits of the AR byte @@ -4922,9 +4903,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) * the stripped down value, L2 sees the full value as stored by KVM). */ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) - field_value &= 0x1f0ff; + value &= 0x1f0ff; - vmcs12_write_any(vmcs12, field, offset, field_value); + vmcs12_write_any(vmcs12, field, offset, value); /* * Do not track vmcs12 dirty-state if in guest-mode as we actually @@ -4941,7 +4922,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) preempt_disable(); vmcs_load(vmx->vmcs01.shadow_vmcs); - __vmcs_writel(field, field_value); + __vmcs_writel(field, value); vmcs_clear(vmx->vmcs01.shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); @@ -5524,10 +5505,10 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return false; case EXIT_REASON_TRIPLE_FAULT: return true; - case EXIT_REASON_PENDING_INTERRUPT: - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); + case EXIT_REASON_INTERRUPT_WINDOW: + return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); case EXIT_REASON_NMI_WINDOW: - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); + return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: @@ -6015,8 +5996,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | - CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_INTR_WINDOW_EXITING | + CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 7023138b1cb0..34a3a17bb6d7 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -86,10 +86,14 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu, static unsigned intel_find_fixed_event(int idx) { - if (idx >= ARRAY_SIZE(fixed_pmc_events)) + u32 event; + size_t size = ARRAY_SIZE(fixed_pmc_events); + + if (idx >= size) return PERF_COUNT_HW_MAX; - return intel_arch_events[fixed_pmc_events[idx]].event_type; + event = fixed_pmc_events[array_index_nospec(idx, size)]; + return intel_arch_events[event].event_type; } /* check if a PMC is enabled by comparing it with globl_ctrl bits. */ @@ -130,16 +134,20 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); struct kvm_pmc *counters; + unsigned int num_counters; idx &= ~(3u << 30); - if (!fixed && idx >= pmu->nr_arch_gp_counters) - return NULL; - if (fixed && idx >= pmu->nr_arch_fixed_counters) + if (fixed) { + counters = pmu->fixed_counters; + num_counters = pmu->nr_arch_fixed_counters; + } else { + counters = pmu->gp_counters; + num_counters = pmu->nr_arch_gp_counters; + } + if (idx >= num_counters) return NULL; - counters = fixed ? pmu->fixed_counters : pmu->gp_counters; *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP]; - - return &counters[idx]; + return &counters[array_index_nospec(idx, num_counters)]; } static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index eb1ecd16fd22..cad128d1657b 100644 --- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -23,12 +23,12 @@ BUILD_BUG_ON(1) * * When adding or removing fields here, note that shadowed * fields must always be synced by prepare_vmcs02, not just - * prepare_vmcs02_full. + * prepare_vmcs02_rare. */ /* * Keeping the fields ordered by size is an attempt at improving - * branch prediction in vmcs_read_any and vmcs_write_any. + * branch prediction in vmcs12_read_any and vmcs12_write_any. */ /* 16-bits */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cdb4bf50ee14..c475fa2aaae0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1057,6 +1057,12 @@ static unsigned long segment_base(u16 selector) } #endif +static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) +{ + return (pt_mode == PT_MODE_HOST_GUEST) && + !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); +} + static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; @@ -1716,7 +1722,7 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) return vcpu->arch.tsc_offset - vmcs12->tsc_offset; return vcpu->arch.tsc_offset; @@ -1734,7 +1740,7 @@ static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) * to the newly set TSC to get L2's TSC. */ if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) g_tsc_offset = vmcs12->tsc_offset; trace_kvm_write_tsc_offset(vcpu->vcpu_id, @@ -1773,8 +1779,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) default: return 1; } - - return 0; } /* @@ -1916,7 +1920,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } /* - * Writes msr value into into the appropriate "register". + * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ @@ -1994,12 +1998,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; - /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) return 1; vmx->spec_ctrl = data; - if (!data) break; @@ -2010,7 +2012,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * * For nested: * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_merge_msr_bitmap. We should not touch the + * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. @@ -2033,7 +2035,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~PRED_CMD_IBPB) return 1; - + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL)) + return 1; if (!data) break; @@ -2046,7 +2049,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * * For nested: * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_merge_msr_bitmap. We should not touch the + * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. */ @@ -2104,47 +2107,50 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pt_update_intercept_for_msr(vmx); break; case MSR_IA32_RTIT_STATUS: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (data & MSR_IA32_RTIT_STATUS_MASK)) + if (!pt_can_write_msr(vmx)) + return 1; + if (data & MSR_IA32_RTIT_STATUS_MASK) return 1; vmx->pt_desc.guest.status = data; break; case MSR_IA32_RTIT_CR3_MATCH: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - !intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_cr3_filtering)) + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cr3_filtering)) return 1; vmx->pt_desc.guest.cr3_match = data; break; case MSR_IA32_RTIT_OUTPUT_BASE: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (!intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_topa_output) && - !intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_single_range_output)) || - (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)) + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) + return 1; + if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK) return 1; vmx->pt_desc.guest.output_base = data; break; case MSR_IA32_RTIT_OUTPUT_MASK: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (!intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_topa_output) && - !intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_single_range_output))) + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) return 1; vmx->pt_desc.guest.output_mask = data; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!pt_can_write_msr(vmx)) + return 1; index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_num_address_ranges))) + if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_num_address_ranges)) + return 1; + if (is_noncanonical_address(data, vcpu)) return 1; if (index % 2) vmx->pt_desc.guest.addr_b[index / 2] = data; @@ -2322,7 +2328,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, CPU_BASED_CR3_STORE_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_USE_TSC_OFFSETTING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | @@ -2657,8 +2663,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 0; - vmx_segment_cache_clear(vmx); - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); @@ -3444,7 +3448,7 @@ out: static int init_rmode_identity_map(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); - int i, idx, r = 0; + int i, r = 0; kvm_pfn_t identity_map_pfn; u32 tmp; @@ -3452,7 +3456,7 @@ static int init_rmode_identity_map(struct kvm *kvm) mutex_lock(&kvm->slots_lock); if (likely(kvm_vmx->ept_identity_pagetable_done)) - goto out2; + goto out; if (!kvm_vmx->ept_identity_map_addr) kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; @@ -3461,9 +3465,8 @@ static int init_rmode_identity_map(struct kvm *kvm) r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, kvm_vmx->ept_identity_map_addr, PAGE_SIZE); if (r < 0) - goto out2; + goto out; - idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -3479,9 +3482,6 @@ static int init_rmode_identity_map(struct kvm *kvm) kvm_vmx->ept_identity_pagetable_done = true; out: - srcu_read_unlock(&kvm->srcu, idx); - -out2: mutex_unlock(&kvm->slots_lock); return r; } @@ -4009,6 +4009,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (vmx_xsaves_supported()) { /* Exposing XSAVES only when XSAVE is exposed */ bool xsaves_enabled = + boot_cpu_has(X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); @@ -4319,7 +4320,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static void enable_irq_window(struct kvm_vcpu *vcpu) { - exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -4330,7 +4331,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) return; } - exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -4455,8 +4456,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (enable_unrestricted_guest) return 0; - ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, - PAGE_SIZE * 3); + mutex_lock(&kvm->slots_lock); + ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, + PAGE_SIZE * 3); + mutex_unlock(&kvm->slots_lock); + if (ret) return ret; to_kvm_vmx(kvm)->tss_addr = addr; @@ -4938,7 +4942,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) static int handle_interrupt_window(struct kvm_vcpu *vcpu) { - exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5151,7 +5155,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(!enable_vnmi); - exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5172,7 +5176,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); intr_window_requested = exec_controls_get(vmx) & - CPU_BASED_VIRTUAL_INTR_PENDING; + CPU_BASED_INTR_WINDOW_EXITING; while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) @@ -5496,7 +5500,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_CPUID] = kvm_emulate_cpuid, [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, - [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, [EXIT_REASON_HLT] = kvm_emulate_halt, [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, @@ -5783,7 +5787,8 @@ void dump_vmcs(void) * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; @@ -5869,34 +5874,44 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } } - if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) { + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { + kvm_skip_emulated_instruction(vcpu); + return 1; + } + + if (exit_reason >= kvm_vmx_max_exit_handlers) + goto unexpected_vmexit; #ifdef CONFIG_RETPOLINE - if (exit_reason == EXIT_REASON_MSR_WRITE) - return kvm_emulate_wrmsr(vcpu); - else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) - return handle_preemption_timer(vcpu); - else if (exit_reason == EXIT_REASON_PENDING_INTERRUPT) - return handle_interrupt_window(vcpu); - else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) - return handle_external_interrupt(vcpu); - else if (exit_reason == EXIT_REASON_HLT) - return kvm_emulate_halt(vcpu); - else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) - return handle_ept_misconfig(vcpu); + if (exit_reason == EXIT_REASON_MSR_WRITE) + return kvm_emulate_wrmsr(vcpu); + else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) + return handle_preemption_timer(vcpu); + else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW) + return handle_interrupt_window(vcpu); + else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + return handle_external_interrupt(vcpu); + else if (exit_reason == EXIT_REASON_HLT) + return kvm_emulate_halt(vcpu); + else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) + return handle_ept_misconfig(vcpu); #endif - return kvm_vmx_exit_handlers[exit_reason](vcpu); - } else { - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", - exit_reason); - dump_vmcs(); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = + + exit_reason = array_index_nospec(exit_reason, + kvm_vmx_max_exit_handlers); + if (!kvm_vmx_exit_handlers[exit_reason]) + goto unexpected_vmexit; + + return kvm_vmx_exit_handlers[exit_reason](vcpu); + +unexpected_vmexit: + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); + dump_vmcs(); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; - vcpu->run->internal.data[0] = exit_reason; - return 0; - } + vcpu->run->internal.ndata = 1; + vcpu->run->internal.data[0] = exit_reason; + return 0; } /* @@ -6217,7 +6232,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion *exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6225,6 +6241,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); + else if (!is_guest_mode(vcpu) && + vmx->exit_reason == EXIT_REASON_MSR_WRITE) + *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static bool vmx_has_emulated_msr(int index) @@ -6633,60 +6652,31 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_vpid(vmx->vpid); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); - kmem_cache_free(kvm_vcpu_cache, vmx); } -static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) +static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { - int err; struct vcpu_vmx *vmx; unsigned long *msr_bitmap; - int i, cpu; - - BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, - "struct kvm_vcpu must be at offset 0 for arch usercopy region"); - - vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); - if (!vmx) - return ERR_PTR(-ENOMEM); + int i, cpu, err; - vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vmx->vcpu.arch.user_fpu) { - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); - err = -ENOMEM; - goto free_partial_vcpu; - } + BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); + vmx = to_vmx(vcpu); - vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vmx->vcpu.arch.guest_fpu) { - printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); - err = -ENOMEM; - goto free_user_fpu; - } + err = -ENOMEM; vmx->vpid = allocate_vpid(); - err = kvm_vcpu_init(&vmx->vcpu, kvm, id); - if (err) - goto free_vcpu; - - err = -ENOMEM; - /* * If PML is turned on, failure on enabling PML just results in failure * of creating the vcpu, therefore we can simplify PML logic (by * avoiding dealing with cases, such as enabling PML partially on vcpus - * for the guest, etc. + * for the guest), etc. */ if (enable_pml) { vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmx->pml_pg) - goto uninit_vcpu; + goto free_vpid; } BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS); @@ -6731,7 +6721,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); - if (kvm_cstate_in_guest(kvm)) { + if (kvm_cstate_in_guest(vcpu->kvm)) { vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); @@ -6741,19 +6731,19 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); - vmx_vcpu_load(&vmx->vcpu, cpu); - vmx->vcpu.cpu = cpu; + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; init_vmcs(vmx); - vmx_vcpu_put(&vmx->vcpu); + vmx_vcpu_put(vcpu); put_cpu(); - if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { - err = alloc_apic_access_page(kvm); + if (cpu_need_virtualize_apic_accesses(vcpu)) { + err = alloc_apic_access_page(vcpu->kvm); if (err) goto free_vmcs; } if (enable_ept && !enable_unrestricted_guest) { - err = init_rmode_identity_map(kvm); + err = init_rmode_identity_map(vcpu->kvm); if (err) goto free_vmcs; } @@ -6761,7 +6751,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (nested) nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, vmx_capability.ept, - kvm_vcpu_apicv_active(&vmx->vcpu)); + kvm_vcpu_apicv_active(vcpu)); else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); @@ -6779,22 +6769,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->ept_pointer = INVALID_PAGE; - return &vmx->vcpu; + return 0; free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); free_pml: vmx_destroy_pml_buffer(vmx); -uninit_vcpu: - kvm_vcpu_uninit(&vmx->vcpu); -free_vcpu: +free_vpid: free_vpid(vmx->vpid); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); -free_partial_vcpu: - kmem_cache_free(kvm_vcpu_cache, vmx); - return ERR_PTR(err); + return err; } #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" @@ -6946,28 +6929,28 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) } while (0) entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); - cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); - cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); - cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); - cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); - cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); - cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); - cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); - cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); - cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); - cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); - cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); - cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); - cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); - cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); - cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); - cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); - cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); - cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57)); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); + cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); + cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); #undef cr4_fixed1_update } @@ -7101,7 +7084,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { if (func == 1 && nested) - entry->ecx |= bit(X86_FEATURE_VMX); + entry->ecx |= feature_bit(VMX); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -7843,6 +7826,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, .pt_supported = vmx_pt_supported, + .pku_supported = vmx_pku_supported, .request_immediate_exit = vmx_request_immediate_exit, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 740d3ee42455..2d3be7f3ad67 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -93,6 +93,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif +static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; + #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ #define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ @@ -879,30 +881,44 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - if (cr4 & CR4_RESERVED_BITS) - return -EINVAL; +#define __cr4_reserved_bits(__cpu_has, __c) \ +({ \ + u64 __reserved_bits = CR4_RESERVED_BITS; \ + \ + if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \ + __reserved_bits |= X86_CR4_OSXSAVE; \ + if (!__cpu_has(__c, X86_FEATURE_SMEP)) \ + __reserved_bits |= X86_CR4_SMEP; \ + if (!__cpu_has(__c, X86_FEATURE_SMAP)) \ + __reserved_bits |= X86_CR4_SMAP; \ + if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \ + __reserved_bits |= X86_CR4_FSGSBASE; \ + if (!__cpu_has(__c, X86_FEATURE_PKU)) \ + __reserved_bits |= X86_CR4_PKE; \ + if (!__cpu_has(__c, X86_FEATURE_LA57)) \ + __reserved_bits |= X86_CR4_LA57; \ + __reserved_bits; \ +}) - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return -EINVAL; +static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) +{ + u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return -EINVAL; + if (cpuid_ecx(0x7) & feature_bit(LA57)) + reserved_bits &= ~X86_CR4_LA57; - if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return -EINVAL; + if (kvm_x86_ops->umip_emulated()) + reserved_bits &= ~X86_CR4_UMIP; - if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return -EINVAL; + return reserved_bits; +} - if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) +static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (cr4 & cr4_reserved_bits) return -EINVAL; - if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu)) return -EINVAL; return 0; @@ -1047,9 +1063,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { + size_t size = ARRAY_SIZE(vcpu->arch.db); + switch (dr) { case 0 ... 3: - vcpu->arch.db[dr] = val; + vcpu->arch.db[array_index_nospec(dr, size)] = val; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; @@ -1064,7 +1082,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 5: /* fall through */ default: /* 7 */ - if (val & 0xffffffff00000000ULL) + if (!kvm_dr7_valid(val)) return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; kvm_update_dr7(vcpu); @@ -1086,9 +1104,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr); int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { + size_t size = ARRAY_SIZE(vcpu->arch.db); + switch (dr) { case 0 ... 3: - *val = vcpu->arch.db[dr]; + *val = vcpu->arch.db[array_index_nospec(dr, size)]; break; case 4: /* fall through */ @@ -1212,6 +1232,7 @@ static const u32 emulated_msrs_all[] = { MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, MSR_IA32_POWER_CTL, + MSR_IA32_UCODE_REV, /* * The following list leaves out MSRs whose values are determined @@ -1526,6 +1547,49 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); /* + * The fast path for frequent and performance sensitive wrmsr emulation, + * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces + * the latency of virtual IPI by avoiding the expensive bits of transitioning + * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the + * other cases which must be called after interrupts are enabled on the host. + */ +static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) +{ + if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) && + ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && + ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) { + + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data); + } + + return 1; +} + +enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +{ + u32 msr = kvm_rcx_read(vcpu); + u64 data = kvm_read_edx_eax(vcpu); + int ret = 0; + + switch (msr) { + case APIC_BASE_MSR + (APIC_ICR >> 4): + ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + break; + default: + return EXIT_FASTPATH_NONE; + } + + if (!ret) { + trace_kvm_msr_write(msr, data); + return EXIT_FASTPATH_SKIP_EMUL_INS; + } + + return EXIT_FASTPATH_NONE; +} +EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); + +/* * Adapt set_msr() to msr_io()'s calling convention */ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) @@ -2485,7 +2549,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; + u32 offset = array_index_nospec( + msr - MSR_IA32_MC0_CTL, + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to * workaround a BIOS/GART TBL issue on AMD K8s, ignore @@ -2581,45 +2648,47 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) static void record_steal_time(struct kvm_vcpu *vcpu) { + struct kvm_host_map map; + struct kvm_steal_time *st; + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) + /* -EAGAIN is returned in atomic context so we can just return. */ + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, + &map, &vcpu->arch.st.cache, false)) return; + st = map.hva + + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) + st->preempted & KVM_VCPU_FLUSH_TLB); + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb(vcpu, false); - if (vcpu->arch.st.steal.version & 1) - vcpu->arch.st.steal.version += 1; /* first time write, random junk */ + vcpu->arch.st.preempted = 0; - vcpu->arch.st.steal.version += 1; + if (st->version & 1) + st->version += 1; /* first time write, random junk */ - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + st->version += 1; smp_wmb(); - vcpu->arch.st.steal.steal += current->sched_info.run_delay - + st->steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); - smp_wmb(); - vcpu->arch.st.steal.version += 1; + st->version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -2786,11 +2855,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS, - sizeof(struct kvm_steal_time))) - return 1; - vcpu->arch.st.msr_val = data; if (!(data & KVM_MSR_ENABLED)) @@ -2926,7 +2990,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; + u32 offset = array_index_nospec( + msr - MSR_IA32_MC0_CTL, + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; break; } @@ -3458,10 +3525,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); - fpregs_assert_state_consistent(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_return(); - /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); @@ -3501,15 +3564,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { + struct kvm_host_map map; + struct kvm_steal_time *st; + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; + if (vcpu->arch.st.preempted) + return; + + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, + &vcpu->arch.st.cache, true)) + return; + + st = map.hva + + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); - kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal.preempted, - offsetof(struct kvm_steal_time, preempted), - sizeof(vcpu->arch.st.steal.preempted)); + st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + + kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -4660,9 +4733,6 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, { struct kvm_pit *pit = kvm->arch.vpit; - if (!pit) - return -ENXIO; - /* pit->pit_state.lock was overloaded to prevent userspace from getting * an inconsistent state after running multiple KVM_REINJECT_CONTROL * ioctls in parallel. Use a separate lock if that ioctl isn't rare. @@ -5029,6 +5099,9 @@ set_identity_unlock: r = -EFAULT; if (copy_from_user(&control, argp, sizeof(control))) goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; r = kvm_vm_ioctl_reinject(kvm, &control); break; } @@ -6186,6 +6259,21 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); } +static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM); +} + +static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); +} + +static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); +} + static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) { return kvm_register_read(emul_to_vcpu(ctxt), reg); @@ -6263,6 +6351,9 @@ static const struct x86_emulate_ops emulate_ops = { .fix_hypercall = emulator_fix_hypercall, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, + .guest_has_long_mode = emulator_guest_has_long_mode, + .guest_has_movbe = emulator_guest_has_movbe, + .guest_has_fxsr = emulator_guest_has_fxsr, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, @@ -6379,11 +6470,11 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return 1; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool write_fault_to_shadow_pgtable, int emulation_type) { - gpa_t gpa = cr2; + gpa_t gpa = cr2_or_gpa; kvm_pfn_t pfn; if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) @@ -6397,7 +6488,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, * Write permission should be allowed since only * write access need to be emulated. */ - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); /* * If the mapping is invalid in guest, let cpu retry @@ -6454,10 +6545,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - unsigned long cr2, int emulation_type) + gpa_t cr2_or_gpa, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2; + unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; last_retry_eip = vcpu->arch.last_retry_eip; last_retry_addr = vcpu->arch.last_retry_addr; @@ -6486,14 +6577,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (x86_page_table_writing_insn(ctxt)) return false; - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) + if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) return false; vcpu->arch.last_retry_eip = ctxt->eip; - vcpu->arch.last_retry_addr = cr2; + vcpu->arch.last_retry_addr = cr2_or_gpa; if (!vcpu->arch.mmu->direct_map) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -6639,11 +6730,8 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } -int x86_emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, - int emulation_type, - void *insn, - int insn_len) +int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + int emulation_type, void *insn, int insn_len) { int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; @@ -6689,8 +6777,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, - emulation_type)) + if (reexecute_instruction(vcpu, cr2_or_gpa, + write_fault_to_spt, + emulation_type)) return 1; if (ctxt->have_exception) { /* @@ -6724,7 +6813,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return 1; } - if (retry_instruction(ctxt, cr2, emulation_type)) + if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) return 1; /* this is needed for vmware backdoor interface to work since it @@ -6736,7 +6825,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, restart: /* Save the faulting GPA (cr2) in the address field */ - ctxt->exception.address = cr2; + ctxt->exception.address = cr2_or_gpa; r = x86_emulate_insn(ctxt); @@ -6744,7 +6833,7 @@ restart: return 1; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, + if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, emulation_type)) return 1; @@ -7357,8 +7446,8 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) { struct kvm_lapic_irq lapic_irq; - lapic_irq.shorthand = 0; - lapic_irq.dest_mode = 0; + lapic_irq.shorthand = APIC_DEST_NOSHORT; + lapic_irq.dest_mode = APIC_DEST_PHYSICAL; lapic_irq.level = 0; lapic_irq.dest_id = apicid; lapic_irq.msi_redir_hint = false; @@ -7997,6 +8086,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); + enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE; bool req_immediate_exit = false; @@ -8198,8 +8288,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) trace_kvm_entry(vcpu->vcpu_id); guest_enter_irqoff(); - /* The preempt notifier should have taken care of the FPU already. */ - WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -8243,7 +8334,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops->handle_exit_irqoff(vcpu); + kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_fastpath); /* * Consume any pending interrupts, including the possible source of @@ -8287,7 +8378,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_from_vapic(vcpu); vcpu->arch.gpa_available = false; - r = kvm_x86_ops->handle_exit(vcpu); + r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath); return r; cancel_injection: @@ -8471,12 +8562,26 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } +static void kvm_save_current_fpu(struct fpu *fpu) +{ + /* + * If the target FPU state is not resident in the CPU registers, just + * memcpy() from current, else save CPU state directly to the target. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&fpu->state, ¤t->thread.fpu.state, + fpu_kernel_xstate_size); + else + copy_fpregs_to_fpstate(fpu); +} + /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(vcpu->arch.user_fpu); + kvm_save_current_fpu(vcpu->arch.user_fpu); + /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); @@ -8492,7 +8597,8 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); + kvm_save_current_fpu(vcpu->arch.guest_fpu); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); fpregs_mark_activate(); @@ -8714,6 +8820,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { vcpu_load(vcpu); + if (kvm_mpx_supported()) + kvm_load_guest_fpu(vcpu); kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && @@ -8722,6 +8830,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, else mp_state->mp_state = vcpu->arch.mp_state; + if (kvm_mpx_supported()) + kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); return 0; } @@ -9082,33 +9192,90 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; - - kvmclock_reset(vcpu); + if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + pr_warn_once("kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); - kvm_x86_ops->vcpu_free(vcpu); - free_cpumask_var(wbinvd_dirty_mask); + return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu; + struct page *page; + int r; - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) - printk_once(KERN_WARNING - "kvm: SMP vm created on host with unstable TSC; " - "guest TSC will not be reliable\n"); + vcpu->arch.emulate_ctxt.ops = &emulate_ops; + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + else + vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - vcpu = kvm_x86_ops->vcpu_create(kvm, id); + kvm_set_tsc_khz(vcpu, max_tsc_khz); - return vcpu; -} + r = kvm_mmu_create(vcpu); + if (r < 0) + return r; + + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); + if (r < 0) + goto fail_mmu_destroy; + } else + static_key_slow_inc(&kvm_no_apic_vcpu); + + r = -ENOMEM; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto fail_free_lapic; + vcpu->arch.pio_data = page_address(page); + + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks) + goto fail_free_pio_data; + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, + GFP_KERNEL_ACCOUNT)) + goto fail_free_mce_banks; + + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.user_fpu) { + pr_err("kvm: failed to allocate userspace's fpu\n"); + goto free_wbinvd_dirty_mask; + } + + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.guest_fpu) { + pr_err("kvm: failed to allocate vcpu's fpu\n"); + goto free_user_fpu; + } + fx_init(vcpu); + + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + + vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + + kvm_async_pf_hash_reset(vcpu); + kvm_pmu_init(vcpu); + + vcpu->arch.pending_external_vector = -1; + vcpu->arch.preempted_in_kernel = false; + + kvm_hv_vcpu_init(vcpu); + + r = kvm_x86_ops->vcpu_create(vcpu); + if (r) + goto free_guest_fpu; -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_vcpu_mtrr_init(vcpu); @@ -9117,6 +9284,22 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvm_init_mmu(vcpu, false); vcpu_put(vcpu); return 0; + +free_guest_fpu: + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); +free_user_fpu: + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); +free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); +fail_free_mce_banks: + kfree(vcpu->arch.mce_banks); +fail_free_pio_data: + free_page((unsigned long)vcpu->arch.pio_data); +fail_free_lapic: + kvm_free_lapic(vcpu); +fail_mmu_destroy: + kvm_mmu_destroy(vcpu); + return r; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -9149,13 +9332,29 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - vcpu->arch.apf.msr_val = 0; + struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; + int idx; - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); + kvm_release_pfn(cache->pfn, cache->dirty, cache); + + kvmclock_reset(vcpu); kvm_x86_ops->vcpu_free(vcpu); + + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + + kvm_hv_vcpu_uninit(vcpu); + kvm_pmu_destroy(vcpu); + kfree(vcpu->arch.mce_banks); + kvm_free_lapic(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_mmu_destroy(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + free_page((unsigned long)vcpu->arch.pio_data); + if (!lapic_in_kernel(vcpu)) + static_key_slow_dec(&kvm_no_apic_vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9171,7 +9370,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.nmi_injected = false; kvm_clear_interrupt_queue(vcpu); kvm_clear_exception_queue(vcpu); - vcpu->arch.exception.pending = false; memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); @@ -9347,6 +9545,8 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); + if (kvm_has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that @@ -9375,6 +9575,13 @@ void kvm_arch_hardware_unsetup(void) int kvm_arch_check_processor_compat(void) { + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + WARN_ON(!irqs_disabled()); + + if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits) + return -EIO; + return kvm_x86_ops->check_processor_compatibility(); } @@ -9392,98 +9599,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) struct static_key kvm_no_apic_vcpu __read_mostly; EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct page *page; - int r; - - vcpu->arch.emulate_ctxt.ops = &emulate_ops; - if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->arch.pio_data = page_address(page); - - kvm_set_tsc_khz(vcpu, max_tsc_khz); - - r = kvm_mmu_create(vcpu); - if (r < 0) - goto fail_free_pio_data; - - if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); - r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); - if (r < 0) - goto fail_mmu_destroy; - } else - static_key_slow_inc(&kvm_no_apic_vcpu); - - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) { - r = -ENOMEM; - goto fail_free_lapic; - } - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, - GFP_KERNEL_ACCOUNT)) { - r = -ENOMEM; - goto fail_free_mce_banks; - } - - fx_init(vcpu); - - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - - vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - - kvm_async_pf_hash_reset(vcpu); - kvm_pmu_init(vcpu); - - vcpu->arch.pending_external_vector = -1; - vcpu->arch.preempted_in_kernel = false; - - kvm_hv_vcpu_init(vcpu); - - return 0; - -fail_free_mce_banks: - kfree(vcpu->arch.mce_banks); -fail_free_lapic: - kvm_free_lapic(vcpu); -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); -fail: - return r; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - int idx; - - kvm_hv_vcpu_uninit(vcpu); - kvm_pmu_destroy(vcpu); - kfree(vcpu->arch.mce_banks); - kvm_free_lapic(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_mmu_destroy(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - free_page((unsigned long)vcpu->arch.pio_data); - if (!lapic_in_kernel(vcpu)) - static_key_slow_dec(&kvm_no_apic_vcpu); -} - void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -9558,7 +9673,7 @@ static void kvm_free_vcpus(struct kvm *kvm) kvm_unload_vcpu_mmu(vcpu); } kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_free(vcpu); + kvm_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) @@ -9627,18 +9742,6 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } EXPORT_SYMBOL_GPL(__x86_set_memory_region); -int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) -{ - int r; - - mutex_lock(&kvm->slots_lock); - r = __x86_set_memory_region(kvm, id, gpa, size); - mutex_unlock(&kvm->slots_lock); - - return r; -} -EXPORT_SYMBOL_GPL(x86_set_memory_region); - void kvm_arch_pre_destroy_vm(struct kvm *kvm) { kvm_mmu_pre_destroy_vm(kvm); @@ -9652,9 +9755,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm) * unless the the memory map has changed due to process exit * or fd copying. */ - x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); - x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); - x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); + mutex_lock(&kvm->slots_lock); + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + 0, 0); + __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + 0, 0); + __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); + mutex_unlock(&kvm->slots_lock); } if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); @@ -9758,11 +9865,18 @@ out_free: void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { + struct kvm_vcpu *vcpu; + int i; + /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ kvm_mmu_invalidate_mmio_sptes(kvm, gen); + + /* Force re-initialization of steal_time cache */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -9792,7 +9906,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * * The reason is, in case of PML, we need to set D-bit for any slots * with dirty logging disabled in order to eliminate unnecessary GPA - * logging in PML buffer (and potential PML buffer full VMEXT). This + * logging in PML buffer (and potential PML buffer full VMEXIT). This * guarantees leaving PML enabled during guest's lifetime won't have * any additional overhead from PML when guest is running with dirty * logging disabled for memory slots. @@ -10014,7 +10128,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) return; - vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true); + vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true); } static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) @@ -10127,7 +10241,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, { struct x86_exception fault; - trace_kvm_async_pf_not_present(work->arch.token, work->gva); + trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa); kvm_add_async_pf_gfn(vcpu, work->arch.gfn); if (kvm_can_deliver_async_pf(vcpu) && @@ -10162,7 +10276,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); - trace_kvm_async_pf_ready(work->arch.token, work->gva); + trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && !apf_get_user(vcpu, &val)) { @@ -10284,7 +10398,6 @@ bool kvm_vector_hashing_enabled(void) { return vector_hashing; } -EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { @@ -10292,6 +10405,28 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu) +{ + uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD; + + /* The STIBP bit doesn't fault even if it's not advertised */ + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) + bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) && + !boot_cpu_has(X86_FEATURE_AMD_IBRS)) + bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + bits &= ~SPEC_CTRL_SSBD; + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && + !boot_cpu_has(X86_FEATURE_AMD_SSBD)) + bits &= ~SPEC_CTRL_SSBD; + + return bits; +} +EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 29391af8871d..2d2ff855773b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -144,11 +144,6 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu) return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu); } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48; @@ -166,21 +161,13 @@ static inline u64 get_canonical(u64 la, u8 vaddr_bits) static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu) { -#ifdef CONFIG_X86_64 return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la; -#else - return false; -#endif } static inline bool emul_is_noncanonical_address(u64 la, struct x86_emulate_ctxt *ctxt) { -#ifdef CONFIG_X86_64 return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la; -#else - return false; -#endif } static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, @@ -289,8 +276,9 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); -int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, +int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); +enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ @@ -369,7 +357,14 @@ static inline bool kvm_pat_valid(u64 data) return (data | ((data & 0x0202020202020202ull) << 1)) == data; } +static inline bool kvm_dr7_valid(unsigned long data) +{ + /* Bits [63:32] are reserved */ + return !(data >> 32); +} + void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); +u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); #endif |