From b6416e61012429e0277bd15a229222fd17afc1c1 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 16 Dec 2016 14:30:35 -0800
Subject: jump_labels: API for flushing deferred jump label updates

Modules that use static_key_deferred need a way to synchronize with
any delayed work that is still pending when the module is unloaded.
Introduce static_key_deferred_flush() which flushes any pending
jump label updates.

Signed-off-by: David Matlack <dmatlack@google.com>
Cc: stable@vger.kernel.org
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/jump_label_ratelimit.h | 5 +++++
 kernel/jump_label.c                  | 7 +++++++
 2 files changed, 12 insertions(+)

diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h
index 089f70f83e97..23da3af459fe 100644
--- a/include/linux/jump_label_ratelimit.h
+++ b/include/linux/jump_label_ratelimit.h
@@ -14,6 +14,7 @@ struct static_key_deferred {
 
 #ifdef HAVE_JUMP_LABEL
 extern void static_key_slow_dec_deferred(struct static_key_deferred *key);
+extern void static_key_deferred_flush(struct static_key_deferred *key);
 extern void
 jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl);
 
@@ -26,6 +27,10 @@ static inline void static_key_slow_dec_deferred(struct static_key_deferred *key)
 	STATIC_KEY_CHECK_USE();
 	static_key_slow_dec(&key->key);
 }
+static inline void static_key_deferred_flush(struct static_key_deferred *key)
+{
+	STATIC_KEY_CHECK_USE();
+}
 static inline void
 jump_label_rate_limit(struct static_key_deferred *key,
 		unsigned long rl)
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 93ad6c1fb9b6..a9b8cf500591 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -182,6 +182,13 @@ void static_key_slow_dec_deferred(struct static_key_deferred *key)
 }
 EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
 
+void static_key_deferred_flush(struct static_key_deferred *key)
+{
+	STATIC_KEY_CHECK_USE();
+	flush_delayed_work(&key->work);
+}
+EXPORT_SYMBOL_GPL(static_key_deferred_flush);
+
 void jump_label_rate_limit(struct static_key_deferred *key,
 		unsigned long rl)
 {
-- 
cgit v1.2.3


From cef84c302fe051744b983a92764d3fcca933415d Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 16 Dec 2016 14:30:36 -0800
Subject: KVM: x86: flush pending lapic jump label updates on module unload

KVM's lapic emulation uses static_key_deferred (apic_{hw,sw}_disabled).
These are implemented with delayed_work structs which can still be
pending when the KVM module is unloaded. We've seen this cause kernel
panics when the kvm_intel module is quickly reloaded.

Use the new static_key_deferred_flush() API to flush pending updates on
module unload.

Signed-off-by: David Matlack <dmatlack@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 6 ++++++
 arch/x86/kvm/lapic.h | 1 +
 arch/x86/kvm/x86.c   | 1 +
 3 files changed, 8 insertions(+)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 5fe290c1b7d8..2f6ef5121a4c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2426,3 +2426,9 @@ void kvm_lapic_init(void)
 	jump_label_rate_limit(&apic_hw_disabled, HZ);
 	jump_label_rate_limit(&apic_sw_disabled, HZ);
 }
+
+void kvm_lapic_exit(void)
+{
+	static_key_deferred_flush(&apic_hw_disabled);
+	static_key_deferred_flush(&apic_sw_disabled);
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e0c80233b3e1..ff8039d61672 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -110,6 +110,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
 
 int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
 void kvm_lapic_init(void);
+void kvm_lapic_exit(void);
 
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2f22810a7e0c..a0ac6e0060fb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6045,6 +6045,7 @@ out:
 
 void kvm_arch_exit(void)
 {
+	kvm_lapic_exit();
 	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
 
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
-- 
cgit v1.2.3


From 129a72a0d3c8e139a04512325384fe5ac119e74d Mon Sep 17 00:00:00 2001
From: Steve Rutherford <srutherford@google.com>
Date: Wed, 11 Jan 2017 18:28:29 -0800
Subject: KVM: x86: Introduce segmented_write_std

Introduces segemented_write_std.

Switches from emulated reads/writes to standard read/writes in fxsave,
fxrstor, sgdt, and sidt.  This fixes CVE-2017-2584, a longstanding
kernel memory leak.

Since commit 283c95d0e389 ("KVM: x86: emulate FXSAVE and FXRSTOR",
2016-11-09), which is luckily not yet in any final release, this would
also be an exploitable kernel memory *write*!

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: stable@vger.kernel.org
Fixes: 96051572c819194c37a8367624b285be10297eca
Fixes: 283c95d0e3891b64087706b344a4b545d04a6e62
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Steve Rutherford <srutherford@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 56628a44668b..f36d0fa6b885 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -818,6 +818,20 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
 	return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
 }
 
+static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
+			       struct segmented_address addr,
+			       void *data,
+			       unsigned int size)
+{
+	int rc;
+	ulong linear;
+
+	rc = linearize(ctxt, addr, size, true, &linear);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+	return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception);
+}
+
 /*
  * Prefetch the remaining bytes of the instruction without crossing page
  * boundary if they are not in fetch_cache yet.
@@ -3685,8 +3699,8 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
 	}
 	/* Disable writeback. */
 	ctxt->dst.type = OP_NONE;
-	return segmented_write(ctxt, ctxt->dst.addr.mem,
-			       &desc_ptr, 2 + ctxt->op_bytes);
+	return segmented_write_std(ctxt, ctxt->dst.addr.mem,
+				   &desc_ptr, 2 + ctxt->op_bytes);
 }
 
 static int em_sgdt(struct x86_emulate_ctxt *ctxt)
@@ -3932,7 +3946,7 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
 	else
 		size = offsetof(struct fxregs_state, xmm_space[0]);
 
-	return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+	return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
 }
 
 static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
@@ -3974,7 +3988,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
+	rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-- 
cgit v1.2.3


From 4f3dbdf47e150016aacd734e663347fcaa768303 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 5 Jan 2017 17:39:42 -0800
Subject: KVM: eventfd: fix NULL deref irqbypass consumer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported syzkaller:

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
    IP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass]
    PGD 0

    Oops: 0002 [#1] SMP
    CPU: 1 PID: 125 Comm: kworker/1:1 Not tainted 4.9.0+ #1
    Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm]
    task: ffff9bbe0dfbb900 task.stack: ffffb61802014000
    RIP: 0010:irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass]
    Call Trace:
     irqfd_shutdown+0x66/0xa0 [kvm]
     process_one_work+0x16b/0x480
     worker_thread+0x4b/0x500
     kthread+0x101/0x140
     ? process_one_work+0x480/0x480
     ? kthread_create_on_node+0x60/0x60
     ret_from_fork+0x25/0x30
    RIP: irq_bypass_unregister_consumer+0x9d/0xb70 [irqbypass] RSP: ffffb61802017e20
    CR2: 0000000000000008

The syzkaller folks reported a NULL pointer dereference that due to
unregister an consumer which fails registration before. The syzkaller
creates two VMs w/ an equal eventfd occasionally. So the second VM
fails to register an irqbypass consumer. It will make irqfd as inactive
and queue an workqueue work to shutdown irqfd and unregister the irqbypass
consumer when eventfd is closed. However, the second consumer has been
initialized though it fails registration. So the token(same as the first
VM's) is taken to unregister the consumer through the workqueue, the
consumer of the first VM is found and unregistered, then NULL deref incurred
in the path of deleting consumer from the consumers list.

This patch fixes it by making irq_bypass_register/unregister_consumer()
looks for the consumer entry based on consumer pointer itself instead of
token matching.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Cc: stable@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/lib/irqbypass.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c
index 52abac4bb6a2..6d2fcd6fcb25 100644
--- a/virt/lib/irqbypass.c
+++ b/virt/lib/irqbypass.c
@@ -195,7 +195,7 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
 	mutex_lock(&lock);
 
 	list_for_each_entry(tmp, &consumers, node) {
-		if (tmp->token == consumer->token) {
+		if (tmp->token == consumer->token || tmp == consumer) {
 			mutex_unlock(&lock);
 			module_put(THIS_MODULE);
 			return -EBUSY;
@@ -245,7 +245,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
 	mutex_lock(&lock);
 
 	list_for_each_entry(tmp, &consumers, node) {
-		if (tmp->token != consumer->token)
+		if (tmp != consumer)
 			continue;
 
 		list_for_each_entry(producer, &producers, node) {
-- 
cgit v1.2.3


From 546d87e5c903a7f3ee7b9f998949a94729fbc65b Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Tue, 3 Jan 2017 18:56:19 -0800
Subject: KVM: x86: fix NULL deref in vcpu_scan_ioapic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported by syzkaller:

    BUG: unable to handle kernel NULL pointer dereference at 00000000000001b0
    IP: _raw_spin_lock+0xc/0x30
    PGD 3e28eb067
    PUD 3f0ac6067
    PMD 0
    Oops: 0002 [#1] SMP
    CPU: 0 PID: 2431 Comm: test Tainted: G           OE   4.10.0-rc1+ #3
    Call Trace:
     ? kvm_ioapic_scan_entry+0x3e/0x110 [kvm]
     kvm_arch_vcpu_ioctl_run+0x10a8/0x15f0 [kvm]
     ? pick_next_task_fair+0xe1/0x4e0
     ? kvm_arch_vcpu_load+0xea/0x260 [kvm]
     kvm_vcpu_ioctl+0x33a/0x600 [kvm]
     ? hrtimer_try_to_cancel+0x29/0x130
     ? do_nanosleep+0x97/0xf0
     do_vfs_ioctl+0xa1/0x5d0
     ? __hrtimer_init+0x90/0x90
     ? do_nanosleep+0x5b/0xf0
     SyS_ioctl+0x79/0x90
     do_syscall_64+0x6e/0x180
     entry_SYSCALL64_slow_path+0x25/0x25
    RIP: _raw_spin_lock+0xc/0x30 RSP: ffffa43688973cc0

The syzkaller folks reported a NULL pointer dereference due to
ENABLE_CAP succeeding even without an irqchip.  The Hyper-V
synthetic interrupt controller is activated, resulting in a
wrong request to rescan the ioapic and a NULL pointer dereference.

    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <sys/types.h>
    #include <linux/kvm.h>
    #include <pthread.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    #ifndef KVM_CAP_HYPERV_SYNIC
    #define KVM_CAP_HYPERV_SYNIC 123
    #endif

    void* thr(void* arg)
    {
	struct kvm_enable_cap cap;
	cap.flags = 0;
	cap.cap = KVM_CAP_HYPERV_SYNIC;
	ioctl((long)arg, KVM_ENABLE_CAP, &cap);
	return 0;
    }

    int main()
    {
	void *host_mem = mmap(0, 0x1000, PROT_READ|PROT_WRITE,
			MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	int kvmfd = open("/dev/kvm", 0);
	int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);
	struct kvm_userspace_memory_region memreg;
	memreg.slot = 0;
	memreg.flags = 0;
	memreg.guest_phys_addr = 0;
	memreg.memory_size = 0x1000;
	memreg.userspace_addr = (unsigned long)host_mem;
	host_mem[0] = 0xf4;
	ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);
	int cpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
	struct kvm_sregs sregs;
	ioctl(cpufd, KVM_GET_SREGS, &sregs);
	sregs.cr0 = 0;
	sregs.cr4 = 0;
	sregs.efer = 0;
	sregs.cs.selector = 0;
	sregs.cs.base = 0;
	ioctl(cpufd, KVM_SET_SREGS, &sregs);
	struct kvm_regs regs = { .rflags = 2 };
	ioctl(cpufd, KVM_SET_REGS, &regs);
	ioctl(vmfd, KVM_CREATE_IRQCHIP, 0);
	pthread_t th;
	pthread_create(&th, 0, thr, (void*)(long)cpufd);
	usleep(rand() % 10000);
	ioctl(cpufd, KVM_RUN, 0);
	pthread_join(th, 0);
	return 0;
    }

This patch fixes it by failing ENABLE_CAP if without an irqchip.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Fixes: 5c919412fe61 (kvm/x86: Hyper-V synthetic interrupt controller)
Cc: stable@vger.kernel.org # 4.5+
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a0ac6e0060fb..57d8a856cdc5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3342,6 +3342,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 
 	switch (cap->cap) {
 	case KVM_CAP_HYPERV_SYNIC:
+		if (!irqchip_in_kernel(vcpu->kvm))
+			return -EINVAL;
 		return kvm_hv_activate_synic(vcpu);
 	default:
 		return -EINVAL;
-- 
cgit v1.2.3


From 33ab91103b3415e12457e3104f0e4517ce12d0f3 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 12 Jan 2017 15:02:32 +0100
Subject: KVM: x86: fix emulation of "MOV SS, null selector"

This is CVE-2017-2583.  On Intel this causes a failed vmentry because
SS's type is neither 3 nor 7 (even though the manual says this check is
only done for usable SS, and the dmesg splat says that SS is unusable!).
On AMD it's worse: svm.c is confused and sets CPL to 0 in the vmcb.

The fix fabricates a data segment descriptor when SS is set to a null
selector, so that CPL and SS.DPL are set correctly in the VMCS/vmcb.
Furthermore, only allow setting SS to a NULL selector if SS.RPL < 3;
this in turn ensures CPL < 3 because RPL must be equal to CPL.

Thanks to Andy Lutomirski and Willy Tarreau for help in analyzing
the bug and deciphering the manuals.

Reported-by: Xiaohan Zhang <zhangxiaohan1@huawei.com>
Fixes: 79d5b4c3cd809c770d4bf9812635647016c56011
Cc: stable@nongnu.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c | 48 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f36d0fa6b885..cedbba0f3402 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1585,7 +1585,6 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				    &ctxt->exception);
 }
 
-/* Does not support long mode */
 static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				     u16 selector, int seg, u8 cpl,
 				     enum x86_transfer_type transfer,
@@ -1622,20 +1621,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	rpl = selector & 3;
 
-	/* NULL selector is not valid for TR, CS and SS (except for long mode) */
-	if ((seg == VCPU_SREG_CS
-	     || (seg == VCPU_SREG_SS
-		 && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
-	     || seg == VCPU_SREG_TR)
-	    && null_selector)
-		goto exception;
-
 	/* TR should be in GDT only */
 	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
 		goto exception;
 
-	if (null_selector) /* for NULL selector skip all following checks */
+	/* NULL selector is not valid for TR, CS and (except for long mode) SS */
+	if (null_selector) {
+		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR)
+			goto exception;
+
+		if (seg == VCPU_SREG_SS) {
+			if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)
+				goto exception;
+
+			/*
+			 * ctxt->ops->set_segment expects the CPL to be in
+			 * SS.DPL, so fake an expand-up 32-bit data segment.
+			 */
+			seg_desc.type = 3;
+			seg_desc.p = 1;
+			seg_desc.s = 1;
+			seg_desc.dpl = cpl;
+			seg_desc.d = 1;
+			seg_desc.g = 1;
+		}
+
+		/* Skip all following checks */
 		goto load;
+	}
 
 	ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
 	if (ret != X86EMUL_CONTINUE)
@@ -1751,6 +1764,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				   u16 selector, int seg)
 {
 	u8 cpl = ctxt->ops->cpl(ctxt);
+
+	/*
+	 * None of MOV, POP and LSS can load a NULL selector in CPL=3, but
+	 * they can load it at CPL<3 (Intel's manual says only LSS can,
+	 * but it's wrong).
+	 *
+	 * However, the Intel manual says that putting IST=1/DPL=3 in
+	 * an interrupt gate will result in SS=3 (the AMD manual instead
+	 * says it doesn't), so allow SS=3 in __load_segment_descriptor
+	 * and only forbid it here.
+	 */
+	if (seg == VCPU_SREG_SS && selector == 3 &&
+	    ctxt->mode == X86EMUL_MODE_PROT64)
+		return emulate_exception(ctxt, GP_VECTOR, 0, true);
+
 	return __load_segment_descriptor(ctxt, selector, seg, cpl,
 					 X86_TRANSFER_NONE, NULL);
 }
-- 
cgit v1.2.3