From c0bb03924f1a80e7f65900e36c8e6b3dc167c5f8 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 7 Dec 2016 01:16:24 -0800
Subject: Drivers: hv: vmbus: Raise retry/wait limits in vmbus_post_msg()

DoS protection conditions were altered in WS2016 and now it's easy to get
-EAGAIN returned from vmbus_post_msg() (e.g. when we try changing MTU on a
netvsc device in a loop). All vmbus_post_msg() callers don't retry the
operation and we usually end up with a non-functional device or crash.

While host's DoS protection conditions are unknown to me my tests show that
it can take up to 10 seconds before the message is sent so doing udelay()
is not an option, we really need to sleep. Almost all vmbus_post_msg()
callers are ready to sleep but there is one special case:
vmbus_initiate_unload() which can be called from interrupt/NMI context and
we can't sleep there. I'm also not sure about the lonely
vmbus_send_tl_connect_request() which has no in-tree users but its external
users are most likely waiting for the host to reply so sleeping there is
also appropriate.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/connection.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'drivers/hv/connection.c')

diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 6ce8b874e833..9b72ebcd37bc 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -111,7 +111,8 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo,
 	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
 
 	ret = vmbus_post_msg(msg,
-			       sizeof(struct vmbus_channel_initiate_contact));
+			     sizeof(struct vmbus_channel_initiate_contact),
+			     true);
 	if (ret != 0) {
 		spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
 		list_del(&msginfo->msglistentry);
@@ -435,7 +436,7 @@ void vmbus_on_event(unsigned long data)
 /*
  * vmbus_post_msg - Send a msg on the vmbus's message connection
  */
-int vmbus_post_msg(void *buffer, size_t buflen)
+int vmbus_post_msg(void *buffer, size_t buflen, bool can_sleep)
 {
 	union hv_connection_id conn_id;
 	int ret = 0;
@@ -450,7 +451,7 @@ int vmbus_post_msg(void *buffer, size_t buflen)
 	 * insufficient resources. Retry the operation a couple of
 	 * times before giving up.
 	 */
-	while (retries < 20) {
+	while (retries < 100) {
 		ret = hv_post_message(conn_id, 1, buffer, buflen);
 
 		switch (ret) {
@@ -473,8 +474,14 @@ int vmbus_post_msg(void *buffer, size_t buflen)
 		}
 
 		retries++;
-		udelay(usec);
-		if (usec < 2048)
+		if (can_sleep && usec > 1000)
+			msleep(usec / 1000);
+		else if (usec < MAX_UDELAY_MS * 1000)
+			udelay(usec);
+		else
+			mdelay(usec / 1000);
+
+		if (usec < 256000)
 			usec *= 2;
 	}
 	return ret;
-- 
cgit v1.2.3


From 8de8af7e0873c4fdac2205327dff922819e16657 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Thu, 19 Jan 2017 11:51:47 -0700
Subject: Drivers: hv: vmbus: Move the extracting of Hypervisor version
 information

As part of the effort to separate out architecture specific code,
extract hypervisor version information in an architecture specific
file.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/mshyperv.h | 19 ++++++++++++++++
 arch/x86/kernel/cpu/mshyperv.c  | 20 +++++++++++++++++
 drivers/hv/connection.c         |  7 ++----
 drivers/hv/hv.c                 | 49 -----------------------------------------
 drivers/hv/hyperv_vmbus.h       | 27 -----------------------
 5 files changed, 41 insertions(+), 81 deletions(-)

(limited to 'drivers/hv/connection.c')

diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index adfe8cc9f7e3..54729e3cba47 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -5,6 +5,25 @@
 #include <linux/interrupt.h>
 #include <asm/hyperv.h>
 
+/*
+ * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
+ * is set by CPUID(HVCPUID_VERSION_FEATURES).
+ */
+enum hv_cpuid_function {
+	HVCPUID_VERSION_FEATURES		= 0x00000001,
+	HVCPUID_VENDOR_MAXFUNCTION		= 0x40000000,
+	HVCPUID_INTERFACE			= 0x40000001,
+
+	/*
+	 * The remaining functions depend on the value of
+	 * HVCPUID_INTERFACE
+	 */
+	HVCPUID_VERSION				= 0x40000002,
+	HVCPUID_FEATURES			= 0x40000003,
+	HVCPUID_ENLIGHTENMENT_INFO		= 0x40000004,
+	HVCPUID_IMPLEMENTATION_LIMITS		= 0x40000005,
+};
+
 struct ms_hyperv_info {
 	u32 features;
 	u32 misc_features;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index d3705a44971c..b5375b9497b3 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -160,6 +160,11 @@ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
 
 static void __init ms_hyperv_init_platform(void)
 {
+	int hv_host_info_eax;
+	int hv_host_info_ebx;
+	int hv_host_info_ecx;
+	int hv_host_info_edx;
+
 	/*
 	 * Extract the features and hints
 	 */
@@ -170,6 +175,21 @@ static void __init ms_hyperv_init_platform(void)
 	pr_info("HyperV: features 0x%x, hints 0x%x\n",
 		ms_hyperv.features, ms_hyperv.hints);
 
+	/*
+	 * Extract host information.
+	 */
+	if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) {
+		hv_host_info_eax = cpuid_eax(HVCPUID_VERSION);
+		hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION);
+		hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION);
+		hv_host_info_edx = cpuid_edx(HVCPUID_VERSION);
+
+		pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
+			hv_host_info_eax, hv_host_info_ebx >> 16,
+			hv_host_info_ebx & 0xFFFF, hv_host_info_ecx,
+			hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
+	}
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
 		/*
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 9b72ebcd37bc..307a5a8937f6 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -221,11 +221,8 @@ int vmbus_connect(void)
 		goto cleanup;
 
 	vmbus_proto_version = version;
-	pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d; Vmbus version:%d.%d\n",
-		    host_info_eax, host_info_ebx >> 16,
-		    host_info_ebx & 0xFFFF, host_info_ecx,
-		    host_info_edx >> 24, host_info_edx & 0xFFFFFF,
-		    version >> 16, version & 0xFFFF);
+	pr_info("Vmbus version:%d.%d\n",
+		version >> 16, version & 0xFFFF);
 
 	kfree(msginfo);
 	return 0;
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 1a33b59776d3..9985a347ed03 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -42,51 +42,6 @@ struct hv_context hv_context = {
 #define HV_MAX_MAX_DELTA_TICKS 0xffffffff
 #define HV_MIN_DELTA_TICKS 1
 
-/*
- * query_hypervisor_info - Get version info of the windows hypervisor
- */
-unsigned int host_info_eax;
-unsigned int host_info_ebx;
-unsigned int host_info_ecx;
-unsigned int host_info_edx;
-
-static int query_hypervisor_info(void)
-{
-	unsigned int eax;
-	unsigned int ebx;
-	unsigned int ecx;
-	unsigned int edx;
-	unsigned int max_leaf;
-	unsigned int op;
-
-	/*
-	* Its assumed that this is called after confirming that Viridian
-	* is present. Query id and revision.
-	*/
-	eax = 0;
-	ebx = 0;
-	ecx = 0;
-	edx = 0;
-	op = HVCPUID_VENDOR_MAXFUNCTION;
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-
-	max_leaf = eax;
-
-	if (max_leaf >= HVCPUID_VERSION) {
-		eax = 0;
-		ebx = 0;
-		ecx = 0;
-		edx = 0;
-		op = HVCPUID_VERSION;
-		cpuid(op, &eax, &ebx, &ecx, &edx);
-		host_info_eax = eax;
-		host_info_ebx = ebx;
-		host_info_ecx = ecx;
-		host_info_edx = edx;
-	}
-	return max_leaf;
-}
-
 /*
  * hv_init - Main initialization routine.
  *
@@ -94,7 +49,6 @@ static int query_hypervisor_info(void)
  */
 int hv_init(void)
 {
-	int max_leaf;
 	union hv_x64_msr_hypercall_contents hypercall_msr;
 
 	memset(hv_context.synic_event_page, 0, sizeof(void *) * NR_CPUS);
@@ -111,9 +65,6 @@ int hv_init(void)
 	memset(hv_context.clk_evt, 0,
 	       sizeof(void *) * NR_CPUS);
 
-	max_leaf = query_hypervisor_info();
-
-
 	/* See if the hypercall page is already set */
 	hypercall_msr.as_uint64 = 0;
 	rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 947455d30707..a7e35c842fed 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -40,25 +40,6 @@
  */
 #define HV_UTIL_NEGO_TIMEOUT 55
 
-/*
- * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
- * is set by CPUID(HVCPUID_VERSION_FEATURES).
- */
-enum hv_cpuid_function {
-	HVCPUID_VERSION_FEATURES		= 0x00000001,
-	HVCPUID_VENDOR_MAXFUNCTION		= 0x40000000,
-	HVCPUID_INTERFACE			= 0x40000001,
-
-	/*
-	 * The remaining functions depend on the value of
-	 * HVCPUID_INTERFACE
-	 */
-	HVCPUID_VERSION			= 0x40000002,
-	HVCPUID_FEATURES			= 0x40000003,
-	HVCPUID_ENLIGHTENMENT_INFO	= 0x40000004,
-	HVCPUID_IMPLEMENTATION_LIMITS		= 0x40000005,
-};
-
 #define  HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE   0x400
 
 #define HV_X64_MSR_CRASH_P0   0x40000100
@@ -444,14 +425,6 @@ extern int hv_synic_cleanup(unsigned int cpu);
 
 extern void hv_synic_clockevents_cleanup(void);
 
-/*
- * Host version information.
- */
-extern unsigned int host_info_eax;
-extern unsigned int host_info_ebx;
-extern unsigned int host_info_ecx;
-extern unsigned int host_info_edx;
-
 /* Interface */
 
 
-- 
cgit v1.2.3


From 5c1bec61fdfcd056df909a712e2a86bbaeb0f942 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sun, 5 Feb 2017 17:20:31 -0700
Subject: vmbus: use kernel bitops for traversing interrupt mask

Use standard kernel operations for find first set bit to traverse
the channel bit array. This has added benefit of speeding up
lookup on 64 bit and because it uses find first set instruction.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel.c      |  8 ++-----
 drivers/hv/connection.c   | 55 +++++++++++++++--------------------------------
 drivers/hv/hyperv_vmbus.h | 16 ++++++++------
 drivers/hv/vmbus_drv.c    |  4 +---
 4 files changed, 29 insertions(+), 54 deletions(-)

(limited to 'drivers/hv/connection.c')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index be34547cdb68..a016c5c0e472 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -47,12 +47,8 @@ void vmbus_setevent(struct vmbus_channel *channel)
 	 * For channels marked as in "low latency" mode
 	 * bypass the monitor page mechanism.
 	 */
-	if ((channel->offermsg.monitor_allocated) &&
-	    (!channel->low_latency)) {
-		/* Each u32 represents 32 channels */
-		sync_set_bit(channel->offermsg.child_relid & 31,
-			(unsigned long *) vmbus_connection.send_int_page +
-			(channel->offermsg.child_relid >> 5));
+	if (channel->offermsg.monitor_allocated && !channel->low_latency) {
+		vmbus_send_interrupt(channel->offermsg.child_relid);
 
 		/* Get the child to parent monitor page */
 		monitorpage = vmbus_connection.monitor_pages[1];
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 307a5a8937f6..1766ef03e78d 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -379,17 +379,11 @@ static void process_chn_event(u32 relid)
  */
 void vmbus_on_event(unsigned long data)
 {
-	u32 dword;
-	u32 maxdword;
-	int bit;
-	u32 relid;
-	u32 *recv_int_page = NULL;
-	void *page_addr;
-	int cpu = smp_processor_id();
-	union hv_synic_event_flags *event;
+	unsigned long *recv_int_page;
+	u32 maxbits, relid;
 
 	if (vmbus_proto_version < VERSION_WIN8) {
-		maxdword = MAX_NUM_CHANNELS_SUPPORTED >> 5;
+		maxbits = MAX_NUM_CHANNELS_SUPPORTED;
 		recv_int_page = vmbus_connection.recv_int_page;
 	} else {
 		/*
@@ -397,35 +391,24 @@ void vmbus_on_event(unsigned long data)
 		 * can be directly checked to get the id of the channel
 		 * that has the interrupt pending.
 		 */
-		maxdword = HV_EVENT_FLAGS_DWORD_COUNT;
-		page_addr = hv_context.synic_event_page[cpu];
-		event = (union hv_synic_event_flags *)page_addr +
+		int cpu = smp_processor_id();
+		void *page_addr = hv_context.synic_event_page[cpu];
+		union hv_synic_event_flags *event
+			= (union hv_synic_event_flags *)page_addr +
 						 VMBUS_MESSAGE_SINT;
-		recv_int_page = event->flags32;
-	}
-
 
+		maxbits = HV_EVENT_FLAGS_COUNT;
+		recv_int_page = event->flags;
+	}
 
-	/* Check events */
-	if (!recv_int_page)
+	if (unlikely(!recv_int_page))
 		return;
-	for (dword = 0; dword < maxdword; dword++) {
-		if (!recv_int_page[dword])
-			continue;
-		for (bit = 0; bit < 32; bit++) {
-			if (sync_test_and_clear_bit(bit,
-				(unsigned long *)&recv_int_page[dword])) {
-				relid = (dword << 5) + bit;
-
-				if (relid == 0)
-					/*
-					 * Special case - vmbus
-					 * channel protocol msg
-					 */
-					continue;
 
+	for_each_set_bit(relid, recv_int_page, maxbits) {
+		if (sync_test_and_clear_bit(relid, recv_int_page)) {
+			/* Special case - vmbus channel protocol msg */
+			if (relid != 0)
 				process_chn_event(relid);
-			}
 		}
 	}
 }
@@ -491,12 +474,8 @@ void vmbus_set_event(struct vmbus_channel *channel)
 {
 	u32 child_relid = channel->offermsg.child_relid;
 
-	if (!channel->is_dedicated_interrupt) {
-		/* Each u32 represents 32 channels */
-		sync_set_bit(child_relid & 31,
-			(unsigned long *)vmbus_connection.send_int_page +
-			(child_relid >> 5));
-	}
+	if (!channel->is_dedicated_interrupt)
+		vmbus_send_interrupt(child_relid);
 
 	hv_do_hypercall(HVCALL_SIGNAL_EVENT, channel->sig_event, NULL);
 }
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 86b56b677dc3..2749a4142889 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -40,11 +40,9 @@
  */
 #define HV_UTIL_NEGO_TIMEOUT 55
 
-
-
-
-#define HV_EVENT_FLAGS_BYTE_COUNT	(256)
-#define HV_EVENT_FLAGS_DWORD_COUNT	(256 / sizeof(u32))
+/* Define synthetic interrupt controller flag constants. */
+#define HV_EVENT_FLAGS_COUNT		(256 * 8)
+#define HV_EVENT_FLAGS_LONG_COUNT	(256 / sizeof(unsigned long))
 
 /*
  * Timer configuration register.
@@ -65,8 +63,7 @@ union hv_timer_config {
 
 /* Define the synthetic interrupt controller event flags format. */
 union hv_synic_event_flags {
-	u8 flags8[HV_EVENT_FLAGS_BYTE_COUNT];
-	u32 flags32[HV_EVENT_FLAGS_DWORD_COUNT];
+	unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT];
 };
 
 /* Define SynIC control register. */
@@ -358,6 +355,11 @@ struct vmbus_msginfo {
 
 extern struct vmbus_connection vmbus_connection;
 
+static inline void vmbus_send_interrupt(u32 relid)
+{
+	sync_set_bit(relid, vmbus_connection.send_int_page);
+}
+
 enum vmbus_message_handler_type {
 	/* The related handler can sleep. */
 	VMHT_BLOCKING = 0,
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index f8ebe13cf251..c1b27026f744 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -908,10 +908,8 @@ static void vmbus_isr(void)
 		(vmbus_proto_version == VERSION_WIN7)) {
 
 		/* Since we are a child, we only need to check bit 0 */
-		if (sync_test_and_clear_bit(0,
-			(unsigned long *) &event->flags32[0])) {
+		if (sync_test_and_clear_bit(0, event->flags))
 			handled = true;
-		}
 	} else {
 		/*
 		 * Our host is win8 or above. The signaling mechanism
-- 
cgit v1.2.3


From 37cdd991fac810a727cd285629d1640fcf53cd19 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sat, 11 Feb 2017 23:02:19 -0700
Subject: vmbus: put related per-cpu variable together

The hv_context structure had several arrays which were per-cpu
and was allocating small structures (tasklet_struct). Instead use
a single per-cpu array.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel_mgmt.c |  35 ++++++++-----
 drivers/hv/connection.c   |  20 ++++---
 drivers/hv/hv.c           | 130 ++++++++++++++++++++--------------------------
 drivers/hv/hyperv_vmbus.h |  53 +++++++++++--------
 drivers/hv/vmbus_drv.c    |  39 ++++++++------
 5 files changed, 143 insertions(+), 134 deletions(-)

(limited to 'drivers/hv/connection.c')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index de90a9900fee..579ad2560a39 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -353,9 +353,10 @@ static void free_channel(struct vmbus_channel *channel)
 static void percpu_channel_enq(void *arg)
 {
 	struct vmbus_channel *channel = arg;
-	int cpu = smp_processor_id();
+	struct hv_per_cpu_context *hv_cpu
+		= this_cpu_ptr(hv_context.cpu_context);
 
-	list_add_tail(&channel->percpu_list, &hv_context.percpu_list[cpu]);
+	list_add_tail(&channel->percpu_list, &hv_cpu->chan_list);
 }
 
 static void percpu_channel_deq(void *arg)
@@ -379,19 +380,21 @@ static void vmbus_release_relid(u32 relid)
 
 void hv_event_tasklet_disable(struct vmbus_channel *channel)
 {
-	struct tasklet_struct *tasklet;
-	tasklet = hv_context.event_dpc[channel->target_cpu];
-	tasklet_disable(tasklet);
+	struct hv_per_cpu_context *hv_cpu;
+
+	hv_cpu = per_cpu_ptr(hv_context.cpu_context, channel->target_cpu);
+	tasklet_disable(&hv_cpu->event_dpc);
 }
 
 void hv_event_tasklet_enable(struct vmbus_channel *channel)
 {
-	struct tasklet_struct *tasklet;
-	tasklet = hv_context.event_dpc[channel->target_cpu];
-	tasklet_enable(tasklet);
+	struct hv_per_cpu_context *hv_cpu;
+
+	hv_cpu = per_cpu_ptr(hv_context.cpu_context, channel->target_cpu);
+	tasklet_enable(&hv_cpu->event_dpc);
 
 	/* In case there is any pending event */
-	tasklet_schedule(tasklet);
+	tasklet_schedule(&hv_cpu->event_dpc);
 }
 
 void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid)
@@ -726,9 +729,12 @@ static void vmbus_wait_for_unload(void)
 			break;
 
 		for_each_online_cpu(cpu) {
-			page_addr = hv_context.synic_message_page[cpu];
-			msg = (struct hv_message *)page_addr +
-				VMBUS_MESSAGE_SINT;
+			struct hv_per_cpu_context *hv_cpu
+				= per_cpu_ptr(hv_context.cpu_context, cpu);
+
+			page_addr = hv_cpu->synic_message_page;
+			msg = (struct hv_message *)page_addr
+				+ VMBUS_MESSAGE_SINT;
 
 			message_type = READ_ONCE(msg->header.message_type);
 			if (message_type == HVMSG_NONE)
@@ -752,7 +758,10 @@ static void vmbus_wait_for_unload(void)
 	 * messages after we reconnect.
 	 */
 	for_each_online_cpu(cpu) {
-		page_addr = hv_context.synic_message_page[cpu];
+		struct hv_per_cpu_context *hv_cpu
+			= per_cpu_ptr(hv_context.cpu_context, cpu);
+
+		page_addr = hv_cpu->synic_message_page;
 		msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
 		msg->header.message_type = HVMSG_NONE;
 	}
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 1766ef03e78d..158f12823baf 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -93,12 +93,10 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo,
 	 * all the CPUs. This is needed for kexec to work correctly where
 	 * the CPU attempting to connect may not be CPU 0.
 	 */
-	if (version >= VERSION_WIN8_1) {
-		msg->target_vcpu = hv_context.vp_index[get_cpu()];
-		put_cpu();
-	} else {
+	if (version >= VERSION_WIN8_1)
+		msg->target_vcpu = hv_context.vp_index[smp_processor_id()];
+	else
 		msg->target_vcpu = 0;
-	}
 
 	/*
 	 * Add to list before we send the request since we may
@@ -269,12 +267,12 @@ void vmbus_disconnect(void)
  */
 static struct vmbus_channel *pcpu_relid2channel(u32 relid)
 {
+	struct hv_per_cpu_context *hv_cpu
+		= this_cpu_ptr(hv_context.cpu_context);
+	struct vmbus_channel *found_channel = NULL;
 	struct vmbus_channel *channel;
-	struct vmbus_channel *found_channel  = NULL;
-	int cpu = smp_processor_id();
-	struct list_head *pcpu_head = &hv_context.percpu_list[cpu];
 
-	list_for_each_entry(channel, pcpu_head, percpu_list) {
+	list_for_each_entry(channel, &hv_cpu->chan_list, percpu_list) {
 		if (channel->offermsg.child_relid == relid) {
 			found_channel = channel;
 			break;
@@ -379,6 +377,7 @@ static void process_chn_event(u32 relid)
  */
 void vmbus_on_event(unsigned long data)
 {
+	struct hv_per_cpu_context *hv_cpu = (void *)data;
 	unsigned long *recv_int_page;
 	u32 maxbits, relid;
 
@@ -391,8 +390,7 @@ void vmbus_on_event(unsigned long data)
 		 * can be directly checked to get the id of the channel
 		 * that has the interrupt pending.
 		 */
-		int cpu = smp_processor_id();
-		void *page_addr = hv_context.synic_event_page[cpu];
+		void *page_addr = hv_cpu->synic_event_page;
 		union hv_synic_event_flags *event
 			= (union hv_synic_event_flags *)page_addr +
 						 VMBUS_MESSAGE_SINT;
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 0f73237bed0a..9fc2355a60ea 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -49,24 +49,13 @@ struct hv_context hv_context = {
  */
 int hv_init(void)
 {
-
-	memset(hv_context.synic_event_page, 0, sizeof(void *) * NR_CPUS);
-	memset(hv_context.synic_message_page, 0,
-	       sizeof(void *) * NR_CPUS);
-	memset(hv_context.post_msg_page, 0,
-	       sizeof(void *) * NR_CPUS);
-	memset(hv_context.vp_index, 0,
-	       sizeof(int) * NR_CPUS);
-	memset(hv_context.event_dpc, 0,
-	       sizeof(void *) * NR_CPUS);
-	memset(hv_context.msg_dpc, 0,
-	       sizeof(void *) * NR_CPUS);
-	memset(hv_context.clk_evt, 0,
-	       sizeof(void *) * NR_CPUS);
-
 	if (!hv_is_hypercall_page_setup())
 		return -ENOTSUPP;
 
+	hv_context.cpu_context = alloc_percpu(struct hv_per_cpu_context);
+	if (!hv_context.cpu_context)
+		return -ENOMEM;
+
 	return 0;
 }
 
@@ -79,25 +68,24 @@ int hv_post_message(union hv_connection_id connection_id,
 		  enum hv_message_type message_type,
 		  void *payload, size_t payload_size)
 {
-
 	struct hv_input_post_message *aligned_msg;
+	struct hv_per_cpu_context *hv_cpu;
 	u64 status;
 
 	if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
 		return -EMSGSIZE;
 
-	aligned_msg = (struct hv_input_post_message *)
-			hv_context.post_msg_page[get_cpu()];
-
+	hv_cpu = get_cpu_ptr(hv_context.cpu_context);
+	aligned_msg = hv_cpu->post_msg_page;
 	aligned_msg->connectionid = connection_id;
 	aligned_msg->reserved = 0;
 	aligned_msg->message_type = message_type;
 	aligned_msg->payload_size = payload_size;
 	memcpy((void *)aligned_msg->payload, payload, payload_size);
+	put_cpu_ptr(hv_cpu);
 
 	status = hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL);
 
-	put_cpu();
 	return status & 0xFFFF;
 }
 
@@ -154,8 +142,6 @@ static void hv_init_clockevent_device(struct clock_event_device *dev, int cpu)
 
 int hv_synic_alloc(void)
 {
-	size_t size = sizeof(struct tasklet_struct);
-	size_t ced_size = sizeof(struct clock_event_device);
 	int cpu;
 
 	hv_context.hv_numa_map = kzalloc(sizeof(struct cpumask) * nr_node_ids,
@@ -166,53 +152,43 @@ int hv_synic_alloc(void)
 	}
 
 	for_each_present_cpu(cpu) {
-		hv_context.event_dpc[cpu] = kmalloc(size, GFP_ATOMIC);
-		if (hv_context.event_dpc[cpu] == NULL) {
-			pr_err("Unable to allocate event dpc\n");
-			goto err;
-		}
-		tasklet_init(hv_context.event_dpc[cpu], vmbus_on_event, cpu);
-
-		hv_context.msg_dpc[cpu] = kmalloc(size, GFP_ATOMIC);
-		if (hv_context.msg_dpc[cpu] == NULL) {
-			pr_err("Unable to allocate event dpc\n");
-			goto err;
-		}
-		tasklet_init(hv_context.msg_dpc[cpu], vmbus_on_msg_dpc, cpu);
-
-		hv_context.clk_evt[cpu] = kzalloc(ced_size, GFP_ATOMIC);
-		if (hv_context.clk_evt[cpu] == NULL) {
+		struct hv_per_cpu_context *hv_cpu
+			= per_cpu_ptr(hv_context.cpu_context, cpu);
+
+		memset(hv_cpu, 0, sizeof(*hv_cpu));
+		tasklet_init(&hv_cpu->event_dpc,
+			     vmbus_on_event, (unsigned long) hv_cpu);
+		tasklet_init(&hv_cpu->msg_dpc,
+			     vmbus_on_msg_dpc, (unsigned long) hv_cpu);
+
+		hv_cpu->clk_evt = kzalloc(sizeof(struct clock_event_device),
+					  GFP_KERNEL);
+		if (hv_cpu->clk_evt == NULL) {
 			pr_err("Unable to allocate clock event device\n");
 			goto err;
 		}
+		hv_init_clockevent_device(hv_cpu->clk_evt, cpu);
 
-		hv_init_clockevent_device(hv_context.clk_evt[cpu], cpu);
-
-		hv_context.synic_message_page[cpu] =
+		hv_cpu->synic_message_page =
 			(void *)get_zeroed_page(GFP_ATOMIC);
-
-		if (hv_context.synic_message_page[cpu] == NULL) {
+		if (hv_cpu->synic_message_page == NULL) {
 			pr_err("Unable to allocate SYNIC message page\n");
 			goto err;
 		}
 
-		hv_context.synic_event_page[cpu] =
-			(void *)get_zeroed_page(GFP_ATOMIC);
-
-		if (hv_context.synic_event_page[cpu] == NULL) {
+		hv_cpu->synic_event_page = (void *)get_zeroed_page(GFP_ATOMIC);
+		if (hv_cpu->synic_event_page == NULL) {
 			pr_err("Unable to allocate SYNIC event page\n");
 			goto err;
 		}
 
-		hv_context.post_msg_page[cpu] =
-			(void *)get_zeroed_page(GFP_ATOMIC);
-
-		if (hv_context.post_msg_page[cpu] == NULL) {
+		hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
+		if (hv_cpu->post_msg_page == NULL) {
 			pr_err("Unable to allocate post msg page\n");
 			goto err;
 		}
 
-		INIT_LIST_HEAD(&hv_context.percpu_list[cpu]);
+		INIT_LIST_HEAD(&hv_cpu->chan_list);
 	}
 
 	return 0;
@@ -220,26 +196,24 @@ err:
 	return -ENOMEM;
 }
 
-static void hv_synic_free_cpu(int cpu)
-{
-	kfree(hv_context.event_dpc[cpu]);
-	kfree(hv_context.msg_dpc[cpu]);
-	kfree(hv_context.clk_evt[cpu]);
-	if (hv_context.synic_event_page[cpu])
-		free_page((unsigned long)hv_context.synic_event_page[cpu]);
-	if (hv_context.synic_message_page[cpu])
-		free_page((unsigned long)hv_context.synic_message_page[cpu]);
-	if (hv_context.post_msg_page[cpu])
-		free_page((unsigned long)hv_context.post_msg_page[cpu]);
-}
 
 void hv_synic_free(void)
 {
 	int cpu;
 
+	for_each_present_cpu(cpu) {
+		struct hv_per_cpu_context *hv_cpu
+			= per_cpu_ptr(hv_context.cpu_context, cpu);
+
+		if (hv_cpu->synic_event_page)
+			free_page((unsigned long)hv_cpu->synic_event_page);
+		if (hv_cpu->synic_message_page)
+			free_page((unsigned long)hv_cpu->synic_message_page);
+		if (hv_cpu->post_msg_page)
+			free_page((unsigned long)hv_cpu->post_msg_page);
+	}
+
 	kfree(hv_context.hv_numa_map);
-	for_each_present_cpu(cpu)
-		hv_synic_free_cpu(cpu);
 }
 
 /*
@@ -251,6 +225,8 @@ void hv_synic_free(void)
  */
 int hv_synic_init(unsigned int cpu)
 {
+	struct hv_per_cpu_context *hv_cpu
+		= per_cpu_ptr(hv_context.cpu_context, cpu);
 	union hv_synic_simp simp;
 	union hv_synic_siefp siefp;
 	union hv_synic_sint shared_sint;
@@ -260,7 +236,7 @@ int hv_synic_init(unsigned int cpu)
 	/* Setup the Synic's message page */
 	hv_get_simp(simp.as_uint64);
 	simp.simp_enabled = 1;
-	simp.base_simp_gpa = virt_to_phys(hv_context.synic_message_page[cpu])
+	simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
 		>> PAGE_SHIFT;
 
 	hv_set_simp(simp.as_uint64);
@@ -268,7 +244,7 @@ int hv_synic_init(unsigned int cpu)
 	/* Setup the Synic's event page */
 	hv_get_siefp(siefp.as_uint64);
 	siefp.siefp_enabled = 1;
-	siefp.base_siefp_gpa = virt_to_phys(hv_context.synic_event_page[cpu])
+	siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
 		>> PAGE_SHIFT;
 
 	hv_set_siefp(siefp.as_uint64);
@@ -305,7 +281,7 @@ int hv_synic_init(unsigned int cpu)
 	 * Register the per-cpu clockevent source.
 	 */
 	if (ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE)
-		clockevents_config_and_register(hv_context.clk_evt[cpu],
+		clockevents_config_and_register(hv_cpu->clk_evt,
 						HV_TIMER_FREQUENCY,
 						HV_MIN_DELTA_TICKS,
 						HV_MAX_MAX_DELTA_TICKS);
@@ -322,8 +298,12 @@ void hv_synic_clockevents_cleanup(void)
 	if (!(ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE))
 		return;
 
-	for_each_present_cpu(cpu)
-		clockevents_unbind_device(hv_context.clk_evt[cpu], cpu);
+	for_each_present_cpu(cpu) {
+		struct hv_per_cpu_context *hv_cpu
+			= per_cpu_ptr(hv_context.cpu_context, cpu);
+
+		clockevents_unbind_device(hv_cpu->clk_evt, cpu);
+	}
 }
 
 /*
@@ -372,8 +352,12 @@ int hv_synic_cleanup(unsigned int cpu)
 
 	/* Turn off clockevent device */
 	if (ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE) {
-		clockevents_unbind_device(hv_context.clk_evt[cpu], cpu);
-		hv_ce_shutdown(hv_context.clk_evt[cpu]);
+		struct hv_per_cpu_context *hv_cpu
+			= this_cpu_ptr(hv_context.cpu_context);
+
+		clockevents_unbind_device(hv_cpu->clk_evt, cpu);
+		hv_ce_shutdown(hv_cpu->clk_evt);
+		put_cpu_ptr(hv_cpu);
 	}
 
 	hv_get_synint_state(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT,
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index c375ec89db6f..c8ce9ab2e16a 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -29,6 +29,7 @@
 #include <asm/sync_bitops.h>
 #include <linux/atomic.h>
 #include <linux/hyperv.h>
+#include <linux/interrupt.h>
 
 /*
  * Timeout for services such as KVP and fcopy.
@@ -189,6 +190,33 @@ enum {
 	VMBUS_MESSAGE_SINT		= 2,
 };
 
+/*
+ * Per cpu state for channel handling
+ */
+struct hv_per_cpu_context {
+	void *synic_message_page;
+	void *synic_event_page;
+	/*
+	 * buffer to post messages to the host.
+	 */
+	void *post_msg_page;
+
+	/*
+	 * Starting with win8, we can take channel interrupts on any CPU;
+	 * we will manage the tasklet that handles events messages on a per CPU
+	 * basis.
+	 */
+	struct tasklet_struct event_dpc;
+	struct tasklet_struct msg_dpc;
+
+	/*
+	 * To optimize the mapping of relid to channel, maintain
+	 * per-cpu list of the channels based on their CPU affinity.
+	 */
+	struct list_head chan_list;
+	struct clock_event_device *clk_evt;
+};
+
 struct hv_context {
 	/* We only support running on top of Hyper-V
 	* So at this point this really can only contain the Hyper-V ID
@@ -199,8 +227,8 @@ struct hv_context {
 
 	bool synic_initialized;
 
-	void *synic_message_page[NR_CPUS];
-	void *synic_event_page[NR_CPUS];
+	struct hv_per_cpu_context __percpu *cpu_context;
+
 	/*
 	 * Hypervisor's notion of virtual processor ID is different from
 	 * Linux' notion of CPU ID. This information can only be retrieved
@@ -211,26 +239,7 @@ struct hv_context {
 	 * Linux cpuid 'a'.
 	 */
 	u32 vp_index[NR_CPUS];
-	/*
-	 * Starting with win8, we can take channel interrupts on any CPU;
-	 * we will manage the tasklet that handles events messages on a per CPU
-	 * basis.
-	 */
-	struct tasklet_struct *event_dpc[NR_CPUS];
-	struct tasklet_struct *msg_dpc[NR_CPUS];
-	/*
-	 * To optimize the mapping of relid to channel, maintain
-	 * per-cpu list of the channels based on their CPU affinity.
-	 */
-	struct list_head percpu_list[NR_CPUS];
-	/*
-	 * buffer to post messages to the host.
-	 */
-	void *post_msg_page[NR_CPUS];
-	/*
-	 * Support PV clockevent device.
-	 */
-	struct clock_event_device *clk_evt[NR_CPUS];
+
 	/*
 	 * To manage allocations in a NUMA node.
 	 * Array indexed by numa node ID.
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index c1b27026f744..cf8540c1df4a 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -835,9 +835,10 @@ static void vmbus_onmessage_work(struct work_struct *work)
 	kfree(ctx);
 }
 
-static void hv_process_timer_expiration(struct hv_message *msg, int cpu)
+static void hv_process_timer_expiration(struct hv_message *msg,
+					struct hv_per_cpu_context *hv_cpu)
 {
-	struct clock_event_device *dev = hv_context.clk_evt[cpu];
+	struct clock_event_device *dev = hv_cpu->clk_evt;
 
 	if (dev->event_handler)
 		dev->event_handler(dev);
@@ -847,8 +848,8 @@ static void hv_process_timer_expiration(struct hv_message *msg, int cpu)
 
 void vmbus_on_msg_dpc(unsigned long data)
 {
-	int cpu = smp_processor_id();
-	void *page_addr = hv_context.synic_message_page[cpu];
+	struct hv_per_cpu_context *hv_cpu = (void *)data;
+	void *page_addr = hv_cpu->synic_message_page;
 	struct hv_message *msg = (struct hv_message *)page_addr +
 				  VMBUS_MESSAGE_SINT;
 	struct vmbus_channel_message_header *hdr;
@@ -886,14 +887,14 @@ msg_handled:
 
 static void vmbus_isr(void)
 {
-	int cpu = smp_processor_id();
-	void *page_addr;
+	struct hv_per_cpu_context *hv_cpu
+		= this_cpu_ptr(hv_context.cpu_context);
+	void *page_addr = hv_cpu->synic_event_page;
 	struct hv_message *msg;
 	union hv_synic_event_flags *event;
 	bool handled = false;
 
-	page_addr = hv_context.synic_event_page[cpu];
-	if (page_addr == NULL)
+	if (unlikely(page_addr == NULL))
 		return;
 
 	event = (union hv_synic_event_flags *)page_addr +
@@ -921,18 +922,18 @@ static void vmbus_isr(void)
 	}
 
 	if (handled)
-		tasklet_schedule(hv_context.event_dpc[cpu]);
+		tasklet_schedule(&hv_cpu->event_dpc);
 
 
-	page_addr = hv_context.synic_message_page[cpu];
+	page_addr = hv_cpu->synic_message_page;
 	msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
 
 	/* Check if there are actual msgs to be processed */
 	if (msg->header.message_type != HVMSG_NONE) {
 		if (msg->header.message_type == HVMSG_TIMER_EXPIRED)
-			hv_process_timer_expiration(msg, cpu);
+			hv_process_timer_expiration(msg, hv_cpu);
 		else
-			tasklet_schedule(hv_context.msg_dpc[cpu]);
+			tasklet_schedule(&hv_cpu->msg_dpc);
 	}
 
 	add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
@@ -1521,9 +1522,14 @@ static void __exit vmbus_exit(void)
 	hv_synic_clockevents_cleanup();
 	vmbus_disconnect();
 	hv_remove_vmbus_irq();
-	for_each_online_cpu(cpu)
-		tasklet_kill(hv_context.msg_dpc[cpu]);
+	for_each_online_cpu(cpu) {
+		struct hv_per_cpu_context *hv_cpu
+			= per_cpu_ptr(hv_context.cpu_context, cpu);
+
+		tasklet_kill(&hv_cpu->msg_dpc);
+	}
 	vmbus_free_channels();
+
 	if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
 		unregister_die_notifier(&hyperv_die_block);
 		atomic_notifier_chain_unregister(&panic_notifier_list,
@@ -1531,7 +1537,10 @@ static void __exit vmbus_exit(void)
 	}
 	bus_unregister(&hv_bus);
 	for_each_online_cpu(cpu) {
-		tasklet_kill(hv_context.event_dpc[cpu]);
+		struct hv_per_cpu_context *hv_cpu
+			= per_cpu_ptr(hv_context.cpu_context, cpu);
+
+		tasklet_kill(&hv_cpu->event_dpc);
 	}
 	cpuhp_remove_state(hyperv_cpuhp_online);
 	hv_synic_free();
-- 
cgit v1.2.3


From 631e63a9f346cb657761ae22138f294718696501 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sat, 11 Feb 2017 23:02:20 -0700
Subject: vmbus: change to per channel tasklet

Make the event handling tasklet per channel rather than per-cpu.
This allows for better fairness when getting lots of data on the same
cpu.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel.c      |  2 +-
 drivers/hv/channel_mgmt.c | 16 +++++-----
 drivers/hv/connection.c   | 78 ++---------------------------------------------
 drivers/hv/hv.c           |  2 --
 drivers/hv/hyperv_vmbus.h |  1 -
 drivers/hv/vmbus_drv.c    | 58 ++++++++++++++++++++++++++++++-----
 include/linux/hyperv.h    |  3 +-
 7 files changed, 64 insertions(+), 96 deletions(-)

(limited to 'drivers/hv/connection.c')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 789c75f6df26..18cc1c78260d 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -530,7 +530,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel)
 	int ret;
 
 	/*
-	 * process_chn_event(), running in the tasklet, can race
+	 * vmbus_on_event(), running in the tasklet, can race
 	 * with vmbus_close_internal() in the case of SMP guest, e.g., when
 	 * the former is accessing channel->inbound.ring_buffer, the latter
 	 * could be freeing the ring_buffer pages.
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 579ad2560a39..2f6270d76b79 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -339,6 +339,9 @@ static struct vmbus_channel *alloc_channel(void)
 	INIT_LIST_HEAD(&channel->sc_list);
 	INIT_LIST_HEAD(&channel->percpu_list);
 
+	tasklet_init(&channel->callback_event,
+		     vmbus_on_event, (unsigned long)channel);
+
 	return channel;
 }
 
@@ -347,6 +350,7 @@ static struct vmbus_channel *alloc_channel(void)
  */
 static void free_channel(struct vmbus_channel *channel)
 {
+	tasklet_kill(&channel->callback_event);
 	kfree(channel);
 }
 
@@ -380,21 +384,15 @@ static void vmbus_release_relid(u32 relid)
 
 void hv_event_tasklet_disable(struct vmbus_channel *channel)
 {
-	struct hv_per_cpu_context *hv_cpu;
-
-	hv_cpu = per_cpu_ptr(hv_context.cpu_context, channel->target_cpu);
-	tasklet_disable(&hv_cpu->event_dpc);
+	tasklet_disable(&channel->callback_event);
 }
 
 void hv_event_tasklet_enable(struct vmbus_channel *channel)
 {
-	struct hv_per_cpu_context *hv_cpu;
-
-	hv_cpu = per_cpu_ptr(hv_context.cpu_context, channel->target_cpu);
-	tasklet_enable(&hv_cpu->event_dpc);
+	tasklet_enable(&channel->callback_event);
 
 	/* In case there is any pending event */
-	tasklet_schedule(&hv_cpu->event_dpc);
+	tasklet_schedule(&channel->callback_event);
 }
 
 void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid)
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 158f12823baf..27e72dc07e12 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -259,29 +259,6 @@ void vmbus_disconnect(void)
 	vmbus_connection.monitor_pages[1] = NULL;
 }
 
-/*
- * Map the given relid to the corresponding channel based on the
- * per-cpu list of channels that have been affinitized to this CPU.
- * This will be used in the channel callback path as we can do this
- * mapping in a lock-free fashion.
- */
-static struct vmbus_channel *pcpu_relid2channel(u32 relid)
-{
-	struct hv_per_cpu_context *hv_cpu
-		= this_cpu_ptr(hv_context.cpu_context);
-	struct vmbus_channel *found_channel = NULL;
-	struct vmbus_channel *channel;
-
-	list_for_each_entry(channel, &hv_cpu->chan_list, percpu_list) {
-		if (channel->offermsg.child_relid == relid) {
-			found_channel = channel;
-			break;
-		}
-	}
-
-	return found_channel;
-}
-
 /*
  * relid2channel - Get the channel object given its
  * child relative id (ie channel id)
@@ -318,24 +295,15 @@ struct vmbus_channel *relid2channel(u32 relid)
 }
 
 /*
- * process_chn_event - Process a channel event notification
+ * vmbus_on_event - Process a channel event notification
  */
-static void process_chn_event(u32 relid)
+void vmbus_on_event(unsigned long data)
 {
-	struct vmbus_channel *channel;
+	struct vmbus_channel *channel = (void *) data;
 	void *arg;
 	bool read_state;
 	u32 bytes_to_read;
 
-	/*
-	 * Find the channel based on this relid and invokes the
-	 * channel callback to process the event
-	 */
-	channel = pcpu_relid2channel(relid);
-
-	if (!channel)
-		return;
-
 	/*
 	 * A channel once created is persistent even when there
 	 * is no driver handling the device. An unloading driver
@@ -344,7 +312,6 @@ static void process_chn_event(u32 relid)
 	 * Thus, checking and invoking the driver specific callback takes
 	 * care of orderly unloading of the driver.
 	 */
-
 	if (channel->onchannel_callback != NULL) {
 		arg = channel->channel_callback_context;
 		read_state = channel->batched_reading;
@@ -372,45 +339,6 @@ static void process_chn_event(u32 relid)
 	}
 }
 
-/*
- * vmbus_on_event - Handler for events
- */
-void vmbus_on_event(unsigned long data)
-{
-	struct hv_per_cpu_context *hv_cpu = (void *)data;
-	unsigned long *recv_int_page;
-	u32 maxbits, relid;
-
-	if (vmbus_proto_version < VERSION_WIN8) {
-		maxbits = MAX_NUM_CHANNELS_SUPPORTED;
-		recv_int_page = vmbus_connection.recv_int_page;
-	} else {
-		/*
-		 * When the host is win8 and beyond, the event page
-		 * can be directly checked to get the id of the channel
-		 * that has the interrupt pending.
-		 */
-		void *page_addr = hv_cpu->synic_event_page;
-		union hv_synic_event_flags *event
-			= (union hv_synic_event_flags *)page_addr +
-						 VMBUS_MESSAGE_SINT;
-
-		maxbits = HV_EVENT_FLAGS_COUNT;
-		recv_int_page = event->flags;
-	}
-
-	if (unlikely(!recv_int_page))
-		return;
-
-	for_each_set_bit(relid, recv_int_page, maxbits) {
-		if (sync_test_and_clear_bit(relid, recv_int_page)) {
-			/* Special case - vmbus channel protocol msg */
-			if (relid != 0)
-				process_chn_event(relid);
-		}
-	}
-}
-
 /*
  * vmbus_post_msg - Send a msg on the vmbus's message connection
  */
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 9fc2355a60ea..665a64f1611e 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -156,8 +156,6 @@ int hv_synic_alloc(void)
 			= per_cpu_ptr(hv_context.cpu_context, cpu);
 
 		memset(hv_cpu, 0, sizeof(*hv_cpu));
-		tasklet_init(&hv_cpu->event_dpc,
-			     vmbus_on_event, (unsigned long) hv_cpu);
 		tasklet_init(&hv_cpu->msg_dpc,
 			     vmbus_on_msg_dpc, (unsigned long) hv_cpu);
 
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index c8ce9ab2e16a..558a798c407c 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -206,7 +206,6 @@ struct hv_per_cpu_context {
 	 * we will manage the tasklet that handles events messages on a per CPU
 	 * basis.
 	 */
-	struct tasklet_struct event_dpc;
 	struct tasklet_struct msg_dpc;
 
 	/*
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index cf8540c1df4a..eaf1a10b0245 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -885,6 +885,56 @@ msg_handled:
 	vmbus_signal_eom(msg, message_type);
 }
 
+
+/*
+ * Schedule all channels with events pending
+ */
+static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
+{
+	unsigned long *recv_int_page;
+	u32 maxbits, relid;
+
+	if (vmbus_proto_version < VERSION_WIN8) {
+		maxbits = MAX_NUM_CHANNELS_SUPPORTED;
+		recv_int_page = vmbus_connection.recv_int_page;
+	} else {
+		/*
+		 * When the host is win8 and beyond, the event page
+		 * can be directly checked to get the id of the channel
+		 * that has the interrupt pending.
+		 */
+		void *page_addr = hv_cpu->synic_event_page;
+		union hv_synic_event_flags *event
+			= (union hv_synic_event_flags *)page_addr +
+						 VMBUS_MESSAGE_SINT;
+
+		maxbits = HV_EVENT_FLAGS_COUNT;
+		recv_int_page = event->flags;
+	}
+
+	if (unlikely(!recv_int_page))
+		return;
+
+	for_each_set_bit(relid, recv_int_page, maxbits) {
+		struct vmbus_channel *channel;
+
+		if (!sync_test_and_clear_bit(relid, recv_int_page))
+			continue;
+
+		/* Special case - vmbus channel protocol msg */
+		if (relid == 0)
+			continue;
+
+		/* Find channel based on relid */
+		list_for_each_entry(channel, &hv_cpu->chan_list, percpu_list) {
+			if (channel->offermsg.child_relid == relid) {
+				tasklet_schedule(&channel->callback_event);
+				break;
+			}
+		}
+	}
+}
+
 static void vmbus_isr(void)
 {
 	struct hv_per_cpu_context *hv_cpu
@@ -922,8 +972,7 @@ static void vmbus_isr(void)
 	}
 
 	if (handled)
-		tasklet_schedule(&hv_cpu->event_dpc);
-
+		vmbus_chan_sched(hv_cpu);
 
 	page_addr = hv_cpu->synic_message_page;
 	msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
@@ -1536,12 +1585,7 @@ static void __exit vmbus_exit(void)
 						 &hyperv_panic_block);
 	}
 	bus_unregister(&hv_bus);
-	for_each_online_cpu(cpu) {
-		struct hv_per_cpu_context *hv_cpu
-			= per_cpu_ptr(hv_context.cpu_context, cpu);
 
-		tasklet_kill(&hv_cpu->event_dpc);
-	}
 	cpuhp_remove_state(hyperv_cpuhp_online);
 	hv_synic_free();
 	acpi_bus_unregister_driver(&vmbus_acpi_driver);
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index c9b6d533958f..69afc9337c0d 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -35,7 +35,7 @@
 #include <linux/completion.h>
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
-
+#include <linux/interrupt.h>
 
 #define MAX_PAGE_BUFFER_COUNT				32
 #define MAX_MULTIPAGE_BUFFER_COUNT			32 /* 128K */
@@ -743,6 +743,7 @@ struct vmbus_channel {
 	struct vmbus_close_msg close_msg;
 
 	/* Channel callback's invoked in softirq context */
+	struct tasklet_struct callback_event;
 	void (*onchannel_callback)(void *context);
 	void *channel_callback_context;
 
-- 
cgit v1.2.3


From b71e328297a3a578c482fb4814e737a0ec185839 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sat, 11 Feb 2017 23:02:21 -0700
Subject: vmbus: add direct isr callback mode

Change the simple boolean batched_reading into a tri-value.
For future NAPI support in netvsc driver, the callback needs to
occur directly in interrupt handler.

Batched mode is also changed to disable host interrupts immediately
in interrupt routine (to avoid unnecessary host signals), and the
tasklet is rescheduled if more data is detected.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel_mgmt.c    |  7 -------
 drivers/hv/connection.c      | 27 ++++++++++++---------------
 drivers/hv/hv_util.c         |  3 +--
 drivers/hv/vmbus_drv.c       | 26 ++++++++++++++++++++++++--
 drivers/uio/uio_hv_generic.c |  2 +-
 include/linux/hyperv.h       | 31 +++++++++++++++++--------------
 6 files changed, 55 insertions(+), 41 deletions(-)

(limited to 'drivers/hv/connection.c')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 2f6270d76b79..b2bb5aafaa2f 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -819,13 +819,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
 		return;
 	}
 
-	/*
-	 * By default we setup state to enable batched
-	 * reading. A specific service can choose to
-	 * disable this prior to opening the channel.
-	 */
-	newchannel->batched_reading = true;
-
 	/*
 	 * Setup state for signalling the host.
 	 */
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 27e72dc07e12..a8366fec1458 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -300,9 +300,7 @@ struct vmbus_channel *relid2channel(u32 relid)
 void vmbus_on_event(unsigned long data)
 {
 	struct vmbus_channel *channel = (void *) data;
-	void *arg;
-	bool read_state;
-	u32 bytes_to_read;
+	void (*callback_fn)(void *);
 
 	/*
 	 * A channel once created is persistent even when there
@@ -312,9 +310,13 @@ void vmbus_on_event(unsigned long data)
 	 * Thus, checking and invoking the driver specific callback takes
 	 * care of orderly unloading of the driver.
 	 */
-	if (channel->onchannel_callback != NULL) {
-		arg = channel->channel_callback_context;
-		read_state = channel->batched_reading;
+	callback_fn = READ_ONCE(channel->onchannel_callback);
+	if (unlikely(callback_fn == NULL))
+		return;
+
+	(*callback_fn)(channel->channel_callback_context);
+
+	if (channel->callback_mode == HV_CALL_BATCHED) {
 		/*
 		 * This callback reads the messages sent by the host.
 		 * We can optimize host to guest signaling by ensuring:
@@ -326,16 +328,11 @@ void vmbus_on_event(unsigned long data)
 		 *    state is set we check to see if additional packets are
 		 *    available to read. In this case we repeat the process.
 		 */
+		if (hv_end_read(&channel->inbound) != 0) {
+			hv_begin_read(&channel->inbound);
 
-		do {
-			if (read_state)
-				hv_begin_read(&channel->inbound);
-			channel->onchannel_callback(arg);
-			if (read_state)
-				bytes_to_read = hv_end_read(&channel->inbound);
-			else
-				bytes_to_read = 0;
-		} while (read_state && (bytes_to_read != 0));
+			tasklet_schedule(&channel->callback_event);
+		}
 	}
 }
 
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 098cd3dc7db2..3042eaa13062 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -435,8 +435,7 @@ static int util_probe(struct hv_device *dev,
 	 * Turn off batched reading for all util drivers before we open the
 	 * channel.
 	 */
-
-	set_channel_read_state(dev->channel, false);
+	set_channel_read_mode(dev->channel, HV_CALL_DIRECT);
 
 	hv_set_drvdata(dev, srv);
 
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index eaf1a10b0245..f7f6b9144b07 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -886,6 +886,18 @@ msg_handled:
 }
 
 
+/*
+ * Direct callback for channels using other deferred processing
+ */
+static void vmbus_channel_isr(struct vmbus_channel *channel)
+{
+	void (*callback_fn)(void *);
+
+	callback_fn = READ_ONCE(channel->onchannel_callback);
+	if (likely(callback_fn != NULL))
+		(*callback_fn)(channel->channel_callback_context);
+}
+
 /*
  * Schedule all channels with events pending
  */
@@ -927,9 +939,19 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
 
 		/* Find channel based on relid */
 		list_for_each_entry(channel, &hv_cpu->chan_list, percpu_list) {
-			if (channel->offermsg.child_relid == relid) {
-				tasklet_schedule(&channel->callback_event);
+			if (channel->offermsg.child_relid != relid)
+				continue;
+
+			switch (channel->callback_mode) {
+			case HV_CALL_ISR:
+				vmbus_channel_isr(channel);
 				break;
+
+			case HV_CALL_BATCHED:
+				hv_begin_read(&channel->inbound);
+				/* fallthrough */
+			case HV_CALL_DIRECT:
+				tasklet_schedule(&channel->callback_event);
 			}
 		}
 	}
diff --git a/drivers/uio/uio_hv_generic.c b/drivers/uio/uio_hv_generic.c
index 50958f167305..48d5327d38d4 100644
--- a/drivers/uio/uio_hv_generic.c
+++ b/drivers/uio/uio_hv_generic.c
@@ -125,7 +125,7 @@ hv_uio_probe(struct hv_device *dev,
 		goto fail;
 
 	dev->channel->inbound.ring_buffer->interrupt_mask = 1;
-	dev->channel->batched_reading = false;
+	set_channel_read_mode(dev->channel, HV_CALL_DIRECT);
 
 	/* Fill general uio info */
 	pdata->info.name = "uio_hv_generic";
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 69afc9337c0d..e5aac5c051f7 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -748,19 +748,21 @@ struct vmbus_channel {
 	void *channel_callback_context;
 
 	/*
-	 * A channel can be marked for efficient (batched)
-	 * reading:
-	 * If batched_reading is set to "true", we read until the
-	 * channel is empty and hold off interrupts from the host
-	 * during the entire read process.
-	 * If batched_reading is set to "false", the client is not
-	 * going to perform batched reading.
-	 *
-	 * By default we will enable batched reading; specific
-	 * drivers that don't want this behavior can turn it off.
+	 * A channel can be marked for one of three modes of reading:
+	 *   BATCHED - callback called from taslket and should read
+	 *            channel until empty. Interrupts from the host
+	 *            are masked while read is in process (default).
+	 *   DIRECT - callback called from tasklet (softirq).
+	 *   ISR - callback called in interrupt context and must
+	 *         invoke its own deferred processing.
+	 *         Host interrupts are disabled and must be re-enabled
+	 *         when ring is empty.
 	 */
-
-	bool batched_reading;
+	enum hv_callback_mode {
+		HV_CALL_BATCHED,
+		HV_CALL_DIRECT,
+		HV_CALL_ISR
+	} callback_mode;
 
 	bool is_dedicated_interrupt;
 	struct hv_input_signal_event_buffer sig_buf;
@@ -910,9 +912,10 @@ static inline void set_channel_affinity_state(struct vmbus_channel *c,
 	c->affinity_policy = policy;
 }
 
-static inline void set_channel_read_state(struct vmbus_channel *c, bool state)
+static inline void set_channel_read_mode(struct vmbus_channel *c,
+					enum hv_callback_mode mode)
 {
-	c->batched_reading = state;
+	c->callback_mode = mode;
 }
 
 static inline void set_per_channel_state(struct vmbus_channel *c, void *s)
-- 
cgit v1.2.3