156 files changed, 12430 insertions, 2800 deletions
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-roccat-pyra b/Documentation/ABI/testing/sysfs-driver-hid-roccat-pyra
new file mode 100644
index 000000000000..ad1125b02ff4
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-hid-roccat-pyra
@@ -0,0 +1,98 @@
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/actual_cpi
+Date:		August 2010
+Contact:	Stefan Achatz <erazor_de@users.sourceforge.net>
+Description:	It is possible to switch the cpi setting of the mouse with the
+		press of a button.
+		When read, this file returns the raw number of the actual cpi
+		setting reported by the mouse. This number has to be further
+		processed to receive the real dpi value.
+
+		VALUE DPI
+		1     400
+		2     800
+		4     1600
+
+		This file is readonly.
+
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/actual_profile
+Date:		August 2010
+Contact:	Stefan Achatz <erazor_de@users.sourceforge.net>
+Description:	When read, this file returns the number of the actual profile in
+		range 0-4.
+		This file is readonly.
+
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/firmware_version
+Date:		August 2010
+Contact:	Stefan Achatz <erazor_de@users.sourceforge.net>
+Description:	When read, this file returns the raw integer version number of the
+		firmware reported by the mouse. Using the integer value eases
+		further usage in other programs. To receive the real version
+		number the decimal point has to be shifted 2 positions to the
+		left. E.g. a returned value of 138 means 1.38
+		This file is readonly.
+
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/profile_settings
+Date:		August 2010
+Contact:	Stefan Achatz <erazor_de@users.sourceforge.net>
+Description:	The mouse can store 5 profiles which can be switched by the
+		press of a button. A profile is split in settings and buttons.
+		profile_settings holds informations like resolution, sensitivity
+		and light effects.
+		When written, this file lets one write the respective profile
+		settings back to the mouse. The data has to be 13 bytes long.
+		The mouse will reject invalid data.
+		Which profile to write is determined by the profile number
+		contained in the data.
+		This file is writeonly.
+
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/profile[1-5]_settings
+Date:		August 2010
+Contact:	Stefan Achatz <erazor_de@users.sourceforge.net>
+Description:	The mouse can store 5 profiles which can be switched by the
+		press of a button. A profile is split in settings and buttons.
+		profile_settings holds informations like resolution, sensitivity
+		and light effects.
+		When read, these files return the respective profile settings.
+		The returned data is 13 bytes in size.
+		This file is readonly.
+
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/profile_buttons
+Date:		August 2010
+Contact:	Stefan Achatz <erazor_de@users.sourceforge.net>
+Description:	The mouse can store 5 profiles which can be switched by the
+		press of a button. A profile is split in settings and buttons.
+		profile_buttons holds informations about button layout.
+		When written, this file lets one write the respective profile
+		buttons back to the mouse. The data has to be 19 bytes long.
+		The mouse will reject invalid data.
+		Which profile to write is determined by the profile number
+		contained in the data.
+		This file is writeonly.
+
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/profile[1-5]_buttons
+Date:		August 2010
+Contact:	Stefan Achatz <erazor_de@users.sourceforge.net>
+Description:	The mouse can store 5 profiles which can be switched by the
+		press of a button. A profile is split in settings and buttons.
+		profile_buttons holds informations about button layout.
+		When read, these files return the respective profile buttons.
+		The returned data is 19 bytes in size.
+		This file is readonly.
+
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/startup_profile
+Date:		August 2010
+Contact:	Stefan Achatz <erazor_de@users.sourceforge.net>
+Description:	The integer value of this attribute ranges from 0-4.
+                When read, this attribute returns the number of the profile
+                that's active when the mouse is powered on.
+		This file is readonly.
+
+What:		/sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/settings
+Date:		August 2010
+Contact:	Stefan Achatz <erazor_de@users.sourceforge.net>
+Description:	When read, this file returns the settings stored in the mouse.
+		The size of the data is 3 bytes and holds information on the
+		startup_profile.
+		When written, this file lets write settings back to the mouse.
+		The data has to be 3 bytes long. The mouse will reject invalid
+		data.
diff --git a/Documentation/input/ntrig.txt b/Documentation/input/ntrig.txt
new file mode 100644
index 000000000000..be1fd981f73f
--- /dev/null
+++ b/Documentation/input/ntrig.txt
@@ -0,0 +1,126 @@
+N-Trig touchscreen Driver
+-------------------------
+	Copyright (c) 2008-2010 Rafi Rubin <rafi@seas.upenn.edu>
+	Copyright (c) 2009-2010 Stephane Chatty
+
+This driver provides support for N-Trig pen and multi-touch sensors.  Single
+and multi-touch events are translated to the appropriate protocols for
+the hid and input systems.  Pen events are sufficiently hid compliant and
+are left to the hid core.  The driver also provides additional filtering
+and utility functions accessible with sysfs and module parameters.
+
+This driver has been reported to work properly with multiple N-Trig devices
+attached.
+
+
+Parameters
+----------
+
+Note: values set at load time are global and will apply to all applicable
+devices.  Adjusting parameters with sysfs will override the load time values,
+but only for that one device.
+
+The following parameters are used to configure filters to reduce noise:
+
+activate_slack		number of fingers to ignore before processing events
+
+activation_height	size threshold to activate immediately
+activation_width
+
+min_height		size threshold bellow which fingers are ignored
+min_width		both to decide activation and during activity
+
+deactivate_slack	the number of "no contact" frames to ignore before
+			propagating the end of activity events
+
+When the last finger is removed from the device, it sends a number of empty
+frames.  By holding off on deactivation for a few frames we can tolerate false
+erroneous disconnects, where the sensor may mistakenly not detect a finger that
+is still present.  Thus deactivate_slack addresses problems where a users might
+see breaks in lines during drawing, or drop an object during a long drag.
+
+
+Additional sysfs items
+----------------------
+
+These nodes just provide easy access to the ranges reported by the device.
+sensor_logical_height	the range for positions reported during activity
+sensor_logical_width
+
+sensor_physical_height	internal ranges not used for normal events but
+sensor_physical_width	useful for tuning
+
+All N-Trig devices with product id of 1 report events in the ranges of
+X: 0-9600
+Y: 0-7200
+However not all of these devices have the same physical dimensions.  Most
+seem to be 12" sensors (Dell Latitude XT and XT2 and the HP TX2), and
+at least one model (Dell Studio 17) has a 17" sensor.  The ratio of physical
+to logical sizes is used to adjust the size based filter parameters.
+
+
+Filtering
+---------
+
+With the release of the early multi-touch firmwares it became increasingly
+obvious that these sensors were prone to erroneous events.  Users reported
+seeing both inappropriately dropped contact and ghosts, contacts reported
+where no finger was actually touching the screen.
+
+Deactivation slack helps prevent dropped contact for single touch use, but does
+not address the problem of dropping one of more contacts while other contacts
+are still active.  Drops in the multi-touch context require additional
+processing and should be handled in tandem with tacking.
+
+As observed ghost contacts are similar to actual use of the sensor, but they
+seem to have different profiles.  Ghost activity typically shows up as small
+short lived touches.  As such, I assume that the longer the continuous stream
+of events the more likely those events are from a real contact, and that the
+larger the size of each contact the more likely it is real.  Balancing the
+goals of preventing ghosts and accepting real events quickly (to minimize
+user observable latency), the filter accumulates confidence for incoming
+events until it hits thresholds and begins propagating.  In the interest in
+minimizing stored state as well as the cost of operations to make a decision,
+I've kept that decision simple.
+
+Time is measured in terms of the number of fingers reported, not frames since
+the probability of multiple simultaneous ghosts is expected to drop off
+dramatically with increasing numbers.  Rather than accumulate weight as a
+function of size, I just use it as a binary threshold.  A sufficiently large
+contact immediately overrides the waiting period and leads to activation.
+
+Setting the activation size thresholds to large values will result in deciding
+primarily on activation slack.  If you see longer lived ghosts, turning up the
+activation slack while reducing the size thresholds may suffice to eliminate
+the ghosts while keeping the screen quite responsive to firm taps.
+
+Contacts continue to be filtered with min_height and min_width even after
+the initial activation filter is satisfied.  The intent is to provide
+a mechanism for filtering out ghosts in the form of an extra finger while
+you actually are using the screen.  In practice this sort of ghost has
+been far less problematic or relatively rare and I've left the defaults
+set to 0 for both parameters, effectively turning off that filter.
+
+I don't know what the optimal values are for these filters.  If the defaults
+don't work for you, please play with the parameters.  If you do find other
+values more comfortable, I would appreciate feedback.
+
+The calibration of these devices does drift over time.  If ghosts or contact
+dropping worsen and interfere with the normal usage of your device, try
+recalibrating it.
+
+
+Calibration
+-----------
+
+The N-Trig windows tools provide calibration and testing routines.  Also an
+unofficial unsupported set of user space tools including a calibrator is
+available at:
+http://code.launchpad.net/~rafi-seas/+junk/ntrig_calib
+
+
+Tracking
+--------
+
+As of yet, all tested N-Trig firmwares do not track fingers.  When multiple
+contacts are active they seem to be sorted primarily by Y position.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4cd8b86e00ea..9533af74a127 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1131,9 +1131,13 @@ and is between 256 and 4096 characters. It is defined in the file
 	kvm.oos_shadow=	[KVM] Disable out-of-sync shadow paging.
 			Default is 1 (enabled)
 
-	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
+	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
+			KVM MMU at runtime.
 			Default is 0 (off)
 
+	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
+			Default is 1 (enabled)
+
 	kvm-amd.npt=	[KVM,AMD] Disable nested paging (virtualized MMU)
 			for all guests.
 			Default is 1 (enabled) if in 64bit or 32bit-PAE mode
@@ -1698,6 +1702,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	nojitter	[IA64] Disables jitter checking for ITC timers.
 
+	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 5f5b64982b1a..b336266bea5e 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -320,13 +320,13 @@ struct kvm_translation {
 4.15 KVM_INTERRUPT
 
 Capability: basic
-Architectures: x86
+Architectures: x86, ppc
 Type: vcpu ioctl
 Parameters: struct kvm_interrupt (in)
 Returns: 0 on success, -1 on error
 
 Queues a hardware interrupt vector to be injected.  This is only
-useful if in-kernel local APIC is not used.
+useful if in-kernel local APIC or equivalent is not used.
 
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
@@ -334,8 +334,37 @@ struct kvm_interrupt {
 	__u32 irq;
 };
 
+X86:
+
 Note 'irq' is an interrupt vector, not an interrupt pin or line.
 
+PPC:
+
+Queues an external interrupt to be injected. This ioctl is overleaded
+with 3 different irq values:
+
+a) KVM_INTERRUPT_SET
+
+  This injects an edge type external interrupt into the guest once it's ready
+  to receive interrupts. When injected, the interrupt is done.
+
+b) KVM_INTERRUPT_UNSET
+
+  This unsets any pending interrupt.
+
+  Only available with KVM_CAP_PPC_UNSET_IRQ.
+
+c) KVM_INTERRUPT_SET_LEVEL
+
+  This injects a level type external interrupt into the guest context. The
+  interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET
+  is triggered.
+
+  Only available with KVM_CAP_PPC_IRQ_LEVEL.
+
+Note that any value for 'irq' other than the ones stated above is invalid
+and incurs unexpected behavior.
+
 4.16 KVM_DEBUG_GUEST
 
 Capability: basic
@@ -1013,8 +1042,9 @@ number is just right, the 'nent' field is adjusted to the number of valid
 entries in the 'entries' array, which is then filled.
 
 The entries returned are the host cpuid as returned by the cpuid instruction,
-with unknown or unsupported features masked out.  The fields in each entry
-are defined as follows:
+with unknown or unsupported features masked out.  Some features (for example,
+x2apic), may not be present in the host cpu, but are exposed by kvm if it can
+emulate them efficiently. The fields in each entry are defined as follows:
 
   function: the eax value used to obtain the entry
   index: the ecx value used to obtain the entry (for entries that are
@@ -1032,6 +1062,29 @@ are defined as follows:
    eax, ebx, ecx, edx: the values returned by the cpuid instruction for
          this function/index combination
 
+4.46 KVM_PPC_GET_PVINFO
+
+Capability: KVM_CAP_PPC_GET_PVINFO
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_ppc_pvinfo (out)
+Returns: 0 on success, !0 on error
+
+struct kvm_ppc_pvinfo {
+	__u32 flags;
+	__u32 hcall[4];
+	__u8  pad[108];
+};
+
+This ioctl fetches PV specific information that need to be passed to the guest
+using the device tree or other means from vm context.
+
+For now the only implemented piece of information distributed here is an array
+of 4 instructions that make up a hypercall.
+
+If any additional field gets added to this structure later on, a bit for that
+additional piece of information will be set in the flags bitmap.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt
new file mode 100644
index 000000000000..a7f2244b3be9
--- /dev/null
+++ b/Documentation/kvm/ppc-pv.txt
@@ -0,0 +1,196 @@
+The PPC KVM paravirtual interface
+=================================
+
+The basic execution principle by which KVM on PowerPC works is to run all kernel
+space code in PR=1 which is user space. This way we trap all privileged
+instructions and can emulate them accordingly.
+
+Unfortunately that is also the downfall. There are quite some privileged
+instructions that needlessly return us to the hypervisor even though they
+could be handled differently.
+
+This is what the PPC PV interface helps with. It takes privileged instructions
+and transforms them into unprivileged ones with some help from the hypervisor.
+This cuts down virtualization costs by about 50% on some of my benchmarks.
+
+The code for that interface can be found in arch/powerpc/kernel/kvm*
+
+Querying for existence
+======================
+
+To find out if we're running on KVM or not, we leverage the device tree. When
+Linux is running on KVM, a node /hypervisor exists. That node contains a
+compatible property with the value "linux,kvm".
+
+Once you determined you're running under a PV capable KVM, you can now use
+hypercalls as described below.
+
+KVM hypercalls
+==============
+
+Inside the device tree's /hypervisor node there's a property called
+'hypercall-instructions'. This property contains at most 4 opcodes that make
+up the hypercall. To call a hypercall, just call these instructions.
+
+The parameters are as follows:
+
+	Register	IN			OUT
+
+	r0		-			volatile
+	r3		1st parameter		Return code
+	r4		2nd parameter		1st output value
+	r5		3rd parameter		2nd output value
+	r6		4th parameter		3rd output value
+	r7		5th parameter		4th output value
+	r8		6th parameter		5th output value
+	r9		7th parameter		6th output value
+	r10		8th parameter		7th output value
+	r11		hypercall number	8th output value
+	r12		-			volatile
+
+Hypercall definitions are shared in generic code, so the same hypercall numbers
+apply for x86 and powerpc alike with the exception that each KVM hypercall
+also needs to be ORed with the KVM vendor code which is (42 << 16).
+
+Return codes can be as follows:
+
+	Code		Meaning
+
+	0		Success
+	12		Hypercall not implemented
+	<0		Error
+
+The magic page
+==============
+
+To enable communication between the hypervisor and guest there is a new shared
+page that contains parts of supervisor visible register state. The guest can
+map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE.
+
+With this hypercall issued the guest always gets the magic page mapped at the
+desired location in effective and physical address space. For now, we always
+map the page to -4096. This way we can access it using absolute load and store
+functions. The following instruction reads the first field of the magic page:
+
+	ld	rX, -4096(0)
+
+The interface is designed to be extensible should there be need later to add
+additional registers to the magic page. If you add fields to the magic page,
+also define a new hypercall feature to indicate that the host can give you more
+registers. Only if the host supports the additional features, make use of them.
+
+The magic page has the following layout as described in
+arch/powerpc/include/asm/kvm_para.h:
+
+struct kvm_vcpu_arch_shared {
+	__u64 scratch1;
+	__u64 scratch2;
+	__u64 scratch3;
+	__u64 critical;		/* Guest may not get interrupts if == r1 */
+	__u64 sprg0;
+	__u64 sprg1;
+	__u64 sprg2;
+	__u64 sprg3;
+	__u64 srr0;
+	__u64 srr1;
+	__u64 dar;
+	__u64 msr;
+	__u32 dsisr;
+	__u32 int_pending;	/* Tells the guest if we have an interrupt */
+};
+
+Additions to the page must only occur at the end. Struct fields are always 32
+or 64 bit aligned, depending on them being 32 or 64 bit wide respectively.
+
+Magic page features
+===================
+
+When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE,
+a second return value is passed to the guest. This second return value contains
+a bitmap of available features inside the magic page.
+
+The following enhancements to the magic page are currently available:
+
+  KVM_MAGIC_FEAT_SR		Maps SR registers r/w in the magic page
+
+For enhanced features in the magic page, please check for the existence of the
+feature before using them!
+
+MSR bits
+========
+
+The MSR contains bits that require hypervisor intervention and bits that do
+not require direct hypervisor intervention because they only get interpreted
+when entering the guest or don't have any impact on the hypervisor's behavior.
+
+The following bits are safe to be set inside the guest:
+
+  MSR_EE
+  MSR_RI
+  MSR_CR
+  MSR_ME
+
+If any other bit changes in the MSR, please still use mtmsr(d).
+
+Patched instructions
+====================
+
+The "ld" and "std" instructions are transormed to "lwz" and "stw" instructions
+respectively on 32 bit systems with an added offset of 4 to accomodate for big
+endianness.
+
+The following is a list of mapping the Linux kernel performs when running as
+guest. Implementing any of those mappings is optional, as the instruction traps
+also act on the shared page. So calling privileged instructions still works as
+before.
+
+From			To
+====			==
+
+mfmsr	rX		ld	rX, magic_page->msr
+mfsprg	rX, 0		ld	rX, magic_page->sprg0
+mfsprg	rX, 1		ld	rX, magic_page->sprg1
+mfsprg	rX, 2		ld	rX, magic_page->sprg2
+mfsprg	rX, 3		ld	rX, magic_page->sprg3
+mfsrr0	rX		ld	rX, magic_page->srr0
+mfsrr1	rX		ld	rX, magic_page->srr1
+mfdar	rX		ld	rX, magic_page->dar
+mfdsisr	rX		lwz	rX, magic_page->dsisr
+
+mtmsr	rX		std	rX, magic_page->msr
+mtsprg	0, rX		std	rX, magic_page->sprg0
+mtsprg	1, rX		std	rX, magic_page->sprg1
+mtsprg	2, rX		std	rX, magic_page->sprg2
+mtsprg	3, rX		std	rX, magic_page->sprg3
+mtsrr0	rX		std	rX, magic_page->srr0
+mtsrr1	rX		std	rX, magic_page->srr1
+mtdar	rX		std	rX, magic_page->dar
+mtdsisr	rX		stw	rX, magic_page->dsisr
+
+tlbsync			nop
+
+mtmsrd	rX, 0		b	<special mtmsr section>
+mtmsr	rX		b	<special mtmsr section>
+
+mtmsrd	rX, 1		b	<special mtmsrd section>
+
+[Book3S only]
+mtsrin	rX, rY		b	<special mtsrin section>
+
+[BookE only]
+wrteei	[0|1]		b	<special wrteei section>
+
+
+Some instructions require more logic to determine what's going on than a load
+or store instruction can deliver. To enable patching of those, we keep some
+RAM around where we can live translate instructions to. What happens is the
+following:
+
+	1) copy emulation code to memory
+	2) patch that code to fit the emulated instruction
+	3) patch that code to return to the original pc + 4
+	4) patch the original instruction to branch to the new code
+
+That way we can inject an arbitrary amount of code as replacement for a single
+instruction. This allows us to check for pending interrupts when setting EE=1
+for example.
diff --git a/Documentation/kvm/timekeeping.txt b/Documentation/kvm/timekeeping.txt
new file mode 100644
index 000000000000..0c5033a58c9e
--- /dev/null
+++ b/Documentation/kvm/timekeeping.txt
@@ -0,0 +1,612 @@
+
+	Timekeeping Virtualization for X86-Based Architectures
+
+	Zachary Amsden <zamsden@redhat.com>
+	Copyright (c) 2010, Red Hat.  All rights reserved.
+
+1) Overview
+2) Timing Devices
+3) TSC Hardware
+4) Virtualization Problems
+
+=========================================================================
+
+1) Overview
+
+One of the most complicated parts of the X86 platform, and specifically,
+the virtualization of this platform is the plethora of timing devices available
+and the complexity of emulating those devices.  In addition, virtualization of
+time introduces a new set of challenges because it introduces a multiplexed
+division of time beyond the control of the guest CPU.
+
+First, we will describe the various timekeeping hardware available, then
+present some of the problems which arise and solutions available, giving
+specific recommendations for certain classes of KVM guests.
+
+The purpose of this document is to collect data and information relevant to
+timekeeping which may be difficult to find elsewhere, specifically,
+information relevant to KVM and hardware-based virtualization.
+
+=========================================================================
+
+2) Timing Devices
+
+First we discuss the basic hardware devices available.  TSC and the related
+KVM clock are special enough to warrant a full exposition and are described in
+the following section.
+
+2.1) i8254 - PIT
+
+One of the first timer devices available is the programmable interrupt timer,
+or PIT.  The PIT has a fixed frequency 1.193182 MHz base clock and three
+channels which can be programmed to deliver periodic or one-shot interrupts.
+These three channels can be configured in different modes and have individual
+counters.  Channel 1 and 2 were not available for general use in the original
+IBM PC, and historically were connected to control RAM refresh and the PC
+speaker.  Now the PIT is typically integrated as part of an emulated chipset
+and a separate physical PIT is not used.
+
+The PIT uses I/O ports 0x40 - 0x43.  Access to the 16-bit counters is done
+using single or multiple byte access to the I/O ports.  There are 6 modes
+available, but not all modes are available to all timers, as only timer 2
+has a connected gate input, required for modes 1 and 5.  The gate line is
+controlled by port 61h, bit 0, as illustrated in the following diagram.
+
+ --------------             ----------------
+|              |           |                |
+|  1.1932 MHz  |---------->| CLOCK      OUT | ---------> IRQ 0
+|    Clock     |   |       |                |
+ --------------    |    +->| GATE  TIMER 0  |
+                   |        ----------------
+                   |
+                   |        ----------------
+                   |       |                |
+                   |------>| CLOCK      OUT | ---------> 66.3 KHZ DRAM
+                   |       |                |            (aka /dev/null)
+                   |    +->| GATE  TIMER 1  |
+                   |        ----------------
+                   |
+                   |        ----------------
+                   |       |                |
+                   |------>| CLOCK      OUT | ---------> Port 61h, bit 5
+                           |                |      |
+Port 61h, bit 0 ---------->| GATE  TIMER 2  |       \_.----   ____
+                            ----------------         _|    )--|LPF|---Speaker
+                                                    / *----   \___/
+Port 61h, bit 1 -----------------------------------/
+
+The timer modes are now described.
+
+Mode 0: Single Timeout.   This is a one-shot software timeout that counts down
+ when the gate is high (always true for timers 0 and 1).  When the count
+ reaches zero, the output goes high.
+
+Mode 1: Triggered One-shot.  The output is intially set high.  When the gate
+ line is set high, a countdown is initiated (which does not stop if the gate is
+ lowered), during which the output is set low.  When the count reaches zero,
+ the output goes high.
+
+Mode 2: Rate Generator.  The output is initially set high.  When the countdown
+ reaches 1, the output goes low for one count and then returns high.  The value
+ is reloaded and the countdown automatically resumes.  If the gate line goes
+ low, the count is halted.  If the output is low when the gate is lowered, the
+ output automatically goes high (this only affects timer 2).
+
+Mode 3: Square Wave.   This generates a high / low square wave.  The count
+ determines the length of the pulse, which alternates between high and low
+ when zero is reached.  The count only proceeds when gate is high and is
+ automatically reloaded on reaching zero.  The count is decremented twice at
+ each clock to generate a full high / low cycle at the full periodic rate.
+ If the count is even, the clock remains high for N/2 counts and low for N/2
+ counts; if the clock is odd, the clock is high for (N+1)/2 counts and low
+ for (N-1)/2 counts.  Only even values are latched by the counter, so odd
+ values are not observed when reading.  This is the intended mode for timer 2,
+ which generates sine-like tones by low-pass filtering the square wave output.
+
+Mode 4: Software Strobe.  After programming this mode and loading the counter,
+ the output remains high until the counter reaches zero.  Then the output
+ goes low for 1 clock cycle and returns high.  The counter is not reloaded.
+ Counting only occurs when gate is high.
+
+Mode 5: Hardware Strobe.  After programming and loading the counter, the
+ output remains high.  When the gate is raised, a countdown is initiated
+ (which does not stop if the gate is lowered).  When the counter reaches zero,
+ the output goes low for 1 clock cycle and then returns high.  The counter is
+ not reloaded.
+
+In addition to normal binary counting, the PIT supports BCD counting.  The
+command port, 0x43 is used to set the counter and mode for each of the three
+timers.
+
+PIT commands, issued to port 0x43, using the following bit encoding:
+
+Bit 7-4: Command (See table below)
+Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined)
+Bit 0  : Binary (0) / BCD (1)
+
+Command table:
+
+0000 - Latch Timer 0 count for port 0x40
+	sample and hold the count to be read in port 0x40;
+	additional commands ignored until counter is read;
+	mode bits ignored.
+
+0001 - Set Timer 0 LSB mode for port 0x40
+	set timer to read LSB only and force MSB to zero;
+	mode bits set timer mode
+
+0010 - Set Timer 0 MSB mode for port 0x40
+	set timer to read MSB only and force LSB to zero;
+	mode bits set timer mode
+
+0011 - Set Timer 0 16-bit mode for port 0x40
+	set timer to read / write LSB first, then MSB;
+	mode bits set timer mode
+
+0100 - Latch Timer 1 count for port 0x41 - as described above
+0101 - Set Timer 1 LSB mode for port 0x41 - as described above
+0110 - Set Timer 1 MSB mode for port 0x41 - as described above
+0111 - Set Timer 1 16-bit mode for port 0x41 - as described above
+
+1000 - Latch Timer 2 count for port 0x42 - as described above
+1001 - Set Timer 2 LSB mode for port 0x42 - as described above
+1010 - Set Timer 2 MSB mode for port 0x42 - as described above
+1011 - Set Timer 2 16-bit mode for port 0x42 as described above
+
+1101 - General counter latch
+	Latch combination of counters into corresponding ports
+	Bit 3 = Counter 2
+	Bit 2 = Counter 1
+	Bit 1 = Counter 0
+	Bit 0 = Unused
+
+1110 - Latch timer status
+	Latch combination of counter mode into corresponding ports
+	Bit 3 = Counter 2
+	Bit 2 = Counter 1
+	Bit 1 = Counter 0
+
+	The output of ports 0x40-0x42 following this command will be:
+
+	Bit 7 = Output pin
+	Bit 6 = Count loaded (0 if timer has expired)
+	Bit 5-4 = Read / Write mode
+	    01 = MSB only
+	    10 = LSB only
+	    11 = LSB / MSB (16-bit)
+	Bit 3-1 = Mode
+	Bit 0 = Binary (0) / BCD mode (1)
+
+2.2) RTC
+
+The second device which was available in the original PC was the MC146818 real
+time clock.  The original device is now obsolete, and usually emulated by the
+system chipset, sometimes by an HPET and some frankenstein IRQ routing.
+
+The RTC is accessed through CMOS variables, which uses an index register to
+control which bytes are read.  Since there is only one index register, read
+of the CMOS and read of the RTC require lock protection (in addition, it is
+dangerous to allow userspace utilities such as hwclock to have direct RTC
+access, as they could corrupt kernel reads and writes of CMOS memory).
+
+The RTC generates an interrupt which is usually routed to IRQ 8.  The interrupt
+can function as a periodic timer, an additional once a day alarm, and can issue
+interrupts after an update of the CMOS registers by the MC146818 is complete.
+The type of interrupt is signalled in the RTC status registers.
+
+The RTC will update the current time fields by battery power even while the
+system is off.  The current time fields should not be read while an update is
+in progress, as indicated in the status register.
+
+The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be
+programmed to a 32kHz divider if the RTC is to count seconds.
+
+This is the RAM map originally used for the RTC/CMOS:
+
+Location    Size    Description
+------------------------------------------
+00h         byte    Current second (BCD)
+01h         byte    Seconds alarm (BCD)
+02h         byte    Current minute (BCD)
+03h         byte    Minutes alarm (BCD)
+04h         byte    Current hour (BCD)
+05h         byte    Hours alarm (BCD)
+06h         byte    Current day of week (BCD)
+07h         byte    Current day of month (BCD)
+08h         byte    Current month (BCD)
+09h         byte    Current year (BCD)
+0Ah         byte    Register A
+                       bit 7   = Update in progress
+                       bit 6-4 = Divider for clock
+                                  000 = 4.194 MHz
+                                  001 = 1.049 MHz
+                                  010 = 32 kHz
+                                  10X = test modes
+                                  110 = reset / disable
+                                  111 = reset / disable
+                       bit 3-0 = Rate selection for periodic interrupt
+                                  000 = periodic timer disabled
+                                  001 = 3.90625 uS
+                                  010 = 7.8125 uS
+                                  011 = .122070 mS
+                                  100 = .244141 mS
+                                     ...
+                                 1101 = 125 mS
+                                 1110 = 250 mS
+                                 1111 = 500 mS
+0Bh         byte    Register B
+                       bit 7   = Run (0) / Halt (1)
+                       bit 6   = Periodic interrupt enable
+                       bit 5   = Alarm interrupt enable
+                       bit 4   = Update-ended interrupt enable
+                       bit 3   = Square wave interrupt enable
+                       bit 2   = BCD calendar (0) / Binary (1)
+                       bit 1   = 12-hour mode (0) / 24-hour mode (1)
+                       bit 0   = 0 (DST off) / 1 (DST enabled)
+OCh         byte    Register C (read only)
+                       bit 7   = interrupt request flag (IRQF)
+                       bit 6   = periodic interrupt flag (PF)
+                       bit 5   = alarm interrupt flag (AF)
+                       bit 4   = update interrupt flag (UF)
+                       bit 3-0 = reserved
+ODh         byte    Register D (read only)
+                       bit 7   = RTC has power
+                       bit 6-0 = reserved
+32h         byte    Current century BCD (*)
+  (*) location vendor specific and now determined from ACPI global tables
+
+2.3) APIC
+
+On Pentium and later processors, an on-board timer is available to each CPU
+as part of the Advanced Programmable Interrupt Controller.  The APIC is
+accessed through memory-mapped registers and provides interrupt service to each
+CPU, used for IPIs and local timer interrupts.
+
+Although in theory the APIC is a safe and stable source for local interrupts,
+in practice, many bugs and glitches have occurred due to the special nature of
+the APIC CPU-local memory-mapped hardware.  Beware that CPU errata may affect
+the use of the APIC and that workarounds may be required.  In addition, some of
+these workarounds pose unique constraints for virtualization - requiring either
+extra overhead incurred from extra reads of memory-mapped I/O or additional
+functionality that may be more computationally expensive to implement.
+
+Since the APIC is documented quite well in the Intel and AMD manuals, we will
+avoid repetition of the detail here.  It should be pointed out that the APIC
+timer is programmed through the LVT (local vector timer) register, is capable
+of one-shot or periodic operation, and is based on the bus clock divided down
+by the programmable divider register.
+
+2.4) HPET
+
+HPET is quite complex, and was originally intended to replace the PIT / RTC
+support of the X86 PC.  It remains to be seen whether that will be the case, as
+the de facto standard of PC hardware is to emulate these older devices.  Some
+systems designated as legacy free may support only the HPET as a hardware timer
+device.
+
+The HPET spec is rather loose and vague, requiring at least 3 hardware timers,
+but allowing implementation freedom to support many more.  It also imposes no
+fixed rate on the timer frequency, but does impose some extremal values on
+frequency, error and slew.
+
+In general, the HPET is recommended as a high precision (compared to PIT /RTC)
+time source which is independent of local variation (as there is only one HPET
+in any given system).  The HPET is also memory-mapped, and its presence is
+indicated through ACPI tables by the BIOS.
+
+Detailed specification of the HPET is beyond the current scope of this
+document, as it is also very well documented elsewhere.
+
+2.5) Offboard Timers
+
+Several cards, both proprietary (watchdog boards) and commonplace (e1000) have
+timing chips built into the cards which may have registers which are accessible
+to kernel or user drivers.  To the author's knowledge, using these to generate
+a clocksource for a Linux or other kernel has not yet been attempted and is in
+general frowned upon as not playing by the agreed rules of the game.  Such a
+timer device would require additional support to be virtualized properly and is
+not considered important at this time as no known operating system does this.
+
+=========================================================================
+
+3) TSC Hardware
+
+The TSC or time stamp counter is relatively simple in theory; it counts
+instruction cycles issued by the processor, which can be used as a measure of
+time.  In practice, due to a number of problems, it is the most complicated
+timekeeping device to use.
+
+The TSC is represented internally as a 64-bit MSR which can be read with the
+RDMSR, RDTSC, or RDTSCP (when available) instructions.  In the past, hardware
+limitations made it possible to write the TSC, but generally on old hardware it
+was only possible to write the low 32-bits of the 64-bit counter, and the upper
+32-bits of the counter were cleared.  Now, however, on Intel processors family
+0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction
+has been lifted and all 64-bits are writable.  On AMD systems, the ability to
+write the TSC MSR is not an architectural guarantee.
+
+The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by
+means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access.
+
+Some vendors have implemented an additional instruction, RDTSCP, which returns
+atomically not just the TSC, but an indicator which corresponds to the
+processor number.  This can be used to index into an array of TSC variables to
+determine offset information in SMP systems where TSCs are not synchronized.
+The presence of this instruction must be determined by consulting CPUID feature
+bits.
+
+Both VMX and SVM provide extension fields in the virtualization hardware which
+allows the guest visible TSC to be offset by a constant.  Newer implementations
+promise to allow the TSC to additionally be scaled, but this hardware is not
+yet widely available.
+
+3.1) TSC synchronization
+
+The TSC is a CPU-local clock in most implementations.  This means, on SMP
+platforms, the TSCs of different CPUs may start at different times depending
+on when the CPUs are powered on.  Generally, CPUs on the same die will share
+the same clock, however, this is not always the case.
+
+The BIOS may attempt to resynchronize the TSCs during the poweron process and
+the operating system or other system software may attempt to do this as well.
+Several hardware limitations make the problem worse - if it is not possible to
+write the full 64-bits of the TSC, it may be impossible to match the TSC in
+newly arriving CPUs to that of the rest of the system, resulting in
+unsynchronized TSCs.  This may be done by BIOS or system software, but in
+practice, getting a perfectly synchronized TSC will not be possible unless all
+values are read from the same clock, which generally only is possible on single
+socket systems or those with special hardware support.
+
+3.2) TSC and CPU hotplug
+
+As touched on already, CPUs which arrive later than the boot time of the system
+may not have a TSC value that is synchronized with the rest of the system.
+Either system software, BIOS, or SMM code may actually try to establish the TSC
+to a value matching the rest of the system, but a perfect match is usually not
+a guarantee.  This can have the effect of bringing a system from a state where
+TSC is synchronized back to a state where TSC synchronization flaws, however
+small, may be exposed to the OS and any virtualization environment.
+
+3.3) TSC and multi-socket / NUMA
+
+Multi-socket systems, especially large multi-socket systems are likely to have
+individual clocksources rather than a single, universally distributed clock.
+Since these clocks are driven by different crystals, they will not have
+perfectly matched frequency, and temperature and electrical variations will
+cause the CPU clocks, and thus the TSCs to drift over time.  Depending on the
+exact clock and bus design, the drift may or may not be fixed in absolute
+error, and may accumulate over time.
+
+In addition, very large systems may deliberately slew the clocks of individual
+cores.  This technique, known as spread-spectrum clocking, reduces EMI at the
+clock frequency and harmonics of it, which may be required to pass FCC
+standards for telecommunications and computer equipment.
+
+It is recommended not to trust the TSCs to remain synchronized on NUMA or
+multiple socket systems for these reasons.
+
+3.4) TSC and C-states
+
+C-states, or idling states of the processor, especially C1E and deeper sleep
+states may be problematic for TSC as well.  The TSC may stop advancing in such
+a state, resulting in a TSC which is behind that of other CPUs when execution
+is resumed.  Such CPUs must be detected and flagged by the operating system
+based on CPU and chipset identifications.
+
+The TSC in such a case may be corrected by catching it up to a known external
+clocksource.
+
+3.5) TSC frequency change / P-states
+
+To make things slightly more interesting, some CPUs may change frequency.  They
+may or may not run the TSC at the same rate, and because the frequency change
+may be staggered or slewed, at some points in time, the TSC rate may not be
+known other than falling within a range of values.  In this case, the TSC will
+not be a stable time source, and must be calibrated against a known, stable,
+external clock to be a usable source of time.
+
+Whether the TSC runs at a constant rate or scales with the P-state is model
+dependent and must be determined by inspecting CPUID, chipset or vendor
+specific MSR fields.
+
+In addition, some vendors have known bugs where the P-state is actually
+compensated for properly during normal operation, but when the processor is
+inactive, the P-state may be raised temporarily to service cache misses from
+other processors.  In such cases, the TSC on halted CPUs could advance faster
+than that of non-halted processors.  AMD Turion processors are known to have
+this problem.
+
+3.6) TSC and STPCLK / T-states
+
+External signals given to the processor may also have the effect of stopping
+the TSC.  This is typically done for thermal emergency power control to prevent
+an overheating condition, and typically, there is no way to detect that this
+condition has happened.
+
+3.7) TSC virtualization - VMX
+
+VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP
+instructions, which is enough for full virtualization of TSC in any manner.  In
+addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET
+field specified in the VMCS.  Special instructions must be used to read and
+write the VMCS field.
+
+3.8) TSC virtualization - SVM
+
+SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP
+instructions, which is enough for full virtualization of TSC in any manner.  In
+addition, SVM allows passing through the host TSC plus an additional offset
+field specified in the SVM control block.
+
+3.9) TSC feature bits in Linux
+
+In summary, there is no way to guarantee the TSC remains in perfect
+synchronization unless it is explicitly guaranteed by the architecture.  Even
+if so, the TSCs in multi-sockets or NUMA systems may still run independently
+despite being locally consistent.
+
+The following feature bits are used by Linux to signal various TSC attributes,
+but they can only be taken to be meaningful for UP or single node systems.
+
+X86_FEATURE_TSC 		: The TSC is available in hardware
+X86_FEATURE_RDTSCP		: The RDTSCP instruction is available
+X86_FEATURE_CONSTANT_TSC 	: The TSC rate is unchanged with P-states
+X86_FEATURE_NONSTOP_TSC		: The TSC does not stop in C-states
+X86_FEATURE_TSC_RELIABLE	: TSC sync checks are skipped (VMware)
+
+4) Virtualization Problems
+
+Timekeeping is especially problematic for virtualization because a number of
+challenges arise.  The most obvious problem is that time is now shared between
+the host and, potentially, a number of virtual machines.  Thus the virtual
+operating system does not run with 100% usage of the CPU, despite the fact that
+it may very well make that assumption.  It may expect it to remain true to very
+exacting bounds when interrupt sources are disabled, but in reality only its
+virtual interrupt sources are disabled, and the machine may still be preempted
+at any time.  This causes problems as the passage of real time, the injection
+of machine interrupts and the associated clock sources are no longer completely
+synchronized with real time.
+
+This same problem can occur on native harware to a degree, as SMM mode may
+steal cycles from the naturally on X86 systems when SMM mode is used by the
+BIOS, but not in such an extreme fashion.  However, the fact that SMM mode may
+cause similar problems to virtualization makes it a good justification for
+solving many of these problems on bare metal.
+
+4.1) Interrupt clocking
+
+One of the most immediate problems that occurs with legacy operating systems
+is that the system timekeeping routines are often designed to keep track of
+time by counting periodic interrupts.  These interrupts may come from the PIT
+or the RTC, but the problem is the same: the host virtualization engine may not
+be able to deliver the proper number of interrupts per second, and so guest
+time may fall behind.  This is especially problematic if a high interrupt rate
+is selected, such as 1000 HZ, which is unfortunately the default for many Linux
+guests.
+
+There are three approaches to solving this problem; first, it may be possible
+to simply ignore it.  Guests which have a separate time source for tracking
+'wall clock' or 'real time' may not need any adjustment of their interrupts to
+maintain proper time.  If this is not sufficient, it may be necessary to inject
+additional interrupts into the guest in order to increase the effective
+interrupt rate.  This approach leads to complications in extreme conditions,
+where host load or guest lag is too much to compensate for, and thus another
+solution to the problem has risen: the guest may need to become aware of lost
+ticks and compensate for them internally.  Although promising in theory, the
+implementation of this policy in Linux has been extremely error prone, and a
+number of buggy variants of lost tick compensation are distributed across
+commonly used Linux systems.
+
+Windows uses periodic RTC clocking as a means of keeping time internally, and
+thus requires interrupt slewing to keep proper time.  It does use a low enough
+rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in
+practice.
+
+4.2) TSC sampling and serialization
+
+As the highest precision time source available, the cycle counter of the CPU
+has aroused much interest from developers.  As explained above, this timer has
+many problems unique to its nature as a local, potentially unstable and
+potentially unsynchronized source.  One issue which is not unique to the TSC,
+but is highlighted because of its very precise nature is sampling delay.  By
+definition, the counter, once read is already old.  However, it is also
+possible for the counter to be read ahead of the actual use of the result.
+This is a consequence of the superscalar execution of the instruction stream,
+which may execute instructions out of order.  Such execution is called
+non-serialized.  Forcing serialized execution is necessary for precise
+measurement with the TSC, and requires a serializing instruction, such as CPUID
+or an MSR read.
+
+Since CPUID may actually be virtualized by a trap and emulate mechanism, this
+serialization can pose a performance issue for hardware virtualization.  An
+accurate time stamp counter reading may therefore not always be available, and
+it may be necessary for an implementation to guard against "backwards" reads of
+the TSC as seen from other CPUs, even in an otherwise perfectly synchronized
+system.
+
+4.3) Timespec aliasing
+
+Additionally, this lack of serialization from the TSC poses another challenge
+when using results of the TSC when measured against another time source.  As
+the TSC is much higher precision, many possible values of the TSC may be read
+while another clock is still expressing the same value.
+
+That is, you may read (T,T+10) while external clock C maintains the same value.
+Due to non-serialized reads, you may actually end up with a range which
+fluctuates - from (T-1.. T+10).  Thus, any time calculated from a TSC, but
+calibrated against an external value may have a range of valid values.
+Re-calibrating this computation may actually cause time, as computed after the
+calibration, to go backwards, compared with time computed before the
+calibration.
+
+This problem is particularly pronounced with an internal time source in Linux,
+the kernel time, which is expressed in the theoretically high resolution
+timespec - but which advances in much larger granularity intervals, sometimes
+at the rate of jiffies, and possibly in catchup modes, at a much larger step.
+
+This aliasing requires care in the computation and recalibration of kvmclock
+and any other values derived from TSC computation (such as TSC virtualization
+itself).
+
+4.4) Migration
+
+Migration of a virtual machine raises problems for timekeeping in two ways.
+First, the migration itself may take time, during which interrupts cannot be
+delivered, and after which, the guest time may need to be caught up.  NTP may
+be able to help to some degree here, as the clock correction required is
+typically small enough to fall in the NTP-correctable window.
+
+An additional concern is that timers based off the TSC (or HPET, if the raw bus
+clock is exposed) may now be running at different rates, requiring compensation
+in some way in the hypervisor by virtualizing these timers.  In addition,
+migrating to a faster machine may preclude the use of a passthrough TSC, as a
+faster clock cannot be made visible to a guest without the potential of time
+advancing faster than usual.  A slower clock is less of a problem, as it can
+always be caught up to the original rate.  KVM clock avoids these problems by
+simply storing multipliers and offsets against the TSC for the guest to convert
+back into nanosecond resolution values.
+
+4.5) Scheduling
+
+Since scheduling may be based on precise timing and firing of interrupts, the
+scheduling algorithms of an operating system may be adversely affected by
+virtualization.  In theory, the effect is random and should be universally
+distributed, but in contrived as well as real scenarios (guest device access,
+causes of virtualization exits, possible context switch), this may not always
+be the case.  The effect of this has not been well studied.
+
+In an attempt to work around this, several implementations have provided a
+paravirtualized scheduler clock, which reveals the true amount of CPU time for
+which a virtual machine has been running.
+
+4.6) Watchdogs
+
+Watchdog timers, such as the lock detector in Linux may fire accidentally when
+running under hardware virtualization due to timer interrupts being delayed or
+misinterpretation of the passage of real time.  Usually, these warnings are
+spurious and can be ignored, but in some circumstances it may be necessary to
+disable such detection.
+
+4.7) Delays and precision timing
+
+Precise timing and delays may not be possible in a virtualized system.  This
+can happen if the system is controlling physical hardware, or issues delays to
+compensate for slower I/O to and from devices.  The first issue is not solvable
+in general for a virtualized system; hardware control software can't be
+adequately virtualized without a full real-time operating system, which would
+require an RT aware virtualization platform.
+
+The second issue may cause performance problems, but this is unlikely to be a
+significant issue.  In many cases these delays may be eliminated through
+configuration or paravirtualization.
+
+4.8) Covert channels and leaks
+
+In addition to the above problems, time information will inevitably leak to the
+guest about the host in anything but a perfect implementation of virtualized
+time.  This may allow the guest to infer the presence of a hypervisor (as in a
+red-pill type detection), and it may allow information to leak between guests
+by using CPU utilization itself as a signalling channel.  Preventing such
+problems would require completely isolated virtual time which may not track
+real time any longer.  This may be useful in certain security or QA contexts,
+but in general isn't recommended for real-world deployment scenarios.
diff --git a/MAINTAINERS b/MAINTAINERS
index 494e1a07366a..0e265fcc103b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4453,6 +4453,15 @@ L:	linux-i2c@vger.kernel.org
 S:	Maintained
 F:	drivers/i2c/busses/i2c-pasemi.c
 
+PADATA PARALLEL EXECUTION MECHANISM
+M:	Steffen Klassert <steffen.klassert@secunet.com>
+L:	linux-kernel@vger.kernel.org
+L:	linux-crypto@vger.kernel.org
+S:	Maintained
+F:	kernel/padata.c
+F:	include/linux/padata.h
+F:	Documentation/padata.txt
+
 PANASONIC LAPTOP ACPI EXTRAS DRIVER
 M:	Harald Welte <laforge@gnumonks.org>
 L:	platform-driver-x86@vger.kernel.org
@@ -4532,6 +4541,12 @@ S:	Maintained
 F:	drivers/leds/leds-pca9532.c
 F:	include/linux/leds-pca9532.h
 
+PCA9541 I2C BUS MASTER SELECTOR DRIVER
+M:	Guenter Roeck <guenter.roeck@ericsson.com>
+L:	linux-i2c@vger.kernel.org
+S:	Maintained
+F:	drivers/i2c/muxes/pca9541.c
+
 PCA9564/PCA9665 I2C BUS DRIVER
 M:	Wolfram Sang <w.sang@pengutronix.de>
 L:	linux-i2c@vger.kernel.org
@@ -4580,6 +4595,13 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/pcnet32.c
 
+PCRYPT PARALLEL CRYPTO ENGINE
+M:	Steffen Klassert <steffen.klassert@secunet.com>
+L:	linux-crypto@vger.kernel.org
+S:	Maintained
+F:	crypto/pcrypt.c
+F:	include/crypto/pcrypt.h
+
 PER-TASK DELAY ACCOUNTING
 M:	Balbir Singh <balbir@linux.vnet.ibm.com>
 S:	Maintained
diff --git a/arch/arm/mach-omap2/clock2420_data.c b/arch/arm/mach-omap2/clock2420_data.c
index 37d65d62ed8f..5f2066a6ba74 100644
--- a/arch/arm/mach-omap2/clock2420_data.c
+++ b/arch/arm/mach-omap2/clock2420_data.c
@@ -1838,7 +1838,7 @@ static struct omap_clk omap2420_clks[] = {
 	CLK(NULL,	"des_ick",	&des_ick,	CK_242X),
 	CLK("omap-sham",	"ick",	&sha_ick,	CK_242X),
 	CLK("omap_rng",	"ick",		&rng_ick,	CK_242X),
-	CLK(NULL,	"aes_ick",	&aes_ick,	CK_242X),
+	CLK("omap-aes",	"ick",	&aes_ick,	CK_242X),
 	CLK(NULL,	"pka_ick",	&pka_ick,	CK_242X),
 	CLK(NULL,	"usb_fck",	&usb_fck,	CK_242X),
 	CLK("musb_hdrc",	"fck",	&osc_ck,	CK_242X),
diff --git a/arch/arm/mach-omap2/clock2430_data.c b/arch/arm/mach-omap2/clock2430_data.c
index b33118fb6a87..701a1716019e 100644
--- a/arch/arm/mach-omap2/clock2430_data.c
+++ b/arch/arm/mach-omap2/clock2430_data.c
@@ -1926,7 +1926,7 @@ static struct omap_clk omap2430_clks[] = {
 	CLK(NULL,	"des_ick",	&des_ick,	CK_243X),
 	CLK("omap-sham",	"ick",	&sha_ick,	CK_243X),
 	CLK("omap_rng",	"ick",		&rng_ick,	CK_243X),
-	CLK(NULL,	"aes_ick",	&aes_ick,	CK_243X),
+	CLK("omap-aes",	"ick",	&aes_ick,	CK_243X),
 	CLK(NULL,	"pka_ick",	&pka_ick,	CK_243X),
 	CLK(NULL,	"usb_fck",	&usb_fck,	CK_243X),
 	CLK("musb_hdrc",	"ick",	&usbhs_ick,	CK_243X),
diff --git a/arch/arm/mach-omap2/clock3xxx_data.c b/arch/arm/mach-omap2/clock3xxx_data.c
index dfdce2d82779..c73906d17458 100644
--- a/arch/arm/mach-omap2/clock3xxx_data.c
+++ b/arch/arm/mach-omap2/clock3xxx_data.c
@@ -3288,7 +3288,7 @@ static struct omap_clk omap3xxx_clks[] = {
 	CLK(NULL,	"usbtll_ick",	&usbtll_ick,	CK_3430ES2 | CK_AM35XX),
 	CLK("mmci-omap-hs.2",	"ick",	&mmchs3_ick,	CK_3430ES2 | CK_AM35XX),
 	CLK(NULL,	"icr_ick",	&icr_ick,	CK_343X),
-	CLK(NULL,	"aes2_ick",	&aes2_ick,	CK_343X),
+	CLK("omap-aes",	"ick",	&aes2_ick,	CK_343X),
 	CLK("omap-sham",	"ick",	&sha12_ick,	CK_343X),
 	CLK(NULL,	"des2_ick",	&des2_ick,	CK_343X),
 	CLK("mmci-omap-hs.1",	"ick",	&mmchs2_ick,	CK_3XXX),
diff --git a/arch/arm/mach-omap2/devices.c b/arch/arm/mach-omap2/devices.c
index 2dbb265bedd4..b27e7cbb3f29 100644
--- a/arch/arm/mach-omap2/devices.c
+++ b/arch/arm/mach-omap2/devices.c
@@ -498,6 +498,76 @@ static void omap_init_sham(void)
 static inline void omap_init_sham(void) { }
 #endif
 
+#if defined(CONFIG_CRYPTO_DEV_OMAP_AES) || defined(CONFIG_CRYPTO_DEV_OMAP_AES_MODULE)
+
+#ifdef CONFIG_ARCH_OMAP24XX
+static struct resource omap2_aes_resources[] = {
+	{
+		.start	= OMAP24XX_SEC_AES_BASE,
+		.end	= OMAP24XX_SEC_AES_BASE + 0x4C,
+		.flags	= IORESOURCE_MEM,
+	},
+	{
+		.start	= OMAP24XX_DMA_AES_TX,
+		.flags	= IORESOURCE_DMA,
+	},
+	{
+		.start	= OMAP24XX_DMA_AES_RX,
+		.flags	= IORESOURCE_DMA,
+	}
+};
+static int omap2_aes_resources_sz = ARRAY_SIZE(omap2_aes_resources);
+#else
+#define omap2_aes_resources		NULL
+#define omap2_aes_resources_sz		0
+#endif
+
+#ifdef CONFIG_ARCH_OMAP34XX
+static struct resource omap3_aes_resources[] = {
+	{
+		.start	= OMAP34XX_SEC_AES_BASE,
+		.end	= OMAP34XX_SEC_AES_BASE + 0x4C,
+		.flags	= IORESOURCE_MEM,
+	},
+	{
+		.start	= OMAP34XX_DMA_AES2_TX,
+		.flags	= IORESOURCE_DMA,
+	},
+	{
+		.start	= OMAP34XX_DMA_AES2_RX,
+		.flags	= IORESOURCE_DMA,
+	}
+};
+static int omap3_aes_resources_sz = ARRAY_SIZE(omap3_aes_resources);
+#else
+#define omap3_aes_resources		NULL
+#define omap3_aes_resources_sz		0
+#endif
+
+static struct platform_device aes_device = {
+	.name		= "omap-aes",
+	.id		= -1,
+};
+
+static void omap_init_aes(void)
+{
+	if (cpu_is_omap24xx()) {
+		aes_device.resource = omap2_aes_resources;
+		aes_device.num_resources = omap2_aes_resources_sz;
+	} else if (cpu_is_omap34xx()) {
+		aes_device.resource = omap3_aes_resources;
+		aes_device.num_resources = omap3_aes_resources_sz;
+	} else {
+		pr_err("%s: platform not supported\n", __func__);
+		return;
+	}
+	platform_device_register(&aes_device);
+}
+
+#else
+static inline void omap_init_aes(void) { }
+#endif
+
 /*-------------------------------------------------------------------------*/
 
 #if defined(CONFIG_ARCH_OMAP3) || defined(CONFIG_ARCH_OMAP4)
@@ -854,6 +924,7 @@ static int __init omap2_init_devices(void)
 	omap_hdq_init();
 	omap_init_sti();
 	omap_init_sham();
+	omap_init_aes();
 	omap_init_vout();
 
 	return 0;
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index ee541cebcd78..c5f92a926a9a 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -25,5 +25,6 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
 #define kvm_apic_present(x) (true)
+#define kvm_lapic_enabled(x) (true)
 
 #endif
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 6c5547d82bbe..18ea6963ad77 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -86,5 +86,6 @@ struct kvm_guest_debug_arch {
 
 #define KVM_INTERRUPT_SET	-1U
 #define KVM_INTERRUPT_UNSET	-2U
+#define KVM_INTERRUPT_SET_LEVEL	-3U
 
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index c5ea4cda34b3..5b7504674397 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -58,6 +58,7 @@
 #define BOOK3S_INTERRUPT_INST_STORAGE	0x400
 #define BOOK3S_INTERRUPT_INST_SEGMENT	0x480
 #define BOOK3S_INTERRUPT_EXTERNAL	0x500
+#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL	0x501
 #define BOOK3S_INTERRUPT_ALIGNMENT	0x600
 #define BOOK3S_INTERRUPT_PROGRAM	0x700
 #define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
@@ -84,7 +85,8 @@
 #define BOOK3S_IRQPRIO_EXTERNAL			13
 #define BOOK3S_IRQPRIO_DECREMENTER		14
 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR	15
-#define BOOK3S_IRQPRIO_MAX			16
+#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL		16
+#define BOOK3S_IRQPRIO_MAX			17
 
 #define BOOK3S_HFLAG_DCBZ32			0x1
 #define BOOK3S_HFLAG_SLB			0x2
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 8274a2d43925..d62e703f1214 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -38,15 +38,6 @@ struct kvmppc_slb {
 	bool class	: 1;
 };
 
-struct kvmppc_sr {
-	u32 raw;
-	u32 vsid;
-	bool Ks		: 1;
-	bool Kp		: 1;
-	bool nx		: 1;
-	bool valid	: 1;
-};
-
 struct kvmppc_bat {
 	u64 raw;
 	u32 bepi;
@@ -69,6 +60,13 @@ struct kvmppc_sid_map {
 #define SID_MAP_NUM     (1 << SID_MAP_BITS)
 #define SID_MAP_MASK    (SID_MAP_NUM - 1)
 
+#ifdef CONFIG_PPC_BOOK3S_64
+#define SID_CONTEXTS	1
+#else
+#define SID_CONTEXTS	128
+#define VSID_POOL_SIZE	(SID_CONTEXTS * 16)
+#endif
+
 struct kvmppc_vcpu_book3s {
 	struct kvm_vcpu vcpu;
 	struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
@@ -79,20 +77,22 @@ struct kvmppc_vcpu_book3s {
 		u64 vsid;
 	} slb_shadow[64];
 	u8 slb_shadow_max;
-	struct kvmppc_sr sr[16];
 	struct kvmppc_bat ibat[8];
 	struct kvmppc_bat dbat[8];
 	u64 hid[6];
 	u64 gqr[8];
 	int slb_nr;
-	u32 dsisr;
 	u64 sdr1;
 	u64 hior;
 	u64 msr_mask;
-	u64 vsid_first;
 	u64 vsid_next;
+#ifdef CONFIG_PPC_BOOK3S_32
+	u32 vsid_pool[VSID_POOL_SIZE];
+#else
+	u64 vsid_first;
 	u64 vsid_max;
-	int context_id;
+#endif
+	int context_id[SID_CONTEXTS];
 	ulong prog_flags; /* flags to inject when giving a 700 trap */
 };
 
@@ -131,9 +131,10 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
-extern u32 kvmppc_trampoline_lowmem;
-extern u32 kvmppc_trampoline_enter;
+extern ulong kvmppc_trampoline_lowmem;
+extern ulong kvmppc_trampoline_enter;
 extern void kvmppc_rmcall(ulong srr0, ulong srr1);
 extern void kvmppc_load_up_fpu(void);
 extern void kvmppc_load_up_altivec(void);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index b0b23c007d6e..bba3b9b72a39 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/kvm_types.h>
+#include <linux/kvm_para.h>
 #include <asm/kvm_asm.h>
 
 #define KVM_MAX_VCPUS 1
@@ -41,12 +42,17 @@
 
 #define HPTEG_CACHE_NUM			(1 << 15)
 #define HPTEG_HASH_BITS_PTE		13
+#define HPTEG_HASH_BITS_PTE_LONG	12
 #define HPTEG_HASH_BITS_VPTE		13
 #define HPTEG_HASH_BITS_VPTE_LONG	5
 #define HPTEG_HASH_NUM_PTE		(1 << HPTEG_HASH_BITS_PTE)
+#define HPTEG_HASH_NUM_PTE_LONG		(1 << HPTEG_HASH_BITS_PTE_LONG)
 #define HPTEG_HASH_NUM_VPTE		(1 << HPTEG_HASH_BITS_VPTE)
 #define HPTEG_HASH_NUM_VPTE_LONG	(1 << HPTEG_HASH_BITS_VPTE_LONG)
 
+/* Physical Address Mask - allowed range of real mode RAM access */
+#define KVM_PAM			0x0fffffffffffffffULL
+
 struct kvm;
 struct kvm_run;
 struct kvm_vcpu;
@@ -159,8 +165,10 @@ struct kvmppc_mmu {
 
 struct hpte_cache {
 	struct hlist_node list_pte;
+	struct hlist_node list_pte_long;
 	struct hlist_node list_vpte;
 	struct hlist_node list_vpte_long;
+	struct rcu_head rcu_head;
 	u64 host_va;
 	u64 pfn;
 	ulong slot;
@@ -210,28 +218,20 @@ struct kvm_vcpu_arch {
 	u32 cr;
 #endif
 
-	ulong msr;
 #ifdef CONFIG_PPC_BOOK3S
 	ulong shadow_msr;
 	ulong hflags;
 	ulong guest_owned_ext;
 #endif
 	u32 mmucr;
-	ulong sprg0;
-	ulong sprg1;
-	ulong sprg2;
-	ulong sprg3;
 	ulong sprg4;
 	ulong sprg5;
 	ulong sprg6;
 	ulong sprg7;
-	ulong srr0;
-	ulong srr1;
 	ulong csrr0;
 	ulong csrr1;
 	ulong dsrr0;
 	ulong dsrr1;
-	ulong dear;
 	ulong esr;
 	u32 dec;
 	u32 decar;
@@ -290,12 +290,17 @@ struct kvm_vcpu_arch {
 	struct tasklet_struct tasklet;
 	u64 dec_jiffies;
 	unsigned long pending_exceptions;
+	struct kvm_vcpu_arch_shared *shared;
+	unsigned long magic_page_pa; /* phys addr to map the magic page to */
+	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
 #ifdef CONFIG_PPC_BOOK3S
 	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
+	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
 	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
 	struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
 	int hpte_cache_count;
+	spinlock_t mmu_lock;
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 2d48f6a63d0b..50533f9adf40 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -20,16 +20,153 @@
 #ifndef __POWERPC_KVM_PARA_H__
 #define __POWERPC_KVM_PARA_H__
 
+#include <linux/types.h>
+
+struct kvm_vcpu_arch_shared {
+	__u64 scratch1;
+	__u64 scratch2;
+	__u64 scratch3;
+	__u64 critical;		/* Guest may not get interrupts if == r1 */
+	__u64 sprg0;
+	__u64 sprg1;
+	__u64 sprg2;
+	__u64 sprg3;
+	__u64 srr0;
+	__u64 srr1;
+	__u64 dar;
+	__u64 msr;
+	__u32 dsisr;
+	__u32 int_pending;	/* Tells the guest if we have an interrupt */
+	__u32 sr[16];
+};
+
+#define KVM_SC_MAGIC_R0		0x4b564d21 /* "KVM!" */
+#define HC_VENDOR_KVM		(42 << 16)
+#define HC_EV_SUCCESS		0
+#define HC_EV_UNIMPLEMENTED	12
+
+#define KVM_FEATURE_MAGIC_PAGE	1
+
+#define KVM_MAGIC_FEAT_SR	(1 << 0)
+
 #ifdef __KERNEL__
 
+#ifdef CONFIG_KVM_GUEST
+
+#include <linux/of.h>
+
+static inline int kvm_para_available(void)
+{
+	struct device_node *hyper_node;
+
+	hyper_node = of_find_node_by_path("/hypervisor");
+	if (!hyper_node)
+		return 0;
+
+	if (!of_device_is_compatible(hyper_node, "linux,kvm"))
+		return 0;
+
+	return 1;
+}
+
+extern unsigned long kvm_hypercall(unsigned long *in,
+				   unsigned long *out,
+				   unsigned long nr);
+
+#else
+
 static inline int kvm_para_available(void)
 {
 	return 0;
 }
 
+static unsigned long kvm_hypercall(unsigned long *in,
+				   unsigned long *out,
+				   unsigned long nr)
+{
+	return HC_EV_UNIMPLEMENTED;
+}
+
+#endif
+
+static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+	unsigned long r;
+
+	r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	*r2 = out[0];
+
+	return r;
+}
+
+static inline long kvm_hypercall0(unsigned int nr)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+}
+
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+}
+
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+				  unsigned long p2)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	in[1] = p2;
+	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+}
+
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+				  unsigned long p2, unsigned long p3)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	in[1] = p2;
+	in[2] = p3;
+	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+}
+
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+				  unsigned long p2, unsigned long p3,
+				  unsigned long p4)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	in[1] = p2;
+	in[2] = p3;
+	in[3] = p4;
+	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+}
+
+
 static inline unsigned int kvm_arch_para_features(void)
 {
-	return 0;
+	unsigned long r;
+
+	if (!kvm_para_available())
+		return 0;
+
+	if(kvm_hypercall0_1(KVM_HC_FEATURES, &r))
+		return 0;
+
+	return r;
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 18d139ec2d22..ecb3bc74c344 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -107,6 +107,7 @@ extern int kvmppc_booke_init(void);
 extern void kvmppc_booke_exit(void);
 
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
+extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 
 /*
  * Cuts out inst bits with ordering according to spec.
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 4ed076a4db24..36c30f31ec93 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -129,6 +129,8 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
 obj-y				+= ppc_save_regs.o
 endif
 
+obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
+
 # Disable GCOV in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
 GCOV_PROFILE_ftrace.o := n
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index c3e01945ad4f..bd0df2e6aa8f 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -48,11 +48,11 @@
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/iseries/alpaca.h>
 #endif
-#ifdef CONFIG_KVM
+#if defined(CONFIG_KVM) || defined(CONFIG_KVM_GUEST)
 #include <linux/kvm_host.h>
-#ifndef CONFIG_BOOKE
-#include <asm/kvm_book3s.h>
 #endif
+#if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S)
+#include <asm/kvm_book3s.h>
 #endif
 
 #ifdef CONFIG_PPC32
@@ -396,12 +396,13 @@ int main(void)
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
-	DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
 	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
 	DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
 	DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
+	DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
+	DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
 
 	/* book3s */
 #ifdef CONFIG_PPC_BOOK3S
@@ -466,6 +467,22 @@ int main(void)
 	DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
 #endif /* CONFIG_PPC_BOOK3S */
 #endif
+
+#ifdef CONFIG_KVM_GUEST
+	DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared,
+					    scratch1));
+	DEFINE(KVM_MAGIC_SCRATCH2, offsetof(struct kvm_vcpu_arch_shared,
+					    scratch2));
+	DEFINE(KVM_MAGIC_SCRATCH3, offsetof(struct kvm_vcpu_arch_shared,
+					    scratch3));
+	DEFINE(KVM_MAGIC_INT, offsetof(struct kvm_vcpu_arch_shared,
+				       int_pending));
+	DEFINE(KVM_MAGIC_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
+	DEFINE(KVM_MAGIC_CRITICAL, offsetof(struct kvm_vcpu_arch_shared,
+					    critical));
+	DEFINE(KVM_MAGIC_SR, offsetof(struct kvm_vcpu_arch_shared, sr));
+#endif
+
 #ifdef CONFIG_44x
 	DEFINE(PGD_T_LOG2, PGD_T_LOG2);
 	DEFINE(PTE_T_LOG2, PTE_T_LOG2);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 39b0c48872d2..9f8b01d6466f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -299,6 +299,12 @@ slb_miss_user_pseries:
 	b	.				/* prevent spec. execution */
 #endif /* __DISABLED__ */
 
+/* KVM's trampoline code needs to be close to the interrupt handlers */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include "../kvm/book3s_rmhandlers.S"
+#endif
+
 	.align	7
 	.globl	__end_interrupts
 __end_interrupts:
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index c571cd3c1453..f0dd577e4a5b 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -166,12 +166,6 @@ exception_marker:
 #include "exceptions-64s.S"
 #endif
 
-/* KVM trampoline code needs to be close to the interrupt handlers */
-
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#include "../kvm/book3s_rmhandlers.S"
-#endif
-
 _GLOBAL(generic_secondary_thread_init)
 	mr	r24,r3
 
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
new file mode 100644
index 000000000000..428d0e538aec
--- /dev/null
+++ b/arch/powerpc/kernel/kvm.c
@@ -0,0 +1,596 @@
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/init.h>
+#include <linux/kvm_para.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+
+#include <asm/reg.h>
+#include <asm/sections.h>
+#include <asm/cacheflush.h>
+#include <asm/disassemble.h>
+
+#define KVM_MAGIC_PAGE		(-4096L)
+#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
+
+#define KVM_INST_LWZ		0x80000000
+#define KVM_INST_STW		0x90000000
+#define KVM_INST_LD		0xe8000000
+#define KVM_INST_STD		0xf8000000
+#define KVM_INST_NOP		0x60000000
+#define KVM_INST_B		0x48000000
+#define KVM_INST_B_MASK		0x03ffffff
+#define KVM_INST_B_MAX		0x01ffffff
+
+#define KVM_MASK_RT		0x03e00000
+#define KVM_RT_30		0x03c00000
+#define KVM_MASK_RB		0x0000f800
+#define KVM_INST_MFMSR		0x7c0000a6
+#define KVM_INST_MFSPR_SPRG0	0x7c1042a6
+#define KVM_INST_MFSPR_SPRG1	0x7c1142a6
+#define KVM_INST_MFSPR_SPRG2	0x7c1242a6
+#define KVM_INST_MFSPR_SPRG3	0x7c1342a6
+#define KVM_INST_MFSPR_SRR0	0x7c1a02a6
+#define KVM_INST_MFSPR_SRR1	0x7c1b02a6
+#define KVM_INST_MFSPR_DAR	0x7c1302a6
+#define KVM_INST_MFSPR_DSISR	0x7c1202a6
+
+#define KVM_INST_MTSPR_SPRG0	0x7c1043a6
+#define KVM_INST_MTSPR_SPRG1	0x7c1143a6
+#define KVM_INST_MTSPR_SPRG2	0x7c1243a6
+#define KVM_INST_MTSPR_SPRG3	0x7c1343a6
+#define KVM_INST_MTSPR_SRR0	0x7c1a03a6
+#define KVM_INST_MTSPR_SRR1	0x7c1b03a6
+#define KVM_INST_MTSPR_DAR	0x7c1303a6
+#define KVM_INST_MTSPR_DSISR	0x7c1203a6
+
+#define KVM_INST_TLBSYNC	0x7c00046c
+#define KVM_INST_MTMSRD_L0	0x7c000164
+#define KVM_INST_MTMSRD_L1	0x7c010164
+#define KVM_INST_MTMSR		0x7c000124
+
+#define KVM_INST_WRTEEI_0	0x7c000146
+#define KVM_INST_WRTEEI_1	0x7c008146
+
+#define KVM_INST_MTSRIN		0x7c0001e4
+
+static bool kvm_patching_worked = true;
+static char kvm_tmp[1024 * 1024];
+static int kvm_tmp_index;
+
+static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
+{
+	*inst = new_inst;
+	flush_icache_range((ulong)inst, (ulong)inst + 4);
+}
+
+static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000fffc));
+#endif
+}
+
+static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | ((addr + 4) & 0x0000fffc));
+#endif
+}
+
+static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt)
+{
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff));
+}
+
+static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_STW | rt | ((addr + 4) & 0x0000fffc));
+#endif
+}
+
+static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
+{
+	kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc));
+}
+
+static void kvm_patch_ins_nop(u32 *inst)
+{
+	kvm_patch_ins(inst, KVM_INST_NOP);
+}
+
+static void kvm_patch_ins_b(u32 *inst, int addr)
+{
+#ifdef CONFIG_RELOCATABLE
+	/* On relocatable kernels interrupts handlers and our code
+	   can be in different regions, so we don't patch them */
+
+	extern u32 __end_interrupts;
+	if ((ulong)inst < (ulong)&__end_interrupts)
+		return;
+#endif
+
+	kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK));
+}
+
+static u32 *kvm_alloc(int len)
+{
+	u32 *p;
+
+	if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) {
+		printk(KERN_ERR "KVM: No more space (%d + %d)\n",
+				kvm_tmp_index, len);
+		kvm_patching_worked = false;
+		return NULL;
+	}
+
+	p = (void*)&kvm_tmp[kvm_tmp_index];
+	kvm_tmp_index += len;
+
+	return p;
+}
+
+extern u32 kvm_emulate_mtmsrd_branch_offs;
+extern u32 kvm_emulate_mtmsrd_reg_offs;
+extern u32 kvm_emulate_mtmsrd_orig_ins_offs;
+extern u32 kvm_emulate_mtmsrd_len;
+extern u32 kvm_emulate_mtmsrd[];
+
+static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtmsrd_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsrd_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtmsrd, kvm_emulate_mtmsrd_len * 4);
+	p[kvm_emulate_mtmsrd_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	switch (get_rt(rt)) {
+	case 30:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs],
+				 magic_var(scratch2), KVM_RT_30);
+		break;
+	case 31:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs],
+				 magic_var(scratch1), KVM_RT_30);
+		break;
+	default:
+		p[kvm_emulate_mtmsrd_reg_offs] |= rt;
+		break;
+	}
+
+	p[kvm_emulate_mtmsrd_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsrd_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+extern u32 kvm_emulate_mtmsr_branch_offs;
+extern u32 kvm_emulate_mtmsr_reg1_offs;
+extern u32 kvm_emulate_mtmsr_reg2_offs;
+extern u32 kvm_emulate_mtmsr_orig_ins_offs;
+extern u32 kvm_emulate_mtmsr_len;
+extern u32 kvm_emulate_mtmsr[];
+
+static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtmsr_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsr_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtmsr, kvm_emulate_mtmsr_len * 4);
+	p[kvm_emulate_mtmsr_branch_offs] |= distance_end & KVM_INST_B_MASK;
+
+	/* Make clobbered registers work too */
+	switch (get_rt(rt)) {
+	case 30:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs],
+				 magic_var(scratch2), KVM_RT_30);
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs],
+				 magic_var(scratch2), KVM_RT_30);
+		break;
+	case 31:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs],
+				 magic_var(scratch1), KVM_RT_30);
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs],
+				 magic_var(scratch1), KVM_RT_30);
+		break;
+	default:
+		p[kvm_emulate_mtmsr_reg1_offs] |= rt;
+		p[kvm_emulate_mtmsr_reg2_offs] |= rt;
+		break;
+	}
+
+	p[kvm_emulate_mtmsr_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsr_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+#ifdef CONFIG_BOOKE
+
+extern u32 kvm_emulate_wrteei_branch_offs;
+extern u32 kvm_emulate_wrteei_ee_offs;
+extern u32 kvm_emulate_wrteei_len;
+extern u32 kvm_emulate_wrteei[];
+
+static void kvm_patch_ins_wrteei(u32 *inst)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_wrteei_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4);
+	p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE);
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_32
+
+extern u32 kvm_emulate_mtsrin_branch_offs;
+extern u32 kvm_emulate_mtsrin_reg1_offs;
+extern u32 kvm_emulate_mtsrin_reg2_offs;
+extern u32 kvm_emulate_mtsrin_orig_ins_offs;
+extern u32 kvm_emulate_mtsrin_len;
+extern u32 kvm_emulate_mtsrin[];
+
+static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtsrin_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtsrin_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtsrin, kvm_emulate_mtsrin_len * 4);
+	p[kvm_emulate_mtsrin_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_mtsrin_reg1_offs] |= (rb << 10);
+	p[kvm_emulate_mtsrin_reg2_offs] |= rt;
+	p[kvm_emulate_mtsrin_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtsrin_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+#endif
+
+static void kvm_map_magic_page(void *data)
+{
+	u32 *features = data;
+
+	ulong in[8];
+	ulong out[8];
+
+	in[0] = KVM_MAGIC_PAGE;
+	in[1] = KVM_MAGIC_PAGE;
+
+	kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE);
+
+	*features = out[0];
+}
+
+static void kvm_check_ins(u32 *inst, u32 features)
+{
+	u32 _inst = *inst;
+	u32 inst_no_rt = _inst & ~KVM_MASK_RT;
+	u32 inst_rt = _inst & KVM_MASK_RT;
+
+	switch (inst_no_rt) {
+	/* Loads */
+	case KVM_INST_MFMSR:
+		kvm_patch_ins_ld(inst, magic_var(msr), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG0:
+		kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG1:
+		kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG2:
+		kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG3:
+		kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SRR0:
+		kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SRR1:
+		kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt);
+		break;
+	case KVM_INST_MFSPR_DAR:
+		kvm_patch_ins_ld(inst, magic_var(dar), inst_rt);
+		break;
+	case KVM_INST_MFSPR_DSISR:
+		kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt);
+		break;
+
+	/* Stores */
+	case KVM_INST_MTSPR_SPRG0:
+		kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SPRG1:
+		kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SPRG2:
+		kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SPRG3:
+		kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SRR0:
+		kvm_patch_ins_std(inst, magic_var(srr0), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SRR1:
+		kvm_patch_ins_std(inst, magic_var(srr1), inst_rt);
+		break;
+	case KVM_INST_MTSPR_DAR:
+		kvm_patch_ins_std(inst, magic_var(dar), inst_rt);
+		break;
+	case KVM_INST_MTSPR_DSISR:
+		kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
+		break;
+
+	/* Nops */
+	case KVM_INST_TLBSYNC:
+		kvm_patch_ins_nop(inst);
+		break;
+
+	/* Rewrites */
+	case KVM_INST_MTMSRD_L1:
+		kvm_patch_ins_mtmsrd(inst, inst_rt);
+		break;
+	case KVM_INST_MTMSR:
+	case KVM_INST_MTMSRD_L0:
+		kvm_patch_ins_mtmsr(inst, inst_rt);
+		break;
+	}
+
+	switch (inst_no_rt & ~KVM_MASK_RB) {
+#ifdef CONFIG_PPC_BOOK3S_32
+	case KVM_INST_MTSRIN:
+		if (features & KVM_MAGIC_FEAT_SR) {
+			u32 inst_rb = _inst & KVM_MASK_RB;
+			kvm_patch_ins_mtsrin(inst, inst_rt, inst_rb);
+		}
+		break;
+		break;
+#endif
+	}
+
+	switch (_inst) {
+#ifdef CONFIG_BOOKE
+	case KVM_INST_WRTEEI_0:
+	case KVM_INST_WRTEEI_1:
+		kvm_patch_ins_wrteei(inst);
+		break;
+#endif
+	}
+}
+
+static void kvm_use_magic_page(void)
+{
+	u32 *p;
+	u32 *start, *end;
+	u32 tmp;
+	u32 features;
+
+	/* Tell the host to map the magic page to -4096 on all CPUs */
+	on_each_cpu(kvm_map_magic_page, &features, 1);
+
+	/* Quick self-test to see if the mapping works */
+	if (__get_user(tmp, (u32*)KVM_MAGIC_PAGE)) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Now loop through all code and find instructions */
+	start = (void*)_stext;
+	end = (void*)_etext;
+
+	for (p = start; p < end; p++)
+		kvm_check_ins(p, features);
+
+	printk(KERN_INFO "KVM: Live patching for a fast VM %s\n",
+			 kvm_patching_worked ? "worked" : "failed");
+}
+
+unsigned long kvm_hypercall(unsigned long *in,
+			    unsigned long *out,
+			    unsigned long nr)
+{
+	unsigned long register r0 asm("r0");
+	unsigned long register r3 asm("r3") = in[0];
+	unsigned long register r4 asm("r4") = in[1];
+	unsigned long register r5 asm("r5") = in[2];
+	unsigned long register r6 asm("r6") = in[3];
+	unsigned long register r7 asm("r7") = in[4];
+	unsigned long register r8 asm("r8") = in[5];
+	unsigned long register r9 asm("r9") = in[6];
+	unsigned long register r10 asm("r10") = in[7];
+	unsigned long register r11 asm("r11") = nr;
+	unsigned long register r12 asm("r12");
+
+	asm volatile("bl	kvm_hypercall_start"
+		     : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
+		       "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
+		       "=r"(r12)
+		     : "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7), "r"(r8),
+		       "r"(r9), "r"(r10), "r"(r11)
+		     : "memory", "cc", "xer", "ctr", "lr");
+
+	out[0] = r4;
+	out[1] = r5;
+	out[2] = r6;
+	out[3] = r7;
+	out[4] = r8;
+	out[5] = r9;
+	out[6] = r10;
+	out[7] = r11;
+
+	return r3;
+}
+EXPORT_SYMBOL_GPL(kvm_hypercall);
+
+static int kvm_para_setup(void)
+{
+	extern u32 kvm_hypercall_start;
+	struct device_node *hyper_node;
+	u32 *insts;
+	int len, i;
+
+	hyper_node = of_find_node_by_path("/hypervisor");
+	if (!hyper_node)
+		return -1;
+
+	insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len);
+	if (len % 4)
+		return -1;
+	if (len > (4 * 4))
+		return -1;
+
+	for (i = 0; i < (len / 4); i++)
+		kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]);
+
+	return 0;
+}
+
+static __init void kvm_free_tmp(void)
+{
+	unsigned long start, end;
+
+	start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK;
+	end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK;
+
+	/* Free the tmp space we don't need */
+	for (; start < end; start += PAGE_SIZE) {
+		ClearPageReserved(virt_to_page(start));
+		init_page_count(virt_to_page(start));
+		free_page(start);
+		totalram_pages++;
+	}
+}
+
+static int __init kvm_guest_init(void)
+{
+	if (!kvm_para_available())
+		goto free_tmp;
+
+	if (kvm_para_setup())
+		goto free_tmp;
+
+	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
+		kvm_use_magic_page();
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Enable napping */
+	powersave_nap = 1;
+#endif
+
+free_tmp:
+	kvm_free_tmp();
+
+	return 0;
+}
+
+postcore_initcall(kvm_guest_init);
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
new file mode 100644
index 000000000000..f2b1b2523e61
--- /dev/null
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -0,0 +1,302 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+
+/* Hypercall entry point. Will be patched with device tree instructions. */
+
+.global kvm_hypercall_start
+kvm_hypercall_start:
+	li	r3, -1
+	nop
+	nop
+	nop
+	blr
+
+#define KVM_MAGIC_PAGE		(-4096)
+
+#ifdef CONFIG_64BIT
+#define LL64(reg, offs, reg2)	ld	reg, (offs)(reg2)
+#define STL64(reg, offs, reg2)	std	reg, (offs)(reg2)
+#else
+#define LL64(reg, offs, reg2)	lwz	reg, (offs + 4)(reg2)
+#define STL64(reg, offs, reg2)	stw	reg, (offs + 4)(reg2)
+#endif
+
+#define SCRATCH_SAVE							\
+	/* Enable critical section. We are critical if			\
+	   shared->critical == r1 */					\
+	STL64(r1, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);		\
+									\
+	/* Save state */						\
+	PPC_STL	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0);		\
+	PPC_STL	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0);		\
+	mfcr	r31;							\
+	stw	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0);
+
+#define SCRATCH_RESTORE							\
+	/* Restore state */						\
+	PPC_LL	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0);		\
+	lwz	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0);		\
+	mtcr	r30;							\
+	PPC_LL	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0);		\
+									\
+	/* Disable critical section. We are critical if			\
+	   shared->critical == r1 and r2 is always != r1 */		\
+	STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
+
+.global kvm_emulate_mtmsrd
+kvm_emulate_mtmsrd:
+
+	SCRATCH_SAVE
+
+	/* Put MSR & ~(MSR_EE|MSR_RI) in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+	lis	r30, (~(MSR_EE | MSR_RI))@h
+	ori	r30, r30, (~(MSR_EE | MSR_RI))@l
+	and	r31, r31, r30
+
+	/* OR the register's (MSR_EE|MSR_RI) on MSR */
+kvm_emulate_mtmsrd_reg:
+	ori	r30, r0, 0
+	andi.	r30, r30, (MSR_EE|MSR_RI)
+	or	r31, r31, r30
+
+	/* Put MSR back into magic page */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r31, 0
+	beq+	no_check
+
+	/* Check if we may trigger an interrupt */
+	andi.	r30, r30, MSR_EE
+	beq	no_check
+
+	SCRATCH_RESTORE
+
+	/* Nag hypervisor */
+kvm_emulate_mtmsrd_orig_ins:
+	tlbsync
+
+	b	kvm_emulate_mtmsrd_branch
+
+no_check:
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtmsrd_branch:
+	b	.
+kvm_emulate_mtmsrd_end:
+
+.global kvm_emulate_mtmsrd_branch_offs
+kvm_emulate_mtmsrd_branch_offs:
+	.long (kvm_emulate_mtmsrd_branch - kvm_emulate_mtmsrd) / 4
+
+.global kvm_emulate_mtmsrd_reg_offs
+kvm_emulate_mtmsrd_reg_offs:
+	.long (kvm_emulate_mtmsrd_reg - kvm_emulate_mtmsrd) / 4
+
+.global kvm_emulate_mtmsrd_orig_ins_offs
+kvm_emulate_mtmsrd_orig_ins_offs:
+	.long (kvm_emulate_mtmsrd_orig_ins - kvm_emulate_mtmsrd) / 4
+
+.global kvm_emulate_mtmsrd_len
+kvm_emulate_mtmsrd_len:
+	.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
+
+
+#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI)
+#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
+
+.global kvm_emulate_mtmsr
+kvm_emulate_mtmsr:
+
+	SCRATCH_SAVE
+
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Find the changed bits between old and new MSR */
+kvm_emulate_mtmsr_reg1:
+	ori	r30, r0, 0
+	xor	r31, r30, r31
+
+	/* Check if we need to really do mtmsr */
+	LOAD_REG_IMMEDIATE(r30, MSR_CRITICAL_BITS)
+	and.	r31, r31, r30
+
+	/* No critical bits changed? Maybe we can stay in the guest. */
+	beq	maybe_stay_in_guest
+
+do_mtmsr:
+
+	SCRATCH_RESTORE
+
+	/* Just fire off the mtmsr if it's critical */
+kvm_emulate_mtmsr_orig_ins:
+	mtmsr	r0
+
+	b	kvm_emulate_mtmsr_branch
+
+maybe_stay_in_guest:
+
+	/* Get the target register in r30 */
+kvm_emulate_mtmsr_reg2:
+	ori	r30, r0, 0
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r31, 0
+	beq+	no_mtmsr
+
+	/* Check if we may trigger an interrupt */
+	andi.	r31, r30, MSR_EE
+	beq	no_mtmsr
+
+	b	do_mtmsr
+
+no_mtmsr:
+
+	/* Put MSR into magic page because we don't call mtmsr */
+	STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtmsr_branch:
+	b	.
+kvm_emulate_mtmsr_end:
+
+.global kvm_emulate_mtmsr_branch_offs
+kvm_emulate_mtmsr_branch_offs:
+	.long (kvm_emulate_mtmsr_branch - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_reg1_offs
+kvm_emulate_mtmsr_reg1_offs:
+	.long (kvm_emulate_mtmsr_reg1 - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_reg2_offs
+kvm_emulate_mtmsr_reg2_offs:
+	.long (kvm_emulate_mtmsr_reg2 - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_orig_ins_offs
+kvm_emulate_mtmsr_orig_ins_offs:
+	.long (kvm_emulate_mtmsr_orig_ins - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_len
+kvm_emulate_mtmsr_len:
+	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
+
+
+
+.global kvm_emulate_wrteei
+kvm_emulate_wrteei:
+
+	SCRATCH_SAVE
+
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Remove MSR_EE from old MSR */
+	li	r30, 0
+	ori	r30, r30, MSR_EE
+	andc	r31, r31, r30
+
+	/* OR new MSR_EE onto the old MSR */
+kvm_emulate_wrteei_ee:
+	ori	r31, r31, 0
+
+	/* Write new MSR value back */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_wrteei_branch:
+	b	.
+kvm_emulate_wrteei_end:
+
+.global kvm_emulate_wrteei_branch_offs
+kvm_emulate_wrteei_branch_offs:
+	.long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4
+
+.global kvm_emulate_wrteei_ee_offs
+kvm_emulate_wrteei_ee_offs:
+	.long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4
+
+.global kvm_emulate_wrteei_len
+kvm_emulate_wrteei_len:
+	.long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4
+
+
+.global kvm_emulate_mtsrin
+kvm_emulate_mtsrin:
+
+	SCRATCH_SAVE
+
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+	andi.	r31, r31, MSR_DR | MSR_IR
+	beq	kvm_emulate_mtsrin_reg1
+
+	SCRATCH_RESTORE
+
+kvm_emulate_mtsrin_orig_ins:
+	nop
+	b	kvm_emulate_mtsrin_branch
+
+kvm_emulate_mtsrin_reg1:
+	/* rX >> 26 */
+	rlwinm  r30,r0,6,26,29
+
+kvm_emulate_mtsrin_reg2:
+	stw	r0, (KVM_MAGIC_PAGE + KVM_MAGIC_SR)(r30)
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtsrin_branch:
+	b	.
+kvm_emulate_mtsrin_end:
+
+.global kvm_emulate_mtsrin_branch_offs
+kvm_emulate_mtsrin_branch_offs:
+	.long (kvm_emulate_mtsrin_branch - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_reg1_offs
+kvm_emulate_mtsrin_reg1_offs:
+	.long (kvm_emulate_mtsrin_reg1 - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_reg2_offs
+kvm_emulate_mtsrin_reg2_offs:
+	.long (kvm_emulate_mtsrin_reg2 - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_orig_ins_offs
+kvm_emulate_mtsrin_orig_ins_offs:
+	.long (kvm_emulate_mtsrin_orig_ins - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_len
+kvm_emulate_mtsrin_len:
+	.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 73c0a3f64ed1..74d0e7421143 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -43,7 +43,7 @@ int kvmppc_core_check_processor_compat(void)
 {
 	int r;
 
-	if (strcmp(cur_cpu_spec->platform, "ppc440") == 0)
+	if (strncmp(cur_cpu_spec->platform, "ppc440", 6) == 0)
 		r = 0;
 	else
 		r = -ENOTSUPP;
@@ -72,6 +72,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 	/* Since the guest can directly access the timebase, it must know the
 	 * real timebase frequency. Accordingly, it must see the state of
 	 * CCR1[TCS]. */
+	/* XXX CCR1 doesn't exist on all 440 SoCs. */
 	vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
 
 	for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)
@@ -123,8 +124,14 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto free_vcpu;
 
+	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (!vcpu->arch.shared)
+		goto uninit_vcpu;
+
 	return vcpu;
 
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
 free_vcpu:
 	kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 out:
@@ -135,6 +142,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 
+	free_page((unsigned long)vcpu->arch.shared);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 }
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 9b9b5cdea840..5f3cff83e089 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -47,6 +47,7 @@
 #ifdef DEBUG
 void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	struct kvmppc_44x_tlbe *tlbe;
 	int i;
 
@@ -221,14 +222,14 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index,
 
 int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_IS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
 
 	return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
 }
 
 int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_DS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
 
 	return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
 }
@@ -354,7 +355,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
 
 	stlbe.word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
 	stlbe.word2 = kvmppc_44x_tlb_shadow_attrib(flags,
-	                                            vcpu->arch.msr & MSR_PR);
+	                                            vcpu->arch.shared->msr & MSR_PR);
 	stlbe.tid = !(asid & 0xff);
 
 	/* Keep track of the reference so we can properly release it later. */
@@ -423,7 +424,7 @@ static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 
 	/* Does it match current guest AS? */
 	/* XXX what about IS != DS? */
-	if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
+	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
 		return 0;
 
 	gpa = get_tlb_raddr(tlbe);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index a3cef30d1d42..e316847c08c0 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -17,6 +17,7 @@
 #include <linux/kvm_host.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include "trace.h"
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -35,7 +36,6 @@
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 /* #define EXIT_DEBUG */
-/* #define EXIT_DEBUG_SIMPLE */
 /* #define DEBUG_EXT */
 
 static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
@@ -105,65 +105,71 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 	kvmppc_giveup_ext(vcpu, MSR_VSX);
 }
 
-#if defined(EXIT_DEBUG)
-static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu)
-{
-	u64 jd = mftb() - vcpu->arch.dec_jiffies;
-	return vcpu->arch.dec - jd;
-}
-#endif
-
 static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.shadow_msr = vcpu->arch.msr;
+	ulong smsr = vcpu->arch.shared->msr;
+
 	/* Guest MSR values */
-	vcpu->arch.shadow_msr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE |
-				 MSR_BE | MSR_DE;
+	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
 	/* Process MSR values */
-	vcpu->arch.shadow_msr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR |
-				 MSR_EE;
+	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
 	/* External providers the guest reserved */
-	vcpu->arch.shadow_msr |= (vcpu->arch.msr & vcpu->arch.guest_owned_ext);
+	smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
 	/* 64-bit Process MSR values */
 #ifdef CONFIG_PPC_BOOK3S_64
-	vcpu->arch.shadow_msr |= MSR_ISF | MSR_HV;
+	smsr |= MSR_ISF | MSR_HV;
 #endif
+	vcpu->arch.shadow_msr = smsr;
 }
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 {
-	ulong old_msr = vcpu->arch.msr;
+	ulong old_msr = vcpu->arch.shared->msr;
 
 #ifdef EXIT_DEBUG
 	printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
 #endif
 
 	msr &= to_book3s(vcpu)->msr_mask;
-	vcpu->arch.msr = msr;
+	vcpu->arch.shared->msr = msr;
 	kvmppc_recalc_shadow_msr(vcpu);
 
-	if (msr & (MSR_WE|MSR_POW)) {
+	if (msr & MSR_POW) {
 		if (!vcpu->arch.pending_exceptions) {
 			kvm_vcpu_block(vcpu);
 			vcpu->stat.halt_wakeup++;
+
+			/* Unset POW bit after we woke up */
+			msr &= ~MSR_POW;
+			vcpu->arch.shared->msr = msr;
 		}
 	}
 
-	if ((vcpu->arch.msr & (MSR_PR|MSR_IR|MSR_DR)) !=
+	if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
 		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
 		kvmppc_mmu_flush_segments(vcpu);
 		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+
+		/* Preload magic page segment when in kernel mode */
+		if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
+			struct kvm_vcpu_arch *a = &vcpu->arch;
+
+			if (msr & MSR_DR)
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
+			else
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
+		}
 	}
 
 	/* Preload FPU if it's enabled */
-	if (vcpu->arch.msr & MSR_FP)
+	if (vcpu->arch.shared->msr & MSR_FP)
 		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
 }
 
 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
 {
-	vcpu->arch.srr0 = kvmppc_get_pc(vcpu);
-	vcpu->arch.srr1 = vcpu->arch.msr | flags;
+	vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
+	vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags;
 	kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec);
 	vcpu->arch.mmu.reset_msr(vcpu);
 }
@@ -180,6 +186,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 	case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE;		break;
 	case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT;		break;
 	case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL;		break;
+	case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL;	break;
 	case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT;		break;
 	case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM;		break;
 	case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL;		break;
@@ -199,6 +206,9 @@ static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
 {
 	clear_bit(kvmppc_book3s_vec2irqprio(vec),
 		  &vcpu->arch.pending_exceptions);
+
+	if (!vcpu->arch.pending_exceptions)
+		vcpu->arch.shared->int_pending = 0;
 }
 
 void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
@@ -237,13 +247,19 @@ void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                 struct kvm_interrupt *irq)
 {
-	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+	unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL;
+
+	if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
+		vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
+
+	kvmppc_book3s_queue_irqprio(vcpu, vec);
 }
 
 void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                   struct kvm_interrupt *irq)
 {
 	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
 }
 
 int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
@@ -251,14 +267,29 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 	int deliver = 1;
 	int vec = 0;
 	ulong flags = 0ULL;
+	ulong crit_raw = vcpu->arch.shared->critical;
+	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
+	bool crit;
+
+	/* Truncate crit indicators in 32 bit mode */
+	if (!(vcpu->arch.shared->msr & MSR_SF)) {
+		crit_raw &= 0xffffffff;
+		crit_r1 &= 0xffffffff;
+	}
+
+	/* Critical section when crit == r1 */
+	crit = (crit_raw == crit_r1);
+	/* ... and we're in supervisor mode */
+	crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
 
 	switch (priority) {
 	case BOOK3S_IRQPRIO_DECREMENTER:
-		deliver = vcpu->arch.msr & MSR_EE;
+		deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
 		vec = BOOK3S_INTERRUPT_DECREMENTER;
 		break;
 	case BOOK3S_IRQPRIO_EXTERNAL:
-		deliver = vcpu->arch.msr & MSR_EE;
+	case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
+		deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
 		vec = BOOK3S_INTERRUPT_EXTERNAL;
 		break;
 	case BOOK3S_IRQPRIO_SYSTEM_RESET:
@@ -320,9 +351,27 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 	return deliver;
 }
 
+/*
+ * This function determines if an irqprio should be cleared once issued.
+ */
+static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+	switch (priority) {
+		case BOOK3S_IRQPRIO_DECREMENTER:
+			/* DEC interrupts get cleared by mtdec */
+			return false;
+		case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
+			/* External interrupts get cleared by userspace */
+			return false;
+	}
+
+	return true;
+}
+
 void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
+	unsigned long old_pending = vcpu->arch.pending_exceptions;
 	unsigned int priority;
 
 #ifdef EXIT_DEBUG
@@ -332,8 +381,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 	priority = __ffs(*pending);
 	while (priority < BOOK3S_IRQPRIO_MAX) {
 		if (kvmppc_book3s_irqprio_deliver(vcpu, priority) &&
-		    (priority != BOOK3S_IRQPRIO_DECREMENTER)) {
-			/* DEC interrupts get cleared by mtdec */
+		    clear_irqprio(vcpu, priority)) {
 			clear_bit(priority, &vcpu->arch.pending_exceptions);
 			break;
 		}
@@ -342,6 +390,12 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 					 BITS_PER_BYTE * sizeof(*pending),
 					 priority + 1);
 	}
+
+	/* Tell the guest about our interrupt status */
+	if (*pending)
+		vcpu->arch.shared->int_pending = 1;
+	else if (old_pending)
+		vcpu->arch.shared->int_pending = 0;
 }
 
 void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
@@ -398,6 +452,25 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 	}
 }
 
+pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa;
+
+	/* Magic page override */
+	if (unlikely(mp_pa) &&
+	    unlikely(((gfn << PAGE_SHIFT) & KVM_PAM) ==
+		     ((mp_pa & PAGE_MASK) & KVM_PAM))) {
+		ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
+		pfn_t pfn;
+
+		pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;
+		get_page(pfn_to_page(pfn));
+		return pfn;
+	}
+
+	return gfn_to_pfn(vcpu->kvm, gfn);
+}
+
 /* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
  * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
  * emulate 32 bytes dcbz length.
@@ -415,8 +488,10 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
 	int i;
 
 	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
-	if (is_error_page(hpage))
+	if (is_error_page(hpage)) {
+		kvm_release_page_clean(hpage);
 		return;
+	}
 
 	hpage_offset = pte->raddr & ~PAGE_MASK;
 	hpage_offset &= ~0xFFFULL;
@@ -437,14 +512,14 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
 static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
 			 struct kvmppc_pte *pte)
 {
-	int relocated = (vcpu->arch.msr & (data ? MSR_DR : MSR_IR));
+	int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR));
 	int r;
 
 	if (relocated) {
 		r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data);
 	} else {
 		pte->eaddr = eaddr;
-		pte->raddr = eaddr & 0xffffffff;
+		pte->raddr = eaddr & KVM_PAM;
 		pte->vpage = VSID_REAL | eaddr >> 12;
 		pte->may_read = true;
 		pte->may_write = true;
@@ -533,6 +608,13 @@ mmio:
 
 static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
+	ulong mp_pa = vcpu->arch.magic_page_pa;
+
+	if (unlikely(mp_pa) &&
+	    unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
+		return 1;
+	}
+
 	return kvm_is_visible_gfn(vcpu->kvm, gfn);
 }
 
@@ -545,8 +627,8 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	int page_found = 0;
 	struct kvmppc_pte pte;
 	bool is_mmio = false;
-	bool dr = (vcpu->arch.msr & MSR_DR) ? true : false;
-	bool ir = (vcpu->arch.msr & MSR_IR) ? true : false;
+	bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
+	bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
 	u64 vsid;
 
 	relocated = data ? dr : ir;
@@ -558,12 +640,12 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		pte.may_execute = true;
 		pte.may_read = true;
 		pte.may_write = true;
-		pte.raddr = eaddr & 0xffffffff;
+		pte.raddr = eaddr & KVM_PAM;
 		pte.eaddr = eaddr;
 		pte.vpage = eaddr >> 12;
 	}
 
-	switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 	case 0:
 		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
 		break;
@@ -571,7 +653,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	case MSR_IR:
 		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
 
-		if ((vcpu->arch.msr & (MSR_DR|MSR_IR)) == MSR_DR)
+		if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
 			pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
 		else
 			pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
@@ -594,20 +676,23 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	if (page_found == -ENOENT) {
 		/* Page not found in guest PTE entries */
-		vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
-		to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr;
-		vcpu->arch.msr |= (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+		vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
+		vcpu->arch.shared->msr |=
+			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
 		kvmppc_book3s_queue_irqprio(vcpu, vec);
 	} else if (page_found == -EPERM) {
 		/* Storage protection */
-		vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
-		to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
-		to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT;
-		vcpu->arch.msr |= (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+		vcpu->arch.shared->dsisr =
+			to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
+		vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
+		vcpu->arch.shared->msr |=
+			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
 		kvmppc_book3s_queue_irqprio(vcpu, vec);
 	} else if (page_found == -EINVAL) {
 		/* Page not found in guest SLB */
-		vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
 		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
 	} else if (!is_mmio &&
 		   kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
@@ -695,9 +780,11 @@ static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
 
 	ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
 	if (ret == -ENOENT) {
-		vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 33, 33, 1);
-		vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 34, 36, 0);
-		vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 42, 47, 0);
+		ulong msr = vcpu->arch.shared->msr;
+
+		msr = kvmppc_set_field(msr, 33, 33, 1);
+		msr = kvmppc_set_field(msr, 34, 36, 0);
+		vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
 		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
 		return EMULATE_AGAIN;
 	}
@@ -736,7 +823,7 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
 		return RESUME_GUEST;
 
-	if (!(vcpu->arch.msr & msr)) {
+	if (!(vcpu->arch.shared->msr & msr)) {
 		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 		return RESUME_GUEST;
 	}
@@ -796,16 +883,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	run->exit_reason = KVM_EXIT_UNKNOWN;
 	run->ready_for_interrupt_injection = 1;
-#ifdef EXIT_DEBUG
-	printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | dec=0x%x | msr=0x%lx\n",
-		exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu),
-		kvmppc_get_dec(vcpu), to_svcpu(vcpu)->shadow_srr1);
-#elif defined (EXIT_DEBUG_SIMPLE)
-	if ((exit_nr != 0x900) && (exit_nr != 0x500))
-		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | msr=0x%lx\n",
-			exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu),
-			vcpu->arch.msr);
-#endif
+
+	trace_kvm_book3s_exit(exit_nr, vcpu);
 	kvm_resched(vcpu);
 	switch (exit_nr) {
 	case BOOK3S_INTERRUPT_INST_STORAGE:
@@ -836,9 +915,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
 			r = RESUME_GUEST;
 		} else {
-			vcpu->arch.msr |= to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
+			vcpu->arch.shared->msr |=
+				to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
 			r = RESUME_GUEST;
 		}
 		break;
@@ -861,17 +940,16 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
 			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
 		} else {
-			vcpu->arch.dear = dar;
-			to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr;
+			vcpu->arch.shared->dar = dar;
+			vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			kvmppc_mmu_pte_flush(vcpu, vcpu->arch.dear, ~0xFFFUL);
 			r = RESUME_GUEST;
 		}
 		break;
 	}
 	case BOOK3S_INTERRUPT_DATA_SEGMENT:
 		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
-			vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
+			vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
 			kvmppc_book3s_queue_irqprio(vcpu,
 				BOOK3S_INTERRUPT_DATA_SEGMENT);
 		}
@@ -904,7 +982,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 program_interrupt:
 		flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
 
-		if (vcpu->arch.msr & MSR_PR) {
+		if (vcpu->arch.shared->msr & MSR_PR) {
 #ifdef EXIT_DEBUG
 			printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
 #endif
@@ -941,10 +1019,10 @@ program_interrupt:
 		break;
 	}
 	case BOOK3S_INTERRUPT_SYSCALL:
-		// XXX make user settable
 		if (vcpu->arch.osi_enabled &&
 		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
 		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
+			/* MOL hypercalls */
 			u64 *gprs = run->osi.gprs;
 			int i;
 
@@ -953,8 +1031,13 @@ program_interrupt:
 				gprs[i] = kvmppc_get_gpr(vcpu, i);
 			vcpu->arch.osi_needed = 1;
 			r = RESUME_HOST_NV;
-
+		} else if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+			/* KVM PV hypercalls */
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+			r = RESUME_GUEST;
 		} else {
+			/* Guest syscalls */
 			vcpu->stat.syscall_exits++;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 			r = RESUME_GUEST;
@@ -989,9 +1072,9 @@ program_interrupt:
 	}
 	case BOOK3S_INTERRUPT_ALIGNMENT:
 		if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
-			to_book3s(vcpu)->dsisr = kvmppc_alignment_dsisr(vcpu,
+			vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
 				kvmppc_get_last_inst(vcpu));
-			vcpu->arch.dear = kvmppc_alignment_dar(vcpu,
+			vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
 				kvmppc_get_last_inst(vcpu));
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 		}
@@ -1031,9 +1114,7 @@ program_interrupt:
 		}
 	}
 
-#ifdef EXIT_DEBUG
-	printk(KERN_EMERG "KVM exit: vcpu=0x%p pc=0x%lx r=0x%x\n", vcpu, kvmppc_get_pc(vcpu), r);
-#endif
+	trace_kvm_book3s_reenter(r, vcpu);
 
 	return r;
 }
@@ -1052,14 +1133,14 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->ctr = kvmppc_get_ctr(vcpu);
 	regs->lr = kvmppc_get_lr(vcpu);
 	regs->xer = kvmppc_get_xer(vcpu);
-	regs->msr = vcpu->arch.msr;
-	regs->srr0 = vcpu->arch.srr0;
-	regs->srr1 = vcpu->arch.srr1;
+	regs->msr = vcpu->arch.shared->msr;
+	regs->srr0 = vcpu->arch.shared->srr0;
+	regs->srr1 = vcpu->arch.shared->srr1;
 	regs->pid = vcpu->arch.pid;
-	regs->sprg0 = vcpu->arch.sprg0;
-	regs->sprg1 = vcpu->arch.sprg1;
-	regs->sprg2 = vcpu->arch.sprg2;
-	regs->sprg3 = vcpu->arch.sprg3;
+	regs->sprg0 = vcpu->arch.shared->sprg0;
+	regs->sprg1 = vcpu->arch.shared->sprg1;
+	regs->sprg2 = vcpu->arch.shared->sprg2;
+	regs->sprg3 = vcpu->arch.shared->sprg3;
 	regs->sprg5 = vcpu->arch.sprg4;
 	regs->sprg6 = vcpu->arch.sprg5;
 	regs->sprg7 = vcpu->arch.sprg6;
@@ -1080,12 +1161,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	kvmppc_set_lr(vcpu, regs->lr);
 	kvmppc_set_xer(vcpu, regs->xer);
 	kvmppc_set_msr(vcpu, regs->msr);
-	vcpu->arch.srr0 = regs->srr0;
-	vcpu->arch.srr1 = regs->srr1;
-	vcpu->arch.sprg0 = regs->sprg0;
-	vcpu->arch.sprg1 = regs->sprg1;
-	vcpu->arch.sprg2 = regs->sprg2;
-	vcpu->arch.sprg3 = regs->sprg3;
+	vcpu->arch.shared->srr0 = regs->srr0;
+	vcpu->arch.shared->srr1 = regs->srr1;
+	vcpu->arch.shared->sprg0 = regs->sprg0;
+	vcpu->arch.shared->sprg1 = regs->sprg1;
+	vcpu->arch.shared->sprg2 = regs->sprg2;
+	vcpu->arch.shared->sprg3 = regs->sprg3;
 	vcpu->arch.sprg5 = regs->sprg4;
 	vcpu->arch.sprg6 = regs->sprg5;
 	vcpu->arch.sprg7 = regs->sprg6;
@@ -1111,10 +1192,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 			sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv;
 		}
 	} else {
-		for (i = 0; i < 16; i++) {
-			sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i].raw;
-			sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i].raw;
-		}
+		for (i = 0; i < 16; i++)
+			sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
+
 		for (i = 0; i < 8; i++) {
 			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
 			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
@@ -1225,6 +1305,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	struct kvmppc_vcpu_book3s *vcpu_book3s;
 	struct kvm_vcpu *vcpu;
 	int err = -ENOMEM;
+	unsigned long p;
 
 	vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s));
 	if (!vcpu_book3s)
@@ -1242,6 +1323,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto free_shadow_vcpu;
 
+	p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+	/* the real shared page fills the last 4k of our page */
+	vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
+	if (!p)
+		goto uninit_vcpu;
+
 	vcpu->arch.host_retip = kvm_return_point;
 	vcpu->arch.host_msr = mfmsr();
 #ifdef CONFIG_PPC_BOOK3S_64
@@ -1268,10 +1355,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 
 	err = kvmppc_mmu_init(vcpu);
 	if (err < 0)
-		goto free_shadow_vcpu;
+		goto uninit_vcpu;
 
 	return vcpu;
 
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
 free_shadow_vcpu:
 	kfree(vcpu_book3s->shadow_vcpu);
 free_vcpu:
@@ -1284,6 +1373,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 
+	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
 	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu_book3s->shadow_vcpu);
 	vfree(vcpu_book3s);
@@ -1346,7 +1436,7 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	local_irq_enable();
 
 	/* Preload FPU if it's enabled */
-	if (vcpu->arch.msr & MSR_FP)
+	if (vcpu->arch.shared->msr & MSR_FP)
 		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
 
 	ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 3292d76101d2..c8cefdd15fd8 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -58,14 +58,39 @@ static inline bool check_debug_ip(struct kvm_vcpu *vcpu)
 #endif
 }
 
+static inline u32 sr_vsid(u32 sr_raw)
+{
+	return sr_raw & 0x0fffffff;
+}
+
+static inline bool sr_valid(u32 sr_raw)
+{
+	return (sr_raw & 0x80000000) ? false : true;
+}
+
+static inline bool sr_ks(u32 sr_raw)
+{
+	return (sr_raw & 0x40000000) ? true: false;
+}
+
+static inline bool sr_kp(u32 sr_raw)
+{
+	return (sr_raw & 0x20000000) ? true: false;
+}
+
+static inline bool sr_nx(u32 sr_raw)
+{
+	return (sr_raw & 0x10000000) ? true: false;
+}
+
 static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
 					  struct kvmppc_pte *pte, bool data);
 static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 					     u64 *vsid);
 
-static struct kvmppc_sr *find_sr(struct kvmppc_vcpu_book3s *vcpu_book3s, gva_t eaddr)
+static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	return &vcpu_book3s->sr[(eaddr >> 28) & 0xf];
+	return vcpu->arch.shared->sr[(eaddr >> 28) & 0xf];
 }
 
 static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
@@ -87,7 +112,7 @@ static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu)
 }
 
 static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s,
-				      struct kvmppc_sr *sre, gva_t eaddr,
+				      u32 sre, gva_t eaddr,
 				      bool primary)
 {
 	u32 page, hash, pteg, htabmask;
@@ -96,7 +121,7 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
 	page = (eaddr & 0x0FFFFFFF) >> 12;
 	htabmask = ((vcpu_book3s->sdr1 & 0x1FF) << 16) | 0xFFC0;
 
-	hash = ((sre->vsid ^ page) << 6);
+	hash = ((sr_vsid(sre) ^ page) << 6);
 	if (!primary)
 		hash = ~hash;
 	hash &= htabmask;
@@ -104,8 +129,8 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
 	pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash;
 
 	dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n",
-		vcpu_book3s->vcpu.arch.pc, eaddr, vcpu_book3s->sdr1, pteg,
-		sre->vsid);
+		kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg,
+		sr_vsid(sre));
 
 	r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
 	if (kvm_is_error_hva(r))
@@ -113,10 +138,9 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
 	return r | (pteg & ~PAGE_MASK);
 }
 
-static u32 kvmppc_mmu_book3s_32_get_ptem(struct kvmppc_sr *sre, gva_t eaddr,
-				    bool primary)
+static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary)
 {
-	return ((eaddr & 0x0fffffff) >> 22) | (sre->vsid << 7) |
+	return ((eaddr & 0x0fffffff) >> 22) | (sr_vsid(sre) << 7) |
 	       (primary ? 0 : 0x40) | 0x80000000;
 }
 
@@ -133,7 +157,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
 		else
 			bat = &vcpu_book3s->ibat[i];
 
-		if (vcpu->arch.msr & MSR_PR) {
+		if (vcpu->arch.shared->msr & MSR_PR) {
 			if (!bat->vp)
 				continue;
 		} else {
@@ -180,17 +204,17 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
 				     bool primary)
 {
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
-	struct kvmppc_sr *sre;
+	u32 sre;
 	hva_t ptegp;
 	u32 pteg[16];
 	u32 ptem = 0;
 	int i;
 	int found = 0;
 
-	sre = find_sr(vcpu_book3s, eaddr);
+	sre = find_sr(vcpu, eaddr);
 
 	dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28,
-		    sre->vsid, sre->raw);
+		    sr_vsid(sre), sre);
 
 	pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
 
@@ -214,8 +238,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
 			pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF);
 			pp = pteg[i+1] & 3;
 
-			if ((sre->Kp &&  (vcpu->arch.msr & MSR_PR)) ||
-			    (sre->Ks && !(vcpu->arch.msr & MSR_PR)))
+			if ((sr_kp(sre) &&  (vcpu->arch.shared->msr & MSR_PR)) ||
+			    (sr_ks(sre) && !(vcpu->arch.shared->msr & MSR_PR)))
 				pp |= 4;
 
 			pte->may_write = false;
@@ -269,7 +293,7 @@ no_page_found:
 		dprintk_pte("KVM MMU: No PTE found (sdr1=0x%llx ptegp=0x%lx)\n",
 			    to_book3s(vcpu)->sdr1, ptegp);
 		for (i=0; i<16; i+=2) {
-			dprintk_pte("   %02d: 0x%x - 0x%x (0x%llx)\n",
+			dprintk_pte("   %02d: 0x%x - 0x%x (0x%x)\n",
 				    i, pteg[i], pteg[i+1], ptem);
 		}
 	}
@@ -281,8 +305,24 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 				      struct kvmppc_pte *pte, bool data)
 {
 	int r;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
 
 	pte->eaddr = eaddr;
+
+	/* Magic page override */
+	if (unlikely(mp_ea) &&
+	    unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
+	    !(vcpu->arch.shared->msr & MSR_PR)) {
+		pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
+		pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff);
+		pte->raddr &= KVM_PAM;
+		pte->may_execute = true;
+		pte->may_read = true;
+		pte->may_write = true;
+
+		return 0;
+	}
+
 	r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data);
 	if (r < 0)
 	       r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true);
@@ -295,30 +335,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum)
 {
-	return to_book3s(vcpu)->sr[srnum].raw;
+	return vcpu->arch.shared->sr[srnum];
 }
 
 static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
 					ulong value)
 {
-	struct kvmppc_sr *sre;
-
-	sre = &to_book3s(vcpu)->sr[srnum];
-
-	/* Flush any left-over shadows from the previous SR */
-
-	/* XXX Not necessary? */
-	/* kvmppc_mmu_pte_flush(vcpu, ((u64)sre->vsid) << 28, 0xf0000000ULL); */
-
-	/* And then put in the new SR */
-	sre->raw = value;
-	sre->vsid = (value & 0x0fffffff);
-	sre->valid = (value & 0x80000000) ? false : true;
-	sre->Ks = (value & 0x40000000) ? true : false;
-	sre->Kp = (value & 0x20000000) ? true : false;
-	sre->nx = (value & 0x10000000) ? true : false;
-
-	/* Map the new segment */
+	vcpu->arch.shared->sr[srnum] = value;
 	kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT);
 }
 
@@ -331,19 +354,19 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 					     u64 *vsid)
 {
 	ulong ea = esid << SID_SHIFT;
-	struct kvmppc_sr *sr;
+	u32 sr;
 	u64 gvsid = esid;
 
-	if (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
-		sr = find_sr(to_book3s(vcpu), ea);
-		if (sr->valid)
-			gvsid = sr->vsid;
+	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
+		sr = find_sr(vcpu, ea);
+		if (sr_valid(sr))
+			gvsid = sr_vsid(sr);
 	}
 
 	/* In case we only have one of MSR_IR or MSR_DR set, let's put
 	   that in the real-mode context (and hope RM doesn't access
 	   high memory) */
-	switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 	case 0:
 		*vsid = VSID_REAL | esid;
 		break;
@@ -354,8 +377,8 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		*vsid = VSID_REAL_DR | gvsid;
 		break;
 	case MSR_DR|MSR_IR:
-		if (sr->valid)
-			*vsid = sr->vsid;
+		if (sr_valid(sr))
+			*vsid = sr_vsid(sr);
 		else
 			*vsid = VSID_BAT | gvsid;
 		break;
@@ -363,7 +386,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		BUG();
 	}
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		*vsid |= VSID_PR;
 
 	return 0;
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 0b51ef872c1e..9fecbfbce773 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -19,7 +19,6 @@
  */
 
 #include <linux/kvm_host.h>
-#include <linux/hash.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -77,7 +76,14 @@ void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
-	return hash_64(gvsid, SID_MAP_BITS);
+	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
 }
 
 
@@ -86,7 +92,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 	struct kvmppc_sid_map *map;
 	u16 sid_map_mask;
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		gvsid |= VSID_PR;
 
 	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
@@ -147,8 +153,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 	struct hpte_cache *pte;
 
 	/* Get host physical address for gpa */
-	hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
-	if (kvm_is_error_hva(hpaddr)) {
+	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
+	if (is_error_pfn(hpaddr)) {
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
 				 orig_pte->eaddr);
 		return -EINVAL;
@@ -253,7 +259,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 	u16 sid_map_mask;
 	static int backwards_map = 0;
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		gvsid |= VSID_PR;
 
 	/* We might get collisions that trap in preceding order, so let's
@@ -269,18 +275,15 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 	backwards_map = !backwards_map;
 
 	/* Uh-oh ... out of mappings. Let's flush! */
-	if (vcpu_book3s->vsid_next >= vcpu_book3s->vsid_max) {
-		vcpu_book3s->vsid_next = vcpu_book3s->vsid_first;
+	if (vcpu_book3s->vsid_next >= VSID_POOL_SIZE) {
+		vcpu_book3s->vsid_next = 0;
 		memset(vcpu_book3s->sid_map, 0,
 		       sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);
 		kvmppc_mmu_pte_flush(vcpu, 0, 0);
 		kvmppc_mmu_flush_segments(vcpu);
 	}
-	map->host_vsid = vcpu_book3s->vsid_next;
-
-	/* Would have to be 111 to be completely aligned with the rest of
-	   Linux, but that is just way too little space! */
-	vcpu_book3s->vsid_next+=1;
+	map->host_vsid = vcpu_book3s->vsid_pool[vcpu_book3s->vsid_next];
+	vcpu_book3s->vsid_next++;
 
 	map->guest_vsid = gvsid;
 	map->valid = true;
@@ -327,40 +330,38 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
+	int i;
+
 	kvmppc_mmu_hpte_destroy(vcpu);
 	preempt_disable();
-	__destroy_context(to_book3s(vcpu)->context_id);
+	for (i = 0; i < SID_CONTEXTS; i++)
+		__destroy_context(to_book3s(vcpu)->context_id[i]);
 	preempt_enable();
 }
 
 /* From mm/mmu_context_hash32.c */
-#define CTX_TO_VSID(ctx) (((ctx) * (897 * 16)) & 0xffffff)
+#define CTX_TO_VSID(c, id)	((((c) * (897 * 16)) + (id * 0x111)) & 0xffffff)
 
 int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	int err;
 	ulong sdr1;
+	int i;
+	int j;
 
-	err = __init_new_context();
-	if (err < 0)
-		return -1;
-	vcpu3s->context_id = err;
-
-	vcpu3s->vsid_max = CTX_TO_VSID(vcpu3s->context_id + 1) - 1;
-	vcpu3s->vsid_first = CTX_TO_VSID(vcpu3s->context_id);
-
-#if 0 /* XXX still doesn't guarantee uniqueness */
-	/* We could collide with the Linux vsid space because the vsid
-	 * wraps around at 24 bits. We're safe if we do our own space
-	 * though, so let's always set the highest bit. */
+	for (i = 0; i < SID_CONTEXTS; i++) {
+		err = __init_new_context();
+		if (err < 0)
+			goto init_fail;
+		vcpu3s->context_id[i] = err;
 
-	vcpu3s->vsid_max |= 0x00800000;
-	vcpu3s->vsid_first |= 0x00800000;
-#endif
-	BUG_ON(vcpu3s->vsid_max < vcpu3s->vsid_first);
+		/* Remember context id for this combination */
+		for (j = 0; j < 16; j++)
+			vcpu3s->vsid_pool[(i * 16) + j] = CTX_TO_VSID(err, j);
+	}
 
-	vcpu3s->vsid_next = vcpu3s->vsid_first;
+	vcpu3s->vsid_next = 0;
 
 	/* Remember where the HTAB is */
 	asm ( "mfsdr1 %0" : "=r"(sdr1) );
@@ -370,4 +371,14 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 	kvmppc_mmu_hpte_init(vcpu);
 
 	return 0;
+
+init_fail:
+	for (j = 0; j < i; j++) {
+		if (!vcpu3s->context_id[j])
+			continue;
+
+		__destroy_context(to_book3s(vcpu)->context_id[j]);
+	}
+
+	return -1;
 }
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 4025ea26b3c1..d7889ef3211e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -163,6 +163,22 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	bool found = false;
 	bool perm_err = false;
 	int second = 0;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+
+	/* Magic page override */
+	if (unlikely(mp_ea) &&
+	    unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
+	    !(vcpu->arch.shared->msr & MSR_PR)) {
+		gpte->eaddr = eaddr;
+		gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
+		gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff);
+		gpte->raddr &= KVM_PAM;
+		gpte->may_execute = true;
+		gpte->may_read = true;
+		gpte->may_write = true;
+
+		return 0;
+	}
 
 	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr);
 	if (!slbe)
@@ -180,9 +196,9 @@ do_second:
 		goto no_page_found;
 	}
 
-	if ((vcpu->arch.msr & MSR_PR) && slbe->Kp)
+	if ((vcpu->arch.shared->msr & MSR_PR) && slbe->Kp)
 		key = 4;
-	else if (!(vcpu->arch.msr & MSR_PR) && slbe->Ks)
+	else if (!(vcpu->arch.shared->msr & MSR_PR) && slbe->Ks)
 		key = 4;
 
 	for (i=0; i<16; i+=2) {
@@ -381,7 +397,7 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
 	for (i = 1; i < vcpu_book3s->slb_nr; i++)
 		vcpu_book3s->slb[i].valid = false;
 
-	if (vcpu->arch.msr & MSR_IR) {
+	if (vcpu->arch.shared->msr & MSR_IR) {
 		kvmppc_mmu_flush_segments(vcpu);
 		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
 	}
@@ -445,14 +461,15 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 	ulong ea = esid << SID_SHIFT;
 	struct kvmppc_slb *slb;
 	u64 gvsid = esid;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
 
-	if (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 		slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
 		if (slb)
 			gvsid = slb->vsid;
 	}
 
-	switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 	case 0:
 		*vsid = VSID_REAL | esid;
 		break;
@@ -464,7 +481,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		break;
 	case MSR_DR|MSR_IR:
 		if (!slb)
-			return -ENOENT;
+			goto no_slb;
 
 		*vsid = gvsid;
 		break;
@@ -473,10 +490,21 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		break;
 	}
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		*vsid |= VSID_PR;
 
 	return 0;
+
+no_slb:
+	/* Catch magic page case */
+	if (unlikely(mp_ea) &&
+	    unlikely(esid == (mp_ea >> SID_SHIFT)) &&
+	    !(vcpu->arch.shared->msr & MSR_PR)) {
+		*vsid = VSID_REAL | esid;
+		return 0;
+	}
+
+	return -EINVAL;
 }
 
 static bool kvmppc_mmu_book3s_64_is_dcbz32(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 384179a5002b..fa2f08434ba5 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -20,7 +20,6 @@
  */
 
 #include <linux/kvm_host.h>
-#include <linux/hash.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -28,24 +27,9 @@
 #include <asm/machdep.h>
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
+#include "trace.h"
 
 #define PTE_SIZE 12
-#define VSID_ALL 0
-
-/* #define DEBUG_MMU */
-/* #define DEBUG_SLB */
-
-#ifdef DEBUG_MMU
-#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
-#else
-#define dprintk_mmu(a, ...) do { } while(0)
-#endif
-
-#ifdef DEBUG_SLB
-#define dprintk_slb(a, ...) printk(KERN_INFO a, __VA_ARGS__)
-#else
-#define dprintk_slb(a, ...) do { } while(0)
-#endif
 
 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
@@ -58,34 +42,39 @@ void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
-	return hash_64(gvsid, SID_MAP_BITS);
+	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
 }
 
+
 static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 {
 	struct kvmppc_sid_map *map;
 	u16 sid_map_mask;
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		gvsid |= VSID_PR;
 
 	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
 	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
-	if (map->guest_vsid == gvsid) {
-		dprintk_slb("SLB: Searching: 0x%llx -> 0x%llx\n",
-			    gvsid, map->host_vsid);
+	if (map->valid && (map->guest_vsid == gvsid)) {
+		trace_kvm_book3s_slb_found(gvsid, map->host_vsid);
 		return map;
 	}
 
 	map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
-	if (map->guest_vsid == gvsid) {
-		dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n",
-			    gvsid, map->host_vsid);
+	if (map->valid && (map->guest_vsid == gvsid)) {
+		trace_kvm_book3s_slb_found(gvsid, map->host_vsid);
 		return map;
 	}
 
-	dprintk_slb("SLB: Searching %d/%d: 0x%llx -> not found\n",
-		    sid_map_mask, SID_MAP_MASK - sid_map_mask, gvsid);
+	trace_kvm_book3s_slb_fail(sid_map_mask, gvsid);
 	return NULL;
 }
 
@@ -101,18 +90,13 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 	struct kvmppc_sid_map *map;
 
 	/* Get host physical address for gpa */
-	hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
-	if (kvm_is_error_hva(hpaddr)) {
+	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
+	if (is_error_pfn(hpaddr)) {
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
 		return -EINVAL;
 	}
 	hpaddr <<= PAGE_SHIFT;
-#if PAGE_SHIFT == 12
-#elif PAGE_SHIFT == 16
-	hpaddr |= orig_pte->raddr & 0xf000;
-#else
-#error Unknown page size
-#endif
+	hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK);
 
 	/* and write the mapping ea -> hpa into the pt */
 	vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
@@ -161,10 +145,7 @@ map_again:
 	} else {
 		struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu);
 
-		dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n",
-			    ((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
-			    (rflags & HPTE_R_N) ? '-' : 'x',
-			    orig_pte->eaddr, hpteg, va, orig_pte->vpage, hpaddr);
+		trace_kvm_book3s_64_mmu_map(rflags, hpteg, va, hpaddr, orig_pte);
 
 		/* The ppc_md code may give us a secondary entry even though we
 		   asked for a primary. Fix up. */
@@ -191,7 +172,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 	u16 sid_map_mask;
 	static int backwards_map = 0;
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		gvsid |= VSID_PR;
 
 	/* We might get collisions that trap in preceding order, so let's
@@ -219,8 +200,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 	map->guest_vsid = gvsid;
 	map->valid = true;
 
-	dprintk_slb("SLB: New mapping at %d: 0x%llx -> 0x%llx\n",
-		    sid_map_mask, gvsid, map->host_vsid);
+	trace_kvm_book3s_slb_map(sid_map_mask, gvsid, map->host_vsid);
 
 	return map;
 }
@@ -292,7 +272,7 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
 	to_svcpu(vcpu)->slb[slb_index].esid = slb_esid;
 	to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid;
 
-	dprintk_slb("slbmte %#llx, %#llx\n", slb_vsid, slb_esid);
+	trace_kvm_book3s_slbmte(slb_vsid, slb_esid);
 
 	return 0;
 }
@@ -306,7 +286,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
 	kvmppc_mmu_hpte_destroy(vcpu);
-	__destroy_context(to_book3s(vcpu)->context_id);
+	__destroy_context(to_book3s(vcpu)->context_id[0]);
 }
 
 int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
@@ -317,10 +297,10 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 	err = __init_new_context();
 	if (err < 0)
 		return -1;
-	vcpu3s->context_id = err;
+	vcpu3s->context_id[0] = err;
 
-	vcpu3s->vsid_max = ((vcpu3s->context_id + 1) << USER_ESID_BITS) - 1;
-	vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS;
+	vcpu3s->vsid_max = ((vcpu3s->context_id[0] + 1) << USER_ESID_BITS) - 1;
+	vcpu3s->vsid_first = vcpu3s->context_id[0] << USER_ESID_BITS;
 	vcpu3s->vsid_next = vcpu3s->vsid_first;
 
 	kvmppc_mmu_hpte_init(vcpu);
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index c85f906038ce..466846557089 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -73,8 +73,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		switch (get_xop(inst)) {
 		case OP_19_XOP_RFID:
 		case OP_19_XOP_RFI:
-			kvmppc_set_pc(vcpu, vcpu->arch.srr0);
-			kvmppc_set_msr(vcpu, vcpu->arch.srr1);
+			kvmppc_set_pc(vcpu, vcpu->arch.shared->srr0);
+			kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
 			*advance = 0;
 			break;
 
@@ -86,14 +86,15 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	case 31:
 		switch (get_xop(inst)) {
 		case OP_31_XOP_MFMSR:
-			kvmppc_set_gpr(vcpu, get_rt(inst), vcpu->arch.msr);
+			kvmppc_set_gpr(vcpu, get_rt(inst),
+				       vcpu->arch.shared->msr);
 			break;
 		case OP_31_XOP_MTMSRD:
 		{
 			ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst));
 			if (inst & 0x10000) {
-				vcpu->arch.msr &= ~(MSR_RI | MSR_EE);
-				vcpu->arch.msr |= rs & (MSR_RI | MSR_EE);
+				vcpu->arch.shared->msr &= ~(MSR_RI | MSR_EE);
+				vcpu->arch.shared->msr |= rs & (MSR_RI | MSR_EE);
 			} else
 				kvmppc_set_msr(vcpu, rs);
 			break;
@@ -204,14 +205,14 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				ra = kvmppc_get_gpr(vcpu, get_ra(inst));
 
 			addr = (ra + rb) & ~31ULL;
-			if (!(vcpu->arch.msr & MSR_SF))
+			if (!(vcpu->arch.shared->msr & MSR_SF))
 				addr &= 0xffffffff;
 			vaddr = addr;
 
 			r = kvmppc_st(vcpu, &addr, 32, zeros, true);
 			if ((r == -ENOENT) || (r == -EPERM)) {
 				*advance = 0;
-				vcpu->arch.dear = vaddr;
+				vcpu->arch.shared->dar = vaddr;
 				to_svcpu(vcpu)->fault_dar = vaddr;
 
 				dsisr = DSISR_ISSTORE;
@@ -220,7 +221,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				else if (r == -EPERM)
 					dsisr |= DSISR_PROTFAULT;
 
-				to_book3s(vcpu)->dsisr = dsisr;
+				vcpu->arch.shared->dsisr = dsisr;
 				to_svcpu(vcpu)->fault_dsisr = dsisr;
 
 				kvmppc_book3s_queue_irqprio(vcpu,
@@ -263,7 +264,7 @@ void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper,
 	}
 }
 
-static u32 kvmppc_read_bat(struct kvm_vcpu *vcpu, int sprn)
+static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn)
 {
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	struct kvmppc_bat *bat;
@@ -285,35 +286,7 @@ static u32 kvmppc_read_bat(struct kvm_vcpu *vcpu, int sprn)
 		BUG();
 	}
 
-	if (sprn % 2)
-		return bat->raw >> 32;
-	else
-		return bat->raw;
-}
-
-static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val)
-{
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
-	struct kvmppc_bat *bat;
-
-	switch (sprn) {
-	case SPRN_IBAT0U ... SPRN_IBAT3L:
-		bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2];
-		break;
-	case SPRN_IBAT4U ... SPRN_IBAT7L:
-		bat = &vcpu_book3s->ibat[4 + ((sprn - SPRN_IBAT4U) / 2)];
-		break;
-	case SPRN_DBAT0U ... SPRN_DBAT3L:
-		bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2];
-		break;
-	case SPRN_DBAT4U ... SPRN_DBAT7L:
-		bat = &vcpu_book3s->dbat[4 + ((sprn - SPRN_DBAT4U) / 2)];
-		break;
-	default:
-		BUG();
-	}
-
-	kvmppc_set_bat(vcpu, bat, !(sprn % 2), val);
+	return bat;
 }
 
 int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
@@ -326,10 +299,10 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		to_book3s(vcpu)->sdr1 = spr_val;
 		break;
 	case SPRN_DSISR:
-		to_book3s(vcpu)->dsisr = spr_val;
+		vcpu->arch.shared->dsisr = spr_val;
 		break;
 	case SPRN_DAR:
-		vcpu->arch.dear = spr_val;
+		vcpu->arch.shared->dar = spr_val;
 		break;
 	case SPRN_HIOR:
 		to_book3s(vcpu)->hior = spr_val;
@@ -338,12 +311,16 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	case SPRN_IBAT4U ... SPRN_IBAT7L:
 	case SPRN_DBAT0U ... SPRN_DBAT3L:
 	case SPRN_DBAT4U ... SPRN_DBAT7L:
-		kvmppc_write_bat(vcpu, sprn, (u32)spr_val);
+	{
+		struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
+
+		kvmppc_set_bat(vcpu, bat, !(sprn % 2), (u32)spr_val);
 		/* BAT writes happen so rarely that we're ok to flush
 		 * everything here */
 		kvmppc_mmu_pte_flush(vcpu, 0, 0);
 		kvmppc_mmu_flush_segments(vcpu);
 		break;
+	}
 	case SPRN_HID0:
 		to_book3s(vcpu)->hid[0] = spr_val;
 		break;
@@ -433,16 +410,24 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_IBAT4U ... SPRN_IBAT7L:
 	case SPRN_DBAT0U ... SPRN_DBAT3L:
 	case SPRN_DBAT4U ... SPRN_DBAT7L:
-		kvmppc_set_gpr(vcpu, rt, kvmppc_read_bat(vcpu, sprn));
+	{
+		struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
+
+		if (sprn % 2)
+			kvmppc_set_gpr(vcpu, rt, bat->raw >> 32);
+		else
+			kvmppc_set_gpr(vcpu, rt, bat->raw);
+
 		break;
+	}
 	case SPRN_SDR1:
 		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
 		break;
 	case SPRN_DSISR:
-		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->dsisr);
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dsisr);
 		break;
 	case SPRN_DAR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear);
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar);
 		break;
 	case SPRN_HIOR:
 		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior);
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 4868d4a7ebc5..79751d8dd131 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -21,6 +21,7 @@
 #include <linux/kvm_host.h>
 #include <linux/hash.h>
 #include <linux/slab.h>
+#include "trace.h"
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -30,14 +31,6 @@
 
 #define PTE_SIZE	12
 
-/* #define DEBUG_MMU */
-
-#ifdef DEBUG_MMU
-#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
-#else
-#define dprintk_mmu(a, ...) do { } while(0)
-#endif
-
 static struct kmem_cache *hpte_cache;
 
 static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
@@ -45,6 +38,12 @@ static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
 	return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
 }
 
+static inline u64 kvmppc_mmu_hash_pte_long(u64 eaddr)
+{
+	return hash_64((eaddr & 0x0ffff000) >> PTE_SIZE,
+		       HPTEG_HASH_BITS_PTE_LONG);
+}
+
 static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
 {
 	return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
@@ -60,77 +59,128 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
 	u64 index;
 
+	trace_kvm_book3s_mmu_map(pte);
+
+	spin_lock(&vcpu->arch.mmu_lock);
+
 	/* Add to ePTE list */
 	index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
-	hlist_add_head(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
+	hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
+
+	/* Add to ePTE_long list */
+	index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr);
+	hlist_add_head_rcu(&pte->list_pte_long,
+			   &vcpu->arch.hpte_hash_pte_long[index]);
 
 	/* Add to vPTE list */
 	index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
-	hlist_add_head(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
+	hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
 
 	/* Add to vPTE_long list */
 	index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
-	hlist_add_head(&pte->list_vpte_long,
-		       &vcpu->arch.hpte_hash_vpte_long[index]);
+	hlist_add_head_rcu(&pte->list_vpte_long,
+			   &vcpu->arch.hpte_hash_vpte_long[index]);
+
+	spin_unlock(&vcpu->arch.mmu_lock);
+}
+
+static void free_pte_rcu(struct rcu_head *head)
+{
+	struct hpte_cache *pte = container_of(head, struct hpte_cache, rcu_head);
+	kmem_cache_free(hpte_cache, pte);
 }
 
 static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
-	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
-		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+	trace_kvm_book3s_mmu_invalidate(pte);
 
 	/* Different for 32 and 64 bit */
 	kvmppc_mmu_invalidate_pte(vcpu, pte);
 
+	spin_lock(&vcpu->arch.mmu_lock);
+
+	/* pte already invalidated in between? */
+	if (hlist_unhashed(&pte->list_pte)) {
+		spin_unlock(&vcpu->arch.mmu_lock);
+		return;
+	}
+
+	hlist_del_init_rcu(&pte->list_pte);
+	hlist_del_init_rcu(&pte->list_pte_long);
+	hlist_del_init_rcu(&pte->list_vpte);
+	hlist_del_init_rcu(&pte->list_vpte_long);
+
 	if (pte->pte.may_write)
 		kvm_release_pfn_dirty(pte->pfn);
 	else
 		kvm_release_pfn_clean(pte->pfn);
 
-	hlist_del(&pte->list_pte);
-	hlist_del(&pte->list_vpte);
-	hlist_del(&pte->list_vpte_long);
+	spin_unlock(&vcpu->arch.mmu_lock);
 
 	vcpu->arch.hpte_cache_count--;
-	kmem_cache_free(hpte_cache, pte);
+	call_rcu(&pte->rcu_head, free_pte_rcu);
 }
 
 static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
 {
 	struct hpte_cache *pte;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	int i;
 
+	rcu_read_lock();
+
 	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
 		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
 
-		hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+		hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
 			invalidate_pte(vcpu, pte);
 	}
+
+	rcu_read_unlock();
 }
 
 static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
 {
 	struct hlist_head *list;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	struct hpte_cache *pte;
 
 	/* Find the list of entries in the map */
 	list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
 
+	rcu_read_lock();
+
 	/* Check the list for matching entries and invalidate */
-	hlist_for_each_entry_safe(pte, node, tmp, list, list_pte)
+	hlist_for_each_entry_rcu(pte, node, list, list_pte)
 		if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
 			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
 }
 
-void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)
 {
-	u64 i;
+	struct hlist_head *list;
+	struct hlist_node *node;
+	struct hpte_cache *pte;
 
-	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
-		    vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
+	/* Find the list of entries in the map */
+	list = &vcpu->arch.hpte_hash_pte_long[
+			kvmppc_mmu_hash_pte_long(guest_ea)];
 
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, node, list, list_pte_long)
+		if ((pte->pte.eaddr & 0x0ffff000UL) == guest_ea)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+{
+	trace_kvm_book3s_mmu_flush("", vcpu, guest_ea, ea_mask);
 	guest_ea &= ea_mask;
 
 	switch (ea_mask) {
@@ -138,9 +188,7 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
 		kvmppc_mmu_pte_flush_page(vcpu, guest_ea);
 		break;
 	case 0x0ffff000:
-		/* 32-bit flush w/o segment, go through all possible segments */
-		for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
-			kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
+		kvmppc_mmu_pte_flush_long(vcpu, guest_ea);
 		break;
 	case 0:
 		/* Doing a complete flush -> start from scratch */
@@ -156,39 +204,46 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
 static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
 	struct hlist_head *list;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	struct hpte_cache *pte;
 	u64 vp_mask = 0xfffffffffULL;
 
 	list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
 
+	rcu_read_lock();
+
 	/* Check the list for matching entries and invalidate */
-	hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte)
+	hlist_for_each_entry_rcu(pte, node, list, list_vpte)
 		if ((pte->pte.vpage & vp_mask) == guest_vp)
 			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
 }
 
 /* Flush with mask 0xffffff000 */
 static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
 	struct hlist_head *list;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	struct hpte_cache *pte;
 	u64 vp_mask = 0xffffff000ULL;
 
 	list = &vcpu->arch.hpte_hash_vpte_long[
 		kvmppc_mmu_hash_vpte_long(guest_vp)];
 
+	rcu_read_lock();
+
 	/* Check the list for matching entries and invalidate */
-	hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+	hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
 		if ((pte->pte.vpage & vp_mask) == guest_vp)
 			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
 }
 
 void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
 {
-	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
-		    vcpu->arch.hpte_cache_count, guest_vp, vp_mask);
+	trace_kvm_book3s_mmu_flush("v", vcpu, guest_vp, vp_mask);
 	guest_vp &= vp_mask;
 
 	switch(vp_mask) {
@@ -206,21 +261,24 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
 
 void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 {
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	struct hpte_cache *pte;
 	int i;
 
-	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
-		    vcpu->arch.hpte_cache_count, pa_start, pa_end);
+	trace_kvm_book3s_mmu_flush("p", vcpu, pa_start, pa_end);
+
+	rcu_read_lock();
 
 	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
 		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
 
-		hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+		hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
 			if ((pte->pte.raddr >= pa_start) &&
 			    (pte->pte.raddr < pa_end))
 				invalidate_pte(vcpu, pte);
 	}
+
+	rcu_read_unlock();
 }
 
 struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
@@ -254,11 +312,15 @@ int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
 	/* init hpte lookup hashes */
 	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
 				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long));
 	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
 				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
 	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
 				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
 
+	spin_lock_init(&vcpu->arch.mmu_lock);
+
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index 35a701f3ece4..7b0ee96c1bed 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -165,14 +165,15 @@ static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
 static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
 {
 	u64 dsisr;
+	struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared;
 
-	vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 33, 36, 0);
-	vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 42, 47, 0);
-	vcpu->arch.dear = eaddr;
+	shared->msr = kvmppc_set_field(shared->msr, 33, 36, 0);
+	shared->msr = kvmppc_set_field(shared->msr, 42, 47, 0);
+	shared->dar = eaddr;
 	/* Page Fault */
 	dsisr = kvmppc_set_field(0, 33, 33, 1);
 	if (is_store)
-		to_book3s(vcpu)->dsisr = kvmppc_set_field(dsisr, 38, 38, 1);
+		shared->dsisr = kvmppc_set_field(dsisr, 38, 38, 1);
 	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
 }
 
@@ -658,7 +659,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	if (!kvmppc_inst_is_paired_single(vcpu, inst))
 		return EMULATE_FAIL;
 
-	if (!(vcpu->arch.msr & MSR_FP)) {
+	if (!(vcpu->arch.shared->msr & MSR_FP)) {
 		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL);
 		return EMULATE_AGAIN;
 	}
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 506d5c316c96..2b9c9088d00e 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -202,8 +202,25 @@ _GLOBAL(kvmppc_rmcall)
 
 #if defined(CONFIG_PPC_BOOK3S_32)
 #define STACK_LR	INT_FRAME_SIZE+4
+
+/* load_up_xxx have to run with MSR_DR=0 on Book3S_32 */
+#define MSR_EXT_START						\
+	PPC_STL	r20, _NIP(r1);					\
+	mfmsr	r20;						\
+	LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE);			\
+	andc	r3,r20,r3;		/* Disable DR,EE */	\
+	mtmsr	r3;						\
+	sync
+
+#define MSR_EXT_END						\
+	mtmsr	r20;			/* Enable DR,EE */	\
+	sync;							\
+	PPC_LL	r20, _NIP(r1)
+
 #elif defined(CONFIG_PPC_BOOK3S_64)
 #define STACK_LR	_LINK
+#define MSR_EXT_START
+#define MSR_EXT_END
 #endif
 
 /*
@@ -215,19 +232,12 @@ _GLOBAL(kvmppc_load_up_ ## what);				\
 	PPC_STLU r1, -INT_FRAME_SIZE(r1);			\
 	mflr	r3;						\
 	PPC_STL	r3, STACK_LR(r1);				\
-	PPC_STL	r20, _NIP(r1);					\
-	mfmsr	r20;						\
-	LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE);			\
-	andc	r3,r20,r3;		/* Disable DR,EE */	\
-	mtmsr	r3;						\
-	sync;							\
+	MSR_EXT_START;						\
 								\
 	bl	FUNC(load_up_ ## what);				\
 								\
-	mtmsr	r20;			/* Enable DR,EE */	\
-	sync;							\
+	MSR_EXT_END;						\
 	PPC_LL	r3, STACK_LR(r1);				\
-	PPC_LL	r20, _NIP(r1);					\
 	mtlr	r3;						\
 	addi	r1, r1, INT_FRAME_SIZE;				\
 	blr
@@ -242,10 +252,10 @@ define_load_up(vsx)
 
 .global kvmppc_trampoline_lowmem
 kvmppc_trampoline_lowmem:
-	.long kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START
+	PPC_LONG kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START
 
 .global kvmppc_trampoline_enter
 kvmppc_trampoline_enter:
-	.long kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START
+	PPC_LONG kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START
 
 #include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 8d4e35f5372c..77575d08c818 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -62,9 +62,10 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 {
 	int i;
 
-	printk("pc:   %08lx msr:  %08lx\n", vcpu->arch.pc, vcpu->arch.msr);
+	printk("pc:   %08lx msr:  %08llx\n", vcpu->arch.pc, vcpu->arch.shared->msr);
 	printk("lr:   %08lx ctr:  %08lx\n", vcpu->arch.lr, vcpu->arch.ctr);
-	printk("srr0: %08lx srr1: %08lx\n", vcpu->arch.srr0, vcpu->arch.srr1);
+	printk("srr0: %08llx srr1: %08llx\n", vcpu->arch.shared->srr0,
+					    vcpu->arch.shared->srr1);
 
 	printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions);
 
@@ -130,13 +131,19 @@ void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                 struct kvm_interrupt *irq)
 {
-	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_EXTERNAL);
+	unsigned int prio = BOOKE_IRQPRIO_EXTERNAL;
+
+	if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
+		prio = BOOKE_IRQPRIO_EXTERNAL_LEVEL;
+
+	kvmppc_booke_queue_irqprio(vcpu, prio);
 }
 
 void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                   struct kvm_interrupt *irq)
 {
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
+	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
 /* Deliver the interrupt of the corresponding priority, if possible. */
@@ -146,6 +153,26 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	int allowed = 0;
 	ulong uninitialized_var(msr_mask);
 	bool update_esr = false, update_dear = false;
+	ulong crit_raw = vcpu->arch.shared->critical;
+	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
+	bool crit;
+	bool keep_irq = false;
+
+	/* Truncate crit indicators in 32 bit mode */
+	if (!(vcpu->arch.shared->msr & MSR_SF)) {
+		crit_raw &= 0xffffffff;
+		crit_r1 &= 0xffffffff;
+	}
+
+	/* Critical section when crit == r1 */
+	crit = (crit_raw == crit_r1);
+	/* ... and we're in supervisor mode */
+	crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+
+	if (priority == BOOKE_IRQPRIO_EXTERNAL_LEVEL) {
+		priority = BOOKE_IRQPRIO_EXTERNAL;
+		keep_irq = true;
+	}
 
 	switch (priority) {
 	case BOOKE_IRQPRIO_DTLB_MISS:
@@ -169,36 +196,38 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		break;
 	case BOOKE_IRQPRIO_CRITICAL:
 	case BOOKE_IRQPRIO_WATCHDOG:
-		allowed = vcpu->arch.msr & MSR_CE;
+		allowed = vcpu->arch.shared->msr & MSR_CE;
 		msr_mask = MSR_ME;
 		break;
 	case BOOKE_IRQPRIO_MACHINE_CHECK:
-		allowed = vcpu->arch.msr & MSR_ME;
+		allowed = vcpu->arch.shared->msr & MSR_ME;
 		msr_mask = 0;
 		break;
 	case BOOKE_IRQPRIO_EXTERNAL:
 	case BOOKE_IRQPRIO_DECREMENTER:
 	case BOOKE_IRQPRIO_FIT:
-		allowed = vcpu->arch.msr & MSR_EE;
+		allowed = vcpu->arch.shared->msr & MSR_EE;
+		allowed = allowed && !crit;
 		msr_mask = MSR_CE|MSR_ME|MSR_DE;
 		break;
 	case BOOKE_IRQPRIO_DEBUG:
-		allowed = vcpu->arch.msr & MSR_DE;
+		allowed = vcpu->arch.shared->msr & MSR_DE;
 		msr_mask = MSR_ME;
 		break;
 	}
 
 	if (allowed) {
-		vcpu->arch.srr0 = vcpu->arch.pc;
-		vcpu->arch.srr1 = vcpu->arch.msr;
+		vcpu->arch.shared->srr0 = vcpu->arch.pc;
+		vcpu->arch.shared->srr1 = vcpu->arch.shared->msr;
 		vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
 		if (update_esr == true)
 			vcpu->arch.esr = vcpu->arch.queued_esr;
 		if (update_dear == true)
-			vcpu->arch.dear = vcpu->arch.queued_dear;
-		kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask);
+			vcpu->arch.shared->dar = vcpu->arch.queued_dear;
+		kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
 
-		clear_bit(priority, &vcpu->arch.pending_exceptions);
+		if (!keep_irq)
+			clear_bit(priority, &vcpu->arch.pending_exceptions);
 	}
 
 	return allowed;
@@ -208,6 +237,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
+	unsigned long old_pending = vcpu->arch.pending_exceptions;
 	unsigned int priority;
 
 	priority = __ffs(*pending);
@@ -219,6 +249,12 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 		                         BITS_PER_BYTE * sizeof(*pending),
 		                         priority + 1);
 	}
+
+	/* Tell the guest about our interrupt status */
+	if (*pending)
+		vcpu->arch.shared->int_pending = 1;
+	else if (old_pending)
+		vcpu->arch.shared->int_pending = 0;
 }
 
 /**
@@ -265,7 +301,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 
 	case BOOKE_INTERRUPT_PROGRAM:
-		if (vcpu->arch.msr & MSR_PR) {
+		if (vcpu->arch.shared->msr & MSR_PR) {
 			/* Program traps generated by user-level software must be handled
 			 * by the guest kernel. */
 			kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
@@ -337,7 +373,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 
 	case BOOKE_INTERRUPT_SYSCALL:
-		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
+		if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+			/* KVM PV hypercalls */
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+			r = RESUME_GUEST;
+		} else {
+			/* Guest syscalls */
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
+		}
 		kvmppc_account_exit(vcpu, SYSCALL_EXITS);
 		r = RESUME_GUEST;
 		break;
@@ -466,15 +510,19 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
+	int i;
+
 	vcpu->arch.pc = 0;
-	vcpu->arch.msr = 0;
+	vcpu->arch.shared->msr = 0;
 	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
 
 	vcpu->arch.shadow_pid = 1;
 
-	/* Eye-catching number so we know if the guest takes an interrupt
-	 * before it's programmed its own IVPR. */
+	/* Eye-catching numbers so we know if the guest takes an interrupt
+	 * before it's programmed its own IVPR/IVORs. */
 	vcpu->arch.ivpr = 0x55550000;
+	for (i = 0; i < BOOKE_IRQPRIO_MAX; i++)
+		vcpu->arch.ivor[i] = 0x7700 | i * 4;
 
 	kvmppc_init_timing_stats(vcpu);
 
@@ -490,14 +538,14 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->ctr = vcpu->arch.ctr;
 	regs->lr = vcpu->arch.lr;
 	regs->xer = kvmppc_get_xer(vcpu);
-	regs->msr = vcpu->arch.msr;
-	regs->srr0 = vcpu->arch.srr0;
-	regs->srr1 = vcpu->arch.srr1;
+	regs->msr = vcpu->arch.shared->msr;
+	regs->srr0 = vcpu->arch.shared->srr0;
+	regs->srr1 = vcpu->arch.shared->srr1;
 	regs->pid = vcpu->arch.pid;
-	regs->sprg0 = vcpu->arch.sprg0;
-	regs->sprg1 = vcpu->arch.sprg1;
-	regs->sprg2 = vcpu->arch.sprg2;
-	regs->sprg3 = vcpu->arch.sprg3;
+	regs->sprg0 = vcpu->arch.shared->sprg0;
+	regs->sprg1 = vcpu->arch.shared->sprg1;
+	regs->sprg2 = vcpu->arch.shared->sprg2;
+	regs->sprg3 = vcpu->arch.shared->sprg3;
 	regs->sprg5 = vcpu->arch.sprg4;
 	regs->sprg6 = vcpu->arch.sprg5;
 	regs->sprg7 = vcpu->arch.sprg6;
@@ -518,12 +566,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.lr = regs->lr;
 	kvmppc_set_xer(vcpu, regs->xer);
 	kvmppc_set_msr(vcpu, regs->msr);
-	vcpu->arch.srr0 = regs->srr0;
-	vcpu->arch.srr1 = regs->srr1;
-	vcpu->arch.sprg0 = regs->sprg0;
-	vcpu->arch.sprg1 = regs->sprg1;
-	vcpu->arch.sprg2 = regs->sprg2;
-	vcpu->arch.sprg3 = regs->sprg3;
+	vcpu->arch.shared->srr0 = regs->srr0;
+	vcpu->arch.shared->srr1 = regs->srr1;
+	vcpu->arch.shared->sprg0 = regs->sprg0;
+	vcpu->arch.shared->sprg1 = regs->sprg1;
+	vcpu->arch.shared->sprg2 = regs->sprg2;
+	vcpu->arch.shared->sprg3 = regs->sprg3;
 	vcpu->arch.sprg5 = regs->sprg4;
 	vcpu->arch.sprg6 = regs->sprg5;
 	vcpu->arch.sprg7 = regs->sprg6;
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index d59bcca1f9d8..492bb7030358 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -46,7 +46,9 @@
 #define BOOKE_IRQPRIO_FIT 17
 #define BOOKE_IRQPRIO_DECREMENTER 18
 #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19
-#define BOOKE_IRQPRIO_MAX 19
+/* Internal pseudo-irqprio for level triggered externals */
+#define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20
+#define BOOKE_IRQPRIO_MAX 20
 
 extern unsigned long kvmppc_booke_handlers;
 
@@ -54,12 +56,12 @@ extern unsigned long kvmppc_booke_handlers;
  * changing. */
 static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 {
-	if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR))
+	if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR))
 		kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
 
-	vcpu->arch.msr = new_msr;
+	vcpu->arch.shared->msr = new_msr;
 
-	if (vcpu->arch.msr & MSR_WE) {
+	if (vcpu->arch.shared->msr & MSR_WE) {
 		kvm_vcpu_block(vcpu);
 		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
 	};
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index cbc790ee1928..1260f5f24c0c 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -31,8 +31,8 @@
 
 static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.pc = vcpu->arch.srr0;
-	kvmppc_set_msr(vcpu, vcpu->arch.srr1);
+	vcpu->arch.pc = vcpu->arch.shared->srr0;
+	kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
 }
 
 int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -62,7 +62,7 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 		case OP_31_XOP_MFMSR:
 			rt = get_rt(inst);
-			kvmppc_set_gpr(vcpu, rt, vcpu->arch.msr);
+			kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr);
 			kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
 			break;
 
@@ -74,13 +74,13 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 		case OP_31_XOP_WRTEE:
 			rs = get_rs(inst);
-			vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+			vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
 					| (kvmppc_get_gpr(vcpu, rs) & MSR_EE);
 			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
 			break;
 
 		case OP_31_XOP_WRTEEI:
-			vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+			vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
 							 | (inst & MSR_EE);
 			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
 			break;
@@ -105,7 +105,7 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 
 	switch (sprn) {
 	case SPRN_DEAR:
-		vcpu->arch.dear = spr_val; break;
+		vcpu->arch.shared->dar = spr_val; break;
 	case SPRN_ESR:
 		vcpu->arch.esr = spr_val; break;
 	case SPRN_DBCR0:
@@ -200,7 +200,7 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_IVPR:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break;
 	case SPRN_DEAR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break;
 	case SPRN_ESR:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break;
 	case SPRN_DBCR0:
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 380a78cf484d..049846911ce4 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -415,7 +415,8 @@ lightweight_exit:
 	lwz	r8, VCPU_GPR(r8)(r4)
 	lwz	r3, VCPU_PC(r4)
 	mtsrr0	r3
-	lwz	r3, VCPU_MSR(r4)
+	lwz	r3, VCPU_SHARED(r4)
+	lwz	r3, VCPU_SHARED_MSR(r3)
 	oris	r3, r3, KVMPPC_MSR_MASK@h
 	ori	r3, r3, KVMPPC_MSR_MASK@l
 	mtsrr1	r3
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index e8a00b0c4449..71750f2dd5d3 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -117,8 +117,14 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto uninit_vcpu;
 
+	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (!vcpu->arch.shared)
+		goto uninit_tlb;
+
 	return vcpu;
 
+uninit_tlb:
+	kvmppc_e500_tlb_uninit(vcpu_e500);
 uninit_vcpu:
 	kvm_vcpu_uninit(vcpu);
 free_vcpu:
@@ -131,6 +137,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
+	free_page((unsigned long)vcpu->arch.shared);
 	kvmppc_e500_tlb_uninit(vcpu_e500);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 21011e12caeb..d6d6d47a75a9 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -226,8 +226,7 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 	kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
 	stlbe->mas1 = 0;
-	trace_kvm_stlb_inval(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
-			     stlbe->mas3, stlbe->mas7);
+	trace_kvm_stlb_inval(index_of(tlbsel, esel));
 }
 
 static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -298,7 +297,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	/* Get reference to new page. */
 	new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn);
 	if (is_error_page(new_page)) {
-		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
+		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n",
+				(long)gfn);
 		kvm_release_page_clean(new_page);
 		return;
 	}
@@ -314,10 +314,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 		| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
 	stlbe->mas2 = (gvaddr & MAS2_EPN)
 		| e500_shadow_mas2_attrib(gtlbe->mas2,
-				vcpu_e500->vcpu.arch.msr & MSR_PR);
+				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
 	stlbe->mas3 = (hpaddr & MAS3_RPN)
 		| e500_shadow_mas3_attrib(gtlbe->mas3,
-				vcpu_e500->vcpu.arch.msr & MSR_PR);
+				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
 	stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN;
 
 	trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
@@ -576,28 +576,28 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 
 int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_IS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
 
 	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
 }
 
 int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_DS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
 
 	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
 }
 
 void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_IS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
 
 	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as);
 }
 
 void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_DS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
 
 	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as);
 }
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
index d28e3010a5e2..458946b4775d 100644
--- a/arch/powerpc/kvm/e500_tlb.h
+++ b/arch/powerpc/kvm/e500_tlb.h
@@ -171,7 +171,7 @@ static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 
 	/* Does it match current guest AS? */
 	/* XXX what about IS != DS? */
-	if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
+	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
 		return 0;
 
 	gpa = get_tlb_raddr(tlbe);
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index b83ba581fd8e..c64fd2909bb2 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -242,9 +242,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
 			switch (sprn) {
 			case SPRN_SRR0:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr0); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr0);
+				break;
 			case SPRN_SRR1:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr1); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr1);
+				break;
 			case SPRN_PVR:
 				kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break;
 			case SPRN_PIR:
@@ -261,13 +263,17 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 				kvmppc_set_gpr(vcpu, rt, get_tb()); break;
 
 			case SPRN_SPRG0:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg0); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg0);
+				break;
 			case SPRN_SPRG1:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg1); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg1);
+				break;
 			case SPRN_SPRG2:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg2); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg2);
+				break;
 			case SPRN_SPRG3:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg3); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg3);
+				break;
 			/* Note: SPRG4-7 are user-readable, so we don't get
 			 * a trap. */
 
@@ -320,9 +326,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			rs = get_rs(inst);
 			switch (sprn) {
 			case SPRN_SRR0:
-				vcpu->arch.srr0 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->srr0 = kvmppc_get_gpr(vcpu, rs);
+				break;
 			case SPRN_SRR1:
-				vcpu->arch.srr1 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->srr1 = kvmppc_get_gpr(vcpu, rs);
+				break;
 
 			/* XXX We need to context-switch the timebase for
 			 * watchdog and FIT. */
@@ -337,13 +345,17 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 				break;
 
 			case SPRN_SPRG0:
-				vcpu->arch.sprg0 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->sprg0 = kvmppc_get_gpr(vcpu, rs);
+				break;
 			case SPRN_SPRG1:
-				vcpu->arch.sprg1 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->sprg1 = kvmppc_get_gpr(vcpu, rs);
+				break;
 			case SPRN_SPRG2:
-				vcpu->arch.sprg2 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->sprg2 = kvmppc_get_gpr(vcpu, rs);
+				break;
 			case SPRN_SPRG3:
-				vcpu->arch.sprg3 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->sprg3 = kvmppc_get_gpr(vcpu, rs);
+				break;
 
 			default:
 				emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 72a4ad86ee91..2f87a1627f6c 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,9 +38,56 @@
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
+	return !(v->arch.shared->msr & MSR_WE) ||
+	       !!(v->arch.pending_exceptions);
 }
 
+int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
+{
+	int nr = kvmppc_get_gpr(vcpu, 11);
+	int r;
+	unsigned long __maybe_unused param1 = kvmppc_get_gpr(vcpu, 3);
+	unsigned long __maybe_unused param2 = kvmppc_get_gpr(vcpu, 4);
+	unsigned long __maybe_unused param3 = kvmppc_get_gpr(vcpu, 5);
+	unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6);
+	unsigned long r2 = 0;
+
+	if (!(vcpu->arch.shared->msr & MSR_SF)) {
+		/* 32 bit mode */
+		param1 &= 0xffffffff;
+		param2 &= 0xffffffff;
+		param3 &= 0xffffffff;
+		param4 &= 0xffffffff;
+	}
+
+	switch (nr) {
+	case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE:
+	{
+		vcpu->arch.magic_page_pa = param1;
+		vcpu->arch.magic_page_ea = param2;
+
+		r2 = KVM_MAGIC_FEAT_SR;
+
+		r = HC_EV_SUCCESS;
+		break;
+	}
+	case HC_VENDOR_KVM | KVM_HC_FEATURES:
+		r = HC_EV_SUCCESS;
+#if defined(CONFIG_PPC_BOOK3S) /* XXX Missing magic page on BookE */
+		r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
+#endif
+
+		/* Second return value is in r4 */
+		break;
+	default:
+		r = HC_EV_UNIMPLEMENTED;
+		break;
+	}
+
+	kvmppc_set_gpr(vcpu, 4, r2);
+
+	return r;
+}
 
 int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
@@ -145,8 +192,10 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_SEGSTATE:
 	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_UNSET_IRQ:
+	case KVM_CAP_PPC_IRQ_LEVEL:
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_PPC_OSI:
+	case KVM_CAP_PPC_GET_PVINFO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -534,16 +583,53 @@ out:
 	return r;
 }
 
+static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
+{
+	u32 inst_lis = 0x3c000000;
+	u32 inst_ori = 0x60000000;
+	u32 inst_nop = 0x60000000;
+	u32 inst_sc = 0x44000002;
+	u32 inst_imm_mask = 0xffff;
+
+	/*
+	 * The hypercall to get into KVM from within guest context is as
+	 * follows:
+	 *
+	 *    lis r0, r0, KVM_SC_MAGIC_R0@h
+	 *    ori r0, KVM_SC_MAGIC_R0@l
+	 *    sc
+	 *    nop
+	 */
+	pvinfo->hcall[0] = inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask);
+	pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask);
+	pvinfo->hcall[2] = inst_sc;
+	pvinfo->hcall[3] = inst_nop;
+
+	return 0;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
 {
+	void __user *argp = (void __user *)arg;
 	long r;
 
 	switch (ioctl) {
+	case KVM_PPC_GET_PVINFO: {
+		struct kvm_ppc_pvinfo pvinfo;
+		r = kvm_vm_ioctl_get_pvinfo(&pvinfo);
+		if (copy_to_user(argp, &pvinfo, sizeof(pvinfo))) {
+			r = -EFAULT;
+			goto out;
+		}
+
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
 
+out:
 	return r;
 }
 
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index a8e840018052..3aca1b042b8c 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -98,6 +98,245 @@ TRACE_EVENT(kvm_gtlb_write,
 		__entry->word1, __entry->word2)
 );
 
+
+/*************************************************************************
+ *                         Book3S trace points                           *
+ *************************************************************************/
+
+#ifdef CONFIG_PPC_BOOK3S
+
+TRACE_EVENT(kvm_book3s_exit,
+	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
+	TP_ARGS(exit_nr, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	exit_nr		)
+		__field(	unsigned long,	pc		)
+		__field(	unsigned long,	msr		)
+		__field(	unsigned long,	dar		)
+		__field(	unsigned long,	srr1		)
+	),
+
+	TP_fast_assign(
+		__entry->exit_nr	= exit_nr;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+		__entry->dar		= kvmppc_get_fault_dar(vcpu);
+		__entry->msr		= vcpu->arch.shared->msr;
+		__entry->srr1		= to_svcpu(vcpu)->shadow_srr1;
+	),
+
+	TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx",
+		  __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar,
+		  __entry->srr1)
+);
+
+TRACE_EVENT(kvm_book3s_reenter,
+	TP_PROTO(int r, struct kvm_vcpu *vcpu),
+	TP_ARGS(r, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	r		)
+		__field(	unsigned long,	pc		)
+	),
+
+	TP_fast_assign(
+		__entry->r		= r;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+	),
+
+	TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc)
+);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+TRACE_EVENT(kvm_book3s_64_mmu_map,
+	TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr,
+		 struct kvmppc_pte *orig_pte),
+	TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
+
+	TP_STRUCT__entry(
+		__field(	unsigned char,		flag_w		)
+		__field(	unsigned char,		flag_x		)
+		__field(	unsigned long,		eaddr		)
+		__field(	unsigned long,		hpteg		)
+		__field(	unsigned long,		va		)
+		__field(	unsigned long long,	vpage		)
+		__field(	unsigned long,		hpaddr		)
+	),
+
+	TP_fast_assign(
+		__entry->flag_w	= ((rflags & HPTE_R_PP) == 3) ? '-' : 'w';
+		__entry->flag_x	= (rflags & HPTE_R_N) ? '-' : 'x';
+		__entry->eaddr	= orig_pte->eaddr;
+		__entry->hpteg	= hpteg;
+		__entry->va	= va;
+		__entry->vpage	= orig_pte->vpage;
+		__entry->hpaddr	= hpaddr;
+	),
+
+	TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx",
+		  __entry->flag_w, __entry->flag_x, __entry->eaddr,
+		  __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr)
+);
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+TRACE_EVENT(kvm_book3s_mmu_map,
+	TP_PROTO(struct hpte_cache *pte),
+	TP_ARGS(pte),
+
+	TP_STRUCT__entry(
+		__field(	u64,		host_va		)
+		__field(	u64,		pfn		)
+		__field(	ulong,		eaddr		)
+		__field(	u64,		vpage		)
+		__field(	ulong,		raddr		)
+		__field(	int,		flags		)
+	),
+
+	TP_fast_assign(
+		__entry->host_va	= pte->host_va;
+		__entry->pfn		= pte->pfn;
+		__entry->eaddr		= pte->pte.eaddr;
+		__entry->vpage		= pte->pte.vpage;
+		__entry->raddr		= pte->pte.raddr;
+		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) |
+					  (pte->pte.may_write ? 0x2 : 0) |
+					  (pte->pte.may_execute ? 0x1 : 0);
+	),
+
+	TP_printk("Map: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
+		  __entry->host_va, __entry->pfn, __entry->eaddr,
+		  __entry->vpage, __entry->raddr, __entry->flags)
+);
+
+TRACE_EVENT(kvm_book3s_mmu_invalidate,
+	TP_PROTO(struct hpte_cache *pte),
+	TP_ARGS(pte),
+
+	TP_STRUCT__entry(
+		__field(	u64,		host_va		)
+		__field(	u64,		pfn		)
+		__field(	ulong,		eaddr		)
+		__field(	u64,		vpage		)
+		__field(	ulong,		raddr		)
+		__field(	int,		flags		)
+	),
+
+	TP_fast_assign(
+		__entry->host_va	= pte->host_va;
+		__entry->pfn		= pte->pfn;
+		__entry->eaddr		= pte->pte.eaddr;
+		__entry->vpage		= pte->pte.vpage;
+		__entry->raddr		= pte->pte.raddr;
+		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) |
+					  (pte->pte.may_write ? 0x2 : 0) |
+					  (pte->pte.may_execute ? 0x1 : 0);
+	),
+
+	TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
+		  __entry->host_va, __entry->pfn, __entry->eaddr,
+		  __entry->vpage, __entry->raddr, __entry->flags)
+);
+
+TRACE_EVENT(kvm_book3s_mmu_flush,
+	TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1,
+		 unsigned long long p2),
+	TP_ARGS(type, vcpu, p1, p2),
+
+	TP_STRUCT__entry(
+		__field(	int,			count		)
+		__field(	unsigned long long,	p1		)
+		__field(	unsigned long long,	p2		)
+		__field(	const char *,		type		)
+	),
+
+	TP_fast_assign(
+		__entry->count		= vcpu->arch.hpte_cache_count;
+		__entry->p1		= p1;
+		__entry->p2		= p2;
+		__entry->type		= type;
+	),
+
+	TP_printk("Flush %d %sPTEs: %llx - %llx",
+		  __entry->count, __entry->type, __entry->p1, __entry->p2)
+);
+
+TRACE_EVENT(kvm_book3s_slb_found,
+	TP_PROTO(unsigned long long gvsid, unsigned long long hvsid),
+	TP_ARGS(gvsid, hvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long long,	gvsid		)
+		__field(	unsigned long long,	hvsid		)
+	),
+
+	TP_fast_assign(
+		__entry->gvsid		= gvsid;
+		__entry->hvsid		= hvsid;
+	),
+
+	TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid)
+);
+
+TRACE_EVENT(kvm_book3s_slb_fail,
+	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid),
+	TP_ARGS(sid_map_mask, gvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned short,		sid_map_mask	)
+		__field(	unsigned long long,	gvsid		)
+	),
+
+	TP_fast_assign(
+		__entry->sid_map_mask	= sid_map_mask;
+		__entry->gvsid		= gvsid;
+	),
+
+	TP_printk("%x/%x: %llx", __entry->sid_map_mask,
+		  SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid)
+);
+
+TRACE_EVENT(kvm_book3s_slb_map,
+	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid,
+		 unsigned long long hvsid),
+	TP_ARGS(sid_map_mask, gvsid, hvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned short,		sid_map_mask	)
+		__field(	unsigned long long,	guest_vsid	)
+		__field(	unsigned long long,	host_vsid	)
+	),
+
+	TP_fast_assign(
+		__entry->sid_map_mask	= sid_map_mask;
+		__entry->guest_vsid	= gvsid;
+		__entry->host_vsid	= hvsid;
+	),
+
+	TP_printk("%x: %llx -> %llx", __entry->sid_map_mask,
+		  __entry->guest_vsid, __entry->host_vsid)
+);
+
+TRACE_EVENT(kvm_book3s_slbmte,
+	TP_PROTO(u64 slb_vsid, u64 slb_esid),
+	TP_ARGS(slb_vsid, slb_esid),
+
+	TP_STRUCT__entry(
+		__field(	u64,	slb_vsid	)
+		__field(	u64,	slb_esid	)
+	),
+
+	TP_fast_assign(
+		__entry->slb_vsid	= slb_vsid;
+		__entry->slb_esid	= slb_esid;
+	),
+
+	TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid)
+);
+
+#endif /* CONFIG_PPC_BOOK3S */
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 81c9208025fa..956154f32cfe 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -21,6 +21,16 @@ source "arch/powerpc/platforms/44x/Kconfig"
 source "arch/powerpc/platforms/40x/Kconfig"
 source "arch/powerpc/platforms/amigaone/Kconfig"
 
+config KVM_GUEST
+	bool "KVM Guest support"
+	default y
+	---help---
+	  This option enables various optimizations for running under the KVM
+	  hypervisor. Overhead for the kernel when not running inside KVM should
+	  be minimal.
+
+	  In case of doubt, say Y
+
 config PPC_NATIVE
 	bool
 	depends on 6xx || PPC64
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 42e512ba8b43..287d7bbb6d36 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -5,6 +5,7 @@ header-y += chsc.h
 header-y += cmb.h
 header-y += dasd.h
 header-y += debug.h
+header-y += kvm_virtio.h
 header-y += monwriter.h
 header-y += qeth.h
 header-y += schid.h
diff --git a/arch/s390/include/asm/kvm_virtio.h b/arch/s390/include/asm/kvm_virtio.h
index acdfdff26611..72f614181eff 100644
--- a/arch/s390/include/asm/kvm_virtio.h
+++ b/arch/s390/include/asm/kvm_virtio.h
@@ -54,4 +54,11 @@ struct kvm_vqconfig {
  * This is pagesize for historical reasons. */
 #define KVM_S390_VIRTIO_RING_ALIGN	4096
 
+
+/* These values are supposed to be in ext_params on an interrupt */
+#define VIRTIO_PARAM_MASK		0xff
+#define VIRTIO_PARAM_VRING_INTERRUPT	0x0
+#define VIRTIO_PARAM_CONFIG_CHANGED	0x1
+#define VIRTIO_PARAM_DEV_ADD		0x2
+
 #endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1f99ecfc48e1..b36c6b3fe144 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -139,6 +139,7 @@ struct x86_emulate_ops {
 	void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
 	unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
 	void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
+	void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
@@ -156,7 +157,10 @@ struct operand {
 		unsigned long orig_val;
 		u64 orig_val64;
 	};
-	unsigned long *ptr;
+	union {
+		unsigned long *reg;
+		unsigned long mem;
+	} addr;
 	union {
 		unsigned long val;
 		u64 val64;
@@ -190,6 +194,7 @@ struct decode_cache {
 	bool has_seg_override;
 	u8 seg_override;
 	unsigned int d;
+	int (*execute)(struct x86_emulate_ctxt *ctxt);
 	unsigned long regs[NR_VCPU_REGS];
 	unsigned long eip;
 	/* modrm */
@@ -197,17 +202,16 @@ struct decode_cache {
 	u8 modrm_mod;
 	u8 modrm_reg;
 	u8 modrm_rm;
-	u8 use_modrm_ea;
+	u8 modrm_seg;
 	bool rip_relative;
-	unsigned long modrm_ea;
-	void *modrm_ptr;
-	unsigned long modrm_val;
 	struct fetch_cache fetch;
 	struct read_cache io_read;
 	struct read_cache mem_read;
 };
 
 struct x86_emulate_ctxt {
+	struct x86_emulate_ops *ops;
+
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
 
@@ -220,12 +224,11 @@ struct x86_emulate_ctxt {
 	/* interruptibility state, as a result of execution of STI or MOV SS */
 	int interruptibility;
 
-	bool restart; /* restart string instruction after writeback */
+	bool perm_ok; /* do not check permissions if true */
 
 	int exception; /* exception that happens during emulation or -1 */
 	u32 error_code; /* error code for exception */
 	bool error_code_valid;
-	unsigned long cr2; /* faulted address in case of #PF */
 
 	/* decode cache */
 	struct decode_cache decode;
@@ -249,13 +252,14 @@ struct x86_emulate_ctxt {
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
 #endif
 
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
-		    struct x86_emulate_ops *ops);
-int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
-		     struct x86_emulate_ops *ops);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
+#define EMULATION_FAILED -1
+#define EMULATION_OK 0
+#define EMULATION_RESTART 1
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
 			 u16 tss_selector, int reason,
 			 bool has_error_code, u32 error_code);
-
+int emulate_int_real(struct x86_emulate_ctxt *ctxt,
+		     struct x86_emulate_ops *ops, int irq);
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c52e2eb40a1e..9e6fe391094e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -236,10 +236,14 @@ struct kvm_pio_request {
  */
 struct kvm_mmu {
 	void (*new_cr3)(struct kvm_vcpu *vcpu);
+	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
+	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+	void (*inject_page_fault)(struct kvm_vcpu *vcpu);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
 			    u32 *error);
+	gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
 	int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -249,13 +253,18 @@ struct kvm_mmu {
 	int root_level;
 	int shadow_root_level;
 	union kvm_mmu_page_role base_role;
+	bool direct_map;
 
 	u64 *pae_root;
+	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
+
+	bool nx;
+
+	u64 pdptrs[4]; /* pae */
 };
 
 struct kvm_vcpu_arch {
-	u64 host_tsc;
 	/*
 	 * rip and regs accesses must go through
 	 * kvm_{register,rip}_{read,write} functions.
@@ -272,7 +281,6 @@ struct kvm_vcpu_arch {
 	unsigned long cr4_guest_owned_bits;
 	unsigned long cr8;
 	u32 hflags;
-	u64 pdptrs[4]; /* pae */
 	u64 efer;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
@@ -282,7 +290,41 @@ struct kvm_vcpu_arch {
 	u64 ia32_misc_enable_msr;
 	bool tpr_access_reporting;
 
+	/*
+	 * Paging state of the vcpu
+	 *
+	 * If the vcpu runs in guest mode with two level paging this still saves
+	 * the paging mode of the l1 guest. This context is always used to
+	 * handle faults.
+	 */
 	struct kvm_mmu mmu;
+
+	/*
+	 * Paging state of an L2 guest (used for nested npt)
+	 *
+	 * This context will save all necessary information to walk page tables
+	 * of the an L2 guest. This context is only initialized for page table
+	 * walking and not for faulting since we never handle l2 page faults on
+	 * the host.
+	 */
+	struct kvm_mmu nested_mmu;
+
+	/*
+	 * Pointer to the mmu context currently used for
+	 * gva_to_gpa translations.
+	 */
+	struct kvm_mmu *walk_mmu;
+
+	/*
+	 * This struct is filled with the necessary information to propagate a
+	 * page fault into the guest
+	 */
+	struct {
+		u64      address;
+		unsigned error_code;
+		bool     nested;
+	} fault;
+
 	/* only needed in kvm_pv_mmu_op() path, but it's hot so
 	 * put it here to avoid allocation */
 	struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -336,9 +378,15 @@ struct kvm_vcpu_arch {
 
 	gpa_t time;
 	struct pvclock_vcpu_time_info hv_clock;
-	unsigned int hv_clock_tsc_khz;
+	unsigned int hw_tsc_khz;
 	unsigned int time_offset;
 	struct page *time_page;
+	u64 last_host_tsc;
+	u64 last_guest_tsc;
+	u64 last_kernel_ns;
+	u64 last_tsc_nsec;
+	u64 last_tsc_write;
+	bool tsc_catchup;
 
 	bool nmi_pending;
 	bool nmi_injected;
@@ -367,9 +415,9 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_arch {
-	unsigned int n_free_mmu_pages;
+	unsigned int n_used_mmu_pages;
 	unsigned int n_requested_mmu_pages;
-	unsigned int n_alloc_mmu_pages;
+	unsigned int n_max_mmu_pages;
 	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
@@ -394,8 +442,14 @@ struct kvm_arch {
 	gpa_t ept_identity_map_addr;
 
 	unsigned long irq_sources_bitmap;
-	u64 vm_init_tsc;
 	s64 kvmclock_offset;
+	spinlock_t tsc_write_lock;
+	u64 last_tsc_nsec;
+	u64 last_tsc_offset;
+	u64 last_tsc_write;
+	u32 virtual_tsc_khz;
+	u32 virtual_tsc_mult;
+	s8 virtual_tsc_shift;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -505,6 +559,7 @@ struct kvm_x86_ops {
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code,
 				bool reinject);
+	void (*cancel_injection)(struct kvm_vcpu *vcpu);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
 	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -517,11 +572,16 @@ struct kvm_x86_ops {
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
+	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+
+	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
 	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
 
 	bool (*has_wbinvd_exit)(void);
 
+	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+
 	const struct trace_print_flags *exit_reasons_str;
 };
 
@@ -544,7 +604,7 @@ void kvm_mmu_zap_all(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes);
@@ -608,8 +668,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
-			   u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			    gfn_t gfn, void *data, int offset, int len,
+			    u32 access);
+void kvm_propagate_fault(struct kvm_vcpu *vcpu);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
 int kvm_pic_set_irq(void *opaque, int irq, int level);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 05eba5e9a8e8..7b562b6184bc 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void)
 	return cpuid_eax(KVM_CPUID_FEATURES);
 }
 
+#ifdef CONFIG_KVM_GUEST
+void __init kvm_guest_init(void);
+#else
+#define kvm_guest_init() do { } while (0)
 #endif
 
+#endif /* __KERNEL__ */
+
 #endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 986f7790fdb2..83c4bb1d917d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -198,6 +198,7 @@
 #define MSR_IA32_TSC			0x00000010
 #define MSR_IA32_PLATFORM_ID		0x00000017
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
+#define MSR_EBC_FREQUENCY_ID		0x0000002c
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 
 #define FEATURE_CONTROL_LOCKED				(1<<0)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index cd02f324aa6b..7f7e577a0e39 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
 			    struct timespec *ts);
 
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+	u64 product;
+#ifdef __i386__
+	u32 tmp1, tmp2;
+#endif
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+#ifdef __i386__
+	__asm__ (
+		"mul  %5       ; "
+		"mov  %4,%%eax ; "
+		"mov  %%edx,%4 ; "
+		"mul  %5       ; "
+		"xor  %5,%5    ; "
+		"add  %4,%%eax ; "
+		"adc  %5,%%edx ; "
+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+	__asm__ (
+		"mul %%rdx ; shrd $32,%%rdx,%%rax"
+		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+	return product;
+}
+
 #endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c716c2..ca43ce31a19c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -128,13 +128,15 @@ static struct clocksource kvm_clock = {
 static int kvm_register_clock(char *txt)
 {
 	int cpu = smp_processor_id();
-	int low, high;
+	int low, high, ret;
+
 	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
 	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+	ret = native_write_msr_safe(msr_kvm_system_time, low, high);
 	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
 	       cpu, high, low, txt);
 
-	return native_write_msr_safe(msr_kvm_system_time, low, high);
+	return ret;
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 239427ca02af..bab3b9e6f66d 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -82,7 +82,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
 static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
 {
 	u64 delta = native_read_tsc() - shadow->tsc_timestamp;
-	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+	return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
+				   shadow->tsc_shift);
 }
 
 /*
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd479516..ddc131ff438f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-amd.
 
+config KVM_MMU_AUDIT
+	bool "Audit KVM MMU"
+	depends on KVM && TRACEPOINTS
+	---help---
+	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
+	 audit  KVM MMU at runtime.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66ca98aafdd6..38b6e8dafaff 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,7 +9,7 @@
  * privileged instructions:
  *
  * Copyright (C) 2006 Qumranet
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  *   Avi Kivity <avi@qumranet.com>
  *   Yaniv Kamay <yaniv@qumranet.com>
@@ -51,13 +51,13 @@
 #define ImplicitOps (1<<1)	/* Implicit in opcode. No generic decode. */
 #define DstReg      (2<<1)	/* Register operand. */
 #define DstMem      (3<<1)	/* Memory operand. */
-#define DstAcc      (4<<1)      /* Destination Accumulator */
+#define DstAcc      (4<<1)	/* Destination Accumulator */
 #define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
 #define DstMem64    (6<<1)	/* 64bit memory operand */
+#define DstImmUByte (7<<1)	/* 8-bit unsigned immediate operand */
 #define DstMask     (7<<1)
 /* Source operand type. */
 #define SrcNone     (0<<4)	/* No source operand. */
-#define SrcImplicit (0<<4)	/* Source operand is implicit in the opcode. */
 #define SrcReg      (1<<4)	/* Register operand. */
 #define SrcMem      (2<<4)	/* Memory operand. */
 #define SrcMem16    (3<<4)	/* Memory operand (16-bit). */
@@ -71,6 +71,7 @@
 #define SrcImmFAddr (0xb<<4)	/* Source is immediate far address */
 #define SrcMemFAddr (0xc<<4)	/* Source is far address in memory */
 #define SrcAcc      (0xd<<4)	/* Source Accumulator */
+#define SrcImmU16   (0xe<<4)    /* Immediate operand, unsigned, 16 bits */
 #define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<8)
@@ -82,8 +83,10 @@
 #define Stack       (1<<13)     /* Stack instruction (push/pop) */
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
-#define GroupMask   0xff        /* Group number stored in bits 0:7 */
 /* Misc flags */
+#define NoAccess    (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264      (1<<24) /* Operand is 64b in long mode, 32b otherwise */
+#define Undefined   (1<<25) /* No Such Instruction */
 #define Lock        (1<<26) /* lock prefix is allowed for the instruction */
 #define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */
 #define No64	    (1<<28)
@@ -92,285 +95,30 @@
 #define Src2CL      (1<<29)
 #define Src2ImmByte (2<<29)
 #define Src2One     (3<<29)
+#define Src2Imm     (4<<29)
 #define Src2Mask    (7<<29)
 
-enum {
-	Group1_80, Group1_81, Group1_82, Group1_83,
-	Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
-	Group8, Group9,
+#define X2(x...) x, x
+#define X3(x...) X2(x), x
+#define X4(x...) X2(x), X2(x)
+#define X5(x...) X4(x), x
+#define X6(x...) X4(x), X2(x)
+#define X7(x...) X4(x), X3(x)
+#define X8(x...) X4(x), X4(x)
+#define X16(x...) X8(x), X8(x)
+
+struct opcode {
+	u32 flags;
+	union {
+		int (*execute)(struct x86_emulate_ctxt *ctxt);
+		struct opcode *group;
+		struct group_dual *gdual;
+	} u;
 };
 
-static u32 opcode_table[256] = {
-	/* 0x00 - 0x07 */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
-	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
-	/* 0x08 - 0x0F */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
-	ImplicitOps | Stack | No64, 0,
-	/* 0x10 - 0x17 */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
-	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
-	/* 0x18 - 0x1F */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
-	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
-	/* 0x20 - 0x27 */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
-	/* 0x28 - 0x2F */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
-	/* 0x30 - 0x37 */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
-	/* 0x38 - 0x3F */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
-	0, 0,
-	/* 0x40 - 0x47 */
-	DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
-	/* 0x48 - 0x4F */
-	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
-	/* 0x50 - 0x57 */
-	SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
-	SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
-	/* 0x58 - 0x5F */
-	DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
-	DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
-	/* 0x60 - 0x67 */
-	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
-	0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
-	0, 0, 0, 0,
-	/* 0x68 - 0x6F */
-	SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
-	DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
-	SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
-	/* 0x70 - 0x77 */
-	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
-	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
-	/* 0x78 - 0x7F */
-	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
-	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
-	/* 0x80 - 0x87 */
-	Group | Group1_80, Group | Group1_81,
-	Group | Group1_82, Group | Group1_83,
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	/* 0x88 - 0x8F */
-	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
-	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
-	ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
-	/* 0x90 - 0x97 */
-	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
-	/* 0x98 - 0x9F */
-	0, 0, SrcImmFAddr | No64, 0,
-	ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
-	/* 0xA0 - 0xA7 */
-	ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
-	ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
-	ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
-	ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
-	/* 0xA8 - 0xAF */
-	DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
-	ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
-	ByteOp | DstDI | String, DstDI | String,
-	/* 0xB0 - 0xB7 */
-	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
-	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
-	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
-	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
-	/* 0xB8 - 0xBF */
-	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
-	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
-	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
-	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
-	/* 0xC0 - 0xC7 */
-	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
-	0, ImplicitOps | Stack, 0, 0,
-	ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
-	/* 0xC8 - 0xCF */
-	0, 0, 0, ImplicitOps | Stack,
-	ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
-	/* 0xD0 - 0xD7 */
-	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-	0, 0, 0, 0,
-	/* 0xD8 - 0xDF */
-	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xE0 - 0xE7 */
-	0, 0, 0, 0,
-	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
-	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
-	/* 0xE8 - 0xEF */
-	SrcImm | Stack, SrcImm | ImplicitOps,
-	SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
-	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
-	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
-	/* 0xF0 - 0xF7 */
-	0, 0, 0, 0,
-	ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
-	/* 0xF8 - 0xFF */
-	ImplicitOps, 0, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
-};
-
-static u32 twobyte_table[256] = {
-	/* 0x00 - 0x0F */
-	0, Group | GroupDual | Group7, 0, 0,
-	0, ImplicitOps, ImplicitOps | Priv, 0,
-	ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
-	0, ImplicitOps | ModRM, 0, 0,
-	/* 0x10 - 0x1F */
-	0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x20 - 0x2F */
-	ModRM | ImplicitOps | Priv, ModRM | Priv,
-	ModRM | ImplicitOps | Priv, ModRM | Priv,
-	0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x30 - 0x3F */
-	ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
-	ImplicitOps, ImplicitOps | Priv, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x40 - 0x47 */
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	/* 0x48 - 0x4F */
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	/* 0x50 - 0x5F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x60 - 0x6F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x70 - 0x7F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x80 - 0x8F */
-	SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
-	SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
-	/* 0x90 - 0x9F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xA0 - 0xA7 */
-	ImplicitOps | Stack, ImplicitOps | Stack,
-	0, DstMem | SrcReg | ModRM | BitOp,
-	DstMem | SrcReg | Src2ImmByte | ModRM,
-	DstMem | SrcReg | Src2CL | ModRM, 0, 0,
-	/* 0xA8 - 0xAF */
-	ImplicitOps | Stack, ImplicitOps | Stack,
-	0, DstMem | SrcReg | ModRM | BitOp | Lock,
-	DstMem | SrcReg | Src2ImmByte | ModRM,
-	DstMem | SrcReg | Src2CL | ModRM,
-	ModRM, 0,
-	/* 0xB0 - 0xB7 */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	0, DstMem | SrcReg | ModRM | BitOp | Lock,
-	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
-	    DstReg | SrcMem16 | ModRM | Mov,
-	/* 0xB8 - 0xBF */
-	0, 0,
-	Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
-	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
-	    DstReg | SrcMem16 | ModRM | Mov,
-	/* 0xC0 - 0xCF */
-	0, 0, 0, DstMem | SrcReg | ModRM | Mov,
-	0, 0, 0, Group | GroupDual | Group9,
-	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xD0 - 0xDF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xE0 - 0xEF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0xF0 - 0xFF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-static u32 group_table[] = {
-	[Group1_80*8] =
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM,
-	[Group1_81*8] =
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM,
-	[Group1_82*8] =
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64,
-	[Group1_83*8] =
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM,
-	[Group1A*8] =
-	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
-	[Group3_Byte*8] =
-	ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
-	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
-	0, 0, 0, 0,
-	[Group3*8] =
-	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
-	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
-	0, 0, 0, 0,
-	[Group4*8] =
-	ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
-	0, 0, 0, 0, 0, 0,
-	[Group5*8] =
-	DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
-	SrcMem | ModRM | Stack, 0,
-	SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
-	SrcMem | ModRM | Stack, 0,
-	[Group7*8] =
-	0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
-	SrcNone | ModRM | DstMem | Mov, 0,
-	SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
-	[Group8*8] =
-	0, 0, 0, 0,
-	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
-	[Group9*8] =
-	0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
-};
-
-static u32 group2_table[] = {
-	[Group7*8] =
-	SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
-	SrcNone | ModRM | DstMem | Mov, 0,
-	SrcMem16 | ModRM | Mov | Priv, 0,
-	[Group9*8] =
-	0, 0, 0, 0, 0, 0, 0, 0,
+struct group_dual {
+	struct opcode mod012[8];
+	struct opcode mod3[8];
 };
 
 /* EFLAGS bit definitions. */
@@ -392,6 +140,9 @@ static u32 group2_table[] = {
 #define EFLG_PF (1<<2)
 #define EFLG_CF (1<<0)
 
+#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
+#define EFLG_RESERVED_ONE_MASK 2
+
 /*
  * Instruction emulation:
  * Most instructions are emulated directly via a fragment of inline assembly
@@ -444,13 +195,13 @@ static u32 group2_table[] = {
 #define ON64(x)
 #endif
 
-#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix)	\
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
 	do {								\
 		__asm__ __volatile__ (					\
 			_PRE_EFLAGS("0", "4", "2")			\
 			_op _suffix " %"_x"3,%1; "			\
 			_POST_EFLAGS("0", "4", "2")			\
-			: "=m" (_eflags), "=m" ((_dst).val),		\
+			: "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
 			  "=&r" (_tmp)					\
 			: _y ((_src).val), "i" (EFLAGS_MASK));		\
 	} while (0)
@@ -463,13 +214,13 @@ static u32 group2_table[] = {
 									\
 		switch ((_dst).bytes) {					\
 		case 2:							\
-			____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
+			____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
 			break;						\
 		case 4:							\
-			____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
+			____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
 			break;						\
 		case 8:							\
-			ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
+			ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
 			break;						\
 		}							\
 	} while (0)
@@ -479,7 +230,7 @@ static u32 group2_table[] = {
 		unsigned long _tmp;					     \
 		switch ((_dst).bytes) {				             \
 		case 1:							     \
-			____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b");  \
+			____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
 			break;						     \
 		default:						     \
 			__emulate_2op_nobyte(_op, _src, _dst, _eflags,	     \
@@ -566,6 +317,74 @@ static u32 group2_table[] = {
 		}							\
 	} while (0)
 
+#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix)		\
+	do {								\
+		unsigned long _tmp;					\
+									\
+		__asm__ __volatile__ (					\
+			_PRE_EFLAGS("0", "4", "1")			\
+			_op _suffix " %5; "				\
+			_POST_EFLAGS("0", "4", "1")			\
+			: "=m" (_eflags), "=&r" (_tmp),			\
+			  "+a" (_rax), "+d" (_rdx)			\
+			: "i" (EFLAGS_MASK), "m" ((_src).val),		\
+			  "a" (_rax), "d" (_rdx));			\
+	} while (0)
+
+#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
+	do {								\
+		unsigned long _tmp;					\
+									\
+		__asm__ __volatile__ (					\
+			_PRE_EFLAGS("0", "5", "1")			\
+			"1: \n\t"					\
+			_op _suffix " %6; "				\
+			"2: \n\t"					\
+			_POST_EFLAGS("0", "5", "1")			\
+			".pushsection .fixup,\"ax\" \n\t"		\
+			"3: movb $1, %4 \n\t"				\
+			"jmp 2b \n\t"					\
+			".popsection \n\t"				\
+			_ASM_EXTABLE(1b, 3b)				\
+			: "=m" (_eflags), "=&r" (_tmp),			\
+			  "+a" (_rax), "+d" (_rdx), "+qm"(_ex)		\
+			: "i" (EFLAGS_MASK), "m" ((_src).val),		\
+			  "a" (_rax), "d" (_rdx));			\
+	} while (0)
+
+/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
+#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags)			\
+	do {									\
+		switch((_src).bytes) {						\
+		case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
+		case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx,  _eflags, "w"); break; \
+		case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
+		case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
+		}							\
+	} while (0)
+
+#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex)	\
+	do {								\
+		switch((_src).bytes) {					\
+		case 1:							\
+			__emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx,	\
+						 _eflags, "b", _ex);	\
+			break;						\
+		case 2:							\
+			__emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+						 _eflags, "w", _ex);	\
+			break;						\
+		case 4:							\
+			__emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+						 _eflags, "l", _ex);	\
+			break;						\
+		case 8: ON64(						\
+			__emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+						 _eflags, "q", _ex));	\
+			break;						\
+		}							\
+	} while (0)
+
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _size, _eip)                                  \
 ({	unsigned long _x;						\
@@ -661,7 +480,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
 	ctxt->exception = vec;
 	ctxt->error_code = error;
 	ctxt->error_code_valid = valid;
-	ctxt->restart = false;
 }
 
 static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
@@ -669,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
 	emulate_exception(ctxt, GP_VECTOR, err, true);
 }
 
-static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
-		       int err)
+static void emulate_pf(struct x86_emulate_ctxt *ctxt)
 {
-	ctxt->cr2 = addr;
-	emulate_exception(ctxt, PF_VECTOR, err, true);
+	emulate_exception(ctxt, PF_VECTOR, 0, true);
 }
 
 static void emulate_ud(struct x86_emulate_ctxt *ctxt)
@@ -686,6 +502,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
 	emulate_exception(ctxt, TS_VECTOR, err, true);
 }
 
+static int emulate_de(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_exception(ctxt, DE_VECTOR, 0, false);
+	return X86EMUL_PROPAGATE_FAULT;
+}
+
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 			      struct x86_emulate_ops *ops,
 			      unsigned long eip, u8 *dest)
@@ -742,7 +564,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
 
 static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 			   struct x86_emulate_ops *ops,
-			   void *ptr,
+			   ulong addr,
 			   u16 *size, unsigned long *address, int op_bytes)
 {
 	int rc;
@@ -750,12 +572,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (op_bytes == 2)
 		op_bytes = 3;
 	*address = 0;
-	rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
-			   ctxt->vcpu, NULL);
+	rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
-			   ctxt->vcpu, NULL);
+	rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
 	return rc;
 }
 
@@ -794,6 +614,24 @@ static int test_cc(unsigned int condition, unsigned int flags)
 	return (!!rc ^ (condition & 1));
 }
 
+static void fetch_register_operand(struct operand *op)
+{
+	switch (op->bytes) {
+	case 1:
+		op->val = *(u8 *)op->addr.reg;
+		break;
+	case 2:
+		op->val = *(u16 *)op->addr.reg;
+		break;
+	case 4:
+		op->val = *(u32 *)op->addr.reg;
+		break;
+	case 8:
+		op->val = *(u64 *)op->addr.reg;
+		break;
+	}
+}
+
 static void decode_register_operand(struct operand *op,
 				    struct decode_cache *c,
 				    int inhibit_bytereg)
@@ -805,34 +643,25 @@ static void decode_register_operand(struct operand *op,
 		reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
 	op->type = OP_REG;
 	if ((c->d & ByteOp) && !inhibit_bytereg) {
-		op->ptr = decode_register(reg, c->regs, highbyte_regs);
-		op->val = *(u8 *)op->ptr;
+		op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
 		op->bytes = 1;
 	} else {
-		op->ptr = decode_register(reg, c->regs, 0);
+		op->addr.reg = decode_register(reg, c->regs, 0);
 		op->bytes = c->op_bytes;
-		switch (op->bytes) {
-		case 2:
-			op->val = *(u16 *)op->ptr;
-			break;
-		case 4:
-			op->val = *(u32 *)op->ptr;
-			break;
-		case 8:
-			op->val = *(u64 *) op->ptr;
-			break;
-		}
 	}
+	fetch_register_operand(op);
 	op->orig_val = op->val;
 }
 
 static int decode_modrm(struct x86_emulate_ctxt *ctxt,
-			struct x86_emulate_ops *ops)
+			struct x86_emulate_ops *ops,
+			struct operand *op)
 {
 	struct decode_cache *c = &ctxt->decode;
 	u8 sib;
 	int index_reg = 0, base_reg = 0, scale;
 	int rc = X86EMUL_CONTINUE;
+	ulong modrm_ea = 0;
 
 	if (c->rex_prefix) {
 		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
@@ -844,16 +673,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 	c->modrm_mod |= (c->modrm & 0xc0) >> 6;
 	c->modrm_reg |= (c->modrm & 0x38) >> 3;
 	c->modrm_rm |= (c->modrm & 0x07);
-	c->modrm_ea = 0;
-	c->use_modrm_ea = 1;
+	c->modrm_seg = VCPU_SREG_DS;
 
 	if (c->modrm_mod == 3) {
-		c->modrm_ptr = decode_register(c->modrm_rm,
+		op->type = OP_REG;
+		op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		op->addr.reg = decode_register(c->modrm_rm,
 					       c->regs, c->d & ByteOp);
-		c->modrm_val = *(unsigned long *)c->modrm_ptr;
+		fetch_register_operand(op);
 		return rc;
 	}
 
+	op->type = OP_MEM;
+
 	if (c->ad_bytes == 2) {
 		unsigned bx = c->regs[VCPU_REGS_RBX];
 		unsigned bp = c->regs[VCPU_REGS_RBP];
@@ -864,47 +696,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		switch (c->modrm_mod) {
 		case 0:
 			if (c->modrm_rm == 6)
-				c->modrm_ea += insn_fetch(u16, 2, c->eip);
+				modrm_ea += insn_fetch(u16, 2, c->eip);
 			break;
 		case 1:
-			c->modrm_ea += insn_fetch(s8, 1, c->eip);
+			modrm_ea += insn_fetch(s8, 1, c->eip);
 			break;
 		case 2:
-			c->modrm_ea += insn_fetch(u16, 2, c->eip);
+			modrm_ea += insn_fetch(u16, 2, c->eip);
 			break;
 		}
 		switch (c->modrm_rm) {
 		case 0:
-			c->modrm_ea += bx + si;
+			modrm_ea += bx + si;
 			break;
 		case 1:
-			c->modrm_ea += bx + di;
+			modrm_ea += bx + di;
 			break;
 		case 2:
-			c->modrm_ea += bp + si;
+			modrm_ea += bp + si;
 			break;
 		case 3:
-			c->modrm_ea += bp + di;
+			modrm_ea += bp + di;
 			break;
 		case 4:
-			c->modrm_ea += si;
+			modrm_ea += si;
 			break;
 		case 5:
-			c->modrm_ea += di;
+			modrm_ea += di;
 			break;
 		case 6:
 			if (c->modrm_mod != 0)
-				c->modrm_ea += bp;
+				modrm_ea += bp;
 			break;
 		case 7:
-			c->modrm_ea += bx;
+			modrm_ea += bx;
 			break;
 		}
 		if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
 		    (c->modrm_rm == 6 && c->modrm_mod != 0))
-			if (!c->has_seg_override)
-				set_seg_override(c, VCPU_SREG_SS);
-		c->modrm_ea = (u16)c->modrm_ea;
+			c->modrm_seg = VCPU_SREG_SS;
+		modrm_ea = (u16)modrm_ea;
 	} else {
 		/* 32/64-bit ModR/M decode. */
 		if ((c->modrm_rm & 7) == 4) {
@@ -914,410 +745,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			scale = sib >> 6;
 
 			if ((base_reg & 7) == 5 && c->modrm_mod == 0)
-				c->modrm_ea += insn_fetch(s32, 4, c->eip);
+				modrm_ea += insn_fetch(s32, 4, c->eip);
 			else
-				c->modrm_ea += c->regs[base_reg];
+				modrm_ea += c->regs[base_reg];
 			if (index_reg != 4)
-				c->modrm_ea += c->regs[index_reg] << scale;
+				modrm_ea += c->regs[index_reg] << scale;
 		} else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
 			if (ctxt->mode == X86EMUL_MODE_PROT64)
 				c->rip_relative = 1;
 		} else
-			c->modrm_ea += c->regs[c->modrm_rm];
+			modrm_ea += c->regs[c->modrm_rm];
 		switch (c->modrm_mod) {
 		case 0:
 			if (c->modrm_rm == 5)
-				c->modrm_ea += insn_fetch(s32, 4, c->eip);
+				modrm_ea += insn_fetch(s32, 4, c->eip);
 			break;
 		case 1:
-			c->modrm_ea += insn_fetch(s8, 1, c->eip);
+			modrm_ea += insn_fetch(s8, 1, c->eip);
 			break;
 		case 2:
-			c->modrm_ea += insn_fetch(s32, 4, c->eip);
+			modrm_ea += insn_fetch(s32, 4, c->eip);
 			break;
 		}
 	}
+	op->addr.mem = modrm_ea;
 done:
 	return rc;
 }
 
 static int decode_abs(struct x86_emulate_ctxt *ctxt,
-		      struct x86_emulate_ops *ops)
+		      struct x86_emulate_ops *ops,
+		      struct operand *op)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
+	op->type = OP_MEM;
 	switch (c->ad_bytes) {
 	case 2:
-		c->modrm_ea = insn_fetch(u16, 2, c->eip);
+		op->addr.mem = insn_fetch(u16, 2, c->eip);
 		break;
 	case 4:
-		c->modrm_ea = insn_fetch(u32, 4, c->eip);
+		op->addr.mem = insn_fetch(u32, 4, c->eip);
 		break;
 	case 8:
-		c->modrm_ea = insn_fetch(u64, 8, c->eip);
+		op->addr.mem = insn_fetch(u64, 8, c->eip);
 		break;
 	}
 done:
 	return rc;
 }
 
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static void fetch_bit_operand(struct decode_cache *c)
 {
-	struct decode_cache *c = &ctxt->decode;
-	int rc = X86EMUL_CONTINUE;
-	int mode = ctxt->mode;
-	int def_op_bytes, def_ad_bytes, group;
-
-
-	/* we cannot decode insn before we complete previous rep insn */
-	WARN_ON(ctxt->restart);
-
-	c->eip = ctxt->eip;
-	c->fetch.start = c->fetch.end = c->eip;
-	ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
-
-	switch (mode) {
-	case X86EMUL_MODE_REAL:
-	case X86EMUL_MODE_VM86:
-	case X86EMUL_MODE_PROT16:
-		def_op_bytes = def_ad_bytes = 2;
-		break;
-	case X86EMUL_MODE_PROT32:
-		def_op_bytes = def_ad_bytes = 4;
-		break;
-#ifdef CONFIG_X86_64
-	case X86EMUL_MODE_PROT64:
-		def_op_bytes = 4;
-		def_ad_bytes = 8;
-		break;
-#endif
-	default:
-		return -1;
-	}
-
-	c->op_bytes = def_op_bytes;
-	c->ad_bytes = def_ad_bytes;
-
-	/* Legacy prefixes. */
-	for (;;) {
-		switch (c->b = insn_fetch(u8, 1, c->eip)) {
-		case 0x66:	/* operand-size override */
-			/* switch between 2/4 bytes */
-			c->op_bytes = def_op_bytes ^ 6;
-			break;
-		case 0x67:	/* address-size override */
-			if (mode == X86EMUL_MODE_PROT64)
-				/* switch between 4/8 bytes */
-				c->ad_bytes = def_ad_bytes ^ 12;
-			else
-				/* switch between 2/4 bytes */
-				c->ad_bytes = def_ad_bytes ^ 6;
-			break;
-		case 0x26:	/* ES override */
-		case 0x2e:	/* CS override */
-		case 0x36:	/* SS override */
-		case 0x3e:	/* DS override */
-			set_seg_override(c, (c->b >> 3) & 3);
-			break;
-		case 0x64:	/* FS override */
-		case 0x65:	/* GS override */
-			set_seg_override(c, c->b & 7);
-			break;
-		case 0x40 ... 0x4f: /* REX */
-			if (mode != X86EMUL_MODE_PROT64)
-				goto done_prefixes;
-			c->rex_prefix = c->b;
-			continue;
-		case 0xf0:	/* LOCK */
-			c->lock_prefix = 1;
-			break;
-		case 0xf2:	/* REPNE/REPNZ */
-			c->rep_prefix = REPNE_PREFIX;
-			break;
-		case 0xf3:	/* REP/REPE/REPZ */
-			c->rep_prefix = REPE_PREFIX;
-			break;
-		default:
-			goto done_prefixes;
-		}
-
-		/* Any legacy prefix after a REX prefix nullifies its effect. */
-
-		c->rex_prefix = 0;
-	}
-
-done_prefixes:
-
-	/* REX prefix. */
-	if (c->rex_prefix)
-		if (c->rex_prefix & 8)
-			c->op_bytes = 8;	/* REX.W */
-
-	/* Opcode byte(s). */
-	c->d = opcode_table[c->b];
-	if (c->d == 0) {
-		/* Two-byte opcode? */
-		if (c->b == 0x0f) {
-			c->twobyte = 1;
-			c->b = insn_fetch(u8, 1, c->eip);
-			c->d = twobyte_table[c->b];
-		}
-	}
-
-	if (c->d & Group) {
-		group = c->d & GroupMask;
-		c->modrm = insn_fetch(u8, 1, c->eip);
-		--c->eip;
-
-		group = (group << 3) + ((c->modrm >> 3) & 7);
-		if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
-			c->d = group2_table[group];
-		else
-			c->d = group_table[group];
-	}
-
-	/* Unrecognised? */
-	if (c->d == 0) {
-		DPRINTF("Cannot emulate %02x\n", c->b);
-		return -1;
-	}
-
-	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
-		c->op_bytes = 8;
-
-	/* ModRM and SIB bytes. */
-	if (c->d & ModRM)
-		rc = decode_modrm(ctxt, ops);
-	else if (c->d & MemAbs)
-		rc = decode_abs(ctxt, ops);
-	if (rc != X86EMUL_CONTINUE)
-		goto done;
-
-	if (!c->has_seg_override)
-		set_seg_override(c, VCPU_SREG_DS);
-
-	if (!(!c->twobyte && c->b == 0x8d))
-		c->modrm_ea += seg_override_base(ctxt, ops, c);
-
-	if (c->ad_bytes != 8)
-		c->modrm_ea = (u32)c->modrm_ea;
-
-	if (c->rip_relative)
-		c->modrm_ea += c->eip;
-
-	/*
-	 * Decode and fetch the source operand: register, memory
-	 * or immediate.
-	 */
-	switch (c->d & SrcMask) {
-	case SrcNone:
-		break;
-	case SrcReg:
-		decode_register_operand(&c->src, c, 0);
-		break;
-	case SrcMem16:
-		c->src.bytes = 2;
-		goto srcmem_common;
-	case SrcMem32:
-		c->src.bytes = 4;
-		goto srcmem_common;
-	case SrcMem:
-		c->src.bytes = (c->d & ByteOp) ? 1 :
-							   c->op_bytes;
-		/* Don't fetch the address for invlpg: it could be unmapped. */
-		if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
-			break;
-	srcmem_common:
-		/*
-		 * For instructions with a ModR/M byte, switch to register
-		 * access if Mod = 3.
-		 */
-		if ((c->d & ModRM) && c->modrm_mod == 3) {
-			c->src.type = OP_REG;
-			c->src.val = c->modrm_val;
-			c->src.ptr = c->modrm_ptr;
-			break;
-		}
-		c->src.type = OP_MEM;
-		c->src.ptr = (unsigned long *)c->modrm_ea;
-		c->src.val = 0;
-		break;
-	case SrcImm:
-	case SrcImmU:
-		c->src.type = OP_IMM;
-		c->src.ptr = (unsigned long *)c->eip;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		if (c->src.bytes == 8)
-			c->src.bytes = 4;
-		/* NB. Immediates are sign-extended as necessary. */
-		switch (c->src.bytes) {
-		case 1:
-			c->src.val = insn_fetch(s8, 1, c->eip);
-			break;
-		case 2:
-			c->src.val = insn_fetch(s16, 2, c->eip);
-			break;
-		case 4:
-			c->src.val = insn_fetch(s32, 4, c->eip);
-			break;
-		}
-		if ((c->d & SrcMask) == SrcImmU) {
-			switch (c->src.bytes) {
-			case 1:
-				c->src.val &= 0xff;
-				break;
-			case 2:
-				c->src.val &= 0xffff;
-				break;
-			case 4:
-				c->src.val &= 0xffffffff;
-				break;
-			}
-		}
-		break;
-	case SrcImmByte:
-	case SrcImmUByte:
-		c->src.type = OP_IMM;
-		c->src.ptr = (unsigned long *)c->eip;
-		c->src.bytes = 1;
-		if ((c->d & SrcMask) == SrcImmByte)
-			c->src.val = insn_fetch(s8, 1, c->eip);
-		else
-			c->src.val = insn_fetch(u8, 1, c->eip);
-		break;
-	case SrcAcc:
-		c->src.type = OP_REG;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.ptr = &c->regs[VCPU_REGS_RAX];
-		switch (c->src.bytes) {
-			case 1:
-				c->src.val = *(u8 *)c->src.ptr;
-				break;
-			case 2:
-				c->src.val = *(u16 *)c->src.ptr;
-				break;
-			case 4:
-				c->src.val = *(u32 *)c->src.ptr;
-				break;
-			case 8:
-				c->src.val = *(u64 *)c->src.ptr;
-				break;
-		}
-		break;
-	case SrcOne:
-		c->src.bytes = 1;
-		c->src.val = 1;
-		break;
-	case SrcSI:
-		c->src.type = OP_MEM;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.ptr = (unsigned long *)
-			register_address(c,  seg_override_base(ctxt, ops, c),
-					 c->regs[VCPU_REGS_RSI]);
-		c->src.val = 0;
-		break;
-	case SrcImmFAddr:
-		c->src.type = OP_IMM;
-		c->src.ptr = (unsigned long *)c->eip;
-		c->src.bytes = c->op_bytes + 2;
-		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
-		break;
-	case SrcMemFAddr:
-		c->src.type = OP_MEM;
-		c->src.ptr = (unsigned long *)c->modrm_ea;
-		c->src.bytes = c->op_bytes + 2;
-		break;
-	}
+	long sv = 0, mask;
 
-	/*
-	 * Decode and fetch the second source operand: register, memory
-	 * or immediate.
-	 */
-	switch (c->d & Src2Mask) {
-	case Src2None:
-		break;
-	case Src2CL:
-		c->src2.bytes = 1;
-		c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
-		break;
-	case Src2ImmByte:
-		c->src2.type = OP_IMM;
-		c->src2.ptr = (unsigned long *)c->eip;
-		c->src2.bytes = 1;
-		c->src2.val = insn_fetch(u8, 1, c->eip);
-		break;
-	case Src2One:
-		c->src2.bytes = 1;
-		c->src2.val = 1;
-		break;
-	}
+	if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
+		mask = ~(c->dst.bytes * 8 - 1);
 
-	/* Decode and fetch the destination operand: register or memory. */
-	switch (c->d & DstMask) {
-	case ImplicitOps:
-		/* Special instructions do their own operand decoding. */
-		return 0;
-	case DstReg:
-		decode_register_operand(&c->dst, c,
-			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
-		break;
-	case DstMem:
-	case DstMem64:
-		if ((c->d & ModRM) && c->modrm_mod == 3) {
-			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-			c->dst.type = OP_REG;
-			c->dst.val = c->dst.orig_val = c->modrm_val;
-			c->dst.ptr = c->modrm_ptr;
-			break;
-		}
-		c->dst.type = OP_MEM;
-		c->dst.ptr = (unsigned long *)c->modrm_ea;
-		if ((c->d & DstMask) == DstMem64)
-			c->dst.bytes = 8;
-		else
-			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.val = 0;
-		if (c->d & BitOp) {
-			unsigned long mask = ~(c->dst.bytes * 8 - 1);
+		if (c->src.bytes == 2)
+			sv = (s16)c->src.val & (s16)mask;
+		else if (c->src.bytes == 4)
+			sv = (s32)c->src.val & (s32)mask;
 
-			c->dst.ptr = (void *)c->dst.ptr +
-						   (c->src.val & mask) / 8;
-		}
-		break;
-	case DstAcc:
-		c->dst.type = OP_REG;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-		switch (c->dst.bytes) {
-			case 1:
-				c->dst.val = *(u8 *)c->dst.ptr;
-				break;
-			case 2:
-				c->dst.val = *(u16 *)c->dst.ptr;
-				break;
-			case 4:
-				c->dst.val = *(u32 *)c->dst.ptr;
-				break;
-			case 8:
-				c->dst.val = *(u64 *)c->dst.ptr;
-				break;
-		}
-		c->dst.orig_val = c->dst.val;
-		break;
-	case DstDI:
-		c->dst.type = OP_MEM;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)
-			register_address(c, es_base(ctxt, ops),
-					 c->regs[VCPU_REGS_RDI]);
-		c->dst.val = 0;
-		break;
+		c->dst.addr.mem += (sv >> 3);
 	}
 
-done:
-	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+	/* only subword offset */
+	c->src.val &= (c->dst.bytes << 3) - 1;
 }
 
 static int read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -1337,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
 					ctxt->vcpu);
 		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt, addr, err);
+			emulate_pf(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		mc->end += n;
@@ -1424,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	addr = dt.address + index * 8;
 	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt, addr, err);
+		emulate_pf(ctxt);
 
        return ret;
 }
@@ -1450,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	addr = dt.address + index * 8;
 	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt, addr, err);
+		emulate_pf(ctxt);
 
 	return ret;
 }
@@ -1573,6 +1068,25 @@ exception:
 	return X86EMUL_PROPAGATE_FAULT;
 }
 
+static void write_register_operand(struct operand *op)
+{
+	/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+	switch (op->bytes) {
+	case 1:
+		*(u8 *)op->addr.reg = (u8)op->val;
+		break;
+	case 2:
+		*(u16 *)op->addr.reg = (u16)op->val;
+		break;
+	case 4:
+		*op->addr.reg = (u32)op->val;
+		break;	/* 64b: zero-extend */
+	case 8:
+		*op->addr.reg = op->val;
+		break;
+	}
+}
+
 static inline int writeback(struct x86_emulate_ctxt *ctxt,
 			    struct x86_emulate_ops *ops)
 {
@@ -1582,28 +1096,12 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 
 	switch (c->dst.type) {
 	case OP_REG:
-		/* The 4-byte case *is* correct:
-		 * in 64-bit mode we zero-extend.
-		 */
-		switch (c->dst.bytes) {
-		case 1:
-			*(u8 *)c->dst.ptr = (u8)c->dst.val;
-			break;
-		case 2:
-			*(u16 *)c->dst.ptr = (u16)c->dst.val;
-			break;
-		case 4:
-			*c->dst.ptr = (u32)c->dst.val;
-			break;	/* 64b: zero-ext */
-		case 8:
-			*c->dst.ptr = c->dst.val;
-			break;
-		}
+		write_register_operand(&c->dst);
 		break;
 	case OP_MEM:
 		if (c->lock_prefix)
 			rc = ops->cmpxchg_emulated(
-					(unsigned long)c->dst.ptr,
+					c->dst.addr.mem,
 					&c->dst.orig_val,
 					&c->dst.val,
 					c->dst.bytes,
@@ -1611,14 +1109,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 					ctxt->vcpu);
 		else
 			rc = ops->write_emulated(
-					(unsigned long)c->dst.ptr,
+					c->dst.addr.mem,
 					&c->dst.val,
 					c->dst.bytes,
 					&err,
 					ctxt->vcpu);
 		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt,
-					      (unsigned long)c->dst.ptr, err);
+			emulate_pf(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
@@ -1640,8 +1137,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
 	c->dst.bytes = c->op_bytes;
 	c->dst.val = c->src.val;
 	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
-					       c->regs[VCPU_REGS_RSP]);
+	c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
+					   c->regs[VCPU_REGS_RSP]);
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1701,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 	*(unsigned long *)dest =
 		(ctxt->eflags & ~change_mask) | (val & change_mask);
 
+	if (rc == X86EMUL_PROPAGATE_FAULT)
+		emulate_pf(ctxt);
+
 	return rc;
 }
 
@@ -1778,6 +1278,150 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
+int emulate_int_real(struct x86_emulate_ctxt *ctxt,
+			       struct x86_emulate_ops *ops, int irq)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
+	struct desc_ptr dt;
+	gva_t cs_addr;
+	gva_t eip_addr;
+	u16 cs, eip;
+	u32 err;
+
+	/* TODO: Add limit checks */
+	c->src.val = ctxt->eflags;
+	emulate_push(ctxt, ops);
+	rc = writeback(ctxt, ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
+
+	c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+	emulate_push(ctxt, ops);
+	rc = writeback(ctxt, ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->src.val = c->eip;
+	emulate_push(ctxt, ops);
+	rc = writeback(ctxt, ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->dst.type = OP_NONE;
+
+	ops->get_idt(&dt, ctxt->vcpu);
+
+	eip_addr = dt.address + (irq << 2);
+	cs_addr = dt.address + (irq << 2) + 2;
+
+	rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->eip = eip;
+
+	return rc;
+}
+
+static int emulate_int(struct x86_emulate_ctxt *ctxt,
+		       struct x86_emulate_ops *ops, int irq)
+{
+	switch(ctxt->mode) {
+	case X86EMUL_MODE_REAL:
+		return emulate_int_real(ctxt, ops, irq);
+	case X86EMUL_MODE_VM86:
+	case X86EMUL_MODE_PROT16:
+	case X86EMUL_MODE_PROT32:
+	case X86EMUL_MODE_PROT64:
+	default:
+		/* Protected mode interrupts unimplemented yet */
+		return X86EMUL_UNHANDLEABLE;
+	}
+}
+
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
+			     struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc = X86EMUL_CONTINUE;
+	unsigned long temp_eip = 0;
+	unsigned long temp_eflags = 0;
+	unsigned long cs = 0;
+	unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
+			     EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
+			     EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
+	unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
+
+	/* TODO: Add stack limit check */
+
+	rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
+
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	if (temp_eip & ~0xffff) {
+		emulate_gp(ctxt, 0);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
+
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->eip = temp_eip;
+
+
+	if (c->op_bytes == 4)
+		ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+	else if (c->op_bytes == 2) {
+		ctxt->eflags &= ~0xffff;
+		ctxt->eflags |= temp_eflags;
+	}
+
+	ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+	ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+
+	return rc;
+}
+
+static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
+				    struct x86_emulate_ops* ops)
+{
+	switch(ctxt->mode) {
+	case X86EMUL_MODE_REAL:
+		return emulate_iret_real(ctxt, ops);
+	case X86EMUL_MODE_VM86:
+	case X86EMUL_MODE_PROT16:
+	case X86EMUL_MODE_PROT32:
+	case X86EMUL_MODE_PROT64:
+	default:
+		/* iret from protected mode unimplemented yet */
+		return X86EMUL_UNHANDLEABLE;
+	}
+}
+
 static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
 				struct x86_emulate_ops *ops)
 {
@@ -1819,6 +1463,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
+	unsigned long *rax = &c->regs[VCPU_REGS_RAX];
+	unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
+	u8 de = 0;
 
 	switch (c->modrm_reg) {
 	case 0 ... 1:	/* test */
@@ -1830,10 +1477,26 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 	case 3:	/* neg */
 		emulate_1op("neg", c->dst, ctxt->eflags);
 		break;
+	case 4: /* mul */
+		emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
+		break;
+	case 5: /* imul */
+		emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
+		break;
+	case 6: /* div */
+		emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
+				       ctxt->eflags, de);
+		break;
+	case 7: /* idiv */
+		emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
+				       ctxt->eflags, de);
+		break;
 	default:
-		return 0;
+		return X86EMUL_UNHANDLEABLE;
 	}
-	return 1;
+	if (de)
+		return emulate_de(ctxt);
+	return X86EMUL_CONTINUE;
 }
 
 static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1905,6 +1568,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
+static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
+			   struct x86_emulate_ops *ops, int seg)
+{
+	struct decode_cache *c = &ctxt->decode;
+	unsigned short sel;
+	int rc;
+
+	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+	rc = load_segment_descriptor(ctxt, ops, sel, seg);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->dst.val = c->src.val;
+	return rc;
+}
+
 static inline void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 			struct x86_emulate_ops *ops, struct desc_struct *cs,
@@ -2160,9 +1840,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 				 struct x86_emulate_ops *ops,
 				 u16 port, u16 len)
 {
+	if (ctxt->perm_ok)
+		return true;
+
 	if (emulator_bad_iopl(ctxt, ops))
 		if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
 			return false;
+
+	ctxt->perm_ok = true;
+
 	return true;
 }
 
@@ -2254,7 +1940,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2264,7 +1950,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			     &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2272,7 +1958,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, new_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2285,7 +1971,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 				     ctxt->vcpu, &err);
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 			/* FIXME: need to provide precise fault address */
-			emulate_pf(ctxt, new_tss_base, err);
+			emulate_pf(ctxt);
 			return ret;
 		}
 	}
@@ -2396,7 +2082,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2406,7 +2092,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			     &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2414,7 +2100,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, new_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2427,7 +2113,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 				     ctxt->vcpu, &err);
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 			/* FIXME: need to provide precise fault address */
-			emulate_pf(ctxt, new_tss_base, err);
+			emulate_pf(ctxt);
 			return ret;
 		}
 	}
@@ -2523,10 +2209,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 }
 
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
 			 u16 tss_selector, int reason,
 			 bool has_error_code, u32 error_code)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
@@ -2552,16 +2238,784 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
 	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
 
 	register_address_increment(c, &c->regs[reg], df * op->bytes);
-	op->ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
+	op->addr.mem = register_address(c,  base, c->regs[reg]);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_push(ctxt, ctxt->ops);
+	return X86EMUL_CONTINUE;
+}
+
+static int em_das(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+	u8 al, old_al;
+	bool af, cf, old_cf;
+
+	cf = ctxt->eflags & X86_EFLAGS_CF;
+	al = c->dst.val;
+
+	old_al = al;
+	old_cf = cf;
+	cf = false;
+	af = ctxt->eflags & X86_EFLAGS_AF;
+	if ((al & 0x0f) > 9 || af) {
+		al -= 6;
+		cf = old_cf | (al >= 250);
+		af = true;
+	} else {
+		af = false;
+	}
+	if (old_al > 0x99 || old_cf) {
+		al -= 0x60;
+		cf = true;
+	}
+
+	c->dst.val = al;
+	/* Set PF, ZF, SF */
+	c->src.type = OP_IMM;
+	c->src.val = 0;
+	c->src.bytes = 1;
+	emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+	ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
+	if (cf)
+		ctxt->eflags |= X86_EFLAGS_CF;
+	if (af)
+		ctxt->eflags |= X86_EFLAGS_AF;
+	return X86EMUL_CONTINUE;
+}
+
+static int em_call_far(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+	u16 sel, old_cs;
+	ulong old_eip;
+	int rc;
+
+	old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+	old_eip = c->eip;
+
+	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+	if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
+		return X86EMUL_CONTINUE;
+
+	c->eip = 0;
+	memcpy(&c->eip, c->src.valptr, c->op_bytes);
+
+	c->src.val = old_cs;
+	emulate_push(ctxt, ctxt->ops);
+	rc = writeback(ctxt, ctxt->ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->src.val = old_eip;
+	emulate_push(ctxt, ctxt->ops);
+	rc = writeback(ctxt, ctxt->ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->dst.type = OP_NONE;
+
+	return X86EMUL_CONTINUE;
+}
+
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
+
+	c->dst.type = OP_REG;
+	c->dst.addr.reg = &c->eip;
+	c->dst.bytes = c->op_bytes;
+	rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+	register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
+	return X86EMUL_CONTINUE;
+}
+
+static int em_imul(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
+	return X86EMUL_CONTINUE;
+}
+
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	c->dst.val = c->src2.val;
+	return em_imul(ctxt);
+}
+
+static int em_cwd(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	c->dst.type = OP_REG;
+	c->dst.bytes = c->src.bytes;
+	c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
+	c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
+
+	return X86EMUL_CONTINUE;
+}
+
+static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+	unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
+	struct decode_cache *c = &ctxt->decode;
+	u64 tsc = 0;
+
+	if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
+		emulate_gp(ctxt, 0);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+	ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
+	c->regs[VCPU_REGS_RAX] = (u32)tsc;
+	c->regs[VCPU_REGS_RDX] = tsc >> 32;
+	return X86EMUL_CONTINUE;
+}
+
+static int em_mov(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+	c->dst.val = c->src.val;
+	return X86EMUL_CONTINUE;
+}
+
+#define D(_y) { .flags = (_y) }
+#define N    D(0)
+#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
+#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+
+#define D2bv(_f)      D((_f) | ByteOp), D(_f)
+#define I2bv(_f, _e)  I((_f) | ByteOp, _e), I(_f, _e)
+
+#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM),			\
+		D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock),		\
+		D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
+
+
+static struct opcode group1[] = {
+	X7(D(Lock)), N
+};
+
+static struct opcode group1A[] = {
+	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
+};
+
+static struct opcode group3[] = {
+	D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
+	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+	X4(D(SrcMem | ModRM)),
+};
+
+static struct opcode group4[] = {
+	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
+	N, N, N, N, N, N,
+};
+
+static struct opcode group5[] = {
+	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+	D(SrcMem | ModRM | Stack),
+	I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
+	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
+	D(SrcMem | ModRM | Stack), N,
+};
+
+static struct group_dual group7 = { {
+	N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
+	D(SrcNone | ModRM | DstMem | Mov), N,
+	D(SrcMem16 | ModRM | Mov | Priv),
+	D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
+}, {
+	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
+	D(SrcNone | ModRM | DstMem | Mov), N,
+	D(SrcMem16 | ModRM | Mov | Priv), N,
+} };
+
+static struct opcode group8[] = {
+	N, N, N, N,
+	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
+	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
+};
+
+static struct group_dual group9 = { {
+	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
+}, {
+	N, N, N, N, N, N, N, N,
+} };
+
+static struct opcode group11[] = {
+	I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
+};
+
+static struct opcode opcode_table[256] = {
+	/* 0x00 - 0x07 */
+	D6ALU(Lock),
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+	/* 0x08 - 0x0F */
+	D6ALU(Lock),
+	D(ImplicitOps | Stack | No64), N,
+	/* 0x10 - 0x17 */
+	D6ALU(Lock),
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+	/* 0x18 - 0x1F */
+	D6ALU(Lock),
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+	/* 0x20 - 0x27 */
+	D6ALU(Lock), N, N,
+	/* 0x28 - 0x2F */
+	D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das),
+	/* 0x30 - 0x37 */
+	D6ALU(Lock), N, N,
+	/* 0x38 - 0x3F */
+	D6ALU(0), N, N,
+	/* 0x40 - 0x4F */
+	X16(D(DstReg)),
+	/* 0x50 - 0x57 */
+	X8(I(SrcReg | Stack, em_push)),
+	/* 0x58 - 0x5F */
+	X8(D(DstReg | Stack)),
+	/* 0x60 - 0x67 */
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+	N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
+	N, N, N, N,
+	/* 0x68 - 0x6F */
+	I(SrcImm | Mov | Stack, em_push),
+	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
+	I(SrcImmByte | Mov | Stack, em_push),
+	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
+	D2bv(DstDI | Mov | String), /* insb, insw/insd */
+	D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
+	/* 0x70 - 0x7F */
+	X16(D(SrcImmByte)),
+	/* 0x80 - 0x87 */
+	G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
+	G(DstMem | SrcImm | ModRM | Group, group1),
+	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
+	G(DstMem | SrcImmByte | ModRM | Group, group1),
+	D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
+	/* 0x88 - 0x8F */
+	I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
+	I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
+	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
+	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
+	/* 0x90 - 0x97 */
+	X8(D(SrcAcc | DstReg)),
+	/* 0x98 - 0x9F */
+	D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
+	I(SrcImmFAddr | No64, em_call_far), N,
+	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
+	/* 0xA0 - 0xA7 */
+	I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
+	I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
+	I2bv(SrcSI | DstDI | Mov | String, em_mov),
+	D2bv(SrcSI | DstDI | String),
+	/* 0xA8 - 0xAF */
+	D2bv(DstAcc | SrcImm),
+	I2bv(SrcAcc | DstDI | Mov | String, em_mov),
+	I2bv(SrcSI | DstAcc | Mov | String, em_mov),
+	D2bv(SrcAcc | DstDI | String),
+	/* 0xB0 - 0xB7 */
+	X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
+	/* 0xB8 - 0xBF */
+	X8(I(DstReg | SrcImm | Mov, em_mov)),
+	/* 0xC0 - 0xC7 */
+	D2bv(DstMem | SrcImmByte | ModRM),
+	I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
+	D(ImplicitOps | Stack),
+	D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
+	G(ByteOp, group11), G(0, group11),
+	/* 0xC8 - 0xCF */
+	N, N, N, D(ImplicitOps | Stack),
+	D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
+	/* 0xD0 - 0xD7 */
+	D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
+	N, N, N, N,
+	/* 0xD8 - 0xDF */
+	N, N, N, N, N, N, N, N,
+	/* 0xE0 - 0xE7 */
+	X4(D(SrcImmByte)),
+	D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
+	/* 0xE8 - 0xEF */
+	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
+	D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
+	D2bv(SrcNone | DstAcc),	D2bv(SrcAcc | ImplicitOps),
+	/* 0xF0 - 0xF7 */
+	N, N, N, N,
+	D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
+	/* 0xF8 - 0xFF */
+	D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
+	D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
+};
+
+static struct opcode twobyte_table[256] = {
+	/* 0x00 - 0x0F */
+	N, GD(0, &group7), N, N,
+	N, D(ImplicitOps), D(ImplicitOps | Priv), N,
+	D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
+	N, D(ImplicitOps | ModRM), N, N,
+	/* 0x10 - 0x1F */
+	N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
+	/* 0x20 - 0x2F */
+	D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
+	D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
+	N, N, N, N,
+	N, N, N, N, N, N, N, N,
+	/* 0x30 - 0x3F */
+	D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
+	D(ImplicitOps | Priv), N,
+	D(ImplicitOps), D(ImplicitOps | Priv), N, N,
+	N, N, N, N, N, N, N, N,
+	/* 0x40 - 0x4F */
+	X16(D(DstReg | SrcMem | ModRM | Mov)),
+	/* 0x50 - 0x5F */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0x60 - 0x6F */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0x70 - 0x7F */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0x80 - 0x8F */
+	X16(D(SrcImm)),
+	/* 0x90 - 0x9F */
+	X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
+	/* 0xA0 - 0xA7 */
+	D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+	N, D(DstMem | SrcReg | ModRM | BitOp),
+	D(DstMem | SrcReg | Src2ImmByte | ModRM),
+	D(DstMem | SrcReg | Src2CL | ModRM), N, N,
+	/* 0xA8 - 0xAF */
+	D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+	N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	D(DstMem | SrcReg | Src2ImmByte | ModRM),
+	D(DstMem | SrcReg | Src2CL | ModRM),
+	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
+	/* 0xB0 - 0xB7 */
+	D2bv(DstMem | SrcReg | ModRM | Lock),
+	D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
+	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+	/* 0xB8 - 0xBF */
+	N, N,
+	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+	/* 0xC0 - 0xCF */
+	D2bv(DstMem | SrcReg | ModRM | Lock),
+	N, D(DstMem | SrcReg | ModRM | Mov),
+	N, N, N, GD(0, &group9),
+	N, N, N, N, N, N, N, N,
+	/* 0xD0 - 0xDF */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0xE0 - 0xEF */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0xF0 - 0xFF */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
+};
+
+#undef D
+#undef N
+#undef G
+#undef GD
+#undef I
+
+#undef D2bv
+#undef I2bv
+#undef D6ALU
+
+static unsigned imm_size(struct decode_cache *c)
+{
+	unsigned size;
+
+	size = (c->d & ByteOp) ? 1 : c->op_bytes;
+	if (size == 8)
+		size = 4;
+	return size;
+}
+
+static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
+		      unsigned size, bool sign_extension)
+{
+	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
+	int rc = X86EMUL_CONTINUE;
+
+	op->type = OP_IMM;
+	op->bytes = size;
+	op->addr.mem = c->eip;
+	/* NB. Immediates are sign-extended as necessary. */
+	switch (op->bytes) {
+	case 1:
+		op->val = insn_fetch(s8, 1, c->eip);
+		break;
+	case 2:
+		op->val = insn_fetch(s16, 2, c->eip);
+		break;
+	case 4:
+		op->val = insn_fetch(s32, 4, c->eip);
+		break;
+	}
+	if (!sign_extension) {
+		switch (op->bytes) {
+		case 1:
+			op->val &= 0xff;
+			break;
+		case 2:
+			op->val &= 0xffff;
+			break;
+		case 4:
+			op->val &= 0xffffffff;
+			break;
+		}
+	}
+done:
+	return rc;
 }
 
 int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+x86_decode_insn(struct x86_emulate_ctxt *ctxt)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
+	struct decode_cache *c = &ctxt->decode;
+	int rc = X86EMUL_CONTINUE;
+	int mode = ctxt->mode;
+	int def_op_bytes, def_ad_bytes, dual, goffset;
+	struct opcode opcode, *g_mod012, *g_mod3;
+	struct operand memop = { .type = OP_NONE };
+
+	c->eip = ctxt->eip;
+	c->fetch.start = c->fetch.end = c->eip;
+	ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
+
+	switch (mode) {
+	case X86EMUL_MODE_REAL:
+	case X86EMUL_MODE_VM86:
+	case X86EMUL_MODE_PROT16:
+		def_op_bytes = def_ad_bytes = 2;
+		break;
+	case X86EMUL_MODE_PROT32:
+		def_op_bytes = def_ad_bytes = 4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86EMUL_MODE_PROT64:
+		def_op_bytes = 4;
+		def_ad_bytes = 8;
+		break;
+#endif
+	default:
+		return -1;
+	}
+
+	c->op_bytes = def_op_bytes;
+	c->ad_bytes = def_ad_bytes;
+
+	/* Legacy prefixes. */
+	for (;;) {
+		switch (c->b = insn_fetch(u8, 1, c->eip)) {
+		case 0x66:	/* operand-size override */
+			/* switch between 2/4 bytes */
+			c->op_bytes = def_op_bytes ^ 6;
+			break;
+		case 0x67:	/* address-size override */
+			if (mode == X86EMUL_MODE_PROT64)
+				/* switch between 4/8 bytes */
+				c->ad_bytes = def_ad_bytes ^ 12;
+			else
+				/* switch between 2/4 bytes */
+				c->ad_bytes = def_ad_bytes ^ 6;
+			break;
+		case 0x26:	/* ES override */
+		case 0x2e:	/* CS override */
+		case 0x36:	/* SS override */
+		case 0x3e:	/* DS override */
+			set_seg_override(c, (c->b >> 3) & 3);
+			break;
+		case 0x64:	/* FS override */
+		case 0x65:	/* GS override */
+			set_seg_override(c, c->b & 7);
+			break;
+		case 0x40 ... 0x4f: /* REX */
+			if (mode != X86EMUL_MODE_PROT64)
+				goto done_prefixes;
+			c->rex_prefix = c->b;
+			continue;
+		case 0xf0:	/* LOCK */
+			c->lock_prefix = 1;
+			break;
+		case 0xf2:	/* REPNE/REPNZ */
+			c->rep_prefix = REPNE_PREFIX;
+			break;
+		case 0xf3:	/* REP/REPE/REPZ */
+			c->rep_prefix = REPE_PREFIX;
+			break;
+		default:
+			goto done_prefixes;
+		}
+
+		/* Any legacy prefix after a REX prefix nullifies its effect. */
+
+		c->rex_prefix = 0;
+	}
+
+done_prefixes:
+
+	/* REX prefix. */
+	if (c->rex_prefix & 8)
+		c->op_bytes = 8;	/* REX.W */
+
+	/* Opcode byte(s). */
+	opcode = opcode_table[c->b];
+	/* Two-byte opcode? */
+	if (c->b == 0x0f) {
+		c->twobyte = 1;
+		c->b = insn_fetch(u8, 1, c->eip);
+		opcode = twobyte_table[c->b];
+	}
+	c->d = opcode.flags;
+
+	if (c->d & Group) {
+		dual = c->d & GroupDual;
+		c->modrm = insn_fetch(u8, 1, c->eip);
+		--c->eip;
+
+		if (c->d & GroupDual) {
+			g_mod012 = opcode.u.gdual->mod012;
+			g_mod3 = opcode.u.gdual->mod3;
+		} else
+			g_mod012 = g_mod3 = opcode.u.group;
+
+		c->d &= ~(Group | GroupDual);
+
+		goffset = (c->modrm >> 3) & 7;
+
+		if ((c->modrm >> 6) == 3)
+			opcode = g_mod3[goffset];
+		else
+			opcode = g_mod012[goffset];
+		c->d |= opcode.flags;
+	}
+
+	c->execute = opcode.u.execute;
+
+	/* Unrecognised? */
+	if (c->d == 0 || (c->d & Undefined)) {
+		DPRINTF("Cannot emulate %02x\n", c->b);
+		return -1;
+	}
+
+	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+		c->op_bytes = 8;
+
+	if (c->d & Op3264) {
+		if (mode == X86EMUL_MODE_PROT64)
+			c->op_bytes = 8;
+		else
+			c->op_bytes = 4;
+	}
+
+	/* ModRM and SIB bytes. */
+	if (c->d & ModRM) {
+		rc = decode_modrm(ctxt, ops, &memop);
+		if (!c->has_seg_override)
+			set_seg_override(c, c->modrm_seg);
+	} else if (c->d & MemAbs)
+		rc = decode_abs(ctxt, ops, &memop);
+	if (rc != X86EMUL_CONTINUE)
+		goto done;
+
+	if (!c->has_seg_override)
+		set_seg_override(c, VCPU_SREG_DS);
+
+	if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
+		memop.addr.mem += seg_override_base(ctxt, ops, c);
+
+	if (memop.type == OP_MEM && c->ad_bytes != 8)
+		memop.addr.mem = (u32)memop.addr.mem;
+
+	if (memop.type == OP_MEM && c->rip_relative)
+		memop.addr.mem += c->eip;
+
+	/*
+	 * Decode and fetch the source operand: register, memory
+	 * or immediate.
+	 */
+	switch (c->d & SrcMask) {
+	case SrcNone:
+		break;
+	case SrcReg:
+		decode_register_operand(&c->src, c, 0);
+		break;
+	case SrcMem16:
+		memop.bytes = 2;
+		goto srcmem_common;
+	case SrcMem32:
+		memop.bytes = 4;
+		goto srcmem_common;
+	case SrcMem:
+		memop.bytes = (c->d & ByteOp) ? 1 :
+							   c->op_bytes;
+	srcmem_common:
+		c->src = memop;
+		break;
+	case SrcImmU16:
+		rc = decode_imm(ctxt, &c->src, 2, false);
+		break;
+	case SrcImm:
+		rc = decode_imm(ctxt, &c->src, imm_size(c), true);
+		break;
+	case SrcImmU:
+		rc = decode_imm(ctxt, &c->src, imm_size(c), false);
+		break;
+	case SrcImmByte:
+		rc = decode_imm(ctxt, &c->src, 1, true);
+		break;
+	case SrcImmUByte:
+		rc = decode_imm(ctxt, &c->src, 1, false);
+		break;
+	case SrcAcc:
+		c->src.type = OP_REG;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
+		fetch_register_operand(&c->src);
+		break;
+	case SrcOne:
+		c->src.bytes = 1;
+		c->src.val = 1;
+		break;
+	case SrcSI:
+		c->src.type = OP_MEM;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->src.addr.mem =
+			register_address(c,  seg_override_base(ctxt, ops, c),
+					 c->regs[VCPU_REGS_RSI]);
+		c->src.val = 0;
+		break;
+	case SrcImmFAddr:
+		c->src.type = OP_IMM;
+		c->src.addr.mem = c->eip;
+		c->src.bytes = c->op_bytes + 2;
+		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+		break;
+	case SrcMemFAddr:
+		memop.bytes = c->op_bytes + 2;
+		goto srcmem_common;
+		break;
+	}
+
+	if (rc != X86EMUL_CONTINUE)
+		goto done;
+
+	/*
+	 * Decode and fetch the second source operand: register, memory
+	 * or immediate.
+	 */
+	switch (c->d & Src2Mask) {
+	case Src2None:
+		break;
+	case Src2CL:
+		c->src2.bytes = 1;
+		c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+		break;
+	case Src2ImmByte:
+		rc = decode_imm(ctxt, &c->src2, 1, true);
+		break;
+	case Src2One:
+		c->src2.bytes = 1;
+		c->src2.val = 1;
+		break;
+	case Src2Imm:
+		rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
+		break;
+	}
+
+	if (rc != X86EMUL_CONTINUE)
+		goto done;
+
+	/* Decode and fetch the destination operand: register or memory. */
+	switch (c->d & DstMask) {
+	case DstReg:
+		decode_register_operand(&c->dst, c,
+			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+		break;
+	case DstImmUByte:
+		c->dst.type = OP_IMM;
+		c->dst.addr.mem = c->eip;
+		c->dst.bytes = 1;
+		c->dst.val = insn_fetch(u8, 1, c->eip);
+		break;
+	case DstMem:
+	case DstMem64:
+		c->dst = memop;
+		if ((c->d & DstMask) == DstMem64)
+			c->dst.bytes = 8;
+		else
+			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		if (c->d & BitOp)
+			fetch_bit_operand(c);
+		c->dst.orig_val = c->dst.val;
+		break;
+	case DstAcc:
+		c->dst.type = OP_REG;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
+		fetch_register_operand(&c->dst);
+		c->dst.orig_val = c->dst.val;
+		break;
+	case DstDI:
+		c->dst.type = OP_MEM;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.addr.mem =
+			register_address(c, es_base(ctxt, ops),
+					 c->regs[VCPU_REGS_RDI]);
+		c->dst.val = 0;
+		break;
+	case ImplicitOps:
+		/* Special instructions do their own operand decoding. */
+	default:
+		c->dst.type = OP_NONE; /* Disable writeback. */
+		return 0;
+	}
+
+done:
+	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	/* The second termination condition only applies for REPE
+	 * and REPNE. Test if the repeat string operation prefix is
+	 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+	 * corresponding termination condition according to:
+	 * 	- if REPE/REPZ and ZF = 0 then done
+	 * 	- if REPNE/REPNZ and ZF = 1 then done
+	 */
+	if (((c->b == 0xa6) || (c->b == 0xa7) ||
+	     (c->b == 0xae) || (c->b == 0xaf))
+	    && (((c->rep_prefix == REPE_PREFIX) &&
+		 ((ctxt->eflags & EFLG_ZF) == 0))
+		|| ((c->rep_prefix == REPNE_PREFIX) &&
+		    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
+		return true;
+
+	return false;
+}
+
+int
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+{
+	struct x86_emulate_ops *ops = ctxt->ops;
 	u64 msr_data;
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int saved_dst_type = c->dst.type;
+	int irq; /* Used for int 3, int, and into */
 
 	ctxt->decode.mem_read.pos = 0;
 
@@ -2576,6 +3030,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		goto done;
 	}
 
+	if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
+		emulate_ud(ctxt);
+		goto done;
+	}
+
 	/* Privileged instruction can be executed only in CPL=0 */
 	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
 		emulate_gp(ctxt, 0);
@@ -2583,35 +3042,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	if (c->rep_prefix && (c->d & String)) {
-		ctxt->restart = true;
 		/* All REP prefixes have the same first termination condition */
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
-		string_done:
-			ctxt->restart = false;
 			ctxt->eip = c->eip;
 			goto done;
 		}
-		/* The second termination condition only applies for REPE
-		 * and REPNE. Test if the repeat string operation prefix is
-		 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
-		 * corresponding termination condition according to:
-		 * 	- if REPE/REPZ and ZF = 0 then done
-		 * 	- if REPNE/REPNZ and ZF = 1 then done
-		 */
-		if ((c->b == 0xa6) || (c->b == 0xa7) ||
-		    (c->b == 0xae) || (c->b == 0xaf)) {
-			if ((c->rep_prefix == REPE_PREFIX) &&
-			    ((ctxt->eflags & EFLG_ZF) == 0))
-				goto string_done;
-			if ((c->rep_prefix == REPNE_PREFIX) &&
-			    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
-				goto string_done;
-		}
-		c->eip = ctxt->eip;
 	}
 
-	if (c->src.type == OP_MEM) {
-		rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
+	if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
+		rc = read_emulated(ctxt, ops, c->src.addr.mem,
 					c->src.valptr, c->src.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
@@ -2619,7 +3058,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	if (c->src2.type == OP_MEM) {
-		rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
+		rc = read_emulated(ctxt, ops, c->src2.addr.mem,
 					&c->src2.val, c->src2.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
@@ -2631,7 +3070,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
 		/* optimisation - avoid slow emulated read if Mov */
-		rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
+		rc = read_emulated(ctxt, ops, c->dst.addr.mem,
 				   &c->dst.val, c->dst.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
@@ -2640,6 +3079,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 special_insn:
 
+	if (c->execute) {
+		rc = c->execute(ctxt);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		goto writeback;
+	}
+
 	if (c->twobyte)
 		goto twobyte_insn;
 
@@ -2653,8 +3099,6 @@ special_insn:
 		break;
 	case 0x07:		/* pop es */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x08 ... 0x0d:
 	      or:		/* or */
@@ -2672,8 +3116,6 @@ special_insn:
 		break;
 	case 0x17:		/* pop ss */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x18 ... 0x1d:
 	      sbb:		/* sbb */
@@ -2684,8 +3126,6 @@ special_insn:
 		break;
 	case 0x1f:		/* pop ds */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x20 ... 0x25:
 	      and:		/* and */
@@ -2709,58 +3149,29 @@ special_insn:
 	case 0x48 ... 0x4f: /* dec r16/r32 */
 		emulate_1op("dec", c->dst, ctxt->eflags);
 		break;
-	case 0x50 ... 0x57:  /* push reg */
-		emulate_push(ctxt, ops);
-		break;
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
 		rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x60:	/* pusha */
 		rc = emulate_pusha(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x61:	/* popa */
 		rc = emulate_popa(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x63:		/* movsxd */
 		if (ctxt->mode != X86EMUL_MODE_PROT64)
 			goto cannot_emulate;
 		c->dst.val = (s32) c->src.val;
 		break;
-	case 0x68: /* push imm */
-	case 0x6a: /* push imm8 */
-		emulate_push(ctxt, ops);
-		break;
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
-		c->dst.bytes = min(c->dst.bytes, 4u);
-		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
-					  c->dst.bytes)) {
-			emulate_gp(ctxt, 0);
-			goto done;
-		}
-		if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
-				     c->regs[VCPU_REGS_RDX], &c->dst.val))
-			goto done; /* IO is needed, skip writeback */
-		break;
+		c->src.val = c->regs[VCPU_REGS_RDX];
+		goto do_io_in;
 	case 0x6e:		/* outsb */
 	case 0x6f:		/* outsw/outsd */
-		c->src.bytes = min(c->src.bytes, 4u);
-		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
-					  c->src.bytes)) {
-			emulate_gp(ctxt, 0);
-			goto done;
-		}
-		ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
-				      &c->src.val, 1, ctxt->vcpu);
-
-		c->dst.type = OP_NONE; /* nothing to writeback */
+		c->dst.val = c->regs[VCPU_REGS_RDX];
+		goto do_io_out;
 		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(c->b, ctxt->eflags))
@@ -2793,29 +3204,15 @@ special_insn:
 	case 0x86 ... 0x87:	/* xchg */
 	xchg:
 		/* Write back the register source. */
-		switch (c->dst.bytes) {
-		case 1:
-			*(u8 *) c->src.ptr = (u8) c->dst.val;
-			break;
-		case 2:
-			*(u16 *) c->src.ptr = (u16) c->dst.val;
-			break;
-		case 4:
-			*c->src.ptr = (u32) c->dst.val;
-			break;	/* 64b reg: zero-extend */
-		case 8:
-			*c->src.ptr = c->dst.val;
-			break;
-		}
+		c->src.val = c->dst.val;
+		write_register_operand(&c->src);
 		/*
 		 * Write back the memory destination with implicit LOCK
 		 * prefix.
 		 */
-		c->dst.val = c->src.val;
+		c->dst.val = c->src.orig_val;
 		c->lock_prefix = 1;
 		break;
-	case 0x88 ... 0x8b:	/* mov */
-		goto mov;
 	case 0x8c:  /* mov r/m, sreg */
 		if (c->modrm_reg > VCPU_SREG_GS) {
 			emulate_ud(ctxt);
@@ -2824,7 +3221,7 @@ special_insn:
 		c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
 		break;
 	case 0x8d: /* lea r16/r32, m */
-		c->dst.val = c->modrm_ea;
+		c->dst.val = c->src.addr.mem;
 		break;
 	case 0x8e: { /* mov seg, r/m16 */
 		uint16_t sel;
@@ -2847,76 +3244,87 @@ special_insn:
 	}
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		rc = emulate_grp1a(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
-	case 0x90: /* nop / xchg r8,rax */
-		if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
-			c->dst.type = OP_NONE;  /* nop */
+	case 0x90 ... 0x97: /* nop / xchg reg, rax */
+		if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
 			break;
-		}
-	case 0x91 ... 0x97: /* xchg reg,rax */
-		c->src.type = OP_REG;
-		c->src.bytes = c->op_bytes;
-		c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
-		c->src.val = *(c->src.ptr);
 		goto xchg;
+	case 0x98: /* cbw/cwde/cdqe */
+		switch (c->op_bytes) {
+		case 2: c->dst.val = (s8)c->dst.val; break;
+		case 4: c->dst.val = (s16)c->dst.val; break;
+		case 8: c->dst.val = (s32)c->dst.val; break;
+		}
+		break;
 	case 0x9c: /* pushf */
 		c->src.val =  (unsigned long) ctxt->eflags;
 		emulate_push(ctxt, ops);
 		break;
 	case 0x9d: /* popf */
 		c->dst.type = OP_REG;
-		c->dst.ptr = (unsigned long *) &ctxt->eflags;
+		c->dst.addr.reg = &ctxt->eflags;
 		c->dst.bytes = c->op_bytes;
 		rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
-	case 0xa0 ... 0xa3:	/* mov */
-	case 0xa4 ... 0xa5:	/* movs */
-		goto mov;
 	case 0xa6 ... 0xa7:	/* cmps */
 		c->dst.type = OP_NONE; /* Disable writeback. */
-		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
+		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
 		goto cmp;
 	case 0xa8 ... 0xa9:	/* test ax, imm */
 		goto test;
-	case 0xaa ... 0xab:	/* stos */
-		c->dst.val = c->regs[VCPU_REGS_RAX];
-		break;
-	case 0xac ... 0xad:	/* lods */
-		goto mov;
 	case 0xae ... 0xaf:	/* scas */
-		DPRINTF("Urk! I don't handle SCAS.\n");
-		goto cannot_emulate;
-	case 0xb0 ... 0xbf: /* mov r, imm */
-		goto mov;
+		goto cmp;
 	case 0xc0 ... 0xc1:
 		emulate_grp2(ctxt);
 		break;
 	case 0xc3: /* ret */
 		c->dst.type = OP_REG;
-		c->dst.ptr = &c->eip;
+		c->dst.addr.reg = &c->eip;
 		c->dst.bytes = c->op_bytes;
 		goto pop_instruction;
-	case 0xc6 ... 0xc7:	/* mov (sole member of Grp11) */
-	mov:
-		c->dst.val = c->src.val;
+	case 0xc4:		/* les */
+		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
+		break;
+	case 0xc5:		/* lds */
+		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
 		break;
 	case 0xcb:		/* ret far */
 		rc = emulate_ret_far(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
+		break;
+	case 0xcc:		/* int3 */
+		irq = 3;
+		goto do_interrupt;
+	case 0xcd:		/* int n */
+		irq = c->src.val;
+	do_interrupt:
+		rc = emulate_int(ctxt, ops, irq);
+		break;
+	case 0xce:		/* into */
+		if (ctxt->eflags & EFLG_OF) {
+			irq = 4;
+			goto do_interrupt;
+		}
+		break;
+	case 0xcf:		/* iret */
+		rc = emulate_iret(ctxt, ops);
 		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
-		c->src.val = 1;
 		emulate_grp2(ctxt);
 		break;
 	case 0xd2 ... 0xd3:	/* Grp2 */
 		c->src.val = c->regs[VCPU_REGS_RCX];
 		emulate_grp2(ctxt);
 		break;
+	case 0xe0 ... 0xe2:	/* loop/loopz/loopnz */
+		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+		if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
+		    (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
+			jmp_rel(c, c->src.val);
+		break;
+	case 0xe3:	/* jcxz/jecxz/jrcxz */
+		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
+			jmp_rel(c, c->src.val);
+		break;
 	case 0xe4: 	/* inb */
 	case 0xe5: 	/* in */
 		goto do_io_in;
@@ -2964,15 +3372,16 @@ special_insn:
 		break;
 	case 0xee: /* out dx,al */
 	case 0xef: /* out dx,(e/r)ax */
-		c->src.val = c->regs[VCPU_REGS_RDX];
+		c->dst.val = c->regs[VCPU_REGS_RDX];
 	do_io_out:
-		c->dst.bytes = min(c->dst.bytes, 4u);
-		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
+		c->src.bytes = min(c->src.bytes, 4u);
+		if (!emulator_io_permited(ctxt, ops, c->dst.val,
+					  c->src.bytes)) {
 			emulate_gp(ctxt, 0);
 			goto done;
 		}
-		ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
-				      ctxt->vcpu);
+		ops->pio_out_emulated(c->src.bytes, c->dst.val,
+				      &c->src.val, 1, ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf4:              /* hlt */
@@ -2981,24 +3390,22 @@ special_insn:
 	case 0xf5:	/* cmc */
 		/* complement carry flag from eflags reg */
 		ctxt->eflags ^= EFLG_CF;
-		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
-		if (!emulate_grp3(ctxt, ops))
-			goto cannot_emulate;
+		rc = emulate_grp3(ctxt, ops);
 		break;
 	case 0xf8: /* clc */
 		ctxt->eflags &= ~EFLG_CF;
-		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0xf9: /* stc */
+		ctxt->eflags |= EFLG_CF;
 		break;
 	case 0xfa: /* cli */
 		if (emulator_bad_iopl(ctxt, ops)) {
 			emulate_gp(ctxt, 0);
 			goto done;
-		} else {
+		} else
 			ctxt->eflags &= ~X86_EFLAGS_IF;
-			c->dst.type = OP_NONE;	/* Disable writeback. */
-		}
 		break;
 	case 0xfb: /* sti */
 		if (emulator_bad_iopl(ctxt, ops)) {
@@ -3007,29 +3414,29 @@ special_insn:
 		} else {
 			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
 			ctxt->eflags |= X86_EFLAGS_IF;
-			c->dst.type = OP_NONE;	/* Disable writeback. */
 		}
 		break;
 	case 0xfc: /* cld */
 		ctxt->eflags &= ~EFLG_DF;
-		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfd: /* std */
 		ctxt->eflags |= EFLG_DF;
-		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfe: /* Grp4 */
 	grp45:
 		rc = emulate_grp45(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xff: /* Grp5 */
 		if (c->modrm_reg == 5)
 			goto jump_far;
 		goto grp45;
+	default:
+		goto cannot_emulate;
 	}
 
+	if (rc != X86EMUL_CONTINUE)
+		goto done;
+
 writeback:
 	rc = writeback(ctxt, ops);
 	if (rc != X86EMUL_CONTINUE)
@@ -3050,25 +3457,32 @@ writeback:
 				&c->dst);
 
 	if (c->rep_prefix && (c->d & String)) {
-		struct read_cache *rc = &ctxt->decode.io_read;
+		struct read_cache *r = &ctxt->decode.io_read;
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-		/*
-		 * Re-enter guest when pio read ahead buffer is empty or,
-		 * if it is not used, after each 1024 iteration.
-		 */
-		if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
-		    (rc->end != 0 && rc->end == rc->pos))
-			ctxt->restart = false;
+
+		if (!string_insn_completed(ctxt)) {
+			/*
+			 * Re-enter guest when pio read ahead buffer is empty
+			 * or, if it is not used, after each 1024 iteration.
+			 */
+			if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
+			    (r->end == 0 || r->end != r->pos)) {
+				/*
+				 * Reset read cache. Usually happens before
+				 * decode, but since instruction is restarted
+				 * we have to do it here.
+				 */
+				ctxt->decode.mem_read.end = 0;
+				return EMULATION_RESTART;
+			}
+			goto done; /* skip rip writeback */
+		}
 	}
-	/*
-	 * reset read cache here in case string instruction is restared
-	 * without decoding
-	 */
-	ctxt->decode.mem_read.end = 0;
+
 	ctxt->eip = c->eip;
 
 done:
-	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 
 twobyte_insn:
 	switch (c->b) {
@@ -3091,7 +3505,7 @@ twobyte_insn:
 			c->dst.type = OP_NONE;
 			break;
 		case 2: /* lgdt */
-			rc = read_descriptor(ctxt, ops, c->src.ptr,
+			rc = read_descriptor(ctxt, ops, c->src.addr.mem,
 					     &size, &address, c->op_bytes);
 			if (rc != X86EMUL_CONTINUE)
 				goto done;
@@ -3104,14 +3518,12 @@ twobyte_insn:
 				switch (c->modrm_rm) {
 				case 1:
 					rc = kvm_fix_hypercall(ctxt->vcpu);
-					if (rc != X86EMUL_CONTINUE)
-						goto done;
 					break;
 				default:
 					goto cannot_emulate;
 				}
 			} else {
-				rc = read_descriptor(ctxt, ops, c->src.ptr,
+				rc = read_descriptor(ctxt, ops, c->src.addr.mem,
 						     &size, &address,
 						     c->op_bytes);
 				if (rc != X86EMUL_CONTINUE)
@@ -3126,7 +3538,7 @@ twobyte_insn:
 			c->dst.val = ops->get_cr(0, ctxt->vcpu);
 			break;
 		case 6: /* lmsw */
-			ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
+			ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
 				    (c->src.val & 0x0f), ctxt->vcpu);
 			c->dst.type = OP_NONE;
 			break;
@@ -3134,7 +3546,7 @@ twobyte_insn:
 			emulate_ud(ctxt);
 			goto done;
 		case 7: /* invlpg*/
-			emulate_invlpg(ctxt->vcpu, c->modrm_ea);
+			emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
@@ -3144,23 +3556,16 @@ twobyte_insn:
 		break;
 	case 0x05: 		/* syscall */
 		rc = emulate_syscall(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		else
-			goto writeback;
 		break;
 	case 0x06:
 		emulate_clts(ctxt->vcpu);
-		c->dst.type = OP_NONE;
 		break;
 	case 0x09:		/* wbinvd */
 		kvm_emulate_wbinvd(ctxt->vcpu);
-		c->dst.type = OP_NONE;
 		break;
 	case 0x08:		/* invd */
 	case 0x0d:		/* GrpP (prefetch) */
 	case 0x18:		/* Grp16 (prefetch/nop) */
-		c->dst.type = OP_NONE;
 		break;
 	case 0x20: /* mov cr, reg */
 		switch (c->modrm_reg) {
@@ -3170,8 +3575,7 @@ twobyte_insn:
 			emulate_ud(ctxt);
 			goto done;
 		}
-		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
-		c->dst.type = OP_NONE;	/* no writeback */
+		c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
 		break;
 	case 0x21: /* mov from dr to reg */
 		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
@@ -3179,11 +3583,10 @@ twobyte_insn:
 			emulate_ud(ctxt);
 			goto done;
 		}
-		ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
-		c->dst.type = OP_NONE;	/* no writeback */
+		ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
 		break;
 	case 0x22: /* mov reg, cr */
-		if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
+		if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
 			emulate_gp(ctxt, 0);
 			goto done;
 		}
@@ -3196,7 +3599,7 @@ twobyte_insn:
 			goto done;
 		}
 
-		if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
+		if (ops->set_dr(c->modrm_reg, c->src.val &
 				((ctxt->mode == X86EMUL_MODE_PROT64) ?
 				 ~0ULL : ~0U), ctxt->vcpu) < 0) {
 			/* #UD condition is already handled by the code above */
@@ -3215,7 +3618,6 @@ twobyte_insn:
 			goto done;
 		}
 		rc = X86EMUL_CONTINUE;
-		c->dst.type = OP_NONE;
 		break;
 	case 0x32:
 		/* rdmsr */
@@ -3227,21 +3629,12 @@ twobyte_insn:
 			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
 		}
 		rc = X86EMUL_CONTINUE;
-		c->dst.type = OP_NONE;
 		break;
 	case 0x34:		/* sysenter */
 		rc = emulate_sysenter(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		else
-			goto writeback;
 		break;
 	case 0x35:		/* sysexit */
 		rc = emulate_sysexit(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		else
-			goto writeback;
 		break;
 	case 0x40 ... 0x4f:	/* cmov */
 		c->dst.val = c->dst.orig_val = c->src.val;
@@ -3251,15 +3644,15 @@ twobyte_insn:
 	case 0x80 ... 0x8f: /* jnz rel, etc*/
 		if (test_cc(c->b, ctxt->eflags))
 			jmp_rel(c, c->src.val);
-		c->dst.type = OP_NONE;
+		break;
+	case 0x90 ... 0x9f:     /* setcc r/m8 */
+		c->dst.val = test_cc(c->b, ctxt->eflags);
 		break;
 	case 0xa0:	  /* push fs */
 		emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
 		break;
 	case 0xa1:	 /* pop fs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xa3:
 	      bt:		/* bt */
@@ -3277,13 +3670,9 @@ twobyte_insn:
 		break;
 	case 0xa9:	/* pop gs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xab:
 	      bts:		/* bts */
-		/* only subword offset */
-		c->src.val &= (c->dst.bytes << 3) - 1;
 		emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0xac: /* shrd imm8, r, r/m */
@@ -3306,15 +3695,22 @@ twobyte_insn:
 		} else {
 			/* Failure: write the value we saw to EAX. */
 			c->dst.type = OP_REG;
-			c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+			c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
 		}
 		break;
+	case 0xb2:		/* lss */
+		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
+		break;
 	case 0xb3:
 	      btr:		/* btr */
-		/* only subword offset */
-		c->src.val &= (c->dst.bytes << 3) - 1;
 		emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
 		break;
+	case 0xb4:		/* lfs */
+		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
+		break;
+	case 0xb5:		/* lgs */
+		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
+		break;
 	case 0xb6 ... 0xb7:	/* movzx */
 		c->dst.bytes = c->op_bytes;
 		c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
@@ -3334,15 +3730,43 @@ twobyte_insn:
 		break;
 	case 0xbb:
 	      btc:		/* btc */
-		/* only subword offset */
-		c->src.val &= (c->dst.bytes << 3) - 1;
 		emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
 		break;
+	case 0xbc: {		/* bsf */
+		u8 zf;
+		__asm__ ("bsf %2, %0; setz %1"
+			 : "=r"(c->dst.val), "=q"(zf)
+			 : "r"(c->src.val));
+		ctxt->eflags &= ~X86_EFLAGS_ZF;
+		if (zf) {
+			ctxt->eflags |= X86_EFLAGS_ZF;
+			c->dst.type = OP_NONE;	/* Disable writeback. */
+		}
+		break;
+	}
+	case 0xbd: {		/* bsr */
+		u8 zf;
+		__asm__ ("bsr %2, %0; setz %1"
+			 : "=r"(c->dst.val), "=q"(zf)
+			 : "r"(c->src.val));
+		ctxt->eflags &= ~X86_EFLAGS_ZF;
+		if (zf) {
+			ctxt->eflags |= X86_EFLAGS_ZF;
+			c->dst.type = OP_NONE;	/* Disable writeback. */
+		}
+		break;
+	}
 	case 0xbe ... 0xbf:	/* movsx */
 		c->dst.bytes = c->op_bytes;
 		c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
 							(s16) c->src.val;
 		break;
+	case 0xc0 ... 0xc1:	/* xadd */
+		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+		/* Write back the register source. */
+		c->src.val = c->dst.orig_val;
+		write_register_operand(&c->src);
+		break;
 	case 0xc3:		/* movnti */
 		c->dst.bytes = c->op_bytes;
 		c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
@@ -3350,10 +3774,14 @@ twobyte_insn:
 		break;
 	case 0xc7:		/* Grp9 (cmpxchg8b) */
 		rc = emulate_grp9(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
+	default:
+		goto cannot_emulate;
 	}
+
+	if (rc != X86EMUL_CONTINUE)
+		goto done;
+
 	goto writeback;
 
 cannot_emulate:
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index ddeb2314b522..efad72385058 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2006 Intel Corporation
  * Copyright (c) 2007 Keir Fraser, XenSource Inc
  * Copyright (c) 2008 Intel Corporation
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
 	}
 }
 
-int pit_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-
-	if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
-		return atomic_read(&pit->pit_state.pit_timer.pending);
-	return 0;
-}
-
 static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 4b7b73ce2098..f628234fbeca 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 2003-2004 Fabrice Bellard
  * Copyright (c) 2007 Intel Corporation
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level);
 static void pic_lock(struct kvm_pic *s)
 	__acquires(&s->lock)
 {
-	raw_spin_lock(&s->lock);
+	spin_lock(&s->lock);
 }
 
 static void pic_unlock(struct kvm_pic *s)
@@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s)
 
 	s->wakeup_needed = false;
 
-	raw_spin_unlock(&s->lock);
+	spin_unlock(&s->lock);
 
 	if (wakeup) {
 		kvm_for_each_vcpu(i, vcpu, s->kvm) {
@@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s)
 		if (!found)
 			return;
 
+		kvm_make_request(KVM_REQ_EVENT, found);
 		kvm_vcpu_kick(found);
 	}
 }
@@ -308,13 +309,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 	addr &= 1;
 	if (addr == 0) {
 		if (val & 0x10) {
-			kvm_pic_reset(s);	/* init */
-			/*
-			 * deassert a pending interrupt
-			 */
-			pic_irq_request(s->pics_state->kvm, 0);
-			s->init_state = 1;
 			s->init4 = val & 1;
+			s->last_irr = 0;
+			s->imr = 0;
+			s->priority_add = 0;
+			s->special_mask = 0;
+			s->read_reg_select = 0;
+			if (!s->init4) {
+				s->special_fully_nested_mode = 0;
+				s->auto_eoi = 0;
+			}
+			s->init_state = 1;
 			if (val & 0x02)
 				printk(KERN_ERR "single mode not supported");
 			if (val & 0x08)
@@ -564,7 +569,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
 	if (!s)
 		return NULL;
-	raw_spin_lock_init(&s->lock);
+	spin_lock_init(&s->lock);
 	s->kvm = kvm;
 	s->pics[0].elcr_mask = 0xf8;
 	s->pics[1].elcr_mask = 0xde;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 2095a049835e..7e06ba1618bd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,7 +1,7 @@
 /*
  * irq.c: API for in kernel interrupt controller
  * Copyright (c) 2007, Intel Corporation.
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -33,12 +33,7 @@
  */
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-	int ret;
-
-	ret = pit_has_pending_timer(vcpu);
-	ret |= apic_has_pending_timer(vcpu);
-
-	return ret;
+	return apic_has_pending_timer(vcpu);
 }
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
 
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 63c314502993..ba910d149410 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -60,7 +60,7 @@ struct kvm_kpic_state {
 };
 
 struct kvm_pic {
-	raw_spinlock_t lock;
+	spinlock_t lock;
 	bool wakeup_needed;
 	unsigned pending_acks;
 	struct kvm *kvm;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 6491ac8e755b..975bb45329a1 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -42,7 +42,14 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
 
-	return vcpu->arch.pdptrs[index];
+	return vcpu->arch.walk_mmu->pdptrs[index];
+}
+
+static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
+{
+	load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
+
+	return mmu->pdptrs[index];
 }
 
 static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 22b06f7660f4..413f8973a855 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,7 +5,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2007 Novell
  * Copyright (C) 2007 Intel
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Dor Laor <dor.laor@qumranet.com>
@@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
 
 static void apic_update_ppr(struct kvm_lapic *apic)
 {
-	u32 tpr, isrv, ppr;
+	u32 tpr, isrv, ppr, old_ppr;
 	int isr;
 
+	old_ppr = apic_get_reg(apic, APIC_PROCPRI);
 	tpr = apic_get_reg(apic, APIC_TASKPRI);
 	isr = apic_find_highest_isr(apic);
 	isrv = (isr != -1) ? isr : 0;
@@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic)
 	apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
 		   apic, ppr, isr, isrv);
 
-	apic_set_reg(apic, APIC_PROCPRI, ppr);
+	if (old_ppr != ppr) {
+		apic_set_reg(apic, APIC_PROCPRI, ppr);
+		kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+	}
 }
 
 static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
@@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			break;
 		}
 
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
 		kvm_vcpu_kick(vcpu);
 		break;
 
@@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 				       "INIT on a runnable vcpu %d\n",
 				       vcpu->vcpu_id);
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		} else {
 			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
@@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			result = 1;
 			vcpu->arch.sipi_vector = vector;
 			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		}
 		break;
@@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
 		trigger_mode = IOAPIC_EDGE_TRIG;
 	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
 		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1151,6 +1159,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 	update_divide_count(apic);
 	start_apic_timer(apic);
 	apic->irr_pending = true;
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad8951..908ea5464a51 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
  * MMU support
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
@@ -49,15 +49,25 @@
  */
 bool tdp_enabled = false;
 
-#undef MMU_DEBUG
+enum {
+	AUDIT_PRE_PAGE_FAULT,
+	AUDIT_POST_PAGE_FAULT,
+	AUDIT_PRE_PTE_WRITE,
+	AUDIT_POST_PTE_WRITE,
+	AUDIT_PRE_SYNC,
+	AUDIT_POST_SYNC
+};
 
-#undef AUDIT
+char *audit_point_name[] = {
+	"pre page fault",
+	"post page fault",
+	"pre pte write",
+	"post pte write",
+	"pre sync",
+	"post sync"
+};
 
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
+#undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
 
@@ -71,7 +81,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
 
 #endif
 
-#if defined(MMU_DEBUG) || defined(AUDIT)
+#ifdef MMU_DEBUG
 static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
@@ -89,6 +99,8 @@ module_param(oos_shadow, bool, 0644);
 	}
 #endif
 
+#define PTE_PREFETCH_NUM		8
+
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
@@ -178,6 +190,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
+static struct percpu_counter kvm_total_used_mmu_pages;
 
 static u64 __read_mostly shadow_trap_nonpresent_pte;
 static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -299,18 +312,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
 #endif
 }
 
+static bool spte_has_volatile_bits(u64 spte)
+{
+	if (!shadow_accessed_mask)
+		return false;
+
+	if (!is_shadow_present_pte(spte))
+		return false;
+
+	if ((spte & shadow_accessed_mask) &&
+	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
+		return false;
+
+	return true;
+}
+
+static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+	return (old_spte & bit_mask) && !(new_spte & bit_mask);
+}
+
 static void update_spte(u64 *sptep, u64 new_spte)
 {
-	u64 old_spte;
+	u64 mask, old_spte = *sptep;
+
+	WARN_ON(!is_rmap_spte(new_spte));
+
+	new_spte |= old_spte & shadow_dirty_mask;
 
-	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
-	      !is_rmap_spte(*sptep))
+	mask = shadow_accessed_mask;
+	if (is_writable_pte(old_spte))
+		mask |= shadow_dirty_mask;
+
+	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
 		__set_spte(sptep, new_spte);
-	else {
+	else
 		old_spte = __xchg_spte(sptep, new_spte);
-		if (old_spte & shadow_accessed_mask)
-			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
-	}
+
+	if (!shadow_accessed_mask)
+		return;
+
+	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -367,7 +412,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
-				   rmap_desc_cache, 4);
+				   rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -591,6 +636,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 		desc->sptes[0] = (u64 *)*rmapp;
 		desc->sptes[1] = spte;
 		*rmapp = (unsigned long)desc | 1;
+		++count;
 	} else {
 		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -603,7 +649,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 			desc = desc->more;
 		}
 		for (i = 0; desc->sptes[i]; ++i)
-			;
+			++count;
 		desc->sptes[i] = spte;
 	}
 	return count;
@@ -645,18 +691,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 	if (!*rmapp) {
-		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+		printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
 		BUG();
 	} else if (!(*rmapp & 1)) {
-		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
+		rmap_printk("rmap_remove:  %p 1->0\n", spte);
 		if ((u64 *)*rmapp != spte) {
-			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
-			       spte, *spte);
+			printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
 			BUG();
 		}
 		*rmapp = 0;
 	} else {
-		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
+		rmap_printk("rmap_remove:  %p many->many\n", spte);
 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 		prev_desc = NULL;
 		while (desc) {
@@ -670,7 +715,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 			prev_desc = desc;
 			desc = desc->more;
 		}
-		pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
+		pr_err("rmap_remove: %p many->many\n", spte);
 		BUG();
 	}
 }
@@ -680,18 +725,18 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte)
 	pfn_t pfn;
 	u64 old_spte = *sptep;
 
-	if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
-	      old_spte & shadow_accessed_mask) {
+	if (!spte_has_volatile_bits(old_spte))
 		__set_spte(sptep, new_spte);
-	} else
+	else
 		old_spte = __xchg_spte(sptep, new_spte);
 
 	if (!is_rmap_spte(old_spte))
 		return;
+
 	pfn = spte_to_pfn(old_spte);
 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
-	if (is_writable_pte(old_spte))
+	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
 		kvm_set_pfn_dirty(pfn);
 }
 
@@ -746,13 +791,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		}
 		spte = rmap_next(kvm, rmapp, spte);
 	}
-	if (write_protected) {
-		pfn_t pfn;
-
-		spte = rmap_next(kvm, rmapp, NULL);
-		pfn = spte_to_pfn(*spte);
-		kvm_set_pfn_dirty(pfn);
-	}
 
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
@@ -947,6 +985,18 @@ static int is_empty_shadow_page(u64 *spt)
 }
 #endif
 
+/*
+ * This value is the sum of all of the kvm instances's
+ * kvm->arch.n_used_mmu_pages values.  We need a global,
+ * aggregate version in order to make the slab shrinker
+ * faster
+ */
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+{
+	kvm->arch.n_used_mmu_pages += nr;
+	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
+}
+
 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
@@ -956,7 +1006,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (!sp->role.direct)
 		__free_page(virt_to_page(sp->gfns));
 	kmem_cache_free(mmu_page_header_cache, sp);
-	++kvm->arch.n_free_mmu_pages;
+	kvm_mod_used_mmu_pages(kvm, -1);
 }
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -979,7 +1029,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
 	sp->parent_pte = parent_pte;
-	--vcpu->kvm->arch.n_free_mmu_pages;
+	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
 	return sp;
 }
 
@@ -1403,7 +1453,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	if (role.direct)
 		role.cr4_pae = 0;
 	role.access = access;
-	if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+	if (!vcpu->arch.mmu.direct_map
+	    && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
@@ -1458,6 +1509,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 	iterator->addr = addr;
 	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
 	iterator->level = vcpu->arch.mmu.shadow_root_level;
+
+	if (iterator->level == PT64_ROOT_LEVEL &&
+	    vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+	    !vcpu->arch.mmu.direct_map)
+		--iterator->level;
+
 	if (iterator->level == PT32E_ROOT_LEVEL) {
 		iterator->shadow_addr
 			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1665,41 +1722,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 /*
  * Changing the number of mmu pages allocated to the vm
- * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
+ * Note: if goal_nr_mmu_pages is too small, you will get dead lock
  */
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 {
-	int used_pages;
 	LIST_HEAD(invalid_list);
-
-	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
-	used_pages = max(0, used_pages);
-
 	/*
 	 * If we set the number of mmu pages to be smaller be than the
 	 * number of actived pages , we must to free some mmu pages before we
 	 * change the value
 	 */
 
-	if (used_pages > kvm_nr_mmu_pages) {
-		while (used_pages > kvm_nr_mmu_pages &&
+	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
 			!list_empty(&kvm->arch.active_mmu_pages)) {
 			struct kvm_mmu_page *page;
 
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
-			used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
-							       &invalid_list);
+			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
+			kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		}
-		kvm_mmu_commit_zap_page(kvm, &invalid_list);
-		kvm_nr_mmu_pages = used_pages;
-		kvm->arch.n_free_mmu_pages = 0;
+		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
 	}
-	else
-		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
-					 - kvm->arch.n_alloc_mmu_pages;
 
-	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
 }
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -1709,11 +1756,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 	LIST_HEAD(invalid_list);
 	int r;
 
-	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
+	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
 	r = 0;
 
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
-		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 			 sp->role.word);
 		r = 1;
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1729,7 +1776,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 	LIST_HEAD(invalid_list);
 
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
-		pgprintk("%s: zap %lx %x\n",
+		pgprintk("%s: zap %llx %x\n",
 			 __func__, gfn, sp->role.word);
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
@@ -1925,7 +1972,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	 * whether the guest actually used the pte (in order to detect
 	 * demand paging).
 	 */
-	spte = shadow_base_present_pte | shadow_dirty_mask;
+	spte = shadow_base_present_pte;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
 	if (!dirty)
@@ -1948,8 +1995,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	spte |= (u64)pfn << PAGE_SHIFT;
 
 	if ((pte_access & ACC_WRITE_MASK)
-	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
-		&& !user_fault)) {
+	    || (!vcpu->arch.mmu.direct_map && write_fault
+		&& !is_write_protection(vcpu) && !user_fault)) {
 
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1960,7 +2007,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 		spte |= PT_WRITABLE_MASK;
 
-		if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
+		if (!vcpu->arch.mmu.direct_map
+		    && !(pte_access & ACC_WRITE_MASK))
 			spte &= ~PT_USER_MASK;
 
 		/*
@@ -1973,7 +2021,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			goto set_pte;
 
 		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
-			pgprintk("%s: found shadow page for %lx, marking ro\n",
+			pgprintk("%s: found shadow page for %llx, marking ro\n",
 				 __func__, gfn);
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
@@ -1986,8 +2034,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	if (is_writable_pte(*sptep) && !is_writable_pte(spte))
-		kvm_set_pfn_dirty(pfn);
 	update_spte(sptep, spte);
 done:
 	return ret;
@@ -2004,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	int rmap_count;
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
-		 " user_fault %d gfn %lx\n",
+		 " user_fault %d gfn %llx\n",
 		 __func__, *sptep, pt_access,
 		 write_fault, user_fault, gfn);
 
@@ -2023,7 +2069,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			__set_spte(sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else if (pfn != spte_to_pfn(*sptep)) {
-			pgprintk("hfn old %lx new %lx\n",
+			pgprintk("hfn old %llx new %llx\n",
 				 spte_to_pfn(*sptep), pfn);
 			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2040,7 +2086,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
 		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
 		 *sptep, sptep);
@@ -2064,6 +2110,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
+static struct kvm_memory_slot *
+pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(vcpu->kvm, gfn);
+	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
+	      (no_dirty_log && slot->dirty_bitmap))
+		slot = NULL;
+
+	return slot;
+}
+
+static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+				     bool no_dirty_log)
+{
+	struct kvm_memory_slot *slot;
+	unsigned long hva;
+
+	slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
+	if (!slot) {
+		get_page(bad_page);
+		return page_to_pfn(bad_page);
+	}
+
+	hva = gfn_to_hva_memslot(slot, gfn);
+
+	return hva_to_pfn_atomic(vcpu->kvm, hva);
+}
+
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+				    struct kvm_mmu_page *sp,
+				    u64 *start, u64 *end)
+{
+	struct page *pages[PTE_PREFETCH_NUM];
+	unsigned access = sp->role.access;
+	int i, ret;
+	gfn_t gfn;
+
+	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
+	if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
+		return -1;
+
+	ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+	if (ret <= 0)
+		return -1;
+
+	for (i = 0; i < ret; i++, gfn++, start++)
+		mmu_set_spte(vcpu, start, ACC_ALL,
+			     access, 0, 0, 1, NULL,
+			     sp->role.level, gfn,
+			     page_to_pfn(pages[i]), true, true);
+
+	return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp, u64 *sptep)
+{
+	u64 *spte, *start = NULL;
+	int i;
+
+	WARN_ON(!sp->role.direct);
+
+	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+	spte = sp->spt + i;
+
+	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+		if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+			if (!start)
+				continue;
+			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+				break;
+			start = NULL;
+		} else if (!start)
+			start = spte;
+	}
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+	struct kvm_mmu_page *sp;
+
+	/*
+	 * Since it's no accessed bit on EPT, it's no way to
+	 * distinguish between actually accessed translations
+	 * and prefetched, so disable pte prefetch if EPT is
+	 * enabled.
+	 */
+	if (!shadow_accessed_mask)
+		return;
+
+	sp = page_header(__pa(sptep));
+	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+		return;
+
+	__direct_pte_prefetch(vcpu, sp, sptep);
+}
+
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			int level, gfn_t gfn, pfn_t pfn)
 {
@@ -2077,6 +2222,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
 				     0, write, 1, &pt_write,
 				     level, gfn, pfn, false, true);
+			direct_pte_prefetch(vcpu, iterator.sptep);
 			++vcpu->stat.pf_fixed;
 			break;
 		}
@@ -2098,28 +2244,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			__set_spte(iterator.sptep,
 				   __pa(sp->spt)
 				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
-				   | shadow_user_mask | shadow_x_mask);
+				   | shadow_user_mask | shadow_x_mask
+				   | shadow_accessed_mask);
 		}
 	}
 	return pt_write;
 }
 
-static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
 {
-	char buf[1];
-	void __user *hva;
-	int r;
+	siginfo_t info;
+
+	info.si_signo	= SIGBUS;
+	info.si_errno	= 0;
+	info.si_code	= BUS_MCEERR_AR;
+	info.si_addr	= (void __user *)address;
+	info.si_addr_lsb = PAGE_SHIFT;
 
-	/* Touch the page, so send SIGBUS */
-	hva = (void __user *)gfn_to_hva(kvm, gfn);
-	r = copy_from_user(buf, hva, 1);
+	send_sig_info(SIGBUS, &info, tsk);
 }
 
 static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 {
 	kvm_release_pfn_clean(pfn);
 	if (is_hwpoison_pfn(pfn)) {
-		kvm_send_hwpoison_signal(kvm, gfn);
+		kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
 		return 0;
 	} else if (is_fault_pfn(pfn))
 		return -EFAULT;
@@ -2179,7 +2328,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
+	    (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+	     vcpu->arch.mmu.direct_map)) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		sp = page_header(root);
@@ -2222,80 +2373,158 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 	return ret;
 }
 
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 {
-	int i;
-	gfn_t root_gfn;
 	struct kvm_mmu_page *sp;
-	int direct = 0;
-	u64 pdptr;
-
-	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+	unsigned i;
 
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_free_some_pages(vcpu);
+		sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
+				      1, ACC_ALL, NULL);
+		++sp->root_count;
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		vcpu->arch.mmu.root_hpa = __pa(sp->spt);
+	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+		for (i = 0; i < 4; ++i) {
+			hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+			ASSERT(!VALID_PAGE(root));
+			spin_lock(&vcpu->kvm->mmu_lock);
+			kvm_mmu_free_some_pages(vcpu);
+			sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
+					      PT32_ROOT_LEVEL, 1, ACC_ALL,
+					      NULL);
+			root = __pa(sp->spt);
+			++sp->root_count;
+			spin_unlock(&vcpu->kvm->mmu_lock);
+			vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+		}
+		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+	} else
+		BUG();
+
+	return 0;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	u64 pdptr, pm_mask;
+	gfn_t root_gfn;
+	int i;
+
+	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
+
+	if (mmu_check_root(vcpu, root_gfn))
+		return 1;
+
+	/*
+	 * Do we shadow a long mode page table? If so we need to
+	 * write-protect the guests page table root.
+	 */
+	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		ASSERT(!VALID_PAGE(root));
-		if (mmu_check_root(vcpu, root_gfn))
-			return 1;
-		if (tdp_enabled) {
-			direct = 1;
-			root_gfn = 0;
-		}
+
 		spin_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_free_some_pages(vcpu);
-		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-				      PT64_ROOT_LEVEL, direct,
-				      ACC_ALL, NULL);
+		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
+				      0, ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		vcpu->arch.mmu.root_hpa = root;
 		return 0;
 	}
-	direct = !is_paging(vcpu);
+
+	/*
+	 * We shadow a 32 bit page table. This may be a legacy 2-level
+	 * or a PAE 3-level page table. In either case we need to be aware that
+	 * the shadow page table may be a PAE or a long mode page table.
+	 */
+	pm_mask = PT_PRESENT_MASK;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
 		ASSERT(!VALID_PAGE(root));
 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-			pdptr = kvm_pdptr_read(vcpu, i);
+			pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
 			if (!is_present_gpte(pdptr)) {
 				vcpu->arch.mmu.pae_root[i] = 0;
 				continue;
 			}
 			root_gfn = pdptr >> PAGE_SHIFT;
-		} else if (vcpu->arch.mmu.root_level == 0)
-			root_gfn = 0;
-		if (mmu_check_root(vcpu, root_gfn))
-			return 1;
-		if (tdp_enabled) {
-			direct = 1;
-			root_gfn = i << 30;
+			if (mmu_check_root(vcpu, root_gfn))
+				return 1;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
-				      PT32_ROOT_LEVEL, direct,
+				      PT32_ROOT_LEVEL, 0,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 
-		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+		vcpu->arch.mmu.pae_root[i] = root | pm_mask;
 	}
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+
+	/*
+	 * If we shadow a 32 bit page table with a long mode page
+	 * table we enter this path.
+	 */
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		if (vcpu->arch.mmu.lm_root == NULL) {
+			/*
+			 * The additional page necessary for this is only
+			 * allocated on demand.
+			 */
+
+			u64 *lm_root;
+
+			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+			if (lm_root == NULL)
+				return 1;
+
+			lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+
+			vcpu->arch.mmu.lm_root = lm_root;
+		}
+
+		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+	}
+
 	return 0;
 }
 
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.mmu.direct_map)
+		return mmu_alloc_direct_roots(vcpu);
+	else
+		return mmu_alloc_shadow_roots(vcpu);
+}
+
 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
 	struct kvm_mmu_page *sp;
 
+	if (vcpu->arch.mmu.direct_map)
+		return;
+
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
 		mmu_sync_children(vcpu, sp);
@@ -2310,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 			mmu_sync_children(vcpu, sp);
 		}
 	}
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 }
 
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2327,6 +2557,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
 	return vaddr;
 }
 
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+					 u32 access, u32 *error)
+{
+	if (error)
+		*error = 0;
+	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 				u32 error_code)
 {
@@ -2393,10 +2631,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+static int nonpaging_init_context(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
-
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2407,6 +2644,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = true;
+	context->nx = false;
 	return 0;
 }
 
@@ -2422,11 +2661,14 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
-static void inject_page_fault(struct kvm_vcpu *vcpu,
-			      u64 addr,
-			      u32 err_code)
+static unsigned long get_cr3(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr3;
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu)
 {
-	kvm_inject_page_fault(vcpu, addr, err_code);
+	vcpu->arch.mmu.inject_page_fault(vcpu);
 }
 
 static void paging_free(struct kvm_vcpu *vcpu)
@@ -2434,12 +2676,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
 {
 	int bit7;
 
 	bit7 = (gpte >> 7) & 1;
-	return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
+	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
 #define PTTYPE 64
@@ -2450,13 +2692,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
 #include "paging_tmpl.h"
 #undef PTTYPE
 
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context,
+				  int level)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
 
-	if (!is_nx(vcpu))
+	if (!context->nx)
 		exb_bit_rsvd = rsvd_bits(63, 63);
 	switch (level) {
 	case PT32_ROOT_LEVEL:
@@ -2511,9 +2754,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 	}
 }
 
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+static int paging64_init_context_common(struct kvm_vcpu *vcpu,
+					struct kvm_mmu *context,
+					int level)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	context->nx = is_nx(vcpu);
+
+	reset_rsvds_bits_mask(vcpu, context, level);
 
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
@@ -2526,20 +2773,23 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->root_level = level;
 	context->shadow_root_level = level;
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = false;
 	return 0;
 }
 
-static int paging64_init_context(struct kvm_vcpu *vcpu)
+static int paging64_init_context(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu *context)
 {
-	reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
-	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+	return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
 }
 
-static int paging32_init_context(struct kvm_vcpu *vcpu)
+static int paging32_init_context(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu *context)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	context->nx = false;
+
+	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
 
-	reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2550,18 +2800,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = false;
 	return 0;
 }
 
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
+static int paging32E_init_context(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context)
 {
-	reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
-	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
+	return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
 }
 
 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	struct kvm_mmu *context = vcpu->arch.walk_mmu;
 
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = tdp_page_fault;
@@ -2571,20 +2822,29 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->invlpg = nonpaging_invlpg;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = true;
+	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
+	context->get_cr3 = get_cr3;
+	context->inject_page_fault = kvm_inject_page_fault;
+	context->nx = is_nx(vcpu);
 
 	if (!is_paging(vcpu)) {
+		context->nx = false;
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
-		reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
+		context->nx = is_nx(vcpu);
+		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT64_ROOT_LEVEL;
 	} else if (is_pae(vcpu)) {
-		reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
+		context->nx = is_nx(vcpu);
+		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT32E_ROOT_LEVEL;
 	} else {
-		reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
+		context->nx = false;
+		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
 		context->gva_to_gpa = paging32_gva_to_gpa;
 		context->root_level = PT32_ROOT_LEVEL;
 	}
@@ -2592,33 +2852,83 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
 	int r;
-
 	ASSERT(vcpu);
 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 	if (!is_paging(vcpu))
-		r = nonpaging_init_context(vcpu);
+		r = nonpaging_init_context(vcpu, context);
 	else if (is_long_mode(vcpu))
-		r = paging64_init_context(vcpu);
+		r = paging64_init_context(vcpu, context);
 	else if (is_pae(vcpu))
-		r = paging32E_init_context(vcpu);
+		r = paging32E_init_context(vcpu, context);
 	else
-		r = paging32_init_context(vcpu);
+		r = paging32_init_context(vcpu, context);
 
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
-	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
+	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
 
 	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+{
+	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
+
+	vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
+	vcpu->arch.walk_mmu->get_cr3           = get_cr3;
+	vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+
+	return r;
+}
+
+static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+	g_context->get_cr3           = get_cr3;
+	g_context->inject_page_fault = kvm_inject_page_fault;
+
+	/*
+	 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
+	 * translation of l2_gpa to l1_gpa addresses is done using the
+	 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
+	 * functions between mmu and nested_mmu are swapped.
+	 */
+	if (!is_paging(vcpu)) {
+		g_context->nx = false;
+		g_context->root_level = 0;
+		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+	} else if (is_long_mode(vcpu)) {
+		g_context->nx = is_nx(vcpu);
+		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
+		g_context->root_level = PT64_ROOT_LEVEL;
+		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+	} else if (is_pae(vcpu)) {
+		g_context->nx = is_nx(vcpu);
+		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
+		g_context->root_level = PT32E_ROOT_LEVEL;
+		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+	} else {
+		g_context->nx = false;
+		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
+		g_context->root_level = PT32_ROOT_LEVEL;
+		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+	}
+
+	return 0;
+}
 
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.update_pte.pfn = bad_pfn;
 
-	if (tdp_enabled)
+	if (mmu_is_nested(vcpu))
+		return init_kvm_nested_mmu(vcpu);
+	else if (tdp_enabled)
 		return init_kvm_tdp_mmu(vcpu);
 	else
 		return init_kvm_softmmu(vcpu);
@@ -2653,7 +2963,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	if (r)
 		goto out;
 	/* set_cr3() should ensure TLB has been flushed */
-	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 out:
 	return r;
 }
@@ -2663,6 +2973,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
 	mmu_free_roots(vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp,
@@ -2695,7 +3006,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 		return;
         }
 
-	if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+	if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
 		return;
 
 	++vcpu->kvm->stat.mmu_pte_updated;
@@ -2837,7 +3148,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
-	kvm_mmu_audit(vcpu, "pre pte write");
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 	if (guest_initiated) {
 		if (gfn == vcpu->arch.last_pt_write_gfn
 		    && !last_updated_pte_accessed(vcpu)) {
@@ -2910,7 +3221,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-	kvm_mmu_audit(vcpu, "post pte write");
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
 		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -2923,7 +3234,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 	gpa_t gpa;
 	int r;
 
-	if (tdp_enabled)
+	if (vcpu->arch.mmu.direct_map)
 		return 0;
 
 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2937,21 +3248,18 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-	int free_pages;
 	LIST_HEAD(invalid_list);
 
-	free_pages = vcpu->kvm->arch.n_free_mmu_pages;
-	while (free_pages < KVM_REFILL_PAGES &&
+	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
 	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 		struct kvm_mmu_page *sp;
 
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
-		free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
-						       &invalid_list);
+		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
-	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -3013,6 +3321,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	free_page((unsigned long)vcpu->arch.mmu.pae_root);
+	if (vcpu->arch.mmu.lm_root != NULL)
+		free_page((unsigned long)vcpu->arch.mmu.lm_root);
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -3054,15 +3364,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 	return init_kvm_mmu(vcpu);
 }
 
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
-	ASSERT(vcpu);
-
-	destroy_kvm_mmu(vcpu);
-	free_mmu_pages(vcpu);
-	mmu_free_memory_caches(vcpu);
-}
-
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
@@ -3112,23 +3413,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	struct kvm *kvm;
 	struct kvm *kvm_freed = NULL;
-	int cache_count = 0;
+
+	if (nr_to_scan == 0)
+		goto out;
 
 	spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int npages, idx, freed_pages;
+		int idx, freed_pages;
 		LIST_HEAD(invalid_list);
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
-		npages = kvm->arch.n_alloc_mmu_pages -
-			 kvm->arch.n_free_mmu_pages;
-		cache_count += npages;
-		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
+		if (!kvm_freed && nr_to_scan > 0 &&
+		    kvm->arch.n_used_mmu_pages > 0) {
 			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
 							  &invalid_list);
-			cache_count -= freed_pages;
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
@@ -3142,7 +3442,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 
 	spin_unlock(&kvm_lock);
 
-	return cache_count;
+out:
+	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
 }
 
 static struct shrinker mmu_shrinker = {
@@ -3163,6 +3464,7 @@ static void mmu_destroy_caches(void)
 void kvm_mmu_module_exit(void)
 {
 	mmu_destroy_caches();
+	percpu_counter_destroy(&kvm_total_used_mmu_pages);
 	unregister_shrinker(&mmu_shrinker);
 }
 
@@ -3185,6 +3487,9 @@ int kvm_mmu_module_init(void)
 	if (!mmu_page_header_cache)
 		goto nomem;
 
+	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
+		goto nomem;
+
 	register_shrinker(&mmu_shrinker);
 
 	return 0;
@@ -3355,271 +3660,18 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
-#ifdef AUDIT
-
-static const char *audit_msg;
-
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
-	gva = (long long)(gva << 16) >> 16;
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
 #endif
-	return gva;
-}
-
-
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
-
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
-			    inspect_spte_fn fn)
-{
-	int i;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		u64 ent = sp->spt[i];
-
-		if (is_shadow_present_pte(ent)) {
-			if (!is_last_spte(ent, sp->role.level)) {
-				struct kvm_mmu_page *child;
-				child = page_header(ent & PT64_BASE_ADDR_MASK);
-				__mmu_spte_walk(kvm, child, fn);
-			} else
-				fn(kvm, &sp->spt[i]);
-		}
-	}
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
-	int i;
-	struct kvm_mmu_page *sp;
-
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		return;
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
-		sp = page_header(root);
-		__mmu_spte_walk(vcpu->kvm, sp, fn);
-		return;
-	}
-	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-		if (root && VALID_PAGE(root)) {
-			root &= PT64_BASE_ADDR_MASK;
-			sp = page_header(root);
-			__mmu_spte_walk(vcpu->kvm, sp, fn);
-		}
-	}
-	return;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-				gva_t va, int level)
-{
-	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-	int i;
-	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 ent = pt[i];
-
-		if (ent == shadow_trap_nonpresent_pte)
-			continue;
-
-		va = canonicalize(va);
-		if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
-			audit_mappings_page(vcpu, ent, va, level - 1);
-		else {
-			gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
-			gfn_t gfn = gpa >> PAGE_SHIFT;
-			pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
-			hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
 
-			if (is_error_pfn(pfn)) {
-				kvm_release_pfn_clean(pfn);
-				continue;
-			}
-
-			if (is_shadow_present_pte(ent)
-			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
-				printk(KERN_ERR "xx audit error: (%s) levels %d"
-				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
-				       audit_msg, vcpu->arch.mmu.root_level,
-				       va, gpa, hpa, ent,
-				       is_shadow_present_pte(ent));
-			else if (ent == shadow_notrap_nonpresent_pte
-				 && !is_error_hpa(hpa))
-				printk(KERN_ERR "audit: (%s) notrap shadow,"
-				       " valid guest gva %lx\n", audit_msg, va);
-			kvm_release_pfn_clean(pfn);
-
-		}
-	}
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-	unsigned i;
-
-	if (vcpu->arch.mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
-		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
-				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
-}
-
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_memslots *slots;
-	int nmaps = 0;
-	int i, j, k, idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	slots = kvm_memslots(kvm);
-	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-		struct kvm_memory_slot *m = &slots->memslots[i];
-		struct kvm_rmap_desc *d;
-
-		for (j = 0; j < m->npages; ++j) {
-			unsigned long *rmapp = &m->rmap[j];
-
-			if (!*rmapp)
-				continue;
-			if (!(*rmapp & 1)) {
-				++nmaps;
-				continue;
-			}
-			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-			while (d) {
-				for (k = 0; k < RMAP_EXT; ++k)
-					if (d->sptes[k])
-						++nmaps;
-					else
-						break;
-				d = d->more;
-			}
-		}
-	}
-	srcu_read_unlock(&kvm->srcu, idx);
-	return nmaps;
-}
-
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
-	unsigned long *rmapp;
-	struct kvm_mmu_page *rev_sp;
-	gfn_t gfn;
-
-	if (is_writable_pte(*sptep)) {
-		rev_sp = page_header(__pa(sptep));
-		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
-		if (!gfn_to_memslot(kvm, gfn)) {
-			if (!printk_ratelimit())
-				return;
-			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
-					 audit_msg, gfn);
-			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
-			       audit_msg, (long int)(sptep - rev_sp->spt),
-					rev_sp->gfn);
-			dump_stack();
-			return;
-		}
-
-		rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
-		if (!*rmapp) {
-			if (!printk_ratelimit())
-				return;
-			printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
-					 audit_msg, *sptep);
-			dump_stack();
-		}
-	}
-
-}
-
-void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
-{
-	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
-}
-
-static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	int i;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		u64 *pt = sp->spt;
-
-		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
-			continue;
-
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			u64 ent = pt[i];
-
-			if (!(ent & PT_PRESENT_MASK))
-				continue;
-			if (!is_writable_pte(ent))
-				continue;
-			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
-		}
-	}
-	return;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
-	check_writable_mappings_rmap(vcpu);
-	count_rmaps(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	struct kvm_memory_slot *slot;
-	unsigned long *rmapp;
-	u64 *spte;
-	gfn_t gfn;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		if (sp->role.direct)
-			continue;
-		if (sp->unsync)
-			continue;
-
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[gfn - slot->base_gfn];
-
-		spte = rmap_next(vcpu->kvm, rmapp, NULL);
-		while (spte) {
-			if (is_writable_pte(*spte))
-				printk(KERN_ERR "%s: (%s) shadow page has "
-				"writable mappings: gfn %lx role %x\n",
-			       __func__, audit_msg, sp->gfn,
-			       sp->role.word);
-			spte = rmap_next(vcpu->kvm, rmapp, spte);
-		}
-	}
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-	int olddbg = dbg;
+	ASSERT(vcpu);
 
-	dbg = 0;
-	audit_msg = msg;
-	audit_rmap(vcpu);
-	audit_write_protection(vcpu);
-	if (strcmp("pre pte write", audit_msg) != 0)
-		audit_mappings(vcpu);
-	audit_writable_sptes_have_rmaps(vcpu);
-	dbg = olddbg;
+	destroy_kvm_mmu(vcpu);
+	free_mmu_pages(vcpu);
+	mmu_free_memory_caches(vcpu);
+	mmu_audit_disable();
 }
-
-#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index be66759321a5..7086ca85d3e7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,10 +49,17 @@
 #define PFERR_FETCH_MASK (1U << 4)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+
+static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
+{
+	return kvm->arch.n_max_mmu_pages -
+		kvm->arch.n_used_mmu_pages;
+}
 
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-	if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+	if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
 		__kvm_mmu_free_some_pages(vcpu);
 }
 
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 000000000000..ba2bcdde6221
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,299 @@
+/*
+ * mmu_audit.c:
+ *
+ * Audit code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Marcelo Tosatti <mtosatti@redhat.com>
+ *   Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/ratelimit.h>
+
+static int audit_point;
+
+#define audit_printk(fmt, args...)		\
+	printk(KERN_ERR "audit: (%s) error: "	\
+		fmt, audit_point_name[audit_point], ##args)
+
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
+
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    inspect_spte_fn fn, int level)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		u64 *ent = sp->spt;
+
+		fn(vcpu, ent + i, level);
+
+		if (is_shadow_present_pte(ent[i]) &&
+		      !is_last_spte(ent[i], level)) {
+			struct kvm_mmu_page *child;
+
+			child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+			__mmu_spte_walk(vcpu, child, fn, level - 1);
+		}
+	}
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+
+	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+
+		sp = page_header(root);
+		__mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
+		return;
+	}
+
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root && VALID_PAGE(root)) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			__mmu_spte_walk(vcpu, sp, fn, 2);
+		}
+	}
+
+	return;
+}
+
+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+	struct kvm_mmu_page *sp;
+
+	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+		fn(kvm, sp);
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+	struct kvm_mmu_page *sp;
+	gfn_t gfn;
+	pfn_t pfn;
+	hpa_t hpa;
+
+	sp = page_header(__pa(sptep));
+
+	if (sp->unsync) {
+		if (level != PT_PAGE_TABLE_LEVEL) {
+			audit_printk("unsync sp: %p level = %d\n", sp, level);
+			return;
+		}
+
+		if (*sptep == shadow_notrap_nonpresent_pte) {
+			audit_printk("notrap spte in unsync sp: %p\n", sp);
+			return;
+		}
+	}
+
+	if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+		audit_printk("notrap spte in direct sp: %p\n", sp);
+		return;
+	}
+
+	if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
+		return;
+
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+	if (is_error_pfn(pfn)) {
+		kvm_release_pfn_clean(pfn);
+		return;
+	}
+
+	hpa =  pfn << PAGE_SHIFT;
+	if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+		audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
+				   vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
+}
+
+static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+	unsigned long *rmapp;
+	struct kvm_mmu_page *rev_sp;
+	gfn_t gfn;
+
+
+	rev_sp = page_header(__pa(sptep));
+	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+	if (!gfn_to_memslot(kvm, gfn)) {
+		if (!printk_ratelimit())
+			return;
+		audit_printk("no memslot for gfn %llx\n", gfn);
+		audit_printk("index %ld of sp (gfn=%llx)\n",
+		       (long int)(sptep - rev_sp->spt), rev_sp->gfn);
+		dump_stack();
+		return;
+	}
+
+	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+	if (!*rmapp) {
+		if (!printk_ratelimit())
+			return;
+		audit_printk("no rmap for writable spte %llx\n", *sptep);
+		dump_stack();
+	}
+}
+
+static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+	if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
+		inspect_spte_has_rmap(vcpu->kvm, sptep);
+}
+
+static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+	if (audit_point == AUDIT_POST_SYNC && sp->unsync)
+		audit_printk("meet unsync sp(%p) after sync root.\n", sp);
+}
+
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	int i;
+
+	if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+		return;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		if (!is_rmap_spte(sp->spt[i]))
+			continue;
+
+		inspect_spte_has_rmap(kvm, sp->spt + i);
+	}
+}
+
+static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
+	u64 *spte;
+
+	if (sp->role.direct || sp->unsync || sp->role.invalid)
+		return;
+
+	slot = gfn_to_memslot(kvm, sp->gfn);
+	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		if (is_writable_pte(*spte))
+			audit_printk("shadow page has writable mappings: gfn "
+				     "%llx role %x\n", sp->gfn, sp->role.word);
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+}
+
+static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	check_mappings_rmap(kvm, sp);
+	audit_write_protection(kvm, sp);
+}
+
+static void audit_all_active_sps(struct kvm *kvm)
+{
+	walk_all_active_sps(kvm, audit_sp);
+}
+
+static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+	audit_sptes_have_rmaps(vcpu, sptep, level);
+	audit_mappings(vcpu, sptep, level);
+	audit_spte_after_sync(vcpu, sptep, level);
+}
+
+static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
+{
+	mmu_spte_walk(vcpu, audit_spte);
+}
+
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
+{
+	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+	if (!__ratelimit(&ratelimit_state))
+		return;
+
+	audit_point = point;
+	audit_all_active_sps(vcpu->kvm);
+	audit_vcpu_spte(vcpu);
+}
+
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+	int ret;
+
+	if (mmu_audit)
+		return;
+
+	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	WARN_ON(ret);
+
+	mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+	if (!mmu_audit)
+		return;
+
+	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	tracepoint_synchronize_unregister();
+	mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	unsigned long enable;
+
+	ret = strict_strtoul(val, 10, &enable);
+	if (ret < 0)
+		return -EINVAL;
+
+	switch (enable) {
+	case 0:
+		mmu_audit_disable();
+		break;
+	case 1:
+		mmu_audit_enable();
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+	.set = mmu_audit_set,
+	.get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0930ef..b60b4fdb3eda 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 
 	TP_ARGS(sp)
 );
+
+TRACE_EVENT(
+	kvm_mmu_audit,
+	TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
+	TP_ARGS(vcpu, audit_point),
+
+	TP_STRUCT__entry(
+		__field(struct kvm_vcpu *, vcpu)
+		__field(int, audit_point)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->audit_point = audit_point;
+	),
+
+	TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
+		  audit_point_name[__entry->audit_point])
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51ef9097960d..cd7a833a3b52 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,7 +7,7 @@
  * MMU support
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
@@ -67,6 +67,7 @@ struct guest_walker {
 	int level;
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
 	pt_element_t ptes[PT_MAX_FULL_LEVELS];
+	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
 	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
 	unsigned pt_access;
 	unsigned pte_access;
@@ -104,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 
 	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
 #if PTTYPE == 64
-	if (is_nx(vcpu))
+	if (vcpu->arch.mmu.nx)
 		access &= ~(gpte >> PT64_NX_SHIFT);
 #endif
 	return access;
@@ -113,26 +114,32 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 /*
  * Fetch a guest pte for a guest virtual address
  */
-static int FNAME(walk_addr)(struct guest_walker *walker,
-			    struct kvm_vcpu *vcpu, gva_t addr,
-			    int write_fault, int user_fault, int fetch_fault)
+static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+				    gva_t addr, u32 access)
 {
 	pt_element_t pte;
 	gfn_t table_gfn;
 	unsigned index, pt_access, uninitialized_var(pte_access);
 	gpa_t pte_gpa;
 	bool eperm, present, rsvd_fault;
+	int offset, write_fault, user_fault, fetch_fault;
+
+	write_fault = access & PFERR_WRITE_MASK;
+	user_fault = access & PFERR_USER_MASK;
+	fetch_fault = access & PFERR_FETCH_MASK;
 
 	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
 				     fetch_fault);
 walk:
 	present = true;
 	eperm = rsvd_fault = false;
-	walker->level = vcpu->arch.mmu.root_level;
-	pte = vcpu->arch.cr3;
+	walker->level = mmu->root_level;
+	pte           = mmu->get_cr3(vcpu);
+
 #if PTTYPE == 64
-	if (!is_long_mode(vcpu)) {
-		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
+	if (walker->level == PT32E_ROOT_LEVEL) {
+		pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
 		if (!is_present_gpte(pte)) {
 			present = false;
@@ -142,7 +149,7 @@ walk:
 	}
 #endif
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-	       (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
 	pt_access = ACC_ALL;
 
@@ -150,12 +157,14 @@ walk:
 		index = PT_INDEX(addr, walker->level);
 
 		table_gfn = gpte_to_gfn(pte);
-		pte_gpa = gfn_to_gpa(table_gfn);
-		pte_gpa += index * sizeof(pt_element_t);
+		offset    = index * sizeof(pt_element_t);
+		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
-		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
+		if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
+					    offset, sizeof(pte),
+					    PFERR_USER_MASK|PFERR_WRITE_MASK)) {
 			present = false;
 			break;
 		}
@@ -167,7 +176,7 @@ walk:
 			break;
 		}
 
-		if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
+		if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
 			rsvd_fault = true;
 			break;
 		}
@@ -204,17 +213,28 @@ walk:
 				(PTTYPE == 64 || is_pse(vcpu))) ||
 		    ((walker->level == PT_PDPE_LEVEL) &&
 				is_large_pte(pte) &&
-				is_long_mode(vcpu))) {
+				mmu->root_level == PT64_ROOT_LEVEL)) {
 			int lvl = walker->level;
+			gpa_t real_gpa;
+			gfn_t gfn;
+			u32 ac;
 
-			walker->gfn = gpte_to_gfn_lvl(pte, lvl);
-			walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
-					>> PAGE_SHIFT;
+			gfn = gpte_to_gfn_lvl(pte, lvl);
+			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
 
 			if (PTTYPE == 32 &&
 			    walker->level == PT_DIRECTORY_LEVEL &&
 			    is_cpuid_PSE36())
-				walker->gfn += pse36_gfn_delta(pte);
+				gfn += pse36_gfn_delta(pte);
+
+			ac = write_fault | fetch_fault | user_fault;
+
+			real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
+						      ac);
+			if (real_gpa == UNMAPPED_GVA)
+				return 0;
+
+			walker->gfn = real_gpa >> PAGE_SHIFT;
 
 			break;
 		}
@@ -249,18 +269,36 @@ error:
 	walker->error_code = 0;
 	if (present)
 		walker->error_code |= PFERR_PRESENT_MASK;
-	if (write_fault)
-		walker->error_code |= PFERR_WRITE_MASK;
-	if (user_fault)
-		walker->error_code |= PFERR_USER_MASK;
-	if (fetch_fault && is_nx(vcpu))
+
+	walker->error_code |= write_fault | user_fault;
+
+	if (fetch_fault && mmu->nx)
 		walker->error_code |= PFERR_FETCH_MASK;
 	if (rsvd_fault)
 		walker->error_code |= PFERR_RSVD_MASK;
+
+	vcpu->arch.fault.address    = addr;
+	vcpu->arch.fault.error_code = walker->error_code;
+
 	trace_kvm_mmu_walker_error(walker->error_code);
 	return 0;
 }
 
+static int FNAME(walk_addr)(struct guest_walker *walker,
+			    struct kvm_vcpu *vcpu, gva_t addr, u32 access)
+{
+	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
+					access);
+}
+
+static int FNAME(walk_addr_nested)(struct guest_walker *walker,
+				   struct kvm_vcpu *vcpu, gva_t addr,
+				   u32 access)
+{
+	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
+					addr, access);
+}
+
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			      u64 *spte, const void *pte)
 {
@@ -302,14 +340,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
 				struct guest_walker *gw, int level)
 {
-	int r;
 	pt_element_t curr_pte;
-
-	r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
+	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+	u64 mask;
+	int r, index;
+
+	if (level == PT_PAGE_TABLE_LEVEL) {
+		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+		base_gpa = pte_gpa & ~mask;
+		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+		r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+		curr_pte = gw->prefetch_ptes[index];
+	} else
+		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
 				  &curr_pte, sizeof(curr_pte));
+
 	return r || curr_pte != gw->ptes[level - 1];
 }
 
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+				u64 *sptep)
+{
+	struct kvm_mmu_page *sp;
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	pt_element_t *gptep = gw->prefetch_ptes;
+	u64 *spte;
+	int i;
+
+	sp = page_header(__pa(sptep));
+
+	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+		return;
+
+	if (sp->role.direct)
+		return __direct_pte_prefetch(vcpu, sp, sptep);
+
+	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+	spte = sp->spt + i;
+
+	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+		pt_element_t gpte;
+		unsigned pte_access;
+		gfn_t gfn;
+		pfn_t pfn;
+		bool dirty;
+
+		if (spte == sptep)
+			continue;
+
+		if (*spte != shadow_trap_nonpresent_pte)
+			continue;
+
+		gpte = gptep[i];
+
+		if (!is_present_gpte(gpte) ||
+		      is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
+			if (!sp->unsync)
+				__set_spte(spte, shadow_notrap_nonpresent_pte);
+			continue;
+		}
+
+		if (!(gpte & PT_ACCESSED_MASK))
+			continue;
+
+		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		gfn = gpte_to_gfn(gpte);
+		dirty = is_dirty_gpte(gpte);
+		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+				      (pte_access & ACC_WRITE_MASK) && dirty);
+		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
+			break;
+		}
+
+		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+			     dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+			     pfn, true, true);
+	}
+}
+
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
@@ -391,6 +502,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
 		     user_fault, write_fault, dirty, ptwrite, it.level,
 		     gw->gfn, pfn, false, true);
+	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
 	return it.sptep;
 
@@ -420,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 {
 	int write_fault = error_code & PFERR_WRITE_MASK;
 	int user_fault = error_code & PFERR_USER_MASK;
-	int fetch_fault = error_code & PFERR_FETCH_MASK;
 	struct guest_walker walker;
 	u64 *sptep;
 	int write_pt = 0;
@@ -430,7 +541,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	unsigned long mmu_seq;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
-	kvm_mmu_audit(vcpu, "pre page fault");
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -439,15 +549,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	/*
 	 * Look up the guest pte for the faulting address.
 	 */
-	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
-			     fetch_fault);
+	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
 
 	/*
 	 * The page is not mapped by the guest.  Let the guest handle it.
 	 */
 	if (!r) {
 		pgprintk("%s: guest page fault\n", __func__);
-		inject_page_fault(vcpu, addr, walker.error_code);
+		inject_page_fault(vcpu);
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 		return 0;
 	}
@@ -468,6 +577,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
+
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn);
@@ -479,7 +590,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 
 	++vcpu->stat.pf_fixed;
-	kvm_mmu_audit(vcpu, "post page fault (fixed)");
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return write_pt;
@@ -556,10 +667,25 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	gpa_t gpa = UNMAPPED_GVA;
 	int r;
 
-	r = FNAME(walk_addr)(&walker, vcpu, vaddr,
-			     !!(access & PFERR_WRITE_MASK),
-			     !!(access & PFERR_USER_MASK),
-			     !!(access & PFERR_FETCH_MASK));
+	r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
+
+	if (r) {
+		gpa = gfn_to_gpa(walker.gfn);
+		gpa |= vaddr & ~PAGE_MASK;
+	} else if (error)
+		*error = walker.error_code;
+
+	return gpa;
+}
+
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
+				      u32 access, u32 *error)
+{
+	struct guest_walker walker;
+	gpa_t gpa = UNMAPPED_GVA;
+	int r;
+
+	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
 
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
@@ -638,7 +764,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			return -EINVAL;
 
 		gfn = gpte_to_gfn(gpte);
-		if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
+		if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
 		      || gfn != sp->gfns[i] || !is_present_gpte(gpte)
 		      || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8a3f9f64f86f..82e144a4e514 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,7 +4,7 @@
  * AMD SVM support
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
@@ -88,6 +88,14 @@ struct nested_state {
 	/* A VMEXIT is required but not yet emulated */
 	bool exit_required;
 
+	/*
+	 * If we vmexit during an instruction emulation we need this to restore
+	 * the l1 guest rip after the emulation
+	 */
+	unsigned long vmexit_rip;
+	unsigned long vmexit_rsp;
+	unsigned long vmexit_rax;
+
 	/* cache for intercepts of the guest */
 	u16 intercept_cr_read;
 	u16 intercept_cr_write;
@@ -96,6 +104,8 @@ struct nested_state {
 	u32 intercept_exceptions;
 	u64 intercept;
 
+	/* Nested Paging related state */
+	u64 nested_cr3;
 };
 
 #define MSRPM_OFFSETS	16
@@ -284,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
 	force_new_asid(vcpu);
 }
 
+static int get_npt_level(void)
+{
+#ifdef CONFIG_X86_64
+	return PT64_ROOT_LEVEL;
+#else
+	return PT32E_ROOT_LEVEL;
+#endif
+}
+
 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 	vcpu->arch.efer = efer;
@@ -701,6 +720,29 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 	seg->base = 0;
 }
 
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	u64 g_tsc_offset = 0;
+
+	if (is_nested(svm)) {
+		g_tsc_offset = svm->vmcb->control.tsc_offset -
+			       svm->nested.hsave->control.tsc_offset;
+		svm->nested.hsave->control.tsc_offset = offset;
+	}
+
+	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+}
+
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.tsc_offset += adjustment;
+	if (is_nested(svm))
+		svm->nested.hsave->control.tsc_offset += adjustment;
+}
+
 static void init_vmcb(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -793,7 +835,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
 	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-	save->efer = EFER_SVME;
+	svm_set_efer(&svm->vcpu, 0);
 	save->dr6 = 0xffff0ff0;
 	save->dr7 = 0x400;
 	save->rflags = 2;
@@ -804,8 +846,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 	 * This is the guest-visible cr0 value.
 	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
 	 */
-	svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-	(void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+	svm->vcpu.arch.cr0 = 0;
+	(void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
 
 	save->cr4 = X86_CR4_PAE;
 	/* rdx = ?? */
@@ -901,7 +943,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
 	svm->asid_generation = 0;
 	init_vmcb(svm);
-	svm->vmcb->control.tsc_offset = 0-native_read_tsc();
+	kvm_write_tsc(&svm->vcpu, 0);
 
 	err = fx_init(&svm->vcpu);
 	if (err)
@@ -947,20 +989,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	int i;
 
 	if (unlikely(cpu != vcpu->cpu)) {
-		u64 delta;
-
-		if (check_tsc_unstable()) {
-			/*
-			 * Make sure that the guest sees a monotonically
-			 * increasing TSC.
-			 */
-			delta = vcpu->arch.host_tsc - native_read_tsc();
-			svm->vmcb->control.tsc_offset += delta;
-			if (is_nested(svm))
-				svm->nested.hsave->control.tsc_offset += delta;
-		}
-		vcpu->cpu = cpu;
-		kvm_migrate_timers(vcpu);
 		svm->asid_generation = 0;
 	}
 
@@ -976,8 +1004,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 	++vcpu->stat.host_state_reload;
 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
-	vcpu->arch.host_tsc = native_read_tsc();
 }
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -995,7 +1021,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 	switch (reg) {
 	case VCPU_EXREG_PDPTR:
 		BUG_ON(!npt_enabled);
-		load_pdptrs(vcpu, vcpu->arch.cr3);
+		load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
 		break;
 	default:
 		BUG();
@@ -1206,8 +1232,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 		if (old == new) {
 			/* cr0 write with ts and mp unchanged */
 			svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
-			if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
+			if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
+				svm->nested.vmexit_rip = kvm_rip_read(vcpu);
+				svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+				svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
 				return;
+			}
 		}
 	}
 
@@ -1581,6 +1611,54 @@ static int vmmcall_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	return svm->nested.nested_cr3;
+}
+
+static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
+				   unsigned long root)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.nested_cr3 = root;
+	force_new_asid(vcpu);
+}
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.exit_code = SVM_EXIT_NPF;
+	svm->vmcb->control.exit_code_hi = 0;
+	svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
+	svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
+
+	nested_svm_vmexit(svm);
+}
+
+static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
+
+	vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
+	vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
+	vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
+	vcpu->arch.mmu.shadow_root_level = get_npt_level();
+	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+
+	return r;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
 static int nested_svm_check_permissions(struct vcpu_svm *svm)
 {
 	if (!(svm->vcpu.arch.efer & EFER_SVME)
@@ -1629,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
 	if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
 		return false;
 
+	/*
+	 * if vmexit was already requested (by intercepted exception
+	 * for instance) do not overwrite it with "external interrupt"
+	 * vmexit.
+	 */
+	if (svm->nested.exit_required)
+		return false;
+
 	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
 	svm->vmcb->control.exit_info_1 = 0;
 	svm->vmcb->control.exit_info_2 = 0;
@@ -1896,6 +1982,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->save.ds     = vmcb->save.ds;
 	nested_vmcb->save.gdtr   = vmcb->save.gdtr;
 	nested_vmcb->save.idtr   = vmcb->save.idtr;
+	nested_vmcb->save.efer   = svm->vcpu.arch.efer;
 	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
 	nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
 	nested_vmcb->save.cr2    = vmcb->save.cr2;
@@ -1917,6 +2004,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
 	nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
 	nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+	nested_vmcb->control.next_rip          = vmcb->control.next_rip;
 
 	/*
 	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -1947,6 +2035,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
+	svm->nested.nested_cr3 = 0;
+
 	/* Restore selected save entries */
 	svm->vmcb->save.es = hsave->save.es;
 	svm->vmcb->save.cs = hsave->save.cs;
@@ -1973,6 +2063,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	nested_svm_unmap(page);
 
+	nested_svm_uninit_mmu_context(&svm->vcpu);
 	kvm_mmu_reset_context(&svm->vcpu);
 	kvm_mmu_load(&svm->vcpu);
 
@@ -2012,6 +2103,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 	return true;
 }
 
+static bool nested_vmcb_checks(struct vmcb *vmcb)
+{
+	if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+		return false;
+
+	if (vmcb->control.asid == 0)
+		return false;
+
+	if (vmcb->control.nested_ctl && !npt_enabled)
+		return false;
+
+	return true;
+}
+
 static bool nested_svm_vmrun(struct vcpu_svm *svm)
 {
 	struct vmcb *nested_vmcb;
@@ -2026,7 +2131,18 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	if (!nested_vmcb)
 		return false;
 
-	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
+	if (!nested_vmcb_checks(nested_vmcb)) {
+		nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
+		nested_vmcb->control.exit_code_hi = 0;
+		nested_vmcb->control.exit_info_1  = 0;
+		nested_vmcb->control.exit_info_2  = 0;
+
+		nested_svm_unmap(page);
+
+		return false;
+	}
+
+	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
 			       nested_vmcb->save.rip,
 			       nested_vmcb->control.int_ctl,
 			       nested_vmcb->control.event_inj,
@@ -2055,7 +2171,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
 	hsave->save.cr4    = svm->vcpu.arch.cr4;
 	hsave->save.rflags = vmcb->save.rflags;
-	hsave->save.rip    = svm->next_rip;
+	hsave->save.rip    = kvm_rip_read(&svm->vcpu);
 	hsave->save.rsp    = vmcb->save.rsp;
 	hsave->save.rax    = vmcb->save.rax;
 	if (npt_enabled)
@@ -2070,6 +2186,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	else
 		svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
 
+	if (nested_vmcb->control.nested_ctl) {
+		kvm_mmu_unload(&svm->vcpu);
+		svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
+		nested_svm_init_mmu_context(&svm->vcpu);
+	}
+
 	/* Load the nested guest state */
 	svm->vmcb->save.es = nested_vmcb->save.es;
 	svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -2227,8 +2349,8 @@ static int vmrun_interception(struct vcpu_svm *svm)
 	if (nested_svm_check_permissions(svm))
 		return 1;
 
-	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
-	skip_emulated_instruction(&svm->vcpu);
+	/* Save rip after vmrun instruction */
+	kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
 
 	if (!nested_svm_vmrun(svm))
 		return 1;
@@ -2257,6 +2379,7 @@ static int stgi_interception(struct vcpu_svm *svm)
 
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
+	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 
 	enable_gif(svm);
 
@@ -2399,6 +2522,23 @@ static int emulate_on_interception(struct vcpu_svm *svm)
 	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
 }
 
+static int cr0_write_interception(struct vcpu_svm *svm)
+{
+	struct kvm_vcpu *vcpu = &svm->vcpu;
+	int r;
+
+	r = emulate_instruction(&svm->vcpu, 0, 0, 0);
+
+	if (svm->nested.vmexit_rip) {
+		kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
+		kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
+		kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
+		svm->nested.vmexit_rip = 0;
+	}
+
+	return r == EMULATE_DONE;
+}
+
 static int cr8_write_interception(struct vcpu_svm *svm)
 {
 	struct kvm_run *kvm_run = svm->vcpu.run;
@@ -2542,20 +2682,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	switch (ecx) {
-	case MSR_IA32_TSC: {
-		u64 tsc_offset = data - native_read_tsc();
-		u64 g_tsc_offset = 0;
-
-		if (is_nested(svm)) {
-			g_tsc_offset = svm->vmcb->control.tsc_offset -
-				       svm->nested.hsave->control.tsc_offset;
-			svm->nested.hsave->control.tsc_offset = tsc_offset;
-		}
-
-		svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
-
+	case MSR_IA32_TSC:
+		kvm_write_tsc(vcpu, data);
 		break;
-	}
 	case MSR_STAR:
 		svm->vmcb->save.star = data;
 		break;
@@ -2643,6 +2772,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
 {
 	struct kvm_run *kvm_run = svm->vcpu.run;
 
+	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 	/*
@@ -2672,7 +2802,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_READ_CR4]			= emulate_on_interception,
 	[SVM_EXIT_READ_CR8]			= emulate_on_interception,
 	[SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR0]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR0]			= cr0_write_interception,
 	[SVM_EXIT_WRITE_CR3]			= emulate_on_interception,
 	[SVM_EXIT_WRITE_CR4]			= emulate_on_interception,
 	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
@@ -2871,7 +3001,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
 	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
 	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
-	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
+	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+	    exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
 		printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
 		       "exit_code 0x%x\n",
 		       __func__, svm->vmcb->control.exit_int_info,
@@ -3088,8 +3219,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 
 	svm->int3_injected = 0;
 
-	if (svm->vcpu.arch.hflags & HF_IRET_MASK)
+	if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
 		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+	}
 
 	svm->vcpu.arch.nmi_injected = false;
 	kvm_clear_exception_queue(&svm->vcpu);
@@ -3098,6 +3231,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
 		return;
 
+	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
 	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
 	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
 
@@ -3134,6 +3269,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	}
 }
 
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb_control_area *control = &svm->vmcb->control;
+
+	control->exit_int_info = control->event_inj;
+	control->exit_int_info_err = control->event_inj_err;
+	control->event_inj = 0;
+	svm_complete_interrupts(svm);
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #else
@@ -3167,9 +3313,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	savesegment(gs, gs_selector);
 	ldt_selector = kvm_read_ldt();
 	svm->vmcb->save.cr2 = vcpu->arch.cr2;
-	/* required for live migration with NPT */
-	if (npt_enabled)
-		svm->vmcb->save.cr3 = vcpu->arch.cr3;
 
 	clgi();
 
@@ -3291,16 +3434,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (npt_enabled) {
-		svm->vmcb->control.nested_cr3 = root;
-		force_new_asid(vcpu);
-		return;
-	}
-
 	svm->vmcb->save.cr3 = root;
 	force_new_asid(vcpu);
 }
 
+static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.nested_cr3 = root;
+
+	/* Also sync guest cr3 here in case we live migrate */
+	svm->vmcb->save.cr3 = vcpu->arch.cr3;
+
+	force_new_asid(vcpu);
+}
+
 static int is_disabled(void)
 {
 	u64 vm_cr;
@@ -3333,15 +3482,6 @@ static bool svm_cpu_has_accelerated_tpr(void)
 	return false;
 }
 
-static int get_npt_level(void)
-{
-#ifdef CONFIG_X86_64
-	return PT64_ROOT_LEVEL;
-#else
-	return PT32E_ROOT_LEVEL;
-#endif
-}
-
 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	return 0;
@@ -3354,12 +3494,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
 	switch (func) {
+	case 0x80000001:
+		if (nested)
+			entry->ecx |= (1 << 2); /* Set SVM bit */
+		break;
 	case 0x8000000A:
 		entry->eax = 1; /* SVM revision 1 */
 		entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
 				   ASID emulation to nested SVM */
 		entry->ecx = 0; /* Reserved */
-		entry->edx = 0; /* Do not support any additional features */
+		entry->edx = 0; /* Per default do not support any
+				   additional features */
+
+		/* Support next_rip if host supports it */
+		if (svm_has(SVM_FEATURE_NRIP))
+			entry->edx |= SVM_FEATURE_NRIP;
+
+		/* Support NPT for the guest if enabled */
+		if (npt_enabled)
+			entry->edx |= SVM_FEATURE_NPT;
 
 		break;
 	}
@@ -3497,6 +3650,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_irq = svm_set_irq,
 	.set_nmi = svm_inject_nmi,
 	.queue_exception = svm_queue_exception,
+	.cancel_injection = svm_cancel_injection,
 	.interrupt_allowed = svm_interrupt_allowed,
 	.nmi_allowed = svm_nmi_allowed,
 	.get_nmi_mask = svm_get_nmi_mask,
@@ -3519,6 +3673,11 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_supported_cpuid = svm_set_supported_cpuid,
 
 	.has_wbinvd_exit = svm_has_wbinvd_exit,
+
+	.write_tsc_offset = svm_write_tsc_offset,
+	.adjust_tsc_offset = svm_adjust_tsc_offset,
+
+	.set_tdp_cr3 = set_tdp_cr3,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index e16a0dbe74d8..fc7a101c4a35 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -6,7 +6,7 @@
  *
  * timer support
  *
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7bddfab12013..8da0e45ff7c9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,7 +5,7 @@
  * machines without emulation or binary translation.
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
@@ -125,6 +125,7 @@ struct vcpu_vmx {
 	unsigned long         host_rsp;
 	int                   launched;
 	u8                    fail;
+	u32                   exit_intr_info;
 	u32                   idt_vectoring_info;
 	struct shared_msr_entry *guest_msrs;
 	int                   nmsrs;
@@ -154,11 +155,6 @@ struct vcpu_vmx {
 			u32 limit;
 			u32 ar;
 		} tr, es, ds, fs, gs;
-		struct {
-			bool pending;
-			u8 vector;
-			unsigned rip;
-		} irq;
 	} rmode;
 	int vpid;
 	bool emulation_required;
@@ -505,7 +501,6 @@ static void __vcpu_clear(void *arg)
 		vmcs_clear(vmx->vmcs);
 	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
 		per_cpu(current_vmcs, cpu) = NULL;
-	rdtscll(vmx->vcpu.arch.host_tsc);
 	list_del(&vmx->local_vcpus_link);
 	vmx->vcpu.cpu = -1;
 	vmx->launched = 0;
@@ -706,11 +701,10 @@ static void reload_tss(void)
 	/*
 	 * VT restores TR but not its size.  Useless.
 	 */
-	struct desc_ptr gdt;
+	struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 	struct desc_struct *descs;
 
-	native_store_gdt(&gdt);
-	descs = (void *)gdt.address;
+	descs = (void *)gdt->address;
 	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 	load_TR_desc();
 }
@@ -753,7 +747,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 
 static unsigned long segment_base(u16 selector)
 {
-	struct desc_ptr gdt;
+	struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 	struct desc_struct *d;
 	unsigned long table_base;
 	unsigned long v;
@@ -761,8 +755,7 @@ static unsigned long segment_base(u16 selector)
 	if (!(selector & ~3))
 		return 0;
 
-	native_store_gdt(&gdt);
-	table_base = gdt.address;
+	table_base = gdt->address;
 
 	if (selector & 4) {           /* from ldt */
 		u16 ldt_selector = kvm_read_ldt();
@@ -883,7 +876,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 tsc_this, delta, new_offset;
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 
 	if (!vmm_exclusive)
@@ -897,37 +889,24 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (vcpu->cpu != cpu) {
-		struct desc_ptr dt;
+		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 		unsigned long sysenter_esp;
 
-		kvm_migrate_timers(vcpu);
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
 		list_add(&vmx->local_vcpus_link,
 			 &per_cpu(vcpus_on_cpu, cpu));
 		local_irq_enable();
 
-		vcpu->cpu = cpu;
 		/*
 		 * Linux uses per-cpu TSS and GDT, so set these when switching
 		 * processors.
 		 */
 		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
-		native_store_gdt(&dt);
-		vmcs_writel(HOST_GDTR_BASE, dt.address);   /* 22.2.4 */
+		vmcs_writel(HOST_GDTR_BASE, gdt->address);   /* 22.2.4 */
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-
-		/*
-		 * Make sure the time stamp counter is monotonous.
-		 */
-		rdtscll(tsc_this);
-		if (tsc_this < vcpu->arch.host_tsc) {
-			delta = vcpu->arch.host_tsc - tsc_this;
-			new_offset = vmcs_read64(TSC_OFFSET) + delta;
-			vmcs_write64(TSC_OFFSET, new_offset);
-		}
 	}
 }
 
@@ -1044,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	}
 
 	if (vmx->rmode.vm86_active) {
-		vmx->rmode.irq.pending = true;
-		vmx->rmode.irq.vector = nr;
-		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
-		if (kvm_exception_is_soft(nr))
-			vmx->rmode.irq.rip +=
-				vmx->vcpu.arch.event_exit_inst_len;
-		intr_info |= INTR_TYPE_SOFT_INTR;
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
-		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+		if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}
 
@@ -1149,12 +1120,17 @@ static u64 guest_read_tsc(void)
 }
 
 /*
- * writes 'guest_tsc' into guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
+ * writes 'offset' into guest's timestamp counter offset register
  */
-static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
+static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+	vmcs_write64(TSC_OFFSET, offset);
+}
+
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
 {
-	vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
+	u64 offset = vmcs_read64(TSC_OFFSET);
+	vmcs_write64(TSC_OFFSET, offset + adjustment);
 }
 
 /*
@@ -1227,7 +1203,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct shared_msr_entry *msr;
-	u64 host_tsc;
 	int ret = 0;
 
 	switch (msr_index) {
@@ -1257,8 +1232,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
 	case MSR_IA32_TSC:
-		rdtscll(host_tsc);
-		guest_write_tsc(data, host_tsc);
+		kvm_write_tsc(vcpu, data);
 		break;
 	case MSR_IA32_CR_PAT:
 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -1856,20 +1830,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
 		return;
 
 	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
-		vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
-		vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
-		vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
-		vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
+		vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
+		vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
+		vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
+		vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
 	}
 }
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 {
 	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
-		vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
-		vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
-		vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
-		vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+		vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+		vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+		vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+		vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
 	}
 
 	__set_bit(VCPU_EXREG_PDPTR,
@@ -2515,7 +2489,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
 	u32 host_sysenter_cs, msr_low, msr_high;
 	u32 junk;
-	u64 host_pat, tsc_this, tsc_base;
+	u64 host_pat;
 	unsigned long a;
 	struct desc_ptr dt;
 	int i;
@@ -2656,12 +2630,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 
-	tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
-	rdtscll(tsc_this);
-	if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
-		tsc_base = tsc_this;
-
-	guest_write_tsc(0, tsc_base);
+	kvm_write_tsc(&vmx->vcpu, 0);
 
 	return 0;
 }
@@ -2834,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 
 	++vcpu->stat.irq_injections;
 	if (vmx->rmode.vm86_active) {
-		vmx->rmode.irq.pending = true;
-		vmx->rmode.irq.vector = irq;
-		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
-		if (vcpu->arch.interrupt.soft)
-			vmx->rmode.irq.rip +=
-				vmx->vcpu.arch.event_exit_inst_len;
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			     irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
-		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+		if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}
 	intr = irq | INTR_INFO_VALID_MASK;
@@ -2875,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 
 	++vcpu->stat.nmi_injections;
 	if (vmx->rmode.vm86_active) {
-		vmx->rmode.irq.pending = true;
-		vmx->rmode.irq.vector = NMI_VECTOR;
-		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			     NMI_VECTOR | INTR_TYPE_SOFT_INTR |
-			     INTR_INFO_VALID_MASK);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
-		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -3346,6 +3301,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
 
 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
 {
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return 1;
 }
 
@@ -3358,6 +3314,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	++vcpu->stat.irq_window_exits;
 
 	/*
@@ -3614,6 +3572,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
 	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 	++vcpu->stat.nmi_window_exits;
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	return 1;
 }
@@ -3623,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	enum emulation_result err = EMULATE_DONE;
 	int ret = 1;
+	u32 cpu_exec_ctrl;
+	bool intr_window_requested;
+
+	cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
 
 	while (!guest_state_valid(vcpu)) {
+		if (intr_window_requested
+		    && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
+			return handle_interrupt_window(&vmx->vcpu);
+
 		err = emulate_instruction(vcpu, 0, 0, 0);
 
 		if (err == EMULATE_DO_MMIO) {
@@ -3790,18 +3758,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 	vmcs_write32(TPR_THRESHOLD, irr);
 }
 
-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 {
-	u32 exit_intr_info;
-	u32 idt_vectoring_info = vmx->idt_vectoring_info;
-	bool unblock_nmi;
-	u8 vector;
-	int type;
-	bool idtv_info_valid;
-
-	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+	u32 exit_intr_info = vmx->exit_intr_info;
 
 	/* Handle machine checks before interrupts are enabled */
 	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -3816,8 +3775,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		asm("int $2");
 		kvm_after_handle_nmi(&vmx->vcpu);
 	}
+}
 
-	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
+{
+	u32 exit_intr_info = vmx->exit_intr_info;
+	bool unblock_nmi;
+	u8 vector;
+	bool idtv_info_valid;
+
+	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
 	if (cpu_has_virtual_nmis()) {
 		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
@@ -3839,6 +3806,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	} else if (unlikely(vmx->soft_vnmi_blocked))
 		vmx->vnmi_blocked_time +=
 			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
+}
+
+static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
+				      u32 idt_vectoring_info,
+				      int instr_len_field,
+				      int error_code_field)
+{
+	u8 vector;
+	int type;
+	bool idtv_info_valid;
+
+	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
 	vmx->vcpu.arch.nmi_injected = false;
 	kvm_clear_exception_queue(&vmx->vcpu);
@@ -3847,6 +3826,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	if (!idtv_info_valid)
 		return;
 
+	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+
 	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
 	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
 
@@ -3863,18 +3844,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		break;
 	case INTR_TYPE_SOFT_EXCEPTION:
 		vmx->vcpu.arch.event_exit_inst_len =
-			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+			vmcs_read32(instr_len_field);
 		/* fall through */
 	case INTR_TYPE_HARD_EXCEPTION:
 		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
-			u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			u32 err = vmcs_read32(error_code_field);
 			kvm_queue_exception_e(&vmx->vcpu, vector, err);
 		} else
 			kvm_queue_exception(&vmx->vcpu, vector);
 		break;
 	case INTR_TYPE_SOFT_INTR:
 		vmx->vcpu.arch.event_exit_inst_len =
-			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+			vmcs_read32(instr_len_field);
 		/* fall through */
 	case INTR_TYPE_EXT_INTR:
 		kvm_queue_interrupt(&vmx->vcpu, vector,
@@ -3885,27 +3866,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	}
 }
 
-/*
- * Failure to inject an interrupt should give us the information
- * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
- * when fetching the interrupt redirection bitmap in the real-mode
- * tss, this doesn't happen.  So we do it ourselves.
- */
-static void fixup_rmode_irq(struct vcpu_vmx *vmx)
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
-	vmx->rmode.irq.pending = 0;
-	if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
-		return;
-	kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
-	if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
-		vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
-		vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
-		return;
-	}
-	vmx->idt_vectoring_info =
-		VECTORING_INFO_VALID_MASK
-		| INTR_TYPE_EXT_INTR
-		| vmx->rmode.irq.vector;
+	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
+				  VM_EXIT_INSTRUCTION_LEN,
+				  IDT_VECTORING_ERROR_CODE);
+}
+
+static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+{
+	__vmx_complete_interrupts(to_vmx(vcpu),
+				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+				  VM_ENTRY_INSTRUCTION_LEN,
+				  VM_ENTRY_EXCEPTION_ERROR_CODE);
+
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
 }
 
 #ifdef CONFIG_X86_64
@@ -4032,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
 	      : "cc", "memory"
-		, R"bx", R"di", R"si"
+		, R"ax", R"bx", R"di", R"si"
 #ifdef CONFIG_X86_64
 		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
 #endif
@@ -4043,12 +4018,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs_dirty = 0;
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-	if (vmx->rmode.irq.pending)
-		fixup_rmode_irq(vmx);
 
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
 
+	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+	vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	vmx_complete_atomic_exit(vmx);
+	vmx_recover_nmi_blocking(vmx);
 	vmx_complete_interrupts(vmx);
 }
 
@@ -4119,6 +4097,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
+	vmx->vcpu.cpu = cpu;
 	err = vmx_vcpu_setup(vmx);
 	vmx_vcpu_put(&vmx->vcpu);
 	put_cpu();
@@ -4334,6 +4313,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_irq = vmx_inject_irq,
 	.set_nmi = vmx_inject_nmi,
 	.queue_exception = vmx_queue_exception,
+	.cancel_injection = vmx_cancel_injection,
 	.interrupt_allowed = vmx_interrupt_allowed,
 	.nmi_allowed = vmx_nmi_allowed,
 	.get_nmi_mask = vmx_get_nmi_mask,
@@ -4356,6 +4336,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_supported_cpuid = vmx_set_supported_cpuid,
 
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+	.write_tsc_offset = vmx_write_tsc_offset,
+	.adjust_tsc_offset = vmx_adjust_tsc_offset,
+
+	.set_tdp_cr3 = vmx_set_cr3,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6c2ecf0a806d..2288ad829b32 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,7 +6,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
@@ -55,6 +55,8 @@
 #include <asm/mce.h>
 #include <asm/i387.h>
 #include <asm/xcr.h>
+#include <asm/pvclock.h>
+#include <asm/div64.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -71,7 +73,7 @@
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
 #define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
 
 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
@@ -282,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 	u32 prev_nr;
 	int class1, class2;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	if (!vcpu->arch.exception.pending) {
 	queue:
 		vcpu->arch.exception.pending = true;
@@ -327,16 +331,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
-			   u32 error_code)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
 {
+	unsigned error_code = vcpu->arch.fault.error_code;
+
 	++vcpu->stat.pf_guest;
-	vcpu->arch.cr2 = addr;
+	vcpu->arch.cr2 = vcpu->arch.fault.address;
 	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 }
 
+void kvm_propagate_fault(struct kvm_vcpu *vcpu)
+{
+	if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
+		vcpu->arch.nested_mmu.inject_page_fault(vcpu);
+	else
+		vcpu->arch.mmu.inject_page_fault(vcpu);
+
+	vcpu->arch.fault.nested = false;
+}
+
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	vcpu->arch.nmi_pending = 1;
 }
 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -367,18 +383,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 EXPORT_SYMBOL_GPL(kvm_require_cpl);
 
 /*
+ * This function will be used to read from the physical memory of the currently
+ * running guest. The difference to kvm_read_guest_page is that this function
+ * can read from guest physical or from the guest's guest physical memory.
+ */
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			    gfn_t ngfn, void *data, int offset, int len,
+			    u32 access)
+{
+	gfn_t real_gfn;
+	gpa_t ngpa;
+
+	ngpa     = gfn_to_gpa(ngfn);
+	real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
+	if (real_gfn == UNMAPPED_GVA)
+		return -EFAULT;
+
+	real_gfn = gpa_to_gfn(real_gfn);
+
+	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
+
+int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+			       void *data, int offset, int len, u32 access)
+{
+	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
+				       data, offset, len, access);
+}
+
+/*
  * Load the pae pdptrs.  Return true is they are all valid.
  */
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 {
 	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 	int i;
 	int ret;
-	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
 
-	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
-				  offset * sizeof(u64), sizeof(pdpte));
+	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
+				      offset * sizeof(u64), sizeof(pdpte),
+				      PFERR_USER_MASK|PFERR_WRITE_MASK);
 	if (ret < 0) {
 		ret = 0;
 		goto out;
@@ -392,7 +439,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 	}
 	ret = 1;
 
-	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
 	__set_bit(VCPU_EXREG_PDPTR,
 		  (unsigned long *)&vcpu->arch.regs_avail);
 	__set_bit(VCPU_EXREG_PDPTR,
@@ -405,8 +452,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
 
 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 {
-	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
 	bool changed = true;
+	int offset;
+	gfn_t gfn;
 	int r;
 
 	if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -416,10 +465,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		return true;
 
-	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
+	gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
+	offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
+	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
+				       PFERR_USER_MASK | PFERR_WRITE_MASK);
 	if (r < 0)
 		goto out;
-	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
 out:
 
 	return changed;
@@ -458,7 +510,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 				return 1;
 		} else
 #endif
-		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
+		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+						 vcpu->arch.cr3))
 			return 1;
 	}
 
@@ -547,7 +600,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 			return 1;
 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 		   && ((cr4 ^ old_cr4) & pdptr_bits)
-		   && !load_pdptrs(vcpu, vcpu->arch.cr3))
+		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
 		return 1;
 
 	if (cr4 & X86_CR4_VMXE)
@@ -580,7 +633,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		if (is_pae(vcpu)) {
 			if (cr3 & CR3_PAE_RESERVED_BITS)
 				return 1;
-			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+			if (is_paging(vcpu) &&
+			    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 				return 1;
 		}
 		/*
@@ -737,7 +791,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 };
 
 static unsigned num_msrs_to_save;
@@ -838,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
 	/*
 	 * The guest calculates current wall clock time by adding
-	 * system time (updated by kvm_write_guest_time below) to the
+	 * system time (updated by kvm_guest_time_update below) to the
 	 * wall clock specified here.  guest system time equals host
 	 * system time for us, thus we must fill in host boot time here.
 	 */
@@ -866,65 +920,229 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 	return quotient;
 }
 
-static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+			       s8 *pshift, u32 *pmultiplier)
 {
-	uint64_t nsecs = 1000000000LL;
+	uint64_t scaled64;
 	int32_t  shift = 0;
 	uint64_t tps64;
 	uint32_t tps32;
 
-	tps64 = tsc_khz * 1000LL;
-	while (tps64 > nsecs*2) {
+	tps64 = base_khz * 1000LL;
+	scaled64 = scaled_khz * 1000LL;
+	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
 		tps64 >>= 1;
 		shift--;
 	}
 
 	tps32 = (uint32_t)tps64;
-	while (tps32 <= (uint32_t)nsecs) {
-		tps32 <<= 1;
+	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
+		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
+			scaled64 >>= 1;
+		else
+			tps32 <<= 1;
 		shift++;
 	}
 
-	hv_clock->tsc_shift = shift;
-	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+	*pshift = shift;
+	*pmultiplier = div_frac(scaled64, tps32);
 
-	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
-		 __func__, tsc_khz, hv_clock->tsc_shift,
-		 hv_clock->tsc_to_system_mul);
+	pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
+		 __func__, base_khz, scaled_khz, shift, *pmultiplier);
+}
+
+static inline u64 get_kernel_ns(void)
+{
+	struct timespec ts;
+
+	WARN_ON(preemptible());
+	ktime_get_ts(&ts);
+	monotonic_to_bootbased(&ts);
+	return timespec_to_ns(&ts);
 }
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+unsigned long max_tsc_khz;
 
-static void kvm_write_guest_time(struct kvm_vcpu *v)
+static inline int kvm_tsc_changes_freq(void)
+{
+	int cpu = get_cpu();
+	int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+		  cpufreq_quick_get(cpu) != 0;
+	put_cpu();
+	return ret;
+}
+
+static inline u64 nsec_to_cycles(u64 nsec)
+{
+	u64 ret;
+
+	WARN_ON(preemptible());
+	if (kvm_tsc_changes_freq())
+		printk_once(KERN_WARNING
+		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
+	ret = nsec * __get_cpu_var(cpu_tsc_khz);
+	do_div(ret, USEC_PER_SEC);
+	return ret;
+}
+
+static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
+{
+	/* Compute a scale to convert nanoseconds in TSC cycles */
+	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+			   &kvm->arch.virtual_tsc_shift,
+			   &kvm->arch.virtual_tsc_mult);
+	kvm->arch.virtual_tsc_khz = this_tsc_khz;
+}
+
+static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
+				      vcpu->kvm->arch.virtual_tsc_mult,
+				      vcpu->kvm->arch.virtual_tsc_shift);
+	tsc += vcpu->arch.last_tsc_write;
+	return tsc;
+}
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct kvm *kvm = vcpu->kvm;
+	u64 offset, ns, elapsed;
+	unsigned long flags;
+	s64 sdiff;
+
+	spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+	offset = data - native_read_tsc();
+	ns = get_kernel_ns();
+	elapsed = ns - kvm->arch.last_tsc_nsec;
+	sdiff = data - kvm->arch.last_tsc_write;
+	if (sdiff < 0)
+		sdiff = -sdiff;
+
+	/*
+	 * Special case: close write to TSC within 5 seconds of
+	 * another CPU is interpreted as an attempt to synchronize
+	 * The 5 seconds is to accomodate host load / swapping as
+	 * well as any reset of TSC during the boot process.
+	 *
+	 * In that case, for a reliable TSC, we can match TSC offsets,
+	 * or make a best guest using elapsed value.
+	 */
+	if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
+	    elapsed < 5ULL * NSEC_PER_SEC) {
+		if (!check_tsc_unstable()) {
+			offset = kvm->arch.last_tsc_offset;
+			pr_debug("kvm: matched tsc offset for %llu\n", data);
+		} else {
+			u64 delta = nsec_to_cycles(elapsed);
+			offset += delta;
+			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
+		}
+		ns = kvm->arch.last_tsc_nsec;
+	}
+	kvm->arch.last_tsc_nsec = ns;
+	kvm->arch.last_tsc_write = data;
+	kvm->arch.last_tsc_offset = offset;
+	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+	/* Reset of TSC must disable overshoot protection below */
+	vcpu->arch.hv_clock.tsc_timestamp = 0;
+	vcpu->arch.last_tsc_write = data;
+	vcpu->arch.last_tsc_nsec = ns;
+}
+EXPORT_SYMBOL_GPL(kvm_write_tsc);
+
+static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	struct timespec ts;
 	unsigned long flags;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	void *shared_kaddr;
 	unsigned long this_tsc_khz;
+	s64 kernel_ns, max_kernel_ns;
+	u64 tsc_timestamp;
 
-	if ((!vcpu->time_page))
-		return;
+	/* Keep irq disabled to prevent changes to the clock */
+	local_irq_save(flags);
+	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
+	kernel_ns = get_kernel_ns();
+	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 
-	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
-	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
-		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
-		vcpu->hv_clock_tsc_khz = this_tsc_khz;
+	if (unlikely(this_tsc_khz == 0)) {
+		local_irq_restore(flags);
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+		return 1;
+	}
+
+	/*
+	 * We may have to catch up the TSC to match elapsed wall clock
+	 * time for two reasons, even if kvmclock is used.
+	 *   1) CPU could have been running below the maximum TSC rate
+	 *   2) Broken TSC compensation resets the base at each VCPU
+	 *      entry to avoid unknown leaps of TSC even when running
+	 *      again on the same CPU.  This may cause apparent elapsed
+	 *      time to disappear, and the guest to stand still or run
+	 *	very slowly.
+	 */
+	if (vcpu->tsc_catchup) {
+		u64 tsc = compute_guest_tsc(v, kernel_ns);
+		if (tsc > tsc_timestamp) {
+			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+			tsc_timestamp = tsc;
+		}
 	}
-	put_cpu_var(cpu_tsc_khz);
 
-	/* Keep irq disabled to prevent changes to the clock */
-	local_irq_save(flags);
-	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
-	ktime_get_ts(&ts);
-	monotonic_to_bootbased(&ts);
 	local_irq_restore(flags);
 
-	/* With all the info we got, fill in the values */
+	if (!vcpu->time_page)
+		return 0;
 
-	vcpu->hv_clock.system_time = ts.tv_nsec +
-				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+	/*
+	 * Time as measured by the TSC may go backwards when resetting the base
+	 * tsc_timestamp.  The reason for this is that the TSC resolution is
+	 * higher than the resolution of the other clock scales.  Thus, many
+	 * possible measurments of the TSC correspond to one measurement of any
+	 * other clock, and so a spread of values is possible.  This is not a
+	 * problem for the computation of the nanosecond clock; with TSC rates
+	 * around 1GHZ, there can only be a few cycles which correspond to one
+	 * nanosecond value, and any path through this code will inevitably
+	 * take longer than that.  However, with the kernel_ns value itself,
+	 * the precision may be much lower, down to HZ granularity.  If the
+	 * first sampling of TSC against kernel_ns ends in the low part of the
+	 * range, and the second in the high end of the range, we can get:
+	 *
+	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
+	 *
+	 * As the sampling errors potentially range in the thousands of cycles,
+	 * it is possible such a time value has already been observed by the
+	 * guest.  To protect against this, we must compute the system time as
+	 * observed by the guest and ensure the new system time is greater.
+	 */
+	max_kernel_ns = 0;
+	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+		max_kernel_ns = vcpu->last_guest_tsc -
+				vcpu->hv_clock.tsc_timestamp;
+		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
+				    vcpu->hv_clock.tsc_to_system_mul,
+				    vcpu->hv_clock.tsc_shift);
+		max_kernel_ns += vcpu->last_kernel_ns;
+	}
 
+	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
+		kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
+				   &vcpu->hv_clock.tsc_shift,
+				   &vcpu->hv_clock.tsc_to_system_mul);
+		vcpu->hw_tsc_khz = this_tsc_khz;
+	}
+
+	if (max_kernel_ns > kernel_ns)
+		kernel_ns = max_kernel_ns;
+
+	/* With all the info we got, fill in the values */
+	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+	vcpu->last_kernel_ns = kernel_ns;
+	vcpu->last_guest_tsc = tsc_timestamp;
 	vcpu->hv_clock.flags = 0;
 
 	/*
@@ -942,16 +1160,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	kunmap_atomic(shared_kaddr, KM_USER0);
 
 	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
-}
-
-static int kvm_request_guest_time_update(struct kvm_vcpu *v)
-{
-	struct kvm_vcpu_arch *vcpu = &v->arch;
-
-	if (!vcpu->time_page)
-		return 0;
-	kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
-	return 1;
+	return 0;
 }
 
 static bool msr_mtrr_valid(unsigned msr)
@@ -1277,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		}
 
 		vcpu->arch.time = data;
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
 		/* we verify if the enable bit is set... */
 		if (!(data & 1))
@@ -1292,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			kvm_release_page_clean(vcpu->arch.time_page);
 			vcpu->arch.time_page = NULL;
 		}
-
-		kvm_request_guest_time_update(vcpu);
 		break;
 	}
 	case MSR_IA32_MCG_CTL:
@@ -1330,6 +1538,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
 			"0x%x data 0x%llx\n", msr, data);
 		break;
+	case MSR_K7_CLK_CTL:
+		/*
+		 * Ignore all writes to this no longer documented MSR.
+		 * Writes are only relevant for old K7 processors,
+		 * all pre-dating SVM, but a recommended workaround from
+		 * AMD for these chips. It is possible to speicify the
+		 * affected processor models on the command line, hence
+		 * the need to ignore the workaround.
+		 */
+		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 		if (kvm_hv_msr_partition_wide(msr)) {
 			int r;
@@ -1522,6 +1740,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case 0xcd: /* fsb frequency */
 		data = 3;
 		break;
+		/*
+		 * MSR_EBC_FREQUENCY_ID
+		 * Conservative value valid for even the basic CPU models.
+		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
+		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
+		 * and 266MHz for model 3, or 4. Set Core Clock
+		 * Frequency to System Bus Frequency Ratio to 1 (bits
+		 * 31:24) even though these are only valid for CPU
+		 * models > 2, however guests may end up dividing or
+		 * multiplying by zero otherwise.
+		 */
+	case MSR_EBC_FREQUENCY_ID:
+		data = 1 << 24;
+		break;
 	case MSR_IA32_APICBASE:
 		data = kvm_get_apic_base(vcpu);
 		break;
@@ -1555,6 +1787,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
 		return get_msr_mce(vcpu, msr, pdata);
+	case MSR_K7_CLK_CTL:
+		/*
+		 * Provide expected ramp-up count for K7. All other
+		 * are set to zero, indicating minimum divisors for
+		 * every field.
+		 *
+		 * This prevents guest kernels on AMD host with CPU
+		 * type 6, model 8 and higher from exploding due to
+		 * the rdmsr failing.
+		 */
+		data = 0x20000000;
+		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 		if (kvm_hv_msr_partition_wide(msr)) {
 			int r;
@@ -1808,19 +2052,28 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
-		unsigned long khz = cpufreq_quick_get(cpu);
-		if (!khz)
-			khz = tsc_khz;
-		per_cpu(cpu_tsc_khz, cpu) = khz;
+	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+		/* Make sure TSC doesn't go backwards */
+		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+				native_read_tsc() - vcpu->arch.last_host_tsc;
+		if (tsc_delta < 0)
+			mark_tsc_unstable("KVM discovered backwards TSC");
+		if (check_tsc_unstable()) {
+			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+			vcpu->arch.tsc_catchup = 1;
+			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		}
+		if (vcpu->cpu != cpu)
+			kvm_migrate_timers(vcpu);
+		vcpu->cpu = cpu;
 	}
-	kvm_request_guest_time_update(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
+	vcpu->arch.last_host_tsc = native_read_tsc();
 }
 
 static int is_efer_nx(void)
@@ -1995,7 +2248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(F16C);
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
-		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
 		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
 		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
@@ -2204,6 +2457,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -ENXIO;
 
 	kvm_queue_interrupt(vcpu, irq->irq, false);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	return 0;
 }
@@ -2357,6 +2611,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
 		vcpu->arch.sipi_vector = events->sipi_vector;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return 0;
 }
 
@@ -2760,7 +3016,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 
 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 {
-	return kvm->arch.n_alloc_mmu_pages;
+	return kvm->arch.n_max_mmu_pages;
 }
 
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -2796,18 +3052,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 	r = 0;
 	switch (chip->chip_id) {
 	case KVM_IRQCHIP_PIC_MASTER:
-		raw_spin_lock(&pic_irqchip(kvm)->lock);
+		spin_lock(&pic_irqchip(kvm)->lock);
 		memcpy(&pic_irqchip(kvm)->pics[0],
 			&chip->chip.pic,
 			sizeof(struct kvm_pic_state));
-		raw_spin_unlock(&pic_irqchip(kvm)->lock);
+		spin_unlock(&pic_irqchip(kvm)->lock);
 		break;
 	case KVM_IRQCHIP_PIC_SLAVE:
-		raw_spin_lock(&pic_irqchip(kvm)->lock);
+		spin_lock(&pic_irqchip(kvm)->lock);
 		memcpy(&pic_irqchip(kvm)->pics[1],
 			&chip->chip.pic,
 			sizeof(struct kvm_pic_state));
-		raw_spin_unlock(&pic_irqchip(kvm)->lock);
+		spin_unlock(&pic_irqchip(kvm)->lock);
 		break;
 	case KVM_IRQCHIP_IOAPIC:
 		r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -3201,7 +3457,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_CLOCK: {
-		struct timespec now;
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 		s64 delta;
@@ -3215,20 +3470,21 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 
 		r = 0;
-		ktime_get_ts(&now);
-		now_ns = timespec_to_ns(&now);
+		local_irq_disable();
+		now_ns = get_kernel_ns();
 		delta = user_ns.clock - now_ns;
+		local_irq_enable();
 		kvm->arch.kvmclock_offset = delta;
 		break;
 	}
 	case KVM_GET_CLOCK: {
-		struct timespec now;
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 
-		ktime_get_ts(&now);
-		now_ns = timespec_to_ns(&now);
+		local_irq_disable();
+		now_ns = get_kernel_ns();
 		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
+		local_irq_enable();
 		user_ns.flags = 0;
 
 		r = -EFAULT;
@@ -3292,30 +3548,51 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
 	kvm_x86_ops->get_segment(vcpu, var, seg);
 }
 
+static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+	return gpa;
+}
+
+static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+	gpa_t t_gpa;
+	u32 error;
+
+	BUG_ON(!mmu_is_nested(vcpu));
+
+	/* NPT walks are always user-walks */
+	access |= PFERR_USER_MASK;
+	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
+	if (t_gpa == UNMAPPED_GVA)
+		vcpu->arch.fault.nested = true;
+
+	return t_gpa;
+}
+
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_FETCH_MASK;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_WRITE_MASK;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
 }
 
 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
@@ -3326,7 +3603,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 	int r = X86EMUL_CONTINUE;
 
 	while (bytes) {
-		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
+		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
+							    error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3381,8 +3659,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
 	int r = X86EMUL_CONTINUE;
 
 	while (bytes) {
-		gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
-						       PFERR_WRITE_MASK, error);
+		gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
+							     PFERR_WRITE_MASK,
+							     error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3624,7 +3903,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
 	if (vcpu->arch.pio.count)
 		goto data_avail;
 
-	trace_kvm_pio(1, port, size, 1);
+	trace_kvm_pio(0, port, size, 1);
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 1;
@@ -3652,7 +3931,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
 			      const void *val, unsigned int count,
 			      struct kvm_vcpu *vcpu)
 {
-	trace_kvm_pio(0, port, size, 1);
+	trace_kvm_pio(1, port, size, 1);
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 0;
@@ -3791,6 +4070,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
 	kvm_x86_ops->get_gdt(vcpu, dt);
 }
 
+static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->get_idt(vcpu, dt);
+}
+
 static unsigned long emulator_get_cached_segment_base(int seg,
 						      struct kvm_vcpu *vcpu)
 {
@@ -3884,6 +4168,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_segment_selector = emulator_set_segment_selector,
 	.get_cached_segment_base = emulator_get_cached_segment_base,
 	.get_gdt             = emulator_get_gdt,
+	.get_idt	     = emulator_get_idt,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
 	.cpl                 = emulator_get_cpl,
@@ -3919,13 +4204,64 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	if (ctxt->exception == PF_VECTOR)
-		kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
+		kvm_propagate_fault(vcpu);
 	else if (ctxt->error_code_valid)
 		kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
 	else
 		kvm_queue_exception(vcpu, ctxt->exception);
 }
 
+static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	int cs_db, cs_l;
+
+	cache_all_regs(vcpu);
+
+	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+	vcpu->arch.emulate_ctxt.vcpu = vcpu;
+	vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+	vcpu->arch.emulate_ctxt.mode =
+		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+		? X86EMUL_MODE_VM86 : cs_l
+		? X86EMUL_MODE_PROT64 :	cs_db
+		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+	memset(c, 0, sizeof(struct decode_cache));
+	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+}
+
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
+{
+	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	int ret;
+
+	init_emulate_ctxt(vcpu);
+
+	vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
+	vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
+	vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
+	ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
+
+	if (ret != X86EMUL_CONTINUE)
+		return EMULATE_FAIL;
+
+	vcpu->arch.emulate_ctxt.eip = c->eip;
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+	if (irq == NMI_VECTOR)
+		vcpu->arch.nmi_pending = false;
+	else
+		vcpu->arch.interrupt.pending = false;
+
+	return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+
 static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.insn_emulation_fail;
@@ -3982,24 +4318,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	cache_all_regs(vcpu);
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
-		int cs_db, cs_l;
-		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
-		vcpu->arch.emulate_ctxt.vcpu = vcpu;
-		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-		vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-		vcpu->arch.emulate_ctxt.mode =
-			(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-			? X86EMUL_MODE_VM86 : cs_l
-			? X86EMUL_MODE_PROT64 :	cs_db
-			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-		memset(c, 0, sizeof(struct decode_cache));
-		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+		init_emulate_ctxt(vcpu);
 		vcpu->arch.emulate_ctxt.interruptibility = 0;
 		vcpu->arch.emulate_ctxt.exception = -1;
+		vcpu->arch.emulate_ctxt.perm_ok = false;
+
+		r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
+		if (r == X86EMUL_PROPAGATE_FAULT)
+			goto done;
 
-		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 		trace_kvm_emulate_insn_start(vcpu);
 
 		/* Only allow emulation of specific instructions on #UD
@@ -4049,41 +4376,39 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 
 restart:
-	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
 
-	if (r) { /* emulation failed */
+	if (r == EMULATION_FAILED) {
 		if (reexecute_instruction(vcpu, cr2))
 			return EMULATE_DONE;
 
 		return handle_emulation_failure(vcpu);
 	}
 
-	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
-	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
-	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
-
+done:
 	if (vcpu->arch.emulate_ctxt.exception >= 0) {
 		inject_emulated_exception(vcpu);
-		return EMULATE_DONE;
-	}
-
-	if (vcpu->arch.pio.count) {
+		r = EMULATE_DONE;
+	} else if (vcpu->arch.pio.count) {
 		if (!vcpu->arch.pio.in)
 			vcpu->arch.pio.count = 0;
-		return EMULATE_DO_MMIO;
-	}
-
-	if (vcpu->mmio_needed) {
+		r = EMULATE_DO_MMIO;
+	} else if (vcpu->mmio_needed) {
 		if (vcpu->mmio_is_write)
 			vcpu->mmio_needed = 0;
-		return EMULATE_DO_MMIO;
-	}
-
-	if (vcpu->arch.emulate_ctxt.restart)
+		r = EMULATE_DO_MMIO;
+	} else if (r == EMULATION_RESTART)
 		goto restart;
+	else
+		r = EMULATE_DONE;
 
-	return EMULATE_DONE;
+	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+
+	return r;
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
 
@@ -4097,9 +4422,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 }
 EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
 
-static void bounce_off(void *info)
+static void tsc_bad(void *info)
+{
+	__get_cpu_var(cpu_tsc_khz) = 0;
+}
+
+static void tsc_khz_changed(void *data)
 {
-	/* nothing */
+	struct cpufreq_freqs *freq = data;
+	unsigned long khz = 0;
+
+	if (data)
+		khz = freq->new;
+	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		khz = cpufreq_quick_get(raw_smp_processor_id());
+	if (!khz)
+		khz = tsc_khz;
+	__get_cpu_var(cpu_tsc_khz) = khz;
 }
 
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4110,21 +4449,60 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	struct kvm_vcpu *vcpu;
 	int i, send_ipi = 0;
 
+	/*
+	 * We allow guests to temporarily run on slowing clocks,
+	 * provided we notify them after, or to run on accelerating
+	 * clocks, provided we notify them before.  Thus time never
+	 * goes backwards.
+	 *
+	 * However, we have a problem.  We can't atomically update
+	 * the frequency of a given CPU from this function; it is
+	 * merely a notifier, which can be called from any CPU.
+	 * Changing the TSC frequency at arbitrary points in time
+	 * requires a recomputation of local variables related to
+	 * the TSC for each VCPU.  We must flag these local variables
+	 * to be updated and be sure the update takes place with the
+	 * new frequency before any guests proceed.
+	 *
+	 * Unfortunately, the combination of hotplug CPU and frequency
+	 * change creates an intractable locking scenario; the order
+	 * of when these callouts happen is undefined with respect to
+	 * CPU hotplug, and they can race with each other.  As such,
+	 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
+	 * undefined; you can actually have a CPU frequency change take
+	 * place in between the computation of X and the setting of the
+	 * variable.  To protect against this problem, all updates of
+	 * the per_cpu tsc_khz variable are done in an interrupt
+	 * protected IPI, and all callers wishing to update the value
+	 * must wait for a synchronous IPI to complete (which is trivial
+	 * if the caller is on the CPU already).  This establishes the
+	 * necessary total order on variable updates.
+	 *
+	 * Note that because a guest time update may take place
+	 * anytime after the setting of the VCPU's request bit, the
+	 * correct TSC value must be set before the request.  However,
+	 * to ensure the update actually makes it to any guest which
+	 * starts running in hardware virtualization between the set
+	 * and the acquisition of the spinlock, we must also ping the
+	 * CPU after setting the request bit.
+	 *
+	 */
+
 	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
 		return 0;
 	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
 		return 0;
-	per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
+
+	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
 
 	spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			if (vcpu->cpu != freq->cpu)
 				continue;
-			if (!kvm_request_guest_time_update(vcpu))
-				continue;
+			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 			if (vcpu->cpu != smp_processor_id())
-				send_ipi++;
+				send_ipi = 1;
 		}
 	}
 	spin_unlock(&kvm_lock);
@@ -4142,32 +4520,57 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 		 * guest context is entered kvmclock will be updated,
 		 * so the guest will not see stale values.
 		 */
-		smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+		smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
 	}
 	return 0;
 }
 
 static struct notifier_block kvmclock_cpufreq_notifier_block = {
-        .notifier_call  = kvmclock_cpufreq_notifier
+	.notifier_call  = kvmclock_cpufreq_notifier
+};
+
+static int kvmclock_cpu_notifier(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+		case CPU_ONLINE:
+		case CPU_DOWN_FAILED:
+			smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+			break;
+		case CPU_DOWN_PREPARE:
+			smp_call_function_single(cpu, tsc_bad, NULL, 1);
+			break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kvmclock_cpu_notifier_block = {
+	.notifier_call  = kvmclock_cpu_notifier,
+	.priority = -INT_MAX
 };
 
 static void kvm_timer_init(void)
 {
 	int cpu;
 
+	max_tsc_khz = tsc_khz;
+	register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+#ifdef CONFIG_CPU_FREQ
+		struct cpufreq_policy policy;
+		memset(&policy, 0, sizeof(policy));
+		cpufreq_get_policy(&policy, get_cpu());
+		if (policy.cpuinfo.max_freq)
+			max_tsc_khz = policy.cpuinfo.max_freq;
+#endif
 		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
 					  CPUFREQ_TRANSITION_NOTIFIER);
-		for_each_online_cpu(cpu) {
-			unsigned long khz = cpufreq_get(cpu);
-			if (!khz)
-				khz = tsc_khz;
-			per_cpu(cpu_tsc_khz, cpu) = khz;
-		}
-	} else {
-		for_each_possible_cpu(cpu)
-			per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
 	}
+	pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
 }
 
 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -4269,6 +4672,7 @@ void kvm_arch_exit(void)
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
 		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 					    CPUFREQ_TRANSITION_NOTIFIER);
+	unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
 }
@@ -4684,8 +5088,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
-		if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
-			kvm_write_guest_time(vcpu);
+		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
+			r = kvm_guest_time_update(vcpu);
+			if (unlikely(r))
+				goto out;
+		}
 		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 			kvm_mmu_sync_roots(vcpu);
 		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
@@ -4710,6 +5117,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (unlikely(r))
 		goto out;
 
+	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+		inject_pending_event(vcpu);
+
+		/* enable NMI/IRQ window open exits if needed */
+		if (vcpu->arch.nmi_pending)
+			kvm_x86_ops->enable_nmi_window(vcpu);
+		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+			kvm_x86_ops->enable_irq_window(vcpu);
+
+		if (kvm_lapic_enabled(vcpu)) {
+			update_cr8_intercept(vcpu);
+			kvm_lapic_sync_to_vapic(vcpu);
+		}
+	}
+
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -4728,23 +5150,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		smp_wmb();
 		local_irq_enable();
 		preempt_enable();
+		kvm_x86_ops->cancel_injection(vcpu);
 		r = 1;
 		goto out;
 	}
 
-	inject_pending_event(vcpu);
-
-	/* enable NMI/IRQ window open exits if needed */
-	if (vcpu->arch.nmi_pending)
-		kvm_x86_ops->enable_nmi_window(vcpu);
-	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-		kvm_x86_ops->enable_irq_window(vcpu);
-
-	if (kvm_lapic_enabled(vcpu)) {
-		update_cr8_intercept(vcpu);
-		kvm_lapic_sync_to_vapic(vcpu);
-	}
-
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
 	kvm_guest_enter();
@@ -4770,6 +5180,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
+	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+
 	atomic_set(&vcpu->guest_mode, 0);
 	smp_wmb();
 	local_irq_enable();
@@ -4899,8 +5311,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (!irqchip_in_kernel(vcpu->kvm))
 		kvm_set_cr8(vcpu, kvm_run->cr8);
 
-	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
-	    vcpu->arch.emulate_ctxt.restart) {
+	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
 		if (vcpu->mmio_needed) {
 			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
 			vcpu->mmio_read_completed = 1;
@@ -4981,6 +5392,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 	vcpu->arch.exception.pending = false;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return 0;
 }
 
@@ -5044,6 +5457,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
 	vcpu->arch.mp_state = mp_state->mp_state;
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return 0;
 }
 
@@ -5051,24 +5465,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code)
 {
 	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
-	int cs_db, cs_l, ret;
-	cache_all_regs(vcpu);
-
-	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+	int ret;
 
-	vcpu->arch.emulate_ctxt.vcpu = vcpu;
-	vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-	vcpu->arch.emulate_ctxt.mode =
-		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-		? X86EMUL_MODE_VM86 : cs_l
-		? X86EMUL_MODE_PROT64 :	cs_db
-		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-	memset(c, 0, sizeof(struct decode_cache));
-	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+	init_emulate_ctxt(vcpu);
 
-	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
 				   tss_selector, reason, has_error_code,
 				   error_code);
 
@@ -5078,6 +5479,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return EMULATE_DONE;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5113,7 +5515,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
 	if (!is_long_mode(vcpu) && is_pae(vcpu)) {
-		load_pdptrs(vcpu, vcpu->arch.cr3);
+		load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
 		mmu_reset_needed = 1;
 	}
 
@@ -5148,6 +5550,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	    !is_protmode(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return 0;
 }
 
@@ -5334,6 +5738,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 						unsigned int id)
 {
+	if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+		printk_once(KERN_WARNING
+		"kvm: SMP vm created on host with unstable TSC; "
+		"guest TSC will not be reliable\n");
 	return kvm_x86_ops->vcpu_create(kvm, id);
 }
 
@@ -5376,22 +5784,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.dr6 = DR6_FIXED_1;
 	vcpu->arch.dr7 = DR7_FIXED_1;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
 int kvm_arch_hardware_enable(void *garbage)
 {
-	/*
-	 * Since this may be called from a hotplug notifcation,
-	 * we can't get the CPU frequency directly.
-	 */
-	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-		int cpu = raw_smp_processor_id();
-		per_cpu(cpu_tsc_khz, cpu) = 0;
-	}
+	struct kvm *kvm;
+	struct kvm_vcpu *vcpu;
+	int i;
 
 	kvm_shared_msr_cpu_online();
-
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			if (vcpu->cpu == smp_processor_id())
+				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 	return kvm_x86_ops->hardware_enable(garbage);
 }
 
@@ -5425,7 +5833,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	BUG_ON(vcpu->kvm == NULL);
 	kvm = vcpu->kvm;
 
+	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.translate_gpa = translate_gpa;
+	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
@@ -5438,6 +5850,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.pio_data = page_address(page);
 
+	if (!kvm->arch.virtual_tsc_khz)
+		kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
+
 	r = kvm_mmu_create(vcpu);
 	if (r < 0)
 		goto fail_free_pio_data;
@@ -5497,7 +5912,7 @@ struct  kvm *kvm_arch_create_vm(void)
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
 	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
 
-	rdtscll(kvm->arch.vm_init_tsc);
+	spin_lock_init(&kvm->arch.tsc_write_lock);
 
 	return kvm;
 }
@@ -5684,6 +6099,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
 		rflags |= X86_EFLAGS_TF;
 	kvm_x86_ops->set_rflags(vcpu, rflags);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..2cea414489f3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
 #endif
 }
 
+static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
+}
+
 static inline int is_pae(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -67,5 +72,8 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
 
 #endif
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e573077f1672..e4bac29a32e7 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -23,13 +23,12 @@ comment "Crypto core or helper"
 
 config CRYPTO_FIPS
 	bool "FIPS 200 compliance"
-	depends on CRYPTO_ANSI_CPRNG
+	depends on CRYPTO_ANSI_CPRNG && !CRYPTO_MANAGER_DISABLE_TESTS
 	help
 	  This options enables the fips boot option which is
 	  required if you want to system to operate in a FIPS 200
 	  certification.  You should say no unless you know what
-	  this is. Note that CRYPTO_ANSI_CPRNG is required if this
-	  option is selected
+	  this is.
 
 config CRYPTO_ALGAPI
 	tristate
@@ -365,7 +364,7 @@ config CRYPTO_RMD128
 	  RIPEMD-160 should be used.
 
 	  Developed by Hans Dobbertin, Antoon Bosselaers and Bart Preneel.
-	  See <http://home.esat.kuleuven.be/~bosselae/ripemd160.html>
+	  See <http://homes.esat.kuleuven.be/~bosselae/ripemd160.html>
 
 config CRYPTO_RMD160
 	tristate "RIPEMD-160 digest algorithm"
@@ -382,7 +381,7 @@ config CRYPTO_RMD160
 	  against RIPEMD-160.
 
 	  Developed by Hans Dobbertin, Antoon Bosselaers and Bart Preneel.
-	  See <http://home.esat.kuleuven.be/~bosselae/ripemd160.html>
+	  See <http://homes.esat.kuleuven.be/~bosselae/ripemd160.html>
 
 config CRYPTO_RMD256
 	tristate "RIPEMD-256 digest algorithm"
@@ -394,7 +393,7 @@ config CRYPTO_RMD256
 	  (than RIPEMD-128).
 
 	  Developed by Hans Dobbertin, Antoon Bosselaers and Bart Preneel.
-	  See <http://home.esat.kuleuven.be/~bosselae/ripemd160.html>
+	  See <http://homes.esat.kuleuven.be/~bosselae/ripemd160.html>
 
 config CRYPTO_RMD320
 	tristate "RIPEMD-320 digest algorithm"
@@ -406,7 +405,7 @@ config CRYPTO_RMD320
 	  (than RIPEMD-160).
 
 	  Developed by Hans Dobbertin, Antoon Bosselaers and Bart Preneel.
-	  See <http://home.esat.kuleuven.be/~bosselae/ripemd160.html>
+	  See <http://homes.esat.kuleuven.be/~bosselae/ripemd160.html>
 
 config CRYPTO_SHA1
 	tristate "SHA1 digest algorithm"
@@ -461,7 +460,7 @@ config CRYPTO_WP512
 	  Whirlpool will be part of the ISO/IEC 10118-3:2003(E) standard
 
 	  See also:
-	  <http://planeta.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html>
+	  <http://www.larc.usp.br/~pbarreto/WhirlpoolPage.html>
 
 config CRYPTO_GHASH_CLMUL_NI_INTEL
 	tristate "GHASH digest algorithm (CLMUL-NI accelerated)"
@@ -579,8 +578,8 @@ config CRYPTO_ANUBIS
 	  in the NESSIE competition.
 
 	  See also:
-	  <https://www.cosic.esat.kuleuven.ac.be/nessie/reports/>
-	  <http://planeta.terra.com.br/informatica/paulobarreto/AnubisPage.html>
+	  <https://www.cosic.esat.kuleuven.be/nessie/reports/>
+	  <http://www.larc.usp.br/~pbarreto/AnubisPage.html>
 
 config CRYPTO_ARC4
 	tristate "ARC4 cipher algorithm"
@@ -659,7 +658,7 @@ config CRYPTO_KHAZAD
 	  on 32-bit processors.  Khazad uses an 128 bit key size.
 
 	  See also:
-	  <http://planeta.terra.com.br/informatica/paulobarreto/KhazadPage.html>
+	  <http://www.larc.usp.br/~pbarreto/KhazadPage.html>
 
 config CRYPTO_SALSA20
 	tristate "Salsa20 stream cipher algorithm (EXPERIMENTAL)"
diff --git a/crypto/cryptd.c b/crypto/cryptd.c
index ef71318976c7..e46d21ae26bc 100644
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -3,6 +3,13 @@
  *
  * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
  *
+ * Added AEAD support to cryptd.
+ *    Authors: Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
  * Software Foundation; either version 2 of the License, or (at your option)
@@ -12,6 +19,7 @@
 
 #include <crypto/algapi.h>
 #include <crypto/internal/hash.h>
+#include <crypto/internal/aead.h>
 #include <crypto/cryptd.h>
 #include <crypto/crypto_wq.h>
 #include <linux/err.h>
@@ -44,6 +52,11 @@ struct hashd_instance_ctx {
 	struct cryptd_queue *queue;
 };
 
+struct aead_instance_ctx {
+	struct crypto_aead_spawn aead_spawn;
+	struct cryptd_queue *queue;
+};
+
 struct cryptd_blkcipher_ctx {
 	struct crypto_blkcipher *child;
 };
@@ -61,6 +74,14 @@ struct cryptd_hash_request_ctx {
 	struct shash_desc desc;
 };
 
+struct cryptd_aead_ctx {
+	struct crypto_aead *child;
+};
+
+struct cryptd_aead_request_ctx {
+	crypto_completion_t complete;
+};
+
 static void cryptd_queue_worker(struct work_struct *work);
 
 static int cryptd_init_queue(struct cryptd_queue *queue,
@@ -601,6 +622,144 @@ out_put_alg:
 	return err;
 }
 
+static void cryptd_aead_crypt(struct aead_request *req,
+			struct crypto_aead *child,
+			int err,
+			int (*crypt)(struct aead_request *req))
+{
+	struct cryptd_aead_request_ctx *rctx;
+	rctx = aead_request_ctx(req);
+
+	if (unlikely(err == -EINPROGRESS))
+		goto out;
+	aead_request_set_tfm(req, child);
+	err = crypt( req );
+	req->base.complete = rctx->complete;
+out:
+	local_bh_disable();
+	rctx->complete(&req->base, err);
+	local_bh_enable();
+}
+
+static void cryptd_aead_encrypt(struct crypto_async_request *areq, int err)
+{
+	struct cryptd_aead_ctx *ctx = crypto_tfm_ctx(areq->tfm);
+	struct crypto_aead *child = ctx->child;
+	struct aead_request *req;
+
+	req = container_of(areq, struct aead_request, base);
+	cryptd_aead_crypt(req, child, err, crypto_aead_crt(child)->encrypt);
+}
+
+static void cryptd_aead_decrypt(struct crypto_async_request *areq, int err)
+{
+	struct cryptd_aead_ctx *ctx = crypto_tfm_ctx(areq->tfm);
+	struct crypto_aead *child = ctx->child;
+	struct aead_request *req;
+
+	req = container_of(areq, struct aead_request, base);
+	cryptd_aead_crypt(req, child, err, crypto_aead_crt(child)->decrypt);
+}
+
+static int cryptd_aead_enqueue(struct aead_request *req,
+				    crypto_completion_t complete)
+{
+	struct cryptd_aead_request_ctx *rctx = aead_request_ctx(req);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct cryptd_queue *queue = cryptd_get_queue(crypto_aead_tfm(tfm));
+
+	rctx->complete = req->base.complete;
+	req->base.complete = complete;
+	return cryptd_enqueue_request(queue, &req->base);
+}
+
+static int cryptd_aead_encrypt_enqueue(struct aead_request *req)
+{
+	return cryptd_aead_enqueue(req, cryptd_aead_encrypt );
+}
+
+static int cryptd_aead_decrypt_enqueue(struct aead_request *req)
+{
+	return cryptd_aead_enqueue(req, cryptd_aead_decrypt );
+}
+
+static int cryptd_aead_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
+	struct aead_instance_ctx *ictx = crypto_instance_ctx(inst);
+	struct crypto_aead_spawn *spawn = &ictx->aead_spawn;
+	struct cryptd_aead_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct crypto_aead *cipher;
+
+	cipher = crypto_spawn_aead(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	crypto_aead_set_flags(cipher, CRYPTO_TFM_REQ_MAY_SLEEP);
+	ctx->child = cipher;
+	tfm->crt_aead.reqsize = sizeof(struct cryptd_aead_request_ctx);
+	return 0;
+}
+
+static void cryptd_aead_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct cryptd_aead_ctx *ctx = crypto_tfm_ctx(tfm);
+	crypto_free_aead(ctx->child);
+}
+
+static int cryptd_create_aead(struct crypto_template *tmpl,
+		              struct rtattr **tb,
+			      struct cryptd_queue *queue)
+{
+	struct aead_instance_ctx *ctx;
+	struct crypto_instance *inst;
+	struct crypto_alg *alg;
+	int err;
+
+	alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_AEAD,
+				CRYPTO_ALG_TYPE_MASK);
+        if (IS_ERR(alg))
+		return PTR_ERR(alg);
+
+	inst = cryptd_alloc_instance(alg, 0, sizeof(*ctx));
+	err = PTR_ERR(inst);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	ctx = crypto_instance_ctx(inst);
+	ctx->queue = queue;
+
+	err = crypto_init_spawn(&ctx->aead_spawn.base, alg, inst,
+			CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_ASYNC);
+	if (err)
+		goto out_free_inst;
+
+	inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC;
+	inst->alg.cra_type = alg->cra_type;
+	inst->alg.cra_ctxsize = sizeof(struct cryptd_aead_ctx);
+	inst->alg.cra_init = cryptd_aead_init_tfm;
+	inst->alg.cra_exit = cryptd_aead_exit_tfm;
+	inst->alg.cra_aead.setkey      = alg->cra_aead.setkey;
+	inst->alg.cra_aead.setauthsize = alg->cra_aead.setauthsize;
+	inst->alg.cra_aead.geniv       = alg->cra_aead.geniv;
+	inst->alg.cra_aead.ivsize      = alg->cra_aead.ivsize;
+	inst->alg.cra_aead.maxauthsize = alg->cra_aead.maxauthsize;
+	inst->alg.cra_aead.encrypt     = cryptd_aead_encrypt_enqueue;
+	inst->alg.cra_aead.decrypt     = cryptd_aead_decrypt_enqueue;
+	inst->alg.cra_aead.givencrypt  = alg->cra_aead.givencrypt;
+	inst->alg.cra_aead.givdecrypt  = alg->cra_aead.givdecrypt;
+
+	err = crypto_register_instance(tmpl, inst);
+	if (err) {
+		crypto_drop_spawn(&ctx->aead_spawn.base);
+out_free_inst:
+		kfree(inst);
+	}
+out_put_alg:
+	crypto_mod_put(alg);
+	return err;
+}
+
 static struct cryptd_queue queue;
 
 static int cryptd_create(struct crypto_template *tmpl, struct rtattr **tb)
@@ -616,6 +775,8 @@ static int cryptd_create(struct crypto_template *tmpl, struct rtattr **tb)
 		return cryptd_create_blkcipher(tmpl, tb, &queue);
 	case CRYPTO_ALG_TYPE_DIGEST:
 		return cryptd_create_hash(tmpl, tb, &queue);
+	case CRYPTO_ALG_TYPE_AEAD:
+		return cryptd_create_aead(tmpl, tb, &queue);
 	}
 
 	return -EINVAL;
@@ -625,16 +786,21 @@ static void cryptd_free(struct crypto_instance *inst)
 {
 	struct cryptd_instance_ctx *ctx = crypto_instance_ctx(inst);
 	struct hashd_instance_ctx *hctx = crypto_instance_ctx(inst);
+	struct aead_instance_ctx *aead_ctx = crypto_instance_ctx(inst);
 
 	switch (inst->alg.cra_flags & CRYPTO_ALG_TYPE_MASK) {
 	case CRYPTO_ALG_TYPE_AHASH:
 		crypto_drop_shash(&hctx->spawn);
 		kfree(ahash_instance(inst));
 		return;
+	case CRYPTO_ALG_TYPE_AEAD:
+		crypto_drop_spawn(&aead_ctx->aead_spawn.base);
+		kfree(inst);
+		return;
+	default:
+		crypto_drop_spawn(&ctx->spawn);
+		kfree(inst);
 	}
-
-	crypto_drop_spawn(&ctx->spawn);
-	kfree(inst);
 }
 
 static struct crypto_template cryptd_tmpl = {
@@ -724,6 +890,40 @@ void cryptd_free_ahash(struct cryptd_ahash *tfm)
 }
 EXPORT_SYMBOL_GPL(cryptd_free_ahash);
 
+struct cryptd_aead *cryptd_alloc_aead(const char *alg_name,
+						  u32 type, u32 mask)
+{
+	char cryptd_alg_name[CRYPTO_MAX_ALG_NAME];
+	struct crypto_aead *tfm;
+
+	if (snprintf(cryptd_alg_name, CRYPTO_MAX_ALG_NAME,
+		     "cryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME)
+		return ERR_PTR(-EINVAL);
+	tfm = crypto_alloc_aead(cryptd_alg_name, type, mask);
+	if (IS_ERR(tfm))
+		return ERR_CAST(tfm);
+	if (tfm->base.__crt_alg->cra_module != THIS_MODULE) {
+		crypto_free_aead(tfm);
+		return ERR_PTR(-EINVAL);
+	}
+	return __cryptd_aead_cast(tfm);
+}
+EXPORT_SYMBOL_GPL(cryptd_alloc_aead);
+
+struct crypto_aead *cryptd_aead_child(struct cryptd_aead *tfm)
+{
+	struct cryptd_aead_ctx *ctx;
+	ctx = crypto_aead_ctx(&tfm->base);
+	return ctx->child;
+}
+EXPORT_SYMBOL_GPL(cryptd_aead_child);
+
+void cryptd_free_aead(struct cryptd_aead *tfm)
+{
+	crypto_free_aead(&tfm->base);
+}
+EXPORT_SYMBOL_GPL(cryptd_free_aead);
+
 static int __init cryptd_init(void)
 {
 	int err;
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index ea0b3863ad0f..eab2cf7a0269 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -172,6 +172,7 @@ config CRYPTO_DEV_MV_CESA
 
 config CRYPTO_DEV_NIAGARA2
        tristate "Niagara2 Stream Processing Unit driver"
+       select CRYPTO_DES
        select CRYPTO_ALGAPI
        depends on SPARC64
        help
@@ -243,4 +244,12 @@ config CRYPTO_DEV_OMAP_SHAM
 	  OMAP processors have SHA1/MD5 hw accelerator. Select this if you
 	  want to use the OMAP module for SHA1/MD5 algorithms.
 
+config CRYPTO_DEV_OMAP_AES
+	tristate "Support for OMAP AES hw engine"
+	depends on ARCH_OMAP2 || ARCH_OMAP3
+	select CRYPTO_AES
+	help
+	  OMAP processors have AES module accelerator. Select this if you
+	  want to use the OMAP module for AES algorithms.
+
 endif # CRYPTO_HW
diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile
index 6dbbe00c4524..256697330a41 100644
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
@@ -2,11 +2,12 @@ obj-$(CONFIG_CRYPTO_DEV_PADLOCK_AES) += padlock-aes.o
 obj-$(CONFIG_CRYPTO_DEV_PADLOCK_SHA) += padlock-sha.o
 obj-$(CONFIG_CRYPTO_DEV_GEODE) += geode-aes.o
 obj-$(CONFIG_CRYPTO_DEV_NIAGARA2) += n2_crypto.o
-n2_crypto-objs := n2_core.o n2_asm.o
+n2_crypto-y := n2_core.o n2_asm.o
 obj-$(CONFIG_CRYPTO_DEV_HIFN_795X) += hifn_795x.o
 obj-$(CONFIG_CRYPTO_DEV_MV_CESA) += mv_cesa.o
 obj-$(CONFIG_CRYPTO_DEV_TALITOS) += talitos.o
 obj-$(CONFIG_CRYPTO_DEV_IXP4XX) += ixp4xx_crypto.o
 obj-$(CONFIG_CRYPTO_DEV_PPC4XX) += amcc/
 obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o
+obj-$(CONFIG_CRYPTO_DEV_OMAP_AES) += omap-aes.o
 
diff --git a/drivers/crypto/amcc/Makefile b/drivers/crypto/amcc/Makefile
index aa376e8d5ed5..5c0c62b65d69 100644
--- a/drivers/crypto/amcc/Makefile
+++ b/drivers/crypto/amcc/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_CRYPTO_DEV_PPC4XX) += crypto4xx.o
-crypto4xx-objs :=  crypto4xx_core.o crypto4xx_alg.o crypto4xx_sa.o
+crypto4xx-y :=  crypto4xx_core.o crypto4xx_alg.o crypto4xx_sa.o
diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c
index e449ac5627a5..0eac3da566ba 100644
--- a/drivers/crypto/hifn_795x.c
+++ b/drivers/crypto/hifn_795x.c
@@ -2700,8 +2700,7 @@ static void __devexit hifn_remove(struct pci_dev *pdev)
 	dev = pci_get_drvdata(pdev);
 
 	if (dev) {
-		cancel_delayed_work(&dev->work);
-		flush_scheduled_work();
+		cancel_delayed_work_sync(&dev->work);
 
 		hifn_unregister_rng(dev);
 		hifn_unregister_alg(dev);
diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c
new file mode 100644
index 000000000000..799ca517c121
--- /dev/null
+++ b/drivers/crypto/omap-aes.c
@@ -0,0 +1,948 @@
+/*
+ * Cryptographic API.
+ *
+ * Support for OMAP AES HW acceleration.
+ *
+ * Copyright (c) 2010 Nokia Corporation
+ * Author: Dmitry Kasatkin <dmitry.kasatkin@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/clk.h>
+#include <linux/platform_device.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
+#include <linux/io.h>
+#include <linux/crypto.h>
+#include <linux/interrupt.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/aes.h>
+
+#include <plat/cpu.h>
+#include <plat/dma.h>
+
+/* OMAP TRM gives bitfields as start:end, where start is the higher bit
+   number. For example 7:0 */
+#define FLD_MASK(start, end)	(((1 << ((start) - (end) + 1)) - 1) << (end))
+#define FLD_VAL(val, start, end) (((val) << (end)) & FLD_MASK(start, end))
+
+#define AES_REG_KEY(x)			(0x1C - ((x ^ 0x01) * 0x04))
+#define AES_REG_IV(x)			(0x20 + ((x) * 0x04))
+
+#define AES_REG_CTRL			0x30
+#define AES_REG_CTRL_CTR_WIDTH		(1 << 7)
+#define AES_REG_CTRL_CTR		(1 << 6)
+#define AES_REG_CTRL_CBC		(1 << 5)
+#define AES_REG_CTRL_KEY_SIZE		(3 << 3)
+#define AES_REG_CTRL_DIRECTION		(1 << 2)
+#define AES_REG_CTRL_INPUT_READY	(1 << 1)
+#define AES_REG_CTRL_OUTPUT_READY	(1 << 0)
+
+#define AES_REG_DATA			0x34
+#define AES_REG_DATA_N(x)		(0x34 + ((x) * 0x04))
+
+#define AES_REG_REV			0x44
+#define AES_REG_REV_MAJOR		0xF0
+#define AES_REG_REV_MINOR		0x0F
+
+#define AES_REG_MASK			0x48
+#define AES_REG_MASK_SIDLE		(1 << 6)
+#define AES_REG_MASK_START		(1 << 5)
+#define AES_REG_MASK_DMA_OUT_EN		(1 << 3)
+#define AES_REG_MASK_DMA_IN_EN		(1 << 2)
+#define AES_REG_MASK_SOFTRESET		(1 << 1)
+#define AES_REG_AUTOIDLE		(1 << 0)
+
+#define AES_REG_SYSSTATUS		0x4C
+#define AES_REG_SYSSTATUS_RESETDONE	(1 << 0)
+
+#define DEFAULT_TIMEOUT		(5*HZ)
+
+#define FLAGS_MODE_MASK		0x000f
+#define FLAGS_ENCRYPT		BIT(0)
+#define FLAGS_CBC		BIT(1)
+#define FLAGS_GIV		BIT(2)
+
+#define FLAGS_NEW_KEY		BIT(4)
+#define FLAGS_NEW_IV		BIT(5)
+#define FLAGS_INIT		BIT(6)
+#define FLAGS_FAST		BIT(7)
+#define FLAGS_BUSY		8
+
+struct omap_aes_ctx {
+	struct omap_aes_dev *dd;
+
+	int		keylen;
+	u32		key[AES_KEYSIZE_256 / sizeof(u32)];
+	unsigned long	flags;
+};
+
+struct omap_aes_reqctx {
+	unsigned long mode;
+};
+
+#define OMAP_AES_QUEUE_LENGTH	1
+#define OMAP_AES_CACHE_SIZE	0
+
+struct omap_aes_dev {
+	struct list_head	list;
+	unsigned long		phys_base;
+	void __iomem 		*io_base;
+	struct clk		*iclk;
+	struct omap_aes_ctx	*ctx;
+	struct device		*dev;
+	unsigned long		flags;
+
+	u32			*iv;
+	u32			ctrl;
+
+	spinlock_t			lock;
+	struct crypto_queue		queue;
+
+	struct tasklet_struct		task;
+
+	struct ablkcipher_request	*req;
+	size_t				total;
+	struct scatterlist		*in_sg;
+	size_t				in_offset;
+	struct scatterlist		*out_sg;
+	size_t				out_offset;
+
+	size_t			buflen;
+	void			*buf_in;
+	size_t			dma_size;
+	int			dma_in;
+	int			dma_lch_in;
+	dma_addr_t		dma_addr_in;
+	void			*buf_out;
+	int			dma_out;
+	int			dma_lch_out;
+	dma_addr_t		dma_addr_out;
+};
+
+/* keep registered devices data here */
+static LIST_HEAD(dev_list);
+static DEFINE_SPINLOCK(list_lock);
+
+static inline u32 omap_aes_read(struct omap_aes_dev *dd, u32 offset)
+{
+	return __raw_readl(dd->io_base + offset);
+}
+
+static inline void omap_aes_write(struct omap_aes_dev *dd, u32 offset,
+				  u32 value)
+{
+	__raw_writel(value, dd->io_base + offset);
+}
+
+static inline void omap_aes_write_mask(struct omap_aes_dev *dd, u32 offset,
+					u32 value, u32 mask)
+{
+	u32 val;
+
+	val = omap_aes_read(dd, offset);
+	val &= ~mask;
+	val |= value;
+	omap_aes_write(dd, offset, val);
+}
+
+static void omap_aes_write_n(struct omap_aes_dev *dd, u32 offset,
+					u32 *value, int count)
+{
+	for (; count--; value++, offset += 4)
+		omap_aes_write(dd, offset, *value);
+}
+
+static int omap_aes_wait(struct omap_aes_dev *dd, u32 offset, u32 bit)
+{
+	unsigned long timeout = jiffies + DEFAULT_TIMEOUT;
+
+	while (!(omap_aes_read(dd, offset) & bit)) {
+		if (time_is_before_jiffies(timeout)) {
+			dev_err(dd->dev, "omap-aes timeout\n");
+			return -ETIMEDOUT;
+		}
+	}
+	return 0;
+}
+
+static int omap_aes_hw_init(struct omap_aes_dev *dd)
+{
+	int err = 0;
+
+	clk_enable(dd->iclk);
+	if (!(dd->flags & FLAGS_INIT)) {
+		/* is it necessary to reset before every operation? */
+		omap_aes_write_mask(dd, AES_REG_MASK, AES_REG_MASK_SOFTRESET,
+					AES_REG_MASK_SOFTRESET);
+		/*
+		 * prevent OCP bus error (SRESP) in case an access to the module
+		 * is performed while the module is coming out of soft reset
+		 */
+		__asm__ __volatile__("nop");
+		__asm__ __volatile__("nop");
+
+		err = omap_aes_wait(dd, AES_REG_SYSSTATUS,
+				AES_REG_SYSSTATUS_RESETDONE);
+		if (!err)
+			dd->flags |= FLAGS_INIT;
+	}
+
+	return err;
+}
+
+static void omap_aes_hw_cleanup(struct omap_aes_dev *dd)
+{
+	clk_disable(dd->iclk);
+}
+
+static void omap_aes_write_ctrl(struct omap_aes_dev *dd)
+{
+	unsigned int key32;
+	int i;
+	u32 val, mask;
+
+	val = FLD_VAL(((dd->ctx->keylen >> 3) - 1), 4, 3);
+	if (dd->flags & FLAGS_CBC)
+		val |= AES_REG_CTRL_CBC;
+	if (dd->flags & FLAGS_ENCRYPT)
+		val |= AES_REG_CTRL_DIRECTION;
+
+	if (dd->ctrl == val && !(dd->flags & FLAGS_NEW_IV) &&
+		   !(dd->ctx->flags & FLAGS_NEW_KEY))
+		goto out;
+
+	/* only need to write control registers for new settings */
+
+	dd->ctrl = val;
+
+	val = 0;
+	if (dd->dma_lch_out >= 0)
+		val |= AES_REG_MASK_DMA_OUT_EN;
+	if (dd->dma_lch_in >= 0)
+		val |= AES_REG_MASK_DMA_IN_EN;
+
+	mask = AES_REG_MASK_DMA_IN_EN | AES_REG_MASK_DMA_OUT_EN;
+
+	omap_aes_write_mask(dd, AES_REG_MASK, val, mask);
+
+	pr_debug("Set key\n");
+	key32 = dd->ctx->keylen / sizeof(u32);
+	/* set a key */
+	for (i = 0; i < key32; i++) {
+		omap_aes_write(dd, AES_REG_KEY(i),
+			__le32_to_cpu(dd->ctx->key[i]));
+	}
+	dd->ctx->flags &= ~FLAGS_NEW_KEY;
+
+	if (dd->flags & FLAGS_NEW_IV) {
+		pr_debug("Set IV\n");
+		omap_aes_write_n(dd, AES_REG_IV(0), dd->iv, 4);
+		dd->flags &= ~FLAGS_NEW_IV;
+	}
+
+	mask = AES_REG_CTRL_CBC | AES_REG_CTRL_DIRECTION |
+			AES_REG_CTRL_KEY_SIZE;
+
+	omap_aes_write_mask(dd, AES_REG_CTRL, dd->ctrl, mask);
+
+out:
+	/* start DMA or disable idle mode */
+	omap_aes_write_mask(dd, AES_REG_MASK, AES_REG_MASK_START,
+			    AES_REG_MASK_START);
+}
+
+static struct omap_aes_dev *omap_aes_find_dev(struct omap_aes_ctx *ctx)
+{
+	struct omap_aes_dev *dd = NULL, *tmp;
+
+	spin_lock_bh(&list_lock);
+	if (!ctx->dd) {
+		list_for_each_entry(tmp, &dev_list, list) {
+			/* FIXME: take fist available aes core */
+			dd = tmp;
+			break;
+		}
+		ctx->dd = dd;
+	} else {
+		/* already found before */
+		dd = ctx->dd;
+	}
+	spin_unlock_bh(&list_lock);
+
+	return dd;
+}
+
+static void omap_aes_dma_callback(int lch, u16 ch_status, void *data)
+{
+	struct omap_aes_dev *dd = data;
+
+	if (lch == dd->dma_lch_out)
+		tasklet_schedule(&dd->task);
+}
+
+static int omap_aes_dma_init(struct omap_aes_dev *dd)
+{
+	int err = -ENOMEM;
+
+	dd->dma_lch_out = -1;
+	dd->dma_lch_in = -1;
+
+	dd->buf_in = (void *)__get_free_pages(GFP_KERNEL, OMAP_AES_CACHE_SIZE);
+	dd->buf_out = (void *)__get_free_pages(GFP_KERNEL, OMAP_AES_CACHE_SIZE);
+	dd->buflen = PAGE_SIZE << OMAP_AES_CACHE_SIZE;
+	dd->buflen &= ~(AES_BLOCK_SIZE - 1);
+
+	if (!dd->buf_in || !dd->buf_out) {
+		dev_err(dd->dev, "unable to alloc pages.\n");
+		goto err_alloc;
+	}
+
+	/* MAP here */
+	dd->dma_addr_in = dma_map_single(dd->dev, dd->buf_in, dd->buflen,
+					 DMA_TO_DEVICE);
+	if (dma_mapping_error(dd->dev, dd->dma_addr_in)) {
+		dev_err(dd->dev, "dma %d bytes error\n", dd->buflen);
+		err = -EINVAL;
+		goto err_map_in;
+	}
+
+	dd->dma_addr_out = dma_map_single(dd->dev, dd->buf_out, dd->buflen,
+					  DMA_FROM_DEVICE);
+	if (dma_mapping_error(dd->dev, dd->dma_addr_out)) {
+		dev_err(dd->dev, "dma %d bytes error\n", dd->buflen);
+		err = -EINVAL;
+		goto err_map_out;
+	}
+
+	err = omap_request_dma(dd->dma_in, "omap-aes-rx",
+			       omap_aes_dma_callback, dd, &dd->dma_lch_in);
+	if (err) {
+		dev_err(dd->dev, "Unable to request DMA channel\n");
+		goto err_dma_in;
+	}
+	err = omap_request_dma(dd->dma_out, "omap-aes-tx",
+			       omap_aes_dma_callback, dd, &dd->dma_lch_out);
+	if (err) {
+		dev_err(dd->dev, "Unable to request DMA channel\n");
+		goto err_dma_out;
+	}
+
+	omap_set_dma_dest_params(dd->dma_lch_in, 0, OMAP_DMA_AMODE_CONSTANT,
+				 dd->phys_base + AES_REG_DATA, 0, 4);
+
+	omap_set_dma_dest_burst_mode(dd->dma_lch_in, OMAP_DMA_DATA_BURST_4);
+	omap_set_dma_src_burst_mode(dd->dma_lch_in, OMAP_DMA_DATA_BURST_4);
+
+	omap_set_dma_src_params(dd->dma_lch_out, 0, OMAP_DMA_AMODE_CONSTANT,
+				dd->phys_base + AES_REG_DATA, 0, 4);
+
+	omap_set_dma_src_burst_mode(dd->dma_lch_out, OMAP_DMA_DATA_BURST_4);
+	omap_set_dma_dest_burst_mode(dd->dma_lch_out, OMAP_DMA_DATA_BURST_4);
+
+	return 0;
+
+err_dma_out:
+	omap_free_dma(dd->dma_lch_in);
+err_dma_in:
+	dma_unmap_single(dd->dev, dd->dma_addr_out, dd->buflen,
+			 DMA_FROM_DEVICE);
+err_map_out:
+	dma_unmap_single(dd->dev, dd->dma_addr_in, dd->buflen, DMA_TO_DEVICE);
+err_map_in:
+	free_pages((unsigned long)dd->buf_out, OMAP_AES_CACHE_SIZE);
+	free_pages((unsigned long)dd->buf_in, OMAP_AES_CACHE_SIZE);
+err_alloc:
+	if (err)
+		pr_err("error: %d\n", err);
+	return err;
+}
+
+static void omap_aes_dma_cleanup(struct omap_aes_dev *dd)
+{
+	omap_free_dma(dd->dma_lch_out);
+	omap_free_dma(dd->dma_lch_in);
+	dma_unmap_single(dd->dev, dd->dma_addr_out, dd->buflen,
+			 DMA_FROM_DEVICE);
+	dma_unmap_single(dd->dev, dd->dma_addr_in, dd->buflen, DMA_TO_DEVICE);
+	free_pages((unsigned long)dd->buf_out, OMAP_AES_CACHE_SIZE);
+	free_pages((unsigned long)dd->buf_in, OMAP_AES_CACHE_SIZE);
+}
+
+static void sg_copy_buf(void *buf, struct scatterlist *sg,
+			      unsigned int start, unsigned int nbytes, int out)
+{
+	struct scatter_walk walk;
+
+	if (!nbytes)
+		return;
+
+	scatterwalk_start(&walk, sg);
+	scatterwalk_advance(&walk, start);
+	scatterwalk_copychunks(buf, &walk, nbytes, out);
+	scatterwalk_done(&walk, out, 0);
+}
+
+static int sg_copy(struct scatterlist **sg, size_t *offset, void *buf,
+		   size_t buflen, size_t total, int out)
+{
+	unsigned int count, off = 0;
+
+	while (buflen && total) {
+		count = min((*sg)->length - *offset, total);
+		count = min(count, buflen);
+
+		if (!count)
+			return off;
+
+		sg_copy_buf(buf + off, *sg, *offset, count, out);
+
+		off += count;
+		buflen -= count;
+		*offset += count;
+		total -= count;
+
+		if (*offset == (*sg)->length) {
+			*sg = sg_next(*sg);
+			if (*sg)
+				*offset = 0;
+			else
+				total = 0;
+		}
+	}
+
+	return off;
+}
+
+static int omap_aes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
+			       dma_addr_t dma_addr_out, int length)
+{
+	struct omap_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct omap_aes_dev *dd = ctx->dd;
+	int len32;
+
+	pr_debug("len: %d\n", length);
+
+	dd->dma_size = length;
+
+	if (!(dd->flags & FLAGS_FAST))
+		dma_sync_single_for_device(dd->dev, dma_addr_in, length,
+					   DMA_TO_DEVICE);
+
+	len32 = DIV_ROUND_UP(length, sizeof(u32));
+
+	/* IN */
+	omap_set_dma_transfer_params(dd->dma_lch_in, OMAP_DMA_DATA_TYPE_S32,
+				     len32, 1, OMAP_DMA_SYNC_PACKET, dd->dma_in,
+					OMAP_DMA_DST_SYNC);
+
+	omap_set_dma_src_params(dd->dma_lch_in, 0, OMAP_DMA_AMODE_POST_INC,
+				dma_addr_in, 0, 0);
+
+	/* OUT */
+	omap_set_dma_transfer_params(dd->dma_lch_out, OMAP_DMA_DATA_TYPE_S32,
+				     len32, 1, OMAP_DMA_SYNC_PACKET,
+					dd->dma_out, OMAP_DMA_SRC_SYNC);
+
+	omap_set_dma_dest_params(dd->dma_lch_out, 0, OMAP_DMA_AMODE_POST_INC,
+				 dma_addr_out, 0, 0);
+
+	omap_start_dma(dd->dma_lch_in);
+	omap_start_dma(dd->dma_lch_out);
+
+	omap_aes_write_ctrl(dd);
+
+	return 0;
+}
+
+static int omap_aes_crypt_dma_start(struct omap_aes_dev *dd)
+{
+	struct crypto_tfm *tfm = crypto_ablkcipher_tfm(
+					crypto_ablkcipher_reqtfm(dd->req));
+	int err, fast = 0, in, out;
+	size_t count;
+	dma_addr_t addr_in, addr_out;
+
+	pr_debug("total: %d\n", dd->total);
+
+	if (sg_is_last(dd->in_sg) && sg_is_last(dd->out_sg)) {
+		/* check for alignment */
+		in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32));
+		out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32));
+
+		fast = in && out;
+	}
+
+	if (fast)  {
+		count = min(dd->total, sg_dma_len(dd->in_sg));
+		count = min(count, sg_dma_len(dd->out_sg));
+
+		if (count != dd->total)
+			return -EINVAL;
+
+		pr_debug("fast\n");
+
+		err = dma_map_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+		if (!err) {
+			dev_err(dd->dev, "dma_map_sg() error\n");
+			return -EINVAL;
+		}
+
+		err = dma_map_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE);
+		if (!err) {
+			dev_err(dd->dev, "dma_map_sg() error\n");
+			dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+			return -EINVAL;
+		}
+
+		addr_in = sg_dma_address(dd->in_sg);
+		addr_out = sg_dma_address(dd->out_sg);
+
+		dd->flags |= FLAGS_FAST;
+
+	} else {
+		/* use cache buffers */
+		count = sg_copy(&dd->in_sg, &dd->in_offset, dd->buf_in,
+				 dd->buflen, dd->total, 0);
+
+		addr_in = dd->dma_addr_in;
+		addr_out = dd->dma_addr_out;
+
+		dd->flags &= ~FLAGS_FAST;
+
+	}
+
+	dd->total -= count;
+
+	err = omap_aes_hw_init(dd);
+
+	err = omap_aes_crypt_dma(tfm, addr_in, addr_out, count);
+
+	return err;
+}
+
+static void omap_aes_finish_req(struct omap_aes_dev *dd, int err)
+{
+	struct omap_aes_ctx *ctx;
+
+	pr_debug("err: %d\n", err);
+
+	ctx = crypto_ablkcipher_ctx(crypto_ablkcipher_reqtfm(dd->req));
+
+	if (!dd->total)
+		dd->req->base.complete(&dd->req->base, err);
+}
+
+static int omap_aes_crypt_dma_stop(struct omap_aes_dev *dd)
+{
+	int err = 0;
+	size_t count;
+
+	pr_debug("total: %d\n", dd->total);
+
+	omap_aes_write_mask(dd, AES_REG_MASK, 0, AES_REG_MASK_START);
+
+	omap_aes_hw_cleanup(dd);
+
+	omap_stop_dma(dd->dma_lch_in);
+	omap_stop_dma(dd->dma_lch_out);
+
+	if (dd->flags & FLAGS_FAST) {
+		dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE);
+		dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+	} else {
+		dma_sync_single_for_device(dd->dev, dd->dma_addr_out,
+					   dd->dma_size, DMA_FROM_DEVICE);
+
+		/* copy data */
+		count = sg_copy(&dd->out_sg, &dd->out_offset, dd->buf_out,
+				 dd->buflen, dd->dma_size, 1);
+		if (count != dd->dma_size) {
+			err = -EINVAL;
+			pr_err("not all data converted: %u\n", count);
+		}
+	}
+
+	if (err || !dd->total)
+		omap_aes_finish_req(dd, err);
+
+	return err;
+}
+
+static int omap_aes_handle_req(struct omap_aes_dev *dd)
+{
+	struct crypto_async_request *async_req, *backlog;
+	struct omap_aes_ctx *ctx;
+	struct omap_aes_reqctx *rctx;
+	struct ablkcipher_request *req;
+	unsigned long flags;
+
+	if (dd->total)
+		goto start;
+
+	spin_lock_irqsave(&dd->lock, flags);
+	backlog = crypto_get_backlog(&dd->queue);
+	async_req = crypto_dequeue_request(&dd->queue);
+	if (!async_req)
+		clear_bit(FLAGS_BUSY, &dd->flags);
+	spin_unlock_irqrestore(&dd->lock, flags);
+
+	if (!async_req)
+		return 0;
+
+	if (backlog)
+		backlog->complete(backlog, -EINPROGRESS);
+
+	req = ablkcipher_request_cast(async_req);
+
+	pr_debug("get new req\n");
+
+	/* assign new request to device */
+	dd->req = req;
+	dd->total = req->nbytes;
+	dd->in_offset = 0;
+	dd->in_sg = req->src;
+	dd->out_offset = 0;
+	dd->out_sg = req->dst;
+
+	rctx = ablkcipher_request_ctx(req);
+	ctx = crypto_ablkcipher_ctx(crypto_ablkcipher_reqtfm(req));
+	rctx->mode &= FLAGS_MODE_MASK;
+	dd->flags = (dd->flags & ~FLAGS_MODE_MASK) | rctx->mode;
+
+	dd->iv = req->info;
+	if ((dd->flags & FLAGS_CBC) && dd->iv)
+		dd->flags |= FLAGS_NEW_IV;
+	else
+		dd->flags &= ~FLAGS_NEW_IV;
+
+	ctx->dd = dd;
+	if (dd->ctx != ctx) {
+		/* assign new context to device */
+		dd->ctx = ctx;
+		ctx->flags |= FLAGS_NEW_KEY;
+	}
+
+	if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE))
+		pr_err("request size is not exact amount of AES blocks\n");
+
+start:
+	return omap_aes_crypt_dma_start(dd);
+}
+
+static void omap_aes_task(unsigned long data)
+{
+	struct omap_aes_dev *dd = (struct omap_aes_dev *)data;
+	int err;
+
+	pr_debug("enter\n");
+
+	err = omap_aes_crypt_dma_stop(dd);
+
+	err = omap_aes_handle_req(dd);
+
+	pr_debug("exit\n");
+}
+
+static int omap_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
+{
+	struct omap_aes_ctx *ctx = crypto_ablkcipher_ctx(
+			crypto_ablkcipher_reqtfm(req));
+	struct omap_aes_reqctx *rctx = ablkcipher_request_ctx(req);
+	struct omap_aes_dev *dd;
+	unsigned long flags;
+	int err;
+
+	pr_debug("nbytes: %d, enc: %d, cbc: %d\n", req->nbytes,
+		  !!(mode & FLAGS_ENCRYPT),
+		  !!(mode & FLAGS_CBC));
+
+	dd = omap_aes_find_dev(ctx);
+	if (!dd)
+		return -ENODEV;
+
+	rctx->mode = mode;
+
+	spin_lock_irqsave(&dd->lock, flags);
+	err = ablkcipher_enqueue_request(&dd->queue, req);
+	spin_unlock_irqrestore(&dd->lock, flags);
+
+	if (!test_and_set_bit(FLAGS_BUSY, &dd->flags))
+		omap_aes_handle_req(dd);
+
+	pr_debug("exit\n");
+
+	return err;
+}
+
+/* ********************** ALG API ************************************ */
+
+static int omap_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
+			   unsigned int keylen)
+{
+	struct omap_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (keylen != AES_KEYSIZE_128 && keylen != AES_KEYSIZE_192 &&
+		   keylen != AES_KEYSIZE_256)
+		return -EINVAL;
+
+	pr_debug("enter, keylen: %d\n", keylen);
+
+	memcpy(ctx->key, key, keylen);
+	ctx->keylen = keylen;
+	ctx->flags |= FLAGS_NEW_KEY;
+
+	return 0;
+}
+
+static int omap_aes_ecb_encrypt(struct ablkcipher_request *req)
+{
+	return omap_aes_crypt(req, FLAGS_ENCRYPT);
+}
+
+static int omap_aes_ecb_decrypt(struct ablkcipher_request *req)
+{
+	return omap_aes_crypt(req, 0);
+}
+
+static int omap_aes_cbc_encrypt(struct ablkcipher_request *req)
+{
+	return omap_aes_crypt(req, FLAGS_ENCRYPT | FLAGS_CBC);
+}
+
+static int omap_aes_cbc_decrypt(struct ablkcipher_request *req)
+{
+	return omap_aes_crypt(req, FLAGS_CBC);
+}
+
+static int omap_aes_cra_init(struct crypto_tfm *tfm)
+{
+	pr_debug("enter\n");
+
+	tfm->crt_ablkcipher.reqsize = sizeof(struct omap_aes_reqctx);
+
+	return 0;
+}
+
+static void omap_aes_cra_exit(struct crypto_tfm *tfm)
+{
+	pr_debug("enter\n");
+}
+
+/* ********************** ALGS ************************************ */
+
+static struct crypto_alg algs[] = {
+{
+	.cra_name		= "ecb(aes)",
+	.cra_driver_name	= "ecb-aes-omap",
+	.cra_priority		= 100,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct omap_aes_ctx),
+	.cra_alignmask	 	= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= omap_aes_cra_init,
+	.cra_exit		= omap_aes_cra_exit,
+	.cra_u.ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.setkey		= omap_aes_setkey,
+		.encrypt	= omap_aes_ecb_encrypt,
+		.decrypt	= omap_aes_ecb_decrypt,
+	}
+},
+{
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "cbc-aes-omap",
+	.cra_priority		= 100,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct omap_aes_ctx),
+	.cra_alignmask	 	= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= omap_aes_cra_init,
+	.cra_exit		= omap_aes_cra_exit,
+	.cra_u.ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= omap_aes_setkey,
+		.encrypt	= omap_aes_cbc_encrypt,
+		.decrypt	= omap_aes_cbc_decrypt,
+	}
+}
+};
+
+static int omap_aes_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct omap_aes_dev *dd;
+	struct resource *res;
+	int err = -ENOMEM, i, j;
+	u32 reg;
+
+	dd = kzalloc(sizeof(struct omap_aes_dev), GFP_KERNEL);
+	if (dd == NULL) {
+		dev_err(dev, "unable to alloc data struct.\n");
+		goto err_data;
+	}
+	dd->dev = dev;
+	platform_set_drvdata(pdev, dd);
+
+	spin_lock_init(&dd->lock);
+	crypto_init_queue(&dd->queue, OMAP_AES_QUEUE_LENGTH);
+
+	/* Get the base address */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(dev, "invalid resource type\n");
+		err = -ENODEV;
+		goto err_res;
+	}
+	dd->phys_base = res->start;
+
+	/* Get the DMA */
+	res = platform_get_resource(pdev, IORESOURCE_DMA, 0);
+	if (!res)
+		dev_info(dev, "no DMA info\n");
+	else
+		dd->dma_out = res->start;
+
+	/* Get the DMA */
+	res = platform_get_resource(pdev, IORESOURCE_DMA, 1);
+	if (!res)
+		dev_info(dev, "no DMA info\n");
+	else
+		dd->dma_in = res->start;
+
+	/* Initializing the clock */
+	dd->iclk = clk_get(dev, "ick");
+	if (!dd->iclk) {
+		dev_err(dev, "clock intialization failed.\n");
+		err = -ENODEV;
+		goto err_res;
+	}
+
+	dd->io_base = ioremap(dd->phys_base, SZ_4K);
+	if (!dd->io_base) {
+		dev_err(dev, "can't ioremap\n");
+		err = -ENOMEM;
+		goto err_io;
+	}
+
+	clk_enable(dd->iclk);
+	reg = omap_aes_read(dd, AES_REG_REV);
+	dev_info(dev, "OMAP AES hw accel rev: %u.%u\n",
+		 (reg & AES_REG_REV_MAJOR) >> 4, reg & AES_REG_REV_MINOR);
+	clk_disable(dd->iclk);
+
+	tasklet_init(&dd->task, omap_aes_task, (unsigned long)dd);
+
+	err = omap_aes_dma_init(dd);
+	if (err)
+		goto err_dma;
+
+	INIT_LIST_HEAD(&dd->list);
+	spin_lock(&list_lock);
+	list_add_tail(&dd->list, &dev_list);
+	spin_unlock(&list_lock);
+
+	for (i = 0; i < ARRAY_SIZE(algs); i++) {
+		pr_debug("i: %d\n", i);
+		INIT_LIST_HEAD(&algs[i].cra_list);
+		err = crypto_register_alg(&algs[i]);
+		if (err)
+			goto err_algs;
+	}
+
+	pr_info("probe() done\n");
+
+	return 0;
+err_algs:
+	for (j = 0; j < i; j++)
+		crypto_unregister_alg(&algs[j]);
+	omap_aes_dma_cleanup(dd);
+err_dma:
+	tasklet_kill(&dd->task);
+	iounmap(dd->io_base);
+err_io:
+	clk_put(dd->iclk);
+err_res:
+	kfree(dd);
+	dd = NULL;
+err_data:
+	dev_err(dev, "initialization failed.\n");
+	return err;
+}
+
+static int omap_aes_remove(struct platform_device *pdev)
+{
+	struct omap_aes_dev *dd = platform_get_drvdata(pdev);
+	int i;
+
+	if (!dd)
+		return -ENODEV;
+
+	spin_lock(&list_lock);
+	list_del(&dd->list);
+	spin_unlock(&list_lock);
+
+	for (i = 0; i < ARRAY_SIZE(algs); i++)
+		crypto_unregister_alg(&algs[i]);
+
+	tasklet_kill(&dd->task);
+	omap_aes_dma_cleanup(dd);
+	iounmap(dd->io_base);
+	clk_put(dd->iclk);
+	kfree(dd);
+	dd = NULL;
+
+	return 0;
+}
+
+static struct platform_driver omap_aes_driver = {
+	.probe	= omap_aes_probe,
+	.remove	= omap_aes_remove,
+	.driver	= {
+		.name	= "omap-aes",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init omap_aes_mod_init(void)
+{
+	pr_info("loading %s driver\n", "omap-aes");
+
+	if (!cpu_class_is_omap2() || omap_type() != OMAP2_DEVICE_TYPE_SEC) {
+		pr_err("Unsupported cpu\n");
+		return -ENODEV;
+	}
+
+	return  platform_driver_register(&omap_aes_driver);
+}
+
+static void __exit omap_aes_mod_exit(void)
+{
+	platform_driver_unregister(&omap_aes_driver);
+}
+
+module_init(omap_aes_mod_init);
+module_exit(omap_aes_mod_exit);
+
+MODULE_DESCRIPTION("OMAP AES hw acceleration support.");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Dmitry Kasatkin");
+
diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c
index 7d1485676886..a081c7c7d03f 100644
--- a/drivers/crypto/omap-sham.c
+++ b/drivers/crypto/omap-sham.c
@@ -311,7 +311,8 @@ static int omap_sham_xmit_dma(struct omap_sham_dev *dd, dma_addr_t dma_addr,
 	len32 = DIV_ROUND_UP(length, sizeof(u32));
 
 	omap_set_dma_transfer_params(dd->dma_lch, OMAP_DMA_DATA_TYPE_S32, len32,
-			1, OMAP_DMA_SYNC_PACKET, dd->dma, OMAP_DMA_DST_SYNC);
+			1, OMAP_DMA_SYNC_PACKET, dd->dma,
+				OMAP_DMA_DST_SYNC_PREFETCH);
 
 	omap_set_dma_src_params(dd->dma_lch, 0, OMAP_DMA_AMODE_POST_INC,
 				dma_addr, 0, 0);
@@ -1072,6 +1073,9 @@ static int omap_sham_dma_init(struct omap_sham_dev *dd)
 	omap_set_dma_dest_burst_mode(dd->dma_lch,
 			OMAP_DMA_DATA_BURST_16);
 
+	omap_set_dma_src_burst_mode(dd->dma_lch,
+			OMAP_DMA_DATA_BURST_4);
+
 	return 0;
 }
 
diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 4bcd825b5739..b879c3f5d7c0 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -161,7 +161,7 @@ struct talitos_private {
 static void to_talitos_ptr(struct talitos_ptr *talitos_ptr, dma_addr_t dma_addr)
 {
 	talitos_ptr->ptr = cpu_to_be32(lower_32_bits(dma_addr));
-	talitos_ptr->eptr = cpu_to_be32(upper_32_bits(dma_addr));
+	talitos_ptr->eptr = upper_32_bits(dma_addr);
 }
 
 /*
@@ -332,10 +332,9 @@ static int talitos_submit(struct device *dev, struct talitos_desc *desc,
 
 	/* GO! */
 	wmb();
-	out_be32(priv->reg + TALITOS_FF(ch),
-		 cpu_to_be32(upper_32_bits(request->dma_desc)));
+	out_be32(priv->reg + TALITOS_FF(ch), upper_32_bits(request->dma_desc));
 	out_be32(priv->reg + TALITOS_FF_LO(ch),
-		 cpu_to_be32(lower_32_bits(request->dma_desc)));
+		 lower_32_bits(request->dma_desc));
 
 	spin_unlock_irqrestore(&priv->chan[ch].head_lock, flags);
 
@@ -1751,14 +1750,14 @@ static int ahash_init_sha224_swinit(struct ahash_request *areq)
 	ahash_init(areq);
 	req_ctx->swinit = 1;/* prevent h/w initting context with sha256 values*/
 
-	req_ctx->hw_context[0] = cpu_to_be32(SHA224_H0);
-	req_ctx->hw_context[1] = cpu_to_be32(SHA224_H1);
-	req_ctx->hw_context[2] = cpu_to_be32(SHA224_H2);
-	req_ctx->hw_context[3] = cpu_to_be32(SHA224_H3);
-	req_ctx->hw_context[4] = cpu_to_be32(SHA224_H4);
-	req_ctx->hw_context[5] = cpu_to_be32(SHA224_H5);
-	req_ctx->hw_context[6] = cpu_to_be32(SHA224_H6);
-	req_ctx->hw_context[7] = cpu_to_be32(SHA224_H7);
+	req_ctx->hw_context[0] = SHA224_H0;
+	req_ctx->hw_context[1] = SHA224_H1;
+	req_ctx->hw_context[2] = SHA224_H2;
+	req_ctx->hw_context[3] = SHA224_H3;
+	req_ctx->hw_context[4] = SHA224_H4;
+	req_ctx->hw_context[5] = SHA224_H5;
+	req_ctx->hw_context[6] = SHA224_H6;
+	req_ctx->hw_context[7] = SHA224_H7;
 
 	/* init 64-bit count */
 	req_ctx->hw_context[8] = 0;
@@ -2333,8 +2332,7 @@ static int talitos_remove(struct platform_device *ofdev)
 		talitos_unregister_rng(dev);
 
 	for (i = 0; i < priv->num_channels; i++)
-		if (priv->chan[i].fifo)
-			kfree(priv->chan[i].fifo);
+		kfree(priv->chan[i].fifo);
 
 	kfree(priv->chan);
 
@@ -2389,6 +2387,9 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev,
 					DESC_HDR_MODE0_MDEU_SHA256;
 		}
 		break;
+	default:
+		dev_err(dev, "unknown algorithm type %d\n", t_alg->algt.type);
+		return ERR_PTR(-EINVAL);
 	}
 
 	alg->cra_module = THIS_MODULE;
diff --git a/drivers/gpu/drm/nouveau/nouveau_i2c.h b/drivers/gpu/drm/nouveau/nouveau_i2c.h
index f71cb32f7571..cfe7c8426d1d 100644
--- a/drivers/gpu/drm/nouveau/nouveau_i2c.h
+++ b/drivers/gpu/drm/nouveau/nouveau_i2c.h
@@ -24,7 +24,6 @@
 #define __NOUVEAU_I2C_H__
 
 #include <linux/i2c.h>
-#include <linux/i2c-id.h>
 #include <linux/i2c-algo-bit.h>
 #include "drm_dp_helper.h"
 
diff --git a/drivers/gpu/drm/radeon/radeon_mode.h b/drivers/gpu/drm/radeon/radeon_mode.h
index 17a6602b5885..454c1dc7ea45 100644
--- a/drivers/gpu/drm/radeon/radeon_mode.h
+++ b/drivers/gpu/drm/radeon/radeon_mode.h
@@ -36,7 +36,6 @@
 #include <drm_dp_helper.h>
 #include <drm_fixed.h>
 #include <linux/i2c.h>
-#include <linux/i2c-id.h>
 #include <linux/i2c-algo-bit.h>
 
 struct radeon_bo;
diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig
index 6369ba7f96f8..3052e2969ad0 100644
--- a/drivers/hid/Kconfig
+++ b/drivers/hid/Kconfig
@@ -56,20 +56,20 @@ menu "Special HID drivers"
 	depends on HID
 
 config HID_3M_PCT
-	tristate "3M PCT"
+	tristate "3M PCT touchscreen"
 	depends on USB_HID
 	---help---
 	Support for 3M PCT touch screens.
 
 config HID_A4TECH
-	tristate "A4 tech" if EMBEDDED
+	tristate "A4 tech mice" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
 	Support for A4 tech X5 and WOP-35 / Trust 450L mice.
 
 config HID_ACRUX_FF
-	tristate "ACRUX force feedback support"
+	tristate "ACRUX force feedback"
 	depends on USB_HID
 	select INPUT_FF_MEMLESS
 	---help---
@@ -77,7 +77,7 @@ config HID_ACRUX_FF
 	game controllers.
 
 config HID_APPLE
-	tristate "Apple" if EMBEDDED
+	tristate "Apple {i,Power,Mac}Books" if EMBEDDED
 	depends on (USB_HID || BT_HIDP)
 	default !EMBEDDED
 	---help---
@@ -88,7 +88,7 @@ config HID_APPLE
 	MacBooks, MacBook Pros and Apple Aluminum.
 
 config HID_BELKIN
-	tristate "Belkin" if EMBEDDED
+	tristate "Belkin Flip KVM and Wireless keyboard" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
@@ -101,14 +101,14 @@ config HID_CANDO
 	Support for Cando dual touch panel.
 
 config HID_CHERRY
-	tristate "Cherry" if EMBEDDED
+	tristate "Cherry Cymotion keyboard" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
 	Support for Cherry Cymotion keyboard.
 
 config HID_CHICONY
-	tristate "Chicony" if EMBEDDED
+	tristate "Chicony Tactical pad" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
@@ -130,20 +130,20 @@ config HID_PRODIKEYS
 	  and some additional multimedia keys.
 
 config HID_CYPRESS
-	tristate "Cypress" if EMBEDDED
+	tristate "Cypress mouse and barcode readers" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
 	Support for cypress mouse and barcode readers.
 
 config HID_DRAGONRISE
-	tristate "DragonRise Inc. support"
+	tristate "DragonRise Inc. game controller"
 	depends on USB_HID
 	---help---
 	Say Y here if you have DragonRise Inc.game controllers.
 
 config DRAGONRISE_FF
-	bool "DragonRise Inc. force feedback support"
+	bool "DragonRise Inc. force feedback"
 	depends on HID_DRAGONRISE
 	select INPUT_FF_MEMLESS
 	---help---
@@ -157,46 +157,58 @@ config HID_EGALAX
 	Support for the eGalax dual-touch panel.
 
 config HID_ELECOM
-	tristate "ELECOM"
+	tristate "ELECOM BM084 bluetooth mouse"
 	depends on BT_HIDP
 	---help---
 	Support for the ELECOM BM084 (bluetooth mouse).
 
 config HID_EZKEY
-	tristate "Ezkey" if EMBEDDED
+	tristate "Ezkey BTC 8193 keyboard" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
 	Support for Ezkey BTC 8193 keyboard.
 
 config HID_KYE
-	tristate "Kye" if EMBEDDED
+	tristate "Kye/Genius Ergo Mouse" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
 	Support for Kye/Genius Ergo Mouse.
 
+config HID_UCLOGIC
+	tristate "UC-Logic"
+	depends on USB_HID
+	---help---
+	Support for UC-Logic tablets.
+
+config HID_WALTOP
+	tristate "Waltop"
+	depends on USB_HID
+	---help---
+	Support for Waltop tablets.
+
 config HID_GYRATION
-	tristate "Gyration"
+	tristate "Gyration remote control"
 	depends on USB_HID
 	---help---
 	Support for Gyration remote control.
 
 config HID_TWINHAN
-	tristate "Twinhan"
+	tristate "Twinhan IR remote control"
 	depends on USB_HID
 	---help---
 	Support for Twinhan IR remote control.
 
 config HID_KENSINGTON
-	tristate "Kensington" if EMBEDDED
+	tristate "Kensington Slimblade Trackball" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
 	Support for Kensington Slimblade Trackball.
 
 config HID_LOGITECH
-	tristate "Logitech" if EMBEDDED
+	tristate "Logitech devices" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
@@ -220,12 +232,12 @@ config LOGITECH_FF
 	  force feedback.
 
 config LOGIRUMBLEPAD2_FF
-	bool "Logitech Rumblepad 2 force feedback support"
+	bool "Logitech RumblePad/Rumblepad 2 force feedback support"
 	depends on HID_LOGITECH
 	select INPUT_FF_MEMLESS
 	help
 	  Say Y here if you want to enable force feedback support for Logitech
-	  Rumblepad 2 devices.
+	  RumblePad and Rumblepad 2 devices.
 
 config LOGIG940_FF
 	bool "Logitech Flight System G940 force feedback support"
@@ -235,6 +247,14 @@ config LOGIG940_FF
 	  Say Y here if you want to enable force feedback support for Logitech
 	  Flight System G940 devices.
 
+config LOGIWII_FF
+	bool "Logitech Speed Force Wireless force feedback support"
+	depends on HID_LOGITECH
+	select INPUT_FF_MEMLESS
+	help
+	  Say Y here if you want to enable force feedback support for Logitech
+	  Speed Force Wireless (Wii) devices.
+
 config HID_MAGICMOUSE
 	tristate "Apple MagicMouse multi-touch support"
 	depends on BT_HIDP
@@ -245,39 +265,39 @@ config HID_MAGICMOUSE
 	Apple Wireless "Magic" Mouse.
 
 config HID_MICROSOFT
-	tristate "Microsoft" if EMBEDDED
+	tristate "Microsoft non-fully HID-compliant devices" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
 	Support for Microsoft devices that are not fully compliant with HID standard.
 
 config HID_MOSART
-	tristate "MosArt"
+	tristate "MosArt dual-touch panels"
 	depends on USB_HID
 	---help---
 	Support for MosArt dual-touch panels.
 
 config HID_MONTEREY
-	tristate "Monterey" if EMBEDDED
+	tristate "Monterey Genius KB29E keyboard" if EMBEDDED
 	depends on USB_HID
 	default !EMBEDDED
 	---help---
 	Support for Monterey Genius KB29E.
 
 config HID_NTRIG
-	tristate "NTrig"
+	tristate "N-Trig touch screen"
 	depends on USB_HID
 	---help---
 	Support for N-Trig touch screen.
 
 config HID_ORTEK
-	tristate "Ortek"
+	tristate "Ortek WKB-2000 wireless keyboard and mouse trackpad"
 	depends on USB_HID
 	---help---
 	Support for Ortek WKB-2000 wireless keyboard + mouse trackpad.
 
 config HID_PANTHERLORD
-	tristate "Pantherlord support"
+	tristate "Pantherlord/GreenAsia game controller"
 	depends on USB_HID
 	---help---
 	  Say Y here if you have a PantherLord/GreenAsia based game controller
@@ -292,7 +312,7 @@ config PANTHERLORD_FF
 	  or adapter and want to enable force feedback support for it.
 
 config HID_PETALYNX
-	tristate "Petalynx"
+	tristate "Petalynx Maxter remote control"
 	depends on USB_HID
 	---help---
 	Support for Petalynx Maxter remote control.
@@ -356,7 +376,7 @@ config HID_PICOLCD_LEDS
 	  Provide access to PicoLCD's GPO pins via leds class.
 
 config HID_QUANTA
-	tristate "Quanta Optical Touch"
+	tristate "Quanta Optical Touch panels"
 	depends on USB_HID
 	---help---
 	Support for Quanta Optical Touch dual-touch panels.
@@ -376,32 +396,39 @@ config HID_ROCCAT_KONE
 	---help---
 	Support for Roccat Kone mouse.
 
+config HID_ROCCAT_PYRA
+	tristate "Roccat Pyra mouse support"
+	depends on USB_HID
+	select HID_ROCCAT
+	---help---
+	Support for Roccat Pyra mouse.
+
 config HID_SAMSUNG
-	tristate "Samsung"
+	tristate "Samsung InfraRed remote control or keyboards"
 	depends on USB_HID
 	---help---
 	Support for Samsung InfraRed remote control or keyboards.
 
 config HID_SONY
-	tristate "Sony"
+	tristate "Sony PS3 controller"
 	depends on USB_HID
 	---help---
 	Support for Sony PS3 controller.
 
 config HID_STANTUM
-	tristate "Stantum"
+	tristate "Stantum multitouch panel"
 	depends on USB_HID
 	---help---
 	Support for Stantum multitouch panel.
 
 config HID_SUNPLUS
-	tristate "Sunplus"
+	tristate "Sunplus wireless desktop"
 	depends on USB_HID
 	---help---
 	Support for Sunplus wireless desktop.
 
 config HID_GREENASIA
-	tristate "GreenAsia (Product ID 0x12) support"
+	tristate "GreenAsia (Product ID 0x12) game controller support"
 	depends on USB_HID
 	---help---
 	  Say Y here if you have a GreenAsia (Product ID 0x12) based game
diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile
index 46f037f3df80..c335605b9200 100644
--- a/drivers/hid/Makefile
+++ b/drivers/hid/Makefile
@@ -21,6 +21,9 @@ endif
 ifdef CONFIG_LOGIG940_FF
 	hid-logitech-objs	+= hid-lg3ff.o
 endif
+ifdef CONFIG_LOGIWII_FF
+	hid-logitech-objs	+= hid-lg4ff.o
+endif
 
 obj-$(CONFIG_HID_3M_PCT)	+= hid-3m-pct.o
 obj-$(CONFIG_HID_A4TECH)	+= hid-a4tech.o
@@ -52,6 +55,7 @@ obj-$(CONFIG_HID_PETALYNX)	+= hid-petalynx.o
 obj-$(CONFIG_HID_PICOLCD)	+= hid-picolcd.o
 obj-$(CONFIG_HID_ROCCAT)	+= hid-roccat.o
 obj-$(CONFIG_HID_ROCCAT_KONE)	+= hid-roccat-kone.o
+obj-$(CONFIG_HID_ROCCAT_PYRA)	+= hid-roccat-pyra.o
 obj-$(CONFIG_HID_SAMSUNG)	+= hid-samsung.o
 obj-$(CONFIG_HID_SMARTJOYPLUS)	+= hid-sjoy.o
 obj-$(CONFIG_HID_SONY)		+= hid-sony.o
@@ -61,9 +65,11 @@ obj-$(CONFIG_HID_GREENASIA)	+= hid-gaff.o
 obj-$(CONFIG_HID_THRUSTMASTER)	+= hid-tmff.o
 obj-$(CONFIG_HID_TOPSEED)	+= hid-topseed.o
 obj-$(CONFIG_HID_TWINHAN)	+= hid-twinhan.o
+obj-$(CONFIG_HID_UCLOGIC)	+= hid-uclogic.o
 obj-$(CONFIG_HID_ZEROPLUS)	+= hid-zpff.o
 obj-$(CONFIG_HID_ZYDACRON)	+= hid-zydacron.o
 obj-$(CONFIG_HID_WACOM)		+= hid-wacom.o
+obj-$(CONFIG_HID_WALTOP)	+= hid-waltop.o
 
 obj-$(CONFIG_USB_HID)		+= usbhid/
 obj-$(CONFIG_USB_MOUSE)		+= usbhid/
diff --git a/drivers/hid/hid-3m-pct.c b/drivers/hid/hid-3m-pct.c
index 2a0d56b7a02b..02d8cd3b1b1b 100644
--- a/drivers/hid/hid-3m-pct.c
+++ b/drivers/hid/hid-3m-pct.c
@@ -2,6 +2,8 @@
  *  HID driver for 3M PCT multitouch panels
  *
  *  Copyright (c) 2009-2010 Stephane Chatty <chatty@enac.fr>
+ *  Copyright (c) 2010      Henrik Rydberg <rydberg@euromail.se>
+ *  Copyright (c) 2010      Canonical, Ltd.
  *
  */
 
@@ -24,15 +26,26 @@ MODULE_LICENSE("GPL");
 
 #include "hid-ids.h"
 
+#define MAX_SLOTS		60
+#define MAX_TRKID		USHRT_MAX
+#define MAX_EVENTS		360
+
+/* estimated signal-to-noise ratios */
+#define SN_MOVE			2048
+#define SN_WIDTH		128
+
 struct mmm_finger {
 	__s32 x, y, w, h;
-	__u8 rank;
+	__u16 id;
+	bool prev_touch;
 	bool touch, valid;
 };
 
 struct mmm_data {
-	struct mmm_finger f[10];
-	__u8 curid, num;
+	struct mmm_finger f[MAX_SLOTS];
+	__u16 id;
+	__u8 curid;
+	__u8 nexp, nreal;
 	bool touch, valid;
 };
 
@@ -40,6 +53,10 @@ static int mmm_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 		struct hid_field *field, struct hid_usage *usage,
 		unsigned long **bit, int *max)
 {
+	int f1 = field->logical_minimum;
+	int f2 = field->logical_maximum;
+	int df = f2 - f1;
+
 	switch (usage->hid & HID_USAGE_PAGE) {
 
 	case HID_UP_BUTTON:
@@ -50,18 +67,20 @@ static int mmm_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 		case HID_GD_X:
 			hid_map_usage(hi, usage, bit, max,
 					EV_ABS, ABS_MT_POSITION_X);
+			input_set_abs_params(hi->input, ABS_MT_POSITION_X,
+					     f1, f2, df / SN_MOVE, 0);
 			/* touchscreen emulation */
 			input_set_abs_params(hi->input, ABS_X,
-						field->logical_minimum,
-						field->logical_maximum, 0, 0);
+					     f1, f2, df / SN_MOVE, 0);
 			return 1;
 		case HID_GD_Y:
 			hid_map_usage(hi, usage, bit, max,
 					EV_ABS, ABS_MT_POSITION_Y);
+			input_set_abs_params(hi->input, ABS_MT_POSITION_Y,
+					     f1, f2, df / SN_MOVE, 0);
 			/* touchscreen emulation */
 			input_set_abs_params(hi->input, ABS_Y,
-						field->logical_minimum,
-						field->logical_maximum, 0, 0);
+					     f1, f2, df / SN_MOVE, 0);
 			return 1;
 		}
 		return 0;
@@ -81,21 +100,31 @@ static int mmm_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 		case HID_DG_TIPSWITCH:
 			/* touchscreen emulation */
 			hid_map_usage(hi, usage, bit, max, EV_KEY, BTN_TOUCH);
+			input_set_capability(hi->input, EV_KEY, BTN_TOUCH);
 			return 1;
 		case HID_DG_WIDTH:
 			hid_map_usage(hi, usage, bit, max,
 					EV_ABS, ABS_MT_TOUCH_MAJOR);
+			input_set_abs_params(hi->input, ABS_MT_TOUCH_MAJOR,
+					     f1, f2, df / SN_WIDTH, 0);
 			return 1;
 		case HID_DG_HEIGHT:
 			hid_map_usage(hi, usage, bit, max,
 					EV_ABS, ABS_MT_TOUCH_MINOR);
+			input_set_abs_params(hi->input, ABS_MT_TOUCH_MINOR,
+					     f1, f2, df / SN_WIDTH, 0);
 			input_set_abs_params(hi->input, ABS_MT_ORIENTATION,
-					1, 1, 0, 0);
+					0, 1, 0, 0);
 			return 1;
 		case HID_DG_CONTACTID:
-			field->logical_maximum = 59;
+			field->logical_maximum = MAX_TRKID;
 			hid_map_usage(hi, usage, bit, max,
 					EV_ABS, ABS_MT_TRACKING_ID);
+			input_set_abs_params(hi->input, ABS_MT_TRACKING_ID,
+					     0, MAX_TRKID, 0, 0);
+			if (!hi->input->mt)
+				input_mt_create_slots(hi->input, MAX_SLOTS);
+			input_set_events_per_packet(hi->input, MAX_EVENTS);
 			return 1;
 		}
 		/* let hid-input decide for the others */
@@ -113,10 +142,10 @@ static int mmm_input_mapped(struct hid_device *hdev, struct hid_input *hi,
 		struct hid_field *field, struct hid_usage *usage,
 		unsigned long **bit, int *max)
 {
+	/* tell hid-input to skip setup of these event types */
 	if (usage->type == EV_KEY || usage->type == EV_ABS)
-		clear_bit(usage->code, *bit);
-
-	return 0;
+		set_bit(usage->type, hi->input->evbit);
+	return -1;
 }
 
 /*
@@ -126,70 +155,49 @@ static int mmm_input_mapped(struct hid_device *hdev, struct hid_input *hi,
 static void mmm_filter_event(struct mmm_data *md, struct input_dev *input)
 {
 	struct mmm_finger *oldest = 0;
-	bool pressed = false, released = false;
 	int i;
-
-	/*
-	 * we need to iterate on all fingers to decide if we have a press
-	 * or a release event in our touchscreen emulation.
-	 */
-	for (i = 0; i < 10; ++i) {
+	for (i = 0; i < MAX_SLOTS; ++i) {
 		struct mmm_finger *f = &md->f[i];
 		if (!f->valid) {
 			/* this finger is just placeholder data, ignore */
-		} else if (f->touch) {
+			continue;
+		}
+		input_mt_slot(input, i);
+		if (f->touch) {
 			/* this finger is on the screen */
 			int wide = (f->w > f->h);
-			input_event(input, EV_ABS, ABS_MT_TRACKING_ID, i);
+			/* divided by two to match visual scale of touch */
+			int major = max(f->w, f->h) >> 1;
+			int minor = min(f->w, f->h) >> 1;
+
+			if (!f->prev_touch)
+				f->id = md->id++;
+			input_event(input, EV_ABS, ABS_MT_TRACKING_ID, f->id);
 			input_event(input, EV_ABS, ABS_MT_POSITION_X, f->x);
 			input_event(input, EV_ABS, ABS_MT_POSITION_Y, f->y);
 			input_event(input, EV_ABS, ABS_MT_ORIENTATION, wide);
-			input_event(input, EV_ABS, ABS_MT_TOUCH_MAJOR,
-						wide ? f->w : f->h);
-			input_event(input, EV_ABS, ABS_MT_TOUCH_MINOR,
-						wide ? f->h : f->w);
-			input_mt_sync(input);
-			/*
-			 * touchscreen emulation: maintain the age rank
-			 * of this finger, decide if we have a press
-			 */
-			if (f->rank == 0) {
-				f->rank = ++(md->num);
-				if (f->rank == 1)
-					pressed = true;
-			}
-			if (f->rank == 1)
+			input_event(input, EV_ABS, ABS_MT_TOUCH_MAJOR, major);
+			input_event(input, EV_ABS, ABS_MT_TOUCH_MINOR, minor);
+			/* touchscreen emulation: pick the oldest contact */
+			if (!oldest || ((f->id - oldest->id) & (SHRT_MAX + 1)))
 				oldest = f;
 		} else {
 			/* this finger took off the screen */
-			/* touchscreen emulation: maintain age rank of others */
-			int j;
-
-			for (j = 0; j < 10; ++j) {
-				struct mmm_finger *g = &md->f[j];
-				if (g->rank > f->rank) {
-					g->rank--;
-					if (g->rank == 1)
-						oldest = g;
-				}
-			}
-			f->rank = 0;
-			--(md->num);
-			if (md->num == 0)
-				released = true;
+			input_event(input, EV_ABS, ABS_MT_TRACKING_ID, -1);
 		}
+		f->prev_touch = f->touch;
 		f->valid = 0;
 	}
 
 	/* touchscreen emulation */
 	if (oldest) {
-		if (pressed)
-			input_event(input, EV_KEY, BTN_TOUCH, 1);
+		input_event(input, EV_KEY, BTN_TOUCH, 1);
 		input_event(input, EV_ABS, ABS_X, oldest->x);
 		input_event(input, EV_ABS, ABS_Y, oldest->y);
-	} else if (released) {
+	} else {
 		input_event(input, EV_KEY, BTN_TOUCH, 0);
 	}
+	input_sync(input);
 }
 
 /*
@@ -223,10 +231,12 @@ static int mmm_event(struct hid_device *hid, struct hid_field *field,
 				md->f[md->curid].h = value;
 			break;
 		case HID_DG_CONTACTID:
+			value = clamp_val(value, 0, MAX_SLOTS - 1);
 			if (md->valid) {
 				md->curid = value;
 				md->f[value].touch = md->touch;
 				md->f[value].valid = 1;
+				md->nreal++;
 			}
 			break;
 		case HID_GD_X:
@@ -238,7 +248,12 @@ static int mmm_event(struct hid_device *hid, struct hid_field *field,
 				md->f[md->curid].y = value;
 			break;
 		case HID_DG_CONTACTCOUNT:
-			mmm_filter_event(md, input);
+			if (value)
+				md->nexp = value;
+			if (md->nreal >= md->nexp) {
+				mmm_filter_event(md, input);
+				md->nreal = 0;
+			}
 			break;
 		}
 	}
@@ -255,6 +270,8 @@ static int mmm_probe(struct hid_device *hdev, const struct hid_device_id *id)
 	int ret;
 	struct mmm_data *md;
 
+	hdev->quirks |= HID_QUIRK_NO_INPUT_SYNC;
+
 	md = kzalloc(sizeof(struct mmm_data), GFP_KERNEL);
 	if (!md) {
 		dev_err(&hdev->dev, "cannot allocate 3M data\n");
diff --git a/drivers/hid/hid-a4tech.c b/drivers/hid/hid-a4tech.c
index 3a2b223c1da4..1666c1684e79 100644
--- a/drivers/hid/hid-a4tech.c
+++ b/drivers/hid/hid-a4tech.c
@@ -133,6 +133,8 @@ static const struct hid_device_id a4_devices[] = {
 		.driver_data = A4_2WHEEL_MOUSE_HACK_7 },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_X5_005D),
 		.driver_data = A4_2WHEEL_MOUSE_HACK_B8 },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_RP_649),
+		.driver_data = A4_2WHEEL_MOUSE_HACK_B8 },
 	{ }
 };
 MODULE_DEVICE_TABLE(hid, a4_devices);
diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c
index bba05d0a8980..eaeca564a8d3 100644
--- a/drivers/hid/hid-apple.c
+++ b/drivers/hid/hid-apple.c
@@ -246,17 +246,18 @@ static int apple_event(struct hid_device *hdev, struct hid_field *field,
 /*
  * MacBook JIS keyboard has wrong logical maximum
  */
-static void apple_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *apple_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
 	struct apple_sc *asc = hid_get_drvdata(hdev);
 
-	if ((asc->quirks & APPLE_RDESC_JIS) && rsize >= 60 &&
+	if ((asc->quirks & APPLE_RDESC_JIS) && *rsize >= 60 &&
 			rdesc[53] == 0x65 && rdesc[59] == 0x65) {
 		dev_info(&hdev->dev, "fixing up MacBook JIS keyboard report "
 				"descriptor\n");
 		rdesc[53] = rdesc[59] = 0xe7;
 	}
+	return rdesc;
 }
 
 static void apple_setup_input(struct input_dev *input)
diff --git a/drivers/hid/hid-cherry.c b/drivers/hid/hid-cherry.c
index 24663a8717b1..e880086c2311 100644
--- a/drivers/hid/hid-cherry.c
+++ b/drivers/hid/hid-cherry.c
@@ -26,15 +26,16 @@
  * Cherry Cymotion keyboard have an invalid HID report descriptor,
  * that needs fixing before we can parse it.
  */
-static void ch_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *ch_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
-	if (rsize >= 17 && rdesc[11] == 0x3c && rdesc[12] == 0x02) {
+	if (*rsize >= 17 && rdesc[11] == 0x3c && rdesc[12] == 0x02) {
 		dev_info(&hdev->dev, "fixing up Cherry Cymotion report "
 				"descriptor\n");
 		rdesc[11] = rdesc[16] = 0xff;
 		rdesc[12] = rdesc[17] = 0x03;
 	}
+	return rdesc;
 }
 
 #define ch_map_key_clear(c)	hid_map_usage_clear(hi, usage, bit, max, \
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 3cb6632d4518..7832b6e2478b 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -388,12 +388,6 @@ static int hid_parser_local(struct hid_parser *parser, struct hid_item *item)
 	__u32 data;
 	unsigned n;
 
-	/* Local delimiter could have value 0, which allows size to be 0 */
-	if (item->size == 0 && item->tag != HID_LOCAL_ITEM_TAG_DELIMITER) {
-		dbg_hid("item data expected for local item\n");
-		return -1;
-	}
-
 	data = item_udata(item);
 
 	switch (item->tag) {
@@ -651,7 +645,7 @@ int hid_parse_report(struct hid_device *device, __u8 *start,
 	};
 
 	if (device->driver->report_fixup)
-		device->driver->report_fixup(device, start, size);
+		start = device->driver->report_fixup(device, start, &size);
 
 	device->rdesc = kmemdup(start, size, GFP_KERNEL);
 	if (device->rdesc == NULL)
@@ -1241,6 +1235,7 @@ static const struct hid_device_id hid_blacklist[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_3M, USB_DEVICE_ID_3M2256) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_WCP32PU) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_X5_005D) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_RP_649) },
 #if defined(CONFIG_HID_ACRUX_FF) || defined(CONFIG_HID_ACRUX_FF_MODULE)
 	{ HID_USB_DEVICE(USB_VENDOR_ID_ACRUX, 0x0802) },
 #endif
@@ -1248,6 +1243,7 @@ static const struct hid_device_id hid_blacklist[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL4) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MIGHTYMOUSE) },
 	{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE) },
+	{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_FOUNTAIN_ANSI) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_FOUNTAIN_ISO) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_GEYSER_ANSI) },
@@ -1327,6 +1323,7 @@ static const struct hid_device_id hid_blacklist[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_CORDLESS_DESKTOP_LX500) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_EXTREME_3D) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WHEEL) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD_CORD) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD2_2) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WINGMAN_F3D) },
@@ -1336,6 +1333,7 @@ static const struct hid_device_id hid_blacklist[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_MOMO_WHEEL) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_MOMO_WHEEL2) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G25_WHEEL) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WII_WHEEL) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD2) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_SPACETRAVELLER) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_SPACENAVIGATOR) },
@@ -1371,12 +1369,15 @@ static const struct hid_device_id hid_blacklist[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_PIXART_IMAGING_INC_OPTICAL_TOUCH_SCREEN) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_KONE) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_PYRA_WIRED) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_SAMSUNG, USB_DEVICE_ID_SAMSUNG_IR_REMOTE) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_SAMSUNG, USB_DEVICE_ID_SAMSUNG_WIRELESS_KBD_MOUSE) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER) },
 	{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_VAIO_VGX_MOUSE) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_STANTUM, USB_DEVICE_ID_MTP) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_STANTUM_STM, USB_DEVICE_ID_MTP_STM) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_STANTUM_SITRONIX, USB_DEVICE_ID_MTP_SITRONIX) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_SUNPLUS, USB_DEVICE_ID_SUNPLUS_WDESKTOP) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb300) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb304) },
@@ -1388,8 +1389,16 @@ static const struct hid_device_id hid_blacklist[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_TOPSEED, USB_DEVICE_ID_TOPSEED_CYBERLINK) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_TOPSEED2, USB_DEVICE_ID_TOPSEED2_RF_COMBO) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_TWINHAN, USB_DEVICE_ID_TWINHAN_IR_REMOTE) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_PF1209) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP4030U) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP5540U) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP8060U) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_SMARTJOY_PLUS) },
 	{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_WACOM, USB_DEVICE_ID_WACOM_GRAPHIRE_BLUETOOTH) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_SLIM_TABLET_5_8_INCH) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_SLIM_TABLET_12_1_INCH) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_MEDIA_TABLET_10_6_INCH) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_MEDIA_TABLET_14_1_INCH) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_ZEROPLUS, 0x0005) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_ZEROPLUS, 0x0030) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_ZYDACRON, USB_DEVICE_ID_ZYDACRON_REMOTE_CONTROL) },
diff --git a/drivers/hid/hid-cypress.c b/drivers/hid/hid-cypress.c
index 998b6f443d7d..4cd0e2345991 100644
--- a/drivers/hid/hid-cypress.c
+++ b/drivers/hid/hid-cypress.c
@@ -31,16 +31,16 @@
  * Some USB barcode readers from cypress have usage min and usage max in
  * the wrong order
  */
-static void cp_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *cp_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
 	unsigned long quirks = (unsigned long)hid_get_drvdata(hdev);
 	unsigned int i;
 
 	if (!(quirks & CP_RDESC_SWAPPED_MIN_MAX))
-		return;
+		return rdesc;
 
-	for (i = 0; i < rsize - 4; i++)
+	for (i = 0; i < *rsize - 4; i++)
 		if (rdesc[i] == 0x29 && rdesc[i + 2] == 0x19) {
 			__u8 tmp;
 
@@ -50,6 +50,7 @@ static void cp_report_fixup(struct hid_device *hdev, __u8 *rdesc,
 			rdesc[i + 3] = rdesc[i + 1];
 			rdesc[i + 1] = tmp;
 		}
+	return rdesc;
 }
 
 static int cp_input_mapped(struct hid_device *hdev, struct hid_input *hi,
diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index 61a3e572224a..75c5e23d09d2 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -570,6 +570,8 @@ void hid_debug_event(struct hid_device *hdev, char *buf)
 				buf[i];
 		list->tail = (list->tail + i) % HID_DEBUG_BUFSIZE;
         }
+
+	wake_up_interruptible(&hdev->debug_wait);
 }
 EXPORT_SYMBOL_GPL(hid_debug_event);
 
diff --git a/drivers/hid/hid-egalax.c b/drivers/hid/hid-egalax.c
index 8ca7f65cf2f8..54b017ad258d 100644
--- a/drivers/hid/hid-egalax.c
+++ b/drivers/hid/hid-egalax.c
@@ -31,7 +31,7 @@ struct egalax_data {
 	bool first;		/* is this the first finger in the frame? */
 	bool valid;		/* valid finger data, or just placeholder? */
 	bool activity;		/* at least one active finger previously? */
-	__u16 lastx, lasty;	/* latest valid (x, y) in the frame */
+	__u16 lastx, lasty, lastz;	/* latest valid (x, y, z) in the frame */
 };
 
 static int egalax_input_mapping(struct hid_device *hdev, struct hid_input *hi,
@@ -79,6 +79,10 @@ static int egalax_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 		case HID_DG_TIPPRESSURE:
 			hid_map_usage(hi, usage, bit, max,
 					EV_ABS, ABS_MT_PRESSURE);
+			/* touchscreen emulation */
+			input_set_abs_params(hi->input, ABS_PRESSURE,
+						field->logical_minimum,
+						field->logical_maximum, 0, 0);
 			return 1;
 		}
 		return 0;
@@ -109,8 +113,8 @@ static void egalax_filter_event(struct egalax_data *td, struct input_dev *input)
 	if (td->valid) {
 		/* emit multitouch events */
 		input_event(input, EV_ABS, ABS_MT_TRACKING_ID, td->id);
-		input_event(input, EV_ABS, ABS_MT_POSITION_X, td->x);
-		input_event(input, EV_ABS, ABS_MT_POSITION_Y, td->y);
+		input_event(input, EV_ABS, ABS_MT_POSITION_X, td->x >> 3);
+		input_event(input, EV_ABS, ABS_MT_POSITION_Y, td->y >> 3);
 		input_event(input, EV_ABS, ABS_MT_PRESSURE, td->z);
 
 		input_mt_sync(input);
@@ -121,6 +125,7 @@ static void egalax_filter_event(struct egalax_data *td, struct input_dev *input)
 		 */
 		td->lastx = td->x;
 		td->lasty = td->y;
+		td->lastz = td->z;
 	}
 
 	/*
@@ -129,8 +134,9 @@ static void egalax_filter_event(struct egalax_data *td, struct input_dev *input)
 	 * the oldest on the panel, the one we want for single touch
 	 */
 	if (!td->first && td->activity) {
-		input_event(input, EV_ABS, ABS_X, td->lastx);
-		input_event(input, EV_ABS, ABS_Y, td->lasty);
+		input_event(input, EV_ABS, ABS_X, td->lastx >> 3);
+		input_event(input, EV_ABS, ABS_Y, td->lasty >> 3);
+ 		input_event(input, EV_ABS, ABS_PRESSURE, td->lastz);
 	}
 
 	if (!td->valid) {
diff --git a/drivers/hid/hid-elecom.c b/drivers/hid/hid-elecom.c
index 7a40878f46b4..6e31f305397d 100644
--- a/drivers/hid/hid-elecom.c
+++ b/drivers/hid/hid-elecom.c
@@ -20,14 +20,15 @@
 
 #include "hid-ids.h"
 
-static void elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
-	if (rsize >= 48 && rdesc[46] == 0x05 && rdesc[47] == 0x0c) {
+	if (*rsize >= 48 && rdesc[46] == 0x05 && rdesc[47] == 0x0c) {
 		dev_info(&hdev->dev, "Fixing up Elecom BM084 "
 				"report descriptor.\n");
 		rdesc[47] = 0x00;
 	}
+    return rdesc;
 }
 
 static const struct hid_device_id elecom_devices[] = {
diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h
index 855aa8e355f4..3ee999d33004 100644
--- a/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@ -25,6 +25,7 @@
 #define USB_VENDOR_ID_A4TECH		0x09da
 #define USB_DEVICE_ID_A4TECH_WCP32PU	0x0006
 #define USB_DEVICE_ID_A4TECH_X5_005D	0x000a
+#define USB_DEVICE_ID_A4TECH_RP_649	0x001a
 
 #define USB_VENDOR_ID_AASHIMA		0x06d6
 #define USB_DEVICE_ID_AASHIMA_GAMEPAD	0x0025
@@ -63,6 +64,7 @@
 #define USB_VENDOR_ID_APPLE		0x05ac
 #define USB_DEVICE_ID_APPLE_MIGHTYMOUSE	0x0304
 #define USB_DEVICE_ID_APPLE_MAGICMOUSE	0x030d
+#define USB_DEVICE_ID_APPLE_MAGICTRACKPAD	0x030e
 #define USB_DEVICE_ID_APPLE_FOUNTAIN_ANSI	0x020e
 #define USB_DEVICE_ID_APPLE_FOUNTAIN_ISO	0x020f
 #define USB_DEVICE_ID_APPLE_GEYSER_ANSI	0x0214
@@ -142,6 +144,7 @@
 #define USB_DEVICE_ID_CH_FLIGHT_SIM_ECLIPSE_YOKE       0x0051
 #define USB_DEVICE_ID_CH_FLIGHT_SIM_YOKE	0x00ff
 #define USB_DEVICE_ID_CH_3AXIS_5BUTTON_STICK	0x00d3
+#define USB_DEVICE_ID_CH_AXIS_295	0x001c
 
 #define USB_VENDOR_ID_CHERRY		0x046a
 #define USB_DEVICE_ID_CHERRY_CYMOTION	0x0023
@@ -343,6 +346,7 @@
 #define USB_DEVICE_ID_LOGITECH_RECEIVER	0xc101
 #define USB_DEVICE_ID_LOGITECH_HARMONY_FIRST  0xc110
 #define USB_DEVICE_ID_LOGITECH_HARMONY_LAST 0xc14f
+#define USB_DEVICE_ID_LOGITECH_RUMBLEPAD_CORD	0xc20a
 #define USB_DEVICE_ID_LOGITECH_RUMBLEPAD	0xc211
 #define USB_DEVICE_ID_LOGITECH_EXTREME_3D	0xc215
 #define USB_DEVICE_ID_LOGITECH_RUMBLEPAD2	0xc218
@@ -354,6 +358,7 @@
 #define USB_DEVICE_ID_LOGITECH_WINGMAN_FFG	0xc293
 #define USB_DEVICE_ID_LOGITECH_MOMO_WHEEL	0xc295
 #define USB_DEVICE_ID_LOGITECH_G25_WHEEL	0xc299
+#define USB_DEVICE_ID_LOGITECH_WII_WHEEL	0xc29c
 #define USB_DEVICE_ID_LOGITECH_ELITE_KBD	0xc30a
 #define USB_DEVICE_ID_S510_RECEIVER	0xc50c
 #define USB_DEVICE_ID_S510_RECEIVER_2	0xc517
@@ -466,6 +471,8 @@
 
 #define USB_VENDOR_ID_ROCCAT		0x1e7d
 #define USB_DEVICE_ID_ROCCAT_KONE	0x2ced
+#define USB_DEVICE_ID_ROCCAT_PYRA_WIRED	0x2c24
+#define USB_DEVICE_ID_ROCCAT_PYRA_WIRELESS	0x2cf6
 
 #define USB_VENDOR_ID_SAITEK		0x06a3
 #define USB_DEVICE_ID_SAITEK_RUMBLEPAD	0xff17
@@ -485,6 +492,12 @@
 #define USB_VENDOR_ID_STANTUM		0x1f87
 #define USB_DEVICE_ID_MTP		0x0002
 
+#define USB_VENDOR_ID_STANTUM_STM		0x0483
+#define USB_DEVICE_ID_MTP_STM		0x3261
+
+#define USB_VENDOR_ID_STANTUM_SITRONIX		0x1403
+#define USB_DEVICE_ID_MTP_SITRONIX		0x5001
+
 #define USB_VENDOR_ID_SUN		0x0430
 #define USB_DEVICE_ID_RARITAN_KVM_DONGLE	0xcdab
 
@@ -514,8 +527,10 @@
 
 #define USB_VENDOR_ID_UCLOGIC		0x5543
 #define USB_DEVICE_ID_UCLOGIC_TABLET_PF1209	0x0042
-#define USB_DEVICE_ID_UCLOGIC_TABLET_WP4030U	0x0003
 #define USB_DEVICE_ID_UCLOGIC_TABLET_KNA5	0x6001
+#define USB_DEVICE_ID_UCLOGIC_TABLET_WP4030U	0x0003
+#define USB_DEVICE_ID_UCLOGIC_TABLET_WP5540U	0x0004
+#define USB_DEVICE_ID_UCLOGIC_TABLET_WP8060U	0x0005
 
 #define USB_VENDOR_ID_VERNIER		0x08f7
 #define USB_DEVICE_ID_VERNIER_LABPRO	0x0001
@@ -527,6 +542,12 @@
 #define USB_VENDOR_ID_WACOM		0x056a
 #define USB_DEVICE_ID_WACOM_GRAPHIRE_BLUETOOTH	0x81
 
+#define USB_VENDOR_ID_WALTOP				0x172f
+#define USB_DEVICE_ID_WALTOP_SLIM_TABLET_5_8_INCH	0x0032
+#define USB_DEVICE_ID_WALTOP_SLIM_TABLET_12_1_INCH	0x0034
+#define USB_DEVICE_ID_WALTOP_MEDIA_TABLET_10_6_INCH	0x0501
+#define USB_DEVICE_ID_WALTOP_MEDIA_TABLET_14_1_INCH	0x0500
+
 #define USB_VENDOR_ID_WISEGROUP		0x0925
 #define USB_DEVICE_ID_SMARTJOY_PLUS	0x0005
 #define USB_DEVICE_ID_1_PHIDGETSERVO_20	0x8101
diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index 6c03dcc5760a..834ef47b76d6 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -149,6 +149,83 @@ static int hidinput_setkeycode(struct input_dev *dev,
 }
 
 
+/**
+ * hidinput_calc_abs_res - calculate an absolute axis resolution
+ * @field: the HID report field to calculate resolution for
+ * @code: axis code
+ *
+ * The formula is:
+ *                         (logical_maximum - logical_minimum)
+ * resolution = ----------------------------------------------------------
+ *              (physical_maximum - physical_minimum) * 10 ^ unit_exponent
+ *
+ * as seen in the HID specification v1.11 6.2.2.7 Global Items.
+ *
+ * Only exponent 1 length units are processed. Centimeters are converted to
+ * inches. Degrees are converted to radians.
+ */
+static __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code)
+{
+	__s32 unit_exponent = field->unit_exponent;
+	__s32 logical_extents = field->logical_maximum -
+					field->logical_minimum;
+	__s32 physical_extents = field->physical_maximum -
+					field->physical_minimum;
+	__s32 prev;
+
+	/* Check if the extents are sane */
+	if (logical_extents <= 0 || physical_extents <= 0)
+		return 0;
+
+	/*
+	 * Verify and convert units.
+	 * See HID specification v1.11 6.2.2.7 Global Items for unit decoding
+	 */
+	if (code == ABS_X || code == ABS_Y || code == ABS_Z) {
+		if (field->unit == 0x11) {		/* If centimeters */
+			/* Convert to inches */
+			prev = logical_extents;
+			logical_extents *= 254;
+			if (logical_extents < prev)
+				return 0;
+			unit_exponent += 2;
+		} else if (field->unit != 0x13) {	/* If not inches */
+			return 0;
+		}
+	} else if (code == ABS_RX || code == ABS_RY || code == ABS_RZ) {
+		if (field->unit == 0x14) {		/* If degrees */
+			/* Convert to radians */
+			prev = logical_extents;
+			logical_extents *= 573;
+			if (logical_extents < prev)
+				return 0;
+			unit_exponent += 1;
+		} else if (field->unit != 0x12) {	/* If not radians */
+			return 0;
+		}
+	} else {
+		return 0;
+	}
+
+	/* Apply negative unit exponent */
+	for (; unit_exponent < 0; unit_exponent++) {
+		prev = logical_extents;
+		logical_extents *= 10;
+		if (logical_extents < prev)
+			return 0;
+	}
+	/* Apply positive unit exponent */
+	for (; unit_exponent > 0; unit_exponent--) {
+		prev = physical_extents;
+		physical_extents *= 10;
+		if (physical_extents < prev)
+			return 0;
+	}
+
+	/* Calculate resolution */
+	return logical_extents / physical_extents;
+}
+
 static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_field *field,
 				     struct hid_usage *usage)
 {
@@ -336,6 +413,10 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel
 			map_key_clear(BTN_STYLUS);
 			break;
 
+		case 0x46: /* TabletPick */
+			map_key_clear(BTN_STYLUS2);
+			break;
+
 		default:  goto unknown;
 		}
 		break;
@@ -537,6 +618,9 @@ mapped:
 			input_set_abs_params(input, usage->code, a, b, (b - a) >> 8, (b - a) >> 4);
 		else	input_set_abs_params(input, usage->code, a, b, 0, 0);
 
+		input_abs_set_res(input, usage->code,
+				  hidinput_calc_abs_res(field, usage->code));
+
 		/* use a larger default input buffer for MT devices */
 		if (usage->code == ABS_MT_POSITION_X && input->hint_events_per_packet == 0)
 			input_set_events_per_packet(input, 60);
@@ -659,6 +743,9 @@ void hidinput_report_event(struct hid_device *hid, struct hid_report *report)
 {
 	struct hid_input *hidinput;
 
+	if (hid->quirks & HID_QUIRK_NO_INPUT_SYNC)
+		return;
+
 	list_for_each_entry(hidinput, &hid->inputs, list)
 		input_sync(hidinput->input);
 }
diff --git a/drivers/hid/hid-kye.c b/drivers/hid/hid-kye.c
index f8871712b7b5..817247ee006c 100644
--- a/drivers/hid/hid-kye.c
+++ b/drivers/hid/hid-kye.c
@@ -23,10 +23,10 @@
  *   - report size 8 count 1 must be size 1 count 8 for button bitfield
  *   - change the button usage range to 4-7 for the extra buttons
  */
-static void kye_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *kye_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
-	if (rsize >= 74 &&
+	if (*rsize >= 74 &&
 		rdesc[61] == 0x05 && rdesc[62] == 0x08 &&
 		rdesc[63] == 0x19 && rdesc[64] == 0x08 &&
 		rdesc[65] == 0x29 && rdesc[66] == 0x0f &&
@@ -40,6 +40,7 @@ static void kye_report_fixup(struct hid_device *hdev, __u8 *rdesc,
 		rdesc[72] = 0x01;
 		rdesc[74] = 0x08;
 	}
+	return rdesc;
 }
 
 static const struct hid_device_id kye_devices[] = {
diff --git a/drivers/hid/hid-lg.c b/drivers/hid/hid-lg.c
index f6433d8050a9..b629fba5a057 100644
--- a/drivers/hid/hid-lg.c
+++ b/drivers/hid/hid-lg.c
@@ -7,6 +7,7 @@
  *  Copyright (c) 2006-2007 Jiri Kosina
  *  Copyright (c) 2007 Paul Walmsley
  *  Copyright (c) 2008 Jiri Slaby
+ *  Copyright (c) 2010 Hendrik Iben
  */
 
 /*
@@ -19,6 +20,9 @@
 #include <linux/device.h>
 #include <linux/hid.h>
 #include <linux/module.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
 
 #include "hid-ids.h"
 #include "hid-lg.h"
@@ -35,31 +39,43 @@
 #define LG_FF2			0x400
 #define LG_RDESC_REL_ABS	0x800
 #define LG_FF3			0x1000
+#define LG_FF4			0x2000
 
 /*
  * Certain Logitech keyboards send in report #3 keys which are far
  * above the logical maximum described in descriptor. This extends
  * the original value of 0x28c of logical maximum to 0x104d
  */
-static void lg_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *lg_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
 	unsigned long quirks = (unsigned long)hid_get_drvdata(hdev);
 
-	if ((quirks & LG_RDESC) && rsize >= 90 && rdesc[83] == 0x26 &&
+	if ((quirks & LG_RDESC) && *rsize >= 90 && rdesc[83] == 0x26 &&
 			rdesc[84] == 0x8c && rdesc[85] == 0x02) {
 		dev_info(&hdev->dev, "fixing up Logitech keyboard report "
 				"descriptor\n");
 		rdesc[84] = rdesc[89] = 0x4d;
 		rdesc[85] = rdesc[90] = 0x10;
 	}
-	if ((quirks & LG_RDESC_REL_ABS) && rsize >= 50 &&
+	if ((quirks & LG_RDESC_REL_ABS) && *rsize >= 50 &&
 			rdesc[32] == 0x81 && rdesc[33] == 0x06 &&
 			rdesc[49] == 0x81 && rdesc[50] == 0x06) {
 		dev_info(&hdev->dev, "fixing up rel/abs in Logitech "
 				"report descriptor\n");
 		rdesc[33] = rdesc[50] = 0x02;
 	}
+	if ((quirks & LG_FF4) && *rsize >= 101 &&
+			rdesc[41] == 0x95 && rdesc[42] == 0x0B &&
+			rdesc[47] == 0x05 && rdesc[48] == 0x09) {
+		dev_info(&hdev->dev, "fixing up Logitech Speed Force Wireless "
+			"button descriptor\n");
+		rdesc[41] = 0x05;
+		rdesc[42] = 0x09;
+		rdesc[47] = 0x95;
+		rdesc[48] = 0x0B;
+	}
+	return rdesc;
 }
 
 #define lg_map_key_clear(c)	hid_map_usage_clear(hi, usage, bit, max, \
@@ -285,12 +301,33 @@ static int lg_probe(struct hid_device *hdev, const struct hid_device_id *id)
 		goto err_free;
 	}
 
+	if (quirks & LG_FF4) {
+		unsigned char buf[] = { 0x00, 0xAF,  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+
+		ret = hdev->hid_output_raw_report(hdev, buf, sizeof(buf), HID_FEATURE_REPORT);
+
+		if (ret >= 0) {
+			/* insert a little delay of 10 jiffies ~ 40ms */
+			wait_queue_head_t wait;
+			init_waitqueue_head (&wait);
+			wait_event_interruptible_timeout(wait, 0, 10);
+
+			/* Select random Address */
+			buf[1] = 0xB2;
+			get_random_bytes(&buf[2], 2);
+
+			ret = hdev->hid_output_raw_report(hdev, buf, sizeof(buf), HID_FEATURE_REPORT);
+		}
+	}
+
 	if (quirks & LG_FF)
 		lgff_init(hdev);
 	if (quirks & LG_FF2)
 		lg2ff_init(hdev);
 	if (quirks & LG_FF3)
 		lg3ff_init(hdev);
+	if (quirks & LG_FF4)
+		lg4ff_init(hdev);
 
 	return 0;
 err_free:
@@ -325,6 +362,8 @@ static const struct hid_device_id lg_devices[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WHEEL),
 		.driver_data = LG_NOGET | LG_FF },
 
+	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD_CORD),
+		.driver_data = LG_FF2 },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD),
 		.driver_data = LG_FF },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD2_2),
@@ -339,6 +378,8 @@ static const struct hid_device_id lg_devices[] = {
 		.driver_data = LG_FF },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G25_WHEEL),
 		.driver_data = LG_FF },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WII_WHEEL),
+		.driver_data = LG_FF4 },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WINGMAN_FFG ),
 		.driver_data = LG_FF },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD2),
diff --git a/drivers/hid/hid-lg.h b/drivers/hid/hid-lg.h
index ce2ac8672624..b0100ba2ae0b 100644
--- a/drivers/hid/hid-lg.h
+++ b/drivers/hid/hid-lg.h
@@ -19,4 +19,10 @@ int lg3ff_init(struct hid_device *hdev);
 static inline int lg3ff_init(struct hid_device *hdev) { return -1; }
 #endif
 
+#ifdef CONFIG_LOGIWII_FF
+int lg4ff_init(struct hid_device *hdev);
+#else
+static inline int lg4ff_init(struct hid_device *hdev) { return -1; }
+#endif
+
 #endif
diff --git a/drivers/hid/hid-lg2ff.c b/drivers/hid/hid-lg2ff.c
index d888f1e6794f..4258253c36b3 100644
--- a/drivers/hid/hid-lg2ff.c
+++ b/drivers/hid/hid-lg2ff.c
@@ -1,5 +1,5 @@
 /*
- *  Force feedback support for Logitech Rumblepad 2
+ *  Force feedback support for Logitech RumblePad and Rumblepad 2
  *
  *  Copyright (c) 2008 Anssi Hannula <anssi.hannula@gmail.com>
  */
@@ -110,7 +110,7 @@ int lg2ff_init(struct hid_device *hid)
 
 	usbhid_submit_report(hid, report, USB_DIR_OUT);
 
-	dev_info(&hid->dev, "Force feedback for Logitech Rumblepad 2 by "
+	dev_info(&hid->dev, "Force feedback for Logitech RumblePad/Rumblepad 2 by "
 	       "Anssi Hannula <anssi.hannula@gmail.com>\n");
 
 	return 0;
diff --git a/drivers/hid/hid-lg4ff.c b/drivers/hid/hid-lg4ff.c
new file mode 100644
index 000000000000..7eef5a2ce948
--- /dev/null
+++ b/drivers/hid/hid-lg4ff.c
@@ -0,0 +1,136 @@
+/*
+ *  Force feedback support for Logitech Speed Force Wireless
+ *
+ *  http://wiibrew.org/wiki/Logitech_USB_steering_wheel
+ *
+ *  Copyright (c) 2010 Simon Wood <simon@mungewell.org>
+ */
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#include <linux/input.h>
+#include <linux/usb.h>
+#include <linux/hid.h>
+
+#include "usbhid/usbhid.h"
+#include "hid-lg.h"
+
+struct lg4ff_device {
+	struct hid_report *report;
+};
+
+static const signed short ff4_wheel_ac[] = {
+	FF_CONSTANT,
+	FF_AUTOCENTER,
+	-1
+};
+
+static int hid_lg4ff_play(struct input_dev *dev, void *data,
+			 struct ff_effect *effect)
+{
+	struct hid_device *hid = input_get_drvdata(dev);
+	struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list;
+	struct hid_report *report = list_entry(report_list->next, struct hid_report, list);
+	int x;
+
+#define CLAMP(x) if (x < 0) x = 0; if (x > 0xff) x = 0xff
+
+	switch (effect->type) {
+	case FF_CONSTANT:
+		x = effect->u.ramp.start_level + 0x80;	/* 0x80 is no force */
+		CLAMP(x);
+		report->field[0]->value[0] = 0x11;	/* Slot 1 */
+		report->field[0]->value[1] = 0x10;
+		report->field[0]->value[2] = x;
+		report->field[0]->value[3] = 0x00;
+		report->field[0]->value[4] = 0x00;
+		report->field[0]->value[5] = 0x08;
+		report->field[0]->value[6] = 0x00;
+		dbg_hid("Autocenter, x=0x%02X\n", x);
+
+		usbhid_submit_report(hid, report, USB_DIR_OUT);
+		break;
+	}
+	return 0;
+}
+
+static void hid_lg4ff_set_autocenter(struct input_dev *dev, u16 magnitude)
+{
+	struct hid_device *hid = input_get_drvdata(dev);
+	struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list;
+	struct hid_report *report = list_entry(report_list->next, struct hid_report, list);
+	__s32 *value = report->field[0]->value;
+
+	*value++ = 0xfe;
+	*value++ = 0x0d;
+	*value++ = 0x07;
+	*value++ = 0x07;
+	*value++ = (magnitude >> 8) & 0xff;
+	*value++ = 0x00;
+	*value = 0x00;
+
+	usbhid_submit_report(hid, report, USB_DIR_OUT);
+}
+
+
+int lg4ff_init(struct hid_device *hid)
+{
+	struct hid_input *hidinput = list_entry(hid->inputs.next, struct hid_input, list);
+	struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list;
+	struct input_dev *dev = hidinput->input;
+	struct hid_report *report;
+	struct hid_field *field;
+	const signed short *ff_bits = ff4_wheel_ac;
+	int error;
+	int i;
+
+	/* Find the report to use */
+	if (list_empty(report_list)) {
+		err_hid("No output report found");
+		return -1;
+	}
+
+	/* Check that the report looks ok */
+	report = list_entry(report_list->next, struct hid_report, list);
+	if (!report) {
+		err_hid("NULL output report");
+		return -1;
+	}
+
+	field = report->field[0];
+	if (!field) {
+		err_hid("NULL field");
+		return -1;
+	}
+
+	for (i = 0; ff_bits[i] >= 0; i++)
+		set_bit(ff_bits[i], dev->ffbit);
+
+	error = input_ff_create_memless(dev, NULL, hid_lg4ff_play);
+
+	if (error)
+		return error;
+
+	if (test_bit(FF_AUTOCENTER, dev->ffbit))
+		dev->ff->set_autocenter = hid_lg4ff_set_autocenter;
+
+	dev_info(&hid->dev, "Force feedback for Logitech Speed Force Wireless by "
+			"Simon Wood <simon@mungewell.org>\n");
+	return 0;
+}
+
diff --git a/drivers/hid/hid-magicmouse.c b/drivers/hid/hid-magicmouse.c
index 319b0e57ee41..e6dc15171664 100644
--- a/drivers/hid/hid-magicmouse.c
+++ b/drivers/hid/hid-magicmouse.c
@@ -2,6 +2,7 @@
  *   Apple "Magic" Wireless Mouse driver
  *
  *   Copyright (c) 2010 Michael Poole <mdpoole@troilus.org>
+ *   Copyright (c) 2010 Chase Douglas <chase.douglas@canonical.com>
  */
 
 /*
@@ -53,7 +54,9 @@ static bool report_undeciphered;
 module_param(report_undeciphered, bool, 0644);
 MODULE_PARM_DESC(report_undeciphered, "Report undeciphered multi-touch state field using a MSC_RAW event");
 
-#define TOUCH_REPORT_ID   0x29
+#define TRACKPAD_REPORT_ID 0x28
+#define MOUSE_REPORT_ID    0x29
+#define DOUBLE_REPORT_ID   0xf7
 /* These definitions are not precise, but they're close enough.  (Bits
  * 0x03 seem to indicate the aspect ratio of the touch, bits 0x70 seem
  * to be some kind of bit mask -- 0x20 may be a near-field reading,
@@ -67,15 +70,19 @@ MODULE_PARM_DESC(report_undeciphered, "Report undeciphered multi-touch state fie
 
 #define SCROLL_ACCEL_DEFAULT 7
 
+/* Single touch emulation should only begin when no touches are currently down.
+ * This is true when single_touch_id is equal to NO_TOUCHES. If multiple touches
+ * are down and the touch providing for single touch emulation is lifted,
+ * single_touch_id is equal to SINGLE_TOUCH_UP. While single touch emulation is
+ * occuring, single_touch_id corresponds with the tracking id of the touch used.
+ */
+#define NO_TOUCHES -1
+#define SINGLE_TOUCH_UP -2
+
 /**
  * struct magicmouse_sc - Tracks Magic Mouse-specific data.
  * @input: Input device through which we report events.
  * @quirks: Currently unused.
- * @last_timestamp: Timestamp from most recent (18-bit) touch report
- *     (units of milliseconds over short windows, but seems to
- *     increase faster when there are no touches).
- * @delta_time: 18-bit difference between the two most recent touch
- *     reports from the mouse.
  * @ntouches: Number of touches in most recent touch report.
  * @scroll_accel: Number of consecutive scroll motions.
  * @scroll_jiffies: Time of last scroll motion.
@@ -86,8 +93,6 @@ struct magicmouse_sc {
 	struct input_dev *input;
 	unsigned long quirks;
 
-	int last_timestamp;
-	int delta_time;
 	int ntouches;
 	int scroll_accel;
 	unsigned long scroll_jiffies;
@@ -98,9 +103,9 @@ struct magicmouse_sc {
 		short scroll_x;
 		short scroll_y;
 		u8 size;
-		u8 down;
 	} touches[16];
 	int tracking_ids[16];
+	int single_touch_id;
 };
 
 static int magicmouse_firm_touch(struct magicmouse_sc *msc)
@@ -166,18 +171,35 @@ static void magicmouse_emit_buttons(struct magicmouse_sc *msc, int state)
 static void magicmouse_emit_touch(struct magicmouse_sc *msc, int raw_id, u8 *tdata)
 {
 	struct input_dev *input = msc->input;
-	__s32 x_y = tdata[0] << 8 | tdata[1] << 16 | tdata[2] << 24;
-	int misc = tdata[5] | tdata[6] << 8;
-	int id = (misc >> 6) & 15;
-	int x = x_y << 12 >> 20;
-	int y = -(x_y >> 20);
-	int down = (tdata[7] & TOUCH_STATE_MASK) != TOUCH_STATE_NONE;
+	int id, x, y, size, orientation, touch_major, touch_minor, state, down;
+
+	if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE) {
+		id = (tdata[6] << 2 | tdata[5] >> 6) & 0xf;
+		x = (tdata[1] << 28 | tdata[0] << 20) >> 20;
+		y = -((tdata[2] << 24 | tdata[1] << 16) >> 20);
+		size = tdata[5] & 0x3f;
+		orientation = (tdata[6] >> 2) - 32;
+		touch_major = tdata[3];
+		touch_minor = tdata[4];
+		state = tdata[7] & TOUCH_STATE_MASK;
+		down = state != TOUCH_STATE_NONE;
+	} else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */
+		id = (tdata[7] << 2 | tdata[6] >> 6) & 0xf;
+		x = (tdata[1] << 27 | tdata[0] << 19) >> 19;
+		y = -((tdata[3] << 30 | tdata[2] << 22 | tdata[1] << 14) >> 19);
+		size = tdata[6] & 0x3f;
+		orientation = (tdata[7] >> 2) - 32;
+		touch_major = tdata[4];
+		touch_minor = tdata[5];
+		state = tdata[8] & TOUCH_STATE_MASK;
+		down = state != TOUCH_STATE_NONE;
+	}
 
 	/* Store tracking ID and other fields. */
 	msc->tracking_ids[raw_id] = id;
 	msc->touches[id].x = x;
 	msc->touches[id].y = y;
-	msc->touches[id].size = misc & 63;
+	msc->touches[id].size = size;
 
 	/* If requested, emulate a scroll wheel by detecting small
 	 * vertical touch motions.
@@ -188,7 +210,7 @@ static void magicmouse_emit_touch(struct magicmouse_sc *msc, int raw_id, u8 *tda
 		int step_y = msc->touches[id].scroll_y - y;
 
 		/* Calculate and apply the scroll motion. */
-		switch (tdata[7] & TOUCH_STATE_MASK) {
+		switch (state) {
 		case TOUCH_STATE_START:
 			msc->touches[id].scroll_x = x;
 			msc->touches[id].scroll_y = y;
@@ -222,21 +244,28 @@ static void magicmouse_emit_touch(struct magicmouse_sc *msc, int raw_id, u8 *tda
 		}
 	}
 
+	if (down) {
+		msc->ntouches++;
+		if (msc->single_touch_id == NO_TOUCHES)
+			msc->single_touch_id = id;
+	} else if (msc->single_touch_id == id)
+		msc->single_touch_id = SINGLE_TOUCH_UP;
+
 	/* Generate the input events for this touch. */
 	if (report_touches && down) {
-		int orientation = (misc >> 10) - 32;
-
-		msc->touches[id].down = 1;
-
 		input_report_abs(input, ABS_MT_TRACKING_ID, id);
-		input_report_abs(input, ABS_MT_TOUCH_MAJOR, tdata[3]);
-		input_report_abs(input, ABS_MT_TOUCH_MINOR, tdata[4]);
+		input_report_abs(input, ABS_MT_TOUCH_MAJOR, touch_major << 2);
+		input_report_abs(input, ABS_MT_TOUCH_MINOR, touch_minor << 2);
 		input_report_abs(input, ABS_MT_ORIENTATION, orientation);
 		input_report_abs(input, ABS_MT_POSITION_X, x);
 		input_report_abs(input, ABS_MT_POSITION_Y, y);
 
-		if (report_undeciphered)
-			input_event(input, EV_MSC, MSC_RAW, tdata[7]);
+		if (report_undeciphered) {
+			if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE)
+				input_event(input, EV_MSC, MSC_RAW, tdata[7]);
+			else /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */
+				input_event(input, EV_MSC, MSC_RAW, tdata[8]);
+		}
 
 		input_mt_sync(input);
 	}
@@ -247,39 +276,43 @@ static int magicmouse_raw_event(struct hid_device *hdev,
 {
 	struct magicmouse_sc *msc = hid_get_drvdata(hdev);
 	struct input_dev *input = msc->input;
-	int x, y, ts, ii, clicks, last_up;
+	int x = 0, y = 0, ii, clicks = 0, npoints;
 
 	switch (data[0]) {
-	case 0x10:
-		if (size != 6)
+	case TRACKPAD_REPORT_ID:
+		/* Expect four bytes of prefix, and N*9 bytes of touch data. */
+		if (size < 4 || ((size - 4) % 9) != 0)
 			return 0;
-		x = (__s16)(data[2] | data[3] << 8);
-		y = (__s16)(data[4] | data[5] << 8);
+		npoints = (size - 4) / 9;
+		msc->ntouches = 0;
+		for (ii = 0; ii < npoints; ii++)
+			magicmouse_emit_touch(msc, ii, data + ii * 9 + 4);
+
+		/* We don't need an MT sync here because trackpad emits a
+		 * BTN_TOUCH event in a new frame when all touches are released.
+		 */
+		if (msc->ntouches == 0)
+			msc->single_touch_id = NO_TOUCHES;
+
 		clicks = data[1];
+
+		/* The following bits provide a device specific timestamp. They
+		 * are unused here.
+		 *
+		 * ts = data[1] >> 6 | data[2] << 2 | data[3] << 10;
+		 */
 		break;
-	case TOUCH_REPORT_ID:
+	case MOUSE_REPORT_ID:
 		/* Expect six bytes of prefix, and N*8 bytes of touch data. */
 		if (size < 6 || ((size - 6) % 8) != 0)
 			return 0;
-		ts = data[3] >> 6 | data[4] << 2 | data[5] << 10;
-		msc->delta_time = (ts - msc->last_timestamp) & 0x3ffff;
-		msc->last_timestamp = ts;
-		msc->ntouches = (size - 6) / 8;
-		for (ii = 0; ii < msc->ntouches; ii++)
+		npoints = (size - 6) / 8;
+		msc->ntouches = 0;
+		for (ii = 0; ii < npoints; ii++)
 			magicmouse_emit_touch(msc, ii, data + ii * 8 + 6);
 
-		if (report_touches) {
-			last_up = 1;
-			for (ii = 0; ii < ARRAY_SIZE(msc->touches); ii++) {
-				if (msc->touches[ii].down) {
-					last_up = 0;
-					msc->touches[ii].down = 0;
-				}
-			}
-			if (last_up) {
-				input_mt_sync(input);
-			}
-		}
+		if (report_touches && msc->ntouches == 0)
+			input_mt_sync(input);
 
 		/* When emulating three-button mode, it is important
 		 * to have the current touch information before
@@ -288,68 +321,72 @@ static int magicmouse_raw_event(struct hid_device *hdev,
 		x = (int)(((data[3] & 0x0c) << 28) | (data[1] << 22)) >> 22;
 		y = (int)(((data[3] & 0x30) << 26) | (data[2] << 22)) >> 22;
 		clicks = data[3];
+
+		/* The following bits provide a device specific timestamp. They
+		 * are unused here.
+		 *
+		 * ts = data[3] >> 6 | data[4] << 2 | data[5] << 10;
+		 */
+		break;
+	case DOUBLE_REPORT_ID:
+		/* Sometimes the trackpad sends two touch reports in one
+		 * packet.
+		 */
+		magicmouse_raw_event(hdev, report, data + 2, data[1]);
+		magicmouse_raw_event(hdev, report, data + 2 + data[1],
+			size - 2 - data[1]);
 		break;
-	case 0x20: /* Theoretically battery status (0-100), but I have
-		    * never seen it -- maybe it is only upon request.
-		    */
-	case 0x60: /* Unknown, maybe laser on/off. */
-	case 0x61: /* Laser reflection status change.
-		    * data[1]: 0 = spotted, 1 = lost
-		    */
 	default:
 		return 0;
 	}
 
-	magicmouse_emit_buttons(msc, clicks & 3);
-	input_report_rel(input, REL_X, x);
-	input_report_rel(input, REL_Y, y);
+	if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE) {
+		magicmouse_emit_buttons(msc, clicks & 3);
+		input_report_rel(input, REL_X, x);
+		input_report_rel(input, REL_Y, y);
+	} else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */
+		input_report_key(input, BTN_MOUSE, clicks & 1);
+		input_report_key(input, BTN_TOUCH, msc->ntouches > 0);
+		input_report_key(input, BTN_TOOL_FINGER, msc->ntouches == 1);
+		input_report_key(input, BTN_TOOL_DOUBLETAP, msc->ntouches == 2);
+		input_report_key(input, BTN_TOOL_TRIPLETAP, msc->ntouches == 3);
+		input_report_key(input, BTN_TOOL_QUADTAP, msc->ntouches == 4);
+		if (msc->single_touch_id >= 0) {
+			input_report_abs(input, ABS_X,
+				msc->touches[msc->single_touch_id].x);
+			input_report_abs(input, ABS_Y,
+				msc->touches[msc->single_touch_id].y);
+		}
+	}
+
 	input_sync(input);
 	return 1;
 }
 
-static int magicmouse_input_open(struct input_dev *dev)
-{
-	struct hid_device *hid = input_get_drvdata(dev);
-
-	return hid->ll_driver->open(hid);
-}
-
-static void magicmouse_input_close(struct input_dev *dev)
-{
-	struct hid_device *hid = input_get_drvdata(dev);
-
-	hid->ll_driver->close(hid);
-}
-
 static void magicmouse_setup_input(struct input_dev *input, struct hid_device *hdev)
 {
-	input_set_drvdata(input, hdev);
-	input->event = hdev->ll_driver->hidinput_input_event;
-	input->open = magicmouse_input_open;
-	input->close = magicmouse_input_close;
-
-	input->name = hdev->name;
-	input->phys = hdev->phys;
-	input->uniq = hdev->uniq;
-	input->id.bustype = hdev->bus;
-	input->id.vendor = hdev->vendor;
-	input->id.product = hdev->product;
-	input->id.version = hdev->version;
-	input->dev.parent = hdev->dev.parent;
-
 	__set_bit(EV_KEY, input->evbit);
-	__set_bit(BTN_LEFT, input->keybit);
-	__set_bit(BTN_RIGHT, input->keybit);
-	if (emulate_3button)
-		__set_bit(BTN_MIDDLE, input->keybit);
-	__set_bit(BTN_TOOL_FINGER, input->keybit);
-
-	__set_bit(EV_REL, input->evbit);
-	__set_bit(REL_X, input->relbit);
-	__set_bit(REL_Y, input->relbit);
-	if (emulate_scroll_wheel) {
-		__set_bit(REL_WHEEL, input->relbit);
-		__set_bit(REL_HWHEEL, input->relbit);
+
+	if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE) {
+		__set_bit(BTN_LEFT, input->keybit);
+		__set_bit(BTN_RIGHT, input->keybit);
+		if (emulate_3button)
+			__set_bit(BTN_MIDDLE, input->keybit);
+
+		__set_bit(EV_REL, input->evbit);
+		__set_bit(REL_X, input->relbit);
+		__set_bit(REL_Y, input->relbit);
+		if (emulate_scroll_wheel) {
+			__set_bit(REL_WHEEL, input->relbit);
+			__set_bit(REL_HWHEEL, input->relbit);
+		}
+	} else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */
+		__set_bit(BTN_MOUSE, input->keybit);
+		__set_bit(BTN_TOOL_FINGER, input->keybit);
+		__set_bit(BTN_TOOL_DOUBLETAP, input->keybit);
+		__set_bit(BTN_TOOL_TRIPLETAP, input->keybit);
+		__set_bit(BTN_TOOL_QUADTAP, input->keybit);
+		__set_bit(BTN_TOUCH, input->keybit);
 	}
 
 	if (report_touches) {
@@ -359,16 +396,26 @@ static void magicmouse_setup_input(struct input_dev *input, struct hid_device *h
 		input_set_abs_params(input, ABS_MT_TOUCH_MAJOR, 0, 255, 4, 0);
 		input_set_abs_params(input, ABS_MT_TOUCH_MINOR, 0, 255, 4, 0);
 		input_set_abs_params(input, ABS_MT_ORIENTATION, -32, 31, 1, 0);
-		input_set_abs_params(input, ABS_MT_POSITION_X, -1100, 1358,
-				4, 0);
+
 		/* Note: Touch Y position from the device is inverted relative
 		 * to how pointer motion is reported (and relative to how USB
 		 * HID recommends the coordinates work).  This driver keeps
 		 * the origin at the same position, and just uses the additive
 		 * inverse of the reported Y.
 		 */
-		input_set_abs_params(input, ABS_MT_POSITION_Y, -1589, 2047,
-				4, 0);
+		if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE) {
+			input_set_abs_params(input, ABS_MT_POSITION_X, -1100,
+				1358, 4, 0);
+			input_set_abs_params(input, ABS_MT_POSITION_Y, -1589,
+				2047, 4, 0);
+		} else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */
+			input_set_abs_params(input, ABS_X, -2909, 3167, 4, 0);
+			input_set_abs_params(input, ABS_Y, -2456, 2565, 4, 0);
+			input_set_abs_params(input, ABS_MT_POSITION_X, -2909,
+				3167, 4, 0);
+			input_set_abs_params(input, ABS_MT_POSITION_Y, -2456,
+				2565, 4, 0);
+		}
 	}
 
 	if (report_undeciphered) {
@@ -377,12 +424,22 @@ static void magicmouse_setup_input(struct input_dev *input, struct hid_device *h
 	}
 }
 
+static int magicmouse_input_mapping(struct hid_device *hdev,
+		struct hid_input *hi, struct hid_field *field,
+		struct hid_usage *usage, unsigned long **bit, int *max)
+{
+	struct magicmouse_sc *msc = hid_get_drvdata(hdev);
+
+	if (!msc->input)
+		msc->input = hi->input;
+
+	return 0;
+}
+
 static int magicmouse_probe(struct hid_device *hdev,
 	const struct hid_device_id *id)
 {
-	__u8 feature_1[] = { 0xd7, 0x01 };
-	__u8 feature_2[] = { 0xf8, 0x01, 0x32 };
-	struct input_dev *input;
+	__u8 feature[] = { 0xd7, 0x01 };
 	struct magicmouse_sc *msc;
 	struct hid_report *report;
 	int ret;
@@ -398,6 +455,8 @@ static int magicmouse_probe(struct hid_device *hdev,
 	msc->quirks = id->driver_data;
 	hid_set_drvdata(hdev, msc);
 
+	msc->single_touch_id = NO_TOUCHES;
+
 	ret = hid_parse(hdev);
 	if (ret) {
 		dev_err(&hdev->dev, "magicmouse hid parse failed\n");
@@ -410,10 +469,22 @@ static int magicmouse_probe(struct hid_device *hdev,
 		goto err_free;
 	}
 
-	/* we are handling the input ourselves */
-	hidinput_disconnect(hdev);
+	/* We do this after hid-input is done parsing reports so that
+	 * hid-input uses the most natural button and axis IDs.
+	 */
+	if (msc->input)
+		magicmouse_setup_input(msc->input, hdev);
+
+	if (id->product == USB_DEVICE_ID_APPLE_MAGICMOUSE)
+		report = hid_register_report(hdev, HID_INPUT_REPORT,
+			MOUSE_REPORT_ID);
+	else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */
+		report = hid_register_report(hdev, HID_INPUT_REPORT,
+			TRACKPAD_REPORT_ID);
+		report = hid_register_report(hdev, HID_INPUT_REPORT,
+			DOUBLE_REPORT_ID);
+	}
 
-	report = hid_register_report(hdev, HID_INPUT_REPORT, TOUCH_REPORT_ID);
 	if (!report) {
 		dev_err(&hdev->dev, "unable to register touch report\n");
 		ret = -ENOMEM;
@@ -421,39 +492,15 @@ static int magicmouse_probe(struct hid_device *hdev,
 	}
 	report->size = 6;
 
-	ret = hdev->hid_output_raw_report(hdev, feature_1, sizeof(feature_1),
+	ret = hdev->hid_output_raw_report(hdev, feature, sizeof(feature),
 			HID_FEATURE_REPORT);
-	if (ret != sizeof(feature_1)) {
-		dev_err(&hdev->dev, "unable to request touch data (1:%d)\n",
-				ret);
-		goto err_stop_hw;
-	}
-	ret = hdev->hid_output_raw_report(hdev, feature_2,
-			sizeof(feature_2), HID_FEATURE_REPORT);
-	if (ret != sizeof(feature_2)) {
-		dev_err(&hdev->dev, "unable to request touch data (2:%d)\n",
+	if (ret != sizeof(feature)) {
+		dev_err(&hdev->dev, "unable to request touch data (%d)\n",
 				ret);
 		goto err_stop_hw;
 	}
 
-	input = input_allocate_device();
-	if (!input) {
-		dev_err(&hdev->dev, "can't alloc input device\n");
-		ret = -ENOMEM;
-		goto err_stop_hw;
-	}
-	magicmouse_setup_input(input, hdev);
-
-	ret = input_register_device(input);
-	if (ret) {
-		dev_err(&hdev->dev, "input device registration failed\n");
-		goto err_input;
-	}
-	msc->input = input;
-
 	return 0;
-err_input:
-	input_free_device(input);
 err_stop_hw:
 	hid_hw_stop(hdev);
 err_free:
@@ -466,13 +513,14 @@ static void magicmouse_remove(struct hid_device *hdev)
 	struct magicmouse_sc *msc = hid_get_drvdata(hdev);
 
 	hid_hw_stop(hdev);
-	input_unregister_device(msc->input);
 	kfree(msc);
 }
 
 static const struct hid_device_id magic_mice[] = {
-	{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE),
-		.driver_data = 0 },
+	{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE,
+		USB_DEVICE_ID_APPLE_MAGICMOUSE), .driver_data = 0 },
+	{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE,
+		USB_DEVICE_ID_APPLE_MAGICTRACKPAD), .driver_data = 0 },
 	{ }
 };
 MODULE_DEVICE_TABLE(hid, magic_mice);
@@ -483,6 +531,7 @@ static struct hid_driver magicmouse_driver = {
 	.probe = magicmouse_probe,
 	.remove = magicmouse_remove,
 	.raw_event = magicmouse_raw_event,
+	.input_mapping = magicmouse_input_mapping,
 };
 
 static int __init magicmouse_init(void)
diff --git a/drivers/hid/hid-microsoft.c b/drivers/hid/hid-microsoft.c
index 359cc447c6c6..dc618c33d0a2 100644
--- a/drivers/hid/hid-microsoft.c
+++ b/drivers/hid/hid-microsoft.c
@@ -33,18 +33,19 @@
  * Microsoft Wireless Desktop Receiver (Model 1028) has
  * 'Usage Min/Max' where it ought to have 'Physical Min/Max'
  */
-static void ms_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *ms_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
 	unsigned long quirks = (unsigned long)hid_get_drvdata(hdev);
 
-	if ((quirks & MS_RDESC) && rsize == 571 && rdesc[557] == 0x19 &&
+	if ((quirks & MS_RDESC) && *rsize == 571 && rdesc[557] == 0x19 &&
 			rdesc[559] == 0x29) {
 		dev_info(&hdev->dev, "fixing up Microsoft Wireless Receiver "
 				"Model 1028 report descriptor\n");
 		rdesc[557] = 0x35;
 		rdesc[559] = 0x45;
 	}
+	return rdesc;
 }
 
 #define ms_map_key_clear(c)	hid_map_usage_clear(hi, usage, bit, max, \
diff --git a/drivers/hid/hid-monterey.c b/drivers/hid/hid-monterey.c
index 2cd05aa244b9..c95c31e2d869 100644
--- a/drivers/hid/hid-monterey.c
+++ b/drivers/hid/hid-monterey.c
@@ -22,14 +22,15 @@
 
 #include "hid-ids.h"
 
-static void mr_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *mr_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
-	if (rsize >= 30 && rdesc[29] == 0x05 && rdesc[30] == 0x09) {
+	if (*rsize >= 30 && rdesc[29] == 0x05 && rdesc[30] == 0x09) {
 		dev_info(&hdev->dev, "fixing up button/consumer in HID report "
 				"descriptor\n");
 		rdesc[30] = 0x0c;
 	}
+	return rdesc;
 }
 
 #define mr_map_key_clear(c)	hid_map_usage_clear(hi, usage, bit, max, \
diff --git a/drivers/hid/hid-ntrig.c b/drivers/hid/hid-ntrig.c
index fb69b8c4953f..69169efa1e16 100644
--- a/drivers/hid/hid-ntrig.c
+++ b/drivers/hid/hid-ntrig.c
@@ -90,6 +90,55 @@ struct ntrig_data {
 };
 
 
+/*
+ * This function converts the 4 byte raw firmware code into
+ * a string containing 5 comma separated numbers.
+ */
+static int ntrig_version_string(unsigned char *raw, char *buf)
+{
+	__u8 a =  (raw[1] & 0x0e) >> 1;
+	__u8 b =  (raw[0] & 0x3c) >> 2;
+	__u8 c = ((raw[0] & 0x03) << 3) | ((raw[3] & 0xe0) >> 5);
+	__u8 d = ((raw[3] & 0x07) << 3) | ((raw[2] & 0xe0) >> 5);
+	__u8 e =   raw[2] & 0x07;
+
+	/*
+	 * As yet unmapped bits:
+	 * 0b11000000 0b11110001 0b00011000 0b00011000
+	 */
+
+	return sprintf(buf, "%u.%u.%u.%u.%u", a, b, c, d, e);
+}
+
+static void ntrig_report_version(struct hid_device *hdev)
+{
+	int ret;
+	char buf[20];
+	struct usb_device *usb_dev = hid_to_usb_dev(hdev);
+	unsigned char *data = kmalloc(8, GFP_KERNEL);
+
+	if (!data)
+		goto err_free;
+
+	ret = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0),
+			      USB_REQ_CLEAR_FEATURE,
+			      USB_TYPE_CLASS | USB_RECIP_INTERFACE |
+			      USB_DIR_IN,
+			      0x30c, 1, data, 8,
+			      USB_CTRL_SET_TIMEOUT);
+
+	if (ret == 8) {
+		ret = ntrig_version_string(&data[2], buf);
+
+		dev_info(&hdev->dev,
+			 "Firmware version: %s (%02x%02x %02x%02x)\n",
+			 buf, data[2], data[3], data[4], data[5]);
+	}
+
+err_free:
+	kfree(data);
+}
+
 static ssize_t show_phys_width(struct device *dev,
 			       struct device_attribute *attr,
 			       char *buf)
@@ -377,8 +426,8 @@ static struct attribute_group ntrig_attribute_group = {
  */
 
 static int ntrig_input_mapping(struct hid_device *hdev, struct hid_input *hi,
-		struct hid_field *field, struct hid_usage *usage,
-		unsigned long **bit, int *max)
+			       struct hid_field *field, struct hid_usage *usage,
+			       unsigned long **bit, int *max)
 {
 	struct ntrig_data *nd = hid_get_drvdata(hdev);
 
@@ -448,13 +497,13 @@ static int ntrig_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 		/* width/height mapped on TouchMajor/TouchMinor/Orientation */
 		case HID_DG_WIDTH:
 			hid_map_usage(hi, usage, bit, max,
-					EV_ABS, ABS_MT_TOUCH_MAJOR);
+				      EV_ABS, ABS_MT_TOUCH_MAJOR);
 			return 1;
 		case HID_DG_HEIGHT:
 			hid_map_usage(hi, usage, bit, max,
-					EV_ABS, ABS_MT_TOUCH_MINOR);
+				      EV_ABS, ABS_MT_TOUCH_MINOR);
 			input_set_abs_params(hi->input, ABS_MT_ORIENTATION,
-					0, 1, 0, 0);
+					     0, 1, 0, 0);
 			return 1;
 		}
 		return 0;
@@ -468,8 +517,8 @@ static int ntrig_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 }
 
 static int ntrig_input_mapped(struct hid_device *hdev, struct hid_input *hi,
-		struct hid_field *field, struct hid_usage *usage,
-		unsigned long **bit, int *max)
+			      struct hid_field *field, struct hid_usage *usage,
+			      unsigned long **bit, int *max)
 {
 	/* No special mappings needed for the pen and single touch */
 	if (field->physical)
@@ -489,7 +538,7 @@ static int ntrig_input_mapped(struct hid_device *hdev, struct hid_input *hi,
  * and call input_mt_sync after each point if necessary
  */
 static int ntrig_event (struct hid_device *hid, struct hid_field *field,
-		                        struct hid_usage *usage, __s32 value)
+			struct hid_usage *usage, __s32 value)
 {
 	struct input_dev *input = field->hidinput->input;
 	struct ntrig_data *nd = hid_get_drvdata(hid);
@@ -848,6 +897,8 @@ static int ntrig_probe(struct hid_device *hdev, const struct hid_device_id *id)
 	if (report)
 		usbhid_submit_report(hdev, report, USB_DIR_OUT);
 
+	ntrig_report_version(hdev);
+
 	ret = sysfs_create_group(&hdev->dev.kobj,
 			&ntrig_attribute_group);
 
@@ -860,7 +911,7 @@ err_free:
 static void ntrig_remove(struct hid_device *hdev)
 {
 	sysfs_remove_group(&hdev->dev.kobj,
-			&ntrig_attribute_group);
+			   &ntrig_attribute_group);
 	hid_hw_stop(hdev);
 	kfree(hid_get_drvdata(hdev));
 }
diff --git a/drivers/hid/hid-ortek.c b/drivers/hid/hid-ortek.c
index aa9a960f73a4..2e79716dca31 100644
--- a/drivers/hid/hid-ortek.c
+++ b/drivers/hid/hid-ortek.c
@@ -19,14 +19,15 @@
 
 #include "hid-ids.h"
 
-static void ortek_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *ortek_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
-	if (rsize >= 56 && rdesc[54] == 0x25 && rdesc[55] == 0x01) {
+	if (*rsize >= 56 && rdesc[54] == 0x25 && rdesc[55] == 0x01) {
 		dev_info(&hdev->dev, "Fixing up Ortek WKB-2000 "
 				"report descriptor.\n");
 		rdesc[55] = 0x92;
 	}
+	return rdesc;
 }
 
 static const struct hid_device_id ortek_devices[] = {
diff --git a/drivers/hid/hid-petalynx.c b/drivers/hid/hid-petalynx.c
index 500fbd0652dc..308d6ae48a3e 100644
--- a/drivers/hid/hid-petalynx.c
+++ b/drivers/hid/hid-petalynx.c
@@ -23,10 +23,10 @@
 #include "hid-ids.h"
 
 /* Petalynx Maxter Remote has maximum for consumer page set too low */
-static void pl_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *pl_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
-	if (rsize >= 60 && rdesc[39] == 0x2a && rdesc[40] == 0xf5 &&
+	if (*rsize >= 60 && rdesc[39] == 0x2a && rdesc[40] == 0xf5 &&
 			rdesc[41] == 0x00 && rdesc[59] == 0x26 &&
 			rdesc[60] == 0xf9 && rdesc[61] == 0x00) {
 		dev_info(&hdev->dev, "fixing up Petalynx Maxter Remote report "
@@ -34,6 +34,7 @@ static void pl_report_fixup(struct hid_device *hdev, __u8 *rdesc,
 		rdesc[60] = 0xfa;
 		rdesc[40] = 0xfa;
 	}
+	return rdesc;
 }
 
 #define pl_map_key_clear(c)	hid_map_usage_clear(hi, usage, bit, max, \
diff --git a/drivers/hid/hid-prodikeys.c b/drivers/hid/hid-prodikeys.c
index 845f428b8090..48eab84f53b5 100644
--- a/drivers/hid/hid-prodikeys.c
+++ b/drivers/hid/hid-prodikeys.c
@@ -740,10 +740,10 @@ int pcmidi_snd_terminate(struct pcmidi_snd *pm)
 /*
  * PC-MIDI report descriptor for report id is wrong.
  */
-static void pk_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *pk_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
-	if (rsize == 178 &&
+	if (*rsize == 178 &&
 	      rdesc[111] == 0x06 && rdesc[112] == 0x00 &&
 	      rdesc[113] == 0xff) {
 		dev_info(&hdev->dev, "fixing up pc-midi keyboard report "
@@ -751,6 +751,7 @@ static void pk_report_fixup(struct hid_device *hdev, __u8 *rdesc,
 
 		rdesc[144] = 0x18; /* report 4: was 0x10 report count */
 	}
+	return rdesc;
 }
 
 static int pk_input_mapping(struct hid_device *hdev, struct hid_input *hi,
diff --git a/drivers/hid/hid-roccat-pyra.c b/drivers/hid/hid-roccat-pyra.c
new file mode 100644
index 000000000000..9bf23047892a
--- /dev/null
+++ b/drivers/hid/hid-roccat-pyra.c
@@ -0,0 +1,968 @@
+/*
+ * Roccat Pyra driver for Linux
+ *
+ * Copyright (c) 2010 Stefan Achatz <erazor_de@users.sourceforge.net>
+ */
+
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * Roccat Pyra is a mobile gamer mouse which comes in wired and wireless
+ * variant. Wireless variant is not tested.
+ * Userland tools can be found at http://sourceforge.net/projects/roccat
+ */
+
+#include <linux/device.h>
+#include <linux/input.h>
+#include <linux/hid.h>
+#include <linux/usb.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "hid-ids.h"
+#include "hid-roccat.h"
+#include "hid-roccat-pyra.h"
+
+static void profile_activated(struct pyra_device *pyra,
+		unsigned int new_profile)
+{
+	pyra->actual_profile = new_profile;
+	pyra->actual_cpi = pyra->profile_settings[pyra->actual_profile].y_cpi;
+}
+
+static int pyra_send_control(struct usb_device *usb_dev, int value,
+		enum pyra_control_requests request)
+{
+	int len;
+	struct pyra_control control;
+
+	if ((request == PYRA_CONTROL_REQUEST_PROFILE_SETTINGS ||
+			request == PYRA_CONTROL_REQUEST_PROFILE_BUTTONS) &&
+			(value < 0 || value > 4))
+		return -EINVAL;
+
+	control.command = PYRA_COMMAND_CONTROL;
+	control.value = value;
+	control.request = request;
+
+	len = usb_control_msg(usb_dev, usb_sndctrlpipe(usb_dev, 0),
+			USB_REQ_SET_CONFIGURATION,
+			USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT,
+			PYRA_USB_COMMAND_CONTROL, 0, (char *)&control,
+			sizeof(struct pyra_control),
+			USB_CTRL_SET_TIMEOUT);
+
+	if (len != sizeof(struct pyra_control))
+		return len;
+
+	return 0;
+}
+
+static int pyra_receive_control_status(struct usb_device *usb_dev)
+{
+	int len;
+	struct pyra_control control;
+
+	do {
+		msleep(10);
+
+		len = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0),
+				USB_REQ_CLEAR_FEATURE,
+				USB_TYPE_CLASS | USB_RECIP_INTERFACE |
+				USB_DIR_IN,
+				PYRA_USB_COMMAND_CONTROL, 0, (char *)&control,
+				sizeof(struct pyra_control),
+				USB_CTRL_SET_TIMEOUT);
+
+		/* requested too early, try again */
+	} while (len == -EPROTO);
+
+	if (len == sizeof(struct pyra_control) &&
+			control.command == PYRA_COMMAND_CONTROL &&
+			control.request == PYRA_CONTROL_REQUEST_STATUS &&
+			control.value == 1)
+			return 0;
+	else {
+		dev_err(&usb_dev->dev, "receive control status: "
+				"unknown response 0x%x 0x%x\n",
+				control.request, control.value);
+		return -EINVAL;
+	}
+}
+
+static int pyra_get_profile_settings(struct usb_device *usb_dev,
+		struct pyra_profile_settings *buf, int number)
+{
+	int retval;
+
+	retval = pyra_send_control(usb_dev, number,
+			PYRA_CONTROL_REQUEST_PROFILE_SETTINGS);
+
+	if (retval)
+		return retval;
+
+	retval = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0),
+			USB_REQ_CLEAR_FEATURE,
+			USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN,
+			PYRA_USB_COMMAND_PROFILE_SETTINGS, 0, (char *)buf,
+			sizeof(struct pyra_profile_settings),
+			USB_CTRL_SET_TIMEOUT);
+
+	if (retval != sizeof(struct pyra_profile_settings))
+		return retval;
+
+	return 0;
+}
+
+static int pyra_get_profile_buttons(struct usb_device *usb_dev,
+		struct pyra_profile_buttons *buf, int number)
+{
+	int retval;
+
+	retval = pyra_send_control(usb_dev, number,
+			PYRA_CONTROL_REQUEST_PROFILE_BUTTONS);
+
+	if (retval)
+		return retval;
+
+	retval = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0),
+			USB_REQ_CLEAR_FEATURE,
+			USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN,
+			PYRA_USB_COMMAND_PROFILE_BUTTONS, 0, (char *)buf,
+			sizeof(struct pyra_profile_buttons),
+			USB_CTRL_SET_TIMEOUT);
+
+	if (retval != sizeof(struct pyra_profile_buttons))
+		return retval;
+
+	return 0;
+}
+
+static int pyra_get_settings(struct usb_device *usb_dev,
+		struct pyra_settings *buf)
+{
+	int len;
+	len = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0),
+			USB_REQ_CLEAR_FEATURE,
+			USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN,
+			PYRA_USB_COMMAND_SETTINGS, 0, buf,
+			sizeof(struct pyra_settings), USB_CTRL_SET_TIMEOUT);
+	if (len != sizeof(struct pyra_settings))
+		return -EIO;
+	return 0;
+}
+
+static int pyra_get_info(struct usb_device *usb_dev, struct pyra_info *buf)
+{
+	int len;
+	len = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0),
+			USB_REQ_CLEAR_FEATURE,
+			USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN,
+			PYRA_USB_COMMAND_INFO, 0, buf,
+			sizeof(struct pyra_info), USB_CTRL_SET_TIMEOUT);
+	if (len != sizeof(struct pyra_info))
+		return -EIO;
+	return 0;
+}
+
+static int pyra_set_profile_settings(struct usb_device *usb_dev,
+		struct pyra_profile_settings const *settings)
+{
+	int len;
+	len = usb_control_msg(usb_dev, usb_sndctrlpipe(usb_dev, 0),
+			USB_REQ_SET_CONFIGURATION,
+			USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT,
+			PYRA_USB_COMMAND_PROFILE_SETTINGS, 0, (char *)settings,
+			sizeof(struct pyra_profile_settings),
+			USB_CTRL_SET_TIMEOUT);
+	if (len != sizeof(struct pyra_profile_settings))
+		return -EIO;
+	if (pyra_receive_control_status(usb_dev))
+		return -EIO;
+	return 0;
+}
+
+static int pyra_set_profile_buttons(struct usb_device *usb_dev,
+		struct pyra_profile_buttons const *buttons)
+{
+	int len;
+	len = usb_control_msg(usb_dev, usb_sndctrlpipe(usb_dev, 0),
+			USB_REQ_SET_CONFIGURATION,
+			USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT,
+			PYRA_USB_COMMAND_PROFILE_BUTTONS, 0, (char *)buttons,
+			sizeof(struct pyra_profile_buttons),
+			USB_CTRL_SET_TIMEOUT);
+	if (len != sizeof(struct pyra_profile_buttons))
+		return -EIO;
+	if (pyra_receive_control_status(usb_dev))
+		return -EIO;
+	return 0;
+}
+
+static int pyra_set_settings(struct usb_device *usb_dev,
+		struct pyra_settings const *settings)
+{
+	int len;
+	len = usb_control_msg(usb_dev, usb_sndctrlpipe(usb_dev, 0),
+			USB_REQ_SET_CONFIGURATION,
+			USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT,
+			PYRA_USB_COMMAND_SETTINGS, 0, (char *)settings,
+			sizeof(struct pyra_settings), USB_CTRL_SET_TIMEOUT);
+	if (len != sizeof(struct pyra_settings))
+		return -EIO;
+	if (pyra_receive_control_status(usb_dev))
+		return -EIO;
+	return 0;
+}
+
+static ssize_t pyra_sysfs_read_profilex_settings(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count, int number)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev));
+
+	if (off >= sizeof(struct pyra_profile_settings))
+		return 0;
+
+	if (off + count > sizeof(struct pyra_profile_settings))
+		count = sizeof(struct pyra_profile_settings) - off;
+
+	mutex_lock(&pyra->pyra_lock);
+	memcpy(buf, ((char const *)&pyra->profile_settings[number]) + off,
+			count);
+	mutex_unlock(&pyra->pyra_lock);
+
+	return count;
+}
+
+static ssize_t pyra_sysfs_read_profile1_settings(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	return pyra_sysfs_read_profilex_settings(fp, kobj,
+			attr, buf, off, count, 0);
+}
+
+static ssize_t pyra_sysfs_read_profile2_settings(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	return pyra_sysfs_read_profilex_settings(fp, kobj,
+			attr, buf, off, count, 1);
+}
+
+static ssize_t pyra_sysfs_read_profile3_settings(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	return pyra_sysfs_read_profilex_settings(fp, kobj,
+			attr, buf, off, count, 2);
+}
+
+static ssize_t pyra_sysfs_read_profile4_settings(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	return pyra_sysfs_read_profilex_settings(fp, kobj,
+			attr, buf, off, count, 3);
+}
+
+static ssize_t pyra_sysfs_read_profile5_settings(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	return pyra_sysfs_read_profilex_settings(fp, kobj,
+			attr, buf, off, count, 4);
+}
+
+static ssize_t pyra_sysfs_read_profilex_buttons(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count, int number)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev));
+
+	if (off >= sizeof(struct pyra_profile_buttons))
+		return 0;
+
+	if (off + count > sizeof(struct pyra_profile_buttons))
+		count = sizeof(struct pyra_profile_buttons) - off;
+
+	mutex_lock(&pyra->pyra_lock);
+	memcpy(buf, ((char const *)&pyra->profile_buttons[number]) + off,
+			count);
+	mutex_unlock(&pyra->pyra_lock);
+
+	return count;
+}
+
+static ssize_t pyra_sysfs_read_profile1_buttons(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	return pyra_sysfs_read_profilex_buttons(fp, kobj,
+			attr, buf, off, count, 0);
+}
+
+static ssize_t pyra_sysfs_read_profile2_buttons(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	return pyra_sysfs_read_profilex_buttons(fp, kobj,
+			attr, buf, off, count, 1);
+}
+
+static ssize_t pyra_sysfs_read_profile3_buttons(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	return pyra_sysfs_read_profilex_buttons(fp, kobj,
+			attr, buf, off, count, 2);
+}
+
+static ssize_t pyra_sysfs_read_profile4_buttons(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	return pyra_sysfs_read_profilex_buttons(fp, kobj,
+			attr, buf, off, count, 3);
+}
+
+static ssize_t pyra_sysfs_read_profile5_buttons(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	return pyra_sysfs_read_profilex_buttons(fp, kobj,
+			attr, buf, off, count, 4);
+}
+
+static ssize_t pyra_sysfs_write_profile_settings(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev));
+	struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev));
+	int retval = 0;
+	int difference;
+	int profile_number;
+	struct pyra_profile_settings *profile_settings;
+
+	if (off != 0 || count != sizeof(struct pyra_profile_settings))
+		return -EINVAL;
+
+	profile_number = ((struct pyra_profile_settings const *)buf)->number;
+	profile_settings = &pyra->profile_settings[profile_number];
+
+	mutex_lock(&pyra->pyra_lock);
+	difference = memcmp(buf, profile_settings,
+			sizeof(struct pyra_profile_settings));
+	if (difference) {
+		retval = pyra_set_profile_settings(usb_dev,
+				(struct pyra_profile_settings const *)buf);
+		if (!retval)
+			memcpy(profile_settings, buf,
+					sizeof(struct pyra_profile_settings));
+	}
+	mutex_unlock(&pyra->pyra_lock);
+
+	if (retval)
+		return retval;
+
+	return sizeof(struct pyra_profile_settings);
+}
+
+static ssize_t pyra_sysfs_write_profile_buttons(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev));
+	struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev));
+	int retval = 0;
+	int difference;
+	int profile_number;
+	struct pyra_profile_buttons *profile_buttons;
+
+	if (off != 0 || count != sizeof(struct pyra_profile_buttons))
+		return -EINVAL;
+
+	profile_number = ((struct pyra_profile_buttons const *)buf)->number;
+	profile_buttons = &pyra->profile_buttons[profile_number];
+
+	mutex_lock(&pyra->pyra_lock);
+	difference = memcmp(buf, profile_buttons,
+			sizeof(struct pyra_profile_buttons));
+	if (difference) {
+		retval = pyra_set_profile_buttons(usb_dev,
+				(struct pyra_profile_buttons const *)buf);
+		if (!retval)
+			memcpy(profile_buttons, buf,
+					sizeof(struct pyra_profile_buttons));
+	}
+	mutex_unlock(&pyra->pyra_lock);
+
+	if (retval)
+		return retval;
+
+	return sizeof(struct pyra_profile_buttons);
+}
+
+static ssize_t pyra_sysfs_read_settings(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev));
+
+	if (off >= sizeof(struct pyra_settings))
+		return 0;
+
+	if (off + count > sizeof(struct pyra_settings))
+		count = sizeof(struct pyra_settings) - off;
+
+	mutex_lock(&pyra->pyra_lock);
+	memcpy(buf, ((char const *)&pyra->settings) + off, count);
+	mutex_unlock(&pyra->pyra_lock);
+
+	return count;
+}
+
+static ssize_t pyra_sysfs_write_settings(struct file *fp,
+		struct kobject *kobj, struct bin_attribute *attr, char *buf,
+		loff_t off, size_t count)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev));
+	struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev));
+	int retval = 0;
+	int difference;
+
+	if (off != 0 || count != sizeof(struct pyra_settings))
+		return -EINVAL;
+
+	mutex_lock(&pyra->pyra_lock);
+	difference = memcmp(buf, &pyra->settings, sizeof(struct pyra_settings));
+	if (difference) {
+		retval = pyra_set_settings(usb_dev,
+				(struct pyra_settings const *)buf);
+		if (!retval)
+			memcpy(&pyra->settings, buf,
+					sizeof(struct pyra_settings));
+	}
+	mutex_unlock(&pyra->pyra_lock);
+
+	if (retval)
+		return retval;
+
+	profile_activated(pyra, pyra->settings.startup_profile);
+
+	return sizeof(struct pyra_settings);
+}
+
+
+static ssize_t pyra_sysfs_show_actual_cpi(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev));
+	return snprintf(buf, PAGE_SIZE, "%d\n", pyra->actual_cpi);
+}
+
+static ssize_t pyra_sysfs_show_actual_profile(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev));
+	return snprintf(buf, PAGE_SIZE, "%d\n", pyra->actual_profile);
+}
+
+static ssize_t pyra_sysfs_show_firmware_version(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev));
+	return snprintf(buf, PAGE_SIZE, "%d\n", pyra->firmware_version);
+}
+
+static ssize_t pyra_sysfs_show_startup_profile(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev));
+	return snprintf(buf, PAGE_SIZE, "%d\n", pyra->settings.startup_profile);
+}
+
+static DEVICE_ATTR(actual_cpi, 0440, pyra_sysfs_show_actual_cpi, NULL);
+
+static DEVICE_ATTR(actual_profile, 0440, pyra_sysfs_show_actual_profile, NULL);
+
+static DEVICE_ATTR(firmware_version, 0440,
+		pyra_sysfs_show_firmware_version, NULL);
+
+static DEVICE_ATTR(startup_profile, 0440,
+		pyra_sysfs_show_startup_profile, NULL);
+
+static struct attribute *pyra_attributes[] = {
+		&dev_attr_actual_cpi.attr,
+		&dev_attr_actual_profile.attr,
+		&dev_attr_firmware_version.attr,
+		&dev_attr_startup_profile.attr,
+		NULL
+};
+
+static struct attribute_group pyra_attribute_group = {
+		.attrs = pyra_attributes
+};
+
+static struct bin_attribute pyra_profile_settings_attr = {
+		.attr = { .name = "profile_settings", .mode = 0220 },
+		.size = sizeof(struct pyra_profile_settings),
+		.write = pyra_sysfs_write_profile_settings
+};
+
+static struct bin_attribute pyra_profile1_settings_attr = {
+		.attr = { .name = "profile1_settings", .mode = 0440 },
+		.size = sizeof(struct pyra_profile_settings),
+		.read = pyra_sysfs_read_profile1_settings
+};
+
+static struct bin_attribute pyra_profile2_settings_attr = {
+		.attr = { .name = "profile2_settings", .mode = 0440 },
+		.size = sizeof(struct pyra_profile_settings),
+		.read = pyra_sysfs_read_profile2_settings
+};
+
+static struct bin_attribute pyra_profile3_settings_attr = {
+		.attr = { .name = "profile3_settings", .mode = 0440 },
+		.size = sizeof(struct pyra_profile_settings),
+		.read = pyra_sysfs_read_profile3_settings
+};
+
+static struct bin_attribute pyra_profile4_settings_attr = {
+		.attr = { .name = "profile4_settings", .mode = 0440 },
+		.size = sizeof(struct pyra_profile_settings),
+		.read = pyra_sysfs_read_profile4_settings
+};
+
+static struct bin_attribute pyra_profile5_settings_attr = {
+		.attr = { .name = "profile5_settings", .mode = 0440 },
+		.size = sizeof(struct pyra_profile_settings),
+		.read = pyra_sysfs_read_profile5_settings
+};
+
+static struct bin_attribute pyra_profile_buttons_attr = {
+		.attr = { .name = "profile_buttons", .mode = 0220 },
+		.size = sizeof(struct pyra_profile_buttons),
+		.write = pyra_sysfs_write_profile_buttons
+};
+
+static struct bin_attribute pyra_profile1_buttons_attr = {
+		.attr = { .name = "profile1_buttons", .mode = 0440 },
+		.size = sizeof(struct pyra_profile_buttons),
+		.read = pyra_sysfs_read_profile1_buttons
+};
+
+static struct bin_attribute pyra_profile2_buttons_attr = {
+		.attr = { .name = "profile2_buttons", .mode = 0440 },
+		.size = sizeof(struct pyra_profile_buttons),
+		.read = pyra_sysfs_read_profile2_buttons
+};
+
+static struct bin_attribute pyra_profile3_buttons_attr = {
+		.attr = { .name = "profile3_buttons", .mode = 0440 },
+		.size = sizeof(struct pyra_profile_buttons),
+		.read = pyra_sysfs_read_profile3_buttons
+};
+
+static struct bin_attribute pyra_profile4_buttons_attr = {
+		.attr = { .name = "profile4_buttons", .mode = 0440 },
+		.size = sizeof(struct pyra_profile_buttons),
+		.read = pyra_sysfs_read_profile4_buttons
+};
+
+static struct bin_attribute pyra_profile5_buttons_attr = {
+		.attr = { .name = "profile5_buttons", .mode = 0440 },
+		.size = sizeof(struct pyra_profile_buttons),
+		.read = pyra_sysfs_read_profile5_buttons
+};
+
+static struct bin_attribute pyra_settings_attr = {
+		.attr = { .name = "settings", .mode = 0660 },
+		.size = sizeof(struct pyra_settings),
+		.read = pyra_sysfs_read_settings,
+		.write = pyra_sysfs_write_settings
+};
+
+static int pyra_create_sysfs_attributes(struct usb_interface *intf)
+{
+	int retval;
+
+	retval = sysfs_create_group(&intf->dev.kobj, &pyra_attribute_group);
+	if (retval)
+		goto exit_1;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile_settings_attr);
+	if (retval)
+		goto exit_2;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile1_settings_attr);
+	if (retval)
+		goto exit_3;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile2_settings_attr);
+	if (retval)
+		goto exit_4;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile3_settings_attr);
+	if (retval)
+		goto exit_5;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile4_settings_attr);
+	if (retval)
+		goto exit_6;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile5_settings_attr);
+	if (retval)
+		goto exit_7;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile_buttons_attr);
+	if (retval)
+		goto exit_8;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile1_buttons_attr);
+	if (retval)
+		goto exit_9;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile2_buttons_attr);
+	if (retval)
+		goto exit_10;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile3_buttons_attr);
+	if (retval)
+		goto exit_11;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile4_buttons_attr);
+	if (retval)
+		goto exit_12;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_profile5_buttons_attr);
+	if (retval)
+		goto exit_13;
+
+	retval = sysfs_create_bin_file(&intf->dev.kobj,
+			&pyra_settings_attr);
+	if (retval)
+		goto exit_14;
+
+	return 0;
+
+exit_14:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile5_buttons_attr);
+exit_13:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile4_buttons_attr);
+exit_12:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile3_buttons_attr);
+exit_11:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile2_buttons_attr);
+exit_10:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile1_buttons_attr);
+exit_9:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile_buttons_attr);
+exit_8:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile5_settings_attr);
+exit_7:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile4_settings_attr);
+exit_6:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile3_settings_attr);
+exit_5:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile2_settings_attr);
+exit_4:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile1_settings_attr);
+exit_3:
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile_settings_attr);
+exit_2:
+	sysfs_remove_group(&intf->dev.kobj, &pyra_attribute_group);
+exit_1:
+	return retval;
+}
+
+static void pyra_remove_sysfs_attributes(struct usb_interface *intf)
+{
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_settings_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile5_buttons_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile4_buttons_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile3_buttons_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile2_buttons_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile1_buttons_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile_buttons_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile5_settings_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile4_settings_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile3_settings_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile2_settings_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile1_settings_attr);
+	sysfs_remove_bin_file(&intf->dev.kobj, &pyra_profile_settings_attr);
+	sysfs_remove_group(&intf->dev.kobj, &pyra_attribute_group);
+}
+
+static int pyra_init_pyra_device_struct(struct usb_device *usb_dev,
+		struct pyra_device *pyra)
+{
+	struct pyra_info *info;
+	int retval, i;
+
+	mutex_init(&pyra->pyra_lock);
+
+	info = kmalloc(sizeof(struct pyra_info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+	retval = pyra_get_info(usb_dev, info);
+	if (retval) {
+		kfree(info);
+		return retval;
+	}
+	pyra->firmware_version = info->firmware_version;
+	kfree(info);
+
+	retval = pyra_get_settings(usb_dev, &pyra->settings);
+	if (retval)
+		return retval;
+
+	for (i = 0; i < 5; ++i) {
+		retval = pyra_get_profile_settings(usb_dev,
+				&pyra->profile_settings[i], i);
+		if (retval)
+			return retval;
+
+		retval = pyra_get_profile_buttons(usb_dev,
+				&pyra->profile_buttons[i], i);
+		if (retval)
+			return retval;
+	}
+
+	profile_activated(pyra, pyra->settings.startup_profile);
+
+	return 0;
+}
+
+static int pyra_init_specials(struct hid_device *hdev)
+{
+	struct usb_interface *intf = to_usb_interface(hdev->dev.parent);
+	struct usb_device *usb_dev = interface_to_usbdev(intf);
+	struct pyra_device *pyra;
+	int retval;
+
+	if (intf->cur_altsetting->desc.bInterfaceProtocol
+			== USB_INTERFACE_PROTOCOL_MOUSE) {
+
+		pyra = kzalloc(sizeof(*pyra), GFP_KERNEL);
+		if (!pyra) {
+			dev_err(&hdev->dev, "can't alloc device descriptor\n");
+			return -ENOMEM;
+		}
+		hid_set_drvdata(hdev, pyra);
+
+		retval = pyra_init_pyra_device_struct(usb_dev, pyra);
+		if (retval) {
+			dev_err(&hdev->dev,
+					"couldn't init struct pyra_device\n");
+			goto exit_free;
+		}
+
+		retval = roccat_connect(hdev);
+		if (retval < 0) {
+			dev_err(&hdev->dev, "couldn't init char dev\n");
+		} else {
+			pyra->chrdev_minor = retval;
+			pyra->roccat_claimed = 1;
+		}
+
+		retval = pyra_create_sysfs_attributes(intf);
+		if (retval) {
+			dev_err(&hdev->dev, "cannot create sysfs files\n");
+			goto exit_free;
+		}
+	} else {
+		hid_set_drvdata(hdev, NULL);
+	}
+
+	return 0;
+exit_free:
+	kfree(pyra);
+	return retval;
+}
+
+static void pyra_remove_specials(struct hid_device *hdev)
+{
+	struct usb_interface *intf = to_usb_interface(hdev->dev.parent);
+	struct pyra_device *pyra;
+
+	if (intf->cur_altsetting->desc.bInterfaceProtocol
+			== USB_INTERFACE_PROTOCOL_MOUSE) {
+		pyra_remove_sysfs_attributes(intf);
+		pyra = hid_get_drvdata(hdev);
+		if (pyra->roccat_claimed)
+			roccat_disconnect(pyra->chrdev_minor);
+		kfree(hid_get_drvdata(hdev));
+	}
+}
+
+static int pyra_probe(struct hid_device *hdev, const struct hid_device_id *id)
+{
+	int retval;
+
+	retval = hid_parse(hdev);
+	if (retval) {
+		dev_err(&hdev->dev, "parse failed\n");
+		goto exit;
+	}
+
+	retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT);
+	if (retval) {
+		dev_err(&hdev->dev, "hw start failed\n");
+		goto exit;
+	}
+
+	retval = pyra_init_specials(hdev);
+	if (retval) {
+		dev_err(&hdev->dev, "couldn't install mouse\n");
+		goto exit_stop;
+	}
+	return 0;
+
+exit_stop:
+	hid_hw_stop(hdev);
+exit:
+	return retval;
+}
+
+static void pyra_remove(struct hid_device *hdev)
+{
+	pyra_remove_specials(hdev);
+	hid_hw_stop(hdev);
+}
+
+static void pyra_keep_values_up_to_date(struct pyra_device *pyra,
+		u8 const *data)
+{
+	struct pyra_mouse_event_button const *button_event;
+
+	switch (data[0]) {
+	case PYRA_MOUSE_REPORT_NUMBER_BUTTON:
+		button_event = (struct pyra_mouse_event_button const *)data;
+		switch (button_event->type) {
+		case PYRA_MOUSE_EVENT_BUTTON_TYPE_PROFILE_2:
+			profile_activated(pyra, button_event->data1 - 1);
+			break;
+		case PYRA_MOUSE_EVENT_BUTTON_TYPE_CPI:
+			pyra->actual_cpi = button_event->data1;
+			break;
+		}
+		break;
+	}
+}
+
+static void pyra_report_to_chrdev(struct pyra_device const *pyra,
+		u8 const *data)
+{
+	struct pyra_roccat_report roccat_report;
+	struct pyra_mouse_event_button const *button_event;
+
+	if (data[0] != PYRA_MOUSE_REPORT_NUMBER_BUTTON)
+		return;
+
+	button_event = (struct pyra_mouse_event_button const *)data;
+
+	switch (button_event->type) {
+	case PYRA_MOUSE_EVENT_BUTTON_TYPE_PROFILE_2:
+	case PYRA_MOUSE_EVENT_BUTTON_TYPE_CPI:
+		roccat_report.type = button_event->type;
+		roccat_report.value = button_event->data1;
+		roccat_report.key = 0;
+		roccat_report_event(pyra->chrdev_minor,
+				(uint8_t const *)&roccat_report,
+				sizeof(struct pyra_roccat_report));
+		break;
+	case PYRA_MOUSE_EVENT_BUTTON_TYPE_MACRO:
+	case PYRA_MOUSE_EVENT_BUTTON_TYPE_SHORTCUT:
+	case PYRA_MOUSE_EVENT_BUTTON_TYPE_QUICKLAUNCH:
+		if (button_event->data2 == PYRA_MOUSE_EVENT_BUTTON_PRESS) {
+			roccat_report.type = button_event->type;
+			roccat_report.key = button_event->data1;
+			/*
+			 * pyra reports profile numbers with range 1-5.
+			 * Keeping this behaviour.
+			 */
+			roccat_report.value = pyra->actual_profile + 1;
+			roccat_report_event(pyra->chrdev_minor,
+					(uint8_t const *)&roccat_report,
+					sizeof(struct pyra_roccat_report));
+		}
+		break;
+	}
+}
+
+static int pyra_raw_event(struct hid_device *hdev, struct hid_report *report,
+		u8 *data, int size)
+{
+	struct usb_interface *intf = to_usb_interface(hdev->dev.parent);
+	struct pyra_device *pyra = hid_get_drvdata(hdev);
+
+	if (intf->cur_altsetting->desc.bInterfaceProtocol
+			!= USB_INTERFACE_PROTOCOL_MOUSE)
+		return 0;
+
+	pyra_keep_values_up_to_date(pyra, data);
+
+	if (pyra->roccat_claimed)
+		pyra_report_to_chrdev(pyra, data);
+
+	return 0;
+}
+
+static const struct hid_device_id pyra_devices[] = {
+	{ HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT,
+			USB_DEVICE_ID_ROCCAT_PYRA_WIRED) },
+	/* TODO add USB_DEVICE_ID_ROCCAT_PYRA_WIRELESS after testing */
+	{ }
+};
+
+MODULE_DEVICE_TABLE(hid, pyra_devices);
+
+static struct hid_driver pyra_driver = {
+		.name = "pyra",
+		.id_table = pyra_devices,
+		.probe = pyra_probe,
+		.remove = pyra_remove,
+		.raw_event = pyra_raw_event
+};
+
+static int __init pyra_init(void)
+{
+	return hid_register_driver(&pyra_driver);
+}
+
+static void __exit pyra_exit(void)
+{
+	hid_unregister_driver(&pyra_driver);
+}
+
+module_init(pyra_init);
+module_exit(pyra_exit);
+
+MODULE_AUTHOR("Stefan Achatz");
+MODULE_DESCRIPTION("USB Roccat Pyra driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/hid/hid-roccat-pyra.h b/drivers/hid/hid-roccat-pyra.h
new file mode 100644
index 000000000000..22f80a8f26f9
--- /dev/null
+++ b/drivers/hid/hid-roccat-pyra.h
@@ -0,0 +1,186 @@
+#ifndef __HID_ROCCAT_PYRA_H
+#define __HID_ROCCAT_PYRA_H
+
+/*
+ * Copyright (c) 2010 Stefan Achatz <erazor_de@users.sourceforge.net>
+ */
+
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/types.h>
+
+#pragma pack(push)
+#pragma pack(1)
+
+struct pyra_b {
+	uint8_t command; /* PYRA_COMMAND_B */
+	uint8_t size; /* always 3 */
+	uint8_t unknown; /* 1 */
+};
+
+struct pyra_control {
+	uint8_t command; /* PYRA_COMMAND_CONTROL */
+	/*
+	 * value is profile number for request_settings and request_buttons
+	 * 1 if status ok for request_status
+	 */
+	uint8_t value; /* Range 0-4 */
+	uint8_t request;
+};
+
+enum pyra_control_requests {
+	PYRA_CONTROL_REQUEST_STATUS = 0x00,
+	PYRA_CONTROL_REQUEST_PROFILE_SETTINGS = 0x10,
+	PYRA_CONTROL_REQUEST_PROFILE_BUTTONS = 0x20
+};
+
+struct pyra_settings {
+	uint8_t command; /* PYRA_COMMAND_SETTINGS */
+	uint8_t size; /* always 3 */
+	uint8_t startup_profile; /* Range 0-4! */
+};
+
+struct pyra_profile_settings {
+	uint8_t command; /* PYRA_COMMAND_PROFILE_SETTINGS */
+	uint8_t size; /* always 0xd */
+	uint8_t number; /* Range 0-4 */
+	uint8_t xysync;
+	uint8_t x_sensitivity; /* 0x1-0xa */
+	uint8_t y_sensitivity;
+	uint8_t x_cpi; /* unused */
+	uint8_t y_cpi; /* this value is for x and y */
+	uint8_t lightswitch; /* 0 = off, 1 = on */
+	uint8_t light_effect;
+	uint8_t handedness;
+	uint16_t checksum; /* byte sum */
+};
+
+struct pyra_profile_buttons {
+	uint8_t command; /* PYRA_COMMAND_PROFILE_BUTTONS */
+	uint8_t size; /* always 0x13 */
+	uint8_t number; /* Range 0-4 */
+	uint8_t buttons[14];
+	uint16_t checksum; /* byte sum */
+};
+
+struct pyra_info {
+	uint8_t command; /* PYRA_COMMAND_INFO */
+	uint8_t size; /* always 6 */
+	uint8_t firmware_version;
+	uint8_t unknown1; /* always 0 */
+	uint8_t unknown2; /* always 1 */
+	uint8_t unknown3; /* always 0 */
+};
+
+enum pyra_commands {
+	PYRA_COMMAND_CONTROL = 0x4,
+	PYRA_COMMAND_SETTINGS = 0x5,
+	PYRA_COMMAND_PROFILE_SETTINGS = 0x6,
+	PYRA_COMMAND_PROFILE_BUTTONS = 0x7,
+	PYRA_COMMAND_INFO = 0x9,
+	PYRA_COMMAND_B = 0xb
+};
+
+enum pyra_usb_commands {
+	PYRA_USB_COMMAND_CONTROL = 0x304,
+	PYRA_USB_COMMAND_SETTINGS = 0x305,
+	PYRA_USB_COMMAND_PROFILE_SETTINGS = 0x306,
+	PYRA_USB_COMMAND_PROFILE_BUTTONS = 0x307,
+	PYRA_USB_COMMAND_INFO = 0x309,
+	PYRA_USB_COMMAND_B = 0x30b /* writes 3 bytes */
+};
+
+enum pyra_mouse_report_numbers {
+	PYRA_MOUSE_REPORT_NUMBER_HID = 1,
+	PYRA_MOUSE_REPORT_NUMBER_AUDIO = 2,
+	PYRA_MOUSE_REPORT_NUMBER_BUTTON = 3,
+};
+
+struct pyra_mouse_event_button {
+	uint8_t report_number; /* always 3 */
+	uint8_t unknown; /* always 0 */
+	uint8_t type;
+	uint8_t data1;
+	uint8_t data2;
+};
+
+struct pyra_mouse_event_audio {
+	uint8_t report_number; /* always 2 */
+	uint8_t type;
+	uint8_t unused; /* always 0 */
+};
+
+/* hid audio controls */
+enum pyra_mouse_event_audio_types {
+	PYRA_MOUSE_EVENT_AUDIO_TYPE_MUTE = 0xe2,
+	PYRA_MOUSE_EVENT_AUDIO_TYPE_VOLUME_UP = 0xe9,
+	PYRA_MOUSE_EVENT_AUDIO_TYPE_VOLUME_DOWN = 0xea,
+};
+
+enum pyra_mouse_event_button_types {
+	/*
+	 * Mouse sends tilt events on report_number 1 and 3
+	 * Tilt events are sent repeatedly with 0.94s between first and second
+	 * event and 0.22s on subsequent
+	 */
+	PYRA_MOUSE_EVENT_BUTTON_TYPE_TILT = 0x10,
+
+	/*
+	 * These are sent sequentially
+	 * data1 contains new profile number in range 1-5
+	 */
+	PYRA_MOUSE_EVENT_BUTTON_TYPE_PROFILE_1 = 0x20,
+	PYRA_MOUSE_EVENT_BUTTON_TYPE_PROFILE_2 = 0x30,
+
+	/*
+	 * data1 = button_number (rmp index)
+	 * data2 = pressed/released
+	 */
+	PYRA_MOUSE_EVENT_BUTTON_TYPE_MACRO = 0x40,
+	PYRA_MOUSE_EVENT_BUTTON_TYPE_SHORTCUT = 0x50,
+
+	/*
+	 * data1 = button_number (rmp index)
+	 */
+	PYRA_MOUSE_EVENT_BUTTON_TYPE_QUICKLAUNCH = 0x60,
+
+	/* data1 = new cpi */
+	PYRA_MOUSE_EVENT_BUTTON_TYPE_CPI = 0xb0,
+
+	/* data1 and data2 = new sensitivity */
+	PYRA_MOUSE_EVENT_BUTTON_TYPE_SENSITIVITY = 0xc0,
+
+	PYRA_MOUSE_EVENT_BUTTON_TYPE_MULTIMEDIA = 0xf0,
+};
+
+enum {
+	PYRA_MOUSE_EVENT_BUTTON_PRESS = 0,
+	PYRA_MOUSE_EVENT_BUTTON_RELEASE = 1,
+};
+
+struct pyra_roccat_report {
+	uint8_t type;
+	uint8_t value;
+	uint8_t key;
+};
+
+#pragma pack(pop)
+
+struct pyra_device {
+	int actual_profile;
+	int actual_cpi;
+	int firmware_version;
+	int roccat_claimed;
+	int chrdev_minor;
+	struct mutex pyra_lock;
+	struct pyra_settings settings;
+	struct pyra_profile_settings profile_settings[5];
+	struct pyra_profile_buttons profile_buttons[5];
+};
+
+#endif
diff --git a/drivers/hid/hid-samsung.c b/drivers/hid/hid-samsung.c
index bda0fd60c98d..35894444e000 100644
--- a/drivers/hid/hid-samsung.c
+++ b/drivers/hid/hid-samsung.c
@@ -61,10 +61,10 @@ static inline void samsung_irda_dev_trace(struct hid_device *hdev,
 			"descriptor\n", rsize);
 }
 
-static void samsung_irda_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *samsung_irda_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
-	if (rsize == 184 && rdesc[175] == 0x25 && rdesc[176] == 0x40 &&
+	if (*rsize == 184 && rdesc[175] == 0x25 && rdesc[176] == 0x40 &&
 			rdesc[177] == 0x75 && rdesc[178] == 0x30 &&
 			rdesc[179] == 0x95 && rdesc[180] == 0x01 &&
 			rdesc[182] == 0x40) {
@@ -74,24 +74,25 @@ static void samsung_irda_report_fixup(struct hid_device *hdev, __u8 *rdesc,
 		rdesc[180] = 0x06;
 		rdesc[182] = 0x42;
 	} else
-	if (rsize == 203 && rdesc[192] == 0x15 && rdesc[193] == 0x0 &&
+	if (*rsize == 203 && rdesc[192] == 0x15 && rdesc[193] == 0x0 &&
 			rdesc[194] == 0x25 && rdesc[195] == 0x12) {
 		samsung_irda_dev_trace(hdev, 203);
 		rdesc[193] = 0x1;
 		rdesc[195] = 0xf;
 	} else
-	if (rsize == 135 && rdesc[124] == 0x15 && rdesc[125] == 0x0 &&
+	if (*rsize == 135 && rdesc[124] == 0x15 && rdesc[125] == 0x0 &&
 			rdesc[126] == 0x25 && rdesc[127] == 0x11) {
 		samsung_irda_dev_trace(hdev, 135);
 		rdesc[125] = 0x1;
 		rdesc[127] = 0xe;
 	} else
-	if (rsize == 171 && rdesc[160] == 0x15 && rdesc[161] == 0x0 &&
+	if (*rsize == 171 && rdesc[160] == 0x15 && rdesc[161] == 0x0 &&
 			rdesc[162] == 0x25 && rdesc[163] == 0x01) {
 		samsung_irda_dev_trace(hdev, 171);
 		rdesc[161] = 0x1;
 		rdesc[163] = 0x3;
 	}
+	return rdesc;
 }
 
 #define samsung_kbd_mouse_map_key_clear(c) \
@@ -130,11 +131,12 @@ static int samsung_kbd_mouse_input_mapping(struct hid_device *hdev,
 	return 1;
 }
 
-static void samsung_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-	unsigned int rsize)
+static __u8 *samsung_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+	unsigned int *rsize)
 {
 	if (USB_DEVICE_ID_SAMSUNG_IR_REMOTE == hdev->product)
-		samsung_irda_report_fixup(hdev, rdesc, rsize);
+		rdesc = samsung_irda_report_fixup(hdev, rdesc, rsize);
+	return rdesc;
 }
 
 static int samsung_input_mapping(struct hid_device *hdev, struct hid_input *hi,
diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c
index 402d5574b574..677bb3da10e8 100644
--- a/drivers/hid/hid-sony.c
+++ b/drivers/hid/hid-sony.c
@@ -24,24 +24,46 @@
 
 #include "hid-ids.h"
 
-#define VAIO_RDESC_CONSTANT 0x0001
+#define VAIO_RDESC_CONSTANT     (1 << 0)
+#define SIXAXIS_CONTROLLER_USB  (1 << 1)
+#define SIXAXIS_CONTROLLER_BT   (1 << 2)
 
 struct sony_sc {
 	unsigned long quirks;
 };
 
 /* Sony Vaio VGX has wrongly mouse pointer declared as constant */
-static void sony_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *sony_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
 	struct sony_sc *sc = hid_get_drvdata(hdev);
 
 	if ((sc->quirks & VAIO_RDESC_CONSTANT) &&
-			rsize >= 56 && rdesc[54] == 0x81 && rdesc[55] == 0x07) {
+			*rsize >= 56 && rdesc[54] == 0x81 && rdesc[55] == 0x07) {
 		dev_info(&hdev->dev, "Fixing up Sony Vaio VGX report "
 				"descriptor\n");
 		rdesc[55] = 0x06;
 	}
+	return rdesc;
+}
+
+static int sixaxis_usb_output_raw_report(struct hid_device *hid, __u8 *buf,
+		size_t count, unsigned char report_type)
+{
+	struct usb_interface *intf = to_usb_interface(hid->dev.parent);
+	struct usb_device *dev = interface_to_usbdev(intf);
+	struct usb_host_interface *interface = intf->cur_altsetting;
+	int report_id = buf[0];
+	int ret;
+
+	ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
+		HID_REQ_SET_REPORT,
+		USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE,
+		((report_type + 1) << 8) | report_id,
+		interface->desc.bInterfaceNumber, buf, count,
+		USB_CTRL_SET_TIMEOUT);
+
+	return ret;
 }
 
 /*
@@ -49,7 +71,7 @@ static void sony_report_fixup(struct hid_device *hdev, __u8 *rdesc,
  * to "operational".  Without this, the ps3 controller will not report any
  * events.
  */
-static int sony_set_operational_usb(struct hid_device *hdev)
+static int sixaxis_set_operational_usb(struct hid_device *hdev)
 {
 	struct usb_interface *intf = to_usb_interface(hdev->dev.parent);
 	struct usb_device *dev = interface_to_usbdev(intf);
@@ -74,7 +96,7 @@ static int sony_set_operational_usb(struct hid_device *hdev)
 	return ret;
 }
 
-static int sony_set_operational_bt(struct hid_device *hdev)
+static int sixaxis_set_operational_bt(struct hid_device *hdev)
 {
 	unsigned char buf[] = { 0xf4,  0x42, 0x03, 0x00, 0x00 };
 	return hdev->hid_output_raw_report(hdev, buf, sizeof(buf), HID_FEATURE_REPORT);
@@ -108,16 +130,14 @@ static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id)
 		goto err_free;
 	}
 
-	switch (hdev->bus) {
-	case BUS_USB:
-		ret = sony_set_operational_usb(hdev);
-		break;
-	case BUS_BLUETOOTH:
-		ret = sony_set_operational_bt(hdev);
-		break;
-	default:
-		ret = 0;
+	if (sc->quirks & SIXAXIS_CONTROLLER_USB) {
+		hdev->hid_output_raw_report = sixaxis_usb_output_raw_report;
+		ret = sixaxis_set_operational_usb(hdev);
 	}
+	else if (sc->quirks & SIXAXIS_CONTROLLER_BT)
+		ret = sixaxis_set_operational_bt(hdev);
+	else
+		ret = 0;
 
 	if (ret < 0)
 		goto err_stop;
@@ -137,8 +157,10 @@ static void sony_remove(struct hid_device *hdev)
 }
 
 static const struct hid_device_id sony_devices[] = {
-	{ HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER) },
-	{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER),
+		.driver_data = SIXAXIS_CONTROLLER_USB },
+	{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER),
+		.driver_data = SIXAXIS_CONTROLLER_BT },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_VAIO_VGX_MOUSE),
 		.driver_data = VAIO_RDESC_CONSTANT },
 	{ }
diff --git a/drivers/hid/hid-stantum.c b/drivers/hid/hid-stantum.c
index 90df886c5e04..3171be28c3d5 100644
--- a/drivers/hid/hid-stantum.c
+++ b/drivers/hid/hid-stantum.c
@@ -249,6 +249,8 @@ static void stantum_remove(struct hid_device *hdev)
 
 static const struct hid_device_id stantum_devices[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_STANTUM, USB_DEVICE_ID_MTP) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_STANTUM_STM, USB_DEVICE_ID_MTP_STM) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_STANTUM_SITRONIX, USB_DEVICE_ID_MTP_SITRONIX) },
 	{ }
 };
 MODULE_DEVICE_TABLE(hid, stantum_devices);
diff --git a/drivers/hid/hid-sunplus.c b/drivers/hid/hid-sunplus.c
index 438107d9f1b2..164ed568f6cf 100644
--- a/drivers/hid/hid-sunplus.c
+++ b/drivers/hid/hid-sunplus.c
@@ -22,16 +22,17 @@
 
 #include "hid-ids.h"
 
-static void sp_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-		unsigned int rsize)
+static __u8 *sp_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
 {
-	if (rsize >= 107 && rdesc[104] == 0x26 && rdesc[105] == 0x80 &&
+	if (*rsize >= 107 && rdesc[104] == 0x26 && rdesc[105] == 0x80 &&
 			rdesc[106] == 0x03) {
 		dev_info(&hdev->dev, "fixing up Sunplus Wireless Desktop "
 				"report descriptor\n");
 		rdesc[105] = rdesc[110] = 0x03;
 		rdesc[106] = rdesc[111] = 0x21;
 	}
+	return rdesc;
 }
 
 #define sp_map_key_clear(c)	hid_map_usage_clear(hi, usage, bit, max, \
diff --git a/drivers/hid/hid-uclogic.c b/drivers/hid/hid-uclogic.c
new file mode 100644
index 000000000000..05fdc85a76e5
--- /dev/null
+++ b/drivers/hid/hid-uclogic.c
@@ -0,0 +1,623 @@
+/*
+ *  HID driver for UC-Logic devices not fully compliant with HID standard
+ *
+ *  Copyright (c) 2010 Nikolai Kondrashov
+ */
+
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/device.h>
+#include <linux/hid.h>
+#include <linux/module.h>
+
+#include "hid-ids.h"
+
+/*
+ * The original descriptors of WPXXXXU tablets have three report IDs, of
+ * which only two are used (8 and 9), and the remaining (7) seems to have
+ * the originally intended pen description which was abandoned for some
+ * reason.  From this unused description it is possible to extract the
+ * actual physical extents and resolution. All the models use the same
+ * descriptor with different extents for the unused report ID.
+ *
+ * Here it is:
+ *
+ *  Usage Page (Digitizer),         ; Digitizer (0Dh)
+ *  Usage (Pen),                    ; Pen (02h, application collection)
+ *  Collection (Application),
+ *    Report ID (7),
+ *    Usage (Stylus),               ; Stylus (20h, logical collection)
+ *    Collection (Physical),
+ *      Usage (Tip Switch),         ; Tip switch (42h, momentary control)
+ *      Usage (Barrel Switch),      ; Barrel switch (44h, momentary control)
+ *      Usage (Eraser),             ; Eraser (45h, momentary control)
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Size (1),
+ *      Report Count (3),
+ *      Input (Variable),
+ *      Report Count (3),
+ *      Input (Constant, Variable),
+ *      Usage (In Range),           ; In range (32h, momentary control)
+ *      Report Count (1),
+ *      Input (Variable),
+ *      Report Count (1),
+ *      Input (Constant, Variable),
+ *      Usage Page (Desktop),       ; Generic desktop controls (01h)
+ *      Usage (X),                  ; X (30h, dynamic value)
+ *      Report Size (16),
+ *      Report Count (1),
+ *      Push,
+ *      Unit Exponent (13),
+ *      Unit (Inch^3),
+ *      Physical Minimum (0),
+ *      Physical Maximum (Xpm),
+ *      Logical Maximum (Xlm),
+ *      Input (Variable),
+ *      Usage (Y),                  ; Y (31h, dynamic value)
+ *      Physical Maximum (Ypm),
+ *      Logical Maximum (Ylm),
+ *      Input (Variable),
+ *      Pop,
+ *      Usage Page (Digitizer),     ; Digitizer (0Dh)
+ *      Usage (Tip Pressure),       ; Tip pressure (30h, dynamic value)
+ *      Logical Maximum (1023),
+ *      Input (Variable),
+ *      Report Size (16),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Desktop),           ; Generic desktop controls (01h)
+ *  Usage (Mouse),                  ; Mouse (02h, application collection)
+ *  Collection (Application),
+ *    Report ID (8),
+ *    Usage (Pointer),              ; Pointer (01h, physical collection)
+ *    Collection (Physical),
+ *      Usage Page (Button),        ; Button (09h)
+ *      Usage Minimum (01h),
+ *      Usage Maximum (03h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Count (3),
+ *      Report Size (1),
+ *      Input (Variable),
+ *      Report Count (5),
+ *      Input (Constant),
+ *      Usage Page (Desktop),       ; Generic desktop controls (01h)
+ *      Usage (X),                  ; X (30h, dynamic value)
+ *      Usage (Y),                  ; Y (31h, dynamic value)
+ *      Usage (Wheel),              ; Wheel (38h, dynamic value)
+ *      Usage (00h),
+ *      Logical Minimum (-127),
+ *      Logical Maximum (127),
+ *      Report Size (8),
+ *      Report Count (4),
+ *      Input (Variable, Relative),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Desktop),           ; Generic desktop controls (01h)
+ *  Usage (Mouse),                  ; Mouse (02h, application collection)
+ *  Collection (Application),
+ *    Report ID (9),
+ *    Usage (Pointer),              ; Pointer (01h, physical collection)
+ *    Collection (Physical),
+ *      Usage Page (Button),        ; Button (09h)
+ *      Usage Minimum (01h),
+ *      Usage Maximum (03h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Count (3),
+ *      Report Size (1),
+ *      Input (Variable),
+ *      Report Count (5),
+ *      Input (Constant),
+ *      Usage Page (Desktop),       ; Generic desktop controls (01h)
+ *      Usage (X),                  ; X (30h, dynamic value)
+ *      Usage (Y),                  ; Y (31h, dynamic value)
+ *      Logical Minimum (0),
+ *      Logical Maximum (32767),
+ *      Physical Minimum (0),
+ *      Physical Maximum (32767),
+ *      Report Count (2),
+ *      Report Size (16),
+ *      Input (Variable),
+ *      Usage Page (Digitizer),     ; Digitizer (0Dh)
+ *      Usage (Tip Pressure),       ; Tip pressure (30h, dynamic value)
+ *      Logical Maximum (1023),
+ *      Report Count (1),
+ *      Report Size (16),
+ *      Input (Variable),
+ *    End Collection,
+ *  End Collection
+ *
+ * Here are the extents values for the WPXXXXU models:
+ *
+ *              Xpm     Xlm     Ypm     Ylm
+ *  WP4030U     4000    8000    3000    6000
+ *  WP5540U     5500    11000   4000    8000
+ *  WP8060U     8000    16000   6000    12000
+ *
+ * This suggests that all of them have 2000 LPI resolution, as advertised.
+ */
+
+/* Size of the original descriptor of WPXXXXU tablets */
+#define WPXXXXU_RDESC_ORIG_SIZE	212
+
+/*
+ * Fixed WP4030U report descriptor.
+ * Although the hardware might actually support it, the mouse description
+ * has been removed, since there seems to be no devices having one and it
+ * wouldn't make much sense because of the working area size.
+ */
+static __u8 wp4030u_rdesc_fixed[] = {
+	0x05, 0x0D,         /*  Usage Page (Digitizer),             */
+	0x09, 0x02,         /*  Usage (Pen),                        */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x09,         /*      Report ID (9),                  */
+	0x09, 0x20,         /*      Usage (Stylus),                 */
+	0xA0,               /*      Collection (Physical),          */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x09, 0x42,         /*          Usage (Tip Switch),         */
+	0x09, 0x44,         /*          Usage (Barrel Switch),      */
+	0x09, 0x46,         /*          Usage (Tablet Pick),        */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x05,         /*          Report Count (5),           */
+	0x81, 0x01,         /*          Input (Constant),           */
+	0x75, 0x10,         /*          Report Size (16),           */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0xA4,               /*          Push,                       */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x55, 0xFD,         /*          Unit Exponent (-3),         */
+	0x65, 0x13,         /*          Unit (Inch),                */
+	0x34,               /*          Physical Minimum (0),       */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x46, 0xA0, 0x0F,   /*          Physical Maximum (4000),    */
+	0x26, 0xFF, 0x7F,   /*          Logical Maximum (32767),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x46, 0xB8, 0x0B,   /*          Physical Maximum (3000),    */
+	0x26, 0xFF, 0x7F,   /*          Logical Maximum (32767),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xB4,               /*          Pop,                        */
+	0x09, 0x30,         /*          Usage (Tip Pressure),       */
+	0x26, 0xFF, 0x03,   /*          Logical Maximum (1023),     */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0                /*  End Collection                      */
+};
+
+/* Fixed WP5540U report descriptor */
+static __u8 wp5540u_rdesc_fixed[] = {
+	0x05, 0x0D,         /*  Usage Page (Digitizer),             */
+	0x09, 0x02,         /*  Usage (Pen),                        */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x09,         /*      Report ID (9),                  */
+	0x09, 0x20,         /*      Usage (Stylus),                 */
+	0xA0,               /*      Collection (Physical),          */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x09, 0x42,         /*          Usage (Tip Switch),         */
+	0x09, 0x44,         /*          Usage (Barrel Switch),      */
+	0x09, 0x46,         /*          Usage (Tablet Pick),        */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x05,         /*          Report Count (5),           */
+	0x81, 0x01,         /*          Input (Constant),           */
+	0x75, 0x10,         /*          Report Size (16),           */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0xA4,               /*          Push,                       */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x55, 0xFD,         /*          Unit Exponent (-3),         */
+	0x65, 0x13,         /*          Unit (Inch),                */
+	0x34,               /*          Physical Minimum (0),       */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x46, 0x7C, 0x15,   /*          Physical Maximum (5500),    */
+	0x26, 0xFF, 0x7F,   /*          Logical Maximum (32767),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x46, 0xA0, 0x0F,   /*          Physical Maximum (4000),    */
+	0x26, 0xFF, 0x7F,   /*          Logical Maximum (32767),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xB4,               /*          Pop,                        */
+	0x09, 0x30,         /*          Usage (Tip Pressure),       */
+	0x26, 0xFF, 0x03,   /*          Logical Maximum (1023),     */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0,               /*  End Collection,                     */
+	0x05, 0x01,         /*  Usage Page (Desktop),               */
+	0x09, 0x02,         /*  Usage (Mouse),                      */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x08,         /*      Report ID (8),                  */
+	0x09, 0x01,         /*      Usage (Pointer),                */
+	0xA0,               /*      Collection (Physical),          */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x05, 0x09,         /*          Usage Page (Button),        */
+	0x19, 0x01,         /*          Usage Minimum (01h),        */
+	0x29, 0x03,         /*          Usage Maximum (03h),        */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x05,         /*          Report Count (5),           */
+	0x81, 0x01,         /*          Input (Constant),           */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x75, 0x08,         /*          Report Size (8),            */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x15, 0x81,         /*          Logical Minimum (-127),     */
+	0x25, 0x7F,         /*          Logical Maximum (127),      */
+	0x95, 0x02,         /*          Report Count (2),           */
+	0x81, 0x06,         /*          Input (Variable, Relative), */
+	0x09, 0x38,         /*          Usage (Wheel),              */
+	0x15, 0xFF,         /*          Logical Minimum (-1),       */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x81, 0x06,         /*          Input (Variable, Relative), */
+	0x81, 0x01,         /*          Input (Constant),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0                /*  End Collection                      */
+};
+
+/* Fixed WP8060U report descriptor */
+static __u8 wp8060u_rdesc_fixed[] = {
+	0x05, 0x0D,         /*  Usage Page (Digitizer),             */
+	0x09, 0x02,         /*  Usage (Pen),                        */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x09,         /*      Report ID (9),                  */
+	0x09, 0x20,         /*      Usage (Stylus),                 */
+	0xA0,               /*      Collection (Physical),          */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x09, 0x42,         /*          Usage (Tip Switch),         */
+	0x09, 0x44,         /*          Usage (Barrel Switch),      */
+	0x09, 0x46,         /*          Usage (Tablet Pick),        */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x05,         /*          Report Count (5),           */
+	0x81, 0x01,         /*          Input (Constant),           */
+	0x75, 0x10,         /*          Report Size (16),           */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0xA4,               /*          Push,                       */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x55, 0xFD,         /*          Unit Exponent (-3),         */
+	0x65, 0x13,         /*          Unit (Inch),                */
+	0x34,               /*          Physical Minimum (0),       */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x46, 0x40, 0x1F,   /*          Physical Maximum (8000),    */
+	0x26, 0xFF, 0x7F,   /*          Logical Maximum (32767),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x46, 0x70, 0x17,   /*          Physical Maximum (6000),    */
+	0x26, 0xFF, 0x7F,   /*          Logical Maximum (32767),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xB4,               /*          Pop,                        */
+	0x09, 0x30,         /*          Usage (Tip Pressure),       */
+	0x26, 0xFF, 0x03,   /*          Logical Maximum (1023),     */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0,               /*  End Collection,                     */
+	0x05, 0x01,         /*  Usage Page (Desktop),               */
+	0x09, 0x02,         /*  Usage (Mouse),                      */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x08,         /*      Report ID (8),                  */
+	0x09, 0x01,         /*      Usage (Pointer),                */
+	0xA0,               /*      Collection (Physical),          */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x05, 0x09,         /*          Usage Page (Button),        */
+	0x19, 0x01,         /*          Usage Minimum (01h),        */
+	0x29, 0x03,         /*          Usage Maximum (03h),        */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x05,         /*          Report Count (5),           */
+	0x81, 0x01,         /*          Input (Constant),           */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x75, 0x08,         /*          Report Size (8),            */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x15, 0x81,         /*          Logical Minimum (-127),     */
+	0x25, 0x7F,         /*          Logical Maximum (127),      */
+	0x95, 0x02,         /*          Report Count (2),           */
+	0x81, 0x06,         /*          Input (Variable, Relative), */
+	0x09, 0x38,         /*          Usage (Wheel),              */
+	0x15, 0xFF,         /*          Logical Minimum (-1),       */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x81, 0x06,         /*          Input (Variable, Relative), */
+	0x81, 0x01,         /*          Input (Constant),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0                /*  End Collection                      */
+};
+
+/*
+ * Original PF1209 report descriptor.
+ *
+ * The descriptor is similar to WPXXXXU descriptors, with an addition of a
+ * feature report (ID 4) of unknown purpose.
+ *
+ * Although the advertised resolution is 4000 LPI the unused report ID
+ * (taken from WPXXXXU, it seems) states 2000 LPI, but it is probably
+ * incorrect and is a result of blind copying without understanding. Anyway
+ * the real logical extents are always scaled to 0..32767, which IMHO spoils
+ * the precision.
+ *
+ *  Usage Page (Digitizer),         ; Digitizer (0Dh)
+ *  Usage (Pen),                    ; Pen (02h, application collection)
+ *  Collection (Application),
+ *    Report ID (7),
+ *    Usage (Stylus),               ; Stylus (20h, logical collection)
+ *    Collection (Physical),
+ *      Usage (Tip Switch),         ; Tip switch (42h, momentary control)
+ *      Usage (Barrel Switch),      ; Barrel switch (44h, momentary control)
+ *      Usage (Eraser),             ; Eraser (45h, momentary control)
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Size (1),
+ *      Report Count (3),
+ *      Input (Variable),
+ *      Report Count (3),
+ *      Input (Constant, Variable),
+ *      Usage (In Range),           ; In range (32h, momentary control)
+ *      Report Count (1),
+ *      Input (Variable),
+ *      Report Count (1),
+ *      Input (Constant, Variable),
+ *      Usage Page (Desktop),       ; Generic desktop controls (01h)
+ *      Usage (X),                  ; X (30h, dynamic value)
+ *      Report Size (16),
+ *      Report Count (1),
+ *      Push,
+ *      Unit Exponent (13),
+ *      Unit (Inch^3),
+ *      Physical Minimum (0),
+ *      Physical Maximum (12000),
+ *      Logical Maximum (24000),
+ *      Input (Variable),
+ *      Usage (Y),                  ; Y (31h, dynamic value)
+ *      Physical Maximum (9000),
+ *      Logical Maximum (18000),
+ *      Input (Variable),
+ *      Pop,
+ *      Usage Page (Digitizer),     ; Digitizer (0Dh)
+ *      Usage (Tip Pressure),       ; Tip pressure (30h, dynamic value)
+ *      Logical Maximum (1023),
+ *      Input (Variable),
+ *      Report Size (16),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Desktop),           ; Generic desktop controls (01h)
+ *  Usage (Mouse),                  ; Mouse (02h, application collection)
+ *  Collection (Application),
+ *    Report ID (8),
+ *    Usage (Pointer),              ; Pointer (01h, physical collection)
+ *    Collection (Physical),
+ *      Usage Page (Button),        ; Button (09h)
+ *      Usage Minimum (01h),
+ *      Usage Maximum (03h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Count (3),
+ *      Report Size (1),
+ *      Input (Variable),
+ *      Report Count (5),
+ *      Input (Constant),
+ *      Usage Page (Desktop),       ; Generic desktop controls (01h)
+ *      Usage (X),                  ; X (30h, dynamic value)
+ *      Usage (Y),                  ; Y (31h, dynamic value)
+ *      Usage (Wheel),              ; Wheel (38h, dynamic value)
+ *      Usage (00h),
+ *      Logical Minimum (-127),
+ *      Logical Maximum (127),
+ *      Report Size (8),
+ *      Report Count (4),
+ *      Input (Variable, Relative),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Desktop),           ; Generic desktop controls (01h)
+ *  Usage (Mouse),                  ; Mouse (02h, application collection)
+ *  Collection (Application),
+ *    Report ID (9),
+ *    Usage (Pointer),              ; Pointer (01h, physical collection)
+ *    Collection (Physical),
+ *      Usage Page (Button),        ; Button (09h)
+ *      Usage Minimum (01h),
+ *      Usage Maximum (03h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Count (3),
+ *      Report Size (1),
+ *      Input (Variable),
+ *      Report Count (5),
+ *      Input (Constant),
+ *      Usage Page (Desktop),       ; Generic desktop controls (01h)
+ *      Usage (X),                  ; X (30h, dynamic value)
+ *      Usage (Y),                  ; Y (31h, dynamic value)
+ *      Logical Minimum (0),
+ *      Logical Maximum (32767),
+ *      Physical Minimum (0),
+ *      Physical Maximum (32767),
+ *      Report Count (2),
+ *      Report Size (16),
+ *      Input (Variable),
+ *      Usage Page (Digitizer),     ; Digitizer (0Dh)
+ *      Usage (Tip Pressure),       ; Tip pressure (30h, dynamic value)
+ *      Logical Maximum (1023),
+ *      Report Count (1),
+ *      Report Size (16),
+ *      Input (Variable),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Desktop),           ; Generic desktop controls (01h)
+ *  Usage (00h),
+ *  Collection (Application),
+ *    Report ID (4),
+ *    Logical Minimum (0),
+ *    Logical Maximum (255),
+ *    Usage (00h),
+ *    Report Size (8),
+ *    Report Count (3),
+ *    Feature (Variable),
+ *  End Collection
+ */
+
+/* Size of the original descriptor of PF1209 tablet */
+#define PF1209_RDESC_ORIG_SIZE	234
+
+/*
+ * Fixed PF1209 report descriptor
+ *
+ * The descriptor is fixed similarly to WP5540U and WP8060U, plus the
+ * feature report is removed, because its purpose is unknown and it is of no
+ * use to the generic HID driver anyway for now.
+ */
+static __u8 pf1209_rdesc_fixed[] = {
+	0x05, 0x0D,         /*  Usage Page (Digitizer),             */
+	0x09, 0x02,         /*  Usage (Pen),                        */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x09,         /*      Report ID (9),                  */
+	0x09, 0x20,         /*      Usage (Stylus),                 */
+	0xA0,               /*      Collection (Physical),          */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x09, 0x42,         /*          Usage (Tip Switch),         */
+	0x09, 0x44,         /*          Usage (Barrel Switch),      */
+	0x09, 0x46,         /*          Usage (Tablet Pick),        */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x05,         /*          Report Count (5),           */
+	0x81, 0x01,         /*          Input (Constant),           */
+	0x75, 0x10,         /*          Report Size (16),           */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0xA4,               /*          Push,                       */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x55, 0xFD,         /*          Unit Exponent (-3),         */
+	0x65, 0x13,         /*          Unit (Inch),                */
+	0x34,               /*          Physical Minimum (0),       */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x46, 0xE0, 0x2E,   /*          Physical Maximum (12000),   */
+	0x26, 0xFF, 0x7F,   /*          Logical Maximum (32767),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x46, 0x28, 0x23,   /*          Physical Maximum (9000),    */
+	0x26, 0xFF, 0x7F,   /*          Logical Maximum (32767),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xB4,               /*          Pop,                        */
+	0x09, 0x30,         /*          Usage (Tip Pressure),       */
+	0x26, 0xFF, 0x03,   /*          Logical Maximum (1023),     */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0,               /*  End Collection,                     */
+	0x05, 0x01,         /*  Usage Page (Desktop),               */
+	0x09, 0x02,         /*  Usage (Mouse),                      */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x08,         /*      Report ID (8),                  */
+	0x09, 0x01,         /*      Usage (Pointer),                */
+	0xA0,               /*      Collection (Physical),          */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x05, 0x09,         /*          Usage Page (Button),        */
+	0x19, 0x01,         /*          Usage Minimum (01h),        */
+	0x29, 0x03,         /*          Usage Maximum (03h),        */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x05,         /*          Report Count (5),           */
+	0x81, 0x01,         /*          Input (Constant),           */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x75, 0x08,         /*          Report Size (8),            */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x15, 0x81,         /*          Logical Minimum (-127),     */
+	0x25, 0x7F,         /*          Logical Maximum (127),      */
+	0x95, 0x02,         /*          Report Count (2),           */
+	0x81, 0x06,         /*          Input (Variable, Relative), */
+	0x09, 0x38,         /*          Usage (Wheel),              */
+	0x15, 0xFF,         /*          Logical Minimum (-1),       */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x81, 0x06,         /*          Input (Variable, Relative), */
+	0x81, 0x01,         /*          Input (Constant),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0                /*  End Collection                      */
+};
+
+static __u8 *uclogic_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+					unsigned int *rsize)
+{
+	switch (hdev->product) {
+	case USB_DEVICE_ID_UCLOGIC_TABLET_PF1209:
+		if (*rsize == PF1209_RDESC_ORIG_SIZE) {
+			rdesc = pf1209_rdesc_fixed;
+			*rsize = sizeof(pf1209_rdesc_fixed);
+		}
+		break;
+	case USB_DEVICE_ID_UCLOGIC_TABLET_WP4030U:
+		if (*rsize == WPXXXXU_RDESC_ORIG_SIZE) {
+			rdesc = wp4030u_rdesc_fixed;
+			*rsize = sizeof(wp4030u_rdesc_fixed);
+		}
+		break;
+	case USB_DEVICE_ID_UCLOGIC_TABLET_WP5540U:
+		if (*rsize == WPXXXXU_RDESC_ORIG_SIZE) {
+			rdesc = wp5540u_rdesc_fixed;
+			*rsize = sizeof(wp5540u_rdesc_fixed);
+		}
+		break;
+	case USB_DEVICE_ID_UCLOGIC_TABLET_WP8060U:
+		if (*rsize == WPXXXXU_RDESC_ORIG_SIZE) {
+			rdesc = wp8060u_rdesc_fixed;
+			*rsize = sizeof(wp8060u_rdesc_fixed);
+		}
+		break;
+	}
+
+	return rdesc;
+}
+
+static const struct hid_device_id uclogic_devices[] = {
+	{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC,
+				USB_DEVICE_ID_UCLOGIC_TABLET_PF1209) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC,
+				USB_DEVICE_ID_UCLOGIC_TABLET_WP4030U) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC,
+				USB_DEVICE_ID_UCLOGIC_TABLET_WP5540U) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC,
+				USB_DEVICE_ID_UCLOGIC_TABLET_WP8060U) },
+	{ }
+};
+MODULE_DEVICE_TABLE(hid, uclogic_devices);
+
+static struct hid_driver uclogic_driver = {
+	.name = "uclogic",
+	.id_table = uclogic_devices,
+	.report_fixup = uclogic_report_fixup,
+};
+
+static int __init uclogic_init(void)
+{
+	return hid_register_driver(&uclogic_driver);
+}
+
+static void __exit uclogic_exit(void)
+{
+	hid_unregister_driver(&uclogic_driver);
+}
+
+module_init(uclogic_init);
+module_exit(uclogic_exit);
+MODULE_LICENSE("GPL");
diff --git a/drivers/hid/hid-waltop.c b/drivers/hid/hid-waltop.c
new file mode 100644
index 000000000000..b3a4163f2e67
--- /dev/null
+++ b/drivers/hid/hid-waltop.c
@@ -0,0 +1,1099 @@
+/*
+ *  HID driver for Waltop devices not fully compliant with HID standard
+ *
+ *  Copyright (c) 2010 Nikolai Kondrashov
+ */
+
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/device.h>
+#include <linux/hid.h>
+#include <linux/module.h>
+
+#include "hid-ids.h"
+
+/*
+ * There exists an official driver on the manufacturer's website, which
+ * wasn't submitted to the kernel, for some reason. The official driver
+ * doesn't seem to support extra features of some tablets, like wheels.
+ *
+ * It shows that the feature report ID 2 could be used to control any waltop
+ * tablet input mode, switching it between "default", "tablet" and "ink".
+ *
+ * This driver only uses "default" mode for all the supported tablets. This
+ * mode tries to be HID-compatible (not very successfully), but cripples the
+ * resolution of some tablets.
+ *
+ * The "tablet" mode uses some proprietary, yet decipherable protocol, which
+ * represents the correct resolution, but is possibly HID-incompatible (i.e.
+ * indescribable by a report descriptor).
+ *
+ * The purpose of the "ink" mode is unknown.
+ *
+ * The feature reports needed for switching to each mode are these:
+ *
+ * 02 16 00     default
+ * 02 16 01     tablet
+ * 02 16 02     ink
+ */
+
+/*
+ * Original Slim Tablet 5.8 inch report descriptor.
+ *
+ * All the reports except the report with ID 16 (the stylus) are unused,
+ * possibly because the tablet is not configured to, or because they were
+ * just copied from a more capable model. The full purpose of features
+ * described for report ID 2 is unknown.
+ *
+ * The stylus buttons are described as three bit fields, whereas actually
+ * it's an "array", i.e. they're reported as button numbers (1, 2 and 3).
+ * The "eraser" field is not used. There is also a "push" without a "pop" in
+ * the stylus description.
+ *
+ *  Usage Page (Desktop),           ; Generic desktop controls (01h)
+ *  Usage (Mouse),                  ; Mouse (02h, application collection)
+ *  Collection (Application),
+ *    Report ID (1),
+ *    Usage (Pointer),              ; Pointer (01h, physical collection)
+ *    Collection (Physical),
+ *      Usage Page (Button),        ; Button (09h)
+ *      Usage Minimum (01h),
+ *      Usage Maximum (05h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Size (1),
+ *      Report Count (5),
+ *      Input (Variable),
+ *      Report Size (3),
+ *      Report Count (1),
+ *      Input (Constant, Variable),
+ *      Usage Page (Desktop),       ; Generic desktop controls (01h)
+ *      Usage (X),                  ; X (30h, dynamic value)
+ *      Usage (Y),                  ; Y (31h, dynamic value)
+ *      Usage (Wheel),              ; Wheel (38h, dynamic value)
+ *      Logical Minimum (-127),
+ *      Logical Maximum (127),
+ *      Report Size (8),
+ *      Report Count (3),
+ *      Input (Variable, Relative),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Digitizer),         ; Digitizer (0Dh)
+ *  Usage (Pen),                    ; Pen (02h, application collection)
+ *  Collection (Application),
+ *    Report ID (2),
+ *    Usage (Stylus),               ; Stylus (20h, logical collection)
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (7),
+ *      Input (Variable),
+ *      Usage (Azimuth),            ; Azimuth (3Fh, dynamic value)
+ *      Usage (Altitude),           ; Altitude (40h, dynamic value)
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (2),
+ *      Feature (Variable),
+ *    End Collection,
+ *    Report ID (5),
+ *    Usage Page (Digitizer),       ; Digitizer (0Dh)
+ *    Usage (Stylus),               ; Stylus (20h, logical collection)
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (7),
+ *      Input (Variable),
+ *    End Collection,
+ *    Report ID (10),
+ *    Usage Page (Digitizer),       ; Digitizer (0Dh)
+ *    Usage (Stylus),               ; Stylus (20h, logical collection)
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (3),
+ *      Input (Variable),
+ *    End Collection,
+ *    Report ID (16),
+ *    Usage (Stylus),               ; Stylus (20h, logical collection)
+ *    Collection (Physical),
+ *      Usage (Tip Switch),         ; Tip switch (42h, momentary control)
+ *      Usage (Barrel Switch),      ; Barrel switch (44h, momentary control)
+ *      Usage (Invert),             ; Invert (3Ch, momentary control)
+ *      Usage (Eraser),             ; Eraser (45h, momentary control)
+ *      Usage (In Range),           ; In range (32h, momentary control)
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Size (1),
+ *      Report Count (5),
+ *      Input (Variable),
+ *      Report Count (3),
+ *      Input (Constant, Variable),
+ *      Usage Page (Desktop),       ; Generic desktop controls (01h)
+ *      Usage (X),                  ; X (30h, dynamic value)
+ *      Report Size (16),
+ *      Report Count (1),
+ *      Push,
+ *      Unit Exponent (13),
+ *      Unit (Inch^3),
+ *      Logical Minimum (0),
+ *      Logical Maximum (10000),
+ *      Physical Minimum (0),
+ *      Physical Maximum (10000),
+ *      Input (Variable),
+ *      Usage (Y),                  ; Y (31h, dynamic value)
+ *      Logical Maximum (6000),
+ *      Physical Maximum (6000),
+ *      Input (Variable),
+ *      Usage Page (Digitizer),     ; Digitizer (0Dh)
+ *      Usage (Tip Pressure),       ; Tip pressure (30h, dynamic value)
+ *      Logical Minimum (0),
+ *      Logical Maximum (1023),
+ *      Physical Minimum (0),
+ *      Physical Maximum (1023),
+ *      Input (Variable),
+ *    End Collection,
+ *  End Collection
+ */
+
+/* Size of the original report descriptor of Slim Tablet 5.8 inch */
+#define SLIM_TABLET_5_8_INCH_RDESC_ORIG_SIZE	222
+
+/*
+ * Fixed Slim Tablet 5.8 inch descriptor.
+ *
+ * All the reports except the stylus report (ID 16) were removed as unused.
+ * The stylus buttons description was fixed.
+ */
+static __u8 slim_tablet_5_8_inch_rdesc_fixed[] = {
+	0x05, 0x0D,         /*  Usage Page (Digitizer),             */
+	0x09, 0x02,         /*  Usage (Pen),                        */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x10,         /*      Report ID (16),                 */
+	0x09, 0x20,         /*      Usage (Stylus),                 */
+	0xA0,               /*      Collection (Physical),          */
+	0x09, 0x42,         /*          Usage (Tip Switch),         */
+	0x09, 0x44,         /*          Usage (Barrel Switch),      */
+	0x09, 0x46,         /*          Usage (Tablet Pick),        */
+	0x15, 0x01,         /*          Logical Minimum (1),        */
+	0x25, 0x03,         /*          Logical Maximum (3),        */
+	0x75, 0x04,         /*          Report Size (4),            */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x80,               /*          Input,                      */
+	0x09, 0x32,         /*          Usage (In Range),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x03,         /*          Input (Constant, Variable), */
+	0x75, 0x10,         /*          Report Size (16),           */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0xA4,               /*          Push,                       */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x65, 0x13,         /*          Unit (Inch),                */
+	0x55, 0xFD,         /*          Unit Exponent (-3),         */
+	0x34,               /*          Physical Minimum (0),       */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x46, 0x88, 0x13,   /*          Physical Maximum (5000),    */
+	0x26, 0x10, 0x27,   /*          Logical Maximum (10000),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x46, 0xB8, 0x0B,   /*          Physical Maximum (3000),    */
+	0x26, 0x70, 0x17,   /*          Logical Maximum (6000),     */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xB4,               /*          Pop,                        */
+	0x09, 0x30,         /*          Usage (Tip Pressure),       */
+	0x26, 0xFF, 0x03,   /*          Logical Maximum (1023),     */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0                /*  End Collection                      */
+};
+
+/*
+ * Original Slim Tablet 12.1 inch report descriptor.
+ *
+ * The descriptor is similar to the Slim Tablet 5.8 inch descriptor with the
+ * addition of a keyboard report, seemingly unused. It may have get here
+ * from a Media Tablet - probably an unimplemented feature.
+ *
+ *  Usage Page (Desktop),             ; Generic desktop controls (01h)
+ *  Usage (Mouse),                    ; Mouse (02h, application collection)
+ *  Collection (Application),
+ *    Report ID (1),
+ *    Usage (Pointer),                ; Pointer (01h, physical collection)
+ *    Collection (Physical),
+ *      Usage Page (Button),          ; Button (09h)
+ *      Usage Minimum (01h),
+ *      Usage Maximum (05h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Size (1),
+ *      Report Count (5),
+ *      Input (Variable),
+ *      Report Size (3),
+ *      Report Count (1),
+ *      Input (Constant, Variable),
+ *      Usage Page (Desktop),         ; Generic desktop controls (01h)
+ *      Usage (X),                    ; X (30h, dynamic value)
+ *      Usage (Y),                    ; Y (31h, dynamic value)
+ *      Usage (Wheel),                ; Wheel (38h, dynamic value)
+ *      Logical Minimum (-127),
+ *      Logical Maximum (127),
+ *      Report Size (8),
+ *      Report Count (3),
+ *      Input (Variable, Relative),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Digitizer),           ; Digitizer (0Dh)
+ *  Usage (Pen),                      ; Pen (02h, application collection)
+ *  Collection (Application),
+ *    Report ID (2),
+ *    Usage (Stylus),                 ; Stylus (20h, logical collection)
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (7),
+ *      Input (Variable),
+ *      Usage (Azimuth),              ; Azimuth (3Fh, dynamic value)
+ *      Usage (Altitude),             ; Altitude (40h, dynamic value)
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (2),
+ *      Feature (Variable),
+ *    End Collection,
+ *    Report ID (5),
+ *    Usage Page (Digitizer),         ; Digitizer (0Dh)
+ *    Usage (Stylus),                 ; Stylus (20h, logical collection)
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (7),
+ *      Input (Variable),
+ *    End Collection,
+ *    Report ID (10),
+ *    Usage Page (Digitizer),         ; Digitizer (0Dh)
+ *    Usage (Stylus),                 ; Stylus (20h, logical collection)
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (3),
+ *      Input (Variable),
+ *    End Collection,
+ *    Report ID (16),
+ *    Usage (Stylus),                 ; Stylus (20h, logical collection)
+ *    Collection (Physical),
+ *      Usage (Tip Switch),           ; Tip switch (42h, momentary control)
+ *      Usage (Barrel Switch),        ; Barrel switch (44h, momentary control)
+ *      Usage (Invert),               ; Invert (3Ch, momentary control)
+ *      Usage (Eraser),               ; Eraser (45h, momentary control)
+ *      Usage (In Range),             ; In range (32h, momentary control)
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Size (1),
+ *      Report Count (5),
+ *      Input (Variable),
+ *      Report Count (3),
+ *      Input (Constant, Variable),
+ *      Usage Page (Desktop),         ; Generic desktop controls (01h)
+ *      Usage (X),                    ; X (30h, dynamic value)
+ *      Report Size (16),
+ *      Report Count (1),
+ *      Push,
+ *      Unit Exponent (13),
+ *      Unit (Inch^3),
+ *      Logical Minimum (0),
+ *      Logical Maximum (20000),
+ *      Physical Minimum (0),
+ *      Physical Maximum (20000),
+ *      Input (Variable),
+ *      Usage (Y),                    ; Y (31h, dynamic value)
+ *      Logical Maximum (12500),
+ *      Physical Maximum (12500),
+ *      Input (Variable),
+ *      Usage Page (Digitizer),       ; Digitizer (0Dh)
+ *      Usage (Tip Pressure),         ; Tip pressure (30h, dynamic value)
+ *      Logical Minimum (0),
+ *      Logical Maximum (1023),
+ *      Physical Minimum (0),
+ *      Physical Maximum (1023),
+ *      Input (Variable),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Desktop),             ; Generic desktop controls (01h)
+ *  Usage (Keyboard),                 ; Keyboard (06h, application collection)
+ *  Collection (Application),
+ *    Report ID (13),
+ *    Usage Page (Keyboard),          ; Keyboard/keypad (07h)
+ *    Usage Minimum (KB Leftcontrol), ; Keyboard left control
+ *                                    ; (E0h, dynamic value)
+ *    Usage Maximum (KB Right GUI),   ; Keyboard right GUI (E7h, dynamic value)
+ *    Logical Minimum (0),
+ *    Logical Maximum (1),
+ *    Report Size (1),
+ *    Report Count (8),
+ *    Input (Variable),
+ *    Report Size (8),
+ *    Report Count (1),
+ *    Input (Constant),
+ *    Usage Page (Keyboard),          ; Keyboard/keypad (07h)
+ *    Usage Minimum (None),           ; No event (00h, selector)
+ *    Usage Maximum (KB Application), ; Keyboard Application (65h, selector)
+ *    Logical Minimum (0),
+ *    Logical Maximum (101),
+ *    Report Size (8),
+ *    Report Count (5),
+ *    Input,
+ *  End Collection
+ */
+
+/* Size of the original report descriptor of Slim Tablet 12.1 inch */
+#define SLIM_TABLET_12_1_INCH_RDESC_ORIG_SIZE	269
+
+/*
+ * Fixed Slim Tablet 12.1 inch descriptor.
+ *
+ * All the reports except the stylus report (ID 16) were removed as unused.
+ * The stylus buttons description was fixed.
+ */
+static __u8 slim_tablet_12_1_inch_rdesc_fixed[] = {
+	0x05, 0x0D,         /*  Usage Page (Digitizer),             */
+	0x09, 0x02,         /*  Usage (Pen),                        */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x10,         /*      Report ID (16),                 */
+	0x09, 0x20,         /*      Usage (Stylus),                 */
+	0xA0,               /*      Collection (Physical),          */
+	0x09, 0x42,         /*          Usage (Tip Switch),         */
+	0x09, 0x44,         /*          Usage (Barrel Switch),      */
+	0x09, 0x46,         /*          Usage (Tablet Pick),        */
+	0x15, 0x01,         /*          Logical Minimum (1),        */
+	0x25, 0x03,         /*          Logical Maximum (3),        */
+	0x75, 0x04,         /*          Report Size (4),            */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x80,               /*          Input,                      */
+	0x09, 0x32,         /*          Usage (In Range),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x03,         /*          Input (Constant, Variable), */
+	0x75, 0x10,         /*          Report Size (16),           */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0xA4,               /*          Push,                       */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x65, 0x13,         /*          Unit (Inch),                */
+	0x55, 0xFD,         /*          Unit Exponent (-3),         */
+	0x34,               /*          Physical Minimum (0),       */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x46, 0x10, 0x27,   /*          Physical Maximum (10000),   */
+	0x26, 0x20, 0x4E,   /*          Logical Maximum (20000),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x46, 0x6A, 0x18,   /*          Physical Maximum (6250),    */
+	0x26, 0xD4, 0x30,   /*          Logical Maximum (12500),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xB4,               /*          Pop,                        */
+	0x09, 0x30,         /*          Usage (Tip Pressure),       */
+	0x26, 0xFF, 0x03,   /*          Logical Maximum (1023),     */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0                /*  End Collection                      */
+};
+
+/*
+ * Original Media Tablet 10.6 inch report descriptor.
+ *
+ * There are at least two versions of this model in the wild. They are
+ * represented by Genius G-Pen M609 (older version) and Genius G-Pen M609X
+ * (newer version).
+ *
+ * Both versions have the usual pen with two barrel buttons and two
+ * identical wheels with center buttons in the top corners of the tablet
+ * base. They also have buttons on the top, between the wheels, for
+ * selecting the wheels' functions and wide/standard mode. In the wide mode
+ * the whole working surface is sensed, in the standard mode a narrower area
+ * is sensed, but the logical report extents remain the same. These modes
+ * correspond roughly to 16:9 and 4:3 aspect ratios respectively.
+ *
+ * The older version has three wheel function buttons ("scroll", "zoom" and
+ * "volume") and two separate buttons for wide and standard mode. The newer
+ * version has four wheel function buttons (plus "brush") and only one
+ * button is used for selecting wide/standard mode. So, the total number of
+ * buttons remains the same, but one of the mode buttons is repurposed as a
+ * wheels' function button in the newer version.
+ *
+ * The wheel functions are:
+ * scroll   - the wheels act as scroll wheels, the center buttons switch
+ *            between vertical and horizontal scrolling;
+ * zoom     - the wheels zoom in/out, the buttons supposedly reset to 100%;
+ * volume   - the wheels control the sound volume, the buttons mute;
+ * brush    - the wheels are supposed to control brush width in a graphics
+ *            editor, the buttons do nothing.
+ *
+ * Below is the newer version's report descriptor. It may very well be that
+ * the older version's descriptor is different and thus it won't be
+ * supported.
+ *
+ * The mouse report (ID 1) only uses the wheel field for reporting the tablet
+ * wheels' scroll mode. The keyboard report (ID 13) is used to report the
+ * wheels' zoom and brush control functions as key presses. The report ID 12
+ * is used to report the wheels' volume control functions. The stylus report
+ * (ID 16) has the same problems as the Slim Tablet 5.8 inch report has.
+ *
+ * The rest of the reports are unused, at least in the default configuration.
+ * The purpose of the features is unknown.
+ *
+ *  Usage Page (Desktop),
+ *  Usage (Mouse),
+ *  Collection (Application),
+ *    Report ID (1),
+ *    Usage (Pointer),
+ *    Collection (Physical),
+ *      Usage Page (Button),
+ *      Usage Minimum (01h),
+ *      Usage Maximum (05h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Size (1),
+ *      Report Count (5),
+ *      Input (Variable),
+ *      Report Size (3),
+ *      Report Count (1),
+ *      Input (Constant, Variable),
+ *      Usage Page (Desktop),
+ *      Usage (X),
+ *      Usage (Y),
+ *      Usage (Wheel),
+ *      Logical Minimum (-127),
+ *      Logical Maximum (127),
+ *      Report Size (8),
+ *      Report Count (3),
+ *      Input (Variable, Relative),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Digitizer),
+ *  Usage (Pen),
+ *  Collection (Application),
+ *    Report ID (2),
+ *    Usage (Stylus),
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (7),
+ *      Input (Variable),
+ *      Usage (Azimuth),
+ *      Usage (Altitude),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (2),
+ *      Feature (Variable),
+ *    End Collection,
+ *    Report ID (5),
+ *    Usage Page (Digitizer),
+ *    Usage (Stylus),
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (7),
+ *      Input (Variable),
+ *    End Collection,
+ *    Report ID (10),
+ *    Usage Page (Digitizer),
+ *    Usage (Stylus),
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (7),
+ *      Input (Variable),
+ *    End Collection,
+ *    Report ID (16),
+ *    Usage (Stylus),
+ *    Collection (Physical),
+ *      Usage (Tip Switch),
+ *      Usage (Barrel Switch),
+ *      Usage (Invert),
+ *      Usage (Eraser),
+ *      Usage (In Range),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Size (1),
+ *      Report Count (5),
+ *      Input (Variable),
+ *      Report Count (3),
+ *      Input (Constant, Variable),
+ *      Usage Page (Desktop),
+ *      Usage (X),
+ *      Report Size (16),
+ *      Report Count (1),
+ *      Push,
+ *      Unit Exponent (13),
+ *      Unit (Inch^3),
+ *      Logical Minimum (0),
+ *      Logical Maximum (18000),
+ *      Physical Minimum (0),
+ *      Physical Maximum (18000),
+ *      Input (Variable),
+ *      Usage (Y),
+ *      Logical Maximum (11000),
+ *      Physical Maximum (11000),
+ *      Input (Variable),
+ *      Usage Page (Digitizer),
+ *      Usage (Tip Pressure),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1023),
+ *      Physical Minimum (0),
+ *      Physical Maximum (1023),
+ *      Input (Variable),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Desktop),
+ *  Usage (Keyboard),
+ *  Collection (Application),
+ *    Report ID (13),
+ *    Usage Page (Keyboard),
+ *    Usage Minimum (KB Leftcontrol),
+ *    Usage Maximum (KB Right GUI),
+ *    Logical Minimum (0),
+ *    Logical Maximum (1),
+ *    Report Size (1),
+ *    Report Count (8),
+ *    Input (Variable),
+ *    Report Size (8),
+ *    Report Count (1),
+ *    Input (Constant),
+ *    Usage Page (Keyboard),
+ *    Usage Minimum (None),
+ *    Usage Maximum (KB Application),
+ *    Logical Minimum (0),
+ *    Logical Maximum (101),
+ *    Report Size (8),
+ *    Report Count (5),
+ *    Input,
+ *  End Collection,
+ *  Usage Page (Consumer),
+ *  Usage (Consumer Control),
+ *  Collection (Application),
+ *    Report ID (12),
+ *    Usage (Volume Inc),
+ *    Usage (Volume Dec),
+ *    Usage (Mute),
+ *    Logical Minimum (0),
+ *    Logical Maximum (1),
+ *    Report Size (1),
+ *    Report Count (3),
+ *    Input (Variable, Relative),
+ *    Report Size (5),
+ *    Report Count (1),
+ *    Input (Constant, Variable, Relative),
+ *  End Collection
+ */
+
+/* Size of the original report descriptor of Media Tablet 10.6 inch */
+#define MEDIA_TABLET_10_6_INCH_RDESC_ORIG_SIZE	300
+
+/*
+ * Fixed Media Tablet 10.6 inch descriptor.
+ *
+ * The descriptions of reports unused in the default configuration are
+ * removed. The stylus report (ID 16) is fixed similarly to Slim Tablet 5.8
+ * inch.  The unused mouse report (ID 1) fields are replaced with constant
+ * padding.
+ *
+ * The keyboard report (ID 13) is hacked to instead have an "array" field
+ * reporting consumer page controls, and all the unused bits are masked out
+ * with constant padding. The "brush" wheels' function is represented as "Scan
+ * Previous/Next Track" controls due to the lack of brush controls in the
+ * usage tables specification.
+ */
+static __u8 media_tablet_10_6_inch_rdesc_fixed[] = {
+	0x05, 0x0D,         /*  Usage Page (Digitizer),             */
+	0x09, 0x02,         /*  Usage (Pen),                        */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x10,         /*      Report ID (16),                 */
+	0x09, 0x20,         /*      Usage (Stylus),                 */
+	0xA0,               /*      Collection (Physical),          */
+	0x09, 0x42,         /*          Usage (Tip Switch),         */
+	0x09, 0x44,         /*          Usage (Barrel Switch),      */
+	0x09, 0x46,         /*          Usage (Tablet Pick),        */
+	0x15, 0x01,         /*          Logical Minimum (1),        */
+	0x25, 0x03,         /*          Logical Maximum (3),        */
+	0x75, 0x04,         /*          Report Size (4),            */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x80,               /*          Input,                      */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x09, 0x32,         /*          Usage (In Range),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x03,         /*          Input (Constant, Variable), */
+	0x75, 0x10,         /*          Report Size (16),           */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0xA4,               /*          Push,                       */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x65, 0x13,         /*          Unit (Inch),                */
+	0x55, 0xFD,         /*          Unit Exponent (-3),         */
+	0x34,               /*          Physical Minimum (0),       */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x46, 0x28, 0x23,   /*          Physical Maximum (9000),    */
+	0x26, 0x50, 0x46,   /*          Logical Maximum (18000),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x46, 0x7C, 0x15,   /*          Physical Maximum (5500),    */
+	0x26, 0xF8, 0x2A,   /*          Logical Maximum (11000),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xB4,               /*          Pop,                        */
+	0x09, 0x30,         /*          Usage (Tip Pressure),       */
+	0x26, 0xFF, 0x03,   /*          Logical Maximum (1023),     */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0,               /*  End Collection,                     */
+	0x05, 0x01,         /*  Usage Page (Desktop),               */
+	0x09, 0x02,         /*  Usage (Mouse),                      */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x01,         /*      Report ID (1),                  */
+	0x09, 0x01,         /*      Usage (Pointer),                */
+	0xA0,               /*      Collection (Physical),          */
+	0x75, 0x08,         /*          Report Size (8),            */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x03,         /*          Input (Constant, Variable), */
+	0x95, 0x02,         /*          Report Count (2),           */
+	0x15, 0xFF,         /*          Logical Minimum (-1),       */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x09, 0x38,         /*          Usage (Wheel),              */
+	0x0B, 0x38, 0x02,   /*          Usage (Consumer AC Pan),    */
+		0x0C, 0x00,
+	0x81, 0x06,         /*          Input (Variable, Relative), */
+	0x95, 0x02,         /*          Report Count (2),           */
+	0x81, 0x03,         /*          Input (Constant, Variable), */
+	0xC0,               /*      End Collection,                 */
+	0xC0,               /*  End Collection,                     */
+	0x05, 0x0C,         /*  Usage Page (Consumer),              */
+	0x09, 0x01,         /*  Usage (Consumer Control),           */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x0D,         /*      Report ID (13),                 */
+	0x95, 0x01,         /*      Report Count (1),               */
+	0x75, 0x10,         /*      Report Size (16),               */
+	0x81, 0x03,         /*      Input (Constant, Variable),     */
+	0x0A, 0x2F, 0x02,   /*      Usage (AC Zoom),                */
+	0x0A, 0x2E, 0x02,   /*      Usage (AC Zoom Out),            */
+	0x0A, 0x2D, 0x02,   /*      Usage (AC Zoom In),             */
+	0x09, 0xB6,         /*      Usage (Scan Previous Track),    */
+	0x09, 0xB5,         /*      Usage (Scan Next Track),        */
+	0x08,               /*      Usage (00h),                    */
+	0x08,               /*      Usage (00h),                    */
+	0x08,               /*      Usage (00h),                    */
+	0x08,               /*      Usage (00h),                    */
+	0x08,               /*      Usage (00h),                    */
+	0x0A, 0x2E, 0x02,   /*      Usage (AC Zoom Out),            */
+	0x0A, 0x2D, 0x02,   /*      Usage (AC Zoom In),             */
+	0x15, 0x0C,         /*      Logical Minimum (12),           */
+	0x25, 0x17,         /*      Logical Maximum (23),           */
+	0x75, 0x05,         /*      Report Size (5),                */
+	0x80,               /*      Input,                          */
+	0x75, 0x03,         /*      Report Size (3),                */
+	0x81, 0x03,         /*      Input (Constant, Variable),     */
+	0x75, 0x20,         /*      Report Size (32),               */
+	0x81, 0x03,         /*      Input (Constant, Variable),     */
+	0xC0,               /*  End Collection,                     */
+	0x09, 0x01,         /*  Usage (Consumer Control),           */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x0C,         /*      Report ID (12),                 */
+	0x75, 0x01,         /*      Report Size (1),                */
+	0x09, 0xE9,         /*      Usage (Volume Inc),             */
+	0x09, 0xEA,         /*      Usage (Volume Dec),             */
+	0x09, 0xE2,         /*      Usage (Mute),                   */
+	0x14,               /*      Logical Minimum (0),            */
+	0x25, 0x01,         /*      Logical Maximum (1),            */
+	0x95, 0x03,         /*      Report Count (3),               */
+	0x81, 0x06,         /*      Input (Variable, Relative),     */
+	0x95, 0x35,         /*      Report Count (53),              */
+	0x81, 0x03,         /*      Input (Constant, Variable),     */
+	0xC0                /*  End Collection                      */
+};
+
+/*
+ * Original Media Tablet 14.1 inch report descriptor.
+ *
+ * There are at least two versions of this model in the wild. They are
+ * represented by Genius G-Pen M712 (older version) and Genius G-Pen M712X
+ * (newer version). The hardware difference between these versions is the same
+ * as between older and newer versions of Media Tablet 10.6 inch. The report
+ * descriptors are identical for both versions.
+ *
+ * The function, behavior and report descriptor of this tablet is similar to
+ * that of Media Tablet 10.6 inch. However, there is one more field (with
+ * Consumer AC Pan usage) in the mouse description. Then the tablet X and Y
+ * logical extents both get scaled to 0..16383 range (a hardware limit?),
+ * which kind of defeats the advertised 4000 LPI resolution, considering the
+ * physical extents of 12x7.25 inches. Plus, reports 5, 10 and 255 are used
+ * sometimes (while moving the pen) with unknown purpose. Also, the key codes
+ * generated for zoom in/out are different.
+ *
+ *  Usage Page (Desktop),
+ *  Usage (Mouse),
+ *  Collection (Application),
+ *    Report ID (1),
+ *    Usage (Pointer),
+ *    Collection (Physical),
+ *      Usage Page (Button),
+ *      Usage Minimum (01h),
+ *      Usage Maximum (05h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Size (1),
+ *      Report Count (5),
+ *      Input (Variable),
+ *      Report Size (3),
+ *      Report Count (1),
+ *      Input (Constant, Variable),
+ *      Usage Page (Desktop),
+ *      Usage (X),
+ *      Usage (Y),
+ *      Usage (Wheel),
+ *      Logical Minimum (-127),
+ *      Logical Maximum (127),
+ *      Report Size (8),
+ *      Report Count (3),
+ *      Input (Variable, Relative),
+ *      Usage Page (Consumer),
+ *      Logical Minimum (-127),
+ *      Logical Maximum (127),
+ *      Report Size (8),
+ *      Report Count (1),
+ *      Usage (AC Pan),
+ *      Input (Variable, Relative),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Digitizer),
+ *  Usage (Pen),
+ *  Collection (Application),
+ *    Report ID (2),
+ *    Usage (Stylus),
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (7),
+ *      Input (Variable),
+ *      Usage (Azimuth),
+ *      Usage (Altitude),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (2),
+ *      Feature (Variable),
+ *    End Collection,
+ *    Report ID (5),
+ *    Usage Page (Digitizer),
+ *    Usage (Stylus),
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (7),
+ *      Input (Variable),
+ *    End Collection,
+ *    Report ID (10),
+ *    Usage Page (Digitizer),
+ *    Usage (Stylus),
+ *    Collection (Physical),
+ *      Usage (00h),
+ *      Logical Minimum (0),
+ *      Logical Maximum (255),
+ *      Report Size (8),
+ *      Report Count (7),
+ *      Input (Variable),
+ *    End Collection,
+ *    Report ID (16),
+ *    Usage (Stylus),
+ *    Collection (Physical),
+ *      Usage (Tip Switch),
+ *      Usage (Barrel Switch),
+ *      Usage (Invert),
+ *      Usage (Eraser),
+ *      Usage (In Range),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1),
+ *      Report Size (1),
+ *      Report Count (5),
+ *      Input (Variable),
+ *      Report Count (3),
+ *      Input (Constant, Variable),
+ *      Usage Page (Desktop),
+ *      Usage (X),
+ *      Report Size (16),
+ *      Report Count (1),
+ *      Push,
+ *      Unit Exponent (13),
+ *      Unit (Inch^3),
+ *      Logical Minimum (0),
+ *      Logical Maximum (16383),
+ *      Physical Minimum (0),
+ *      Physical Maximum (16383),
+ *      Input (Variable),
+ *      Usage (Y),
+ *      Input (Variable),
+ *      Usage Page (Digitizer),
+ *      Usage (Tip Pressure),
+ *      Logical Minimum (0),
+ *      Logical Maximum (1023),
+ *      Physical Minimum (0),
+ *      Physical Maximum (1023),
+ *      Input (Variable),
+ *    End Collection,
+ *  End Collection,
+ *  Usage Page (Desktop),
+ *  Usage (Keyboard),
+ *  Collection (Application),
+ *    Report ID (13),
+ *    Usage Page (Keyboard),
+ *    Usage Minimum (KB Leftcontrol),
+ *    Usage Maximum (KB Right GUI),
+ *    Logical Minimum (0),
+ *    Logical Maximum (1),
+ *    Report Size (1),
+ *    Report Count (8),
+ *    Input (Variable),
+ *    Report Size (8),
+ *    Report Count (1),
+ *    Input (Constant),
+ *    Usage Page (Keyboard),
+ *    Usage Minimum (None),
+ *    Usage Maximum (KB Application),
+ *    Logical Minimum (0),
+ *    Logical Maximum (101),
+ *    Report Size (8),
+ *    Report Count (5),
+ *    Input,
+ *  End Collection,
+ *  Usage Page (Consumer),
+ *  Usage (Consumer Control),
+ *  Collection (Application),
+ *    Report ID (12),
+ *    Usage (Volume Inc),
+ *    Usage (Volume Dec),
+ *    Usage (Mute),
+ *    Logical Minimum (0),
+ *    Logical Maximum (1),
+ *    Report Size (1),
+ *    Report Count (3),
+ *    Input (Variable, Relative),
+ *    Report Size (5),
+ *    Report Count (1),
+ *    Input (Constant, Variable, Relative),
+ *  End Collection
+ */
+
+/* Size of the original report descriptor of Media Tablet 14.1 inch */
+#define MEDIA_TABLET_14_1_INCH_RDESC_ORIG_SIZE	309
+
+/*
+ * Fixed Media Tablet 14.1 inch descriptor.
+ * It is fixed similarly to the Media Tablet 10.6 inch descriptor.
+ */
+static __u8 media_tablet_14_1_inch_rdesc_fixed[] = {
+	0x05, 0x0D,         /*  Usage Page (Digitizer),             */
+	0x09, 0x02,         /*  Usage (Pen),                        */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x10,         /*      Report ID (16),                 */
+	0x09, 0x20,         /*      Usage (Stylus),                 */
+	0xA0,               /*      Collection (Physical),          */
+	0x09, 0x42,         /*          Usage (Tip Switch),         */
+	0x09, 0x44,         /*          Usage (Barrel Switch),      */
+	0x09, 0x46,         /*          Usage (Tablet Pick),        */
+	0x15, 0x01,         /*          Logical Minimum (1),        */
+	0x25, 0x03,         /*          Logical Maximum (3),        */
+	0x75, 0x04,         /*          Report Size (4),            */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x80,               /*          Input,                      */
+	0x75, 0x01,         /*          Report Size (1),            */
+	0x09, 0x32,         /*          Usage (In Range),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x03,         /*          Input (Constant, Variable), */
+	0x75, 0x10,         /*          Report Size (16),           */
+	0x95, 0x01,         /*          Report Count (1),           */
+	0x14,               /*          Logical Minimum (0),        */
+	0xA4,               /*          Push,                       */
+	0x05, 0x01,         /*          Usage Page (Desktop),       */
+	0x65, 0x13,         /*          Unit (Inch),                */
+	0x55, 0xFD,         /*          Unit Exponent (-3),         */
+	0x34,               /*          Physical Minimum (0),       */
+	0x09, 0x30,         /*          Usage (X),                  */
+	0x46, 0xE0, 0x2E,   /*          Physical Maximum (12000),   */
+	0x26, 0xFF, 0x3F,   /*          Logical Maximum (16383),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0x09, 0x31,         /*          Usage (Y),                  */
+	0x46, 0x52, 0x1C,   /*          Physical Maximum (7250),    */
+	0x26, 0xFF, 0x3F,   /*          Logical Maximum (16383),    */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xB4,               /*          Pop,                        */
+	0x09, 0x30,         /*          Usage (Tip Pressure),       */
+	0x26, 0xFF, 0x03,   /*          Logical Maximum (1023),     */
+	0x81, 0x02,         /*          Input (Variable),           */
+	0xC0,               /*      End Collection,                 */
+	0xC0,               /*  End Collection,                     */
+	0x05, 0x01,         /*  Usage Page (Desktop),               */
+	0x09, 0x02,         /*  Usage (Mouse),                      */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x01,         /*      Report ID (1),                  */
+	0x09, 0x01,         /*      Usage (Pointer),                */
+	0xA0,               /*      Collection (Physical),          */
+	0x75, 0x08,         /*          Report Size (8),            */
+	0x95, 0x03,         /*          Report Count (3),           */
+	0x81, 0x03,         /*          Input (Constant, Variable), */
+	0x95, 0x02,         /*          Report Count (2),           */
+	0x15, 0xFF,         /*          Logical Minimum (-1),       */
+	0x25, 0x01,         /*          Logical Maximum (1),        */
+	0x09, 0x38,         /*          Usage (Wheel),              */
+	0x0B, 0x38, 0x02,   /*          Usage (Consumer AC Pan),    */
+		0x0C, 0x00,
+	0x81, 0x06,         /*          Input (Variable, Relative), */
+	0xC0,               /*      End Collection,                 */
+	0xC0,               /*  End Collection,                     */
+	0x05, 0x0C,         /*  Usage Page (Consumer),              */
+	0x09, 0x01,         /*  Usage (Consumer Control),           */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x0D,         /*      Report ID (13),                 */
+	0x95, 0x01,         /*      Report Count (1),               */
+	0x75, 0x10,         /*      Report Size (16),               */
+	0x81, 0x03,         /*      Input (Constant, Variable),     */
+	0x0A, 0x2F, 0x02,   /*      Usage (AC Zoom),                */
+	0x0A, 0x2E, 0x02,   /*      Usage (AC Zoom Out),            */
+	0x0A, 0x2D, 0x02,   /*      Usage (AC Zoom In),             */
+	0x09, 0xB6,         /*      Usage (Scan Previous Track),    */
+	0x09, 0xB5,         /*      Usage (Scan Next Track),        */
+	0x08,               /*      Usage (00h),                    */
+	0x08,               /*      Usage (00h),                    */
+	0x08,               /*      Usage (00h),                    */
+	0x08,               /*      Usage (00h),                    */
+	0x08,               /*      Usage (00h),                    */
+	0x0A, 0x2E, 0x02,   /*      Usage (AC Zoom Out),            */
+	0x0A, 0x2D, 0x02,   /*      Usage (AC Zoom In),             */
+	0x15, 0x0C,         /*      Logical Minimum (12),           */
+	0x25, 0x17,         /*      Logical Maximum (23),           */
+	0x75, 0x05,         /*      Report Size (5),                */
+	0x80,               /*      Input,                          */
+	0x75, 0x03,         /*      Report Size (3),                */
+	0x81, 0x03,         /*      Input (Constant, Variable),     */
+	0x75, 0x20,         /*      Report Size (32),               */
+	0x81, 0x03,         /*      Input (Constant, Variable),     */
+	0xC0,               /*  End Collection,                     */
+	0x09, 0x01,         /*  Usage (Consumer Control),           */
+	0xA1, 0x01,         /*  Collection (Application),           */
+	0x85, 0x0C,         /*      Report ID (12),                 */
+	0x75, 0x01,         /*      Report Size (1),                */
+	0x09, 0xE9,         /*      Usage (Volume Inc),             */
+	0x09, 0xEA,         /*      Usage (Volume Dec),             */
+	0x09, 0xE2,         /*      Usage (Mute),                   */
+	0x14,               /*      Logical Minimum (0),            */
+	0x25, 0x01,         /*      Logical Maximum (1),            */
+	0x95, 0x03,         /*      Report Count (3),               */
+	0x81, 0x06,         /*      Input (Variable, Relative),     */
+	0x75, 0x05,         /*      Report Size (5),                */
+	0x81, 0x03,         /*      Input (Constant, Variable),     */
+	0xC0                /*  End Collection                      */
+};
+
+static __u8 *waltop_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+		unsigned int *rsize)
+{
+	switch (hdev->product) {
+	case USB_DEVICE_ID_WALTOP_SLIM_TABLET_5_8_INCH:
+		if (*rsize == SLIM_TABLET_5_8_INCH_RDESC_ORIG_SIZE) {
+			rdesc = slim_tablet_5_8_inch_rdesc_fixed;
+			*rsize = sizeof(slim_tablet_5_8_inch_rdesc_fixed);
+		}
+		break;
+	case USB_DEVICE_ID_WALTOP_SLIM_TABLET_12_1_INCH:
+		if (*rsize == SLIM_TABLET_12_1_INCH_RDESC_ORIG_SIZE) {
+			rdesc = slim_tablet_12_1_inch_rdesc_fixed;
+			*rsize = sizeof(slim_tablet_12_1_inch_rdesc_fixed);
+		}
+		break;
+	case USB_DEVICE_ID_WALTOP_MEDIA_TABLET_10_6_INCH:
+		if (*rsize == MEDIA_TABLET_10_6_INCH_RDESC_ORIG_SIZE) {
+			rdesc = media_tablet_10_6_inch_rdesc_fixed;
+			*rsize = sizeof(media_tablet_10_6_inch_rdesc_fixed);
+		}
+		break;
+	case USB_DEVICE_ID_WALTOP_MEDIA_TABLET_14_1_INCH:
+		if (*rsize == MEDIA_TABLET_14_1_INCH_RDESC_ORIG_SIZE) {
+			rdesc = media_tablet_14_1_inch_rdesc_fixed;
+			*rsize = sizeof(media_tablet_14_1_inch_rdesc_fixed);
+		}
+		break;
+	}
+	return rdesc;
+}
+
+static const struct hid_device_id waltop_devices[] = {
+	{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP,
+				USB_DEVICE_ID_WALTOP_SLIM_TABLET_5_8_INCH) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP,
+				USB_DEVICE_ID_WALTOP_SLIM_TABLET_12_1_INCH) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP,
+				USB_DEVICE_ID_WALTOP_MEDIA_TABLET_10_6_INCH) },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP,
+				USB_DEVICE_ID_WALTOP_MEDIA_TABLET_14_1_INCH) },
+	{ }
+};
+MODULE_DEVICE_TABLE(hid, waltop_devices);
+
+static struct hid_driver waltop_driver = {
+	.name = "waltop",
+	.id_table = waltop_devices,
+	.report_fixup = waltop_report_fixup,
+};
+
+static int __init waltop_init(void)
+{
+	return hid_register_driver(&waltop_driver);
+}
+
+static void __exit waltop_exit(void)
+{
+	hid_unregister_driver(&waltop_driver);
+}
+
+module_init(waltop_init);
+module_exit(waltop_exit);
+MODULE_LICENSE("GPL");
diff --git a/drivers/hid/hid-zydacron.c b/drivers/hid/hid-zydacron.c
index 9e8d35a203e4..aac1f9273149 100644
--- a/drivers/hid/hid-zydacron.c
+++ b/drivers/hid/hid-zydacron.c
@@ -27,10 +27,10 @@ struct zc_device {
 * Zydacron remote control has an invalid HID report descriptor,
 * that needs fixing before we can parse it.
 */
-static void zc_report_fixup(struct hid_device *hdev, __u8 *rdesc,
-	unsigned int rsize)
+static __u8 *zc_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+	unsigned int *rsize)
 {
-	if (rsize >= 253 &&
+	if (*rsize >= 253 &&
 		rdesc[0x96] == 0xbc && rdesc[0x97] == 0xff &&
 		rdesc[0xca] == 0xbc && rdesc[0xcb] == 0xff &&
 		rdesc[0xe1] == 0xbc && rdesc[0xe2] == 0xff) {
@@ -40,6 +40,7 @@ static void zc_report_fixup(struct hid_device *hdev, __u8 *rdesc,
 			rdesc[0x96] = rdesc[0xca] = rdesc[0xe1] = 0x0c;
 			rdesc[0x97] = rdesc[0xcb] = rdesc[0xe2] = 0x00;
 		}
+	return rdesc;
 }
 
 #define zc_map_key_clear(c) \
diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c
index 925992f549f0..8a4b32dca9f7 100644
--- a/drivers/hid/hidraw.c
+++ b/drivers/hid/hidraw.c
@@ -218,9 +218,13 @@ static int hidraw_release(struct inode * inode, struct file * file)
 	unsigned int minor = iminor(inode);
 	struct hidraw *dev;
 	struct hidraw_list *list = file->private_data;
+	int ret;
 
-	if (!hidraw_table[minor])
-		return -ENODEV;
+	mutex_lock(&minors_lock);
+	if (!hidraw_table[minor]) {
+		ret = -ENODEV;
+		goto unlock;
+	}
 
 	list_del(&list->node);
 	dev = hidraw_table[minor];
@@ -233,10 +237,12 @@ static int hidraw_release(struct inode * inode, struct file * file)
 			kfree(list->hidraw);
 		}
 	}
-
 	kfree(list);
+	ret = 0;
+unlock:
+	mutex_unlock(&minors_lock);
 
-	return 0;
+	return ret;
 }
 
 static long hidraw_ioctl(struct file *file, unsigned int cmd,
diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index 599041a7f670..5489eab3a6bd 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -807,9 +807,10 @@ static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t co
 	struct usb_host_interface *interface = intf->cur_altsetting;
 	int ret;
 
-	if (usbhid->urbout) {
+	if (usbhid->urbout && report_type != HID_FEATURE_REPORT) {
 		int actual_length;
 		int skipped_report_id = 0;
+
 		if (buf[0] == 0x0) {
 			/* Don't send the Report ID */
 			buf++;
@@ -1469,9 +1470,6 @@ static int __init hid_init(void)
 	retval = usbhid_quirks_init(quirks_param);
 	if (retval)
 		goto usbhid_quirks_init_fail;
-	retval = hiddev_init();
-	if (retval)
-		goto hiddev_init_fail;
 	retval = usb_register(&hid_driver);
 	if (retval)
 		goto usb_register_fail;
@@ -1479,8 +1477,6 @@ static int __init hid_init(void)
 
 	return 0;
 usb_register_fail:
-	hiddev_exit();
-hiddev_init_fail:
 	usbhid_quirks_exit();
 usbhid_quirks_init_fail:
 	hid_unregister_driver(&hid_usb_driver);
@@ -1493,7 +1489,6 @@ no_queue:
 static void __exit hid_exit(void)
 {
 	usb_deregister(&hid_driver);
-	hiddev_exit();
 	usbhid_quirks_exit();
 	hid_unregister_driver(&hid_usb_driver);
 	destroy_workqueue(resumption_waker);
diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c
index f0260c699adb..2c185477eeb3 100644
--- a/drivers/hid/usbhid/hid-quirks.c
+++ b/drivers/hid/usbhid/hid-quirks.c
@@ -34,7 +34,6 @@ static const struct hid_blacklist {
 	{ USB_VENDOR_ID_ALPS, USB_DEVICE_ID_IBM_GAMEPAD, HID_QUIRK_BADPAD },
 	{ USB_VENDOR_ID_CHIC, USB_DEVICE_ID_CHIC_GAMEPAD, HID_QUIRK_BADPAD },
 	{ USB_VENDOR_ID_DWAV, USB_DEVICE_ID_EGALAX_TOUCHCONTROLLER, HID_QUIRK_MULTI_INPUT | HID_QUIRK_NOGET },
-	{ USB_VENDOR_ID_DWAV, USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH, HID_QUIRK_MULTI_INPUT },
 	{ USB_VENDOR_ID_MOJO, USB_DEVICE_ID_RETRO_ADAPTER, HID_QUIRK_MULTI_INPUT },
 	{ USB_VENDOR_ID_TURBOX, USB_DEVICE_ID_TURBOX_TOUCHSCREEN_MOSART, HID_QUIRK_MULTI_INPUT },
 	{ USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_DRIVING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT },
@@ -63,6 +62,7 @@ static const struct hid_blacklist {
 	{ USB_VENDOR_ID_CH, USB_DEVICE_ID_CH_FLIGHT_SIM_YOKE, HID_QUIRK_NOGET },
 	{ USB_VENDOR_ID_CH, USB_DEVICE_ID_CH_PRO_PEDALS, HID_QUIRK_NOGET },
 	{ USB_VENDOR_ID_CH, USB_DEVICE_ID_CH_3AXIS_5BUTTON_STICK, HID_QUIRK_NOGET },
+	{ USB_VENDOR_ID_CH, USB_DEVICE_ID_CH_AXIS_295, HID_QUIRK_NOGET },
 	{ USB_VENDOR_ID_DMI, USB_DEVICE_ID_DMI_ENC, HID_QUIRK_NOGET },
 	{ USB_VENDOR_ID_ELO, USB_DEVICE_ID_ELO_TS2700, HID_QUIRK_NOGET },
 	{ USB_VENDOR_ID_PRODIGE, USB_DEVICE_ID_PRODIGE_CORDLESS, HID_QUIRK_NOGET },
@@ -72,6 +72,10 @@ static const struct hid_blacklist {
 	{ USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_PF1209, HID_QUIRK_MULTI_INPUT },
 	{ USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP4030U, HID_QUIRK_MULTI_INPUT },
 	{ USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_KNA5, HID_QUIRK_MULTI_INPUT },
+	{ USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP5540U, HID_QUIRK_MULTI_INPUT },
+	{ USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP8060U, HID_QUIRK_MULTI_INPUT },
+	{ USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_MEDIA_TABLET_10_6_INCH, HID_QUIRK_MULTI_INPUT },
+	{ USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_MEDIA_TABLET_14_1_INCH, HID_QUIRK_MULTI_INPUT },
 	{ USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_DUAL_USB_JOYPAD, HID_QUIRK_NOGET | HID_QUIRK_MULTI_INPUT | HID_QUIRK_SKIP_OUTPUT_REPORTS },
 	{ USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_QUAD_USB_JOYPAD, HID_QUIRK_NOGET | HID_QUIRK_MULTI_INPUT },
 
diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c
index dfcb27613ec5..fedd88df9a18 100644
--- a/drivers/hid/usbhid/hiddev.c
+++ b/drivers/hid/usbhid/hiddev.c
@@ -67,8 +67,6 @@ struct hiddev_list {
 	struct mutex thread_lock;
 };
 
-static struct usb_driver hiddev_driver;
-
 /*
  * Find a report, given the report's type and ID.  The ID can be specified
  * indirectly by REPORT_ID_FIRST (which returns the first report of the given
@@ -926,41 +924,3 @@ void hiddev_disconnect(struct hid_device *hid)
 		kfree(hiddev);
 	}
 }
-
-/* Currently this driver is a USB driver.  It's not a conventional one in
- * the sense that it doesn't probe at the USB level.  Instead it waits to
- * be connected by HID through the hiddev_connect / hiddev_disconnect
- * routines.  The reason to register as a USB device is to gain part of the
- * minor number space from the USB major.
- *
- * In theory, should the HID code be generalized to more than one physical
- * medium (say, IEEE 1384), this driver will probably need to register its
- * own major number, and in doing so, no longer need to register with USB.
- * At that point the probe routine and hiddev_driver struct below will no
- * longer be useful.
- */
-
-
-/* We never attach in this manner, and rely on HID to connect us.  This
- * is why there is no disconnect routine defined in the usb_driver either.
- */
-static int hiddev_usbd_probe(struct usb_interface *intf,
-			     const struct usb_device_id *hiddev_info)
-{
-	return -ENODEV;
-}
-
-static /* const */ struct usb_driver hiddev_driver = {
-	.name =		"hiddev",
-	.probe =	hiddev_usbd_probe,
-};
-
-int __init hiddev_init(void)
-{
-	return usb_register(&hiddev_driver);
-}
-
-void hiddev_exit(void)
-{
-	usb_deregister(&hiddev_driver);
-}
diff --git a/drivers/i2c/Kconfig b/drivers/i2c/Kconfig
index 30f06e956bfb..b923074b2cbe 100644
--- a/drivers/i2c/Kconfig
+++ b/drivers/i2c/Kconfig
@@ -75,7 +75,8 @@ config I2C_HELPER_AUTO
 	  In doubt, say Y.
 
 config I2C_SMBUS
-	tristate "SMBus-specific protocols" if !I2C_HELPER_AUTO
+	tristate
+	prompt "SMBus-specific protocols" if !I2C_HELPER_AUTO
 	help
 	  Say Y here if you want support for SMBus extensions to the I2C
 	  specification. At the moment, the only supported extension is
diff --git a/drivers/i2c/Makefile b/drivers/i2c/Makefile
index c00fd66388f5..23ac61e2db39 100644
--- a/drivers/i2c/Makefile
+++ b/drivers/i2c/Makefile
@@ -9,6 +9,4 @@ obj-$(CONFIG_I2C_CHARDEV)	+= i2c-dev.o
 obj-$(CONFIG_I2C_MUX)		+= i2c-mux.o
 obj-y				+= algos/ busses/ muxes/
 
-ifeq ($(CONFIG_I2C_DEBUG_CORE),y)
-EXTRA_CFLAGS += -DDEBUG
-endif
+ccflags-$(CONFIG_I2C_DEBUG_CORE) := -DDEBUG
diff --git a/drivers/i2c/algos/Kconfig b/drivers/i2c/algos/Kconfig
index 7b2ce4a08524..3998dd620a03 100644
--- a/drivers/i2c/algos/Kconfig
+++ b/drivers/i2c/algos/Kconfig
@@ -15,3 +15,15 @@ config I2C_ALGOPCA
 	tristate "I2C PCA 9564 interfaces"
 
 endmenu
+
+# In automatic configuration mode, we still have to define the
+# symbols to avoid unmet dependencies.
+
+if I2C_HELPER_AUTO
+config I2C_ALGOBIT
+	tristate
+config I2C_ALGOPCF
+	tristate
+config I2C_ALGOPCA
+	tristate
+endif
diff --git a/drivers/i2c/algos/Makefile b/drivers/i2c/algos/Makefile
index 18b3e962ec09..215303f60d61 100644
--- a/drivers/i2c/algos/Makefile
+++ b/drivers/i2c/algos/Makefile
@@ -6,6 +6,4 @@ obj-$(CONFIG_I2C_ALGOBIT)	+= i2c-algo-bit.o
 obj-$(CONFIG_I2C_ALGOPCF)	+= i2c-algo-pcf.o
 obj-$(CONFIG_I2C_ALGOPCA)	+= i2c-algo-pca.o
 
-ifeq ($(CONFIG_I2C_DEBUG_ALGO),y)
-EXTRA_CFLAGS += -DDEBUG
-endif
+ccflags-$(CONFIG_I2C_DEBUG_ALGO) := -DDEBUG
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index c3ef49230cba..033ad413f328 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -76,6 +76,4 @@ obj-$(CONFIG_I2C_STUB)		+= i2c-stub.o
 obj-$(CONFIG_SCx200_ACB)	+= scx200_acb.o
 obj-$(CONFIG_SCx200_I2C)	+= scx200_i2c.o
 
-ifeq ($(CONFIG_I2C_DEBUG_BUS),y)
-EXTRA_CFLAGS += -DDEBUG
-endif
+ccflags-$(CONFIG_I2C_DEBUG_BUS) := -DDEBUG
diff --git a/drivers/i2c/busses/i2c-amd8111.c b/drivers/i2c/busses/i2c-amd8111.c
index af1e5e254b7b..6b6a6b1d7025 100644
--- a/drivers/i2c/busses/i2c-amd8111.c
+++ b/drivers/i2c/busses/i2c-amd8111.c
@@ -69,7 +69,7 @@ static struct pci_driver amd8111_driver;
  * ACPI 2.0 chapter 13 access of registers of the EC
  */
 
-static unsigned int amd_ec_wait_write(struct amd_smbus *smbus)
+static int amd_ec_wait_write(struct amd_smbus *smbus)
 {
 	int timeout = 500;
 
@@ -85,7 +85,7 @@ static unsigned int amd_ec_wait_write(struct amd_smbus *smbus)
 	return 0;
 }
 
-static unsigned int amd_ec_wait_read(struct amd_smbus *smbus)
+static int amd_ec_wait_read(struct amd_smbus *smbus)
 {
 	int timeout = 500;
 
@@ -101,7 +101,7 @@ static unsigned int amd_ec_wait_read(struct amd_smbus *smbus)
 	return 0;
 }
 
-static unsigned int amd_ec_read(struct amd_smbus *smbus, unsigned char address,
+static int amd_ec_read(struct amd_smbus *smbus, unsigned char address,
 		unsigned char *data)
 {
 	int status;
@@ -124,7 +124,7 @@ static unsigned int amd_ec_read(struct amd_smbus *smbus, unsigned char address,
 	return 0;
 }
 
-static unsigned int amd_ec_write(struct amd_smbus *smbus, unsigned char address,
+static int amd_ec_write(struct amd_smbus *smbus, unsigned char address,
 		unsigned char data)
 {
 	int status;
@@ -196,7 +196,7 @@ static s32 amd8111_access(struct i2c_adapter * adap, u16 addr,
 {
 	struct amd_smbus *smbus = adap->algo_data;
 	unsigned char protocol, len, pec, temp[2];
-	int i;
+	int i, status;
 
 	protocol = (read_write == I2C_SMBUS_READ) ? AMD_SMB_PRTCL_READ
 						  : AMD_SMB_PRTCL_WRITE;
@@ -209,38 +209,62 @@ static s32 amd8111_access(struct i2c_adapter * adap, u16 addr,
 			break;
 
 		case I2C_SMBUS_BYTE:
-			if (read_write == I2C_SMBUS_WRITE)
-				amd_ec_write(smbus, AMD_SMB_CMD, command);
+			if (read_write == I2C_SMBUS_WRITE) {
+				status = amd_ec_write(smbus, AMD_SMB_CMD,
+						      command);
+				if (status)
+					return status;
+			}
 			protocol |= AMD_SMB_PRTCL_BYTE;
 			break;
 
 		case I2C_SMBUS_BYTE_DATA:
-			amd_ec_write(smbus, AMD_SMB_CMD, command);
-			if (read_write == I2C_SMBUS_WRITE)
-				amd_ec_write(smbus, AMD_SMB_DATA, data->byte);
+			status = amd_ec_write(smbus, AMD_SMB_CMD, command);
+			if (status)
+				return status;
+			if (read_write == I2C_SMBUS_WRITE) {
+				status = amd_ec_write(smbus, AMD_SMB_DATA,
+						      data->byte);
+				if (status)
+					return status;
+			}
 			protocol |= AMD_SMB_PRTCL_BYTE_DATA;
 			break;
 
 		case I2C_SMBUS_WORD_DATA:
-			amd_ec_write(smbus, AMD_SMB_CMD, command);
+			status = amd_ec_write(smbus, AMD_SMB_CMD, command);
+			if (status)
+				return status;
 			if (read_write == I2C_SMBUS_WRITE) {
-				amd_ec_write(smbus, AMD_SMB_DATA,
-					     data->word & 0xff);
-				amd_ec_write(smbus, AMD_SMB_DATA + 1,
-					     data->word >> 8);
+				status = amd_ec_write(smbus, AMD_SMB_DATA,
+						      data->word & 0xff);
+				if (status)
+					return status;
+				status = amd_ec_write(smbus, AMD_SMB_DATA + 1,
+						      data->word >> 8);
+				if (status)
+					return status;
 			}
 			protocol |= AMD_SMB_PRTCL_WORD_DATA | pec;
 			break;
 
 		case I2C_SMBUS_BLOCK_DATA:
-			amd_ec_write(smbus, AMD_SMB_CMD, command);
+			status = amd_ec_write(smbus, AMD_SMB_CMD, command);
+			if (status)
+				return status;
 			if (read_write == I2C_SMBUS_WRITE) {
 				len = min_t(u8, data->block[0],
 					    I2C_SMBUS_BLOCK_MAX);
-				amd_ec_write(smbus, AMD_SMB_BCNT, len);
-				for (i = 0; i < len; i++)
-					amd_ec_write(smbus, AMD_SMB_DATA + i,
-						     data->block[i + 1]);
+				status = amd_ec_write(smbus, AMD_SMB_BCNT, len);
+				if (status)
+					return status;
+				for (i = 0; i < len; i++) {
+					status =
+					  amd_ec_write(smbus, AMD_SMB_DATA + i,
+						       data->block[i + 1]);
+					if (status)
+						return status;
+				}
 			}
 			protocol |= AMD_SMB_PRTCL_BLOCK_DATA | pec;
 			break;
@@ -248,19 +272,35 @@ static s32 amd8111_access(struct i2c_adapter * adap, u16 addr,
 		case I2C_SMBUS_I2C_BLOCK_DATA:
 			len = min_t(u8, data->block[0],
 				    I2C_SMBUS_BLOCK_MAX);
-			amd_ec_write(smbus, AMD_SMB_CMD, command);
-			amd_ec_write(smbus, AMD_SMB_BCNT, len);
+			status = amd_ec_write(smbus, AMD_SMB_CMD, command);
+			if (status)
+				return status;
+			status = amd_ec_write(smbus, AMD_SMB_BCNT, len);
+			if (status)
+				return status;
 			if (read_write == I2C_SMBUS_WRITE)
-				for (i = 0; i < len; i++)
-					amd_ec_write(smbus, AMD_SMB_DATA + i,
-						     data->block[i + 1]);
+				for (i = 0; i < len; i++) {
+					status =
+					  amd_ec_write(smbus, AMD_SMB_DATA + i,
+						       data->block[i + 1]);
+					if (status)
+						return status;
+				}
 			protocol |= AMD_SMB_PRTCL_I2C_BLOCK_DATA;
 			break;
 
 		case I2C_SMBUS_PROC_CALL:
-			amd_ec_write(smbus, AMD_SMB_CMD, command);
-			amd_ec_write(smbus, AMD_SMB_DATA, data->word & 0xff);
-			amd_ec_write(smbus, AMD_SMB_DATA + 1, data->word >> 8);
+			status = amd_ec_write(smbus, AMD_SMB_CMD, command);
+			if (status)
+				return status;
+			status = amd_ec_write(smbus, AMD_SMB_DATA,
+					      data->word & 0xff);
+			if (status)
+				return status;
+			status = amd_ec_write(smbus, AMD_SMB_DATA + 1,
+					      data->word >> 8);
+			if (status)
+				return status;
 			protocol = AMD_SMB_PRTCL_PROC_CALL | pec;
 			read_write = I2C_SMBUS_READ;
 			break;
@@ -268,11 +308,18 @@ static s32 amd8111_access(struct i2c_adapter * adap, u16 addr,
 		case I2C_SMBUS_BLOCK_PROC_CALL:
 			len = min_t(u8, data->block[0],
 				    I2C_SMBUS_BLOCK_MAX - 1);
-			amd_ec_write(smbus, AMD_SMB_CMD, command);
-			amd_ec_write(smbus, AMD_SMB_BCNT, len);
-			for (i = 0; i < len; i++)
-				amd_ec_write(smbus, AMD_SMB_DATA + i,
-					     data->block[i + 1]);
+			status = amd_ec_write(smbus, AMD_SMB_CMD, command);
+			if (status)
+				return status;
+			status = amd_ec_write(smbus, AMD_SMB_BCNT, len);
+			if (status)
+				return status;
+			for (i = 0; i < len; i++) {
+				status = amd_ec_write(smbus, AMD_SMB_DATA + i,
+						      data->block[i + 1]);
+				if (status)
+					return status;
+			}
 			protocol = AMD_SMB_PRTCL_BLOCK_PROC_CALL | pec;
 			read_write = I2C_SMBUS_READ;
 			break;
@@ -282,24 +329,29 @@ static s32 amd8111_access(struct i2c_adapter * adap, u16 addr,
 			return -EOPNOTSUPP;
 	}
 
-	amd_ec_write(smbus, AMD_SMB_ADDR, addr << 1);
-	amd_ec_write(smbus, AMD_SMB_PRTCL, protocol);
+	status = amd_ec_write(smbus, AMD_SMB_ADDR, addr << 1);
+	if (status)
+		return status;
+	status = amd_ec_write(smbus, AMD_SMB_PRTCL, protocol);
+	if (status)
+		return status;
 
-	/* FIXME this discards status from ec_read(); so temp[0] will
-	 * hold stack garbage ... the rest of this routine will act
-	 * nonsensically.  Ignored ec_write() status might explain
-	 * some such failures...
-	 */
-	amd_ec_read(smbus, AMD_SMB_STS, temp + 0);
+	status = amd_ec_read(smbus, AMD_SMB_STS, temp + 0);
+	if (status)
+		return status;
 
 	if (~temp[0] & AMD_SMB_STS_DONE) {
 		udelay(500);
-		amd_ec_read(smbus, AMD_SMB_STS, temp + 0);
+		status = amd_ec_read(smbus, AMD_SMB_STS, temp + 0);
+		if (status)
+			return status;
 	}
 
 	if (~temp[0] & AMD_SMB_STS_DONE) {
 		msleep(1);
-		amd_ec_read(smbus, AMD_SMB_STS, temp + 0);
+		status = amd_ec_read(smbus, AMD_SMB_STS, temp + 0);
+		if (status)
+			return status;
 	}
 
 	if ((~temp[0] & AMD_SMB_STS_DONE) || (temp[0] & AMD_SMB_STS_STATUS))
@@ -311,24 +363,35 @@ static s32 amd8111_access(struct i2c_adapter * adap, u16 addr,
 	switch (size) {
 		case I2C_SMBUS_BYTE:
 		case I2C_SMBUS_BYTE_DATA:
-			amd_ec_read(smbus, AMD_SMB_DATA, &data->byte);
+			status = amd_ec_read(smbus, AMD_SMB_DATA, &data->byte);
+			if (status)
+				return status;
 			break;
 
 		case I2C_SMBUS_WORD_DATA:
 		case I2C_SMBUS_PROC_CALL:
-			amd_ec_read(smbus, AMD_SMB_DATA, temp + 0);
-			amd_ec_read(smbus, AMD_SMB_DATA + 1, temp + 1);
+			status = amd_ec_read(smbus, AMD_SMB_DATA, temp + 0);
+			if (status)
+				return status;
+			status = amd_ec_read(smbus, AMD_SMB_DATA + 1, temp + 1);
+			if (status)
+				return status;
 			data->word = (temp[1] << 8) | temp[0];
 			break;
 
 		case I2C_SMBUS_BLOCK_DATA:
 		case I2C_SMBUS_BLOCK_PROC_CALL:
-			amd_ec_read(smbus, AMD_SMB_BCNT, &len);
+			status = amd_ec_read(smbus, AMD_SMB_BCNT, &len);
+			if (status)
+				return status;
 			len = min_t(u8, len, I2C_SMBUS_BLOCK_MAX);
 		case I2C_SMBUS_I2C_BLOCK_DATA:
-			for (i = 0; i < len; i++)
-				amd_ec_read(smbus, AMD_SMB_DATA + i,
-					    data->block + i + 1);
+			for (i = 0; i < len; i++) {
+				status = amd_ec_read(smbus, AMD_SMB_DATA + i,
+						     data->block + i + 1);
+				if (status)
+					return status;
+			}
 			data->block[0] = len;
 			break;
 	}
diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c
index 89eedf45d30e..6e3c38240336 100644
--- a/drivers/i2c/busses/i2c-ibm_iic.c
+++ b/drivers/i2c/busses/i2c-ibm_iic.c
@@ -41,7 +41,6 @@
 #include <asm/irq.h>
 #include <linux/io.h>
 #include <linux/i2c.h>
-#include <linux/i2c-id.h>
 #include <linux/of_platform.h>
 #include <linux/of_i2c.h>
 
diff --git a/drivers/i2c/busses/i2c-nuc900.c b/drivers/i2c/busses/i2c-nuc900.c
index 92d770d7bbc2..72434263787b 100644
--- a/drivers/i2c/busses/i2c-nuc900.c
+++ b/drivers/i2c/busses/i2c-nuc900.c
@@ -16,7 +16,6 @@
 #include <linux/module.h>
 
 #include <linux/i2c.h>
-#include <linux/i2c-id.h>
 #include <linux/init.h>
 #include <linux/time.h>
 #include <linux/interrupt.h>
diff --git a/drivers/i2c/busses/i2c-pca-platform.c b/drivers/i2c/busses/i2c-pca-platform.c
index 5f6d7f89e225..ace67995d7de 100644
--- a/drivers/i2c/busses/i2c-pca-platform.c
+++ b/drivers/i2c/busses/i2c-pca-platform.c
@@ -224,7 +224,7 @@ static int __devinit i2c_pca_pf_probe(struct platform_device *pdev)
 
 	if (irq) {
 		ret = request_irq(irq, i2c_pca_pf_handler,
-			IRQF_TRIGGER_FALLING, i2c->adap.name, i2c);
+			IRQF_TRIGGER_FALLING, pdev->name, i2c);
 		if (ret)
 			goto e_reqirq;
 	}
diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c
index c94e51b2651e..f4c19a97e0b3 100644
--- a/drivers/i2c/busses/i2c-pxa.c
+++ b/drivers/i2c/busses/i2c-pxa.c
@@ -22,7 +22,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/i2c.h>
-#include <linux/i2c-id.h>
 #include <linux/init.h>
 #include <linux/time.h>
 #include <linux/sched.h>
diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c
index bf831bf81587..6a292ea5e35c 100644
--- a/drivers/i2c/busses/i2c-s3c2410.c
+++ b/drivers/i2c/busses/i2c-s3c2410.c
@@ -24,7 +24,6 @@
 #include <linux/module.h>
 
 #include <linux/i2c.h>
-#include <linux/i2c-id.h>
 #include <linux/init.h>
 #include <linux/time.h>
 #include <linux/interrupt.h>
diff --git a/drivers/i2c/busses/i2c-viapro.c b/drivers/i2c/busses/i2c-viapro.c
index 4c6fff5f330d..0b012f1f8ac5 100644
--- a/drivers/i2c/busses/i2c-viapro.c
+++ b/drivers/i2c/busses/i2c-viapro.c
@@ -185,14 +185,8 @@ static int vt596_transaction(u8 size)
 	}
 
 	if (temp & 0x04) {
-		int read = inb_p(SMBHSTADD) & 0x01;
 		result = -ENXIO;
-		/* The quick and receive byte commands are used to probe
-		   for chips, so errors are expected, and we don't want
-		   to frighten the user. */
-		if (!((size == VT596_QUICK && !read) ||
-		      (size == VT596_BYTE && read)))
-			dev_err(&vt596_adapter.dev, "Transaction error!\n");
+		dev_dbg(&vt596_adapter.dev, "No response\n");
 	}
 
 	/* Resetting status register */
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index bea4c5021d26..d231f683f576 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -425,14 +425,14 @@ static int __i2c_check_addr_busy(struct device *dev, void *addrp)
 /* walk up mux tree */
 static int i2c_check_mux_parents(struct i2c_adapter *adapter, int addr)
 {
+	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);
 	int result;
 
 	result = device_for_each_child(&adapter->dev, &addr,
 					__i2c_check_addr_busy);
 
-	if (!result && i2c_parent_is_i2c_adapter(adapter))
-		result = i2c_check_mux_parents(
-				    to_i2c_adapter(adapter->dev.parent), addr);
+	if (!result && parent)
+		result = i2c_check_mux_parents(parent, addr);
 
 	return result;
 }
@@ -453,11 +453,11 @@ static int i2c_check_mux_children(struct device *dev, void *addrp)
 
 static int i2c_check_addr_busy(struct i2c_adapter *adapter, int addr)
 {
+	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);
 	int result = 0;
 
-	if (i2c_parent_is_i2c_adapter(adapter))
-		result = i2c_check_mux_parents(
-				    to_i2c_adapter(adapter->dev.parent), addr);
+	if (parent)
+		result = i2c_check_mux_parents(parent, addr);
 
 	if (!result)
 		result = device_for_each_child(&adapter->dev, &addr,
@@ -472,8 +472,10 @@ static int i2c_check_addr_busy(struct i2c_adapter *adapter, int addr)
  */
 void i2c_lock_adapter(struct i2c_adapter *adapter)
 {
-	if (i2c_parent_is_i2c_adapter(adapter))
-		i2c_lock_adapter(to_i2c_adapter(adapter->dev.parent));
+	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);
+
+	if (parent)
+		i2c_lock_adapter(parent);
 	else
 		rt_mutex_lock(&adapter->bus_lock);
 }
@@ -485,8 +487,10 @@ EXPORT_SYMBOL_GPL(i2c_lock_adapter);
  */
 static int i2c_trylock_adapter(struct i2c_adapter *adapter)
 {
-	if (i2c_parent_is_i2c_adapter(adapter))
-		return i2c_trylock_adapter(to_i2c_adapter(adapter->dev.parent));
+	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);
+
+	if (parent)
+		return i2c_trylock_adapter(parent);
 	else
 		return rt_mutex_trylock(&adapter->bus_lock);
 }
@@ -497,8 +501,10 @@ static int i2c_trylock_adapter(struct i2c_adapter *adapter)
  */
 void i2c_unlock_adapter(struct i2c_adapter *adapter)
 {
-	if (i2c_parent_is_i2c_adapter(adapter))
-		i2c_unlock_adapter(to_i2c_adapter(adapter->dev.parent));
+	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);
+
+	if (parent)
+		i2c_unlock_adapter(parent);
 	else
 		rt_mutex_unlock(&adapter->bus_lock);
 }
@@ -677,8 +683,6 @@ i2c_sysfs_new_device(struct device *dev, struct device_attribute *attr,
 	char *blank, end;
 	int res;
 
-	dev_warn(dev, "The new_device interface is still experimental "
-		 "and may change in a near future\n");
 	memset(&info, 0, sizeof(struct i2c_board_info));
 
 	blank = strchr(buf, ' ');
@@ -1504,26 +1508,25 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver)
 	if (!driver->detect || !address_list)
 		return 0;
 
+	/* Stop here if the classes do not match */
+	if (!(adapter->class & driver->class))
+		return 0;
+
 	/* Set up a temporary client to help detect callback */
 	temp_client = kzalloc(sizeof(struct i2c_client), GFP_KERNEL);
 	if (!temp_client)
 		return -ENOMEM;
 	temp_client->adapter = adapter;
 
-	/* Stop here if the classes do not match */
-	if (!(adapter->class & driver->class))
-		goto exit_free;
-
 	for (i = 0; address_list[i] != I2C_CLIENT_END; i += 1) {
 		dev_dbg(&adapter->dev, "found normal entry for adapter %d, "
 			"addr 0x%02x\n", adap_id, address_list[i]);
 		temp_client->addr = address_list[i];
 		err = i2c_detect_address(temp_client, driver);
-		if (err)
-			goto exit_free;
+		if (unlikely(err))
+			break;
 	}
 
- exit_free:
 	kfree(temp_client);
 	return err;
 }
diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c
index 5f3a52d517c3..cec0f3ba97f8 100644
--- a/drivers/i2c/i2c-dev.c
+++ b/drivers/i2c/i2c-dev.c
@@ -192,13 +192,12 @@ static int i2cdev_check(struct device *dev, void *addrp)
 /* walk up mux tree */
 static int i2cdev_check_mux_parents(struct i2c_adapter *adapter, int addr)
 {
+	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);
 	int result;
 
 	result = device_for_each_child(&adapter->dev, &addr, i2cdev_check);
-
-	if (!result && i2c_parent_is_i2c_adapter(adapter))
-		result = i2cdev_check_mux_parents(
-				    to_i2c_adapter(adapter->dev.parent), addr);
+	if (!result && parent)
+		result = i2cdev_check_mux_parents(parent, addr);
 
 	return result;
 }
@@ -222,11 +221,11 @@ static int i2cdev_check_mux_children(struct device *dev, void *addrp)
    driver bound to it, as NOT busy. */
 static int i2cdev_check_addr(struct i2c_adapter *adapter, unsigned int addr)
 {
+	struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter);
 	int result = 0;
 
-	if (i2c_parent_is_i2c_adapter(adapter))
-		result = i2cdev_check_mux_parents(
-				    to_i2c_adapter(adapter->dev.parent), addr);
+	if (parent)
+		result = i2cdev_check_mux_parents(parent, addr);
 
 	if (!result)
 		result = device_for_each_child(&adapter->dev, &addr,
diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig
index 4c9a99c4fcb0..4d91d80bfd23 100644
--- a/drivers/i2c/muxes/Kconfig
+++ b/drivers/i2c/muxes/Kconfig
@@ -5,6 +5,16 @@
 menu "Multiplexer I2C Chip support"
 	depends on I2C_MUX
 
+config I2C_MUX_PCA9541
+	tristate "NXP PCA9541 I2C Master Selector"
+	depends on EXPERIMENTAL
+	help
+	  If you say yes here you get support for the NXP PCA9541
+	  I2C Master Selector.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called pca9541.
+
 config I2C_MUX_PCA954x
 	tristate "Philips PCA954x I2C Mux/switches"
 	depends on EXPERIMENTAL
diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile
index bd83b5274815..d743806d9b42 100644
--- a/drivers/i2c/muxes/Makefile
+++ b/drivers/i2c/muxes/Makefile
@@ -1,8 +1,7 @@
 #
 # Makefile for multiplexer I2C chip drivers.
 
+obj-$(CONFIG_I2C_MUX_PCA9541)	+= pca9541.o
 obj-$(CONFIG_I2C_MUX_PCA954x)	+= pca954x.o
 
-ifeq ($(CONFIG_I2C_DEBUG_BUS),y)
-EXTRA_CFLAGS += -DDEBUG
-endif
+ccflags-$(CONFIG_I2C_DEBUG_BUS) := -DDEBUG
diff --git a/drivers/i2c/muxes/pca9541.c b/drivers/i2c/muxes/pca9541.c
new file mode 100644
index 000000000000..ed699c5aa79d
--- /dev/null
+++ b/drivers/i2c/muxes/pca9541.c
@@ -0,0 +1,411 @@
+/*
+ * I2C multiplexer driver for PCA9541 bus master selector
+ *
+ * Copyright (c) 2010 Ericsson AB.
+ *
+ * Author: Guenter Roeck <guenter.roeck@ericsson.com>
+ *
+ * Derived from:
+ *  pca954x.c
+ *
+ *  Copyright (c) 2008-2009 Rodolfo Giometti <giometti@linux.it>
+ *  Copyright (c) 2008-2009 Eurotech S.p.A. <info@eurotech.it>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/i2c.h>
+#include <linux/i2c-mux.h>
+
+#include <linux/i2c/pca954x.h>
+
+/*
+ * The PCA9541 is a bus master selector. It supports two I2C masters connected
+ * to a single slave bus.
+ *
+ * Before each bus transaction, a master has to acquire bus ownership. After the
+ * transaction is complete, bus ownership has to be released. This fits well
+ * into the I2C multiplexer framework, which provides select and release
+ * functions for this purpose. For this reason, this driver is modeled as
+ * single-channel I2C bus multiplexer.
+ *
+ * This driver assumes that the two bus masters are controlled by two different
+ * hosts. If a single host controls both masters, platform code has to ensure
+ * that only one of the masters is instantiated at any given time.
+ */
+
+#define PCA9541_CONTROL		0x01
+#define PCA9541_ISTAT		0x02
+
+#define PCA9541_CTL_MYBUS	(1 << 0)
+#define PCA9541_CTL_NMYBUS	(1 << 1)
+#define PCA9541_CTL_BUSON	(1 << 2)
+#define PCA9541_CTL_NBUSON	(1 << 3)
+#define PCA9541_CTL_BUSINIT	(1 << 4)
+#define PCA9541_CTL_TESTON	(1 << 6)
+#define PCA9541_CTL_NTESTON	(1 << 7)
+
+#define PCA9541_ISTAT_INTIN	(1 << 0)
+#define PCA9541_ISTAT_BUSINIT	(1 << 1)
+#define PCA9541_ISTAT_BUSOK	(1 << 2)
+#define PCA9541_ISTAT_BUSLOST	(1 << 3)
+#define PCA9541_ISTAT_MYTEST	(1 << 6)
+#define PCA9541_ISTAT_NMYTEST	(1 << 7)
+
+#define BUSON		(PCA9541_CTL_BUSON | PCA9541_CTL_NBUSON)
+#define MYBUS		(PCA9541_CTL_MYBUS | PCA9541_CTL_NMYBUS)
+#define mybus(x)	(!((x) & MYBUS) || ((x) & MYBUS) == MYBUS)
+#define busoff(x)	(!((x) & BUSON) || ((x) & BUSON) == BUSON)
+
+/* arbitration timeouts, in jiffies */
+#define ARB_TIMEOUT	(HZ / 8)	/* 125 ms until forcing bus ownership */
+#define ARB2_TIMEOUT	(HZ / 4)	/* 250 ms until acquisition failure */
+
+/* arbitration retry delays, in us */
+#define SELECT_DELAY_SHORT	50
+#define SELECT_DELAY_LONG	1000
+
+struct pca9541 {
+	struct i2c_adapter *mux_adap;
+	unsigned long select_timeout;
+	unsigned long arb_timeout;
+};
+
+static const struct i2c_device_id pca9541_id[] = {
+	{"pca9541", 0},
+	{}
+};
+
+MODULE_DEVICE_TABLE(i2c, pca9541_id);
+
+/*
+ * Write to chip register. Don't use i2c_transfer()/i2c_smbus_xfer()
+ * as they will try to lock the adapter a second time.
+ */
+static int pca9541_reg_write(struct i2c_client *client, u8 command, u8 val)
+{
+	struct i2c_adapter *adap = client->adapter;
+	int ret;
+
+	if (adap->algo->master_xfer) {
+		struct i2c_msg msg;
+		char buf[2];
+
+		msg.addr = client->addr;
+		msg.flags = 0;
+		msg.len = 2;
+		buf[0] = command;
+		buf[1] = val;
+		msg.buf = buf;
+		ret = adap->algo->master_xfer(adap, &msg, 1);
+	} else {
+		union i2c_smbus_data data;
+
+		data.byte = val;
+		ret = adap->algo->smbus_xfer(adap, client->addr,
+					     client->flags,
+					     I2C_SMBUS_WRITE,
+					     command,
+					     I2C_SMBUS_BYTE_DATA, &data);
+	}
+
+	return ret;
+}
+
+/*
+ * Read from chip register. Don't use i2c_transfer()/i2c_smbus_xfer()
+ * as they will try to lock adapter a second time.
+ */
+static int pca9541_reg_read(struct i2c_client *client, u8 command)
+{
+	struct i2c_adapter *adap = client->adapter;
+	int ret;
+	u8 val;
+
+	if (adap->algo->master_xfer) {
+		struct i2c_msg msg[2] = {
+			{
+				.addr = client->addr,
+				.flags = 0,
+				.len = 1,
+				.buf = &command
+			},
+			{
+				.addr = client->addr,
+				.flags = I2C_M_RD,
+				.len = 1,
+				.buf = &val
+			}
+		};
+		ret = adap->algo->master_xfer(adap, msg, 2);
+		if (ret == 2)
+			ret = val;
+		else if (ret >= 0)
+			ret = -EIO;
+	} else {
+		union i2c_smbus_data data;
+
+		ret = adap->algo->smbus_xfer(adap, client->addr,
+					     client->flags,
+					     I2C_SMBUS_READ,
+					     command,
+					     I2C_SMBUS_BYTE_DATA, &data);
+		if (!ret)
+			ret = data.byte;
+	}
+	return ret;
+}
+
+/*
+ * Arbitration management functions
+ */
+
+/* Release bus. Also reset NTESTON and BUSINIT if it was set. */
+static void pca9541_release_bus(struct i2c_client *client)
+{
+	int reg;
+
+	reg = pca9541_reg_read(client, PCA9541_CONTROL);
+	if (reg >= 0 && !busoff(reg) && mybus(reg))
+		pca9541_reg_write(client, PCA9541_CONTROL,
+				  (reg & PCA9541_CTL_NBUSON) >> 1);
+}
+
+/*
+ * Arbitration is defined as a two-step process. A bus master can only activate
+ * the slave bus if it owns it; otherwise it has to request ownership first.
+ * This multi-step process ensures that access contention is resolved
+ * gracefully.
+ *
+ * Bus	Ownership	Other master	Action
+ * state		requested access
+ * ----------------------------------------------------
+ * off	-		yes		wait for arbitration timeout or
+ *					for other master to drop request
+ * off	no		no		take ownership
+ * off	yes		no		turn on bus
+ * on	yes		-		done
+ * on	no		-		wait for arbitration timeout or
+ *					for other master to release bus
+ *
+ * The main contention point occurs if the slave bus is off and both masters
+ * request ownership at the same time. In this case, one master will turn on
+ * the slave bus, believing that it owns it. The other master will request
+ * bus ownership. Result is that the bus is turned on, and master which did
+ * _not_ own the slave bus before ends up owning it.
+ */
+
+/* Control commands per PCA9541 datasheet */
+static const u8 pca9541_control[16] = {
+	4, 0, 1, 5, 4, 4, 5, 5, 0, 0, 1, 1, 0, 4, 5, 1
+};
+
+/*
+ * Channel arbitration
+ *
+ * Return values:
+ *  <0: error
+ *  0 : bus not acquired
+ *  1 : bus acquired
+ */
+static int pca9541_arbitrate(struct i2c_client *client)
+{
+	struct pca9541 *data = i2c_get_clientdata(client);
+	int reg;
+
+	reg = pca9541_reg_read(client, PCA9541_CONTROL);
+	if (reg < 0)
+		return reg;
+
+	if (busoff(reg)) {
+		int istat;
+		/*
+		 * Bus is off. Request ownership or turn it on unless
+		 * other master requested ownership.
+		 */
+		istat = pca9541_reg_read(client, PCA9541_ISTAT);
+		if (!(istat & PCA9541_ISTAT_NMYTEST)
+		    || time_is_before_eq_jiffies(data->arb_timeout)) {
+			/*
+			 * Other master did not request ownership,
+			 * or arbitration timeout expired. Take the bus.
+			 */
+			pca9541_reg_write(client,
+					  PCA9541_CONTROL,
+					  pca9541_control[reg & 0x0f]
+					  | PCA9541_CTL_NTESTON);
+			data->select_timeout = SELECT_DELAY_SHORT;
+		} else {
+			/*
+			 * Other master requested ownership.
+			 * Set extra long timeout to give it time to acquire it.
+			 */
+			data->select_timeout = SELECT_DELAY_LONG * 2;
+		}
+	} else if (mybus(reg)) {
+		/*
+		 * Bus is on, and we own it. We are done with acquisition.
+		 * Reset NTESTON and BUSINIT, then return success.
+		 */
+		if (reg & (PCA9541_CTL_NTESTON | PCA9541_CTL_BUSINIT))
+			pca9541_reg_write(client,
+					  PCA9541_CONTROL,
+					  reg & ~(PCA9541_CTL_NTESTON
+						  | PCA9541_CTL_BUSINIT));
+		return 1;
+	} else {
+		/*
+		 * Other master owns the bus.
+		 * If arbitration timeout has expired, force ownership.
+		 * Otherwise request it.
+		 */
+		data->select_timeout = SELECT_DELAY_LONG;
+		if (time_is_before_eq_jiffies(data->arb_timeout)) {
+			/* Time is up, take the bus and reset it. */
+			pca9541_reg_write(client,
+					  PCA9541_CONTROL,
+					  pca9541_control[reg & 0x0f]
+					  | PCA9541_CTL_BUSINIT
+					  | PCA9541_CTL_NTESTON);
+		} else {
+			/* Request bus ownership if needed */
+			if (!(reg & PCA9541_CTL_NTESTON))
+				pca9541_reg_write(client,
+						  PCA9541_CONTROL,
+						  reg | PCA9541_CTL_NTESTON);
+		}
+	}
+	return 0;
+}
+
+static int pca9541_select_chan(struct i2c_adapter *adap, void *client, u32 chan)
+{
+	struct pca9541 *data = i2c_get_clientdata(client);
+	int ret;
+	unsigned long timeout = jiffies + ARB2_TIMEOUT;
+		/* give up after this time */
+
+	data->arb_timeout = jiffies + ARB_TIMEOUT;
+		/* force bus ownership after this time */
+
+	do {
+		ret = pca9541_arbitrate(client);
+		if (ret)
+			return ret < 0 ? ret : 0;
+
+		if (data->select_timeout == SELECT_DELAY_SHORT)
+			udelay(data->select_timeout);
+		else
+			msleep(data->select_timeout / 1000);
+	} while (time_is_after_eq_jiffies(timeout));
+
+	return -ETIMEDOUT;
+}
+
+static int pca9541_release_chan(struct i2c_adapter *adap,
+				void *client, u32 chan)
+{
+	pca9541_release_bus(client);
+	return 0;
+}
+
+/*
+ * I2C init/probing/exit functions
+ */
+static int pca9541_probe(struct i2c_client *client,
+			 const struct i2c_device_id *id)
+{
+	struct i2c_adapter *adap = client->adapter;
+	struct pca954x_platform_data *pdata = client->dev.platform_data;
+	struct pca9541 *data;
+	int force;
+	int ret = -ENODEV;
+
+	if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_BYTE_DATA))
+		goto err;
+
+	data = kzalloc(sizeof(struct pca9541), GFP_KERNEL);
+	if (!data) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	i2c_set_clientdata(client, data);
+
+	/*
+	 * I2C accesses are unprotected here.
+	 * We have to lock the adapter before releasing the bus.
+	 */
+	i2c_lock_adapter(adap);
+	pca9541_release_bus(client);
+	i2c_unlock_adapter(adap);
+
+	/* Create mux adapter */
+
+	force = 0;
+	if (pdata)
+		force = pdata->modes[0].adap_id;
+	data->mux_adap = i2c_add_mux_adapter(adap, client, force, 0,
+					     pca9541_select_chan,
+					     pca9541_release_chan);
+
+	if (data->mux_adap == NULL) {
+		dev_err(&client->dev, "failed to register master selector\n");
+		goto exit_free;
+	}
+
+	dev_info(&client->dev, "registered master selector for I2C %s\n",
+		 client->name);
+
+	return 0;
+
+exit_free:
+	kfree(data);
+err:
+	return ret;
+}
+
+static int pca9541_remove(struct i2c_client *client)
+{
+	struct pca9541 *data = i2c_get_clientdata(client);
+
+	i2c_del_mux_adapter(data->mux_adap);
+
+	kfree(data);
+	return 0;
+}
+
+static struct i2c_driver pca9541_driver = {
+	.driver = {
+		   .name = "pca9541",
+		   .owner = THIS_MODULE,
+		   },
+	.probe = pca9541_probe,
+	.remove = pca9541_remove,
+	.id_table = pca9541_id,
+};
+
+static int __init pca9541_init(void)
+{
+	return i2c_add_driver(&pca9541_driver);
+}
+
+static void __exit pca9541_exit(void)
+{
+	i2c_del_driver(&pca9541_driver);
+}
+
+module_init(pca9541_init);
+module_exit(pca9541_exit);
+
+MODULE_AUTHOR("Guenter Roeck <guenter.roeck@ericsson.com>");
+MODULE_DESCRIPTION("PCA9541 I2C master selector driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/i2c/muxes/pca954x.c b/drivers/i2c/muxes/pca954x.c
index 6f9accf3189d..54e1ce73534b 100644
--- a/drivers/i2c/muxes/pca954x.c
+++ b/drivers/i2c/muxes/pca954x.c
@@ -181,8 +181,8 @@ static int pca954x_deselect_mux(struct i2c_adapter *adap,
 /*
  * I2C init/probing/exit functions
  */
-static int __devinit pca954x_probe(struct i2c_client *client,
-				   const struct i2c_device_id *id)
+static int pca954x_probe(struct i2c_client *client,
+			 const struct i2c_device_id *id)
 {
 	struct i2c_adapter *adap = to_i2c_adapter(client->dev.parent);
 	struct pca954x_platform_data *pdata = client->dev.platform_data;
@@ -255,7 +255,7 @@ err:
 	return ret;
 }
 
-static int __devexit pca954x_remove(struct i2c_client *client)
+static int pca954x_remove(struct i2c_client *client)
 {
 	struct pca954x *data = i2c_get_clientdata(client);
 	const struct chip_desc *chip = &chips[data->type];
@@ -279,7 +279,7 @@ static struct i2c_driver pca954x_driver = {
 		.owner	= THIS_MODULE,
 	},
 	.probe		= pca954x_probe,
-	.remove		= __devexit_p(pca954x_remove),
+	.remove		= pca954x_remove,
 	.id_table	= pca954x_id,
 };
 
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index 4e298bc8949d..5a46b8c5d68a 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -32,6 +32,7 @@
  * The pointer to our (page) of device descriptions.
  */
 static void *kvm_devices;
+struct work_struct hotplug_work;
 
 struct kvm_device {
 	struct virtio_device vdev;
@@ -328,13 +329,54 @@ static void scan_devices(void)
 }
 
 /*
+ * match for a kvm device with a specific desc pointer
+ */
+static int match_desc(struct device *dev, void *data)
+{
+	if ((ulong)to_kvmdev(dev_to_virtio(dev))->desc == (ulong)data)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * hotplug_device tries to find changes in the device page.
+ */
+static void hotplug_devices(struct work_struct *dummy)
+{
+	unsigned int i;
+	struct kvm_device_desc *d;
+	struct device *dev;
+
+	for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
+		d = kvm_devices + i;
+
+		/* end of list */
+		if (d->type == 0)
+			break;
+
+		/* device already exists */
+		dev = device_find_child(kvm_root, d, match_desc);
+		if (dev) {
+			/* XXX check for hotplug remove */
+			put_device(dev);
+			continue;
+		}
+
+		/* new device */
+		printk(KERN_INFO "Adding new virtio device %p\n", d);
+		add_kvm_device(d, i);
+	}
+}
+
+/*
  * we emulate the request_irq behaviour on top of s390 extints
  */
 static void kvm_extint_handler(u16 code)
 {
 	struct virtqueue *vq;
 	u16 subcode;
-	int config_changed;
+	u32 param;
 
 	subcode = S390_lowcore.cpu_addr;
 	if ((subcode & 0xff00) != VIRTIO_SUBCODE_64)
@@ -343,18 +385,28 @@ static void kvm_extint_handler(u16 code)
 	/* The LSB might be overloaded, we have to mask it */
 	vq = (struct virtqueue *)(S390_lowcore.ext_params2 & ~1UL);
 
-	/* We use the LSB of extparam, to decide, if this interrupt is a config
-	 * change or a "standard" interrupt */
-	config_changed = S390_lowcore.ext_params & 1;
+	/* We use ext_params to decide what this interrupt means */
+	param = S390_lowcore.ext_params & VIRTIO_PARAM_MASK;
 
-	if (config_changed) {
+	switch (param) {
+	case VIRTIO_PARAM_CONFIG_CHANGED:
+	{
 		struct virtio_driver *drv;
 		drv = container_of(vq->vdev->dev.driver,
 				   struct virtio_driver, driver);
 		if (drv->config_changed)
 			drv->config_changed(vq->vdev);
-	} else
+
+		break;
+	}
+	case VIRTIO_PARAM_DEV_ADD:
+		schedule_work(&hotplug_work);
+		break;
+	case VIRTIO_PARAM_VRING_INTERRUPT:
+	default:
 		vring_interrupt(0, vq);
+		break;
+	}
 }
 
 /*
@@ -383,6 +435,8 @@ static int __init kvm_devices_init(void)
 
 	kvm_devices = (void *) real_memory_size;
 
+	INIT_WORK(&hotplug_work, hotplug_devices);
+
 	ctl_set_bit(0, 9);
 	register_external_interrupt(0x2603, kvm_extint_handler);
 
diff --git a/drivers/video/aty/radeon_i2c.c b/drivers/video/aty/radeon_i2c.c
index 359fc64e761a..78d1f4cd1fe0 100644
--- a/drivers/video/aty/radeon_i2c.c
+++ b/drivers/video/aty/radeon_i2c.c
@@ -7,7 +7,6 @@
 
 
 #include <linux/i2c.h>
-#include <linux/i2c-id.h>
 #include <linux/i2c-algo-bit.h>
 
 #include <asm/io.h>
diff --git a/drivers/video/i810/i810.h b/drivers/video/i810/i810.h
index 328ae6c673ec..f37de60ecc59 100644
--- a/drivers/video/i810/i810.h
+++ b/drivers/video/i810/i810.h
@@ -17,7 +17,6 @@
 #include <linux/agp_backend.h>
 #include <linux/fb.h>
 #include <linux/i2c.h>
-#include <linux/i2c-id.h>
 #include <linux/i2c-algo-bit.h>
 #include <video/vga.h>
 
diff --git a/drivers/video/intelfb/intelfb_i2c.c b/drivers/video/intelfb/intelfb_i2c.c
index 487f2be47460..3300bd31d9d7 100644
--- a/drivers/video/intelfb/intelfb_i2c.c
+++ b/drivers/video/intelfb/intelfb_i2c.c
@@ -32,7 +32,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <linux/fb.h>
 
 #include <linux/i2c.h>
-#include <linux/i2c-id.h>
 #include <linux/i2c-algo-bit.h>
 
 #include <asm/io.h>
diff --git a/drivers/video/savage/savagefb.h b/drivers/video/savage/savagefb.h
index 8bfdfc3c5234..e4c3f214eb8e 100644
--- a/drivers/video/savage/savagefb.h
+++ b/drivers/video/savage/savagefb.h
@@ -13,7 +13,6 @@
 #define __SAVAGEFB_H__
 
 #include <linux/i2c.h>
-#include <linux/i2c-id.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/mutex.h>
 #include <video/vga.h>
diff --git a/include/crypto/cryptd.h b/include/crypto/cryptd.h
index 1c96b255017c..ba98918bbd9b 100644
--- a/include/crypto/cryptd.h
+++ b/include/crypto/cryptd.h
@@ -1,5 +1,12 @@
 /*
  * Software async crypto daemon
+ *
+ * Added AEAD support to cryptd.
+ *    Authors: Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
  */
 
 #ifndef _CRYPTO_CRYPT_H
@@ -42,4 +49,21 @@ struct crypto_shash *cryptd_ahash_child(struct cryptd_ahash *tfm);
 struct shash_desc *cryptd_shash_desc(struct ahash_request *req);
 void cryptd_free_ahash(struct cryptd_ahash *tfm);
 
+struct cryptd_aead {
+	struct crypto_aead base;
+};
+
+static inline struct cryptd_aead *__cryptd_aead_cast(
+	struct crypto_aead *tfm)
+{
+	return (struct cryptd_aead *)tfm;
+}
+
+struct cryptd_aead *cryptd_alloc_aead(const char *alg_name,
+					  u32 type, u32 mask);
+
+struct crypto_aead *cryptd_aead_child(struct cryptd_aead *tfm);
+
+void cryptd_free_aead(struct cryptd_aead *tfm);
+
 #endif
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 42a0f1d11365..bb0f56f5c01e 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -316,6 +316,7 @@ struct hid_item {
 #define HID_QUIRK_FULLSPEED_INTERVAL		0x10000000
 #define HID_QUIRK_NO_INIT_REPORTS		0x20000000
 #define HID_QUIRK_NO_IGNORE			0x40000000
+#define HID_QUIRK_NO_INPUT_SYNC			0x80000000
 
 /*
  * This is the global environment of the parser. This information is
@@ -626,8 +627,8 @@ struct hid_driver {
 	int (*event)(struct hid_device *hdev, struct hid_field *field,
 			struct hid_usage *usage, __s32 value);
 
-	void (*report_fixup)(struct hid_device *hdev, __u8 *buf,
-			unsigned int size);
+	__u8 *(*report_fixup)(struct hid_device *hdev, __u8 *buf,
+			unsigned int *size);
 
 	int (*input_mapping)(struct hid_device *hdev,
 			struct hid_input *hidinput, struct hid_field *field,
diff --git a/include/linux/hiddev.h b/include/linux/hiddev.h
index bb6f58baf319..a3f481a3063b 100644
--- a/include/linux/hiddev.h
+++ b/include/linux/hiddev.h
@@ -226,8 +226,6 @@ void hiddev_disconnect(struct hid_device *);
 void hiddev_hid_event(struct hid_device *hid, struct hid_field *field,
 		      struct hid_usage *usage, __s32 value);
 void hiddev_report_event(struct hid_device *hid, struct hid_report *report);
-int __init hiddev_init(void);
-void hiddev_exit(void);
 #else
 static inline int hiddev_connect(struct hid_device *hid,
 		unsigned int force)
@@ -236,8 +234,6 @@ static inline void hiddev_disconnect(struct hid_device *hid) { }
 static inline void hiddev_hid_event(struct hid_device *hid, struct hid_field *field,
 		      struct hid_usage *usage, __s32 value) { }
 static inline void hiddev_report_event(struct hid_device *hid, struct hid_report *report) { }
-static inline int hiddev_init(void) { return 0; }
-static inline void hiddev_exit(void) { }
 #endif
 
 #endif
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 4bae0b72ed3c..1f66fa06a97c 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -384,11 +384,15 @@ static inline void i2c_set_adapdata(struct i2c_adapter *dev, void *data)
 	dev_set_drvdata(&dev->dev, data);
 }
 
-static inline int i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter)
+static inline struct i2c_adapter *
+i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter)
 {
-	return adapter->dev.parent != NULL
-		&& adapter->dev.parent->bus == &i2c_bus_type
-		&& adapter->dev.parent->type == &i2c_adapter_type;
+	struct device *parent = adapter->dev.parent;
+
+	if (parent != NULL && parent->type == &i2c_adapter_type)
+		return to_i2c_adapter(parent);
+	else
+		return NULL;
 }
 
 /* Adapter locking functions, exported for shared pin cases */
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 636fc381c897..919ae53adc5c 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -414,6 +414,14 @@ struct kvm_enable_cap {
 	__u8  pad[64];
 };
 
+/* for KVM_PPC_GET_PVINFO */
+struct kvm_ppc_pvinfo {
+	/* out */
+	__u32 flags;
+	__u32 hcall[4];
+	__u8  pad[108];
+};
+
 #define KVMIO 0xAE
 
 /*
@@ -530,6 +538,8 @@ struct kvm_enable_cap {
 #ifdef __KVM_HAVE_XCRS
 #define KVM_CAP_XCRS 56
 #endif
+#define KVM_CAP_PPC_GET_PVINFO 57
+#define KVM_CAP_PPC_IRQ_LEVEL 58
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -664,6 +674,8 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
+/* Available with KVM_CAP_PPC_GET_PVINFO */
+#define KVM_PPC_GET_PVINFO	  _IOW(KVMIO,  0xa1, struct kvm_ppc_pvinfo)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ac740b26eb10..a0557422715e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -36,9 +36,10 @@
 #define KVM_REQ_PENDING_TIMER      5
 #define KVM_REQ_UNHALT             6
 #define KVM_REQ_MMU_SYNC           7
-#define KVM_REQ_KVMCLOCK_UPDATE    8
+#define KVM_REQ_CLOCK_UPDATE       8
 #define KVM_REQ_KICK               9
 #define KVM_REQ_DEACTIVATE_FPU    10
+#define KVM_REQ_EVENT             11
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
@@ -289,6 +290,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 void kvm_disable_largepages(void);
 void kvm_arch_flush_shadow(struct kvm *kvm);
 
+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
+			    int nr_pages);
+
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
@@ -296,6 +300,8 @@ void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
+pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn);
@@ -477,8 +483,7 @@ int kvm_deassign_device(struct kvm *kvm,
 			struct kvm_assigned_dev_kernel *assigned_dev);
 #else /* CONFIG_IOMMU_API */
 static inline int kvm_iommu_map_pages(struct kvm *kvm,
-				      gfn_t base_gfn,
-				      unsigned long npages)
+				      struct kvm_memory_slot *slot)
 {
 	return 0;
 }
@@ -518,11 +523,22 @@ static inline void kvm_guest_exit(void)
 	current->flags &= ~PF_VCPU;
 }
 
+static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+					       gfn_t gfn)
+{
+	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+}
+
 static inline gpa_t gfn_to_gpa(gfn_t gfn)
 {
 	return (gpa_t)gfn << PAGE_SHIFT;
 }
 
+static inline gfn_t gpa_to_gfn(gpa_t gpa)
+{
+	return (gfn_t)(gpa >> PAGE_SHIFT);
+}
+
 static inline hpa_t pfn_to_hpa(pfn_t pfn)
 {
 	return (hpa_t)pfn << PAGE_SHIFT;
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index d73109243fda..47a070b0520e 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -17,6 +17,8 @@
 
 #define KVM_HC_VAPIC_POLL_IRQ		1
 #define KVM_HC_MMU_OP			2
+#define KVM_HC_FEATURES			3
+#define KVM_HC_PPC_MAP_MAGIC_PAGE	4
 
 /*
  * hypercalls use architecture specific
@@ -24,11 +26,6 @@
 #include <asm/kvm_para.h>
 
 #ifdef __KERNEL__
-#ifdef CONFIG_KVM_GUEST
-void __init kvm_guest_init(void);
-#else
-#define kvm_guest_init() do { } while (0)
-#endif
 
 static inline int kvm_para_has_feature(unsigned int feature)
 {
diff --git a/include/linux/padata.h b/include/linux/padata.h
index bdcd1e9eacea..4633b2f726b6 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -127,8 +127,8 @@ struct padata_cpumask {
  */
 struct parallel_data {
 	struct padata_instance		*pinst;
-	struct padata_parallel_queue	*pqueue;
-	struct padata_serial_queue	*squeue;
+	struct padata_parallel_queue	__percpu *pqueue;
+	struct padata_serial_queue	__percpu *squeue;
 	atomic_t			seq_nr;
 	atomic_t			reorder_objects;
 	atomic_t			refcnt;
diff --git a/mm/util.c b/mm/util.c
index 4735ea481816..73dac81e9f78 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -245,6 +245,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 }
 #endif
 
+/*
+ * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ * If the architecture not support this fucntion, simply return with no
+ * page pinned
+ */
+int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
+				 int nr_pages, int write, struct page **pages)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:	starting user address
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 369e38010ad5..8edca9141b78 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -17,7 +17,7 @@
  * Authors:
  *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
  *
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  */
 
 #include <linux/kvm_host.h>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 60e5e4612b0b..5225052aebc1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5,7 +5,7 @@
  * machines without emulation or binary translation.
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
@@ -705,14 +705,12 @@ skip_lpage:
 	if (r)
 		goto out_free;
 
-#ifdef CONFIG_DMAR
 	/* map the pages in iommu page table */
 	if (npages) {
 		r = kvm_iommu_map_pages(kvm, &new);
 		if (r)
 			goto out_free;
 	}
-#endif
 
 	r = -ENOMEM;
 	slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
@@ -927,35 +925,46 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
 	return memslot - slots->memslots;
 }
 
-static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
-{
-	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
-}
-
-unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
+				     gfn_t *nr_pages)
 {
 	struct kvm_memory_slot *slot;
 
 	slot = gfn_to_memslot(kvm, gfn);
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return bad_hva();
+
+	if (nr_pages)
+		*nr_pages = slot->npages - (gfn - slot->base_gfn);
+
 	return gfn_to_hva_memslot(slot, gfn);
 }
+
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+	return gfn_to_hva_many(kvm, gfn, NULL);
+}
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
 {
 	struct page *page[1];
 	int npages;
 	pfn_t pfn;
 
-	might_sleep();
-
-	npages = get_user_pages_fast(addr, 1, 1, page);
+	if (atomic)
+		npages = __get_user_pages_fast(addr, 1, 1, page);
+	else {
+		might_sleep();
+		npages = get_user_pages_fast(addr, 1, 1, page);
+	}
 
 	if (unlikely(npages != 1)) {
 		struct vm_area_struct *vma;
 
+		if (atomic)
+			goto return_fault_page;
+
 		down_read(&current->mm->mmap_sem);
 		if (is_hwpoison_address(addr)) {
 			up_read(&current->mm->mmap_sem);
@@ -968,6 +977,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 		if (vma == NULL || addr < vma->vm_start ||
 		    !(vma->vm_flags & VM_PFNMAP)) {
 			up_read(&current->mm->mmap_sem);
+return_fault_page:
 			get_page(fault_page);
 			return page_to_pfn(fault_page);
 		}
@@ -981,7 +991,13 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 	return pfn;
 }
 
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+{
+	return hva_to_pfn(kvm, addr, true);
+}
+EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
+
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
 {
 	unsigned long addr;
 
@@ -991,7 +1007,18 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 		return page_to_pfn(bad_page);
 	}
 
-	return hva_to_pfn(kvm, addr);
+	return hva_to_pfn(kvm, addr, atomic);
+}
+
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+{
+	return __gfn_to_pfn(kvm, gfn, true);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+	return __gfn_to_pfn(kvm, gfn, false);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
@@ -999,9 +1026,26 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-	return hva_to_pfn(kvm, addr);
+	return hva_to_pfn(kvm, addr, false);
 }
 
+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
+								  int nr_pages)
+{
+	unsigned long addr;
+	gfn_t entry;
+
+	addr = gfn_to_hva_many(kvm, gfn, &entry);
+	if (kvm_is_error_hva(addr))
+		return -1;
+
+	if (entry < nr_pages)
+		return 0;
+
+	return __get_user_pages_fast(addr, nr_pages, 1, pages);
+}
+EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
 	pfn_t pfn;
@@ -1964,7 +2008,9 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 	case CPU_STARTING:
 		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
 		       cpu);
+		spin_lock(&kvm_lock);
 		hardware_enable(NULL);
+		spin_unlock(&kvm_lock);
 		break;
 	}
 	return NOTIFY_OK;
@@ -1977,7 +2023,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void)
 		/* spin while reset goes on */
 		local_irq_enable();
 		while (true)
-			;
+			cpu_relax();
 	}
 	/* Fault while not rebooting.  We want the trace. */
 	BUG();
@@ -2171,8 +2217,10 @@ static int kvm_suspend(struct sys_device *dev, pm_message_t state)
 
 static int kvm_resume(struct sys_device *dev)
 {
-	if (kvm_usage_count)
+	if (kvm_usage_count) {
+		WARN_ON(spin_is_locked(&kvm_lock));
 		hardware_enable(NULL);
+	}
 	return 0;
 }